libflame revision_anchor
Functions | Variables
bl1_dotsv3.c File Reference

(r)

Functions

void bl1_sdotsv3 (conj1_t conjxyw, int n, float *x, int inc_x, float *y, int inc_y, float *w, int inc_w, float *z, int inc_z, float *beta, float *rho_xz, float *rho_yz, float *rho_wz)
 
void bl1_ddotsv3 (conj1_t conjxyw, int n, double *x, int inc_x, double *y, int inc_y, double *w, int inc_w, double *z, int inc_z, double *beta, double *rho_xz, double *rho_yz, double *rho_wz)
 
 if (inc_x !=1||inc_y !=1||inc_w !=1||inc_z !=1)
 
 for (i=0;i< n_run;++i)
 
 if (n_left > 0)
 
void bl1_cdotsv3 (conj1_t conjxyw, int n, scomplex *x, int inc_x, scomplex *y, int inc_y, scomplex *w, int inc_w, scomplex *z, int inc_z, scomplex *beta, scomplex *rho_xz, scomplex *rho_yz, scomplex *rho_wz)
 
void bl1_zdotsv3 (conj1_t conjxyw, int n, dcomplex *x, int inc_x, dcomplex *y, int inc_y, dcomplex *w, int inc_w, dcomplex *z, int inc_z, dcomplex *beta, dcomplex *rho_xz, dcomplex *rho_yz, dcomplex *rho_wz)
 
 if (bl1_is_conj(conjxyw))
 
 bl1_zscals (beta, rho_yz)
 
 bl1_zscals (beta, rho_wz)
 

Variables

double *restrict y1
 
double *restrict w1 = w
 
double *restrict z1 = z
 
double rho1
 
double rho2
 
double rho3
 
double x1c
 
double y1c
 
double w1c
 
double z1c
 
double x2c
 
double y2c
 
double w2c
 
double z2c
 
int i
 
int n_pre
 
int n_run
 
int n_left
 
rho_xz = *beta * *rho_xz + rho1
 
rho_yz = *beta * *rho_yz + rho2
 
rho_wz = *beta * *rho_wz + rho3
 
 x1 = x
 
rho1 real = 0.0
 
rho1 imag = 0.0
 
 else
 

Function Documentation

◆ bl1_cdotsv3()

void bl1_cdotsv3 ( conj1_t  conjxyw,
int  n,
scomplex *x,
int  inc_x,
scomplex *y,
int  inc_y,
scomplex *w,
int  inc_w,
scomplex *z,
int  inc_z,
scomplex *beta,
scomplex *rho_xz,
scomplex *rho_yz,
scomplex *rho_wz
)
285{
286 bl1_abort();
287}
void bl1_abort(void)
Definition bl1_abort.c:13

References bl1_abort().

◆ bl1_ddotsv3()

void bl1_ddotsv3 ( conj1_t  conjxyw,
int  n,
double *x,
int  inc_x,
double *y,
int  inc_y,
double *w,
int  inc_w,
double *z,
int  inc_z,
double *beta,
double *rho_xz,
double *rho_yz,
double *rho_wz
)
49{
50 double* restrict x1;
51 double* restrict y1;
52 double* restrict w1;
53 double* restrict z1;
54 double rho1, rho2, rho3;
55 double x1c, y1c, w1c, z1c;
56 int i;
57
58 int n_pre;
59 int n_run;
60 int n_left;
61
63 v2df_t x1v, y1v, w1v, z1v;
64 v2df_t x2v, y2v, w2v, z2v;
65
66 if ( inc_x != 1 ||
67 inc_y != 1 ||
68 inc_w != 1 ||
69 inc_z != 1 ) bl1_abort();
70
71 n_pre = 0;
72 if ( ( unsigned long ) z % 16 != 0 )
73 {
74 if ( ( unsigned long ) x % 16 == 0 ||
75 ( unsigned long ) y % 16 == 0 ||
76 ( unsigned long ) w % 16 == 0 ) bl1_abort();
77
78 n_pre = 1;
79 }
80
81 n_run = ( n - n_pre ) / 4;
82 n_left = ( n - n_pre ) % 4;
83
84 x1 = x;
85 y1 = y;
86 w1 = w;
87 z1 = z;
88
89 rho1 = 0.0;
90 rho2 = 0.0;
91 rho3 = 0.0;
92
93 if ( n_pre == 1 )
94 {
95 x1c = *x1;
96 y1c = *y1;
97 w1c = *w1;
98 z1c = *z1;
99
100 rho1 += x1c * z1c;
101 rho2 += y1c * z1c;
102 rho3 += w1c * z1c;
103
104 x1 += inc_x;
105 y1 += inc_y;
106 w1 += inc_w;
107 z1 += inc_z;
108 }
109
110 rho1v.v = _mm_setzero_pd();
111 rho2v.v = _mm_setzero_pd();
112 rho3v.v = _mm_setzero_pd();
113
114 for ( i = 0; i < n_run; ++i )
115 {
116 x1v.v = _mm_load_pd( ( double* )x1 );
117 y1v.v = _mm_load_pd( ( double* )y1 );
118 w1v.v = _mm_load_pd( ( double* )w1 );
119 z1v.v = _mm_load_pd( ( double* )z1 );
120
121 rho1v.v += x1v.v * z1v.v;
122 rho2v.v += y1v.v * z1v.v;
123 rho3v.v += w1v.v * z1v.v;
124
125 x2v.v = _mm_load_pd( ( double* )(x1 + 2) );
126 y2v.v = _mm_load_pd( ( double* )(y1 + 2) );
127 w2v.v = _mm_load_pd( ( double* )(w1 + 2) );
128 z2v.v = _mm_load_pd( ( double* )(z1 + 2) );
129
130 rho1v.v += x2v.v * z2v.v;
131 rho2v.v += y2v.v * z2v.v;
132 rho3v.v += w2v.v * z2v.v;
133
134 x1 += 4;
135 y1 += 4;
136 w1 += 4;
137 z1 += 4;
138 }
139
140 rho1 += rho1v.d[0] + rho1v.d[1];
141 rho2 += rho2v.d[0] + rho2v.d[1];
142 rho3 += rho3v.d[0] + rho3v.d[1];
143
144 if ( n_left > 0 )
145 {
146 for ( i = 0; i < n_left; ++i )
147 {
148 x1c = *x1;
149 y1c = *y1;
150 w1c = *w1;
151 z1c = *z1;
152
153 rho1 += x1c * z1c;
154 rho2 += y1c * z1c;
155 rho3 += w1c * z1c;
156
157 x1 += inc_x;
158 y1 += inc_y;
159 w1 += inc_w;
160 z1 += inc_z;
161 }
162 }
163
164 *rho_xz = *beta * *rho_xz + rho1;
165 *rho_yz = *beta * *rho_yz + rho2;
166 *rho_wz = *beta * *rho_wz + rho3;
167}
double *restrict z1
Definition bl1_dotsv3.c:173
int n_left
Definition bl1_dotsv3.c:181
* rho_wz
Definition bl1_dotsv3.c:270
int n_pre
Definition bl1_dotsv3.c:179
double *restrict y1
Definition bl1_dotsv3.c:171
double *restrict w1
Definition bl1_dotsv3.c:172
double z1c
Definition bl1_dotsv3.c:175
int n_run
Definition bl1_dotsv3.c:180
double rho1
Definition bl1_dotsv3.c:174
double rho3
Definition bl1_dotsv3.c:174
double y1c
Definition bl1_dotsv3.c:175
double rho2
Definition bl1_dotsv3.c:174
* rho_xz
Definition bl1_dotsv3.c:268
x1
Definition bl1_dotsv3.c:452
int i
Definition bl1_dotsv3.c:177
double x1c
Definition bl1_dotsv3.c:175
* rho_yz
Definition bl1_dotsv3.c:269
double w1c
Definition bl1_dotsv3.c:175
Definition blis_type_defs.h:117

References bl1_abort(), v2df_t::d, i, n_left, n_pre, n_run, rho1, rho2, rho3, rho_wz, rho_xz, rho_yz, v2df_t::v, w1, w1c, x1, x1c, y1, y1c, z1, and z1c.

◆ bl1_sdotsv3()

void bl1_sdotsv3 ( conj1_t  conjxyw,
int  n,
float *x,
int  inc_x,
float *y,
int  inc_y,
float *w,
int  inc_w,
float *z,
int  inc_z,
float *beta,
float *rho_xz,
float *rho_yz,
float *rho_wz
)
33{
34 bl1_abort();
35}

References bl1_abort().

◆ bl1_zdotsv3()

void bl1_zdotsv3 ( conj1_t  conjxyw,
int  n,
dcomplex *x,
int  inc_x,
dcomplex *y,
int  inc_y,
dcomplex *w,
int  inc_w,
dcomplex *z,
int  inc_z,
dcomplex *beta,
dcomplex *rho_xz,
dcomplex *rho_yz,
dcomplex *rho_wz
)
301{
306 int i;
311 v2df_t x1v, x1rv;
312 v2df_t y1v, y1rv;
313 v2df_t w1v, w1rv;
314
315 x1 = x;
316 y1 = y;
317 w1 = w;
318 z1 = z;
319
320 rho1v.v = _mm_setzero_pd();
321 rho2v.v = _mm_setzero_pd();
322 rho3v.v = _mm_setzero_pd();
323
324 if ( bl1_is_conj( conjxyw ) )
325 {
327
328 for ( i = 0; i < n; ++i )
329 {
330 z11v.v = _mm_loaddup_pd( ( double* )&(z1->real) );
331 z12v.v = _mm_loaddup_pd( ( double* )&(z1->imag) );
332
333 x1v.v = _mm_load_pd( ( double* )x1 );
334 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
335 bcac.v = x1rv.v * z11v.v;
336 adbd.v = x1v.v * z12v.v;
337 rho1v.v = rho1v.v + _mm_addsub_pd( bcac.v, adbd.v );
338
339 y1v.v = _mm_load_pd( ( double* )y1 );
340 y1rv.v = _mm_shuffle_pd( y1v.v, y1v.v, _MM_SHUFFLE2 (0,1) );
341 bcac.v = y1rv.v * z11v.v;
342 adbd.v = y1v.v * z12v.v;
343 rho2v.v = rho2v.v + _mm_addsub_pd( bcac.v, adbd.v );
344
345 w1v.v = _mm_load_pd( ( double* )w1 );
346 w1rv.v = _mm_shuffle_pd( w1v.v, w1v.v, _MM_SHUFFLE2 (0,1) );
347 bcac.v = w1rv.v * z11v.v;
348 adbd.v = w1v.v * z12v.v;
349 rho3v.v = rho3v.v + _mm_addsub_pd( bcac.v, adbd.v );
350
351 x1 += inc_x;
352 y1 += inc_y;
353 w1 += inc_w;
354 z1 += inc_z;
355 }
356
357 rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
358 rho2v.v = _mm_shuffle_pd( rho2v.v, rho2v.v, _MM_SHUFFLE2 (0,1) );
359 rho3v.v = _mm_shuffle_pd( rho3v.v, rho3v.v, _MM_SHUFFLE2 (0,1) );
360
361 rho1v.d[1] = -rho1v.d[1];
362 rho2v.d[1] = -rho2v.d[1];
363 rho3v.d[1] = -rho3v.d[1];
364 }
365 else
366 {
368
369 for ( i = 0; i < n; ++i )
370 {
371 z11v.v = _mm_loaddup_pd( ( double* )&(z1->real) );
372 z12v.v = _mm_loaddup_pd( ( double* )&(z1->imag) );
373
374 x1v.v = _mm_load_pd( ( double* )x1 );
375 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
376 cada.v = x1v.v * z11v.v;
377 dbcb.v = x1rv.v * z12v.v;
378 rho1v.v = rho1v.v + _mm_addsub_pd( cada.v, dbcb.v );
379
380 y1v.v = _mm_load_pd( ( double* )y1 );
381 y1rv.v = _mm_shuffle_pd( y1v.v, y1v.v, _MM_SHUFFLE2 (0,1) );
382 cada.v = y1v.v * z11v.v;
383 dbcb.v = y1rv.v * z12v.v;
384 rho2v.v = rho2v.v + _mm_addsub_pd( cada.v, dbcb.v );
385
386 w1v.v = _mm_load_pd( ( double* )w1 );
387 w1rv.v = _mm_shuffle_pd( w1v.v, w1v.v, _MM_SHUFFLE2 (0,1) );
388 cada.v = w1v.v * z11v.v;
389 dbcb.v = w1rv.v * z12v.v;
390 rho3v.v = rho3v.v + _mm_addsub_pd( cada.v, dbcb.v );
391
392 x1 += inc_x;
393 y1 += inc_y;
394 w1 += inc_w;
395 z1 += inc_z;
396 }
397 }
398
399 //bl1_zscals( beta, rho_xz );
400 //bl1_zscals( beta, rho_yz );
401 //bl1_zscals( beta, rho_wz );
402 {
403 v2df_t ab, ba, cc, dd, acbc, bdad;
404
405 ab.v = _mm_load_pd( ( double* )beta );
406 ba.v = _mm_shuffle_pd( ab.v, ab.v, _MM_SHUFFLE2 (0,1) );
407
408 cc.v = _mm_loaddup_pd( ( double* )&(rho_xz->real) );
409 dd.v = _mm_loaddup_pd( ( double* )&(rho_xz->imag) );
410 acbc.v = ab.v * cc.v;
411 bdad.v = ba.v * dd.v;
412 r1v.v = _mm_addsub_pd( acbc.v, bdad.v );
413
414 cc.v = _mm_loaddup_pd( ( double* )&(rho_yz->real) );
415 dd.v = _mm_loaddup_pd( ( double* )&(rho_yz->imag) );
416 acbc.v = ab.v * cc.v;
417 bdad.v = ba.v * dd.v;
418 r2v.v = _mm_addsub_pd( acbc.v, bdad.v );
419
420 cc.v = _mm_loaddup_pd( ( double* )&(rho_wz->real) );
421 dd.v = _mm_loaddup_pd( ( double* )&(rho_wz->imag) );
422 acbc.v = ab.v * cc.v;
423 bdad.v = ba.v * dd.v;
424 r3v.v = _mm_addsub_pd( acbc.v, bdad.v );
425 }
426
427 //rho_xz->real = rho_xz->real + rho1.real;
428 //rho_xz->imag = rho_xz->imag + rho1.imag;
429 rho1v.v = r1v.v + rho1v.v;
430 _mm_store_pd( ( double* )rho_xz, rho1v.v );
431
432 //rho_yz->real = rho_yz->real + rho2.real;
433 //rho_yz->imag = rho_yz->imag + rho2.imag;
434 rho2v.v = r2v.v + rho2v.v;
435 _mm_store_pd( ( double* )rho_yz, rho2v.v );
436
437 //rho_wz->real = rho_wz->real + rho3.real;
438 //rho_wz->imag = rho_wz->imag + rho3.imag;
439 rho3v.v = r3v.v + rho3v.v;
440 _mm_store_pd( ( double* )rho_wz, rho3v.v );
441}
int bl1_is_conj(conj1_t conj)
Definition bl1_is.c:42
Definition blis_type_defs.h:138
__m128d v
Definition blis_type_defs.h:118

References bl1_is_conj(), v2df_t::d, i, rho_wz, rho_xz, rho_yz, v2df_t::v, w1, x1, y1, and z1.

Referenced by FLA_Fused_Uhu_Yhu_Zhu_opz_var1().

◆ bl1_zscals() [1/2]

bl1_zscals ( beta  ,
rho_wz   
)

◆ bl1_zscals() [2/2]

bl1_zscals ( beta  ,
rho_yz   
)

◆ for()

for ( )
228 {
229 x1c = *x1;
230 x2c = *(x1 + 1);
231 y1c = *y1;
232 y2c = *(y1 + 1);
233 w1c = *w1;
234 w2c = *(w1 + 1);
235 z1c = *z1;
236 z2c = *(z1 + 1);
237
238 rho1 += x1c * z1c + x2c * z2c;
239 rho2 += y1c * z1c + y2c * z2c;
240 rho3 += w1c * z1c + w2c * z2c;
241
242 x1 += 2*inc_x;
243 y1 += 2*inc_y;
244 w1 += 2*inc_w;
245 z1 += 2*inc_z;
246 }
double x2c
Definition bl1_dotsv3.c:176
double z2c
Definition bl1_dotsv3.c:176
double w2c
Definition bl1_dotsv3.c:176
double y2c
Definition bl1_dotsv3.c:176

References rho1, rho2, rho3, w1, w1c, w2c, x1, x1c, x2c, y1, y1c, y2c, z1, z1c, and z2c.

◆ if() [1/3]

if ( bl1_is_conj( conjxyw ) )
462 {
463 for ( i = 0; i < n; ++i )
464 {
465 x1c = *x1;
466 y1c = *y1;
467 w1c = *w1;
468 z1c = *z1;
469
470 rho1.real += x1c.real * z1c.real - -x1c.imag * z1c.imag;
471 rho1.imag += x1c.real * z1c.imag + -x1c.imag * z1c.real;
472
473 rho2.real += y1c.real * z1c.real - -y1c.imag * z1c.imag;
474 rho2.imag += y1c.real * z1c.imag + -y1c.imag * z1c.real;
475
476 rho3.real += w1c.real * z1c.real - -w1c.imag * z1c.imag;
477 rho3.imag += w1c.real * z1c.imag + -w1c.imag * z1c.real;
478
479 x1 += inc_x;
480 y1 += inc_y;
481 w1 += inc_w;
482 z1 += inc_z;
483 }
484 }

References i, rho1, rho2, rho3, w1, w1c, x1, x1c, y1, y1c, z1, and z1c.

◆ if() [2/3]

if ( inc_x != 1 || inc_y != 1 || inc_w != 1 || inc_z != 1 )
211 {
212 x1c = *x1;
213 y1c = *y1;
214 w1c = *w1;
215 z1c = *z1;
216
217 rho1 += x1c * z1c;
218 rho2 += y1c * z1c;
219 rho3 += w1c * z1c;
220
221 x1 += inc_x;
222 y1 += inc_y;
223 w1 += inc_w;
224 z1 += inc_z;
225 }

◆ if() [3/3]

if ( n_left > 0 )
249 {
250 for ( i = 0; i < n_left; ++i )
251 {
252 x1c = *x1;
253 y1c = *y1;
254 w1c = *w1;
255 z1c = *z1;
256
257 rho1 += x1c * z1c;
258 rho2 += y1c * z1c;
259 rho3 += w1c * z1c;
260
261 x1 += inc_x;
262 y1 += inc_y;
263 w1 += inc_w;
264 z1 += inc_z;
265 }
266 }

References i, n_left, rho1, rho2, rho3, w1, w1c, x1, x1c, y1, y1c, z1, and z1c.

Variable Documentation

◆ else

else
Initial value:
{
for ( i = 0; i < n; ++i )
{
x1c = *x1;
y1c = *y1;
w1c = *w1;
z1c = *z1;
rho1.real += x1c.real * z1c.real - x1c.imag * z1c.imag;
rho1.imag += x1c.real * z1c.imag + x1c.imag * z1c.real;
rho2.real += y1c.real * z1c.real - y1c.imag * z1c.imag;
rho2.imag += y1c.real * z1c.imag + y1c.imag * z1c.real;
rho3.real += w1c.real * z1c.real - w1c.imag * z1c.imag;
rho3.imag += w1c.real * z1c.imag + w1c.imag * z1c.real;
x1 += inc_x;
y1 += inc_y;
w1 += inc_w;
z1 += inc_z;
}
}
bl1_zscals(beta, rho_yz)
486 {
487 for ( i = 0; i < n; ++i )
488 {
489 x1c = *x1;
490 y1c = *y1;
491 w1c = *w1;
492 z1c = *z1;
493
494 rho1.real += x1c.real * z1c.real - x1c.imag * z1c.imag;
495 rho1.imag += x1c.real * z1c.imag + x1c.imag * z1c.real;
496
497 rho2.real += y1c.real * z1c.real - y1c.imag * z1c.imag;
498 rho2.imag += y1c.real * z1c.imag + y1c.imag * z1c.real;
499
500 rho3.real += w1c.real * z1c.real - w1c.imag * z1c.imag;
501 rho3.imag += w1c.real * z1c.imag + w1c.imag * z1c.real;
502
503 x1 += inc_x;
504 y1 += inc_y;
505 w1 += inc_w;
506 z1 += inc_z;
507 }
508 }

◆ i

int i

Referenced by bl1_ddotsv3(), bl1_zdotsv3(), if(), and if().

◆ imag

rho_wz imag = 0.0

◆ n_left

int n_left

Referenced by bl1_ddotsv3(), and if().

◆ n_pre

int n_pre

Referenced by bl1_ddotsv3().

◆ n_run

int n_run

Referenced by bl1_ddotsv3().

◆ real

rho_wz real = 0.0

◆ rho1

dcomplex rho1

Referenced by bl1_ddotsv3(), for(), if(), and if().

◆ rho2

dcomplex rho2

Referenced by bl1_ddotsv3(), for(), if(), and if().

◆ rho3

dcomplex rho3

Referenced by bl1_ddotsv3(), for(), if(), and if().

◆ rho_wz

* rho_wz = *beta * *rho_wz + rho3

Referenced by bl1_ddotsv3(), and bl1_zdotsv3().

◆ rho_xz

* rho_xz = *beta * *rho_xz + rho1

Referenced by bl1_ddotsv3(), and bl1_zdotsv3().

◆ rho_yz

* rho_yz = *beta * *rho_yz + rho2

Referenced by bl1_ddotsv3(), and bl1_zdotsv3().

◆ w1

◆ w1c

dcomplex w1c

Referenced by bl1_ddotsv3(), for(), if(), and if().

◆ w2c

double w2c

Referenced by for().

◆ x1

x1 = x

Referenced by bl1_ddotsv3(), bl1_zdotsv3(), for(), if(), and if().

◆ x1c

dcomplex x1c

Referenced by bl1_ddotsv3(), for(), if(), and if().

◆ x2c

double x2c

Referenced by for().

◆ y1

y1
Initial value:
{
double* restrict x1

Referenced by bl1_ddotsv3(), bl1_zdotsv3(), for(), if(), and if().

◆ y1c

dcomplex y1c

Referenced by bl1_ddotsv3(), for(), if(), and if().

◆ y2c

double y2c

Referenced by for().

◆ z1

z1 = z

Referenced by bl1_ddotsv3(), bl1_zdotsv3(), for(), if(), and if().

◆ z1c

dcomplex z1c

Referenced by bl1_ddotsv3(), for(), if(), and if().

◆ z2c

double z2c

Referenced by for().