libflame revision_anchor
Functions | Variables
bl1_dotsv2.c File Reference

(r)

Functions

void bl1_sdotsv2 (conj1_t conjxy, int n, float *x, int inc_x, float *y, int inc_y, float *z, int inc_z, float *beta, float *rho_xz, float *rho_yz)
 
void bl1_ddotsv2 (conj1_t conjxy, int n, double *x, int inc_x, double *y, int inc_y, double *z, int inc_z, double *beta, double *rho_xz, double *rho_yz)
 
 if (inc_x !=1||inc_y !=1||inc_z !=1)
 
 for (i=0;i< n_run;++i)
 
 if (n_left > 0)
 
void bl1_cdotsv2 (conj1_t conjxy, int n, scomplex *x, int inc_x, scomplex *y, int inc_y, scomplex *z, int inc_z, scomplex *beta, scomplex *rho_xz, scomplex *rho_yz)
 
void bl1_zdotsv2 (conj1_t conjxy, int n, dcomplex *x, int inc_x, dcomplex *y, int inc_y, dcomplex *z, int inc_z, dcomplex *beta, dcomplex *rho_xz, dcomplex *rho_yz)
 
 if (bl1_is_conj(conjxy))
 
 bl1_zscals (beta, rho_yz)
 

Variables

double *restrict y1
 
double *restrict z1 = z
 
double rho1 = rho1_c
 
double rho2 = rho2_c
 
double x1c
 
double y1c
 
double z1c
 
double x2c
 
double y2c
 
double z2c
 
int i
 
int n_pre
 
int n_run
 
int n_left
 
rho_xz = *beta * *rho_xz + rho1
 
rho_yz = *beta * *rho_yz + rho2
 
 x1 = x
 
rho1 real = 0.0
 
rho1 imag = 0.0
 
 else
 

Function Documentation

◆ bl1_cdotsv2()

void bl1_cdotsv2 ( conj1_t  conjxy,
int  n,
scomplex *x,
int  inc_x,
scomplex *y,
int  inc_y,
scomplex *z,
int  inc_z,
scomplex *beta,
scomplex *rho_xz,
scomplex *rho_yz 
)
243{
244 bl1_abort();
245}
void bl1_abort(void)
Definition bl1_abort.c:13

References bl1_abort().

◆ bl1_ddotsv2()

void bl1_ddotsv2 ( conj1_t  conjxy,
int  n,
double *x,
int  inc_x,
double *y,
int  inc_y,
double *z,
int  inc_z,
double *beta,
double *rho_xz,
double *rho_yz 
)
44{
45 double* restrict x1;
46 double* restrict y1;
47 double* restrict z1;
48 double rho1, rho2;
49 double x1c, y1c, z1c;
50 int i;
51
52 int n_pre;
53 int n_run;
54 int n_left;
55
57 v2df_t x1v, y1v, z1v;
58 v2df_t x2v, y2v, z2v;
59
60 if ( inc_x != 1 ||
61 inc_y != 1 ||
62 inc_z != 1 ) bl1_abort();
63
64 n_pre = 0;
65 if ( ( unsigned long ) z % 16 != 0 )
66 {
67 if ( ( unsigned long ) x % 16 == 0 ||
68 ( unsigned long ) y % 16 == 0 ) bl1_abort();
69
70 n_pre = 1;
71 }
72
73 n_run = ( n - n_pre ) / 4;
74 n_left = ( n - n_pre ) % 4;
75
76 x1 = x;
77 y1 = y;
78 z1 = z;
79
80 rho1 = 0.0;
81 rho2 = 0.0;
82
83 if ( n_pre == 1 )
84 {
85 x1c = *x1;
86 y1c = *y1;
87 z1c = *z1;
88
89 rho1 += x1c * z1c;
90 rho2 += y1c * z1c;
91
92 x1 += inc_x;
93 y1 += inc_y;
94 z1 += inc_z;
95 }
96
99
100 for ( i = 0; i < n_run; ++i )
101 {
102 x1v.v = _mm_load_pd( ( double* )x1 );
103 y1v.v = _mm_load_pd( ( double* )y1 );
104 z1v.v = _mm_load_pd( ( double* )z1 );
105
106 x2v.v = _mm_load_pd( ( double* )(x1 + 2) );
107 y2v.v = _mm_load_pd( ( double* )(y1 + 2) );
108 z2v.v = _mm_load_pd( ( double* )(z1 + 2) );
109
110 rho1v.v += x1v.v * z1v.v;
111 rho2v.v += y1v.v * z1v.v;
112
113 rho1v.v += x2v.v * z2v.v;
114 rho2v.v += y2v.v * z2v.v;
115
116 x1 += 4;
117 y1 += 4;
118 z1 += 4;
119 }
120
121 rho1 += rho1v.d[0] + rho1v.d[1];
122 rho2 += rho2v.d[0] + rho2v.d[1];
123
124 if ( n_left > 0 )
125 {
126 for ( i = 0; i < n_left; ++i )
127 {
128 x1c = *x1;
129 y1c = *y1;
130 z1c = *z1;
131
132 rho1 += x1c * z1c;
133 rho2 += y1c * z1c;
134
135 x1 += inc_x;
136 y1 += inc_y;
137 z1 += inc_z;
138 }
139 }
140
141 *rho_xz = *beta * *rho_xz + rho1;
142 *rho_yz = *beta * *rho_yz + rho2;
143}
double *restrict z1
Definition bl1_dotsv2.c:148
double rho2
Definition bl1_dotsv2.c:149
int n_left
Definition bl1_dotsv2.c:156
int n_pre
Definition bl1_dotsv2.c:154
double rho1
Definition bl1_dotsv2.c:149
double z1c
Definition bl1_dotsv2.c:150
int n_run
Definition bl1_dotsv2.c:155
double y1c
Definition bl1_dotsv2.c:150
* rho_xz
Definition bl1_dotsv2.c:229
x1
Definition bl1_dotsv2.c:374
double *restrict y1
Definition bl1_dotsv2.c:147
int i
Definition bl1_dotsv2.c:152
double x1c
Definition bl1_dotsv2.c:150
* rho_yz
Definition bl1_dotsv2.c:230
Definition blis_type_defs.h:117

References bl1_abort(), v2df_t::d, i, n_left, n_pre, n_run, rho1, rho2, rho_xz, rho_yz, v2df_t::v, x1, x1c, y1, y1c, z1, and z1c.

Referenced by FLA_Fused_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Gerc2_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Uhu_Yhu_Zhu_opd_var1(), FLA_Fused_UYx_ZVx_opd_var1(), and FLA_Fused_UZhu_ZUhu_opd_var1().

◆ bl1_sdotsv2()

void bl1_sdotsv2 ( conj1_t  conjxy,
int  n,
float *x,
int  inc_x,
float *y,
int  inc_y,
float *z,
int  inc_z,
float *beta,
float *rho_xz,
float *rho_yz 
)
30{
31 bl1_abort();
32}

References bl1_abort().

◆ bl1_zdotsv2()

void bl1_zdotsv2 ( conj1_t  conjxy,
int  n,
dcomplex *x,
int  inc_x,
dcomplex *y,
int  inc_y,
dcomplex *z,
int  inc_z,
dcomplex *beta,
dcomplex *rho_xz,
dcomplex *rho_yz 
)
257{
261 int i;
265 v2df_t x1v, x1rv;
266 v2df_t y1v, y1rv;
267
268 x1 = x;
269 y1 = y;
270 z1 = z;
271
272 rho1v.v = _mm_setzero_pd();
273 rho2v.v = _mm_setzero_pd();
274
275 if ( bl1_is_conj( conjxy ) )
276 {
278
279 for ( i = 0; i < n; ++i )
280 {
281 z11v.v = _mm_loaddup_pd( ( double* )&(z1->real) );
282 z12v.v = _mm_loaddup_pd( ( double* )&(z1->imag) );
283
284 x1v.v = _mm_load_pd( ( double* )x1 );
285 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
286 bcac.v = x1rv.v * z11v.v;
287 adbd.v = x1v.v * z12v.v;
288 rho1v.v = rho1v.v + _mm_addsub_pd( bcac.v, adbd.v );
289
290 y1v.v = _mm_load_pd( ( double* )y1 );
291 y1rv.v = _mm_shuffle_pd( y1v.v, y1v.v, _MM_SHUFFLE2 (0,1) );
292 bcac.v = y1rv.v * z11v.v;
293 adbd.v = y1v.v * z12v.v;
294 rho2v.v = rho2v.v + _mm_addsub_pd( bcac.v, adbd.v );
295
296 x1 += inc_x;
297 y1 += inc_y;
298 z1 += inc_z;
299 }
300
301 rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
302 rho2v.v = _mm_shuffle_pd( rho2v.v, rho2v.v, _MM_SHUFFLE2 (0,1) );
303
304 rho1v.d[1] = -rho1v.d[1];
305 rho2v.d[1] = -rho2v.d[1];
306 }
307 else
308 {
310
311 for ( i = 0; i < n; ++i )
312 {
313 z11v.v = _mm_loaddup_pd( ( double* )&(z1->real) );
314 z12v.v = _mm_loaddup_pd( ( double* )&(z1->imag) );
315
316 x1v.v = _mm_load_pd( ( double* )x1 );
317 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
318 cada.v = x1v.v * z11v.v;
319 dbcb.v = x1rv.v * z12v.v;
320 rho1v.v = rho1v.v + _mm_addsub_pd( cada.v, dbcb.v );
321
322 y1v.v = _mm_load_pd( ( double* )y1 );
323 y1rv.v = _mm_shuffle_pd( y1v.v, y1v.v, _MM_SHUFFLE2 (0,1) );
324 cada.v = y1v.v * z11v.v;
325 dbcb.v = y1rv.v * z12v.v;
326 rho2v.v = rho2v.v + _mm_addsub_pd( cada.v, dbcb.v );
327
328 x1 += inc_x;
329 y1 += inc_y;
330 z1 += inc_z;
331 }
332 }
333
334 //bl1_zscals( beta, rho_xz );
335 //bl1_zscals( beta, rho_yz );
336 {
337 v2df_t ab, ba, cc, dd, acbc, bdad;
338
339 ab.v = _mm_load_pd( ( double* )beta );
340 ba.v = _mm_shuffle_pd( ab.v, ab.v, _MM_SHUFFLE2 (0,1) );
341
342 cc.v = _mm_loaddup_pd( ( double* )&(rho_xz->real) );
343 dd.v = _mm_loaddup_pd( ( double* )&(rho_xz->imag) );
344 acbc.v = ab.v * cc.v;
345 bdad.v = ba.v * dd.v;
346 r1v.v = _mm_addsub_pd( acbc.v, bdad.v );
347
348 cc.v = _mm_loaddup_pd( ( double* )&(rho_yz->real) );
349 dd.v = _mm_loaddup_pd( ( double* )&(rho_yz->imag) );
350 acbc.v = ab.v * cc.v;
351 bdad.v = ba.v * dd.v;
352 r2v.v = _mm_addsub_pd( acbc.v, bdad.v );
353 }
354
355 //rho_xz->real = rho_xz->real + rho1.real;
356 //rho_xz->imag = rho_xz->imag + rho1.imag;
357 rho1v.v = r1v.v + rho1v.v;
358 _mm_store_pd( ( double* )rho_xz, rho1v.v );
359
360 //rho_yz->real = rho_yz->real + rho2.real;
361 //rho_yz->imag = rho_yz->imag + rho2.imag;
362 rho2v.v = r2v.v + rho2v.v;
363 _mm_store_pd( ( double* )rho_yz, rho2v.v );
364}
int bl1_is_conj(conj1_t conj)
Definition bl1_is.c:42
Definition blis_type_defs.h:138
__m128d v
Definition blis_type_defs.h:118

References bl1_is_conj(), v2df_t::d, i, rho_xz, rho_yz, v2df_t::v, x1, y1, and z1.

Referenced by FLA_Fused_Ahx_Axpy_Ax_opz_var1(), FLA_Fused_Gerc2_Ahx_Axpy_Ax_opz_var1(), and FLA_Fused_UYx_ZVx_opz_var1().

◆ bl1_zscals()

bl1_zscals ( beta  ,
rho_yz   
)

◆ for()

for ( i = 0; i < n_run; ++i )
196 {
197 x1c = *x1;
198 x2c = *(x1 + 1);
199 y1c = *y1;
200 y2c = *(y1 + 1);
201 z1c = *z1;
202 z2c = *(z1 + 1);
203
204 rho1 += x1c * z1c + x2c * z2c;
205 rho2 += y1c * z1c + y2c * z2c;
206
207 x1 += 2*inc_x;
208 y1 += 2*inc_y;
209 z1 += 2*inc_z;
210 }
double x2c
Definition bl1_dotsv2.c:151
double z2c
Definition bl1_dotsv2.c:151
double y2c
Definition bl1_dotsv2.c:151

References rho1, rho2, x1, x1c, x2c, y1, y1c, y2c, z1, z1c, and z2c.

◆ if() [1/3]

if ( bl1_is_conj(conjxy) )
382 {
383 for ( i = 0; i < n; ++i )
384 {
385 x1c = *x1;
386 y1c = *y1;
387 z1c = *z1;
388
389 rho1.real += x1c.real * z1c.real - -x1c.imag * z1c.imag;
390 rho1.imag += x1c.real * z1c.imag + -x1c.imag * z1c.real;
391
392 rho2.real += y1c.real * z1c.real - -y1c.imag * z1c.imag;
393 rho2.imag += y1c.real * z1c.imag + -y1c.imag * z1c.real;
394
395 x1 += inc_x;
396 y1 += inc_y;
397 z1 += inc_z;
398 }
399 }

References i, rho1, rho2, x1, x1c, y1, y1c, z1, and z1c.

◆ if() [2/3]

if ( inc_x != 1 || inc_y != 1 || inc_z != 1 )
182 {
183 x1c = *x1;
184 y1c = *y1;
185 z1c = *z1;
186
187 rho1 += x1c * z1c;
188 rho2 += y1c * z1c;
189
190 x1 += inc_x;
191 y1 += inc_y;
192 z1 += inc_z;
193 }

◆ if() [3/3]

if ( n_left > 0 )
213 {
214 for ( i = 0; i < n_left; ++i )
215 {
216 x1c = *x1;
217 y1c = *y1;
218 z1c = *z1;
219
220 rho1 += x1c * z1c;
221 rho2 += y1c * z1c;
222
223 x1 += inc_x;
224 y1 += inc_y;
225 z1 += inc_z;
226 }
227 }

References i, n_left, rho1, rho2, x1, x1c, y1, y1c, z1, and z1c.

Variable Documentation

◆ else

else
Initial value:
{
for ( i = 0; i < n; ++i )
{
x1c = *x1;
y1c = *y1;
z1c = *z1;
rho1.real += x1c.real * z1c.real - x1c.imag * z1c.imag;
rho1.imag += x1c.real * z1c.imag + x1c.imag * z1c.real;
rho2.real += y1c.real * z1c.real - y1c.imag * z1c.imag;
rho2.imag += y1c.real * z1c.imag + y1c.imag * z1c.real;
x1 += inc_x;
y1 += inc_y;
z1 += inc_z;
}
}
bl1_zscals(beta, rho_yz)
401 {
402 for ( i = 0; i < n; ++i )
403 {
404 x1c = *x1;
405 y1c = *y1;
406 z1c = *z1;
407
408 rho1.real += x1c.real * z1c.real - x1c.imag * z1c.imag;
409 rho1.imag += x1c.real * z1c.imag + x1c.imag * z1c.real;
410
411 rho2.real += y1c.real * z1c.real - y1c.imag * z1c.imag;
412 rho2.imag += y1c.real * z1c.imag + y1c.imag * z1c.real;
413
414 x1 += inc_x;
415 y1 += inc_y;
416 z1 += inc_z;
417 }
418 }

◆ i

int i

Referenced by bl1_ddotsv2(), bl1_zdotsv2(), if(), and if().

◆ imag

rho_yz imag = 0.0

◆ n_left

int n_left

Referenced by bl1_ddotsv2(), and if().

◆ n_pre

int n_pre

Referenced by bl1_ddotsv2().

◆ n_run

int n_run

Referenced by bl1_ddotsv2().

◆ real

rho_yz real = 0.0

◆ rho1

* rho1 = rho1_c

◆ rho2

* rho2 = rho2_c

Referenced by bl1_ddotsv2(), for(), if(), and if().

◆ rho_xz

* rho_xz = *beta * *rho_xz + rho1

Referenced by bl1_ddotsv2(), and bl1_zdotsv2().

◆ rho_yz

* rho_yz = *beta * *rho_yz + rho2

Referenced by bl1_ddotsv2(), and bl1_zdotsv2().

◆ x1

x1 = x

◆ x1c

dcomplex x1c

Referenced by bl1_ddotsv2(), for(), if(), and if().

◆ x2c

double x2c

Referenced by for().

◆ y1

◆ y1c

dcomplex y1c

Referenced by bl1_ddotsv2(), for(), if(), and if().

◆ y2c

double y2c

Referenced by for().

◆ z1

dcomplex *restrict z1 = z

◆ z1c

dcomplex z1c

Referenced by bl1_ddotsv2(), for(), if(), and if().

◆ z2c

double z2c

Referenced by for().