libflame revision_anchor
Functions
blis_prototypes_fused1.h File Reference

(r)

Go to the source code of this file.

Functions

void bl1_saxmyv2 (conj1_t conjx, int n, float *alpha, float *beta, float *x, int inc_x, float *y, int inc_y, float *z, int inc_z)
 
void bl1_daxmyv2 (conj1_t conjx, int n, double *alpha, double *beta, double *x, int inc_x, double *y, int inc_y, double *z, int inc_z)
 
void bl1_caxmyv2 (conj1_t conjx, int n, scomplex *alpha, scomplex *beta, scomplex *x, int inc_x, scomplex *y, int inc_y, scomplex *z, int inc_z)
 
void bl1_zaxmyv2 (conj1_t conjx, int n, dcomplex *alpha, dcomplex *beta, dcomplex *x, int inc_x, dcomplex *y, int inc_y, dcomplex *z, int inc_z)
 
void bl1_saxpyv2b (int n, float *beta1, float *beta2, float *a1, int inc_a1, float *a2, int inc_a2, float *w, int inc_w)
 
void bl1_daxpyv2b (int n, double *beta1, double *beta2, double *a1, int inc_a1, double *a2, int inc_a2, double *w, int inc_w)
 
void bl1_caxpyv2b (int n, scomplex *beta1, scomplex *beta2, scomplex *a1, int inc_a1, scomplex *a2, int inc_a2, scomplex *w, int inc_w)
 
void bl1_zaxpyv2b (int n, dcomplex *beta1, dcomplex *beta2, dcomplex *a1, int inc_a1, dcomplex *a2, int inc_a2, dcomplex *w, int inc_w)
 
void bl1_saxpyv3b (int n, float *beta1, float *beta2, float *beta3, float *a1, int inc_a1, float *a2, int inc_a2, float *a3, int inc_a3, float *w, int inc_w)
 
void bl1_daxpyv3b (int n, double *beta1, double *beta2, double *beta3, double *a1, int inc_a1, double *a2, int inc_a2, double *a3, int inc_a3, double *w, int inc_w)
 
void bl1_caxpyv3b (int n, scomplex *beta1, scomplex *beta2, scomplex *beta3, scomplex *a1, int inc_a1, scomplex *a2, int inc_a2, scomplex *a3, int inc_a3, scomplex *w, int inc_w)
 
void bl1_zaxpyv3b (int n, dcomplex *beta1, dcomplex *beta2, dcomplex *beta3, dcomplex *a1, int inc_a1, dcomplex *a2, int inc_a2, dcomplex *a3, int inc_a3, dcomplex *w, int inc_w)
 
void bl1_saxpyv2bdotaxpy (int n, float *beta, float *u, int inc_u, float *gamma, float *z, int inc_z, float *a, int inc_a, float *x, int inc_x, float *kappa, float *rho, float *w, int inc_w)
 
void bl1_daxpyv2bdotaxpy (int n, double *beta, double *u, int inc_u, double *gamma, double *z, int inc_z, double *a, int inc_a, double *x, int inc_x, double *kappa, double *rho, double *w, int inc_w)
 
void bl1_caxpyv2bdotaxpy (int n, scomplex *beta, scomplex *u, int inc_u, scomplex *gamma, scomplex *z, int inc_z, scomplex *a, int inc_a, scomplex *x, int inc_x, scomplex *kappa, scomplex *rho, scomplex *w, int inc_w)
 
void bl1_zaxpyv2bdotaxpy (int n, dcomplex *beta, dcomplex *u, int inc_u, dcomplex *gamma, dcomplex *z, int inc_z, dcomplex *a, int inc_a, dcomplex *x, int inc_x, dcomplex *kappa, dcomplex *rho, dcomplex *w, int inc_w)
 
void bl1_sdotsv2 (conj1_t conjxy, int n, float *x, int inc_x, float *y, int inc_y, float *z, int inc_z, float *beta, float *rho_xz, float *rho_yz)
 
void bl1_ddotsv2 (conj1_t conjxy, int n, double *x, int inc_x, double *y, int inc_y, double *z, int inc_z, double *beta, double *rho_xz, double *rho_yz)
 
void bl1_cdotsv2 (conj1_t conjxy, int n, scomplex *x, int inc_x, scomplex *y, int inc_y, scomplex *z, int inc_z, scomplex *beta, scomplex *rho_xz, scomplex *rho_yz)
 
void bl1_zdotsv2 (conj1_t conjxy, int n, dcomplex *x, int inc_x, dcomplex *y, int inc_y, dcomplex *z, int inc_z, dcomplex *beta, dcomplex *rho_xz, dcomplex *rho_yz)
 
void bl1_sdotsv3 (conj1_t conjxyw, int n, float *x, int inc_x, float *y, int inc_y, float *w, int inc_w, float *z, int inc_z, float *beta, float *rho_xz, float *rho_yz, float *rho_wz)
 
void bl1_ddotsv3 (conj1_t conjxyw, int n, double *x, int inc_x, double *y, int inc_y, double *w, int inc_w, double *z, int inc_z, double *beta, double *rho_xz, double *rho_yz, double *rho_wz)
 
void bl1_cdotsv3 (conj1_t conjxyw, int n, scomplex *x, int inc_x, scomplex *y, int inc_y, scomplex *w, int inc_w, scomplex *z, int inc_z, scomplex *beta, scomplex *rho_xz, scomplex *rho_yz, scomplex *rho_wz)
 
void bl1_zdotsv3 (conj1_t conjxyw, int n, dcomplex *x, int inc_x, dcomplex *y, int inc_y, dcomplex *w, int inc_w, dcomplex *z, int inc_z, dcomplex *beta, dcomplex *rho_xz, dcomplex *rho_yz, dcomplex *rho_wz)
 
void bl1_sdotaxpy (int n, float *a, int inc_a, float *x, int inc_x, float *kappa, float *rho, float *w, int inc_w)
 
void bl1_ddotaxpy (int n, double *a, int inc_a, double *x, int inc_x, double *kappa, double *rho, double *w, int inc_w)
 
void bl1_cdotaxpy (int n, scomplex *a, int inc_a, scomplex *x, int inc_x, scomplex *kappa, scomplex *rho, scomplex *w, int inc_w)
 
void bl1_zdotaxpy (int n, dcomplex *a, int inc_a, dcomplex *x, int inc_x, dcomplex *kappa, dcomplex *rho, dcomplex *w, int inc_w)
 
void bl1_sdotaxmyv2 (int n, float *alpha, float *beta, float *x, int inc_x, float *u, int inc_u, float *rho, float *y, int inc_y, float *z, int inc_z)
 
void bl1_ddotaxmyv2 (int n, double *alpha, double *beta, double *x, int inc_x, double *u, int inc_u, double *rho, double *y, int inc_y, double *z, int inc_z)
 
void bl1_cdotaxmyv2 (int n, scomplex *alpha, scomplex *beta, scomplex *x, int inc_x, scomplex *u, int inc_u, scomplex *rho, scomplex *y, int inc_y, scomplex *z, int inc_z)
 
void bl1_zdotaxmyv2 (int n, dcomplex *alpha, dcomplex *beta, dcomplex *x, int inc_x, dcomplex *u, int inc_u, dcomplex *rho, dcomplex *y, int inc_y, dcomplex *z, int inc_z)
 
void bl1_sdotv2axpyv2b (int n, float *a1, int inc_a1, float *a2, int inc_a2, float *x, int inc_x, float *kappa1, float *kappa2, float *rho1, float *rho2, float *w, int inc_w)
 
void bl1_ddotv2axpyv2b (int n, double *a1, int inc_a1, double *a2, int inc_a2, double *x, int inc_x, double *kappa1, double *kappa2, double *rho1, double *rho2, double *w, int inc_w)
 
void bl1_cdotv2axpyv2b (int n, scomplex *a1, int inc_a1, scomplex *a2, int inc_a2, scomplex *x, int inc_x, scomplex *kappa1, scomplex *kappa2, scomplex *rho1, scomplex *rho2, scomplex *w, int inc_w)
 
void bl1_zdotv2axpyv2b (int n, dcomplex *a1, int inc_a1, dcomplex *a2, int inc_a2, dcomplex *x, int inc_x, dcomplex *kappa1, dcomplex *kappa2, dcomplex *rho1, dcomplex *rho2, dcomplex *w, int inc_w)
 
void bl1_zaxpyv2bdots (int n, dcomplex *alpha1, dcomplex *alpha2, dcomplex *x1, int inc_x1, dcomplex *x2, int inc_x2, dcomplex *y, int inc_y, dcomplex *u, int inc_u, dcomplex *beta, dcomplex *rho)
 

Function Documentation

◆ bl1_caxmyv2()

void bl1_caxmyv2 ( conj1_t  conjx,
int  n,
scomplex *alpha,
scomplex *beta,
scomplex *x,
int  inc_x,
scomplex *y,
int  inc_y,
scomplex *z,
int  inc_z 
)
245{
246 bl1_abort();
247}
void bl1_abort(void)
Definition bl1_abort.c:13

References bl1_abort().

◆ bl1_caxpyv2b()

void bl1_caxpyv2b ( int  n,
scomplex *beta1,
scomplex *beta2,
scomplex *a1,
int  inc_a1,
scomplex *a2,
int  inc_a2,
scomplex *w,
int  inc_w 
)
205{
206 bl1_abort();
207}

References bl1_abort().

◆ bl1_caxpyv2bdotaxpy()

void bl1_caxpyv2bdotaxpy ( int  n,
scomplex *beta,
scomplex *u,
int  inc_u,
scomplex *gamma,
scomplex *z,
int  inc_z,
scomplex *a,
int  inc_a,
scomplex *x,
int  inc_x,
scomplex *kappa,
scomplex *rho,
scomplex *w,
int  inc_w 
)
337{
338 bl1_abort();
339}

References bl1_abort().

◆ bl1_caxpyv3b()

void bl1_caxpyv3b ( int  n,
scomplex *beta1,
scomplex *beta2,
scomplex *beta3,
scomplex *a1,
int  inc_a1,
scomplex *a2,
int  inc_a2,
scomplex *a3,
int  inc_a3,
scomplex *w,
int  inc_w 
)
219{
220 bl1_abort();
221}

References bl1_abort().

◆ bl1_cdotaxmyv2()

void bl1_cdotaxmyv2 ( int  n,
scomplex *alpha,
scomplex *beta,
scomplex *x,
int  inc_x,
scomplex *u,
int  inc_u,
scomplex *rho,
scomplex *y,
int  inc_y,
scomplex *z,
int  inc_z 
)
271{
272 bl1_abort();
273}

References bl1_abort().

◆ bl1_cdotaxpy()

void bl1_cdotaxpy ( int  n,
scomplex *a,
int  inc_a,
scomplex *x,
int  inc_x,
scomplex *kappa,
scomplex *rho,
scomplex *w,
int  inc_w 
)
253{
254 bl1_abort();
255}

References bl1_abort().

◆ bl1_cdotsv2()

void bl1_cdotsv2 ( conj1_t  conjxy,
int  n,
scomplex *x,
int  inc_x,
scomplex *y,
int  inc_y,
scomplex *z,
int  inc_z,
scomplex *beta,
scomplex *rho_xz,
scomplex *rho_yz 
)
243{
244 bl1_abort();
245}

References bl1_abort().

◆ bl1_cdotsv3()

void bl1_cdotsv3 ( conj1_t  conjxyw,
int  n,
scomplex *x,
int  inc_x,
scomplex *y,
int  inc_y,
scomplex *w,
int  inc_w,
scomplex *z,
int  inc_z,
scomplex *beta,
scomplex *rho_xz,
scomplex *rho_yz,
scomplex *rho_wz 
)
285{
286 bl1_abort();
287}

References bl1_abort().

◆ bl1_cdotv2axpyv2b()

void bl1_cdotv2axpyv2b ( int  n,
scomplex *a1,
int  inc_a1,
scomplex *a2,
int  inc_a2,
scomplex *x,
int  inc_x,
scomplex *kappa1,
scomplex *kappa2,
scomplex *rho1,
scomplex *rho2,
scomplex *w,
int  inc_w 
)
326{
327 bl1_abort();
328}

References bl1_abort().

◆ bl1_daxmyv2()

void bl1_daxmyv2 ( conj1_t  conjx,
int  n,
double *alpha,
double *beta,
double *x,
int  inc_x,
double *y,
int  inc_y,
double *z,
int  inc_z 
)
42{
43 double* restrict chi1;
44 double* restrict psi1;
45 double* restrict zeta1;
46 int i;
47
48 int n_pre;
49 int n_run;
50 int n_left;
51
52 v2df_t a1v, b1v;
53 v2df_t x1v, y1v, z1v;
54 v2df_t x2v, y2v, z2v;
55
56 if ( inc_x != 1 ||
57 inc_y != 1 ||
58 inc_z != 1 ) bl1_abort();
59
60 n_pre = 0;
61 if ( ( unsigned long ) z % 16 != 0 )
62 {
63 if ( ( unsigned long ) x % 16 == 0 ||
64 ( unsigned long ) y % 16 == 0 ) bl1_abort();
65
66 n_pre = 1;
67 }
68
69 n_run = ( n - n_pre ) / 4;
70 n_left = ( n - n_pre ) % 4;
71
72 chi1 = x;
73 psi1 = y;
74 zeta1 = z;
75
76 if ( n_pre == 1 )
77 {
78 double alpha_c = *alpha;
79 double beta_c = *beta;
80 double chi1_c = *chi1;
81
82 *psi1 -= alpha_c * chi1_c;
83 *zeta1 -= beta_c * chi1_c;
84
85 chi1 += inc_x;
86 psi1 += inc_y;
87 zeta1 += inc_z;
88 }
89
90 a1v.v = _mm_loaddup_pd( ( double* )alpha );
91 b1v.v = _mm_loaddup_pd( ( double* )beta );
92
93 for ( i = 0; i < n_run; ++i )
94 {
95 x1v.v = _mm_load_pd( ( double* )chi1 );
96 y1v.v = _mm_load_pd( ( double* )psi1 );
97 z1v.v = _mm_load_pd( ( double* )zeta1 );
98
99 x2v.v = _mm_load_pd( ( double* )(chi1 + 2) );
100 y2v.v = _mm_load_pd( ( double* )(psi1 + 2) );
101 z2v.v = _mm_load_pd( ( double* )(zeta1 + 2) );
102
103 y1v.v = y1v.v - a1v.v * x1v.v;
104 z1v.v = z1v.v - b1v.v * x1v.v;
105
106 _mm_store_pd( ( double* )psi1, y1v.v );
107 _mm_store_pd( ( double* )zeta1, z1v.v );
108
109 y2v.v = y2v.v - a1v.v * x2v.v;
110 z2v.v = z2v.v - b1v.v * x2v.v;
111
112 _mm_store_pd( ( double* )(psi1 + 2), y2v.v );
113 _mm_store_pd( ( double* )(zeta1 + 2), z2v.v );
114
115 chi1 += 4;
116 psi1 += 4;
117 zeta1 += 4;
118 }
119
120 if ( n_left > 0 )
121 {
122 double alpha_c = *alpha;
123 double beta_c = *beta;
124
125 for( i = 0; i < n_left; ++i )
126 {
127 double chi1_c = *chi1;
128
129 *psi1 -= alpha_c * chi1_c;
130 *zeta1 -= beta_c * chi1_c;
131
132 chi1 += inc_x;
133 psi1 += inc_y;
134 zeta1 += inc_z;
135 }
136 }
137}
double *restrict zeta1
Definition bl1_axmyv2.c:142
double *restrict psi1
Definition bl1_axmyv2.c:141
double beta_c
Definition bl1_axmyv2.c:144
double alpha_c
Definition bl1_axmyv2.c:143
int n_left
Definition bl1_axmyv2.c:149
int n_pre
Definition bl1_axmyv2.c:147
int n_run
Definition bl1_axmyv2.c:148
int i
Definition bl1_axmyv2.c:145
chi1
Definition bl1_axmyv2.c:366
Definition blis_type_defs.h:117

References alpha_c, beta_c, bl1_abort(), chi1, i, n_left, n_pre, n_run, psi1, v2df_t::v, and zeta1.

Referenced by FLA_Fused_UYx_ZVx_opd_var1().

◆ bl1_daxpyv2b()

void bl1_daxpyv2b ( int  n,
double *beta1,
double *beta2,
double *a1,
int  inc_a1,
double *a2,
int  inc_a2,
double *w,
int  inc_w 
)
38{
39 double* restrict chi1;
40 double* restrict chi2;
41 double* restrict psi1;
42 int i;
43
44 int n_pre;
45 int n_run;
46 int n_left;
47
48 v2df_t a1v, a2v;
49 v2df_t x11v, x12v;
50 v2df_t x21v, x22v;
51 v2df_t y1v;
52 v2df_t y2v;
53
54 if ( inc_x1 != 1 ||
55 inc_x2 != 1 ||
56 inc_y != 1 ) bl1_abort();
57
58 n_pre = 0;
59 if ( ( unsigned long ) y % 16 != 0 )
60 {
61 if ( ( unsigned long ) x1 % 16 == 0 ||
62 ( unsigned long ) x2 % 16 == 0 ) bl1_abort();
63
64 n_pre = 1;
65 }
66
67 n_run = ( n - n_pre ) / 4;
68 n_left = ( n - n_pre ) % 4;
69
70 chi1 = x1;
71 chi2 = x2;
72 psi1 = y;
73
74 if ( n_pre == 1 )
75 {
76 double alpha1_c = *alpha1;
77 double alpha2_c = *alpha2;
78 double chi11_c = *chi1;
79 double chi12_c = *chi2;
80 double temp1;
81
82 // psi1 = psi1 + alpha1 * chi11 + alpha2 * chi12;
83 temp1 = alpha1_c * chi11_c + alpha2_c * chi12_c;
84 *psi1 = *psi1 + temp1;
85
86 chi1 += inc_x1;
87 chi2 += inc_x2;
88 psi1 += inc_y;
89 }
90
91 a1v.v = _mm_loaddup_pd( ( double* )alpha1 );
92 a2v.v = _mm_loaddup_pd( ( double* )alpha2 );
93
94 for ( i = 0; i < n_run; ++i )
95 {
96 x11v.v = _mm_load_pd( ( double* )chi1 );
97 x12v.v = _mm_load_pd( ( double* )chi2 );
98 y1v.v = _mm_load_pd( ( double* )psi1 );
99
100 x21v.v = _mm_load_pd( ( double* )(chi1 + 2) );
101 x22v.v = _mm_load_pd( ( double* )(chi2 + 2) );
102 y2v.v = _mm_load_pd( ( double* )(psi1 + 2) );
103
104 y1v.v += a1v.v * x11v.v + a2v.v * x12v.v;
105 y2v.v += a1v.v * x21v.v + a2v.v * x22v.v;
106
107 _mm_store_pd( ( double* )psi1, y1v.v );
108 _mm_store_pd( ( double* )(psi1 + 2), y2v.v );
109
110 //chi1 += step_x1;
111 //chi2 += step_x2;
112 //psi1 += step_y;
113 chi1 += 4;
114 chi2 += 4;
115 psi1 += 4;
116 }
117
118 if ( n_left > 0 )
119 {
120 double alpha1_c = *alpha1;
121 double alpha2_c = *alpha2;
122
123 for ( i = 0; i < n_left; ++i )
124 {
125 double chi11_c = *chi1;
126 double chi12_c = *chi2;
127 double psi1_c = *psi1;
128 double temp1;
129
130 temp1 = alpha1_c * chi11_c + alpha2_c * chi12_c;
131 *psi1 = psi1_c + temp1;
132
133 chi1 += inc_x1;
134 chi2 += inc_x2;
135 psi1 += inc_y;
136 }
137 }
138}
chi1
Definition bl1_axpyv2b.c:156
int n_left
Definition bl1_axpyv2b.c:151
int n_run
Definition bl1_axpyv2b.c:150
double *restrict psi1
Definition bl1_axpyv2b.c:143
int i
Definition bl1_axpyv2b.c:148
double temp1
Definition bl1_axpyv2b.c:146
double *restrict chi2
Definition bl1_axpyv2b.c:142
double alpha1_c
Definition bl1_axpyv2b.c:144
double alpha2_c
Definition bl1_axpyv2b.c:145
double *restrict alpha1
Definition bl1_axpyv2bdotaxpy.c:198
x1
Definition bl1_dotsv2.c:374
double *restrict alpha2
Definition bl1_dotv2axpyv2b.c:188

References alpha1, alpha1_c, alpha2, alpha2_c, bl1_abort(), chi1, chi2, i, n_left, n_pre, n_run, psi1, temp1, v2df_t::v, and x1.

Referenced by FLA_Fused_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Gerc2_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Gerc2_opd_var1(), and FLA_Fused_UZhu_ZUhu_opd_var1().

◆ bl1_daxpyv2bdotaxpy()

void bl1_daxpyv2bdotaxpy ( int  n,
double *beta,
double *u,
int  inc_u,
double *gamma,
double *z,
int  inc_z,
double *a,
int  inc_a,
double *x,
int  inc_x,
double *kappa,
double *rho,
double *w,
int  inc_w 
)
47{
48 double* restrict upsilon1;
49 double* restrict zeta1;
50 double* restrict alpha1;
51 double* restrict chi1;
52 double* restrict omega1;
53 double rho_c;
54 int i;
55 v2df_t b1v, g1v, k1v;
57 v2df_t u1v, z1v, a1v;
58 v2df_t u2v, z2v, a2v;
59 v2df_t x1v, w1v;
60 v2df_t x2v, w2v;
61
62 int n_pre;
63 int n_run;
64 int n_left;
65
66 n_pre = 0;
67 if ( ( unsigned long ) a % 16 != 0 )
68 {
69 if ( ( unsigned long ) u % 16 == 0 ||
70 ( unsigned long ) z % 16 == 0 ||
71 ( unsigned long ) x % 16 == 0 ||
72 ( unsigned long ) w % 16 == 0 ) bl1_abort();
73
74 n_pre = 1;
75 }
76
77 n_run = ( n - n_pre ) / 4;
78 n_left = ( n - n_pre ) % 4;
79
80 upsilon1 = u;
81 zeta1 = z;
82 alpha1 = a;
83 chi1 = x;
84 omega1 = w;
85
86
87 rho_c = 0.0;
88
89 if ( n_pre == 1 )
90 {
91 double beta_c = *beta;
92 double gamma_c = *gamma;
93 double kappa_c = *kappa;
94
95 double upsilon1_c = *upsilon1;
96 double zeta1_c = *zeta1;
97 double alpha1_c = *alpha1;
98 double chi1_c = *chi1;
99 double omega1_c = *omega1;
100
102 rho_c += alpha1_c * chi1_c;
104
105 *alpha1 = alpha1_c;
106 *omega1 = omega1_c;
107
108 upsilon1 += inc_u;
109 zeta1 += inc_z;
110 alpha1 += inc_a;
111 chi1 += inc_x;
112 omega1 += inc_w;
113 }
114
115 b1v.v = _mm_loaddup_pd( ( double* )beta );
116 g1v.v = _mm_loaddup_pd( ( double* )gamma );
117 k1v.v = _mm_loaddup_pd( ( double* )kappa );
118
119 rhov.v = _mm_setzero_pd();
120
121 for ( i = 0; i < n_run; ++i )
122 {
123 u1v.v = _mm_load_pd( ( double* )upsilon1 );
124 z1v.v = _mm_load_pd( ( double* )zeta1 );
125 a1v.v = _mm_load_pd( ( double* )alpha1 );
126
127 a1v.v += b1v.v * u1v.v + g1v.v * z1v.v;
128
129 u2v.v = _mm_load_pd( ( double* )(upsilon1 + 2) );
130 z2v.v = _mm_load_pd( ( double* )(zeta1 + 2) );
131 a2v.v = _mm_load_pd( ( double* )(alpha1 + 2) );
132
133 a2v.v += b1v.v * u2v.v + g1v.v * z2v.v;
134
135 x1v.v = _mm_load_pd( ( double* )chi1 );
136 x2v.v = _mm_load_pd( ( double* )(chi1 + 2) );
137
138 w1v.v = _mm_load_pd( ( double* )omega1 );
139 w2v.v = _mm_load_pd( ( double* )(omega1 + 2) );
140
141 rhov.v += a1v.v * x1v.v;
142 rhov.v += a2v.v * x2v.v;
143
144 w1v.v += k1v.v * a1v.v;
145 w2v.v += k1v.v * a2v.v;
146
147 _mm_store_pd( ( double* )alpha1, a1v.v );
148 _mm_store_pd( ( double* )(alpha1 + 2), a2v.v );
149
150 _mm_store_pd( ( double* )omega1, w1v.v );
151 _mm_store_pd( ( double* )(omega1 + 2), w2v.v );
152
153
154 upsilon1 += 4;
155 zeta1 += 4;
156 alpha1 += 4;
157 chi1 += 4;
158 omega1 += 4;
159 }
160
161 rho_c += rhov.d[0] + rhov.d[1];
162
163 if ( n_left > 0 )
164 {
165 double beta_c = *beta;
166 double gamma_c = *gamma;
167 double kappa_c = *kappa;
168
169 for ( i = 0; i < n_left; ++i )
170 {
171 double upsilon1_c = *upsilon1;
172 double zeta1_c = *zeta1;
173 double alpha1_c = *alpha1;
174 double chi1_c = *chi1;
175 double omega1_c = *omega1;
176
178 rho_c += alpha1_c * chi1_c;
180
181 *alpha1 = alpha1_c;
182 *omega1 = omega1_c;
183
184 upsilon1 += inc_u;
185 zeta1 += inc_z;
186 alpha1 += inc_a;
187 chi1 += inc_x;
188 omega1 += inc_w;
189 }
190 }
191
192 *rho = rho_c;
193}
int n_left
Definition bl1_axpyv2bdotaxpy.c:209
double *restrict chi1
Definition bl1_axpyv2bdotaxpy.c:199
upsilon1
Definition bl1_axpyv2bdotaxpy.c:225
double beta_c
Definition bl1_axpyv2bdotaxpy.c:201
double rho_c
Definition bl1_axpyv2bdotaxpy.c:204
double kappa_c
Definition bl1_axpyv2bdotaxpy.c:203
* rho
Definition bl1_axpyv2bdotaxpy.c:322
double *restrict zeta1
Definition bl1_axpyv2bdotaxpy.c:197
int i
Definition bl1_axpyv2bdotaxpy.c:205
int n_pre
Definition bl1_axpyv2bdotaxpy.c:207
double gamma_c
Definition bl1_axpyv2bdotaxpy.c:202
double *restrict omega1
Definition bl1_axpyv2bdotaxpy.c:200
int n_run
Definition bl1_axpyv2bdotaxpy.c:208

References alpha1, alpha1_c, beta_c, bl1_abort(), chi1, v2df_t::d, gamma_c, i, kappa_c, n_left, n_pre, n_run, omega1, rho, rho_c, upsilon1, v2df_t::v, and zeta1.

Referenced by FLA_Fused_Gerc2_Ahx_Ax_opd_var1(), and FLA_Fused_Her2_Ax_l_opd_var1().

◆ bl1_daxpyv3b()

void bl1_daxpyv3b ( int  n,
double *beta1,
double *beta2,
double *beta3,
double *a1,
int  inc_a1,
double *a2,
int  inc_a2,
double *a3,
int  inc_a3,
double *w,
int  inc_w 
)
43{
44 double* restrict chi1;
45 double* restrict chi2;
46 double* restrict chi3;
47 double* restrict psi1;
48 int i;
49
50 int n_pre;
51 int n_run;
52 int n_left;
53
54 v2df_t a1v, a2v, a3v;
57 v2df_t y1v;
58 v2df_t y2v;
59
60 if ( inc_x1 != 1 ||
61 inc_x2 != 1 ||
62 inc_x3 != 1 ||
63 inc_y != 1 ) bl1_abort();
64
65 n_pre = 0;
66 if ( ( unsigned long ) y % 16 != 0 )
67 {
68 if ( ( unsigned long ) x1 % 16 == 0 ||
69 ( unsigned long ) x2 % 16 == 0 ||
70 ( unsigned long ) x3 % 16 == 0 ) bl1_abort();
71
72 n_pre = 1;
73 }
74
75 n_run = ( n - n_pre ) / 4;
76 n_left = ( n - n_pre ) % 4;
77
78 chi1 = x1;
79 chi2 = x2;
80 chi3 = x3;
81 psi1 = y;
82
83 if ( n_pre == 1 )
84 {
85 double alpha1_c = *alpha1;
86 double alpha2_c = *alpha2;
87 double alpha3_c = *alpha3;
88 double chi11_c = *chi1;
89 double chi12_c = *chi2;
90 double chi13_c = *chi3;
91
93
94 chi1 += inc_x1;
95 chi2 += inc_x2;
96 chi3 += inc_x3;
97 psi1 += inc_y;
98 }
99
100 a1v.v = _mm_loaddup_pd( ( double* )alpha1 );
101 a2v.v = _mm_loaddup_pd( ( double* )alpha2 );
102 a3v.v = _mm_loaddup_pd( ( double* )alpha3 );
103
104 for ( i = 0; i < n_run; ++i )
105 {
106 x11v.v = _mm_load_pd( ( double* )chi1 );
107 x12v.v = _mm_load_pd( ( double* )chi2 );
108 x13v.v = _mm_load_pd( ( double* )chi3 );
109 y1v.v = _mm_load_pd( ( double* )psi1 );
110
111 y1v.v += a1v.v * x11v.v + a2v.v * x12v.v + a3v.v * x13v.v;
112
113 _mm_store_pd( ( double* )psi1, y1v.v );
114
115 x21v.v = _mm_load_pd( ( double* )(chi1 + 2) );
116 x22v.v = _mm_load_pd( ( double* )(chi2 + 2) );
117 x23v.v = _mm_load_pd( ( double* )(chi3 + 2) );
118 y2v.v = _mm_load_pd( ( double* )(psi1 + 2) );
119
120 y2v.v += a1v.v * x21v.v + a2v.v * x22v.v + a3v.v * x23v.v;
121
122 _mm_store_pd( ( double* )(psi1 + 2), y2v.v );
123
124 chi1 += 4;
125 chi2 += 4;
126 chi3 += 4;
127 psi1 += 4;
128 }
129
130 if ( n_left > 0 )
131 {
132 double alpha1_c = *alpha1;
133 double alpha2_c = *alpha2;
134 double alpha3_c = *alpha3;
135
136 for ( i = 0; i < n_left; ++i )
137 {
138 double chi11_c = *chi1;
139 double chi12_c = *chi2;
140 double chi13_c = *chi3;
141
143
144 chi1 += inc_x1;
145 chi2 += inc_x2;
146 chi3 += inc_x3;
147 psi1 += inc_y;
148 }
149 }
150}
chi1
Definition bl1_axpyv3b.c:168
int n_left
Definition bl1_axpyv3b.c:163
double *restrict chi2
Definition bl1_axpyv3b.c:154
double alpha1_c
Definition bl1_axpyv3b.c:156
int n_run
Definition bl1_axpyv3b.c:162
double *restrict psi1
Definition bl1_axpyv3b.c:155
int i
Definition bl1_axpyv3b.c:160
double alpha2_c
Definition bl1_axpyv3b.c:157

References alpha1, alpha1_c, alpha2, alpha2_c, bl1_abort(), chi1, chi2, i, n_left, n_pre, n_run, psi1, v2df_t::v, and x1.

◆ bl1_ddotaxmyv2()

void bl1_ddotaxmyv2 ( int  n,
double *alpha,
double *beta,
double *x,
int  inc_x,
double *u,
int  inc_u,
double *rho,
double *y,
int  inc_y,
double *z,
int  inc_z 
)
43{
44 double* restrict chi1;
45 double* restrict upsilon1;
46 double* restrict psi1;
47 double* restrict zeta1;
48 double rho_c;
49 int i;
50
51 int n_pre;
52 int n_run;
53 int n_left;
54
55 v2df_t a1v, b1v;
57 v2df_t x1v, u1v, y1v, z1v;
58
59 if ( inc_x != 1 ||
60 inc_u != 1 ||
61 inc_y != 1 ||
62 inc_z != 1 ) bl1_abort();
63
64 n_pre = 0;
65 if ( ( unsigned long ) z % 16 != 0 )
66 {
67 if ( ( unsigned long ) x % 16 == 0 ||
68 ( unsigned long ) u % 16 == 0 ||
69 ( unsigned long ) y % 16 == 0 ) bl1_abort();
70
71 n_pre = 1;
72 }
73
74 n_run = ( n - n_pre ) / 2;
75 n_left = ( n - n_pre ) % 2;
76
77 chi1 = x;
78 upsilon1 = u;
79 psi1 = y;
80 zeta1 = z;
81
82 rho_c = 0.0;
83
84 if ( n_pre == 1 )
85 {
86 double alpha_c = *alpha;
87 double beta_c = *beta;
88 double chi1_c = *chi1;
89 double upsilon_c = *upsilon1;
90
92 *psi1 -= alpha_c * chi1_c;
93 *zeta1 -= beta_c * chi1_c;
94
95 chi1 += inc_x;
96 upsilon1 += inc_u;
97 psi1 += inc_y;
98 zeta1 += inc_z;
99 }
100
101 a1v.v = _mm_loaddup_pd( ( double* )alpha );
102 b1v.v = _mm_loaddup_pd( ( double* )beta );
103
104 rho1v.v = _mm_setzero_pd();
105
106 for ( i = 0; i < n_run; ++i )
107 {
108 x1v.v = _mm_load_pd( ( double* )chi1 );
109 u1v.v = _mm_load_pd( ( double* )upsilon1 );
110 y1v.v = _mm_load_pd( ( double* )psi1 );
111 z1v.v = _mm_load_pd( ( double* )zeta1 );
112
113 rho1v.v += x1v.v * u1v.v;
114 y1v.v -= a1v.v * x1v.v;
115 z1v.v -= b1v.v * x1v.v;
116
117 _mm_store_pd( ( double* )psi1, y1v.v );
118 _mm_store_pd( ( double* )zeta1, z1v.v );
119
120 chi1 += 2;
121 upsilon1 += 2;
122 psi1 += 2;
123 zeta1 += 2;
124 }
125
126 rho_c += rho1v.d[0] + rho1v.d[1];
127
128 if ( n_left > 0 )
129 {
130 double alpha_c = *alpha;
131 double beta_c = *beta;
132
133 for( i = 0; i < n_left; ++i )
134 {
135 double chi1_c = *chi1;
136 double upsilon_c = *upsilon1;
137
139 *psi1 -= alpha_c * chi1_c;
140 *zeta1 -= beta_c * chi1_c;
141
142 chi1 += inc_x;
143 upsilon1 += inc_u;
144 psi1 += inc_y;
145 zeta1 += inc_z;
146 }
147 }
148
149 *rho = rho_c;
150}
double beta_c
Definition bl1_dotaxmyv2.c:158
double alpha_c
Definition bl1_dotaxmyv2.c:157
int n_left
Definition bl1_dotaxmyv2.c:164
double *restrict upsilon1
Definition bl1_dotaxmyv2.c:154
int n_pre
Definition bl1_dotaxmyv2.c:162
double rho_c
Definition bl1_dotaxmyv2.c:159
double *restrict psi1
Definition bl1_dotaxmyv2.c:155
int n_run
Definition bl1_dotaxmyv2.c:163
* rho
Definition bl1_dotaxmyv2.c:258
int i
Definition bl1_dotaxmyv2.c:160
double *restrict zeta1
Definition bl1_dotaxmyv2.c:156

References alpha_c, beta_c, bl1_abort(), chi1, v2df_t::d, i, n_left, n_pre, n_run, psi1, rho, rho_c, upsilon1, v2df_t::v, and zeta1.

Referenced by FLA_Fused_Uhu_Yhu_Zhu_opd_var1().

◆ bl1_ddotaxpy()

void bl1_ddotaxpy ( int  n,
double *a,
int  inc_a,
double *x,
int  inc_x,
double *kappa,
double *rho,
double *w,
int  inc_w 
)
38{
39 double* restrict alpha1;
40 double* restrict chi1;
41 double* restrict omega1;
42 double rho_c;
43 int i;
44
45 int n_pre;
46 int n_run;
47 int n_left;
48
50 v2df_t a1v, x1v, w1v;
51 v2df_t a2v, x2v, w2v;
52
53 if ( inc_a != 1 ||
54 inc_x != 1 ||
55 inc_w != 1 ) bl1_abort();
56
57 n_pre = 0;
58 if ( ( unsigned long ) a % 16 != 0 )
59 {
60 if ( ( unsigned long ) x % 16 == 0 ||
61 ( unsigned long ) w % 16 == 0 ) bl1_abort();
62
63 n_pre = 1;
64 }
65
66 n_run = ( n - n_pre ) / 4;
67 n_left = ( n - n_pre ) % 4;
68
69 alpha1 = a;
70 chi1 = x;
71 omega1 = w;
72
73 rho_c = 0.0;
74
75 if ( n_pre == 1 )
76 {
77 double kappa_c = *kappa;
78 double alpha1_c = *alpha1;
79 double chi1_c = *chi1;
80 double omega1_c = *omega1;
81
84
86
87 alpha1 += inc_a;
88 chi1 += inc_x;
89 omega1 += inc_w;
90 }
91
93
94 k1v.v = _mm_loaddup_pd( ( double* )kappa );
95
96 for ( i = 0; i < n_run; ++i )
97 {
98 a1v.v = _mm_load_pd( ( double* )alpha1 );
99 x1v.v = _mm_load_pd( ( double* )chi1 );
100 w1v.v = _mm_load_pd( ( double* )omega1 );
101
102 a2v.v = _mm_load_pd( ( double* )(alpha1 + 2) );
103 x2v.v = _mm_load_pd( ( double* )(chi1 + 2) );
104 w2v.v = _mm_load_pd( ( double* )(omega1 + 2) );
105
106 rho1v.v += a1v.v * x1v.v;
107 w1v.v += k1v.v * a1v.v;
108
109 _mm_store_pd( ( double* )omega1, w1v.v );
110
111 rho1v.v += a2v.v * x2v.v;
112 w2v.v += k1v.v * a2v.v;
113
114 _mm_store_pd( ( double* )(omega1 + 2), w2v.v );
115
116 alpha1 += 4;
117 chi1 += 4;
118 omega1 += 4;
119 }
120
121 if ( n_left > 0 )
122 {
123 for ( i = 0; i < n_left; ++i )
124 {
125 double kappa_c = *kappa;
126 double alpha1_c = *alpha1;
127 double chi1_c = *chi1;
128 double omega1_c = *omega1;
129
130 rho_c += alpha1_c * chi1_c;
132
133 *omega1 = omega1_c;
134
135 alpha1 += inc_a;
136 chi1 += inc_x;
137 omega1 += inc_w;
138 }
139 }
140
141 rho_c += rho1v.d[0] + rho1v.d[1];
142
143 *rho = rho_c;
144}
double *restrict omega1
Definition bl1_dotaxpy.c:149
double *restrict chi1
Definition bl1_dotaxpy.c:148
alpha1
Definition bl1_dotaxpy.c:338
int n_left
Definition bl1_dotaxpy.c:156
int n_pre
Definition bl1_dotaxpy.c:154
double rho_c
Definition bl1_dotaxpy.c:151
double kappa_c
Definition bl1_dotaxpy.c:150
int n_run
Definition bl1_dotaxpy.c:155
* rho
Definition bl1_dotaxpy.c:242
int i
Definition bl1_dotaxpy.c:152

References alpha1, alpha1_c, bl1_abort(), chi1, v2df_t::d, i, kappa_c, n_left, n_pre, n_run, omega1, rho, rho_c, and v2df_t::v.

Referenced by FLA_Fused_Ahx_Ax_opd_var1(), and FLA_Fused_UZhu_ZUhu_opd_var1().

◆ bl1_ddotsv2()

void bl1_ddotsv2 ( conj1_t  conjxy,
int  n,
double *x,
int  inc_x,
double *y,
int  inc_y,
double *z,
int  inc_z,
double *beta,
double *rho_xz,
double *rho_yz 
)
44{
45 double* restrict x1;
46 double* restrict y1;
47 double* restrict z1;
48 double rho1, rho2;
49 double x1c, y1c, z1c;
50 int i;
51
52 int n_pre;
53 int n_run;
54 int n_left;
55
57 v2df_t x1v, y1v, z1v;
58 v2df_t x2v, y2v, z2v;
59
60 if ( inc_x != 1 ||
61 inc_y != 1 ||
62 inc_z != 1 ) bl1_abort();
63
64 n_pre = 0;
65 if ( ( unsigned long ) z % 16 != 0 )
66 {
67 if ( ( unsigned long ) x % 16 == 0 ||
68 ( unsigned long ) y % 16 == 0 ) bl1_abort();
69
70 n_pre = 1;
71 }
72
73 n_run = ( n - n_pre ) / 4;
74 n_left = ( n - n_pre ) % 4;
75
76 x1 = x;
77 y1 = y;
78 z1 = z;
79
80 rho1 = 0.0;
81 rho2 = 0.0;
82
83 if ( n_pre == 1 )
84 {
85 x1c = *x1;
86 y1c = *y1;
87 z1c = *z1;
88
89 rho1 += x1c * z1c;
90 rho2 += y1c * z1c;
91
92 x1 += inc_x;
93 y1 += inc_y;
94 z1 += inc_z;
95 }
96
99
100 for ( i = 0; i < n_run; ++i )
101 {
102 x1v.v = _mm_load_pd( ( double* )x1 );
103 y1v.v = _mm_load_pd( ( double* )y1 );
104 z1v.v = _mm_load_pd( ( double* )z1 );
105
106 x2v.v = _mm_load_pd( ( double* )(x1 + 2) );
107 y2v.v = _mm_load_pd( ( double* )(y1 + 2) );
108 z2v.v = _mm_load_pd( ( double* )(z1 + 2) );
109
110 rho1v.v += x1v.v * z1v.v;
111 rho2v.v += y1v.v * z1v.v;
112
113 rho1v.v += x2v.v * z2v.v;
114 rho2v.v += y2v.v * z2v.v;
115
116 x1 += 4;
117 y1 += 4;
118 z1 += 4;
119 }
120
121 rho1 += rho1v.d[0] + rho1v.d[1];
122 rho2 += rho2v.d[0] + rho2v.d[1];
123
124 if ( n_left > 0 )
125 {
126 for ( i = 0; i < n_left; ++i )
127 {
128 x1c = *x1;
129 y1c = *y1;
130 z1c = *z1;
131
132 rho1 += x1c * z1c;
133 rho2 += y1c * z1c;
134
135 x1 += inc_x;
136 y1 += inc_y;
137 z1 += inc_z;
138 }
139 }
140
141 *rho_xz = *beta * *rho_xz + rho1;
142 *rho_yz = *beta * *rho_yz + rho2;
143}
double *restrict z1
Definition bl1_dotsv2.c:148
double rho2
Definition bl1_dotsv2.c:149
int n_left
Definition bl1_dotsv2.c:156
int n_pre
Definition bl1_dotsv2.c:154
double rho1
Definition bl1_dotsv2.c:149
double z1c
Definition bl1_dotsv2.c:150
int n_run
Definition bl1_dotsv2.c:155
double y1c
Definition bl1_dotsv2.c:150
* rho_xz
Definition bl1_dotsv2.c:229
double *restrict y1
Definition bl1_dotsv2.c:147
int i
Definition bl1_dotsv2.c:152
double x1c
Definition bl1_dotsv2.c:150
* rho_yz
Definition bl1_dotsv2.c:230

References bl1_abort(), v2df_t::d, i, n_left, n_pre, n_run, rho1, rho2, rho_xz, rho_yz, v2df_t::v, x1, x1c, y1, y1c, z1, and z1c.

Referenced by FLA_Fused_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Gerc2_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Uhu_Yhu_Zhu_opd_var1(), FLA_Fused_UYx_ZVx_opd_var1(), and FLA_Fused_UZhu_ZUhu_opd_var1().

◆ bl1_ddotsv3()

void bl1_ddotsv3 ( conj1_t  conjxyw,
int  n,
double *x,
int  inc_x,
double *y,
int  inc_y,
double *w,
int  inc_w,
double *z,
int  inc_z,
double *beta,
double *rho_xz,
double *rho_yz,
double *rho_wz 
)
49{
50 double* restrict x1;
51 double* restrict y1;
52 double* restrict w1;
53 double* restrict z1;
54 double rho1, rho2, rho3;
55 double x1c, y1c, w1c, z1c;
56 int i;
57
58 int n_pre;
59 int n_run;
60 int n_left;
61
63 v2df_t x1v, y1v, w1v, z1v;
64 v2df_t x2v, y2v, w2v, z2v;
65
66 if ( inc_x != 1 ||
67 inc_y != 1 ||
68 inc_w != 1 ||
69 inc_z != 1 ) bl1_abort();
70
71 n_pre = 0;
72 if ( ( unsigned long ) z % 16 != 0 )
73 {
74 if ( ( unsigned long ) x % 16 == 0 ||
75 ( unsigned long ) y % 16 == 0 ||
76 ( unsigned long ) w % 16 == 0 ) bl1_abort();
77
78 n_pre = 1;
79 }
80
81 n_run = ( n - n_pre ) / 4;
82 n_left = ( n - n_pre ) % 4;
83
84 x1 = x;
85 y1 = y;
86 w1 = w;
87 z1 = z;
88
89 rho1 = 0.0;
90 rho2 = 0.0;
91 rho3 = 0.0;
92
93 if ( n_pre == 1 )
94 {
95 x1c = *x1;
96 y1c = *y1;
97 w1c = *w1;
98 z1c = *z1;
99
100 rho1 += x1c * z1c;
101 rho2 += y1c * z1c;
102 rho3 += w1c * z1c;
103
104 x1 += inc_x;
105 y1 += inc_y;
106 w1 += inc_w;
107 z1 += inc_z;
108 }
109
110 rho1v.v = _mm_setzero_pd();
111 rho2v.v = _mm_setzero_pd();
112 rho3v.v = _mm_setzero_pd();
113
114 for ( i = 0; i < n_run; ++i )
115 {
116 x1v.v = _mm_load_pd( ( double* )x1 );
117 y1v.v = _mm_load_pd( ( double* )y1 );
118 w1v.v = _mm_load_pd( ( double* )w1 );
119 z1v.v = _mm_load_pd( ( double* )z1 );
120
121 rho1v.v += x1v.v * z1v.v;
122 rho2v.v += y1v.v * z1v.v;
123 rho3v.v += w1v.v * z1v.v;
124
125 x2v.v = _mm_load_pd( ( double* )(x1 + 2) );
126 y2v.v = _mm_load_pd( ( double* )(y1 + 2) );
127 w2v.v = _mm_load_pd( ( double* )(w1 + 2) );
128 z2v.v = _mm_load_pd( ( double* )(z1 + 2) );
129
130 rho1v.v += x2v.v * z2v.v;
131 rho2v.v += y2v.v * z2v.v;
132 rho3v.v += w2v.v * z2v.v;
133
134 x1 += 4;
135 y1 += 4;
136 w1 += 4;
137 z1 += 4;
138 }
139
140 rho1 += rho1v.d[0] + rho1v.d[1];
141 rho2 += rho2v.d[0] + rho2v.d[1];
142 rho3 += rho3v.d[0] + rho3v.d[1];
143
144 if ( n_left > 0 )
145 {
146 for ( i = 0; i < n_left; ++i )
147 {
148 x1c = *x1;
149 y1c = *y1;
150 w1c = *w1;
151 z1c = *z1;
152
153 rho1 += x1c * z1c;
154 rho2 += y1c * z1c;
155 rho3 += w1c * z1c;
156
157 x1 += inc_x;
158 y1 += inc_y;
159 w1 += inc_w;
160 z1 += inc_z;
161 }
162 }
163
164 *rho_xz = *beta * *rho_xz + rho1;
165 *rho_yz = *beta * *rho_yz + rho2;
166 *rho_wz = *beta * *rho_wz + rho3;
167}
double *restrict z1
Definition bl1_dotsv3.c:173
int n_left
Definition bl1_dotsv3.c:181
* rho_wz
Definition bl1_dotsv3.c:270
int n_pre
Definition bl1_dotsv3.c:179
double *restrict y1
Definition bl1_dotsv3.c:171
double *restrict w1
Definition bl1_dotsv3.c:172
double z1c
Definition bl1_dotsv3.c:175
int n_run
Definition bl1_dotsv3.c:180
double rho1
Definition bl1_dotsv3.c:174
double rho3
Definition bl1_dotsv3.c:174
double y1c
Definition bl1_dotsv3.c:175
double rho2
Definition bl1_dotsv3.c:174
* rho_xz
Definition bl1_dotsv3.c:268
x1
Definition bl1_dotsv3.c:452
int i
Definition bl1_dotsv3.c:177
double x1c
Definition bl1_dotsv3.c:175
* rho_yz
Definition bl1_dotsv3.c:269
double w1c
Definition bl1_dotsv3.c:175

References bl1_abort(), v2df_t::d, i, n_left, n_pre, n_run, rho1, rho2, rho3, rho_wz, rho_xz, rho_yz, v2df_t::v, w1, w1c, x1, x1c, y1, y1c, z1, and z1c.

◆ bl1_ddotv2axpyv2b()

void bl1_ddotv2axpyv2b ( int  n,
double * a1,
int  inc_a1,
double * a2,
int  inc_a2,
double * x,
int  inc_x,
double * kappa1,
double * kappa2,
double * rho1,
double * rho2,
double * w,
int  inc_w 
)
46{
47 double* restrict alpha1;
48 double* restrict alpha2;
49 double* restrict chi1;
50 double* restrict omega1;
51 double rho1_c;
52 double rho2_c;
53 int i;
54
55 int n_pre;
56 int n_run;
57 int n_left;
58
63
64 if ( inc_a1 != 1 ||
65 inc_a2 != 1 ||
66 inc_x != 1 ||
67 inc_w != 1 ) bl1_abort();
68
69 n_pre = 0;
70 if ( ( unsigned long ) a1 % 16 != 0 )
71 {
72 if ( ( unsigned long ) a2 % 16 == 0 ||
73 ( unsigned long ) x % 16 == 0 ||
74 ( unsigned long ) w % 16 == 0 ) bl1_abort();
75
76 n_pre = 1;
77 }
78
79 n_run = ( n - n_pre ) / 4;
80 n_left = ( n - n_pre ) % 4;
81
82 alpha1 = a1;
83 alpha2 = a2;
84 chi1 = x;
85 omega1 = w;
86
87 rho1_c = 0.0;
88 rho2_c = 0.0;
89
90 if ( n_pre == 1 )
91 {
92 double kappa1_c = *kappa1;
93 double kappa2_c = *kappa2;
94 double alpha1_c = *alpha1;
95 double alpha2_c = *alpha2;
96 double chi1_c = *chi1;
97 double omega1_c = *omega1;
98
101
104
105 *omega1 = omega1_c;
106
107 alpha1 += inc_a1;
108 alpha2 += inc_a2;
109 chi1 += inc_x;
110 omega1 += inc_w;
111 }
112
113 rho1v.v = _mm_setzero_pd();
114 rho2v.v = _mm_setzero_pd();
115
116 k1v.v = _mm_loaddup_pd( ( double* )kappa1 );
117 k2v.v = _mm_loaddup_pd( ( double* )kappa2 );
118
119 for ( i = 0; i < n_run; ++i )
120 {
121 a11v.v = _mm_load_pd( ( double* )alpha1 );
122 a12v.v = _mm_load_pd( ( double* )alpha2 );
123 x1v.v = _mm_load_pd( ( double* )chi1 );
124 w1v.v = _mm_load_pd( ( double* )omega1 );
125
126 rho1v.v += a11v.v * x1v.v;
127 w1v.v += k1v.v * a11v.v;
128
129 rho2v.v += a12v.v * x1v.v;
130 w1v.v += k2v.v * a12v.v;
131
132 _mm_store_pd( ( double* )omega1, w1v.v );
133
134 a21v.v = _mm_load_pd( ( double* )(alpha1 + 2) );
135 a22v.v = _mm_load_pd( ( double* )(alpha2 + 2) );
136 x2v.v = _mm_load_pd( ( double* )(chi1 + 2) );
137 w2v.v = _mm_load_pd( ( double* )(omega1 + 2) );
138
139 rho1v.v += a21v.v * x2v.v;
140 w2v.v += k1v.v * a21v.v;
141
142 rho2v.v += a22v.v * x2v.v;
143 w2v.v += k2v.v * a22v.v;
144
145 _mm_store_pd( ( double* )(omega1 + 2), w2v.v );
146
147 alpha1 += 4;
148 alpha2 += 4;
149 chi1 += 4;
150 omega1 += 4;
151 }
152
153 if ( n_left > 0 )
154 {
155 for ( i = 0; i < n_left; ++i )
156 {
157 double kappa1_c = *kappa1;
158 double kappa2_c = *kappa2;
159 double alpha1_c = *alpha1;
160 double alpha2_c = *alpha2;
161 double chi1_c = *chi1;
162 double omega1_c = *omega1;
163
166
169
170 *omega1 = omega1_c;
171
172 alpha1 += inc_a1;
173 alpha2 += inc_a2;
174 chi1 += inc_x;
175 omega1 += inc_w;
176 }
177 }
178
179 rho1_c += rho1v.d[0] + rho1v.d[1];
180 rho2_c += rho2v.d[0] + rho2v.d[1];
181
182 *rho1 = rho1_c;
183 *rho2 = rho2_c;
184}
double *restrict omega1
Definition bl1_dotv2axpyv2b.c:190
double rho2_c
Definition bl1_dotv2axpyv2b.c:194
double *restrict chi1
Definition bl1_dotv2axpyv2b.c:189
double kappa1_c
Definition bl1_dotv2axpyv2b.c:191
alpha1
Definition bl1_dotv2axpyv2b.c:456
* rho2
Definition bl1_dotv2axpyv2b.c:312
int n_left
Definition bl1_dotv2axpyv2b.c:199
double rho1_c
Definition bl1_dotv2axpyv2b.c:193
int n_pre
Definition bl1_dotv2axpyv2b.c:197
* rho1
Definition bl1_dotv2axpyv2b.c:311
int n_run
Definition bl1_dotv2axpyv2b.c:198
double kappa2_c
Definition bl1_dotv2axpyv2b.c:192
int i
Definition bl1_dotv2axpyv2b.c:195

References alpha1, alpha1_c, alpha2, alpha2_c, bl1_abort(), chi1, v2df_t::d, i, kappa1_c, kappa2_c, n_left, n_pre, n_run, omega1, rho1, rho1_c, rho2, rho2_c, and v2df_t::v.

Referenced by FLA_Fused_Ahx_Ax_opd_var1(), and FLA_Fused_UZhu_ZUhu_opd_var1().

◆ bl1_saxmyv2()

void bl1_saxmyv2 ( conj1_t  conjx,
int  n,
float * alpha,
float * beta,
float * x,
int  inc_x,
float * y,
int  inc_y,
float * z,
int  inc_z 
)
29{
30 bl1_abort();
31}

References bl1_abort().

◆ bl1_saxpyv2b()

void bl1_saxpyv2b ( int  n,
float * beta1,
float * beta2,
float * a1,
int  inc_a1,
float * a2,
int  inc_a2,
float * w,
int  inc_w 
)
26{
27 bl1_abort();
28}

References bl1_abort().

◆ bl1_saxpyv2bdotaxpy()

void bl1_saxpyv2bdotaxpy ( int  n,
float * beta,
float * u,
int  inc_u,
float * gamma,
float * z,
int  inc_z,
float * a,
int  inc_a,
float * x,
int  inc_x,
float * kappa,
float * rho,
float * w,
int  inc_w 
)
31{
32 bl1_abort();
33}

References bl1_abort().

◆ bl1_saxpyv3b()

void bl1_saxpyv3b ( int  n,
float * beta1,
float * beta2,
float * beta3,
float * a1,
int  inc_a1,
float * a2,
int  inc_a2,
float * a3,
int  inc_a3,
float * w,
int  inc_w 
)
29{
30 bl1_abort();
31}

References bl1_abort().

◆ bl1_sdotaxmyv2()

void bl1_sdotaxmyv2 ( int  n,
float * alpha,
float * beta,
float * x,
int  inc_x,
float * u,
int  inc_u,
float * rho,
float * y,
int  inc_y,
float * z,
int  inc_z 
)
29{
30 bl1_abort();
31}

References bl1_abort().

◆ bl1_sdotaxpy()

void bl1_sdotaxpy ( int  n,
float * a,
int  inc_a,
float * x,
int  inc_x,
float * kappa,
float * rho,
float * w,
int  inc_w 
)
26{
27 bl1_abort();
28}

References bl1_abort().

◆ bl1_sdotsv2()

void bl1_sdotsv2 ( conj1_t  conjxy,
int  n,
float * x,
int  inc_x,
float * y,
int  inc_y,
float * z,
int  inc_z,
float * beta,
float * rho_xz,
float * rho_yz 
)
30{
31 bl1_abort();
32}

References bl1_abort().

◆ bl1_sdotsv3()

void bl1_sdotsv3 ( conj1_t  conjxyw,
int  n,
float * x,
int  inc_x,
float * y,
int  inc_y,
float * w,
int  inc_w,
float * z,
int  inc_z,
float * beta,
float * rho_xz,
float * rho_yz,
float * rho_wz 
)
33{
34 bl1_abort();
35}

References bl1_abort().

◆ bl1_sdotv2axpyv2b()

void bl1_sdotv2axpyv2b ( int  n,
float * a1,
int  inc_a1,
float * a2,
int  inc_a2,
float * x,
int  inc_x,
float * kappa1,
float * kappa2,
float * rho1,
float * rho2,
float * w,
int  inc_w 
)
31{
32 bl1_abort();
33}

References bl1_abort().

◆ bl1_zaxmyv2()

void bl1_zaxmyv2 ( conj1_t  conjx,
int  n,
dcomplex * alpha,
dcomplex * beta,
dcomplex * x,
int  inc_x,
dcomplex * y,
int  inc_y,
dcomplex * z,
int  inc_z 
)
258{
264 int i;
269
270 chi1 = x;
271 psi1 = y;
272 zeta1 = z;
273
274 alphav.v = _mm_load_pd( ( double* )alpha );
275 betav.v = _mm_load_pd( ( double* )beta );
277 betarv.v = _mm_shuffle_pd( betav.v, betav.v, _MM_SHUFFLE2 (0,1) );
278
279 if ( bl1_is_conj( conjx ) )
280 {
281 alpha_c = *alpha;
282 beta_c = *beta;
283
284 for ( i = 0; i < n; ++i )
285 {
287
288 // psi1 = psi1 + alpha * chi1;
289 psi1->real += alpha_c.real * chi1_c.real - alpha_c.imag * -chi1_c.imag;
290 psi1->imag += alpha_c.real * -chi1_c.imag + alpha_c.imag * chi1_c.real;
291
292 // zeta1 = zeta1 + beta * chi1;
293 zeta1->real += beta_c.real * chi1_c.real - beta_c.imag * -chi1_c.imag;
294 zeta1->imag += beta_c.real * -chi1_c.imag + beta_c.imag * chi1_c.real;
295
296 chi1 += inc_x;
297 psi1 += inc_y;
298 zeta1 += inc_z;
299 }
300 }
301 else
302 {
303 if ( inc_x == 1 &&
304 inc_y == 1 &&
305 inc_z == 1 )
306 {
307 for ( i = 0; i < n; ++i )
308 {
309 x11v.v = _mm_load_pd( ( double* )chi1 );
310 x12v.v = _mm_shuffle_pd( x11v.v, x11v.v, _MM_SHUFFLE2 (1,1) );
311 x11v.v = _mm_shuffle_pd( x11v.v, x11v.v, _MM_SHUFFLE2 (0,0) );
312
313 acbc.v = alphav.v * x11v.v;
314 bdad.v = alpharv.v * x12v.v;
315 y1v.v = _mm_load_pd( ( double* )psi1 );
316 y1v.v = y1v.v - _mm_addsub_pd( acbc.v, bdad.v );
317 _mm_store_pd( ( double* )psi1, y1v.v );
318
319 acbc.v = betav.v * x11v.v;
320 bdad.v = betarv.v * x12v.v;
321 z1v.v = _mm_load_pd( ( double* )zeta1 );
322 z1v.v = z1v.v - _mm_addsub_pd( acbc.v, bdad.v );
323 _mm_store_pd( ( double* )zeta1, z1v.v );
324
325 chi1 += 1;
326 psi1 += 1;
327 zeta1 += 1;
328 }
329 }
330 else
331 {
332 for ( i = 0; i < n; ++i )
333 {
334 x11v.v = _mm_load_pd( ( double* )chi1 );
335 x12v.v = _mm_shuffle_pd( x11v.v, x11v.v, _MM_SHUFFLE2 (1,1) );
336 x11v.v = _mm_shuffle_pd( x11v.v, x11v.v, _MM_SHUFFLE2 (0,0) );
337
338 acbc.v = alphav.v * x11v.v;
339 bdad.v = alpharv.v * x12v.v;
340 y1v.v = _mm_load_pd( ( double* )psi1 );
341 y1v.v = y1v.v - _mm_addsub_pd( acbc.v, bdad.v );
342 _mm_store_pd( ( double* )psi1, y1v.v );
343
344 acbc.v = betav.v * x11v.v;
345 bdad.v = betarv.v * x12v.v;
346 z1v.v = _mm_load_pd( ( double* )zeta1 );
347 z1v.v = z1v.v - _mm_addsub_pd( acbc.v, bdad.v );
348 _mm_store_pd( ( double* )zeta1, z1v.v );
349
350 chi1 += inc_x;
351 psi1 += inc_y;
352 zeta1 += inc_z;
353 }
354 }
355 }
356}
int bl1_is_conj(conj1_t conj)
Definition bl1_is.c:42
dcomplex
Definition blis_type_defs.h:138
double real
Definition blis_type_defs.h:139

References alpha_c, beta_c, bl1_is_conj(), chi1, i, dcomplex::imag, psi1, dcomplex::real, v2df_t::v, and zeta1.

Referenced by FLA_Fused_UYx_ZVx_opz_var1().

◆ bl1_zaxpyv2b()

void bl1_zaxpyv2b ( int  n,
dcomplex * beta1,
dcomplex * beta2,
dcomplex * a1,
int  inc_a1,
dcomplex * a2,
int  inc_a2,
dcomplex * w,
int  inc_w 
)
217{
221 int i;
225 v2df_t t1v, y1v;
227
228 chi1 = x1;
229 chi2 = x2;
230 psi1 = y;
231
232 alpha1v.v = _mm_load_pd( ( double* )alpha1 );
233 alpha2v.v = _mm_load_pd( ( double* )alpha2 );
236
237 if ( inc_x1 == 1 &&
238 inc_x2 == 1 &&
239 inc_y == 1 )
240 {
241 for ( i = 0; i < n; ++i )
242 {
243 x11v.v = _mm_load_pd( ( double* )chi1 );
244 x12v.v = _mm_shuffle_pd( x11v.v, x11v.v, _MM_SHUFFLE2 (1,1) );
245 x11v.v = _mm_shuffle_pd( x11v.v, x11v.v, _MM_SHUFFLE2 (0,0) );
246 acbc.v = alpha1v.v * x11v.v;
247 bdad.v = alpha1rv.v * x12v.v;
248 t1v.v = _mm_addsub_pd( acbc.v, bdad.v );
249
250 x11v.v = _mm_load_pd( ( double* )chi2 );
251 x12v.v = _mm_shuffle_pd( x11v.v, x11v.v, _MM_SHUFFLE2 (1,1) );
252 x11v.v = _mm_shuffle_pd( x11v.v, x11v.v, _MM_SHUFFLE2 (0,0) );
253 acbc.v = alpha2v.v * x11v.v;
254 bdad.v = alpha2rv.v * x12v.v;
255 t1v.v = t1v.v + _mm_addsub_pd( acbc.v, bdad.v );
256
257 y1v.v = _mm_load_pd( ( double* )psi1 );
258 y1v.v = y1v.v + t1v.v;
259 _mm_store_pd( ( double* )psi1, y1v.v );
260
261 chi1 += 1;
262 chi2 += 1;
263 psi1 += 1;
264 }
265 }
266 else
267 {
268 for ( i = 0; i < n; ++i )
269 {
270 x11v.v = _mm_load_pd( ( double* )chi1 );
271 x12v.v = _mm_shuffle_pd( x11v.v, x11v.v, _MM_SHUFFLE2 (1,1) );
272 x11v.v = _mm_shuffle_pd( x11v.v, x11v.v, _MM_SHUFFLE2 (0,0) );
273 acbc.v = alpha1v.v * x11v.v;
274 bdad.v = alpha1rv.v * x12v.v;
275 t1v.v = _mm_addsub_pd( acbc.v, bdad.v );
276
277 x11v.v = _mm_load_pd( ( double* )chi2 );
278 x12v.v = _mm_shuffle_pd( x11v.v, x11v.v, _MM_SHUFFLE2 (1,1) );
279 x11v.v = _mm_shuffle_pd( x11v.v, x11v.v, _MM_SHUFFLE2 (0,0) );
280 acbc.v = alpha2v.v * x11v.v;
281 bdad.v = alpha2rv.v * x12v.v;
282 t1v.v = t1v.v + _mm_addsub_pd( acbc.v, bdad.v );
283
284 y1v.v = _mm_load_pd( ( double* )psi1 );
285 y1v.v = y1v.v + t1v.v;
286 _mm_store_pd( ( double* )psi1, y1v.v );
287
288 chi1 += inc_x1;
289 chi2 += inc_x2;
290 psi1 += inc_y;
291 }
292 }
293}
__m128d v
Definition blis_type_defs.h:118

References alpha1, alpha2, chi1, chi2, i, psi1, v2df_t::v, and x1.

Referenced by FLA_Fused_Ahx_Axpy_Ax_opz_var1(), FLA_Fused_Gerc2_Ahx_Ax_opz_var1(), FLA_Fused_Gerc2_Ahx_Axpy_Ax_opz_var1(), FLA_Fused_Gerc2_opz_var1(), FLA_Fused_Her2_Ax_l_opz_var1(), and FLA_Fused_Uhu_Yhu_Zhu_opz_var1().

◆ bl1_zaxpyv2bdotaxpy()

void bl1_zaxpyv2bdotaxpy ( int  n,
dcomplex * beta,
dcomplex * u,
int  inc_u,
dcomplex * gamma,
dcomplex * z,
int  inc_z,
dcomplex * a,
int  inc_a,
dcomplex * x,
int  inc_x,
dcomplex * kappa,
dcomplex * rho,
dcomplex * w,
int  inc_w 
)
353{
359 int i;
360
361 //v2df_t beta1v, beta1rv;
362 //v2df_t gamma1v, gamma1rv;
363 //v2df_t kappa1v, kappa1rv;
365 //v2df_t u11v, u12v;
366 //v2df_t z11v, z12v;
368 v2df_t x1v, x1rv;
369 v2df_t w1v;
372
373 v2df_t a1v, a1rv;
374 v2df_t u1v, u1rv;
375 v2df_t z1v, z1rv;
378
379 upsilon1 = u;
380 zeta1 = z;
381 alpha1 = a;
382 chi1 = x;
383 omega1 = w;
384
385 if ( inc_u != 1 ||
386 inc_z != 1 ||
387 inc_a != 1 ||
388 inc_x != 1 ||
389 inc_w != 1 ) bl1_abort();
390
391
392 beta11v.v = _mm_loaddup_pd( ( double* )&(beta->real) );
393 beta12v.v = _mm_loaddup_pd( ( double* )&(beta->imag) );
394 gamma11v.v = _mm_loaddup_pd( ( double* )&(gamma->real) );
395 gamma12v.v = _mm_loaddup_pd( ( double* )&(gamma->imag) );
396 kappa11v.v = _mm_loaddup_pd( ( double* )&(kappa->real) );
397 kappa12v.v = _mm_loaddup_pd( ( double* )&(kappa->imag) );
398
399 rho1v.v = _mm_setzero_pd();
400
401 for ( i = 0; i < n; ++i )
402 {
403 //alpha_c = *alpha1;
404 a1v.v = _mm_load_pd( ( double* )alpha1 );
405
406 //alpha1_c.real += beta_c.real * upsilon1_c.real - beta_c.imag * upsilon1_c.imag;
407 //alpha1_c.imag += beta_c.real * upsilon1_c.imag + beta_c.imag * upsilon1_c.real;
408 u1v.v = _mm_load_pd( ( double* )upsilon1 );
409 u1rv.v = _mm_shuffle_pd( u1v.v, u1v.v, _MM_SHUFFLE2 (0,1) );
410 acbc.v = beta11v.v * u1v.v;
411 bdad.v = beta12v.v * u1rv.v;
412 a1v.v += _mm_addsub_pd( acbc.v, bdad.v );
413
414 //alpha1_c.real += gamma_c.real * zeta1_c.real - gamma_c.imag * zeta1_c.imag;
415 //alpha1_c.imag += gamma_c.real * zeta1_c.imag + gamma_c.imag * zeta1_c.real;
416 z1v.v = _mm_load_pd( ( double* )zeta1 );
417 z1rv.v = _mm_shuffle_pd( z1v.v, z1v.v, _MM_SHUFFLE2 (0,1) );
418 acbc.v = gamma11v.v * z1v.v;
419 bdad.v = gamma12v.v * z1rv.v;
420 a1v.v += _mm_addsub_pd( acbc.v, bdad.v );
421
422 //*alpha1 = alpha1_c;
423 _mm_store_pd( ( double* )alpha1, a1v.v );
424
425 //rho_c.real += alpha1_c.real * chi1_c.real - -alpha1_c.imag * chi1_c.imag;
426 //rho_c.imag += alpha1_c.real * chi1_c.imag + -alpha1_c.imag * chi1_c.real;
427 x1v.v = _mm_load_pd( ( double* )chi1 );
428 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
429 a11v.v = a1v.v;
430 a12v.v = _mm_shuffle_pd( a11v.v, a11v.v, _MM_SHUFFLE2 (1,1) );
431 a11v.v = _mm_shuffle_pd( a11v.v, a11v.v, _MM_SHUFFLE2 (0,0) );
432 adac.v = a11v.v * x1rv.v;
433 bcbd.v = a12v.v * x1v.v;
434 rho1v.v = rho1v.v + _mm_addsub_pd( adac.v, bcbd.v );
435
436 //omega_c = *omega1;
437 w1v.v = _mm_load_pd( ( double* )omega1 );
438
439 //omega1_c.real += kappa_c.real * alpha1_c.real - kappa_c.imag * alpha1_c.imag;
440 //omega1_c.imag += kappa_c.real * alpha1_c.imag + kappa_c.imag * alpha1_c.real;
441 a1rv.v = _mm_shuffle_pd( a1v.v, a1v.v, _MM_SHUFFLE2 (0,1) );
442 acbc.v = kappa11v.v * a1v.v;
443 bdad.v = kappa12v.v * a1rv.v;
444 w1v.v += _mm_addsub_pd( acbc.v, bdad.v );
445
446 // *omega1 = omega1_c;
447 _mm_store_pd( ( double* )omega1, w1v.v );
448
449
450 upsilon1 += 1;
451 zeta1 += 1;
452 alpha1 += 1;
453 chi1 += 1;
454 omega1 += 1;
455 }
456
457 rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
458
459 //rho->real = rho_c.real;
460 //rho->imag = rho_c.imag;
461 _mm_store_pd( ( double* )rho, rho1v.v );
462}

References alpha1, bl1_abort(), chi1, i, dcomplex::imag, omega1, dcomplex::real, rho, upsilon1, v2df_t::v, and zeta1.

◆ bl1_zaxpyv2bdots()

void bl1_zaxpyv2bdots ( int  n,
dcomplex * alpha1,
dcomplex * alpha2,
dcomplex * x1,
int  inc_x1,
dcomplex * x2,
int  inc_x2,
dcomplex * y,
int  inc_y,
dcomplex * u,
int  inc_u,
dcomplex * beta,
dcomplex * rho 
)

◆ bl1_zaxpyv3b()

void bl1_zaxpyv3b ( int  n,
dcomplex * beta1,
dcomplex * beta2,
dcomplex * beta3,
dcomplex * a1,
int  inc_a1,
dcomplex * a2,
int  inc_a2,
dcomplex * a3,
int  inc_a3,
dcomplex * w,
int  inc_w 
)
232{
233 bl1_abort();
234}

References bl1_abort().

◆ bl1_zdotaxmyv2()

void bl1_zdotaxmyv2 ( int  n,
dcomplex * alpha,
dcomplex * beta,
dcomplex * x,
int  inc_x,
dcomplex * u,
int  inc_u,
dcomplex * rho,
dcomplex * y,
int  inc_y,
dcomplex * z,
int  inc_z 
)
285{
290 int i;
291
295 v2df_t x1v, x1rv;
296 v2df_t y1v;
297 v2df_t z1v;
301
302 if ( inc_x != 1 ||
303 inc_u != 1 ||
304 inc_y != 1 ||
305 inc_z != 1 ) bl1_abort();
306
307 chi1 = x;
308 upsilon1 = u;
309 psi1 = y;
310 zeta1 = z;
311
312 //rho_c.real = 0.0;
313 //rho_c.imag = 0.0;
314 rho1v.v = _mm_setzero_pd();
315
316 //alpha_c = *alpha;
317 //beta_c = *beta;
318 alpha11v.v = _mm_loaddup_pd( ( double* )&(alpha->real) );
319 alpha12v.v = _mm_loaddup_pd( ( double* )&(alpha->imag) );
320 beta11v.v = _mm_loaddup_pd( ( double* )&(beta->real) );
321 beta12v.v = _mm_loaddup_pd( ( double* )&(beta->imag) );
322
323 for ( i = 0; i < n; ++i )
324 {
325 //dcomplex chi1_c = *chi1;
326 x1v.v = _mm_load_pd( ( double* )chi1 );
327
328 //psi1->real -= alpha_c.real * chi1_c.real - alpha_c.imag * chi1_c.imag;
329 //psi1->imag -= alpha_c.real * chi1_c.imag + alpha_c.imag * chi1_c.real;
330 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
331 acad.v = alpha11v.v * x1v.v;
332 bdbc.v = alpha12v.v * x1rv.v;
333 y1v.v = _mm_load_pd( ( double* )psi1 );
334 y1v.v = y1v.v - _mm_addsub_pd( acad.v, bdbc.v );
335 _mm_store_pd( ( double* )psi1, y1v.v );
336
337 //zeta1->real -= beta_c.real * chi1_c.real - beta_c.imag * chi1_c.imag;
338 //zeta1->imag -= beta_c.real * chi1_c.imag + beta_c.imag * chi1_c.real;
339 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
340 acad.v = beta11v.v * x1v.v;
341 bdbc.v = beta12v.v * x1rv.v;
342 z1v.v = _mm_load_pd( ( double* )zeta1 );
343 z1v.v = z1v.v - _mm_addsub_pd( acad.v, bdbc.v );
344 _mm_store_pd( ( double* )zeta1, z1v.v );
345
346 //rho_c.real = chi1_c.real * upsilon1_c.real - -chi1_c.imag * upsilon1_c.imag;
347 //rho_c.imag = chi1_c.real * upsilon1_c.imag + -chi1_c.imag * upsilon1_c.real;
348 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
349 u11v.v = _mm_loaddup_pd( ( double* )&(upsilon1->real) );
350 u12v.v = _mm_loaddup_pd( ( double* )&(upsilon1->imag) );
351 bcac.v = x1rv.v * u11v.v;
352 adbd.v = x1v.v * u12v.v;
353 rho1v.v = rho1v.v + _mm_addsub_pd( bcac.v, adbd.v );
354
355 chi1 += 1;
356 upsilon1 += 1;
357 psi1 += 1;
358 zeta1 += 1;
359 }
360
361 rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
362
363 rho1v.d[1] = -rho1v.d[1];
364
365 _mm_store_pd( ( double* )rho, rho1v.v );
366}

References bl1_abort(), chi1, v2df_t::d, i, dcomplex::imag, psi1, dcomplex::real, rho, upsilon1, v2df_t::v, and zeta1.

◆ bl1_zdotaxpy()

void bl1_zdotaxpy ( int  n,
dcomplex * a,
int  inc_a,
dcomplex * x,
int  inc_x,
dcomplex * kappa,
dcomplex * rho,
dcomplex * w,
int  inc_w 
)
265{
269 int i;
270
274 v2df_t x1v, x1rv;
275 v2df_t w1v;
278
279 alpha1 = a;
280 chi1 = x;
281 omega1 = w;
282
283 if ( inc_a != 1 ||
284 inc_x != 1 ||
285 inc_w != 1 ) bl1_abort();
286
287 kappa1v.v = _mm_load_pd( ( double* )kappa );
289
290 rho1v.v = _mm_setzero_pd();
291
292 for ( i = 0; i < n; ++i )
293 {
294 //alpha_c = *alpha1;
295 a11v.v = _mm_loaddup_pd( ( double* )&(alpha1->real) );
296 a12v.v = _mm_loaddup_pd( ( double* )&(alpha1->imag) );
297
298 //rho_c.real += alpha1_c.real * chi1_c.real - -alpha1_c.imag * chi1_c.imag;
299 //rho_c.imag += alpha1_c.real * chi1_c.imag + -alpha1_c.imag * chi1_c.real;
300 x1v.v = _mm_load_pd( ( double* )chi1 );
301 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
302 adac.v = a11v.v * x1rv.v;
303 bcbd.v = a12v.v * x1v.v;
304 rho1v.v = rho1v.v + _mm_addsub_pd( adac.v, bcbd.v );
305
306 //omega_c = *omega1;
307 w1v.v = _mm_load_pd( ( double* )omega1 );
308
309 //omega1_c.real += kappa_c.real * alpha1_c.real - kappa_c.imag * alpha1_c.imag;
310 //omega1_c.imag += kappa_c.real * alpha1_c.imag + kappa_c.imag * alpha1_c.real;
311 acbc.v = kappa1v.v * a11v.v;
312 bdad.v = kappa1rv.v * a12v.v;
313 w1v.v += _mm_addsub_pd( acbc.v, bdad.v );
314
315 //*omega1 = omega1_c;
316 _mm_store_pd( ( double* )omega1, w1v.v );
317
318 alpha1 += 1;
319 chi1 += 1;
320 omega1 += 1;
321 }
322
323 rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
324
325 //rho->real = rho_c.real;
326 //rho->imag = rho_c.imag;
327 _mm_store_pd( ( double* )rho, rho1v.v );
328}

References alpha1, bl1_abort(), chi1, i, omega1, rho, and v2df_t::v.

Referenced by FLA_Fused_Ahx_Ax_opz_var1(), FLA_Fused_Gerc2_Ahx_Ax_opz_var1(), FLA_Fused_Her2_Ax_l_opz_var1(), and FLA_Fused_UZhu_ZUhu_opz_var1().

◆ bl1_zdotsv2()

void bl1_zdotsv2 ( conj1_t  conjxy,
int  n,
dcomplex * x,
int  inc_x,
dcomplex * y,
int  inc_y,
dcomplex * z,
int  inc_z,
dcomplex * beta,
dcomplex * rho_xz,
dcomplex * rho_yz 
)
257{
261 int i;
265 v2df_t x1v, x1rv;
266 v2df_t y1v, y1rv;
267
268 x1 = x;
269 y1 = y;
270 z1 = z;
271
272 rho1v.v = _mm_setzero_pd();
273 rho2v.v = _mm_setzero_pd();
274
275 if ( bl1_is_conj( conjxy ) )
276 {
278
279 for ( i = 0; i < n; ++i )
280 {
281 z11v.v = _mm_loaddup_pd( ( double* )&(z1->real) );
282 z12v.v = _mm_loaddup_pd( ( double* )&(z1->imag) );
283
284 x1v.v = _mm_load_pd( ( double* )x1 );
285 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
286 bcac.v = x1rv.v * z11v.v;
287 adbd.v = x1v.v * z12v.v;
288 rho1v.v = rho1v.v + _mm_addsub_pd( bcac.v, adbd.v );
289
290 y1v.v = _mm_load_pd( ( double* )y1 );
291 y1rv.v = _mm_shuffle_pd( y1v.v, y1v.v, _MM_SHUFFLE2 (0,1) );
292 bcac.v = y1rv.v * z11v.v;
293 adbd.v = y1v.v * z12v.v;
294 rho2v.v = rho2v.v + _mm_addsub_pd( bcac.v, adbd.v );
295
296 x1 += inc_x;
297 y1 += inc_y;
298 z1 += inc_z;
299 }
300
301 rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
302 rho2v.v = _mm_shuffle_pd( rho2v.v, rho2v.v, _MM_SHUFFLE2 (0,1) );
303
304 rho1v.d[1] = -rho1v.d[1];
305 rho2v.d[1] = -rho2v.d[1];
306 }
307 else
308 {
310
311 for ( i = 0; i < n; ++i )
312 {
313 z11v.v = _mm_loaddup_pd( ( double* )&(z1->real) );
314 z12v.v = _mm_loaddup_pd( ( double* )&(z1->imag) );
315
316 x1v.v = _mm_load_pd( ( double* )x1 );
317 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
318 cada.v = x1v.v * z11v.v;
319 dbcb.v = x1rv.v * z12v.v;
320 rho1v.v = rho1v.v + _mm_addsub_pd( cada.v, dbcb.v );
321
322 y1v.v = _mm_load_pd( ( double* )y1 );
323 y1rv.v = _mm_shuffle_pd( y1v.v, y1v.v, _MM_SHUFFLE2 (0,1) );
324 cada.v = y1v.v * z11v.v;
325 dbcb.v = y1rv.v * z12v.v;
326 rho2v.v = rho2v.v + _mm_addsub_pd( cada.v, dbcb.v );
327
328 x1 += inc_x;
329 y1 += inc_y;
330 z1 += inc_z;
331 }
332 }
333
334 //bl1_zscals( beta, rho_xz );
335 //bl1_zscals( beta, rho_yz );
336 {
337 v2df_t ab, ba, cc, dd, acbc, bdad;
338
339 ab.v = _mm_load_pd( ( double* )beta );
340 ba.v = _mm_shuffle_pd( ab.v, ab.v, _MM_SHUFFLE2 (0,1) );
341
342 cc.v = _mm_loaddup_pd( ( double* )&(rho_xz->real) );
343 dd.v = _mm_loaddup_pd( ( double* )&(rho_xz->imag) );
344 acbc.v = ab.v * cc.v;
345 bdad.v = ba.v * dd.v;
346 r1v.v = _mm_addsub_pd( acbc.v, bdad.v );
347
348 cc.v = _mm_loaddup_pd( ( double* )&(rho_yz->real) );
349 dd.v = _mm_loaddup_pd( ( double* )&(rho_yz->imag) );
350 acbc.v = ab.v * cc.v;
351 bdad.v = ba.v * dd.v;
352 r2v.v = _mm_addsub_pd( acbc.v, bdad.v );
353 }
354
355 //rho_xz->real = rho_xz->real + rho1.real;
356 //rho_xz->imag = rho_xz->imag + rho1.imag;
357 rho1v.v = r1v.v + rho1v.v;
358 _mm_store_pd( ( double* )rho_xz, rho1v.v );
359
360 //rho_yz->real = rho_yz->real + rho2.real;
361 //rho_yz->imag = rho_yz->imag + rho2.imag;
362 rho2v.v = r2v.v + rho2v.v;
363 _mm_store_pd( ( double* )rho_yz, rho2v.v );
364}

References bl1_is_conj(), v2df_t::d, i, rho_xz, rho_yz, v2df_t::v, x1, y1, and z1.

Referenced by FLA_Fused_Ahx_Axpy_Ax_opz_var1(), FLA_Fused_Gerc2_Ahx_Axpy_Ax_opz_var1(), and FLA_Fused_UYx_ZVx_opz_var1().

◆ bl1_zdotsv3()

void bl1_zdotsv3 ( conj1_t  conjxyw,
int  n,
dcomplex * x,
int  inc_x,
dcomplex * y,
int  inc_y,
dcomplex * w,
int  inc_w,
dcomplex * z,
int  inc_z,
dcomplex * beta,
dcomplex * rho_xz,
dcomplex * rho_yz,
dcomplex * rho_wz 
)
301{
306 int i;
311 v2df_t x1v, x1rv;
312 v2df_t y1v, y1rv;
313 v2df_t w1v, w1rv;
314
315 x1 = x;
316 y1 = y;
317 w1 = w;
318 z1 = z;
319
320 rho1v.v = _mm_setzero_pd();
321 rho2v.v = _mm_setzero_pd();
322 rho3v.v = _mm_setzero_pd();
323
324 if ( bl1_is_conj( conjxyw ) )
325 {
327
328 for ( i = 0; i < n; ++i )
329 {
330 z11v.v = _mm_loaddup_pd( ( double* )&(z1->real) );
331 z12v.v = _mm_loaddup_pd( ( double* )&(z1->imag) );
332
333 x1v.v = _mm_load_pd( ( double* )x1 );
334 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
335 bcac.v = x1rv.v * z11v.v;
336 adbd.v = x1v.v * z12v.v;
337 rho1v.v = rho1v.v + _mm_addsub_pd( bcac.v, adbd.v );
338
339 y1v.v = _mm_load_pd( ( double* )y1 );
340 y1rv.v = _mm_shuffle_pd( y1v.v, y1v.v, _MM_SHUFFLE2 (0,1) );
341 bcac.v = y1rv.v * z11v.v;
342 adbd.v = y1v.v * z12v.v;
343 rho2v.v = rho2v.v + _mm_addsub_pd( bcac.v, adbd.v );
344
345 w1v.v = _mm_load_pd( ( double* )w1 );
346 w1rv.v = _mm_shuffle_pd( w1v.v, w1v.v, _MM_SHUFFLE2 (0,1) );
347 bcac.v = w1rv.v * z11v.v;
348 adbd.v = w1v.v * z12v.v;
349 rho3v.v = rho3v.v + _mm_addsub_pd( bcac.v, adbd.v );
350
351 x1 += inc_x;
352 y1 += inc_y;
353 w1 += inc_w;
354 z1 += inc_z;
355 }
356
357 rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
358 rho2v.v = _mm_shuffle_pd( rho2v.v, rho2v.v, _MM_SHUFFLE2 (0,1) );
359 rho3v.v = _mm_shuffle_pd( rho3v.v, rho3v.v, _MM_SHUFFLE2 (0,1) );
360
361 rho1v.d[1] = -rho1v.d[1];
362 rho2v.d[1] = -rho2v.d[1];
363 rho3v.d[1] = -rho3v.d[1];
364 }
365 else
366 {
368
369 for ( i = 0; i < n; ++i )
370 {
371 z11v.v = _mm_loaddup_pd( ( double* )&(z1->real) );
372 z12v.v = _mm_loaddup_pd( ( double* )&(z1->imag) );
373
374 x1v.v = _mm_load_pd( ( double* )x1 );
375 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
376 cada.v = x1v.v * z11v.v;
377 dbcb.v = x1rv.v * z12v.v;
378 rho1v.v = rho1v.v + _mm_addsub_pd( cada.v, dbcb.v );
379
380 y1v.v = _mm_load_pd( ( double* )y1 );
381 y1rv.v = _mm_shuffle_pd( y1v.v, y1v.v, _MM_SHUFFLE2 (0,1) );
382 cada.v = y1v.v * z11v.v;
383 dbcb.v = y1rv.v * z12v.v;
384 rho2v.v = rho2v.v + _mm_addsub_pd( cada.v, dbcb.v );
385
386 w1v.v = _mm_load_pd( ( double* )w1 );
387 w1rv.v = _mm_shuffle_pd( w1v.v, w1v.v, _MM_SHUFFLE2 (0,1) );
388 cada.v = w1v.v * z11v.v;
389 dbcb.v = w1rv.v * z12v.v;
390 rho3v.v = rho3v.v + _mm_addsub_pd( cada.v, dbcb.v );
391
392 x1 += inc_x;
393 y1 += inc_y;
394 w1 += inc_w;
395 z1 += inc_z;
396 }
397 }
398
399 //bl1_zscals( beta, rho_xz );
400 //bl1_zscals( beta, rho_yz );
401 //bl1_zscals( beta, rho_wz );
402 {
403 v2df_t ab, ba, cc, dd, acbc, bdad;
404
405 ab.v = _mm_load_pd( ( double* )beta );
406 ba.v = _mm_shuffle_pd( ab.v, ab.v, _MM_SHUFFLE2 (0,1) );
407
408 cc.v = _mm_loaddup_pd( ( double* )&(rho_xz->real) );
409 dd.v = _mm_loaddup_pd( ( double* )&(rho_xz->imag) );
410 acbc.v = ab.v * cc.v;
411 bdad.v = ba.v * dd.v;
412 r1v.v = _mm_addsub_pd( acbc.v, bdad.v );
413
414 cc.v = _mm_loaddup_pd( ( double* )&(rho_yz->real) );
415 dd.v = _mm_loaddup_pd( ( double* )&(rho_yz->imag) );
416 acbc.v = ab.v * cc.v;
417 bdad.v = ba.v * dd.v;
418 r2v.v = _mm_addsub_pd( acbc.v, bdad.v );
419
420 cc.v = _mm_loaddup_pd( ( double* )&(rho_wz->real) );
421 dd.v = _mm_loaddup_pd( ( double* )&(rho_wz->imag) );
422 acbc.v = ab.v * cc.v;
423 bdad.v = ba.v * dd.v;
424 r3v.v = _mm_addsub_pd( acbc.v, bdad.v );
425 }
426
427 //rho_xz->real = rho_xz->real + rho1.real;
428 //rho_xz->imag = rho_xz->imag + rho1.imag;
429 rho1v.v = r1v.v + rho1v.v;
430 _mm_store_pd( ( double* )rho_xz, rho1v.v );
431
432 //rho_yz->real = rho_yz->real + rho2.real;
433 //rho_yz->imag = rho_yz->imag + rho2.imag;
434 rho2v.v = r2v.v + rho2v.v;
435 _mm_store_pd( ( double* )rho_yz, rho2v.v );
436
437 //rho_wz->real = rho_wz->real + rho3.real;
438 //rho_wz->imag = rho_wz->imag + rho3.imag;
439 rho3v.v = r3v.v + rho3v.v;
440 _mm_store_pd( ( double* )rho_wz, rho3v.v );
441}

References bl1_is_conj(), v2df_t::d, i, rho_wz, rho_xz, rho_yz, v2df_t::v, w1, x1, y1, and z1.

Referenced by FLA_Fused_Uhu_Yhu_Zhu_opz_var1().

◆ bl1_zdotv2axpyv2b()

void bl1_zdotv2axpyv2b ( int  n,
dcomplex * a1,
int  inc_a1,
dcomplex * a2,
int  inc_a2,
dcomplex * x,
int  inc_x,
dcomplex * kappa1,
dcomplex * kappa2,
dcomplex * rho1,
dcomplex * rho2,
dcomplex * w,
int  inc_w 
)
341{
346 int i;
347
354 v2df_t x1v, x1rv;
355 v2df_t w1v;
358
359 if ( inc_a1 != 1 ||
360 inc_a2 != 1 ||
361 inc_x != 1 ||
362 inc_w != 1 ) bl1_abort();
363
364 alpha1 = a1;
365 alpha2 = a2;
366 chi1 = x;
367 omega1 = w;
368
369 rho1v.v = _mm_setzero_pd();
370 rho2v.v = _mm_setzero_pd();
371
372 kappa1v.v = _mm_load_pd( ( double* )kappa1 );
374 kappa2v.v = _mm_load_pd( ( double* )kappa2 );
376
377 for ( i = 0; i < n; ++i )
378 {
379 //dcomplex omega1_c = *omega1;
380 w1v.v = _mm_load_pd( ( double* )omega1 );
381
382 //dcomplex chi1_c = *chi1;
383 x1v.v = _mm_load_pd( ( double* )chi1 );
384
385
386 //dcomplex alpha1_c = *alpha1;
387 a11v.v = _mm_loaddup_pd( ( double* )&(alpha1->real) );
388 a12v.v = _mm_loaddup_pd( ( double* )&(alpha1->imag) );
389
390 //rho1_c.real += alpha1_c.real * chi1_c.real - -alpha1_c.imag * chi1_c.imag;
391 //rho1_c.imag += alpha1_c.real * chi1_c.imag + -alpha1_c.imag * chi1_c.real;
392 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
393 adac.v = a11v.v * x1rv.v;
394 bcbd.v = a12v.v * x1v.v;
395 rho1v.v = rho1v.v + _mm_addsub_pd( adac.v, bcbd.v );
396
397 //omega1_c.real += kappa1_c.real * alpha1_c.real - kappa1_c.imag * alpha1_c.imag;
398 //omega1_c.imag += kappa1_c.real * alpha1_c.imag + kappa1_c.imag * alpha1_c.real;
399 acbc.v = kappa1v.v * a11v.v;
400 bdad.v = kappa1rv.v * a12v.v;
401 w1v.v += _mm_addsub_pd( acbc.v, bdad.v );
402
403
404 //dcomplex alpha2_c = *alpha2;
405 a21v.v = _mm_loaddup_pd( ( double* )&(alpha2->real) );
406 a22v.v = _mm_loaddup_pd( ( double* )&(alpha2->imag) );
407
408 //rho2_c.real += alpha2_c.real * chi1_c.real - -alpha2_c.imag * chi1_c.imag;
409 //rho2_c.imag += alpha2_c.real * chi1_c.imag + -alpha2_c.imag * chi1_c.real;
410 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
411 adac.v = a21v.v * x1rv.v;
412 bcbd.v = a22v.v * x1v.v;
413 rho2v.v = rho2v.v + _mm_addsub_pd( adac.v, bcbd.v );
414
415 //omega1_c.real += kappa2_c.real * alpha2_c.real - kappa2_c.imag * alpha2_c.imag;
416 //omega1_c.imag += kappa2_c.real * alpha2_c.imag + kappa2_c.imag * alpha2_c.real;
417 acbc.v = kappa2v.v * a21v.v;
418 bdad.v = kappa2rv.v * a22v.v;
419 w1v.v += _mm_addsub_pd( acbc.v, bdad.v );
420
421
422 //*omega1 = omega1_c;
423 _mm_store_pd( ( double* )omega1, w1v.v );
424
425
426 //alpha1 += inc_a1;
427 //alpha2 += inc_a2;
428 //chi1 += inc_x;
429 //omega1 += inc_w;
430 alpha1 += 1;
431 alpha2 += 1;
432 chi1 += 1;
433 omega1 += 1;
434 }
435
436 rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
437 rho2v.v = _mm_shuffle_pd( rho2v.v, rho2v.v, _MM_SHUFFLE2 (0,1) );
438
439 //*rho1 = rho1_c;
440 //*rho2 = rho2_c;
441 _mm_store_pd( ( double* )rho1, rho1v.v );
442 _mm_store_pd( ( double* )rho2, rho2v.v );
443}

References alpha1, alpha2, bl1_abort(), chi1, i, omega1, rho1, rho2, and v2df_t::v.

Referenced by FLA_Fused_Ahx_Ax_opz_var1().