libflame revision_anchor
Functions
bl1_gemm.c File Reference

(r)

Functions

void bl1_sgemm (trans1_t transa, trans1_t transb, int m, int k, int n, float *alpha, float *a, int a_rs, int a_cs, float *b, int b_rs, int b_cs, float *beta, float *c, int c_rs, int c_cs)
 
void bl1_dgemm (trans1_t transa, trans1_t transb, int m, int k, int n, double *alpha, double *a, int a_rs, int a_cs, double *b, int b_rs, int b_cs, double *beta, double *c, int c_rs, int c_cs)
 
void bl1_cgemm (trans1_t transa, trans1_t transb, int m, int k, int n, scomplex *alpha, scomplex *a, int a_rs, int a_cs, scomplex *b, int b_rs, int b_cs, scomplex *beta, scomplex *c, int c_rs, int c_cs)
 
void bl1_zgemm (trans1_t transa, trans1_t transb, int m, int k, int n, dcomplex *alpha, dcomplex *a, int a_rs, int a_cs, dcomplex *b, int b_rs, int b_cs, dcomplex *beta, dcomplex *c, int c_rs, int c_cs)
 
void bl1_sgemm_blas (trans1_t transa, trans1_t transb, int m, int n, int k, float *alpha, float *a, int lda, float *b, int ldb, float *beta, float *c, int ldc)
 
void bl1_dgemm_blas (trans1_t transa, trans1_t transb, int m, int n, int k, double *alpha, double *a, int lda, double *b, int ldb, double *beta, double *c, int ldc)
 
void bl1_cgemm_blas (trans1_t transa, trans1_t transb, int m, int n, int k, scomplex *alpha, scomplex *a, int lda, scomplex *b, int ldb, scomplex *beta, scomplex *c, int ldc)
 
void bl1_zgemm_blas (trans1_t transa, trans1_t transb, int m, int n, int k, dcomplex *alpha, dcomplex *a, int lda, dcomplex *b, int ldb, dcomplex *beta, dcomplex *c, int ldc)
 

Function Documentation

◆ bl1_cgemm()

void bl1_cgemm ( trans1_t  transa,
trans1_t  transb,
int  m,
int  k,
int  n,
scomplex alpha,
scomplex a,
int  a_rs,
int  a_cs,
scomplex b,
int  b_rs,
int  b_cs,
scomplex beta,
scomplex c,
int  c_rs,
int  c_cs 
)
536{
537 int m_save = m;
538 int n_save = n;
539 scomplex* a_save = a;
540 scomplex* b_save = b;
541 scomplex* c_save = c;
542 int a_rs_save = a_rs;
543 int a_cs_save = a_cs;
544 int b_rs_save = b_rs;
545 int b_cs_save = b_cs;
546 int c_rs_save = c_rs;
547 int c_cs_save = c_cs;
548 scomplex zero = bl1_c0();
549 scomplex one = bl1_c1();
555 int lda, inca;
556 int ldb, incb;
557 int ldc, incc;
558 int lda_conj, inca_conj;
559 int ldb_conj, incb_conj;
561 int m_gemm, n_gemm;
563 int a_was_copied;
564 int b_was_copied;
565
566 // Return early if possible.
567 if ( bl1_zero_dim3( m, k, n ) )
568 {
570 m,
571 n,
572 beta,
573 c, c_rs, c_cs );
574 return;
575 }
576
577 // If necessary, allocate, initialize, and use a temporary contiguous
578 // copy of each matrix rather than the original matrices.
580 m,
581 k,
583 &a, &a_rs, &a_cs );
584
586 k,
587 n,
589 &b, &b_rs, &b_cs );
590
592 n,
594 &c, &c_rs, &c_cs );
595
596 // Figure out whether A and/or B was copied to contiguous memory. This
597 // is used later to prevent redundant copying.
598 a_was_copied = ( a != a_save );
599 b_was_copied = ( b != b_save );
600
601 // These are used to track the original values of a and b prior to any
602 // operand swapping that might take place. This is necessary for proper
603 // freeing of memory when one is a temporary contiguous matrix.
604 a_unswap = a;
605 b_unswap = b;
606
607 // These are used to track the dimensions of the product of the
608 // A and B operands to the BLAS invocation of gemm. These differ
609 // from m and n when the operands need to be swapped.
610 m_gemm = m;
611 n_gemm = n;
612
613 // Initialize with values assuming column-major storage.
614 lda = a_cs;
615 inca = a_rs;
616 ldb = b_cs;
617 incb = b_rs;
618 ldc = c_cs;
619 incc = c_rs;
620
621 // Adjust the parameters based on the storage of each matrix.
622 if ( bl1_is_col_storage( c_rs, c_cs ) )
623 {
624 if ( bl1_is_col_storage( a_rs, a_cs ) )
625 {
626 if ( bl1_is_col_storage( b_rs, b_cs ) )
627 {
628 // requested operation: C_c += tr( A_c ) * tr( B_c )
629 // effective operation: C_c += tr( A_c ) * tr( B_c )
630 }
631 else // if ( bl1_is_row_storage( b_rs, b_cs ) )
632 {
633
634 // requested operation: C_c += tr( A_c ) * tr( B_r )
635 // effective operation: C_c += tr( A_c ) * tr( B_c )^T
637
639 }
640 }
641 else // if ( bl1_is_row_storage( a_rs, a_cs ) )
642 {
643 if ( bl1_is_col_storage( b_rs, b_cs ) )
644 {
645 // requested operation: C_c += tr( A_r ) * tr( B_c )
646 // effective operation: C_c += tr( A_r )^T * tr( B_c )
648
650 }
651 else // if ( bl1_is_row_storage( b_rs, b_cs ) )
652 {
653 // requested operation: C_c += tr( A_r ) * tr( B_r )
654 // effective operation: C_c += ( tr( B_c ) * tr( A_c ) )^T
657
663
666 }
667 }
668 }
669 else // if ( bl1_is_row_storage( c_rs, c_cs ) )
670 {
671 if ( bl1_is_col_storage( a_rs, a_cs ) )
672 {
673 if ( bl1_is_col_storage( b_rs, b_cs ) )
674 {
675 // requested operation: C_r += tr( A_c ) * tr( B_c )
676 // effective operation: C_c += ( tr( A_c ) * tr( B_c ) )^T
678
679 bl1_swap_ints( m, n );
680
682 }
683 else // if ( bl1_is_row_storage( b_rs, b_cs ) )
684 {
685 // requested operation: C_r += tr( A_c ) * tr( B_r )
686 // effective operation: C_c += tr( B_c ) * tr( A_c )^T
689
691
692 bl1_swap_ints( m, n );
699 }
700 }
701 else // if ( bl1_is_row_storage( a_rs, a_cs ) )
702 {
703 if ( bl1_is_col_storage( b_rs, b_cs ) )
704 {
705 // requested operation: C_r += tr( A_r ) * tr( B_c )
706 // effective operation: C_c += tr( B_c )^T * tr( A_c )
709
711
712 bl1_swap_ints( m, n );
719 }
720 else // if ( bl1_is_row_storage( b_rs, b_cs ) )
721 {
722 // requested operation: C_r += tr( A_r ) * tr( B_r )
723 // effective operation: C_c += tr( B_c ) * tr( A_c )
727
728 bl1_swap_ints( m, n );
735 }
736 }
737 }
738
739 // We need a temporary matrix for the case where A is conjugated.
740 a_conj = a;
741 lda_conj = lda;
742 inca_conj = inca;
743
744 // If transa indicates conjugate-no-transpose and A was not already
745 // copied, then copy and conjugate it to a temporary matrix. Otherwise,
746 // if transa indicates conjugate-no-transpose and A was already copied,
747 // just conjugate it.
749 {
752 inca_conj = 1;
753
755 m_gemm,
756 k,
757 a, inca, lda,
759 }
760 else if ( bl1_is_conjnotrans( transa ) && a_was_copied )
761 {
763 k,
765 }
766
767 // We need a temporary matrix for the case where B is conjugated.
768 b_conj = b;
769 ldb_conj = ldb;
770 incb_conj = incb;
771
772 // If transb indicates conjugate-no-transpose and B was not already
773 // copied, then copy and conjugate it to a temporary matrix. Otherwise,
774 // if transb indicates conjugate-no-transpose and B was already copied,
775 // just conjugate it.
777 {
779 ldb_conj = k;
780 incb_conj = 1;
781
783 k,
784 n_gemm,
785 b, incb, ldb,
787 }
788 else if ( bl1_is_conjnotrans( transb ) && b_was_copied )
789 {
790 bl1_cconjm( k,
791 n_gemm,
793 }
794
795 // There are two cases where we need to perform the gemm and then axpy
796 // the result into C with a transposition. We handle those cases here.
797 if ( gemm_needs_axpyt )
798 {
799 // We need a temporary matrix for holding C^T. Notice that m and n
800 // represent the dimensions of C, while m_gemm and n_gemm are the
801 // dimensions of the actual product op(A)*op(B), which may be n-by-m
802 // since the operands may have been swapped.
805 incc_trans = 1;
806
807 // Compute tr( A ) * tr( B ), where A and B may have been swapped
808 // to reference the other, and store the result in C_trans.
810 transb,
811 m_gemm,
812 n_gemm,
813 k,
814 alpha,
817 &zero,
819
820 // Scale C by beta.
822 m,
823 n,
824 beta,
825 c, incc, ldc );
826
827 // And finally, accumulate the matrix product in C_trans into C
828 // with a transpose.
830 m,
831 n,
832 &one,
834 c, incc, ldc );
835
836 // Free the temporary matrix for C.
838 }
839 else // no extra axpyt step needed
840 {
842 transb,
843 m_gemm,
844 n_gemm,
845 k,
846 alpha,
849 beta,
850 c, ldc );
851 }
852
854 bl1_cfree( a_conj );
855
857 bl1_cfree( b_conj );
858
859 // Free any temporary contiguous matrices, copying the result back to
860 // the original matrix.
862 &a_unswap, &a_rs, &a_cs );
863
865 &b_unswap, &b_rs, &b_cs );
866
868 n_save,
870 &c, &c_rs, &c_cs );
871}
int i
Definition bl1_axmyv2.c:145
void bl1_caxpymt(trans1_t trans, int m, int n, scomplex *alpha, scomplex *a, int a_rs, int a_cs, scomplex *b, int b_rs, int b_cs)
Definition bl1_axpymt.c:149
void bl1_cconjm(int m, int n, scomplex *a, int a_rs, int a_cs)
Definition bl1_conjm.c:23
void bl1_ccopymt(trans1_t trans, int m, int n, scomplex *a, int a_rs, int a_cs, scomplex *b, int b_rs, int b_cs)
Definition bl1_copymt.c:215
void bl1_cgemm_blas(trans1_t transa, trans1_t transb, int m, int n, int k, scomplex *alpha, scomplex *a, int lda, scomplex *b, int ldb, scomplex *beta, scomplex *c, int ldc)
Definition bl1_gemm.c:1295
void bl1_cscalm(conj1_t conj, int m, int n, scomplex *alpha, scomplex *a, int a_rs, int a_cs)
Definition bl1_scalm.c:169
int bl1_is_col_storage(int rs, int cs)
Definition bl1_is.c:90
int bl1_zero_dim3(int m, int k, int n)
Definition bl1_is.c:123
int bl1_is_conjnotrans(trans1_t trans)
Definition bl1_is.c:25
void bl1_cfree_contigm(scomplex *a_save, int a_rs_save, int a_cs_save, scomplex **a, int *a_rs, int *a_cs)
Definition bl1_free_contigm.c:45
scomplex bl1_c1(void)
Definition bl1_constants.c:61
void bl1_cfree(scomplex *p)
Definition bl1_free.c:40
void bl1_ccreate_contigm(int m, int n, scomplex *a_save, int a_rs_save, int a_cs_save, scomplex **a, int *a_rs, int *a_cs)
Definition bl1_create_contigm.c:81
scomplex * bl1_callocm(unsigned int m, unsigned int n)
Definition bl1_allocm.c:40
scomplex bl1_c0(void)
Definition bl1_constants.c:125
void bl1_cfree_saved_contigm(int m, int n, scomplex *a_save, int a_rs_save, int a_cs_save, scomplex **a, int *a_rs, int *a_cs)
Definition bl1_free_saved_contigm.c:59
void bl1_ccreate_contigmt(trans1_t trans_dims, int m, int n, scomplex *a_save, int a_rs_save, int a_cs_save, scomplex **a, int *a_rs, int *a_cs)
Definition bl1_create_contigmt.c:89
@ BLIS1_TRANSPOSE
Definition blis_type_defs.h:55
@ BLIS1_CONJ_NO_TRANSPOSE
Definition blis_type_defs.h:56
@ BLIS1_NO_CONJUGATE
Definition blis_type_defs.h:81
Definition blis_type_defs.h:133

References bl1_c0(), bl1_c1(), bl1_callocm(), bl1_caxpymt(), bl1_cconjm(), bl1_ccopymt(), bl1_ccreate_contigm(), bl1_ccreate_contigmt(), bl1_cfree(), bl1_cfree_contigm(), bl1_cfree_saved_contigm(), bl1_cgemm_blas(), bl1_cscalm(), bl1_is_col_storage(), bl1_is_conjnotrans(), bl1_zero_dim3(), BLIS1_CONJ_NO_TRANSPOSE, BLIS1_NO_CONJUGATE, and BLIS1_TRANSPOSE.

Referenced by FLA_Gemm_external().

◆ bl1_cgemm_blas()

void bl1_cgemm_blas ( trans1_t  transa,
trans1_t  transb,
int  m,
int  n,
int  k,
scomplex alpha,
scomplex a,
int  lda,
scomplex b,
int  ldb,
scomplex beta,
scomplex c,
int  ldc 
)
1296{
1297#ifdef BLIS1_ENABLE_CBLAS_INTERFACES
1301
1304
1308 m,
1309 n,
1310 k,
1311 alpha,
1312 a, lda,
1313 b, ldb,
1314 beta,
1315 c, ldc );
1316#else
1317 char blas_transa;
1318 char blas_transb;
1319
1322
1324 &blas_transb,
1325 &m,
1326 &n,
1327 &k,
1328 alpha,
1329 a, &lda,
1330 b, &ldb,
1331 beta,
1332 c, &ldc );
1333#endif
1334}
void F77_cgemm(char *transa, char *transb, int *m, int *n, int *k, scomplex *alpha, scomplex *a, int *lda, scomplex *b, int *ldb, scomplex *beta, scomplex *c, int *ldc)
CBLAS_ORDER
Definition blis_prototypes_cblas.h:17
@ CblasColMajor
Definition blis_prototypes_cblas.h:17
void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc)
CBLAS_TRANSPOSE
Definition blis_prototypes_cblas.h:18
void bl1_param_map_to_netlib_trans(trans1_t blis_trans, void *blas_trans)
Definition bl1_param_map.c:15

References bl1_param_map_to_netlib_trans(), cblas_cgemm(), CblasColMajor, and F77_cgemm().

Referenced by bl1_cgemm().

◆ bl1_dgemm()

void bl1_dgemm ( trans1_t  transa,
trans1_t  transb,
int  m,
int  k,
int  n,
double alpha,
double a,
int  a_rs,
int  a_cs,
double b,
int  b_rs,
int  b_cs,
double beta,
double c,
int  c_rs,
int  c_cs 
)
275{
276 int m_save = m;
277 int n_save = n;
278 double* a_save = a;
279 double* b_save = b;
280 double* c_save = c;
281 int a_rs_save = a_rs;
282 int a_cs_save = a_cs;
283 int b_rs_save = b_rs;
284 int b_cs_save = b_cs;
285 int c_rs_save = c_rs;
286 int c_cs_save = c_cs;
287 double zero = bl1_d0();
288 double one = bl1_d1();
289 double* a_unswap;
290 double* b_unswap;
291 double* c_trans;
292 int lda, inca;
293 int ldb, incb;
294 int ldc, incc;
296 int m_gemm, n_gemm;
298
299 // Return early if possible.
300 if ( bl1_zero_dim3( m, k, n ) )
301 {
303 m,
304 n,
305 beta,
306 c, c_rs, c_cs );
307 return;
308 }
309
310 // If necessary, allocate, initialize, and use a temporary contiguous
311 // copy of each matrix rather than the original matrices.
313 m,
314 k,
316 &a, &a_rs, &a_cs );
317
319 k,
320 n,
322 &b, &b_rs, &b_cs );
323
325 n,
327 &c, &c_rs, &c_cs );
328
329 // These are used to track the original values of a and b prior to any
330 // operand swapping that might take place. This is necessary for proper
331 // freeing of memory when one is a temporary contiguous matrix.
332 a_unswap = a;
333 b_unswap = b;
334
335 // These are used to track the dimensions of the product of the
336 // A and B operands to the BLAS invocation of gemm. These differ
337 // from m and n when the operands need to be swapped.
338 m_gemm = m;
339 n_gemm = n;
340
341 // Initialize with values assuming column-major storage.
342 lda = a_cs;
343 inca = a_rs;
344 ldb = b_cs;
345 incb = b_rs;
346 ldc = c_cs;
347 incc = c_rs;
348
349 // Adjust the parameters based on the storage of each matrix.
350 if ( bl1_is_col_storage( c_rs, c_cs ) )
351 {
352 if ( bl1_is_col_storage( a_rs, a_cs ) )
353 {
354 if ( bl1_is_col_storage( b_rs, b_cs ) )
355 {
356 // requested operation: C_c += tr( A_c ) * tr( B_c )
357 // effective operation: C_c += tr( A_c ) * tr( B_c )
358 }
359 else // if ( bl1_is_row_storage( b_rs, b_cs ) )
360 {
361
362 // requested operation: C_c += tr( A_c ) * tr( B_r )
363 // effective operation: C_c += tr( A_c ) * tr( B_c )^T
365
367 }
368 }
369 else // if ( bl1_is_row_storage( a_rs, a_cs ) )
370 {
371 if ( bl1_is_col_storage( b_rs, b_cs ) )
372 {
373 // requested operation: C_c += tr( A_r ) * tr( B_c )
374 // effective operation: C_c += tr( A_r )^T * tr( B_c )
376
378 }
379 else // if ( bl1_is_row_storage( b_rs, b_cs ) )
380 {
381 // requested operation: C_c += tr( A_r ) * tr( B_r )
382 // effective operation: C_c += ( tr( B_c ) * tr( A_c ) )^T
385
390
393 }
394 }
395 }
396 else // if ( bl1_is_row_storage( c_rs, c_cs ) )
397 {
398 if ( bl1_is_col_storage( a_rs, a_cs ) )
399 {
400 if ( bl1_is_col_storage( b_rs, b_cs ) )
401 {
402 // requested operation: C_r += tr( A_c ) * tr( B_c )
403 // effective operation: C_c += ( tr( A_c ) * tr( B_c ) )^T
405
406 bl1_swap_ints( m, n );
407
409 }
410 else // if ( bl1_is_row_storage( b_rs, b_cs ) )
411 {
412 // requested operation: C_r += tr( A_c ) * tr( B_r )
413 // effective operation: C_c += tr( B_c ) * tr( A_c )^T
416
418
419 bl1_swap_ints( m, n );
425 }
426 }
427 else // if ( bl1_is_row_storage( a_rs, a_cs ) )
428 {
429 if ( bl1_is_col_storage( b_rs, b_cs ) )
430 {
431 // requested operation: C_r += tr( A_r ) * tr( B_c )
432 // effective operation: C_c += tr( B_c )^T * tr( A_c )
435
437
438 bl1_swap_ints( m, n );
444 }
445 else // if ( bl1_is_row_storage( b_rs, b_cs ) )
446 {
447 // requested operation: C_r += tr( A_r ) * tr( B_r )
448 // effective operation: C_c += tr( B_c ) * tr( A_c )
452
453 bl1_swap_ints( m, n );
459 }
460 }
461 }
462
463 // There are two cases where we need to perform the gemm and then axpy
464 // the result into C with a transposition. We handle those cases here.
465 if ( gemm_needs_axpyt )
466 {
467 // We need a temporary matrix for holding C^T. Notice that m and n
468 // represent the dimensions of C, while m_gemm and n_gemm are the
469 // dimensions of the actual product op(A)*op(B), which may be n-by-m
470 // since the operands may have been swapped.
473 incc_trans = 1;
474
475 // Compute tr( A ) * tr( B ), where A and B may have been swapped
476 // to reference the other, and store the result in C_trans.
478 transb,
479 m_gemm,
480 n_gemm,
481 k,
482 alpha,
483 a, lda,
484 b, ldb,
485 &zero,
487
488 // Scale C by beta.
490 m,
491 n,
492 beta,
493 c, incc, ldc );
494
495 // And finally, accumulate the matrix product in C_trans into C
496 // with a transpose.
498 m,
499 n,
500 &one,
502 c, incc, ldc );
503
504 // Free the temporary matrix for C.
506 }
507 else // no extra axpyt step needed
508 {
510 transb,
511 m_gemm,
512 n_gemm,
513 k,
514 alpha,
515 a, lda,
516 b, ldb,
517 beta,
518 c, ldc );
519 }
520
521 // Free any temporary contiguous matrices, copying the result back to
522 // the original matrix.
524 &a_unswap, &a_rs, &a_cs );
525
527 &b_unswap, &b_rs, &b_cs );
528
530 n_save,
532 &c, &c_rs, &c_cs );
533}
void bl1_daxpymt(trans1_t trans, int m, int n, double *alpha, double *a, int a_rs, int a_cs, double *b, int b_rs, int b_cs)
Definition bl1_axpymt.c:81
void bl1_dgemm_blas(trans1_t transa, trans1_t transb, int m, int n, int k, double *alpha, double *a, int lda, double *b, int ldb, double *beta, double *c, int ldc)
Definition bl1_gemm.c:1254
void bl1_dscalm(conj1_t conj, int m, int n, double *alpha, double *a, int a_rs, int a_cs)
Definition bl1_scalm.c:65
void bl1_dfree_contigm(double *a_save, int a_rs_save, int a_cs_save, double **a, int *a_rs, int *a_cs)
Definition bl1_free_contigm.c:29
void bl1_dcreate_contigm(int m, int n, double *a_save, int a_rs_save, int a_cs_save, double **a, int *a_rs, int *a_cs)
Definition bl1_create_contigm.c:47
void bl1_dfree_saved_contigm(int m, int n, double *a_save, int a_rs_save, int a_cs_save, double **a, int *a_rs, int *a_cs)
Definition bl1_free_saved_contigm.c:36
void bl1_dcreate_contigmt(trans1_t trans_dims, int m, int n, double *a_save, int a_rs_save, int a_cs_save, double **a, int *a_rs, int *a_cs)
Definition bl1_create_contigmt.c:51
double bl1_d0(void)
Definition bl1_constants.c:118
void bl1_dfree(double *p)
Definition bl1_free.c:35
double * bl1_dallocm(unsigned int m, unsigned int n)
Definition bl1_allocm.c:35
double bl1_d1(void)
Definition bl1_constants.c:54

References bl1_d0(), bl1_d1(), bl1_dallocm(), bl1_daxpymt(), bl1_dcreate_contigm(), bl1_dcreate_contigmt(), bl1_dfree(), bl1_dfree_contigm(), bl1_dfree_saved_contigm(), bl1_dgemm_blas(), bl1_dscalm(), bl1_is_col_storage(), bl1_zero_dim3(), BLIS1_NO_CONJUGATE, and BLIS1_TRANSPOSE.

Referenced by FLA_Bsvd_v_opd_var2(), FLA_Bsvd_v_opz_var2(), FLA_Gemm_external(), FLA_Tevd_v_opd_var2(), and FLA_Tevd_v_opz_var2().

◆ bl1_dgemm_blas()

void bl1_dgemm_blas ( trans1_t  transa,
trans1_t  transb,
int  m,
int  n,
int  k,
double alpha,
double a,
int  lda,
double b,
int  ldb,
double beta,
double c,
int  ldc 
)
1255{
1256#ifdef BLIS1_ENABLE_CBLAS_INTERFACES
1260
1263
1267 m,
1268 n,
1269 k,
1270 *alpha,
1271 a, lda,
1272 b, ldb,
1273 *beta,
1274 c, ldc );
1275#else
1276 char blas_transa;
1277 char blas_transb;
1278
1281
1283 &blas_transb,
1284 &m,
1285 &n,
1286 &k,
1287 alpha,
1288 a, &lda,
1289 b, &ldb,
1290 beta,
1291 c, &ldc );
1292#endif
1293}
void F77_dgemm(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc)
void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc)

References bl1_param_map_to_netlib_trans(), cblas_dgemm(), CblasColMajor, and F77_dgemm().

Referenced by bl1_dgemm().

◆ bl1_sgemm()

void bl1_sgemm ( trans1_t  transa,
trans1_t  transb,
int  m,
int  k,
int  n,
float alpha,
float a,
int  a_rs,
int  a_cs,
float b,
int  b_rs,
int  b_cs,
float beta,
float c,
int  c_rs,
int  c_cs 
)
14{
15 int m_save = m;
16 int n_save = n;
17 float* a_save = a;
18 float* b_save = b;
19 float* c_save = c;
20 int a_rs_save = a_rs;
21 int a_cs_save = a_cs;
22 int b_rs_save = b_rs;
23 int b_cs_save = b_cs;
24 int c_rs_save = c_rs;
25 int c_cs_save = c_cs;
26 float zero = bl1_s0();
27 float one = bl1_s1();
28 float* a_unswap;
29 float* b_unswap;
30 float* c_trans;
31 int lda, inca;
32 int ldb, incb;
33 int ldc, incc;
35 int m_gemm, n_gemm;
37
38 // Return early if possible.
39 if ( bl1_zero_dim3( m, k, n ) )
40 {
42 m,
43 n,
44 beta,
45 c, c_rs, c_cs );
46 return;
47 }
48
49 // If necessary, allocate, initialize, and use a temporary contiguous
50 // copy of each matrix rather than the original matrices.
52 m,
53 k,
55 &a, &a_rs, &a_cs );
56
58 k,
59 n,
61 &b, &b_rs, &b_cs );
62
64 n,
66 &c, &c_rs, &c_cs );
67
68 // These are used to track the original values of a and b prior to any
69 // operand swapping that might take place. This is necessary for proper
70 // freeing of memory when one is a temporary contiguous matrix.
71 a_unswap = a;
72 b_unswap = b;
73
74 // These are used to track the dimensions of the product of the
75 // A and B operands to the BLAS invocation of gemm. These differ
76 // from m and n when the operands need to be swapped.
77 m_gemm = m;
78 n_gemm = n;
79
80 // Initialize with values assuming column-major storage.
81 lda = a_cs;
82 inca = a_rs;
83 ldb = b_cs;
84 incb = b_rs;
85 ldc = c_cs;
86 incc = c_rs;
87
88 // Adjust the parameters based on the storage of each matrix.
90 {
92 {
94 {
95 // requested operation: C_c += tr( A_c ) * tr( B_c )
96 // effective operation: C_c += tr( A_c ) * tr( B_c )
97 }
98 else // if ( bl1_is_row_storage( b_rs, b_cs ) )
99 {
100
101 // requested operation: C_c += tr( A_c ) * tr( B_r )
102 // effective operation: C_c += tr( A_c ) * tr( B_c )^T
104
106 }
107 }
108 else // if ( bl1_is_row_storage( a_rs, a_cs ) )
109 {
110 if ( bl1_is_col_storage( b_rs, b_cs ) )
111 {
112 // requested operation: C_c += tr( A_r ) * tr( B_c )
113 // effective operation: C_c += tr( A_r )^T * tr( B_c )
115
117 }
118 else // if ( bl1_is_row_storage( b_rs, b_cs ) )
119 {
120 // requested operation: C_c += tr( A_r ) * tr( B_r )
121 // effective operation: C_c += ( tr( B_c ) * tr( A_c ) )^T
124
129
132 }
133 }
134 }
135 else // if ( bl1_is_row_storage( c_rs, c_cs ) )
136 {
137 if ( bl1_is_col_storage( a_rs, a_cs ) )
138 {
139 if ( bl1_is_col_storage( b_rs, b_cs ) )
140 {
141 // requested operation: C_r += tr( A_c ) * tr( B_c )
142 // effective operation: C_c += ( tr( A_c ) * tr( B_c ) )^T
144
145 bl1_swap_ints( m, n );
146
148 }
149 else // if ( bl1_is_row_storage( b_rs, b_cs ) )
150 {
151 // requested operation: C_r += tr( A_c ) * tr( B_r )
152 // effective operation: C_c += tr( B_c ) * tr( A_c )^T
155
157
158 bl1_swap_ints( m, n );
164 }
165 }
166 else // if ( bl1_is_row_storage( a_rs, a_cs ) )
167 {
168 if ( bl1_is_col_storage( b_rs, b_cs ) )
169 {
170 // requested operation: C_r += tr( A_r ) * tr( B_c )
171 // effective operation: C_c += tr( B_c )^T * tr( A_c )
174
176
177 bl1_swap_ints( m, n );
183 }
184 else // if ( bl1_is_row_storage( b_rs, b_cs ) )
185 {
186 // requested operation: C_r += tr( A_r ) * tr( B_r )
187 // effective operation: C_c += tr( B_c ) * tr( A_c )
191
192 bl1_swap_ints( m, n );
198 }
199 }
200 }
201
202 // There are two cases where we need to perform the gemm and then axpy
203 // the result into C with a transposition. We handle those cases here.
204 if ( gemm_needs_axpyt )
205 {
206 // We need a temporary matrix for holding C^T. Notice that m and n
207 // represent the dimensions of C, while m_gemm and n_gemm are the
208 // dimensions of the actual product op(A)*op(B), which may be n-by-m
209 // since the operands may have been swapped.
212 incc_trans = 1;
213
214 // Compute tr( A ) * tr( B ), where A and B may have been swapped
215 // to reference the other, and store the result in C_trans.
217 transb,
218 m_gemm,
219 n_gemm,
220 k,
221 alpha,
222 a, lda,
223 b, ldb,
224 &zero,
226
227 // Scale C by beta.
229 m,
230 n,
231 beta,
232 c, incc, ldc );
233
234 // And finally, accumulate the matrix product in C_trans into C
235 // with a transpose.
237 m,
238 n,
239 &one,
241 c, incc, ldc );
242
243 // Free the temporary matrix for C.
245 }
246 else // no extra axpyt step needed
247 {
249 transb,
250 m_gemm,
251 n_gemm,
252 k,
253 alpha,
254 a, lda,
255 b, ldb,
256 beta,
257 c, ldc );
258 }
259
260 // Free any temporary contiguous matrices, copying the result back to
261 // the original matrix.
263 &a_unswap, &a_rs, &a_cs );
264
266 &b_unswap, &b_rs, &b_cs );
267
269 n_save,
271 &c, &c_rs, &c_cs );
272}
void bl1_saxpymt(trans1_t trans, int m, int n, float *alpha, float *a, int a_rs, int a_cs, float *b, int b_rs, int b_cs)
Definition bl1_axpymt.c:13
void bl1_sgemm_blas(trans1_t transa, trans1_t transb, int m, int n, int k, float *alpha, float *a, int lda, float *b, int ldb, float *beta, float *c, int ldc)
Definition bl1_gemm.c:1213
void bl1_sscalm(conj1_t conj, int m, int n, float *alpha, float *a, int a_rs, int a_cs)
Definition bl1_scalm.c:13
float bl1_s0(void)
Definition bl1_constants.c:111
void bl1_sfree_contigm(float *a_save, int a_rs_save, int a_cs_save, float **a, int *a_rs, int *a_cs)
Definition bl1_free_contigm.c:13
void bl1_sfree(float *p)
Definition bl1_free.c:30
float * bl1_sallocm(unsigned int m, unsigned int n)
Definition bl1_allocm.c:30
void bl1_sfree_saved_contigm(int m, int n, float *a_save, int a_rs_save, int a_cs_save, float **a, int *a_rs, int *a_cs)
Definition bl1_free_saved_contigm.c:13
float bl1_s1(void)
Definition bl1_constants.c:47
void bl1_screate_contigm(int m, int n, float *a_save, int a_rs_save, int a_cs_save, float **a, int *a_rs, int *a_cs)
Definition bl1_create_contigm.c:13
void bl1_screate_contigmt(trans1_t trans_dims, int m, int n, float *a_save, int a_rs_save, int a_cs_save, float **a, int *a_rs, int *a_cs)
Definition bl1_create_contigmt.c:13

References bl1_is_col_storage(), bl1_s0(), bl1_s1(), bl1_sallocm(), bl1_saxpymt(), bl1_screate_contigm(), bl1_screate_contigmt(), bl1_sfree(), bl1_sfree_contigm(), bl1_sfree_saved_contigm(), bl1_sgemm_blas(), bl1_sscalm(), bl1_zero_dim3(), BLIS1_NO_CONJUGATE, and BLIS1_TRANSPOSE.

Referenced by FLA_Gemm_external().

◆ bl1_sgemm_blas()

void bl1_sgemm_blas ( trans1_t  transa,
trans1_t  transb,
int  m,
int  n,
int  k,
float alpha,
float a,
int  lda,
float b,
int  ldb,
float beta,
float c,
int  ldc 
)
1214{
1215#ifdef BLIS1_ENABLE_CBLAS_INTERFACES
1219
1222
1226 m,
1227 n,
1228 k,
1229 *alpha,
1230 a, lda,
1231 b, ldb,
1232 *beta,
1233 c, ldc );
1234#else
1235 char blas_transa;
1236 char blas_transb;
1237
1240
1242 &blas_transb,
1243 &m,
1244 &n,
1245 &k,
1246 alpha,
1247 a, &lda,
1248 b, &ldb,
1249 beta,
1250 c, &ldc );
1251#endif
1252}
void F77_sgemm(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc)
void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc)

References bl1_param_map_to_netlib_trans(), cblas_sgemm(), CblasColMajor, and F77_sgemm().

Referenced by bl1_sgemm().

◆ bl1_zgemm()

void bl1_zgemm ( trans1_t  transa,
trans1_t  transb,
int  m,
int  k,
int  n,
dcomplex alpha,
dcomplex a,
int  a_rs,
int  a_cs,
dcomplex b,
int  b_rs,
int  b_cs,
dcomplex beta,
dcomplex c,
int  c_rs,
int  c_cs 
)
874{
875 int m_save = m;
876 int n_save = n;
877 dcomplex* a_save = a;
878 dcomplex* b_save = b;
879 dcomplex* c_save = c;
880 int a_rs_save = a_rs;
881 int a_cs_save = a_cs;
882 int b_rs_save = b_rs;
883 int b_cs_save = b_cs;
884 int c_rs_save = c_rs;
885 int c_cs_save = c_cs;
886 dcomplex zero = bl1_z0();
887 dcomplex one = bl1_z1();
893 int lda, inca;
894 int ldb, incb;
895 int ldc, incc;
896 int lda_conj, inca_conj;
897 int ldb_conj, incb_conj;
899 int m_gemm, n_gemm;
901 int a_was_copied;
902 int b_was_copied;
903
904 // Return early if possible.
905 if ( bl1_zero_dim3( m, k, n ) )
906 {
908 m,
909 n,
910 beta,
911 c, c_rs, c_cs );
912 return;
913 }
914
915 // If necessary, allocate, initialize, and use a temporary contiguous
916 // copy of each matrix rather than the original matrices.
918 m,
919 k,
921 &a, &a_rs, &a_cs );
922
924 k,
925 n,
927 &b, &b_rs, &b_cs );
928
930 n,
932 &c, &c_rs, &c_cs );
933
934 // Figure out whether A and/or B was copied to contiguous memory. This
935 // is used later to prevent redundant copying.
936 a_was_copied = ( a != a_save );
937 b_was_copied = ( b != b_save );
938
939 // These are used to track the original values of a and b prior to any
940 // operand swapping that might take place. This is necessary for proper
941 // freeing of memory when one is a temporary contiguous matrix.
942 a_unswap = a;
943 b_unswap = b;
944
945 // These are used to track the dimensions of the product of the
946 // A and B operands to the BLAS invocation of gemm. These differ
947 // from m and n when the operands need to be swapped.
948 m_gemm = m;
949 n_gemm = n;
950
951 // Initialize with values assuming column-major storage.
952 lda = a_cs;
953 inca = a_rs;
954 ldb = b_cs;
955 incb = b_rs;
956 ldc = c_cs;
957 incc = c_rs;
958
959 // Adjust the parameters based on the storage of each matrix.
960 if ( bl1_is_col_storage( c_rs, c_cs ) )
961 {
962 if ( bl1_is_col_storage( a_rs, a_cs ) )
963 {
964 if ( bl1_is_col_storage( b_rs, b_cs ) )
965 {
966 // requested operation: C_c += tr( A_c ) * tr( B_c )
967 // effective operation: C_c += tr( A_c ) * tr( B_c )
968 }
969 else // if ( bl1_is_row_storage( b_rs, b_cs ) )
970 {
971
972 // requested operation: C_c += tr( A_c ) * tr( B_r )
973 // effective operation: C_c += tr( A_c ) * tr( B_c )^T
975
977 }
978 }
979 else // if ( bl1_is_row_storage( a_rs, a_cs ) )
980 {
981 if ( bl1_is_col_storage( b_rs, b_cs ) )
982 {
983 // requested operation: C_c += tr( A_r ) * tr( B_c )
984 // effective operation: C_c += tr( A_r )^T * tr( B_c )
986
988 }
989 else // if ( bl1_is_row_storage( b_rs, b_cs ) )
990 {
991 // requested operation: C_c += tr( A_r ) * tr( B_r )
992 // effective operation: C_c += ( tr( B_c ) * tr( A_c ) )^T
995
1001
1004 }
1005 }
1006 }
1007 else // if ( bl1_is_row_storage( c_rs, c_cs ) )
1008 {
1009 if ( bl1_is_col_storage( a_rs, a_cs ) )
1010 {
1011 if ( bl1_is_col_storage( b_rs, b_cs ) )
1012 {
1013 // requested operation: C_r += tr( A_c ) * tr( B_c )
1014 // effective operation: C_c += ( tr( A_c ) * tr( B_c ) )^T
1016
1017 bl1_swap_ints( m, n );
1018
1020 }
1021 else // if ( bl1_is_row_storage( b_rs, b_cs ) )
1022 {
1023 // requested operation: C_r += tr( A_c ) * tr( B_r )
1024 // effective operation: C_c += tr( B_c ) * tr( A_c )^T
1027
1029
1030 bl1_swap_ints( m, n );
1034 bl1_swap_ints( lda, ldb );
1037 }
1038 }
1039 else // if ( bl1_is_row_storage( a_rs, a_cs ) )
1040 {
1041 if ( bl1_is_col_storage( b_rs, b_cs ) )
1042 {
1043 // requested operation: C_r += tr( A_r ) * tr( B_c )
1044 // effective operation: C_c += tr( B_c )^T * tr( A_c )
1047
1049
1050 bl1_swap_ints( m, n );
1054 bl1_swap_ints( lda, ldb );
1057 }
1058 else // if ( bl1_is_row_storage( b_rs, b_cs ) )
1059 {
1060 // requested operation: C_r += tr( A_r ) * tr( B_r )
1061 // effective operation: C_c += tr( B_c ) * tr( A_c )
1065
1066 bl1_swap_ints( m, n );
1070 bl1_swap_ints( lda, ldb );
1073 }
1074 }
1075 }
1076
1077 // We need a temporary matrix for the case where A is conjugated.
1078 a_conj = a;
1079 lda_conj = lda;
1080 inca_conj = inca;
1081
1082 // If transa indicates conjugate-no-transpose and A was not already
1083 // copied, then copy and conjugate it to a temporary matrix. Otherwise,
1084 // if transa indicates conjugate-no-transpose and A was already copied,
1085 // just conjugate it.
1087 {
1088 a_conj = bl1_zallocm( m_gemm, k );
1089 lda_conj = m_gemm;
1090 inca_conj = 1;
1091
1093 m_gemm,
1094 k,
1095 a, inca, lda,
1097 }
1098 else if ( bl1_is_conjnotrans( transa ) && a_was_copied )
1099 {
1101 k,
1103 }
1104
1105 // We need a temporary matrix for the case where B is conjugated.
1106 b_conj = b;
1107 ldb_conj = ldb;
1108 incb_conj = incb;
1109
1110 // If transb indicates conjugate-no-transpose and B was not already
1111 // copied, then copy and conjugate it to a temporary matrix. Otherwise,
1112 // if transb indicates conjugate-no-transpose and B was already copied,
1113 // just conjugate it.
1115 {
1116 b_conj = bl1_zallocm( k, n_gemm );
1117 ldb_conj = k;
1118 incb_conj = 1;
1119
1121 k,
1122 n_gemm,
1123 b, incb, ldb,
1125 }
1126 else if ( bl1_is_conjnotrans( transb ) && b_was_copied )
1127 {
1128 bl1_zconjm( k,
1129 n_gemm,
1131 }
1132
1133 // There are two cases where we need to perform the gemm and then axpy
1134 // the result into C with a transposition. We handle those cases here.
1135 if ( gemm_needs_axpyt )
1136 {
1137 // We need a temporary matrix for holding C^T. Notice that m and n
1138 // represent the dimensions of C, while m_gemm and n_gemm are the
1139 // dimensions of the actual product op(A)*op(B), which may be n-by-m
1140 // since the operands may have been swapped.
1142 ldc_trans = m_gemm;
1143 incc_trans = 1;
1144
1145 // Compute tr( A ) * tr( B ), where A and B may have been swapped
1146 // to reference the other, and store the result in C_trans.
1148 transb,
1149 m_gemm,
1150 n_gemm,
1151 k,
1152 alpha,
1155 &zero,
1156 c_trans, ldc_trans );
1157
1158 // Scale C by beta.
1160 m,
1161 n,
1162 beta,
1163 c, incc, ldc );
1164
1165 // And finally, accumulate the matrix product in C_trans into C
1166 // with a transpose.
1168 m,
1169 n,
1170 &one,
1172 c, incc, ldc );
1173
1174 // Free the temporary matrix for C.
1175 bl1_zfree( c_trans );
1176 }
1177 else // no extra axpyt step needed
1178 {
1180 transb,
1181 m_gemm,
1182 n_gemm,
1183 k,
1184 alpha,
1187 beta,
1188 c, ldc );
1189 }
1190
1192 bl1_zfree( a_conj );
1193
1195 bl1_zfree( b_conj );
1196
1197 // Free any temporary contiguous matrices, copying the result back to
1198 // the original matrix.
1200 &a_unswap, &a_rs, &a_cs );
1201
1203 &b_unswap, &b_rs, &b_cs );
1204
1206 n_save,
1208 &c, &c_rs, &c_cs );
1209}
void bl1_zaxpymt(trans1_t trans, int m, int n, dcomplex *alpha, dcomplex *a, int a_rs, int a_cs, dcomplex *b, int b_rs, int b_cs)
Definition bl1_axpymt.c:248
void bl1_zconjm(int m, int n, dcomplex *a, int a_rs, int a_cs)
Definition bl1_conjm.c:72
void bl1_zcopymt(trans1_t trans, int m, int n, dcomplex *a, int a_rs, int a_cs, dcomplex *b, int b_rs, int b_cs)
Definition bl1_copymt.c:286
void bl1_zgemm_blas(trans1_t transa, trans1_t transb, int m, int n, int k, dcomplex *alpha, dcomplex *a, int lda, dcomplex *b, int ldb, dcomplex *beta, dcomplex *c, int ldc)
Definition bl1_gemm.c:1336
void bl1_zscalm(conj1_t conj, int m, int n, dcomplex *alpha, dcomplex *a, int a_rs, int a_cs)
Definition bl1_scalm.c:273
dcomplex bl1_z0(void)
Definition bl1_constants.c:133
void bl1_zcreate_contigmt(trans1_t trans_dims, int m, int n, dcomplex *a_save, int a_rs_save, int a_cs_save, dcomplex **a, int *a_rs, int *a_cs)
Definition bl1_create_contigmt.c:127
dcomplex bl1_z1(void)
Definition bl1_constants.c:69
void bl1_zcreate_contigm(int m, int n, dcomplex *a_save, int a_rs_save, int a_cs_save, dcomplex **a, int *a_rs, int *a_cs)
Definition bl1_create_contigm.c:115
dcomplex * bl1_zallocm(unsigned int m, unsigned int n)
Definition bl1_allocm.c:45
void bl1_zfree(dcomplex *p)
Definition bl1_free.c:45
void bl1_zfree_contigm(dcomplex *a_save, int a_rs_save, int a_cs_save, dcomplex **a, int *a_rs, int *a_cs)
Definition bl1_free_contigm.c:61
void bl1_zfree_saved_contigm(int m, int n, dcomplex *a_save, int a_rs_save, int a_cs_save, dcomplex **a, int *a_rs, int *a_cs)
Definition bl1_free_saved_contigm.c:82
Definition blis_type_defs.h:138

References bl1_is_col_storage(), bl1_is_conjnotrans(), bl1_z0(), bl1_z1(), bl1_zallocm(), bl1_zaxpymt(), bl1_zconjm(), bl1_zcopymt(), bl1_zcreate_contigm(), bl1_zcreate_contigmt(), bl1_zero_dim3(), bl1_zfree(), bl1_zfree_contigm(), bl1_zfree_saved_contigm(), bl1_zgemm_blas(), bl1_zscalm(), BLIS1_CONJ_NO_TRANSPOSE, BLIS1_NO_CONJUGATE, and BLIS1_TRANSPOSE.

Referenced by FLA_Gemm_external().

◆ bl1_zgemm_blas()

void bl1_zgemm_blas ( trans1_t  transa,
trans1_t  transb,
int  m,
int  n,
int  k,
dcomplex alpha,
dcomplex a,
int  lda,
dcomplex b,
int  ldb,
dcomplex beta,
dcomplex c,
int  ldc 
)
1337{
1338#ifdef BLIS1_ENABLE_CBLAS_INTERFACES
1342
1345
1349 m,
1350 n,
1351 k,
1352 alpha,
1353 a, lda,
1354 b, ldb,
1355 beta,
1356 c, ldc );
1357#else
1358 char blas_transa;
1359 char blas_transb;
1360
1363
1365 &blas_transb,
1366 &m,
1367 &n,
1368 &k,
1369 alpha,
1370 a, &lda,
1371 b, &ldb,
1372 beta,
1373 c, &ldc );
1374#endif
1375}
void F77_zgemm(char *transa, char *transb, int *m, int *n, int *k, dcomplex *alpha, dcomplex *a, int *lda, dcomplex *b, int *ldb, dcomplex *beta, dcomplex *c, int *ldc)
void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const void *alpha, const void *A, const int lda, const void *B, const int ldb, const void *beta, void *C, const int ldc)

References bl1_param_map_to_netlib_trans(), cblas_zgemm(), CblasColMajor, and F77_zgemm().

Referenced by bl1_zgemm().