Commit 998511a6 by Thomas Koenig

re PR fortran/29550 (Optimize -fexternal-blas calls for conjg())

2018-09-18  Thomas Koenig  <tkoenig@gcc.gnu.org>

	PR fortran/29550
	* gfortran.h (gfc_expr): Add external_blas flag.
	* frontend-passes.c (matrix_case): Add case A2TB2T.
	(optimize_namespace): Handle flag_external_blas by
	calling call_external_blas.
	(get_array_inq_function): Add argument okind. If
	it is nonzero, use it as the kind of argument
	to be used.
	(inline_limit_check): Remove m_case argument, add
	limit argument instead.  Remove assert about m_case.
	Set the limit for inlining from the limit argument.
	(matmul_lhs_realloc): Handle case A2TB2T.
	(inline_matmul_assign): Handle inline limit for other cases with
	two rank-two matrices.  Remove no-op calls to inline_limit_check.
	(call_external_blas): New function.
	* trans-intrinsic.c (gfc_conv_intrinsic_funcall): Do not add
	argument to external BLAS if external_blas is already set.

2018-09-18  Thomas Koenig  <tkoenig@gcc.gnu.org>

	PR fortran/29550
	* gfortran.dg/inline_matmul_13.f90: Adjust count for
	_gfortran_matmul.
	* gfortran.dg/inline_matmul_16.f90: Likewise.
	* gfortran.dg/promotion_2.f90: Add -fblas-matmul-limit=1.  Scan
	for dgemm instead of dgemm_.  Add call to random_number to make
	standard conforming.
	* gfortran.dg/matmul_blas_1.f90: New test.
	* gfortran.dg/matmul_bounds_14.f: New test.
	* gfortran.dg/matmul_bounds_15.f: New test.
	* gfortran.dg/matmul_bounds_16.f: New test.
	* gfortran.dg/blas_gemm_routines.f: New test / additional file for
	preceding tests.

From-SVN: r264412
parent 5c470e0f
......@@ -53,6 +53,7 @@ static gfc_code * create_do_loop (gfc_expr *, gfc_expr *, gfc_expr *,
char *vname=NULL);
static gfc_expr* check_conjg_transpose_variable (gfc_expr *, bool *,
bool *);
static int call_external_blas (gfc_code **, int *, void *);
static bool has_dimen_vector_ref (gfc_expr *);
static int matmul_temp_args (gfc_code **, int *,void *data);
static int index_interchange (gfc_code **, int*, void *);
......@@ -131,7 +132,7 @@ static int var_num = 1;
/* What sort of matrix we are dealing with when inlining MATMUL. */
enum matrix_case { none=0, A2B2, A2B1, A1B2, A2B2T, A2TB2 };
enum matrix_case { none=0, A2B2, A2B1, A1B2, A2B2T, A2TB2, A2TB2T };
/* Keep track of the number of expressions we have inserted so far
using create_var. */
......@@ -1428,7 +1429,7 @@ optimize_namespace (gfc_namespace *ns)
gfc_code_walker (&ns->code, convert_elseif, dummy_expr_callback, NULL);
gfc_code_walker (&ns->code, cfe_code, cfe_expr_0, NULL);
gfc_code_walker (&ns->code, optimize_code, optimize_expr, NULL);
if (flag_inline_matmul_limit != 0)
if (flag_inline_matmul_limit != 0 || flag_external_blas)
{
bool found;
do
......@@ -1441,9 +1442,15 @@ optimize_namespace (gfc_namespace *ns)
gfc_code_walker (&ns->code, matmul_temp_args, dummy_expr_callback,
NULL);
gfc_code_walker (&ns->code, inline_matmul_assign, dummy_expr_callback,
NULL);
}
if (flag_external_blas)
gfc_code_walker (&ns->code, call_external_blas, dummy_expr_callback,
NULL);
if (flag_inline_matmul_limit != 0)
gfc_code_walker (&ns->code, inline_matmul_assign, dummy_expr_callback,
NULL);
}
if (flag_frontend_loop_interchange)
......@@ -2938,7 +2945,7 @@ matmul_temp_args (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED,
dim is zero-based. */
static gfc_expr *
get_array_inq_function (gfc_isym_id id, gfc_expr *e, int dim)
get_array_inq_function (gfc_isym_id id, gfc_expr *e, int dim, int okind = 0)
{
gfc_expr *fcn;
gfc_expr *dim_arg, *kind;
......@@ -2964,8 +2971,12 @@ get_array_inq_function (gfc_isym_id id, gfc_expr *e, int dim)
}
dim_arg = gfc_get_int_expr (gfc_default_integer_kind, &e->where, dim);
kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where,
gfc_index_integer_kind);
if (okind != 0)
kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where,
okind);
else
kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where,
gfc_index_integer_kind);
ec = gfc_copy_expr (e);
......@@ -3026,7 +3037,7 @@ get_operand (gfc_intrinsic_op op, gfc_expr *e1, gfc_expr *e2)
removed by DCE. Only called for rank-two matrices A and B. */
static gfc_code *
inline_limit_check (gfc_expr *a, gfc_expr *b, enum matrix_case m_case)
inline_limit_check (gfc_expr *a, gfc_expr *b, int limit)
{
gfc_expr *inline_limit;
gfc_code *if_1, *if_2, *else_2;
......@@ -3034,14 +3045,11 @@ inline_limit_check (gfc_expr *a, gfc_expr *b, enum matrix_case m_case)
gfc_typespec ts;
gfc_expr *cond;
gcc_assert (m_case == A2B2 || m_case == A2B2T || m_case == A2TB2);
/* Calculation is done in real to avoid integer overflow. */
inline_limit = gfc_get_constant_expr (BT_REAL, gfc_default_real_kind,
&a->where);
mpfr_set_si (inline_limit->value.real, flag_inline_matmul_limit,
GFC_RND_MODE);
mpfr_set_si (inline_limit->value.real, limit, GFC_RND_MODE);
mpfr_pow_ui (inline_limit->value.real, inline_limit->value.real, 3,
GFC_RND_MODE);
......@@ -3235,6 +3243,22 @@ matmul_lhs_realloc (gfc_expr *c, gfc_expr *a, gfc_expr *b,
get_array_inq_function (GFC_ISYM_SIZE, b, 2));
break;
case A2TB2T:
/* This can only happen for BLAS, we do not handle that case in
inline mamtul. */
ar->start[0] = get_array_inq_function (GFC_ISYM_SIZE, a, 2);
ar->start[1] = get_array_inq_function (GFC_ISYM_SIZE, b, 1);
ne1 = build_logical_expr (INTRINSIC_NE,
get_array_inq_function (GFC_ISYM_SIZE, c, 1),
get_array_inq_function (GFC_ISYM_SIZE, a, 2));
ne2 = build_logical_expr (INTRINSIC_NE,
get_array_inq_function (GFC_ISYM_SIZE, c, 2),
get_array_inq_function (GFC_ISYM_SIZE, b, 1));
cond = build_logical_expr (INTRINSIC_OR, ne1, ne2);
break;
default:
gcc_unreachable();
......@@ -3946,9 +3970,11 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
/* Take care of the inline flag. If the limit check evaluates to a
constant, dead code elimination will eliminate the unneeded branch. */
if (m_case == A2B2 && flag_inline_matmul_limit > 0)
if (flag_inline_matmul_limit > 0 && matrix_a->rank == 2
&& matrix_b->rank == 2)
{
if_limit = inline_limit_check (matrix_a, matrix_b, m_case);
if_limit = inline_limit_check (matrix_a, matrix_b,
flag_inline_matmul_limit);
/* Insert the original statement into the else branch. */
if_limit->block->block->next = co;
......@@ -4118,7 +4144,6 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
switch (m_case)
{
case A2B2:
inline_limit_check (matrix_a, matrix_b, m_case);
u1 = get_size_m1 (matrix_b, 2);
u2 = get_size_m1 (matrix_a, 2);
......@@ -4151,7 +4176,6 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
break;
case A2B2T:
inline_limit_check (matrix_a, matrix_b, m_case);
u1 = get_size_m1 (matrix_b, 1);
u2 = get_size_m1 (matrix_a, 2);
......@@ -4184,7 +4208,6 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
break;
case A2TB2:
inline_limit_check (matrix_a, matrix_b, m_case);
u1 = get_size_m1 (matrix_a, 2);
u2 = get_size_m1 (matrix_b, 2);
......@@ -4311,6 +4334,405 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
return 0;
}
/* Change matmul function calls in the form of
c = matmul(a,b)
to the corresponding call to a BLAS routine, if applicable. */
static int
call_external_blas (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED,
void *data ATTRIBUTE_UNUSED)
{
gfc_code *co, *co_next;
gfc_expr *expr1, *expr2;
gfc_expr *matrix_a, *matrix_b;
gfc_code *if_limit = NULL;
gfc_actual_arglist *a, *b;
bool conjg_a, conjg_b, transpose_a, transpose_b;
gfc_code *call;
const char *blas_name;
const char *transa, *transb;
gfc_expr *c1, *c2, *b1;
gfc_actual_arglist *actual, *next;
bt type;
int kind;
enum matrix_case m_case;
bool realloc_c;
gfc_code **next_code_point;
/* Many of the tests for inline matmul also apply here. */
co = *c;
if (co->op != EXEC_ASSIGN)
return 0;
if (in_where || in_assoc_list)
return 0;
/* The BLOCKS generated for the temporary variables and FORALL don't
mix. */
if (forall_level > 0)
return 0;
/* For now don't do anything in OpenMP workshare, it confuses
its translation, which expects only the allowed statements in there. */
if (in_omp_workshare)
return 0;
expr1 = co->expr1;
expr2 = co->expr2;
if (expr2->expr_type != EXPR_FUNCTION
|| expr2->value.function.isym == NULL
|| expr2->value.function.isym->id != GFC_ISYM_MATMUL)
return 0;
type = expr2->ts.type;
kind = expr2->ts.kind;
/* Guard against recursion. */
if (expr2->external_blas)
return 0;
if (type != expr1->ts.type || kind != expr1->ts.kind)
return 0;
if (type == BT_REAL)
{
if (kind == 4)
blas_name = "sgemm";
else if (kind == 8)
blas_name = "dgemm";
else
return 0;
}
else if (type == BT_COMPLEX)
{
if (kind == 4)
blas_name = "cgemm";
else if (kind == 8)
blas_name = "zgemm";
else
return 0;
}
else
return 0;
a = expr2->value.function.actual;
if (a->expr->rank != 2)
return 0;
b = a->next;
if (b->expr->rank != 2)
return 0;
matrix_a = check_conjg_transpose_variable (a->expr, &conjg_a, &transpose_a);
if (matrix_a == NULL)
return 0;
if (transpose_a)
{
if (conjg_a)
transa = "C";
else
transa = "T";
}
else
transa = "N";
matrix_b = check_conjg_transpose_variable (b->expr, &conjg_b, &transpose_b);
if (matrix_b == NULL)
return 0;
if (transpose_b)
{
if (conjg_b)
transb = "C";
else
transb = "T";
}
else
transb = "N";
if (transpose_a)
{
if (transpose_b)
m_case = A2TB2T;
else
m_case = A2TB2;
}
else
{
if (transpose_b)
m_case = A2B2T;
else
m_case = A2B2;
}
current_code = c;
inserted_block = NULL;
changed_statement = NULL;
expr2->external_blas = 1;
/* We do not handle data dependencies yet. */
if (gfc_check_dependency (expr1, matrix_a, true)
|| gfc_check_dependency (expr1, matrix_b, true))
return 0;
/* Generate the if statement and hang it into the tree. */
if_limit = inline_limit_check (matrix_a, matrix_b, flag_blas_matmul_limit);
co_next = co->next;
(*current_code) = if_limit;
co->next = NULL;
if_limit->block->next = co;
call = XCNEW (gfc_code);
call->loc = co->loc;
/* Bounds checking - a bit simpler than for inlining since we only
have to take care of two-dimensional arrays here. */
realloc_c = flag_realloc_lhs && gfc_is_reallocatable_lhs (expr1);
next_code_point = &(if_limit->block->block->next);
if (gfc_option.rtcheck & GFC_RTCHECK_BOUNDS)
{
gfc_code *test;
// gfc_expr *a2, *b1, *c1, *c2, *a1, *b2;
gfc_expr *c1, *a1, *c2, *b2, *a2;
switch (m_case)
{
case A2B2:
b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
test = runtime_error_ne (b1, a2, B_ERROR(1));
*next_code_point = test;
next_code_point = &test->next;
if (!realloc_c)
{
c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
test = runtime_error_ne (c1, a1, C_ERROR(1));
*next_code_point = test;
next_code_point = &test->next;
c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
test = runtime_error_ne (c2, b2, C_ERROR(2));
*next_code_point = test;
next_code_point = &test->next;
}
break;
case A2B2T:
b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
/* matrix_b is transposed, hence dimension 1 for the error message. */
test = runtime_error_ne (b2, a2, B_ERROR(1));
*next_code_point = test;
next_code_point = &test->next;
if (!realloc_c)
{
c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
test = runtime_error_ne (c1, a1, C_ERROR(1));
*next_code_point = test;
next_code_point = &test->next;
c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
test = runtime_error_ne (c2, b1, C_ERROR(2));
*next_code_point = test;
next_code_point = &test->next;
}
break;
case A2TB2:
b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
test = runtime_error_ne (b1, a1, B_ERROR(1));
*next_code_point = test;
next_code_point = &test->next;
if (!realloc_c)
{
c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
test = runtime_error_ne (c1, a2, C_ERROR(1));
*next_code_point = test;
next_code_point = &test->next;
c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
test = runtime_error_ne (c2, b2, C_ERROR(2));
*next_code_point = test;
next_code_point = &test->next;
}
break;
case A2TB2T:
b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
test = runtime_error_ne (b2, a1, B_ERROR(1));
*next_code_point = test;
next_code_point = &test->next;
if (!realloc_c)
{
c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
test = runtime_error_ne (c1, a2, C_ERROR(1));
*next_code_point = test;
next_code_point = &test->next;
c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
test = runtime_error_ne (c2, b1, C_ERROR(2));
*next_code_point = test;
next_code_point = &test->next;
}
break;
default:
gcc_unreachable ();
}
}
/* Handle the reallocation, if needed. */
if (realloc_c)
{
gfc_code *lhs_alloc;
lhs_alloc = matmul_lhs_realloc (expr1, matrix_a, matrix_b, m_case);
*next_code_point = lhs_alloc;
next_code_point = &lhs_alloc->next;
}
*next_code_point = call;
if_limit->next = co_next;
/* Set up the BLAS call. */
call->op = EXEC_CALL;
gfc_get_sym_tree (blas_name, current_ns, &(call->symtree), true);
call->symtree->n.sym->attr.subroutine = 1;
call->symtree->n.sym->attr.procedure = 1;
call->symtree->n.sym->attr.flavor = FL_PROCEDURE;
call->resolved_sym = call->symtree->n.sym;
/* Argument TRANSA. */
next = gfc_get_actual_arglist ();
next->expr = gfc_get_character_expr (gfc_default_character_kind, &co->loc,
transa, 1);
call->ext.actual = next;
/* Argument TRANSB. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = gfc_get_character_expr (gfc_default_character_kind, &co->loc,
transb, 1);
actual->next = next;
c1 = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (a->expr), 1,
gfc_integer_4_kind);
c2 = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (b->expr), 2,
gfc_integer_4_kind);
b1 = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (b->expr), 1,
gfc_integer_4_kind);
/* Argument M. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = c1;
actual->next = next;
/* Argument N. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = c2;
actual->next = next;
/* Argument K. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = b1;
actual->next = next;
/* Argument ALPHA - set to one. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = gfc_get_constant_expr (type, kind, &co->loc);
if (type == BT_REAL)
mpfr_set_ui (next->expr->value.real, 1, GFC_RND_MODE);
else
mpc_set_ui (next->expr->value.complex, 1, GFC_MPC_RND_MODE);
actual->next = next;
/* Argument A. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = gfc_copy_expr (matrix_a);
actual->next = next;
/* Argument LDA. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (matrix_a),
1, gfc_integer_4_kind);
actual->next = next;
/* Argument B. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = gfc_copy_expr (matrix_b);
actual->next = next;
/* Argument LDB. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (matrix_b),
1, gfc_integer_4_kind);
actual->next = next;
/* Argument BETA - set to zero. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = gfc_get_constant_expr (type, kind, &co->loc);
if (type == BT_REAL)
mpfr_set_ui (next->expr->value.real, 0, GFC_RND_MODE);
else
mpc_set_ui (next->expr->value.complex, 0, GFC_MPC_RND_MODE);
actual->next = next;
/* Argument C. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = gfc_copy_expr (expr1);
actual->next = next;
/* Argument LDC. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (expr1),
1, gfc_integer_4_kind);
actual->next = next;
return 0;
}
/* Code for index interchange for loops which are grouped together in DO
CONCURRENT or FORALL statements. This is currently only applied if the
......
......@@ -2143,6 +2143,11 @@ typedef struct gfc_expr
unsigned int no_bounds_check : 1;
/* Set this if a matmul expression has already been evaluated for conversion
to a BLAS call. */
unsigned int external_blas : 1;
/* If an expression comes from a Hollerith constant or compile-time
evaluation of a transfer statement, it may have a prescribed target-
memory representation, and these cannot always be backformed from
......
......@@ -4055,6 +4055,7 @@ gfc_conv_intrinsic_funcall (gfc_se * se, gfc_expr * expr)
to be able to call the BLAS ?gemm functions if required and possible. */
append_args = NULL;
if (expr->value.function.isym->id == GFC_ISYM_MATMUL
&& !expr->external_blas
&& sym->ts.type != BT_LOGICAL)
{
tree cint = gfc_get_int_type (gfc_c_int_kind);
......
! { dg-do compile }
! { dg-options "-std=legacy" }
*> \brief \b CGEMM
*
* =========== DOCUMENTATION ===========
*
* Online html documentation available at
* http://www.netlib.org/lapack/explore-html/
*
* Definition:
* ===========
*
* SUBROUTINE CGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
*
* .. Scalar Arguments ..
* COMPLEX ALPHA,BETA
* INTEGER K,LDA,LDB,LDC,M,N
* CHARACTER TRANSA,TRANSB
* ..
* .. Array Arguments ..
* COMPLEX A(LDA,*),B(LDB,*),C(LDC,*)
* ..
*
*
*> \par Purpose:
* =============
*>
*> \verbatim
*>
*> CGEMM performs one of the matrix-matrix operations
*>
*> C := alpha*op( A )*op( B ) + beta*C,
*>
*> where op( X ) is one of
*>
*> op( X ) = X or op( X ) = X**T or op( X ) = X**H,
*>
*> alpha and beta are scalars, and A, B and C are matrices, with op( A )
*> an m by k matrix, op( B ) a k by n matrix and C an m by n matrix.
*> \endverbatim
*
* Arguments:
* ==========
*
*> \param[in] TRANSA
*> \verbatim
*> TRANSA is CHARACTER*1
*> On entry, TRANSA specifies the form of op( A ) to be used in
*> the matrix multiplication as follows:
*>
*> TRANSA = 'N' or 'n', op( A ) = A.
*>
*> TRANSA = 'T' or 't', op( A ) = A**T.
*>
*> TRANSA = 'C' or 'c', op( A ) = A**H.
*> \endverbatim
*>
*> \param[in] TRANSB
*> \verbatim
*> TRANSB is CHARACTER*1
*> On entry, TRANSB specifies the form of op( B ) to be used in
*> the matrix multiplication as follows:
*>
*> TRANSB = 'N' or 'n', op( B ) = B.
*>
*> TRANSB = 'T' or 't', op( B ) = B**T.
*>
*> TRANSB = 'C' or 'c', op( B ) = B**H.
*> \endverbatim
*>
*> \param[in] M
*> \verbatim
*> M is INTEGER
*> On entry, M specifies the number of rows of the matrix
*> op( A ) and of the matrix C. M must be at least zero.
*> \endverbatim
*>
*> \param[in] N
*> \verbatim
*> N is INTEGER
*> On entry, N specifies the number of columns of the matrix
*> op( B ) and the number of columns of the matrix C. N must be
*> at least zero.
*> \endverbatim
*>
*> \param[in] K
*> \verbatim
*> K is INTEGER
*> On entry, K specifies the number of columns of the matrix
*> op( A ) and the number of rows of the matrix op( B ). K must
*> be at least zero.
*> \endverbatim
*>
*> \param[in] ALPHA
*> \verbatim
*> ALPHA is COMPLEX
*> On entry, ALPHA specifies the scalar alpha.
*> \endverbatim
*>
*> \param[in] A
*> \verbatim
*> A is COMPLEX array, dimension ( LDA, ka ), where ka is
*> k when TRANSA = 'N' or 'n', and is m otherwise.
*> Before entry with TRANSA = 'N' or 'n', the leading m by k
*> part of the array A must contain the matrix A, otherwise
*> the leading k by m part of the array A must contain the
*> matrix A.
*> \endverbatim
*>
*> \param[in] LDA
*> \verbatim
*> LDA is INTEGER
*> On entry, LDA specifies the first dimension of A as declared
*> in the calling (sub) program. When TRANSA = 'N' or 'n' then
*> LDA must be at least max( 1, m ), otherwise LDA must be at
*> least max( 1, k ).
*> \endverbatim
*>
*> \param[in] B
*> \verbatim
*> B is COMPLEX array, dimension ( LDB, kb ), where kb is
*> n when TRANSB = 'N' or 'n', and is k otherwise.
*> Before entry with TRANSB = 'N' or 'n', the leading k by n
*> part of the array B must contain the matrix B, otherwise
*> the leading n by k part of the array B must contain the
*> matrix B.
*> \endverbatim
*>
*> \param[in] LDB
*> \verbatim
*> LDB is INTEGER
*> On entry, LDB specifies the first dimension of B as declared
*> in the calling (sub) program. When TRANSB = 'N' or 'n' then
*> LDB must be at least max( 1, k ), otherwise LDB must be at
*> least max( 1, n ).
*> \endverbatim
*>
*> \param[in] BETA
*> \verbatim
*> BETA is COMPLEX
*> On entry, BETA specifies the scalar beta. When BETA is
*> supplied as zero then C need not be set on input.
*> \endverbatim
*>
*> \param[in,out] C
*> \verbatim
*> C is COMPLEX array, dimension ( LDC, N )
*> Before entry, the leading m by n part of the array C must
*> contain the matrix C, except when beta is zero, in which
*> case C need not be set on entry.
*> On exit, the array C is overwritten by the m by n matrix
*> ( alpha*op( A )*op( B ) + beta*C ).
*> \endverbatim
*>
*> \param[in] LDC
*> \verbatim
*> LDC is INTEGER
*> On entry, LDC specifies the first dimension of C as declared
*> in the calling (sub) program. LDC must be at least
*> max( 1, m ).
*> \endverbatim
*
* Authors:
* ========
*
*> \author Univ. of Tennessee
*> \author Univ. of California Berkeley
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \date December 2016
*
*> \ingroup complex_blas_level3
*
*> \par Further Details:
* =====================
*>
*> \verbatim
*>
*> Level 3 Blas routine.
*>
*> -- Written on 8-February-1989.
*> Jack Dongarra, Argonne National Laboratory.
*> Iain Duff, AERE Harwell.
*> Jeremy Du Croz, Numerical Algorithms Group Ltd.
*> Sven Hammarling, Numerical Algorithms Group Ltd.
*> \endverbatim
*>
* =====================================================================
SUBROUTINE CGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
*
* -- Reference BLAS level3 routine (version 3.7.0) --
* -- Reference BLAS is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
* December 2016
*
* .. Scalar Arguments ..
COMPLEX ALPHA,BETA
INTEGER K,LDA,LDB,LDC,M,N
CHARACTER TRANSA,TRANSB
* ..
* .. Array Arguments ..
COMPLEX A(LDA,*),B(LDB,*),C(LDC,*)
* ..
*
* =====================================================================
*
* .. External Functions ..
LOGICAL LSAME
EXTERNAL LSAME
* ..
* .. External Subroutines ..
EXTERNAL XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC CONJG,MAX
* ..
* .. Local Scalars ..
COMPLEX TEMP
INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB
LOGICAL CONJA,CONJB,NOTA,NOTB
* ..
* .. Parameters ..
COMPLEX ONE
PARAMETER (ONE= (1.0E+0,0.0E+0))
COMPLEX ZERO
PARAMETER (ZERO= (0.0E+0,0.0E+0))
* ..
*
* Set NOTA and NOTB as true if A and B respectively are not
* conjugated or transposed, set CONJA and CONJB as true if A and
* B respectively are to be transposed but not conjugated and set
* NROWA, NCOLA and NROWB as the number of rows and columns of A
* and the number of rows of B respectively.
*
NOTA = LSAME(TRANSA,'N')
NOTB = LSAME(TRANSB,'N')
CONJA = LSAME(TRANSA,'C')
CONJB = LSAME(TRANSB,'C')
IF (NOTA) THEN
NROWA = M
NCOLA = K
ELSE
NROWA = K
NCOLA = M
END IF
IF (NOTB) THEN
NROWB = K
ELSE
NROWB = N
END IF
*
* Test the input parameters.
*
INFO = 0
IF ((.NOT.NOTA) .AND. (.NOT.CONJA) .AND.
+ (.NOT.LSAME(TRANSA,'T'))) THEN
INFO = 1
ELSE IF ((.NOT.NOTB) .AND. (.NOT.CONJB) .AND.
+ (.NOT.LSAME(TRANSB,'T'))) THEN
INFO = 2
ELSE IF (M.LT.0) THEN
INFO = 3
ELSE IF (N.LT.0) THEN
INFO = 4
ELSE IF (K.LT.0) THEN
INFO = 5
ELSE IF (LDA.LT.MAX(1,NROWA)) THEN
INFO = 8
ELSE IF (LDB.LT.MAX(1,NROWB)) THEN
INFO = 10
ELSE IF (LDC.LT.MAX(1,M)) THEN
INFO = 13
END IF
IF (INFO.NE.0) THEN
CALL XERBLA('CGEMM ',INFO)
RETURN
END IF
*
* Quick return if possible.
*
IF ((M.EQ.0) .OR. (N.EQ.0) .OR.
+ (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN
*
* And when alpha.eq.zero.
*
IF (ALPHA.EQ.ZERO) THEN
IF (BETA.EQ.ZERO) THEN
DO 20 J = 1,N
DO 10 I = 1,M
C(I,J) = ZERO
10 CONTINUE
20 CONTINUE
ELSE
DO 40 J = 1,N
DO 30 I = 1,M
C(I,J) = BETA*C(I,J)
30 CONTINUE
40 CONTINUE
END IF
RETURN
END IF
*
* Start the operations.
*
IF (NOTB) THEN
IF (NOTA) THEN
*
* Form C := alpha*A*B + beta*C.
*
DO 90 J = 1,N
IF (BETA.EQ.ZERO) THEN
DO 50 I = 1,M
C(I,J) = ZERO
50 CONTINUE
ELSE IF (BETA.NE.ONE) THEN
DO 60 I = 1,M
C(I,J) = BETA*C(I,J)
60 CONTINUE
END IF
DO 80 L = 1,K
TEMP = ALPHA*B(L,J)
DO 70 I = 1,M
C(I,J) = C(I,J) + TEMP*A(I,L)
70 CONTINUE
80 CONTINUE
90 CONTINUE
ELSE IF (CONJA) THEN
*
* Form C := alpha*A**H*B + beta*C.
*
DO 120 J = 1,N
DO 110 I = 1,M
TEMP = ZERO
DO 100 L = 1,K
TEMP = TEMP + CONJG(A(L,I))*B(L,J)
100 CONTINUE
IF (BETA.EQ.ZERO) THEN
C(I,J) = ALPHA*TEMP
ELSE
C(I,J) = ALPHA*TEMP + BETA*C(I,J)
END IF
110 CONTINUE
120 CONTINUE
ELSE
*
* Form C := alpha*A**T*B + beta*C
*
DO 150 J = 1,N
DO 140 I = 1,M
TEMP = ZERO
DO 130 L = 1,K
TEMP = TEMP + A(L,I)*B(L,J)
130 CONTINUE
IF (BETA.EQ.ZERO) THEN
C(I,J) = ALPHA*TEMP
ELSE
C(I,J) = ALPHA*TEMP + BETA*C(I,J)
END IF
140 CONTINUE
150 CONTINUE
END IF
ELSE IF (NOTA) THEN
IF (CONJB) THEN
*
* Form C := alpha*A*B**H + beta*C.
*
DO 200 J = 1,N
IF (BETA.EQ.ZERO) THEN
DO 160 I = 1,M
C(I,J) = ZERO
160 CONTINUE
ELSE IF (BETA.NE.ONE) THEN
DO 170 I = 1,M
C(I,J) = BETA*C(I,J)
170 CONTINUE
END IF
DO 190 L = 1,K
TEMP = ALPHA*CONJG(B(J,L))
DO 180 I = 1,M
C(I,J) = C(I,J) + TEMP*A(I,L)
180 CONTINUE
190 CONTINUE
200 CONTINUE
ELSE
*
* Form C := alpha*A*B**T + beta*C
*
DO 250 J = 1,N
IF (BETA.EQ.ZERO) THEN
DO 210 I = 1,M
C(I,J) = ZERO
210 CONTINUE
ELSE IF (BETA.NE.ONE) THEN
DO 220 I = 1,M
C(I,J) = BETA*C(I,J)
220 CONTINUE
END IF
DO 240 L = 1,K
TEMP = ALPHA*B(J,L)
DO 230 I = 1,M
C(I,J) = C(I,J) + TEMP*A(I,L)
230 CONTINUE
240 CONTINUE
250 CONTINUE
END IF
ELSE IF (CONJA) THEN
IF (CONJB) THEN
*
* Form C := alpha*A**H*B**H + beta*C.
*
DO 280 J = 1,N
DO 270 I = 1,M
TEMP = ZERO
DO 260 L = 1,K
TEMP = TEMP + CONJG(A(L,I))*CONJG(B(J,L))
260 CONTINUE
IF (BETA.EQ.ZERO) THEN
C(I,J) = ALPHA*TEMP
ELSE
C(I,J) = ALPHA*TEMP + BETA*C(I,J)
END IF
270 CONTINUE
280 CONTINUE
ELSE
*
* Form C := alpha*A**H*B**T + beta*C
*
DO 310 J = 1,N
DO 300 I = 1,M
TEMP = ZERO
DO 290 L = 1,K
TEMP = TEMP + CONJG(A(L,I))*B(J,L)
290 CONTINUE
IF (BETA.EQ.ZERO) THEN
C(I,J) = ALPHA*TEMP
ELSE
C(I,J) = ALPHA*TEMP + BETA*C(I,J)
END IF
300 CONTINUE
310 CONTINUE
END IF
ELSE
IF (CONJB) THEN
*
* Form C := alpha*A**T*B**H + beta*C
*
DO 340 J = 1,N
DO 330 I = 1,M
TEMP = ZERO
DO 320 L = 1,K
TEMP = TEMP + A(L,I)*CONJG(B(J,L))
320 CONTINUE
IF (BETA.EQ.ZERO) THEN
C(I,J) = ALPHA*TEMP
ELSE
C(I,J) = ALPHA*TEMP + BETA*C(I,J)
END IF
330 CONTINUE
340 CONTINUE
ELSE
*
* Form C := alpha*A**T*B**T + beta*C
*
DO 370 J = 1,N
DO 360 I = 1,M
TEMP = ZERO
DO 350 L = 1,K
TEMP = TEMP + A(L,I)*B(J,L)
350 CONTINUE
IF (BETA.EQ.ZERO) THEN
C(I,J) = ALPHA*TEMP
ELSE
C(I,J) = ALPHA*TEMP + BETA*C(I,J)
END IF
360 CONTINUE
370 CONTINUE
END IF
END IF
*
RETURN
*
* End of CGEMM .
*
END
*> \brief \b LSAME
*
* =========== DOCUMENTATION ===========
*
* Online html documentation available at
* http://www.netlib.org/lapack/explore-html/
*
* Definition:
* ===========
*
* LOGICAL FUNCTION LSAME(CA,CB)
*
* .. Scalar Arguments ..
* CHARACTER CA,CB
* ..
*
*
*> \par Purpose:
* =============
*>
*> \verbatim
*>
*> LSAME returns .TRUE. if CA is the same letter as CB regardless of
*> case.
*> \endverbatim
*
* Arguments:
* ==========
*
*> \param[in] CA
*> \verbatim
*> CA is CHARACTER*1
*> \endverbatim
*>
*> \param[in] CB
*> \verbatim
*> CB is CHARACTER*1
*> CA and CB specify the single characters to be compared.
*> \endverbatim
*
* Authors:
* ========
*
*> \author Univ. of Tennessee
*> \author Univ. of California Berkeley
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \date December 2016
*
*> \ingroup aux_blas
*
* =====================================================================
LOGICAL FUNCTION LSAME(CA,CB)
*
* -- Reference BLAS level1 routine (version 3.1) --
* -- Reference BLAS is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
* December 2016
*
* .. Scalar Arguments ..
CHARACTER CA,CB
* ..
*
* =====================================================================
*
* .. Intrinsic Functions ..
INTRINSIC ICHAR
* ..
* .. Local Scalars ..
INTEGER INTA,INTB,ZCODE
* ..
*
* Test if the characters are equal
*
LSAME = CA .EQ. CB
IF (LSAME) RETURN
*
* Now test for equivalence if both characters are alphabetic.
*
ZCODE = ICHAR('Z')
*
* Use 'Z' rather than 'A' so that ASCII can be detected on Prime
* machines, on which ICHAR returns a value with bit 8 set.
* ICHAR('A') on Prime machines returns 193 which is the same as
* ICHAR('A') on an EBCDIC machine.
*
INTA = ICHAR(CA)
INTB = ICHAR(CB)
*
IF (ZCODE.EQ.90 .OR. ZCODE.EQ.122) THEN
*
* ASCII is assumed - ZCODE is the ASCII code of either lower or
* upper case 'Z'.
*
IF (INTA.GE.97 .AND. INTA.LE.122) INTA = INTA - 32
IF (INTB.GE.97 .AND. INTB.LE.122) INTB = INTB - 32
*
ELSE IF (ZCODE.EQ.233 .OR. ZCODE.EQ.169) THEN
*
* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or
* upper case 'Z'.
*
IF (INTA.GE.129 .AND. INTA.LE.137 .OR.
+ INTA.GE.145 .AND. INTA.LE.153 .OR.
+ INTA.GE.162 .AND. INTA.LE.169) INTA = INTA + 64
IF (INTB.GE.129 .AND. INTB.LE.137 .OR.
+ INTB.GE.145 .AND. INTB.LE.153 .OR.
+ INTB.GE.162 .AND. INTB.LE.169) INTB = INTB + 64
*
ELSE IF (ZCODE.EQ.218 .OR. ZCODE.EQ.250) THEN
*
* ASCII is assumed, on Prime machines - ZCODE is the ASCII code
* plus 128 of either lower or upper case 'Z'.
*
IF (INTA.GE.225 .AND. INTA.LE.250) INTA = INTA - 32
IF (INTB.GE.225 .AND. INTB.LE.250) INTB = INTB - 32
END IF
LSAME = INTA .EQ. INTB
*
* RETURN
*
* End of LSAME
*
END
*> \brief \b XERBLA
*
* =========== DOCUMENTATION ===========
*
* Online html documentation available at
* http://www.netlib.org/lapack/explore-html/
*
* Definition:
* ===========
*
* SUBROUTINE XERBLA( SRNAME, INFO )
*
* .. Scalar Arguments ..
* CHARACTER*(*) SRNAME
* INTEGER INFO
* ..
*
*
*> \par Purpose:
* =============
*>
*> \verbatim
*>
*> XERBLA is an error handler for the LAPACK routines.
*> It is called by an LAPACK routine if an input parameter has an
*> invalid value. A message is printed and execution stops.
*>
*> Installers may consider modifying the STOP statement in order to
*> call system-specific exception-handling facilities.
*> \endverbatim
*
* Arguments:
* ==========
*
*> \param[in] SRNAME
*> \verbatim
*> SRNAME is CHARACTER*(*)
*> The name of the routine which called XERBLA.
*> \endverbatim
*>
*> \param[in] INFO
*> \verbatim
*> INFO is INTEGER
*> The position of the invalid parameter in the parameter list
*> of the calling routine.
*> \endverbatim
*
* Authors:
* ========
*
*> \author Univ. of Tennessee
*> \author Univ. of California Berkeley
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \date December 2016
*
*> \ingroup aux_blas
*
* =====================================================================
SUBROUTINE XERBLA( SRNAME, INFO )
*
* -- Reference BLAS level1 routine (version 3.7.0) --
* -- Reference BLAS is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
* December 2016
*
* .. Scalar Arguments ..
CHARACTER*(*) SRNAME
INTEGER INFO
* ..
*
* =====================================================================
*
* .. Intrinsic Functions ..
INTRINSIC LEN_TRIM
* ..
* .. Executable Statements ..
*
WRITE( *, FMT = 9999 )SRNAME( 1:LEN_TRIM( SRNAME ) ), INFO
*
STOP
*
9999 FORMAT( ' ** On entry to ', A, ' parameter number ', I2, ' had ',
$ 'an illegal value' )
*
* End of XERBLA
*
END
*> \brief \b SGEMM
*
* =========== DOCUMENTATION ===========
*
* Online html documentation available at
* http://www.netlib.org/lapack/explore-html/
*
* Definition:
* ===========
*
* SUBROUTINE SGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
*
* .. Scalar Arguments ..
* REAL ALPHA,BETA
* INTEGER K,LDA,LDB,LDC,M,N
* CHARACTER TRANSA,TRANSB
* ..
* .. Array Arguments ..
* REAL A(LDA,*),B(LDB,*),C(LDC,*)
* ..
*
*
*> \par Purpose:
* =============
*>
*> \verbatim
*>
*> SGEMM performs one of the matrix-matrix operations
*>
*> C := alpha*op( A )*op( B ) + beta*C,
*>
*> where op( X ) is one of
*>
*> op( X ) = X or op( X ) = X**T,
*>
*> alpha and beta are scalars, and A, B and C are matrices, with op( A )
*> an m by k matrix, op( B ) a k by n matrix and C an m by n matrix.
*> \endverbatim
*
* Arguments:
* ==========
*
*> \param[in] TRANSA
*> \verbatim
*> TRANSA is CHARACTER*1
*> On entry, TRANSA specifies the form of op( A ) to be used in
*> the matrix multiplication as follows:
*>
*> TRANSA = 'N' or 'n', op( A ) = A.
*>
*> TRANSA = 'T' or 't', op( A ) = A**T.
*>
*> TRANSA = 'C' or 'c', op( A ) = A**T.
*> \endverbatim
*>
*> \param[in] TRANSB
*> \verbatim
*> TRANSB is CHARACTER*1
*> On entry, TRANSB specifies the form of op( B ) to be used in
*> the matrix multiplication as follows:
*>
*> TRANSB = 'N' or 'n', op( B ) = B.
*>
*> TRANSB = 'T' or 't', op( B ) = B**T.
*>
*> TRANSB = 'C' or 'c', op( B ) = B**T.
*> \endverbatim
*>
*> \param[in] M
*> \verbatim
*> M is INTEGER
*> On entry, M specifies the number of rows of the matrix
*> op( A ) and of the matrix C. M must be at least zero.
*> \endverbatim
*>
*> \param[in] N
*> \verbatim
*> N is INTEGER
*> On entry, N specifies the number of columns of the matrix
*> op( B ) and the number of columns of the matrix C. N must be
*> at least zero.
*> \endverbatim
*>
*> \param[in] K
*> \verbatim
*> K is INTEGER
*> On entry, K specifies the number of columns of the matrix
*> op( A ) and the number of rows of the matrix op( B ). K must
*> be at least zero.
*> \endverbatim
*>
*> \param[in] ALPHA
*> \verbatim
*> ALPHA is REAL
*> On entry, ALPHA specifies the scalar alpha.
*> \endverbatim
*>
*> \param[in] A
*> \verbatim
*> A is REAL array, dimension ( LDA, ka ), where ka is
*> k when TRANSA = 'N' or 'n', and is m otherwise.
*> Before entry with TRANSA = 'N' or 'n', the leading m by k
*> part of the array A must contain the matrix A, otherwise
*> the leading k by m part of the array A must contain the
*> matrix A.
*> \endverbatim
*>
*> \param[in] LDA
*> \verbatim
*> LDA is INTEGER
*> On entry, LDA specifies the first dimension of A as declared
*> in the calling (sub) program. When TRANSA = 'N' or 'n' then
*> LDA must be at least max( 1, m ), otherwise LDA must be at
*> least max( 1, k ).
*> \endverbatim
*>
*> \param[in] B
*> \verbatim
*> B is REAL array, dimension ( LDB, kb ), where kb is
*> n when TRANSB = 'N' or 'n', and is k otherwise.
*> Before entry with TRANSB = 'N' or 'n', the leading k by n
*> part of the array B must contain the matrix B, otherwise
*> the leading n by k part of the array B must contain the
*> matrix B.
*> \endverbatim
*>
*> \param[in] LDB
*> \verbatim
*> LDB is INTEGER
*> On entry, LDB specifies the first dimension of B as declared
*> in the calling (sub) program. When TRANSB = 'N' or 'n' then
*> LDB must be at least max( 1, k ), otherwise LDB must be at
*> least max( 1, n ).
*> \endverbatim
*>
*> \param[in] BETA
*> \verbatim
*> BETA is REAL
*> On entry, BETA specifies the scalar beta. When BETA is
*> supplied as zero then C need not be set on input.
*> \endverbatim
*>
*> \param[in,out] C
*> \verbatim
*> C is REAL array, dimension ( LDC, N )
*> Before entry, the leading m by n part of the array C must
*> contain the matrix C, except when beta is zero, in which
*> case C need not be set on entry.
*> On exit, the array C is overwritten by the m by n matrix
*> ( alpha*op( A )*op( B ) + beta*C ).
*> \endverbatim
*>
*> \param[in] LDC
*> \verbatim
*> LDC is INTEGER
*> On entry, LDC specifies the first dimension of C as declared
*> in the calling (sub) program. LDC must be at least
*> max( 1, m ).
*> \endverbatim
*
* Authors:
* ========
*
*> \author Univ. of Tennessee
*> \author Univ. of California Berkeley
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \date December 2016
*
*> \ingroup single_blas_level3
*
*> \par Further Details:
* =====================
*>
*> \verbatim
*>
*> Level 3 Blas routine.
*>
*> -- Written on 8-February-1989.
*> Jack Dongarra, Argonne National Laboratory.
*> Iain Duff, AERE Harwell.
*> Jeremy Du Croz, Numerical Algorithms Group Ltd.
*> Sven Hammarling, Numerical Algorithms Group Ltd.
*> \endverbatim
*>
* =====================================================================
SUBROUTINE SGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
*
* -- Reference BLAS level3 routine (version 3.7.0) --
* -- Reference BLAS is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
* December 2016
*
* .. Scalar Arguments ..
REAL ALPHA,BETA
INTEGER K,LDA,LDB,LDC,M,N
CHARACTER TRANSA,TRANSB
* ..
* .. Array Arguments ..
REAL A(LDA,*),B(LDB,*),C(LDC,*)
* ..
*
* =====================================================================
*
* .. External Functions ..
LOGICAL LSAME
EXTERNAL LSAME
* ..
* .. External Subroutines ..
EXTERNAL XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX
* ..
* .. Local Scalars ..
REAL TEMP
INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB
LOGICAL NOTA,NOTB
* ..
* .. Parameters ..
REAL ONE,ZERO
PARAMETER (ONE=1.0E+0,ZERO=0.0E+0)
* ..
*
* Set NOTA and NOTB as true if A and B respectively are not
* transposed and set NROWA, NCOLA and NROWB as the number of rows
* and columns of A and the number of rows of B respectively.
*
NOTA = LSAME(TRANSA,'N')
NOTB = LSAME(TRANSB,'N')
IF (NOTA) THEN
NROWA = M
NCOLA = K
ELSE
NROWA = K
NCOLA = M
END IF
IF (NOTB) THEN
NROWB = K
ELSE
NROWB = N
END IF
*
* Test the input parameters.
*
INFO = 0
IF ((.NOT.NOTA) .AND. (.NOT.LSAME(TRANSA,'C')) .AND.
+ (.NOT.LSAME(TRANSA,'T'))) THEN
INFO = 1
ELSE IF ((.NOT.NOTB) .AND. (.NOT.LSAME(TRANSB,'C')) .AND.
+ (.NOT.LSAME(TRANSB,'T'))) THEN
INFO = 2
ELSE IF (M.LT.0) THEN
INFO = 3
ELSE IF (N.LT.0) THEN
INFO = 4
ELSE IF (K.LT.0) THEN
INFO = 5
ELSE IF (LDA.LT.MAX(1,NROWA)) THEN
INFO = 8
ELSE IF (LDB.LT.MAX(1,NROWB)) THEN
INFO = 10
ELSE IF (LDC.LT.MAX(1,M)) THEN
INFO = 13
END IF
IF (INFO.NE.0) THEN
CALL XERBLA('SGEMM ',INFO)
RETURN
END IF
*
* Quick return if possible.
*
IF ((M.EQ.0) .OR. (N.EQ.0) .OR.
+ (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN
*
* And if alpha.eq.zero.
*
IF (ALPHA.EQ.ZERO) THEN
IF (BETA.EQ.ZERO) THEN
DO 20 J = 1,N
DO 10 I = 1,M
C(I,J) = ZERO
10 CONTINUE
20 CONTINUE
ELSE
DO 40 J = 1,N
DO 30 I = 1,M
C(I,J) = BETA*C(I,J)
30 CONTINUE
40 CONTINUE
END IF
RETURN
END IF
*
* Start the operations.
*
IF (NOTB) THEN
IF (NOTA) THEN
*
* Form C := alpha*A*B + beta*C.
*
DO 90 J = 1,N
IF (BETA.EQ.ZERO) THEN
DO 50 I = 1,M
C(I,J) = ZERO
50 CONTINUE
ELSE IF (BETA.NE.ONE) THEN
DO 60 I = 1,M
C(I,J) = BETA*C(I,J)
60 CONTINUE
END IF
DO 80 L = 1,K
TEMP = ALPHA*B(L,J)
DO 70 I = 1,M
C(I,J) = C(I,J) + TEMP*A(I,L)
70 CONTINUE
80 CONTINUE
90 CONTINUE
ELSE
*
* Form C := alpha*A**T*B + beta*C
*
DO 120 J = 1,N
DO 110 I = 1,M
TEMP = ZERO
DO 100 L = 1,K
TEMP = TEMP + A(L,I)*B(L,J)
100 CONTINUE
IF (BETA.EQ.ZERO) THEN
C(I,J) = ALPHA*TEMP
ELSE
C(I,J) = ALPHA*TEMP + BETA*C(I,J)
END IF
110 CONTINUE
120 CONTINUE
END IF
ELSE
IF (NOTA) THEN
*
* Form C := alpha*A*B**T + beta*C
*
DO 170 J = 1,N
IF (BETA.EQ.ZERO) THEN
DO 130 I = 1,M
C(I,J) = ZERO
130 CONTINUE
ELSE IF (BETA.NE.ONE) THEN
DO 140 I = 1,M
C(I,J) = BETA*C(I,J)
140 CONTINUE
END IF
DO 160 L = 1,K
TEMP = ALPHA*B(J,L)
DO 150 I = 1,M
C(I,J) = C(I,J) + TEMP*A(I,L)
150 CONTINUE
160 CONTINUE
170 CONTINUE
ELSE
*
* Form C := alpha*A**T*B**T + beta*C
*
DO 200 J = 1,N
DO 190 I = 1,M
TEMP = ZERO
DO 180 L = 1,K
TEMP = TEMP + A(L,I)*B(J,L)
180 CONTINUE
IF (BETA.EQ.ZERO) THEN
C(I,J) = ALPHA*TEMP
ELSE
C(I,J) = ALPHA*TEMP + BETA*C(I,J)
END IF
190 CONTINUE
200 CONTINUE
END IF
END IF
*
RETURN
*
* End of SGEMM .
*
END
*> \brief \b DGEMM
*
* =========== DOCUMENTATION ===========
*
* Online html documentation available at
* http://www.netlib.org/lapack/explore-html/
*
* Definition:
* ===========
*
* SUBROUTINE DGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
*
* .. Scalar Arguments ..
* DOUBLE PRECISION ALPHA,BETA
* INTEGER K,LDA,LDB,LDC,M,N
* CHARACTER TRANSA,TRANSB
* ..
* .. Array Arguments ..
* DOUBLE PRECISION A(LDA,*),B(LDB,*),C(LDC,*)
* ..
*
*
*> \par Purpose:
* =============
*>
*> \verbatim
*>
*> DGEMM performs one of the matrix-matrix operations
*>
*> C := alpha*op( A )*op( B ) + beta*C,
*>
*> where op( X ) is one of
*>
*> op( X ) = X or op( X ) = X**T,
*>
*> alpha and beta are scalars, and A, B and C are matrices, with op( A )
*> an m by k matrix, op( B ) a k by n matrix and C an m by n matrix.
*> \endverbatim
*
* Arguments:
* ==========
*
*> \param[in] TRANSA
*> \verbatim
*> TRANSA is CHARACTER*1
*> On entry, TRANSA specifies the form of op( A ) to be used in
*> the matrix multiplication as follows:
*>
*> TRANSA = 'N' or 'n', op( A ) = A.
*>
*> TRANSA = 'T' or 't', op( A ) = A**T.
*>
*> TRANSA = 'C' or 'c', op( A ) = A**T.
*> \endverbatim
*>
*> \param[in] TRANSB
*> \verbatim
*> TRANSB is CHARACTER*1
*> On entry, TRANSB specifies the form of op( B ) to be used in
*> the matrix multiplication as follows:
*>
*> TRANSB = 'N' or 'n', op( B ) = B.
*>
*> TRANSB = 'T' or 't', op( B ) = B**T.
*>
*> TRANSB = 'C' or 'c', op( B ) = B**T.
*> \endverbatim
*>
*> \param[in] M
*> \verbatim
*> M is INTEGER
*> On entry, M specifies the number of rows of the matrix
*> op( A ) and of the matrix C. M must be at least zero.
*> \endverbatim
*>
*> \param[in] N
*> \verbatim
*> N is INTEGER
*> On entry, N specifies the number of columns of the matrix
*> op( B ) and the number of columns of the matrix C. N must be
*> at least zero.
*> \endverbatim
*>
*> \param[in] K
*> \verbatim
*> K is INTEGER
*> On entry, K specifies the number of columns of the matrix
*> op( A ) and the number of rows of the matrix op( B ). K must
*> be at least zero.
*> \endverbatim
*>
*> \param[in] ALPHA
*> \verbatim
*> ALPHA is DOUBLE PRECISION.
*> On entry, ALPHA specifies the scalar alpha.
*> \endverbatim
*>
*> \param[in] A
*> \verbatim
*> A is DOUBLE PRECISION array, dimension ( LDA, ka ), where ka is
*> k when TRANSA = 'N' or 'n', and is m otherwise.
*> Before entry with TRANSA = 'N' or 'n', the leading m by k
*> part of the array A must contain the matrix A, otherwise
*> the leading k by m part of the array A must contain the
*> matrix A.
*> \endverbatim
*>
*> \param[in] LDA
*> \verbatim
*> LDA is INTEGER
*> On entry, LDA specifies the first dimension of A as declared
*> in the calling (sub) program. When TRANSA = 'N' or 'n' then
*> LDA must be at least max( 1, m ), otherwise LDA must be at
*> least max( 1, k ).
*> \endverbatim
*>
*> \param[in] B
*> \verbatim
*> B is DOUBLE PRECISION array, dimension ( LDB, kb ), where kb is
*> n when TRANSB = 'N' or 'n', and is k otherwise.
*> Before entry with TRANSB = 'N' or 'n', the leading k by n
*> part of the array B must contain the matrix B, otherwise
*> the leading n by k part of the array B must contain the
*> matrix B.
*> \endverbatim
*>
*> \param[in] LDB
*> \verbatim
*> LDB is INTEGER
*> On entry, LDB specifies the first dimension of B as declared
*> in the calling (sub) program. When TRANSB = 'N' or 'n' then
*> LDB must be at least max( 1, k ), otherwise LDB must be at
*> least max( 1, n ).
*> \endverbatim
*>
*> \param[in] BETA
*> \verbatim
*> BETA is DOUBLE PRECISION.
*> On entry, BETA specifies the scalar beta. When BETA is
*> supplied as zero then C need not be set on input.
*> \endverbatim
*>
*> \param[in,out] C
*> \verbatim
*> C is DOUBLE PRECISION array, dimension ( LDC, N )
*> Before entry, the leading m by n part of the array C must
*> contain the matrix C, except when beta is zero, in which
*> case C need not be set on entry.
*> On exit, the array C is overwritten by the m by n matrix
*> ( alpha*op( A )*op( B ) + beta*C ).
*> \endverbatim
*>
*> \param[in] LDC
*> \verbatim
*> LDC is INTEGER
*> On entry, LDC specifies the first dimension of C as declared
*> in the calling (sub) program. LDC must be at least
*> max( 1, m ).
*> \endverbatim
*
* Authors:
* ========
*
*> \author Univ. of Tennessee
*> \author Univ. of California Berkeley
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \date December 2016
*
*> \ingroup double_blas_level3
*
*> \par Further Details:
* =====================
*>
*> \verbatim
*>
*> Level 3 Blas routine.
*>
*> -- Written on 8-February-1989.
*> Jack Dongarra, Argonne National Laboratory.
*> Iain Duff, AERE Harwell.
*> Jeremy Du Croz, Numerical Algorithms Group Ltd.
*> Sven Hammarling, Numerical Algorithms Group Ltd.
*> \endverbatim
*>
* =====================================================================
SUBROUTINE DGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
*
* -- Reference BLAS level3 routine (version 3.7.0) --
* -- Reference BLAS is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
* December 2016
*
* .. Scalar Arguments ..
DOUBLE PRECISION ALPHA,BETA
INTEGER K,LDA,LDB,LDC,M,N
CHARACTER TRANSA,TRANSB
* ..
* .. Array Arguments ..
DOUBLE PRECISION A(LDA,*),B(LDB,*),C(LDC,*)
* ..
*
* =====================================================================
*
* .. External Functions ..
LOGICAL LSAME
EXTERNAL LSAME
* ..
* .. External Subroutines ..
EXTERNAL XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX
* ..
* .. Local Scalars ..
DOUBLE PRECISION TEMP
INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB
LOGICAL NOTA,NOTB
* ..
* .. Parameters ..
DOUBLE PRECISION ONE,ZERO
PARAMETER (ONE=1.0D+0,ZERO=0.0D+0)
* ..
*
* Set NOTA and NOTB as true if A and B respectively are not
* transposed and set NROWA, NCOLA and NROWB as the number of rows
* and columns of A and the number of rows of B respectively.
*
NOTA = LSAME(TRANSA,'N')
NOTB = LSAME(TRANSB,'N')
IF (NOTA) THEN
NROWA = M
NCOLA = K
ELSE
NROWA = K
NCOLA = M
END IF
IF (NOTB) THEN
NROWB = K
ELSE
NROWB = N
END IF
*
* Test the input parameters.
*
INFO = 0
IF ((.NOT.NOTA) .AND. (.NOT.LSAME(TRANSA,'C')) .AND.
+ (.NOT.LSAME(TRANSA,'T'))) THEN
INFO = 1
ELSE IF ((.NOT.NOTB) .AND. (.NOT.LSAME(TRANSB,'C')) .AND.
+ (.NOT.LSAME(TRANSB,'T'))) THEN
INFO = 2
ELSE IF (M.LT.0) THEN
INFO = 3
ELSE IF (N.LT.0) THEN
INFO = 4
ELSE IF (K.LT.0) THEN
INFO = 5
ELSE IF (LDA.LT.MAX(1,NROWA)) THEN
INFO = 8
ELSE IF (LDB.LT.MAX(1,NROWB)) THEN
INFO = 10
ELSE IF (LDC.LT.MAX(1,M)) THEN
INFO = 13
END IF
IF (INFO.NE.0) THEN
CALL XERBLA('DGEMM ',INFO)
RETURN
END IF
*
* Quick return if possible.
*
IF ((M.EQ.0) .OR. (N.EQ.0) .OR.
+ (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN
*
* And if alpha.eq.zero.
*
IF (ALPHA.EQ.ZERO) THEN
IF (BETA.EQ.ZERO) THEN
DO 20 J = 1,N
DO 10 I = 1,M
C(I,J) = ZERO
10 CONTINUE
20 CONTINUE
ELSE
DO 40 J = 1,N
DO 30 I = 1,M
C(I,J) = BETA*C(I,J)
30 CONTINUE
40 CONTINUE
END IF
RETURN
END IF
*
* Start the operations.
*
IF (NOTB) THEN
IF (NOTA) THEN
*
* Form C := alpha*A*B + beta*C.
*
DO 90 J = 1,N
IF (BETA.EQ.ZERO) THEN
DO 50 I = 1,M
C(I,J) = ZERO
50 CONTINUE
ELSE IF (BETA.NE.ONE) THEN
DO 60 I = 1,M
C(I,J) = BETA*C(I,J)
60 CONTINUE
END IF
DO 80 L = 1,K
TEMP = ALPHA*B(L,J)
DO 70 I = 1,M
C(I,J) = C(I,J) + TEMP*A(I,L)
70 CONTINUE
80 CONTINUE
90 CONTINUE
ELSE
*
* Form C := alpha*A**T*B + beta*C
*
DO 120 J = 1,N
DO 110 I = 1,M
TEMP = ZERO
DO 100 L = 1,K
TEMP = TEMP + A(L,I)*B(L,J)
100 CONTINUE
IF (BETA.EQ.ZERO) THEN
C(I,J) = ALPHA*TEMP
ELSE
C(I,J) = ALPHA*TEMP + BETA*C(I,J)
END IF
110 CONTINUE
120 CONTINUE
END IF
ELSE
IF (NOTA) THEN
*
* Form C := alpha*A*B**T + beta*C
*
DO 170 J = 1,N
IF (BETA.EQ.ZERO) THEN
DO 130 I = 1,M
C(I,J) = ZERO
130 CONTINUE
ELSE IF (BETA.NE.ONE) THEN
DO 140 I = 1,M
C(I,J) = BETA*C(I,J)
140 CONTINUE
END IF
DO 160 L = 1,K
TEMP = ALPHA*B(J,L)
DO 150 I = 1,M
C(I,J) = C(I,J) + TEMP*A(I,L)
150 CONTINUE
160 CONTINUE
170 CONTINUE
ELSE
*
* Form C := alpha*A**T*B**T + beta*C
*
DO 200 J = 1,N
DO 190 I = 1,M
TEMP = ZERO
DO 180 L = 1,K
TEMP = TEMP + A(L,I)*B(J,L)
180 CONTINUE
IF (BETA.EQ.ZERO) THEN
C(I,J) = ALPHA*TEMP
ELSE
C(I,J) = ALPHA*TEMP + BETA*C(I,J)
END IF
190 CONTINUE
200 CONTINUE
END IF
END IF
*
RETURN
*
* End of DGEMM .
*
END
*> \brief \b ZGEMM
*
* =========== DOCUMENTATION ===========
*
* Online html documentation available at
* http://www.netlib.org/lapack/explore-html/
*
* Definition:
* ===========
*
* SUBROUTINE ZGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
*
* .. Scalar Arguments ..
* COMPLEX*16 ALPHA,BETA
* INTEGER K,LDA,LDB,LDC,M,N
* CHARACTER TRANSA,TRANSB
* ..
* .. Array Arguments ..
* COMPLEX*16 A(LDA,*),B(LDB,*),C(LDC,*)
* ..
*
*
*> \par Purpose:
* =============
*>
*> \verbatim
*>
*> ZGEMM performs one of the matrix-matrix operations
*>
*> C := alpha*op( A )*op( B ) + beta*C,
*>
*> where op( X ) is one of
*>
*> op( X ) = X or op( X ) = X**T or op( X ) = X**H,
*>
*> alpha and beta are scalars, and A, B and C are matrices, with op( A )
*> an m by k matrix, op( B ) a k by n matrix and C an m by n matrix.
*> \endverbatim
*
* Arguments:
* ==========
*
*> \param[in] TRANSA
*> \verbatim
*> TRANSA is CHARACTER*1
*> On entry, TRANSA specifies the form of op( A ) to be used in
*> the matrix multiplication as follows:
*>
*> TRANSA = 'N' or 'n', op( A ) = A.
*>
*> TRANSA = 'T' or 't', op( A ) = A**T.
*>
*> TRANSA = 'C' or 'c', op( A ) = A**H.
*> \endverbatim
*>
*> \param[in] TRANSB
*> \verbatim
*> TRANSB is CHARACTER*1
*> On entry, TRANSB specifies the form of op( B ) to be used in
*> the matrix multiplication as follows:
*>
*> TRANSB = 'N' or 'n', op( B ) = B.
*>
*> TRANSB = 'T' or 't', op( B ) = B**T.
*>
*> TRANSB = 'C' or 'c', op( B ) = B**H.
*> \endverbatim
*>
*> \param[in] M
*> \verbatim
*> M is INTEGER
*> On entry, M specifies the number of rows of the matrix
*> op( A ) and of the matrix C. M must be at least zero.
*> \endverbatim
*>
*> \param[in] N
*> \verbatim
*> N is INTEGER
*> On entry, N specifies the number of columns of the matrix
*> op( B ) and the number of columns of the matrix C. N must be
*> at least zero.
*> \endverbatim
*>
*> \param[in] K
*> \verbatim
*> K is INTEGER
*> On entry, K specifies the number of columns of the matrix
*> op( A ) and the number of rows of the matrix op( B ). K must
*> be at least zero.
*> \endverbatim
*>
*> \param[in] ALPHA
*> \verbatim
*> ALPHA is COMPLEX*16
*> On entry, ALPHA specifies the scalar alpha.
*> \endverbatim
*>
*> \param[in] A
*> \verbatim
*> A is COMPLEX*16 array, dimension ( LDA, ka ), where ka is
*> k when TRANSA = 'N' or 'n', and is m otherwise.
*> Before entry with TRANSA = 'N' or 'n', the leading m by k
*> part of the array A must contain the matrix A, otherwise
*> the leading k by m part of the array A must contain the
*> matrix A.
*> \endverbatim
*>
*> \param[in] LDA
*> \verbatim
*> LDA is INTEGER
*> On entry, LDA specifies the first dimension of A as declared
*> in the calling (sub) program. When TRANSA = 'N' or 'n' then
*> LDA must be at least max( 1, m ), otherwise LDA must be at
*> least max( 1, k ).
*> \endverbatim
*>
*> \param[in] B
*> \verbatim
*> B is COMPLEX*16 array, dimension ( LDB, kb ), where kb is
*> n when TRANSB = 'N' or 'n', and is k otherwise.
*> Before entry with TRANSB = 'N' or 'n', the leading k by n
*> part of the array B must contain the matrix B, otherwise
*> the leading n by k part of the array B must contain the
*> matrix B.
*> \endverbatim
*>
*> \param[in] LDB
*> \verbatim
*> LDB is INTEGER
*> On entry, LDB specifies the first dimension of B as declared
*> in the calling (sub) program. When TRANSB = 'N' or 'n' then
*> LDB must be at least max( 1, k ), otherwise LDB must be at
*> least max( 1, n ).
*> \endverbatim
*>
*> \param[in] BETA
*> \verbatim
*> BETA is COMPLEX*16
*> On entry, BETA specifies the scalar beta. When BETA is
*> supplied as zero then C need not be set on input.
*> \endverbatim
*>
*> \param[in,out] C
*> \verbatim
*> C is COMPLEX*16 array, dimension ( LDC, N )
*> Before entry, the leading m by n part of the array C must
*> contain the matrix C, except when beta is zero, in which
*> case C need not be set on entry.
*> On exit, the array C is overwritten by the m by n matrix
*> ( alpha*op( A )*op( B ) + beta*C ).
*> \endverbatim
*>
*> \param[in] LDC
*> \verbatim
*> LDC is INTEGER
*> On entry, LDC specifies the first dimension of C as declared
*> in the calling (sub) program. LDC must be at least
*> max( 1, m ).
*> \endverbatim
*
* Authors:
* ========
*
*> \author Univ. of Tennessee
*> \author Univ. of California Berkeley
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \date December 2016
*
*> \ingroup complex16_blas_level3
*
*> \par Further Details:
* =====================
*>
*> \verbatim
*>
*> Level 3 Blas routine.
*>
*> -- Written on 8-February-1989.
*> Jack Dongarra, Argonne National Laboratory.
*> Iain Duff, AERE Harwell.
*> Jeremy Du Croz, Numerical Algorithms Group Ltd.
*> Sven Hammarling, Numerical Algorithms Group Ltd.
*> \endverbatim
*>
* =====================================================================
SUBROUTINE ZGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC)
*
* -- Reference BLAS level3 routine (version 3.7.0) --
* -- Reference BLAS is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
* December 2016
*
* .. Scalar Arguments ..
COMPLEX*16 ALPHA,BETA
INTEGER K,LDA,LDB,LDC,M,N
CHARACTER TRANSA,TRANSB
* ..
* .. Array Arguments ..
COMPLEX*16 A(LDA,*),B(LDB,*),C(LDC,*)
* ..
*
* =====================================================================
*
* .. External Functions ..
LOGICAL LSAME
EXTERNAL LSAME
* ..
* .. External Subroutines ..
EXTERNAL XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC DCONJG,MAX
* ..
* .. Local Scalars ..
COMPLEX*16 TEMP
INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB
LOGICAL CONJA,CONJB,NOTA,NOTB
* ..
* .. Parameters ..
COMPLEX*16 ONE
PARAMETER (ONE= (1.0D+0,0.0D+0))
COMPLEX*16 ZERO
PARAMETER (ZERO= (0.0D+0,0.0D+0))
* ..
*
* Set NOTA and NOTB as true if A and B respectively are not
* conjugated or transposed, set CONJA and CONJB as true if A and
* B respectively are to be transposed but not conjugated and set
* NROWA, NCOLA and NROWB as the number of rows and columns of A
* and the number of rows of B respectively.
*
NOTA = LSAME(TRANSA,'N')
NOTB = LSAME(TRANSB,'N')
CONJA = LSAME(TRANSA,'C')
CONJB = LSAME(TRANSB,'C')
IF (NOTA) THEN
NROWA = M
NCOLA = K
ELSE
NROWA = K
NCOLA = M
END IF
IF (NOTB) THEN
NROWB = K
ELSE
NROWB = N
END IF
*
* Test the input parameters.
*
INFO = 0
IF ((.NOT.NOTA) .AND. (.NOT.CONJA) .AND.
+ (.NOT.LSAME(TRANSA,'T'))) THEN
INFO = 1
ELSE IF ((.NOT.NOTB) .AND. (.NOT.CONJB) .AND.
+ (.NOT.LSAME(TRANSB,'T'))) THEN
INFO = 2
ELSE IF (M.LT.0) THEN
INFO = 3
ELSE IF (N.LT.0) THEN
INFO = 4
ELSE IF (K.LT.0) THEN
INFO = 5
ELSE IF (LDA.LT.MAX(1,NROWA)) THEN
INFO = 8
ELSE IF (LDB.LT.MAX(1,NROWB)) THEN
INFO = 10
ELSE IF (LDC.LT.MAX(1,M)) THEN
INFO = 13
END IF
IF (INFO.NE.0) THEN
CALL XERBLA('ZGEMM ',INFO)
RETURN
END IF
*
* Quick return if possible.
*
IF ((M.EQ.0) .OR. (N.EQ.0) .OR.
+ (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN
*
* And when alpha.eq.zero.
*
IF (ALPHA.EQ.ZERO) THEN
IF (BETA.EQ.ZERO) THEN
DO 20 J = 1,N
DO 10 I = 1,M
C(I,J) = ZERO
10 CONTINUE
20 CONTINUE
ELSE
DO 40 J = 1,N
DO 30 I = 1,M
C(I,J) = BETA*C(I,J)
30 CONTINUE
40 CONTINUE
END IF
RETURN
END IF
*
* Start the operations.
*
IF (NOTB) THEN
IF (NOTA) THEN
*
* Form C := alpha*A*B + beta*C.
*
DO 90 J = 1,N
IF (BETA.EQ.ZERO) THEN
DO 50 I = 1,M
C(I,J) = ZERO
50 CONTINUE
ELSE IF (BETA.NE.ONE) THEN
DO 60 I = 1,M
C(I,J) = BETA*C(I,J)
60 CONTINUE
END IF
DO 80 L = 1,K
TEMP = ALPHA*B(L,J)
DO 70 I = 1,M
C(I,J) = C(I,J) + TEMP*A(I,L)
70 CONTINUE
80 CONTINUE
90 CONTINUE
ELSE IF (CONJA) THEN
*
* Form C := alpha*A**H*B + beta*C.
*
DO 120 J = 1,N
DO 110 I = 1,M
TEMP = ZERO
DO 100 L = 1,K
TEMP = TEMP + DCONJG(A(L,I))*B(L,J)
100 CONTINUE
IF (BETA.EQ.ZERO) THEN
C(I,J) = ALPHA*TEMP
ELSE
C(I,J) = ALPHA*TEMP + BETA*C(I,J)
END IF
110 CONTINUE
120 CONTINUE
ELSE
*
* Form C := alpha*A**T*B + beta*C
*
DO 150 J = 1,N
DO 140 I = 1,M
TEMP = ZERO
DO 130 L = 1,K
TEMP = TEMP + A(L,I)*B(L,J)
130 CONTINUE
IF (BETA.EQ.ZERO) THEN
C(I,J) = ALPHA*TEMP
ELSE
C(I,J) = ALPHA*TEMP + BETA*C(I,J)
END IF
140 CONTINUE
150 CONTINUE
END IF
ELSE IF (NOTA) THEN
IF (CONJB) THEN
*
* Form C := alpha*A*B**H + beta*C.
*
DO 200 J = 1,N
IF (BETA.EQ.ZERO) THEN
DO 160 I = 1,M
C(I,J) = ZERO
160 CONTINUE
ELSE IF (BETA.NE.ONE) THEN
DO 170 I = 1,M
C(I,J) = BETA*C(I,J)
170 CONTINUE
END IF
DO 190 L = 1,K
TEMP = ALPHA*DCONJG(B(J,L))
DO 180 I = 1,M
C(I,J) = C(I,J) + TEMP*A(I,L)
180 CONTINUE
190 CONTINUE
200 CONTINUE
ELSE
*
* Form C := alpha*A*B**T + beta*C
*
DO 250 J = 1,N
IF (BETA.EQ.ZERO) THEN
DO 210 I = 1,M
C(I,J) = ZERO
210 CONTINUE
ELSE IF (BETA.NE.ONE) THEN
DO 220 I = 1,M
C(I,J) = BETA*C(I,J)
220 CONTINUE
END IF
DO 240 L = 1,K
TEMP = ALPHA*B(J,L)
DO 230 I = 1,M
C(I,J) = C(I,J) + TEMP*A(I,L)
230 CONTINUE
240 CONTINUE
250 CONTINUE
END IF
ELSE IF (CONJA) THEN
IF (CONJB) THEN
*
* Form C := alpha*A**H*B**H + beta*C.
*
DO 280 J = 1,N
DO 270 I = 1,M
TEMP = ZERO
DO 260 L = 1,K
TEMP = TEMP + DCONJG(A(L,I))*DCONJG(B(J,L))
260 CONTINUE
IF (BETA.EQ.ZERO) THEN
C(I,J) = ALPHA*TEMP
ELSE
C(I,J) = ALPHA*TEMP + BETA*C(I,J)
END IF
270 CONTINUE
280 CONTINUE
ELSE
*
* Form C := alpha*A**H*B**T + beta*C
*
DO 310 J = 1,N
DO 300 I = 1,M
TEMP = ZERO
DO 290 L = 1,K
TEMP = TEMP + DCONJG(A(L,I))*B(J,L)
290 CONTINUE
IF (BETA.EQ.ZERO) THEN
C(I,J) = ALPHA*TEMP
ELSE
C(I,J) = ALPHA*TEMP + BETA*C(I,J)
END IF
300 CONTINUE
310 CONTINUE
END IF
ELSE
IF (CONJB) THEN
*
* Form C := alpha*A**T*B**H + beta*C
*
DO 340 J = 1,N
DO 330 I = 1,M
TEMP = ZERO
DO 320 L = 1,K
TEMP = TEMP + A(L,I)*DCONJG(B(J,L))
320 CONTINUE
IF (BETA.EQ.ZERO) THEN
C(I,J) = ALPHA*TEMP
ELSE
C(I,J) = ALPHA*TEMP + BETA*C(I,J)
END IF
330 CONTINUE
340 CONTINUE
ELSE
*
* Form C := alpha*A**T*B**T + beta*C
*
DO 370 J = 1,N
DO 360 I = 1,M
TEMP = ZERO
DO 350 L = 1,K
TEMP = TEMP + A(L,I)*B(J,L)
350 CONTINUE
IF (BETA.EQ.ZERO) THEN
C(I,J) = ALPHA*TEMP
ELSE
C(I,J) = ALPHA*TEMP + BETA*C(I,J)
END IF
360 CONTINUE
370 CONTINUE
END IF
END IF
*
RETURN
*
* End of ZGEMM .
*
END
......@@ -44,4 +44,4 @@ program main
deallocate(calloc)
end program main
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "original" } }
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 1 "original" } }
......@@ -58,4 +58,4 @@ program main
end do
end do
end program main
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "optimized" } }
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 1 "optimized" } }
C { dg-do run }
C { dg-options "-fcheck=bounds -fdump-tree-optimized -fblas-matmul-limit=1 -O -fexternal-blas" }
C { dg-additional-sources blas_gemm_routines.f }
C Test calling of BLAS routines
program main
call sub_s
call sub_d
call sub_c
call sub_z
end
subroutine sub_d
implicit none
real(8), dimension(3,2) :: a
real(8), dimension(2,3) :: at
real(8), dimension(2,4) :: b
real(8), dimension(4,2) :: bt
real(8), dimension(3,4) :: c
real(8), dimension(3,4) :: cres
real(8), dimension(:,:), allocatable :: c_alloc
data a / 2., -3., 5., -7., 11., -13./
data b /17., -23., 29., -31., 37., -39., 41., -47./
data cres /195., -304., 384., 275., -428., 548., 347., -540.,
& 692., 411., -640., 816./
c = matmul(a,b)
if (any (c /= cres)) stop 31
at = transpose(a)
c = (1.2,-2.2)
c = matmul(transpose(at), b)
if (any (c /= cres)) stop 32
bt = transpose(b)
c = (1.2,-2.1)
c = matmul(a, transpose(bt))
if (any (c /= cres)) stop 33
c_alloc = matmul(a,b)
if (any (c /= cres)) stop 34
at = transpose(a)
deallocate (c_alloc)
c = matmul(transpose(at), b)
if (any (c /= cres)) stop 35
bt = transpose(b)
allocate (c_alloc(20,20))
c = (1.2,-2.1)
c = matmul(a, transpose(bt))
if (any (c /= cres)) stop 36
end
subroutine sub_s
implicit none
real, dimension(3,2) :: a
real, dimension(2,3) :: at
real, dimension(2,4) :: b
real, dimension(4,2) :: bt
real, dimension(3,4) :: c
real, dimension(3,4) :: cres
real, dimension(:,:), allocatable :: c_alloc
data a / 2., -3., 5., -7., 11., -13./
data b /17., -23., 29., -31., 37., -39., 41., -47./
data cres /195., -304., 384., 275., -428., 548., 347., -540.,
& 692., 411., -640., 816./
c = matmul(a,b)
if (any (c /= cres)) stop 21
at = transpose(a)
c = (1.2,-2.2)
c = matmul(transpose(at), b)
if (any (c /= cres)) stop 22
bt = transpose(b)
c = (1.2,-2.1)
c = matmul(a, transpose(bt))
if (any (c /= cres)) stop 23
c_alloc = matmul(a,b)
if (any (c /= cres)) stop 24
at = transpose(a)
deallocate (c_alloc)
c = matmul(transpose(at), b)
if (any (c /= cres)) stop 25
bt = transpose(b)
allocate (c_alloc(20,20))
c = (1.2,-2.1)
c = matmul(a, transpose(bt))
if (any (c /= cres)) stop 26
end
subroutine sub_c
implicit none
complex, dimension(3,2) :: a
complex, dimension(2,3) :: at, ah
complex, dimension(2,4) :: b
complex, dimension(4,2) :: bt, bh
complex, dimension(3,4) :: c
complex, dimension(3,4) :: cres
complex, dimension(:,:), allocatable :: c_alloc
data a / (2.,-3.), (-5.,7.), (11.,-13.), (17.,19), (-23., -29),
& (-31., 37.)/
data b / (-41., 43.), (-47., 53.), (-59.,-61.), (-67., 71),
& ( 73.,79. ), (83.,-89.), (97.,-101.), (-107.,-109.)/
data cres /(-1759.,217.), (2522.,-358.), (-396.,-2376.),
& (-2789.,-11.),
& (4322.,202.), (-1992.,-4584.), (3485.,3.), (-5408.,-244.),
& (2550.,5750.), (143.,-4379.), (-478.,6794.), (7104.,-2952.) /
c = matmul(a,b)
if (any (c /= cres)) stop 1
at = transpose(a)
c = (1.2,-2.2)
c = matmul(transpose(at), b)
if (any (c /= cres)) stop 2
bt = transpose(b)
c = (1.2,-2.1)
c = matmul(a, transpose(bt))
if (any (c /= cres)) stop 3
ah = transpose(conjg(a))
c = (1.2,-2.2)
c = matmul(conjg(transpose(ah)), b)
if (any (c /= cres)) stop 4
bh = transpose(conjg(b))
c = (1.2,-2.2)
c = matmul(a, transpose(conjg(bh)))
if (any (c /= cres)) stop 5
c_alloc = matmul(a,b)
if (any (c /= cres)) stop 6
at = transpose(a)
deallocate (c_alloc)
c = matmul(transpose(at), b)
if (any (c /= cres)) stop 7
bt = transpose(b)
allocate (c_alloc(20,20))
c = (1.2,-2.1)
c = matmul(a, transpose(bt))
if (any (c /= cres)) stop 8
ah = transpose(conjg(a))
c = (1.2,-2.2)
c = matmul(conjg(transpose(ah)), b)
if (any (c /= cres)) stop 9
deallocate (c_alloc)
allocate (c_alloc(0,0))
bh = transpose(conjg(b))
c = (1.2,-2.2)
c = matmul(a, transpose(conjg(bh)))
if (any (c /= cres)) stop 10
end
subroutine sub_z
implicit none
complex(8), dimension(3,2) :: a
complex(8), dimension(2,3) :: at, ah
complex(8), dimension(2,4) :: b
complex(8), dimension(4,2) :: bt, bh
complex(8), dimension(3,4) :: c
complex(8), dimension(3,4) :: cres
complex(8), dimension(:,:), allocatable :: c_alloc
data a / (2.,-3.), (-5._8,7.), (11.,-13.), (17.,19),
& (-23., -29), (-31., 37.)/
data b / (-41., 43.), (-47., 53.), (-59.,-61.), (-67., 71),
& ( 73.,79. ), (83.,-89.), (97.,-101.), (-107.,-109.)/
data cres /(-1759.,217.), (2522.,-358.), (-396.,-2376.),
& (-2789.,-11.),
& (4322.,202.), (-1992.,-4584.), (3485.,3.), (-5408.,-244.),
& (2550.,5750.), (143.,-4379.), (-478.,6794.), (7104.,-2952.) /
c = matmul(a,b)
if (any (c /= cres)) stop 11
at = transpose(a)
c = (1.2,-2.2)
c = matmul(transpose(at), b)
if (any (c /= cres)) stop 12
bt = transpose(b)
c = (1.2,-2.1)
c = matmul(a, transpose(bt))
if (any (c /= cres)) stop 13
ah = transpose(conjg(a))
c = (1.2,-2.2)
c = matmul(conjg(transpose(ah)), b)
if (any (c /= cres)) stop 14
bh = transpose(conjg(b))
c = (1.2,-2.2)
c = matmul(a, transpose(conjg(bh)))
if (any (c /= cres)) stop 15
c_alloc = matmul(a,b)
if (any (c /= cres)) stop 16
at = transpose(a)
deallocate (c_alloc)
c = matmul(transpose(at), b)
if (any (c /= cres)) stop 17
bt = transpose(b)
allocate (c_alloc(20,20))
c = (1.2,-2.1)
c = matmul(a, transpose(bt))
if (any (c /= cres)) stop 18
ah = transpose(conjg(a))
c = (1.2,-2.2)
c = matmul(conjg(transpose(ah)), b)
if (any (c /= cres)) stop 19
deallocate (c_alloc)
allocate (c_alloc(0,0))
bh = transpose(conjg(b))
c = (1.2,-2.2)
c = matmul(a, transpose(conjg(bh)))
if (any (c /= cres)) stop 20
end
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "optimized" } }
C { dg-do run }
C { dg-options "-fno-realloc-lhs -fdump-tree-optimized -fcheck=bounds -fblas-matmul-limit=1 -O -fexternal-blas" }
C { dg-shouldfail "Fortran runtime error: Array bound mismatch for dimension 2 of array." }
C { dg-additional-sources blas_gemm_routines.f }
program main
real, dimension(3,2) :: a
real, dimension(2,3) :: b
real, dimension(:,:), allocatable :: ret
a = 1.0
b = 2.3
allocate(ret(3,2))
ret = matmul(a,b) ! This should throw an error.
end
! { dg-output "Fortran runtime error: Array bound mismatch for dimension 2 of array.*" }
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "optimized" } }
C { dg-do run }
C { dg-options "-fdump-tree-optimized -fcheck=bounds -fblas-matmul-limit=1 -O -fexternal-blas" }
C { dg-shouldfail "Fortran runtime error: Incorrect extent in argument B in MATMUL intrinsic in dimension 1.*" }
C { dg-additional-sources blas_gemm_routines.f }
program main
character(len=20) :: line
integer :: n, m
real, dimension(3,2) :: a
real, dimension(:,:), allocatable :: b
real, dimension(:,:), allocatable :: ret
a = 1.0
line = '3 3'
read (unit=line,fmt=*) n, m
allocate (b(n,m))
b = 2.3
ret = matmul(a,b) ! This should throw an error.
end
! { dg-output "Fortran runtime error: Incorrect extent in argument B in MATMUL intrinsic in dimension 1.*" }
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "optimized" } }
C { dg-do run }
C { dg-options "-fdump-tree-optimized -fcheck=bounds -fblas-matmul-limit=1 -O -fexternal-blas" }
C { dg-shouldfail "Fortran runtime error: Incorrect extent in argument B in MATMUL intrinsic in dimension 1" }
C { dg-additional-sources blas_gemm_routines.f }
program main
character(len=20) :: line
integer :: n, m
real, dimension(3,2) :: a
real, dimension(:,:), allocatable :: b
real, dimension(:,:), allocatable :: ret
a = 1.0
line = '4 3'
read (unit=line,fmt=*) n, m
allocate (b(n,m))
b = 2.3
ret = matmul(transpose(a),b) ! This should throw an error.
end
! { dg-output "Fortran runtime error: Incorrect extent in argument B in MATMUL intrinsic in dimension 1.*" }
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "optimized" } }
! { dg-do compile }
! { dg-options "-fdefault-real-8 -fexternal-blas -fdump-tree-original -finline-matmul-limit=0" }
! { dg-options "-fdefault-real-8 -fexternal-blas -fblas-matmul-limit=1 -fdump-tree-original -finline-matmul-limit=0" }
!
! PR fortran/54463
!
......@@ -8,8 +8,9 @@
program test
implicit none
real, dimension(3,3) :: A
call random_number(a)
A = matmul(A,A)
end program test
! { dg-final { scan-tree-dump-times "sgemm_" 0 "original" } }
! { dg-final { scan-tree-dump-times "dgemm_" 1 "original" } }
! { dg-final { scan-tree-dump-times "sgemm" 0 "original" } }
! { dg-final { scan-tree-dump-times "dgemm" 1 "original" } }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment