#include "blaswrap.h"
#include "f2c.h"

/* Subroutine */ int stgsyl_(char *trans, integer *ijob, integer *m, integer *
	n, real *a, integer *lda, real *b, integer *ldb, real *c__, integer *
	ldc, real *d__, integer *ldd, real *e, integer *lde, real *f, integer 
	*ldf, real *scale, real *dif, real *work, integer *lwork, integer *
	iwork, integer *info)
{
/*  -- LAPACK routine (version 3.1) --   
       Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..   
       November 2006   


    Purpose   
    =======   

    STGSYL solves the generalized Sylvester equation:   

                A * R - L * B = scale * C                 (1)   
                D * R - L * E = scale * F   

    where R and L are unknown m-by-n matrices, (A, D), (B, E) and   
    (C, F) are given matrix pairs of size m-by-m, n-by-n and m-by-n,   
    respectively, with real entries. (A, D) and (B, E) must be in   
    generalized (real) Schur canonical form, i.e. A, B are upper quasi   
    triangular and D, E are upper triangular.   

    The solution (R, L) overwrites (C, F). 0 <= SCALE <= 1 is an output   
    scaling factor chosen to avoid overflow.   

    In matrix notation (1) is equivalent to solve  Zx = scale b, where   
    Z is defined as   

               Z = [ kron(In, A)  -kron(B', Im) ]         (2)   
                   [ kron(In, D)  -kron(E', Im) ].   

    Here Ik is the identity matrix of size k and X' is the transpose of   
    X. kron(X, Y) is the Kronecker product between the matrices X and Y.   

    If TRANS = 'T', STGSYL solves the transposed system Z'*y = scale*b,   
    which is equivalent to solve for R and L in   

                A' * R  + D' * L   = scale *  C           (3)   
                R  * B' + L  * E'  = scale * (-F)   

    This case (TRANS = 'T') is used to compute an one-norm-based estimate   
    of Dif[(A,D), (B,E)], the separation between the matrix pairs (A,D)   
    and (B,E), using SLACON.   

    If IJOB >= 1, STGSYL computes a Frobenius norm-based estimate   
    of Dif[(A,D),(B,E)]. That is, the reciprocal of a lower bound on the   
    reciprocal of the smallest singular value of Z. See [1-2] for more   
    information.   

    This is a level 3 BLAS algorithm.   

    Arguments   
    =========   

    TRANS   (input) CHARACTER*1   
            = 'N', solve the generalized Sylvester equation (1).   
            = 'T', solve the 'transposed' system (3).   

    IJOB    (input) INTEGER   
            Specifies what kind of functionality to be performed.   
             =0: solve (1) only.   
             =1: The functionality of 0 and 3.   
             =2: The functionality of 0 and 4.   
             =3: Only an estimate of Dif[(A,D), (B,E)] is computed.   
                 (look ahead strategy IJOB  = 1 is used).   
             =4: Only an estimate of Dif[(A,D), (B,E)] is computed.   
                 ( SGECON on sub-systems is used ).   
            Not referenced if TRANS = 'T'.   

    M       (input) INTEGER   
            The order of the matrices A and D, and the row dimension of   
            the matrices C, F, R and L.   

    N       (input) INTEGER   
            The order of the matrices B and E, and the column dimension   
            of the matrices C, F, R and L.   

    A       (input) REAL array, dimension (LDA, M)   
            The upper quasi triangular matrix A.   

    LDA     (input) INTEGER   
            The leading dimension of the array A. LDA >= max(1, M).   

    B       (input) REAL array, dimension (LDB, N)   
            The upper quasi triangular matrix B.   

    LDB     (input) INTEGER   
            The leading dimension of the array B. LDB >= max(1, N).   

    C       (input/output) REAL array, dimension (LDC, N)   
            On entry, C contains the right-hand-side of the first matrix   
            equation in (1) or (3).   
            On exit, if IJOB = 0, 1 or 2, C has been overwritten by   
            the solution R. If IJOB = 3 or 4 and TRANS = 'N', C holds R,   
            the solution achieved during the computation of the   
            Dif-estimate.   

    LDC     (input) INTEGER   
            The leading dimension of the array C. LDC >= max(1, M).   

    D       (input) REAL array, dimension (LDD, M)   
            The upper triangular matrix D.   

    LDD     (input) INTEGER   
            The leading dimension of the array D. LDD >= max(1, M).   

    E       (input) REAL array, dimension (LDE, N)   
            The upper triangular matrix E.   

    LDE     (input) INTEGER   
            The leading dimension of the array E. LDE >= max(1, N).   

    F       (input/output) REAL array, dimension (LDF, N)   
            On entry, F contains the right-hand-side of the second matrix   
            equation in (1) or (3).   
            On exit, if IJOB = 0, 1 or 2, F has been overwritten by   
            the solution L. If IJOB = 3 or 4 and TRANS = 'N', F holds L,   
            the solution achieved during the computation of the   
            Dif-estimate.   

    LDF     (input) INTEGER   
            The leading dimension of the array F. LDF >= max(1, M).   

    DIF     (output) REAL   
            On exit DIF is the reciprocal of a lower bound of the   
            reciprocal of the Dif-function, i.e. DIF is an upper bound of   
            Dif[(A,D), (B,E)] = sigma_min(Z), where Z as in (2).   
            IF IJOB = 0 or TRANS = 'T', DIF is not touched.   

    SCALE   (output) REAL   
            On exit SCALE is the scaling factor in (1) or (3).   
            If 0 < SCALE < 1, C and F hold the solutions R and L, resp.,   
            to a slightly perturbed system but the input matrices A, B, D   
            and E have not been changed. If SCALE = 0, C and F hold the   
            solutions R and L, respectively, to the homogeneous system   
            with C = F = 0. Normally, SCALE = 1.   

    WORK    (workspace/output) REAL array, dimension (MAX(1,LWORK))   
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.   

    LWORK   (input) INTEGER   
            The dimension of the array WORK. LWORK > = 1.   
            If IJOB = 1 or 2 and TRANS = 'N', LWORK >= max(1,2*M*N).   

            If LWORK = -1, then a workspace query is assumed; the routine   
            only calculates the optimal size of the WORK array, returns   
            this value as the first entry of the WORK array, and no error   
            message related to LWORK is issued by XERBLA.   

    IWORK   (workspace) INTEGER array, dimension (M+N+6)   

    INFO    (output) INTEGER   
              =0: successful exit   
              <0: If INFO = -i, the i-th argument had an illegal value.   
              >0: (A, D) and (B, E) have common or close eigenvalues.   

    Further Details   
    ===============   

    Based on contributions by   
       Bo Kagstrom and Peter Poromaa, Department of Computing Science,   
       Umea University, S-901 87 Umea, Sweden.   

    [1] B. Kagstrom and P. Poromaa, LAPACK-Style Algorithms and Software   
        for Solving the Generalized Sylvester Equation and Estimating the   
        Separation between Regular Matrix Pairs, Report UMINF - 93.23,   
        Department of Computing Science, Umea University, S-901 87 Umea,   
        Sweden, December 1993, Revised April 1994, Also as LAPACK Working   
        Note 75.  To appear in ACM Trans. on Math. Software, Vol 22,   
        No 1, 1996.   

    [2] B. Kagstrom, A Perturbation Analysis of the Generalized Sylvester   
        Equation (AR - LB, DR - LE ) = (C, F), SIAM J. Matrix Anal.   
        Appl., 15(4):1045-1060, 1994   

    [3] B. Kagstrom and L. Westin, Generalized Schur Methods with   
        Condition Estimators for Solving the Generalized Sylvester   
        Equation, IEEE Transactions on Automatic Control, Vol. 34, No. 7,   
        July 1989, pp 745-751.   

    =====================================================================   
    Replaced various illegal calls to SCOPY by calls to SLASET.   
    Sven Hammarling, 1/5/02.   


       Decode and test input parameters   

       Parameter adjustments */
    /* Table of constant values */
    static integer c__2 = 2;
    static integer c_n1 = -1;
    static integer c__5 = 5;
    static real c_b14 = 0.f;
    static integer c__1 = 1;
    static real c_b51 = -1.f;
    static real c_b52 = 1.f;
    
    /* System generated locals */
    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, d_dim1, 
	    d_offset, e_dim1, e_offset, f_dim1, f_offset, i__1, i__2, i__3, 
	    i__4;
    /* Builtin functions */
    double sqrt(doublereal);
    /* Local variables */
    static integer i__, j, k, p, q, ie, je, mb, nb, is, js, pq;
    static real dsum;
    static integer ppqq;
    extern logical lsame_(char *, char *);
    static integer ifunc;
    extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *);
    static integer linfo;
    extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, 
	    integer *, real *, real *, integer *, real *, integer *, real *, 
	    real *, integer *);
    static integer lwmin;
    static real scale2, dscale;
    extern /* Subroutine */ int stgsy2_(char *, integer *, integer *, integer 
	    *, real *, integer *, real *, integer *, real *, integer *, real *
	    , integer *, real *, integer *, real *, integer *, real *, real *,
	     real *, integer *, integer *, integer *);
    static real scaloc;
    extern /* Subroutine */ int xerbla_(char *, integer *);
    extern integer ilaenv_(integer *, char *, char *, integer *, integer *, 
	    integer *, integer *, ftnlen, ftnlen);
    extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, 
	    integer *, real *, integer *), slaset_(char *, integer *, 
	    integer *, real *, real *, real *, integer *);
    static integer iround;
    static logical notran;
    static integer isolve;
    static logical lquery;


    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    b_dim1 = *ldb;
    b_offset = 1 + b_dim1;
    b -= b_offset;
    c_dim1 = *ldc;
    c_offset = 1 + c_dim1;
    c__ -= c_offset;
    d_dim1 = *ldd;
    d_offset = 1 + d_dim1;
    d__ -= d_offset;
    e_dim1 = *lde;
    e_offset = 1 + e_dim1;
    e -= e_offset;
    f_dim1 = *ldf;
    f_offset = 1 + f_dim1;
    f -= f_offset;
    --work;
    --iwork;

    /* Function Body */
    *info = 0;
    notran = lsame_(trans, "N");
    lquery = *lwork == -1;

    if (! notran && ! lsame_(trans, "T")) {
	*info = -1;
    } else if (notran) {
	if (*ijob < 0 || *ijob > 4) {
	    *info = -2;
	}
    }
    if (*info == 0) {
	if (*m <= 0) {
	    *info = -3;
	} else if (*n <= 0) {
	    *info = -4;
	} else if (*lda < max(1,*m)) {
	    *info = -6;
	} else if (*ldb < max(1,*n)) {
	    *info = -8;
	} else if (*ldc < max(1,*m)) {
	    *info = -10;
	} else if (*ldd < max(1,*m)) {
	    *info = -12;
	} else if (*lde < max(1,*n)) {
	    *info = -14;
	} else if (*ldf < max(1,*m)) {
	    *info = -16;
	}
    }

    if (*info == 0) {
	if (notran) {
	    if (*ijob == 1 || *ijob == 2) {
/* Computing MAX */
		i__1 = 1, i__2 = (*m << 1) * *n;
		lwmin = max(i__1,i__2);
	    } else {
		lwmin = 1;
	    }
	} else {
	    lwmin = 1;
	}
	work[1] = (real) lwmin;

	if (*lwork < lwmin && ! lquery) {
	    *info = -20;
	}
    }

    if (*info != 0) {
	i__1 = -(*info);
	xerbla_("STGSYL", &i__1);
	return 0;
    } else if (lquery) {
	return 0;
    }

/*     Quick return if possible */

    if (*m == 0 || *n == 0) {
	*scale = 1.f;
	if (notran) {
	    if (*ijob != 0) {
		*dif = 0.f;
	    }
	}
	return 0;
    }

/*     Determine optimal block sizes MB and NB */

    mb = ilaenv_(&c__2, "STGSYL", trans, m, n, &c_n1, &c_n1, (ftnlen)6, (
	    ftnlen)1);
    nb = ilaenv_(&c__5, "STGSYL", trans, m, n, &c_n1, &c_n1, (ftnlen)6, (
	    ftnlen)1);

    isolve = 1;
    ifunc = 0;
    if (notran) {
	if (*ijob >= 3) {
	    ifunc = *ijob - 2;
	    slaset_("F", m, n, &c_b14, &c_b14, &c__[c_offset], ldc)
		    ;
	    slaset_("F", m, n, &c_b14, &c_b14, &f[f_offset], ldf);
	} else if (*ijob >= 1 && notran) {
	    isolve = 2;
	}
    }

    if (mb <= 1 && nb <= 1 || mb >= *m && nb >= *n) {

	i__1 = isolve;
	for (iround = 1; iround <= i__1; ++iround) {

/*           Use unblocked Level 2 solver */

	    dscale = 0.f;
	    dsum = 1.f;
	    pq = 0;
	    stgsy2_(trans, &ifunc, m, n, &a[a_offset], lda, &b[b_offset], ldb,
		     &c__[c_offset], ldc, &d__[d_offset], ldd, &e[e_offset], 
		    lde, &f[f_offset], ldf, scale, &dsum, &dscale, &iwork[1], 
		    &pq, info);
	    if (dscale != 0.f) {
		if (*ijob == 1 || *ijob == 3) {
		    *dif = sqrt((real) ((*m << 1) * *n)) / (dscale * sqrt(
			    dsum));
		} else {
		    *dif = sqrt((real) pq) / (dscale * sqrt(dsum));
		}
	    }

	    if (isolve == 2 && iround == 1) {
		if (notran) {
		    ifunc = *ijob;
		}
		scale2 = *scale;
		slacpy_("F", m, n, &c__[c_offset], ldc, &work[1], m);
		slacpy_("F", m, n, &f[f_offset], ldf, &work[*m * *n + 1], m);
		slaset_("F", m, n, &c_b14, &c_b14, &c__[c_offset], ldc);
		slaset_("F", m, n, &c_b14, &c_b14, &f[f_offset], ldf);
	    } else if (isolve == 2 && iround == 2) {
		slacpy_("F", m, n, &work[1], m, &c__[c_offset], ldc);
		slacpy_("F", m, n, &work[*m * *n + 1], m, &f[f_offset], ldf);
		*scale = scale2;
	    }
/* L30: */
	}

	return 0;
    }

/*     Determine block structure of A */

    p = 0;
    i__ = 1;
L40:
    if (i__ > *m) {
	goto L50;
    }
    ++p;
    iwork[p] = i__;
    i__ += mb;
    if (i__ >= *m) {
	goto L50;
    }
    if (a[i__ + (i__ - 1) * a_dim1] != 0.f) {
	++i__;
    }
    goto L40;
L50:

    iwork[p + 1] = *m + 1;
    if (iwork[p] == iwork[p + 1]) {
	--p;
    }

/*     Determine block structure of B */

    q = p + 1;
    j = 1;
L60:
    if (j > *n) {
	goto L70;
    }
    ++q;
    iwork[q] = j;
    j += nb;
    if (j >= *n) {
	goto L70;
    }
    if (b[j + (j - 1) * b_dim1] != 0.f) {
	++j;
    }
    goto L60;
L70:

    iwork[q + 1] = *n + 1;
    if (iwork[q] == iwork[q + 1]) {
	--q;
    }

    if (notran) {

	i__1 = isolve;
	for (iround = 1; iround <= i__1; ++iround) {

/*           Solve (I, J)-subsystem   
                 A(I, I) * R(I, J) - L(I, J) * B(J, J) = C(I, J)   
                 D(I, I) * R(I, J) - L(I, J) * E(J, J) = F(I, J)   
             for I = P, P - 1,..., 1; J = 1, 2,..., Q */

	    dscale = 0.f;
	    dsum = 1.f;
	    pq = 0;
	    *scale = 1.f;
	    i__2 = q;
	    for (j = p + 2; j <= i__2; ++j) {
		js = iwork[j];
		je = iwork[j + 1] - 1;
		nb = je - js + 1;
		for (i__ = p; i__ >= 1; --i__) {
		    is = iwork[i__];
		    ie = iwork[i__ + 1] - 1;
		    mb = ie - is + 1;
		    ppqq = 0;
		    stgsy2_(trans, &ifunc, &mb, &nb, &a[is + is * a_dim1], 
			    lda, &b[js + js * b_dim1], ldb, &c__[is + js * 
			    c_dim1], ldc, &d__[is + is * d_dim1], ldd, &e[js 
			    + js * e_dim1], lde, &f[is + js * f_dim1], ldf, &
			    scaloc, &dsum, &dscale, &iwork[q + 2], &ppqq, &
			    linfo);
		    if (linfo > 0) {
			*info = linfo;
		    }

		    pq += ppqq;
		    if (scaloc != 1.f) {
			i__3 = js - 1;
			for (k = 1; k <= i__3; ++k) {
			    sscal_(m, &scaloc, &c__[k * c_dim1 + 1], &c__1);
			    sscal_(m, &scaloc, &f[k * f_dim1 + 1], &c__1);
/* L80: */
			}
			i__3 = je;
			for (k = js; k <= i__3; ++k) {
			    i__4 = is - 1;
			    sscal_(&i__4, &scaloc, &c__[k * c_dim1 + 1], &
				    c__1);
			    i__4 = is - 1;
			    sscal_(&i__4, &scaloc, &f[k * f_dim1 + 1], &c__1);
/* L90: */
			}
			i__3 = je;
			for (k = js; k <= i__3; ++k) {
			    i__4 = *m - ie;
			    sscal_(&i__4, &scaloc, &c__[ie + 1 + k * c_dim1], 
				    &c__1);
			    i__4 = *m - ie;
			    sscal_(&i__4, &scaloc, &f[ie + 1 + k * f_dim1], &
				    c__1);
/* L100: */
			}
			i__3 = *n;
			for (k = je + 1; k <= i__3; ++k) {
			    sscal_(m, &scaloc, &c__[k * c_dim1 + 1], &c__1);
			    sscal_(m, &scaloc, &f[k * f_dim1 + 1], &c__1);
/* L110: */
			}
			*scale *= scaloc;
		    }

/*                 Substitute R(I, J) and L(I, J) into remaining   
                   equation. */

		    if (i__ > 1) {
			i__3 = is - 1;
			sgemm_("N", "N", &i__3, &nb, &mb, &c_b51, &a[is * 
				a_dim1 + 1], lda, &c__[is + js * c_dim1], ldc, 
				 &c_b52, &c__[js * c_dim1 + 1], ldc);
			i__3 = is - 1;
			sgemm_("N", "N", &i__3, &nb, &mb, &c_b51, &d__[is * 
				d_dim1 + 1], ldd, &c__[is + js * c_dim1], ldc, 
				 &c_b52, &f[js * f_dim1 + 1], ldf);
		    }
		    if (j < q) {
			i__3 = *n - je;
			sgemm_("N", "N", &mb, &i__3, &nb, &c_b52, &f[is + js *
				 f_dim1], ldf, &b[js + (je + 1) * b_dim1], 
				ldb, &c_b52, &c__[is + (je + 1) * c_dim1], 
				ldc);
			i__3 = *n - je;
			sgemm_("N", "N", &mb, &i__3, &nb, &c_b52, &f[is + js *
				 f_dim1], ldf, &e[js + (je + 1) * e_dim1], 
				lde, &c_b52, &f[is + (je + 1) * f_dim1], ldf);
		    }
/* L120: */
		}
/* L130: */
	    }
	    if (dscale != 0.f) {
		if (*ijob == 1 || *ijob == 3) {
		    *dif = sqrt((real) ((*m << 1) * *n)) / (dscale * sqrt(
			    dsum));
		} else {
		    *dif = sqrt((real) pq) / (dscale * sqrt(dsum));
		}
	    }
	    if (isolve == 2 && iround == 1) {
		if (notran) {
		    ifunc = *ijob;
		}
		scale2 = *scale;
		slacpy_("F", m, n, &c__[c_offset], ldc, &work[1], m);
		slacpy_("F", m, n, &f[f_offset], ldf, &work[*m * *n + 1], m);
		slaset_("F", m, n, &c_b14, &c_b14, &c__[c_offset], ldc);
		slaset_("F", m, n, &c_b14, &c_b14, &f[f_offset], ldf);
	    } else if (isolve == 2 && iround == 2) {
		slacpy_("F", m, n, &work[1], m, &c__[c_offset], ldc);
		slacpy_("F", m, n, &work[*m * *n + 1], m, &f[f_offset], ldf);
		*scale = scale2;
	    }
/* L150: */
	}

    } else {

/*        Solve transposed (I, J)-subsystem   
               A(I, I)' * R(I, J)  + D(I, I)' * L(I, J)  =  C(I, J)   
               R(I, J)  * B(J, J)' + L(I, J)  * E(J, J)' = -F(I, J)   
          for I = 1,2,..., P; J = Q, Q-1,..., 1 */

	*scale = 1.f;
	i__1 = p;
	for (i__ = 1; i__ <= i__1; ++i__) {
	    is = iwork[i__];
	    ie = iwork[i__ + 1] - 1;
	    mb = ie - is + 1;
	    i__2 = p + 2;
	    for (j = q; j >= i__2; --j) {
		js = iwork[j];
		je = iwork[j + 1] - 1;
		nb = je - js + 1;
		stgsy2_(trans, &ifunc, &mb, &nb, &a[is + is * a_dim1], lda, &
			b[js + js * b_dim1], ldb, &c__[is + js * c_dim1], ldc,
			 &d__[is + is * d_dim1], ldd, &e[js + js * e_dim1], 
			lde, &f[is + js * f_dim1], ldf, &scaloc, &dsum, &
			dscale, &iwork[q + 2], &ppqq, &linfo);
		if (linfo > 0) {
		    *info = linfo;
		}
		if (scaloc != 1.f) {
		    i__3 = js - 1;
		    for (k = 1; k <= i__3; ++k) {
			sscal_(m, &scaloc, &c__[k * c_dim1 + 1], &c__1);
			sscal_(m, &scaloc, &f[k * f_dim1 + 1], &c__1);
/* L160: */
		    }
		    i__3 = je;
		    for (k = js; k <= i__3; ++k) {
			i__4 = is - 1;
			sscal_(&i__4, &scaloc, &c__[k * c_dim1 + 1], &c__1);
			i__4 = is - 1;
			sscal_(&i__4, &scaloc, &f[k * f_dim1 + 1], &c__1);
/* L170: */
		    }
		    i__3 = je;
		    for (k = js; k <= i__3; ++k) {
			i__4 = *m - ie;
			sscal_(&i__4, &scaloc, &c__[ie + 1 + k * c_dim1], &
				c__1);
			i__4 = *m - ie;
			sscal_(&i__4, &scaloc, &f[ie + 1 + k * f_dim1], &c__1)
				;
/* L180: */
		    }
		    i__3 = *n;
		    for (k = je + 1; k <= i__3; ++k) {
			sscal_(m, &scaloc, &c__[k * c_dim1 + 1], &c__1);
			sscal_(m, &scaloc, &f[k * f_dim1 + 1], &c__1);
/* L190: */
		    }
		    *scale *= scaloc;
		}

/*              Substitute R(I, J) and L(I, J) into remaining equation. */

		if (j > p + 2) {
		    i__3 = js - 1;
		    sgemm_("N", "T", &mb, &i__3, &nb, &c_b52, &c__[is + js * 
			    c_dim1], ldc, &b[js * b_dim1 + 1], ldb, &c_b52, &
			    f[is + f_dim1], ldf);
		    i__3 = js - 1;
		    sgemm_("N", "T", &mb, &i__3, &nb, &c_b52, &f[is + js * 
			    f_dim1], ldf, &e[js * e_dim1 + 1], lde, &c_b52, &
			    f[is + f_dim1], ldf);
		}
		if (i__ < p) {
		    i__3 = *m - ie;
		    sgemm_("T", "N", &i__3, &nb, &mb, &c_b51, &a[is + (ie + 1)
			     * a_dim1], lda, &c__[is + js * c_dim1], ldc, &
			    c_b52, &c__[ie + 1 + js * c_dim1], ldc);
		    i__3 = *m - ie;
		    sgemm_("T", "N", &i__3, &nb, &mb, &c_b51, &d__[is + (ie + 
			    1) * d_dim1], ldd, &f[is + js * f_dim1], ldf, &
			    c_b52, &c__[ie + 1 + js * c_dim1], ldc);
		}
/* L200: */
	    }
/* L210: */
	}

    }

    work[1] = (real) lwmin;

    return 0;

/*     End of STGSYL */

} /* stgsyl_ */