#include "blaswrap.h"
/* slasd2.f -- translated by f2c (version 20061008).
   You must link the resulting object file with libf2c:
	on Microsoft Windows system, link with libf2c.lib;
	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
	or, if you install libf2c.a in a standard place, with -lf2c -lm
	-- in that order, at the end of the command line, as in
		cc *.o -lf2c -lm
	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

		http://www.netlib.org/f2c/libf2c.zip
*/

#include "f2c.h"

/* Table of constant values */

static integer c__1 = 1;
static real c_b30 = 0.f;

/* Subroutine */ int slasd2_(integer *nl, integer *nr, integer *sqre, integer 
	*k, real *d__, real *z__, real *alpha, real *beta, real *u, integer *
	ldu, real *vt, integer *ldvt, real *dsigma, real *u2, integer *ldu2, 
	real *vt2, integer *ldvt2, integer *idxp, integer *idx, integer *idxc,
	 integer *idxq, integer *coltyp, integer *info)
{
    /* System generated locals */
    integer u_dim1, u_offset, u2_dim1, u2_offset, vt_dim1, vt_offset, 
	    vt2_dim1, vt2_offset, i__1;
    real r__1, r__2;

    /* Local variables */
    static real c__;
    static integer i__, j, m, n;
    static real s;
    static integer k2;
    static real z1;
    static integer ct, jp;
    static real eps, tau, tol;
    static integer psm[4], nlp1, nlp2, idxi, idxj, ctot[4];
    extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, 
	    integer *, real *, real *);
    static integer idxjp, jprev;
    extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, 
	    integer *);
    extern doublereal slapy2_(real *, real *), slamch_(char *);
    extern /* Subroutine */ int xerbla_(char *, integer *), slamrg_(
	    integer *, integer *, real *, integer *, integer *, integer *);
    static real hlftol;
    extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, 
	    integer *, real *, integer *), slaset_(char *, integer *, 
	    integer *, real *, real *, real *, integer *);


/*  -- LAPACK auxiliary routine (version 3.1) --   
       Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..   
       November 2006   


    Purpose   
    =======   

    SLASD2 merges the two sets of singular values together into a single   
    sorted set.  Then it tries to deflate the size of the problem.   
    There are two ways in which deflation can occur:  when two or more   
    singular values are close together or if there is a tiny entry in the   
    Z vector.  For each such occurrence the order of the related secular   
    equation problem is reduced by one.   

    SLASD2 is called from SLASD1.   

    Arguments   
    =========   

    NL     (input) INTEGER   
           The row dimension of the upper block.  NL >= 1.   

    NR     (input) INTEGER   
           The row dimension of the lower block.  NR >= 1.   

    SQRE   (input) INTEGER   
           = 0: the lower block is an NR-by-NR square matrix.   
           = 1: the lower block is an NR-by-(NR+1) rectangular matrix.   

           The bidiagonal matrix has N = NL + NR + 1 rows and   
           M = N + SQRE >= N columns.   

    K      (output) INTEGER   
           Contains the dimension of the non-deflated matrix,   
           This is the order of the related secular equation. 1 <= K <=N.   

    D      (input/output) REAL array, dimension (N)   
           On entry D contains the singular values of the two submatrices   
           to be combined.  On exit D contains the trailing (N-K) updated   
           singular values (those which were deflated) sorted into   
           increasing order.   

    Z      (output) REAL array, dimension (N)   
           On exit Z contains the updating row vector in the secular   
           equation.   

    ALPHA  (input) REAL   
           Contains the diagonal element associated with the added row.   

    BETA   (input) REAL   
           Contains the off-diagonal element associated with the added   
           row.   

    U      (input/output) REAL array, dimension (LDU,N)   
           On entry U contains the left singular vectors of two   
           submatrices in the two square blocks with corners at (1,1),   
           (NL, NL), and (NL+2, NL+2), (N,N).   
           On exit U contains the trailing (N-K) updated left singular   
           vectors (those which were deflated) in its last N-K columns.   

    LDU    (input) INTEGER   
           The leading dimension of the array U.  LDU >= N.   

    VT     (input/output) REAL array, dimension (LDVT,M)   
           On entry VT' contains the right singular vectors of two   
           submatrices in the two square blocks with corners at (1,1),   
           (NL+1, NL+1), and (NL+2, NL+2), (M,M).   
           On exit VT' contains the trailing (N-K) updated right singular   
           vectors (those which were deflated) in its last N-K columns.   
           In case SQRE =1, the last row of VT spans the right null   
           space.   

    LDVT   (input) INTEGER   
           The leading dimension of the array VT.  LDVT >= M.   

    DSIGMA (output) REAL array, dimension (N)   
           Contains a copy of the diagonal elements (K-1 singular values   
           and one zero) in the secular equation.   

    U2     (output) REAL array, dimension (LDU2,N)   
           Contains a copy of the first K-1 left singular vectors which   
           will be used by SLASD3 in a matrix multiply (SGEMM) to solve   
           for the new left singular vectors. U2 is arranged into four   
           blocks. The first block contains a column with 1 at NL+1 and   
           zero everywhere else; the second block contains non-zero   
           entries only at and above NL; the third contains non-zero   
           entries only below NL+1; and the fourth is dense.   

    LDU2   (input) INTEGER   
           The leading dimension of the array U2.  LDU2 >= N.   

    VT2    (output) REAL array, dimension (LDVT2,N)   
           VT2' contains a copy of the first K right singular vectors   
           which will be used by SLASD3 in a matrix multiply (SGEMM) to   
           solve for the new right singular vectors. VT2 is arranged into   
           three blocks. The first block contains a row that corresponds   
           to the special 0 diagonal element in SIGMA; the second block   
           contains non-zeros only at and before NL +1; the third block   
           contains non-zeros only at and after  NL +2.   

    LDVT2  (input) INTEGER   
           The leading dimension of the array VT2.  LDVT2 >= M.   

    IDXP   (workspace) INTEGER array, dimension (N)   
           This will contain the permutation used to place deflated   
           values of D at the end of the array. On output IDXP(2:K)   
           points to the nondeflated D-values and IDXP(K+1:N)   
           points to the deflated singular values.   

    IDX    (workspace) INTEGER array, dimension (N)   
           This will contain the permutation used to sort the contents of   
           D into ascending order.   

    IDXC   (output) INTEGER array, dimension (N)   
           This will contain the permutation used to arrange the columns   
           of the deflated U matrix into three groups:  the first group   
           contains non-zero entries only at and above NL, the second   
           contains non-zero entries only below NL+2, and the third is   
           dense.   

    IDXQ   (input/output) INTEGER array, dimension (N)   
           This contains the permutation which separately sorts the two   
           sub-problems in D into ascending order.  Note that entries in   
           the first hlaf of this permutation must first be moved one   
           position backward; and entries in the second half   
           must first have NL+1 added to their values.   

    COLTYP (workspace/output) INTEGER array, dimension (N)   
           As workspace, this will contain a label which will indicate   
           which of the following types a column in the U2 matrix or a   
           row in the VT2 matrix is:   
           1 : non-zero in the upper half only   
           2 : non-zero in the lower half only   
           3 : dense   
           4 : deflated   

           On exit, it is an array of dimension 4, with COLTYP(I) being   
           the dimension of the I-th type columns.   

    INFO   (output) INTEGER   
            = 0:  successful exit.   
            < 0:  if INFO = -i, the i-th argument had an illegal value.   

    Further Details   
    ===============   

    Based on contributions by   
       Ming Gu and Huan Ren, Computer Science Division, University of   
       California at Berkeley, USA   

    =====================================================================   


       Test the input parameters.   

       Parameter adjustments */
    --d__;
    --z__;
    u_dim1 = *ldu;
    u_offset = 1 + u_dim1;
    u -= u_offset;
    vt_dim1 = *ldvt;
    vt_offset = 1 + vt_dim1;
    vt -= vt_offset;
    --dsigma;
    u2_dim1 = *ldu2;
    u2_offset = 1 + u2_dim1;
    u2 -= u2_offset;
    vt2_dim1 = *ldvt2;
    vt2_offset = 1 + vt2_dim1;
    vt2 -= vt2_offset;
    --idxp;
    --idx;
    --idxc;
    --idxq;
    --coltyp;

    /* Function Body */
    *info = 0;

    if (*nl < 1) {
	*info = -1;
    } else if (*nr < 1) {
	*info = -2;
    } else if (*sqre != 1 && *sqre != 0) {
	*info = -3;
    }

    n = *nl + *nr + 1;
    m = n + *sqre;

    if (*ldu < n) {
	*info = -10;
    } else if (*ldvt < m) {
	*info = -12;
    } else if (*ldu2 < n) {
	*info = -15;
    } else if (*ldvt2 < m) {
	*info = -17;
    }
    if (*info != 0) {
	i__1 = -(*info);
	xerbla_("SLASD2", &i__1);
	return 0;
    }

    nlp1 = *nl + 1;
    nlp2 = *nl + 2;

/*     Generate the first part of the vector Z; and move the singular   
       values in the first part of D one position backward. */

    z1 = *alpha * vt[nlp1 + nlp1 * vt_dim1];
    z__[1] = z1;
    for (i__ = *nl; i__ >= 1; --i__) {
	z__[i__ + 1] = *alpha * vt[i__ + nlp1 * vt_dim1];
	d__[i__ + 1] = d__[i__];
	idxq[i__ + 1] = idxq[i__] + 1;
/* L10: */
    }

/*     Generate the second part of the vector Z. */

    i__1 = m;
    for (i__ = nlp2; i__ <= i__1; ++i__) {
	z__[i__] = *beta * vt[i__ + nlp2 * vt_dim1];
/* L20: */
    }

/*     Initialize some reference arrays. */

    i__1 = nlp1;
    for (i__ = 2; i__ <= i__1; ++i__) {
	coltyp[i__] = 1;
/* L30: */
    }
    i__1 = n;
    for (i__ = nlp2; i__ <= i__1; ++i__) {
	coltyp[i__] = 2;
/* L40: */
    }

/*     Sort the singular values into increasing order */

    i__1 = n;
    for (i__ = nlp2; i__ <= i__1; ++i__) {
	idxq[i__] += nlp1;
/* L50: */
    }

/*     DSIGMA, IDXC, IDXC, and the first column of U2   
       are used as storage space. */

    i__1 = n;
    for (i__ = 2; i__ <= i__1; ++i__) {
	dsigma[i__] = d__[idxq[i__]];
	u2[i__ + u2_dim1] = z__[idxq[i__]];
	idxc[i__] = coltyp[idxq[i__]];
/* L60: */
    }

    slamrg_(nl, nr, &dsigma[2], &c__1, &c__1, &idx[2]);

    i__1 = n;
    for (i__ = 2; i__ <= i__1; ++i__) {
	idxi = idx[i__] + 1;
	d__[i__] = dsigma[idxi];
	z__[i__] = u2[idxi + u2_dim1];
	coltyp[i__] = idxc[idxi];
/* L70: */
    }

/*     Calculate the allowable deflation tolerance */

    eps = slamch_("Epsilon");
/* Computing MAX */
    r__1 = dabs(*alpha), r__2 = dabs(*beta);
    tol = dmax(r__1,r__2);
/* Computing MAX */
    r__2 = (r__1 = d__[n], dabs(r__1));
    tol = eps * 8.f * dmax(r__2,tol);

/*     There are 2 kinds of deflation -- first a value in the z-vector   
       is small, second two (or more) singular values are very close   
       together (their difference is small).   

       If the value in the z-vector is small, we simply permute the   
       array so that the corresponding singular value is moved to the   
       end.   

       If two values in the D-vector are close, we perform a two-sided   
       rotation designed to make one of the corresponding z-vector   
       entries zero, and then permute the array so that the deflated   
       singular value is moved to the end.   

       If there are multiple singular values then the problem deflates.   
       Here the number of equal singular values are found.  As each equal   
       singular value is found, an elementary reflector is computed to   
       rotate the corresponding singular subspace so that the   
       corresponding components of Z are zero in this new basis. */

    *k = 1;
    k2 = n + 1;
    i__1 = n;
    for (j = 2; j <= i__1; ++j) {
	if ((r__1 = z__[j], dabs(r__1)) <= tol) {

/*           Deflate due to small z component. */

	    --k2;
	    idxp[k2] = j;
	    coltyp[j] = 4;
	    if (j == n) {
		goto L120;
	    }
	} else {
	    jprev = j;
	    goto L90;
	}
/* L80: */
    }
L90:
    j = jprev;
L100:
    ++j;
    if (j > n) {
	goto L110;
    }
    if ((r__1 = z__[j], dabs(r__1)) <= tol) {

/*        Deflate due to small z component. */

	--k2;
	idxp[k2] = j;
	coltyp[j] = 4;
    } else {

/*        Check if singular values are close enough to allow deflation. */

	if ((r__1 = d__[j] - d__[jprev], dabs(r__1)) <= tol) {

/*           Deflation is possible. */

	    s = z__[jprev];
	    c__ = z__[j];

/*           Find sqrt(a**2+b**2) without overflow or   
             destructive underflow. */

	    tau = slapy2_(&c__, &s);
	    c__ /= tau;
	    s = -s / tau;
	    z__[j] = tau;
	    z__[jprev] = 0.f;

/*           Apply back the Givens rotation to the left and right   
             singular vector matrices. */

	    idxjp = idxq[idx[jprev] + 1];
	    idxj = idxq[idx[j] + 1];
	    if (idxjp <= nlp1) {
		--idxjp;
	    }
	    if (idxj <= nlp1) {
		--idxj;
	    }
	    srot_(&n, &u[idxjp * u_dim1 + 1], &c__1, &u[idxj * u_dim1 + 1], &
		    c__1, &c__, &s);
	    srot_(&m, &vt[idxjp + vt_dim1], ldvt, &vt[idxj + vt_dim1], ldvt, &
		    c__, &s);
	    if (coltyp[j] != coltyp[jprev]) {
		coltyp[j] = 3;
	    }
	    coltyp[jprev] = 4;
	    --k2;
	    idxp[k2] = jprev;
	    jprev = j;
	} else {
	    ++(*k);
	    u2[*k + u2_dim1] = z__[jprev];
	    dsigma[*k] = d__[jprev];
	    idxp[*k] = jprev;
	    jprev = j;
	}
    }
    goto L100;
L110:

/*     Record the last singular value. */

    ++(*k);
    u2[*k + u2_dim1] = z__[jprev];
    dsigma[*k] = d__[jprev];
    idxp[*k] = jprev;

L120:

/*     Count up the total number of the various types of columns, then   
       form a permutation which positions the four column types into   
       four groups of uniform structure (although one or more of these   
       groups may be empty). */

    for (j = 1; j <= 4; ++j) {
	ctot[j - 1] = 0;
/* L130: */
    }
    i__1 = n;
    for (j = 2; j <= i__1; ++j) {
	ct = coltyp[j];
	++ctot[ct - 1];
/* L140: */
    }

/*     PSM(*) = Position in SubMatrix (of types 1 through 4) */

    psm[0] = 2;
    psm[1] = ctot[0] + 2;
    psm[2] = psm[1] + ctot[1];
    psm[3] = psm[2] + ctot[2];

/*     Fill out the IDXC array so that the permutation which it induces   
       will place all type-1 columns first, all type-2 columns next,   
       then all type-3's, and finally all type-4's, starting from the   
       second column. This applies similarly to the rows of VT. */

    i__1 = n;
    for (j = 2; j <= i__1; ++j) {
	jp = idxp[j];
	ct = coltyp[jp];
	idxc[psm[ct - 1]] = j;
	++psm[ct - 1];
/* L150: */
    }

/*     Sort the singular values and corresponding singular vectors into   
       DSIGMA, U2, and VT2 respectively.  The singular values/vectors   
       which were not deflated go into the first K slots of DSIGMA, U2,   
       and VT2 respectively, while those which were deflated go into the   
       last N - K slots, except that the first column/row will be treated   
       separately. */

    i__1 = n;
    for (j = 2; j <= i__1; ++j) {
	jp = idxp[j];
	dsigma[j] = d__[jp];
	idxj = idxq[idx[idxp[idxc[j]]] + 1];
	if (idxj <= nlp1) {
	    --idxj;
	}
	scopy_(&n, &u[idxj * u_dim1 + 1], &c__1, &u2[j * u2_dim1 + 1], &c__1);
	scopy_(&m, &vt[idxj + vt_dim1], ldvt, &vt2[j + vt2_dim1], ldvt2);
/* L160: */
    }

/*     Determine DSIGMA(1), DSIGMA(2) and Z(1) */

    dsigma[1] = 0.f;
    hlftol = tol / 2.f;
    if (dabs(dsigma[2]) <= hlftol) {
	dsigma[2] = hlftol;
    }
    if (m > n) {
	z__[1] = slapy2_(&z1, &z__[m]);
	if (z__[1] <= tol) {
	    c__ = 1.f;
	    s = 0.f;
	    z__[1] = tol;
	} else {
	    c__ = z1 / z__[1];
	    s = z__[m] / z__[1];
	}
    } else {
	if (dabs(z1) <= tol) {
	    z__[1] = tol;
	} else {
	    z__[1] = z1;
	}
    }

/*     Move the rest of the updating row to Z. */

    i__1 = *k - 1;
    scopy_(&i__1, &u2[u2_dim1 + 2], &c__1, &z__[2], &c__1);

/*     Determine the first column of U2, the first row of VT2 and the   
       last row of VT. */

    slaset_("A", &n, &c__1, &c_b30, &c_b30, &u2[u2_offset], ldu2);
    u2[nlp1 + u2_dim1] = 1.f;
    if (m > n) {
	i__1 = nlp1;
	for (i__ = 1; i__ <= i__1; ++i__) {
	    vt[m + i__ * vt_dim1] = -s * vt[nlp1 + i__ * vt_dim1];
	    vt2[i__ * vt2_dim1 + 1] = c__ * vt[nlp1 + i__ * vt_dim1];
/* L170: */
	}
	i__1 = m;
	for (i__ = nlp2; i__ <= i__1; ++i__) {
	    vt2[i__ * vt2_dim1 + 1] = s * vt[m + i__ * vt_dim1];
	    vt[m + i__ * vt_dim1] = c__ * vt[m + i__ * vt_dim1];
/* L180: */
	}
    } else {
	scopy_(&m, &vt[nlp1 + vt_dim1], ldvt, &vt2[vt2_dim1 + 1], ldvt2);
    }
    if (m > n) {
	scopy_(&m, &vt[m + vt_dim1], ldvt, &vt2[m + vt2_dim1], ldvt2);
    }

/*     The deflated singular values and their corresponding vectors go   
       into the back of D, U, and V respectively. */

    if (n > *k) {
	i__1 = n - *k;
	scopy_(&i__1, &dsigma[*k + 1], &c__1, &d__[*k + 1], &c__1);
	i__1 = n - *k;
	slacpy_("A", &n, &i__1, &u2[(*k + 1) * u2_dim1 + 1], ldu2, &u[(*k + 1)
		 * u_dim1 + 1], ldu);
	i__1 = n - *k;
	slacpy_("A", &i__1, &m, &vt2[*k + 1 + vt2_dim1], ldvt2, &vt[*k + 1 + 
		vt_dim1], ldvt);
    }

/*     Copy CTOT into COLTYP for referencing in SLASD3. */

    for (j = 1; j <= 4; ++j) {
	coltyp[j] = ctot[j - 1];
/* L190: */
    }

    return 0;

/*     End of SLASD2 */

} /* slasd2_ */