◆ clatrs3()

subroutine clatrs3	(	character	uplo,
		character	trans,
		character	diag,
		character	normin,
		integer	n,
		integer	nrhs,
		complex, dimension( lda, * )	a,
		integer	lda,
		complex, dimension( ldx, * )	x,
		integer	ldx,
		real, dimension( * )	scale,
		real, dimension( * )	cnorm,
		real, dimension( * )	work,
		integer	lwork,
		integer	info )

CLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow.

Purpose:

!>
!> CLATRS3 solves one of the triangular systems
!>
!>    A * X = B * diag(scale),  A**T * X = B * diag(scale), or
!>    A**H * X = B * diag(scale)
!>
!> with scaling to prevent overflow.  Here A is an upper or lower
!> triangular matrix, A**T denotes the transpose of A, A**H denotes the
!> conjugate transpose of A. X and B are n-by-nrhs matrices and scale
!> is an nrhs-element vector of scaling factors. A scaling factor scale(j)
!> is usually less than or equal to 1, chosen such that X(:,j) is less
!> than the overflow threshold. If the matrix A is singular (A(j,j) = 0
!> for some j), then a non-trivial solution to A*X = 0 is returned. If
!> the system is so badly scaled that the solution cannot be represented
!> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned.
!>
!> This is a BLAS-3 version of LATRS for solving several right
!> hand sides simultaneously.
!>
!>

Parameters

[in]	UPLO	!> UPLO is CHARACTER*1 !> Specifies whether the matrix A is upper or lower triangular. !> = 'U': Upper triangular !> = 'L': Lower triangular !>
[in]	TRANS	!> TRANS is CHARACTER*1 !> Specifies the operation applied to A. !> = 'N': Solve A * x = s*b (No transpose) !> = 'T': Solve A**T * x = s*b (Transpose) !> = 'C': Solve A**H * x = s*b (Conjugate transpose) !>
[in]	DIAG	!> DIAG is CHARACTER*1 !> Specifies whether or not the matrix A is unit triangular. !> = 'N': Non-unit triangular !> = 'U': Unit triangular !>
[in]	NORMIN	!> NORMIN is CHARACTER*1 !> Specifies whether CNORM has been set or not. !> = 'Y': CNORM contains the column norms on entry !> = 'N': CNORM is not set on entry. On exit, the norms will !> be computed and stored in CNORM. !>
[in]	N	!> N is INTEGER !> The order of the matrix A. N >= 0. !>
[in]	NRHS	!> NRHS is INTEGER !> The number of columns of X. NRHS >= 0. !>
[in]	A	!> A is COMPLEX array, dimension (LDA,N) !> The triangular matrix A. If UPLO = 'U', the leading n by n !> upper triangular part of the array A contains the upper !> triangular matrix, and the strictly lower triangular part of !> A is not referenced. If UPLO = 'L', the leading n by n lower !> triangular part of the array A contains the lower triangular !> matrix, and the strictly upper triangular part of A is not !> referenced. If DIAG = 'U', the diagonal elements of A are !> also not referenced and are assumed to be 1. !>
[in]	LDA	!> LDA is INTEGER !> The leading dimension of the array A. LDA >= max (1,N). !>
[in,out]	X	!> X is COMPLEX array, dimension (LDX,NRHS) !> On entry, the right hand side B of the triangular system. !> On exit, X is overwritten by the solution matrix X. !>
[in]	LDX	!> LDX is INTEGER !> The leading dimension of the array X. LDX >= max (1,N). !>
[out]	SCALE	!> SCALE is REAL array, dimension (NRHS) !> The scaling factor s(k) is for the triangular system !> A * x(:,k) = s(k)*b(:,k) or A**T * x(:,k) = s(k)*b(:,k). !> If SCALE = 0, the matrix A is singular or badly scaled. !> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) !> that is an exact or approximate solution to A*x(:,k) = 0 !> is returned. If the system is so badly scaled that the solution !> cannot be represented as x(:,k) * 1/s(k), then x(:,k) = 0 !> is returned. !>
[in,out]	CNORM	!> CNORM is REAL array, dimension (N) !> !> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) !> contains the norm of the off-diagonal part of the j-th column !> of A. If TRANS = 'N', CNORM(j) must be greater than or equal !> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) !> must be greater than or equal to the 1-norm. !> !> If NORMIN = 'N', CNORM is an output argument and CNORM(j) !> returns the 1-norm of the offdiagonal part of the j-th column !> of A. !>
[out]	WORK	!> WORK is REAL array, dimension (MAX(1,LWORK)). !> On exit, if INFO = 0, WORK(1) returns the optimal size of !> WORK. !>
[in]	LWORK	!> LWORK is INTEGER !> The dimension of the array WORK. !> !> If MIN(N,NRHS) = 0, LWORK >= 1, else !> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32))), where !> NBA = (N + NB - 1)/NB and NB is the optimal block size. !> !> If LWORK = -1, then a workspace query is assumed; the routine !> only calculates the optimal dimensions of the WORK array, returns !> this value as the first entry of the WORK array, and no error !> message related to LWORK is issued by XERBLA. !>
[out]	INFO	!> INFO is INTEGER !> = 0: successful exit !> < 0: if INFO = -k, the k-th argument had an illegal value !>

Author: Univ. of Tennessee; Univ. of California Berkeley; Univ. of Colorado Denver; NAG Ltd.

Further Details:

Definition at line 233 of file clatrs3.f.

      IMPLICIT NONE
*
*     Blocked, overflow-safe triangular solve for NRHS right-hand
*     sides.  A is split into NBA x NBA blocks of order at most NB and
*     X is processed in block columns of at most NBRHS columns.
*     Diagonal blocks are solved with CLATRS; off-diagonal updates are
*     executed with CGEMM after the affected column segments have been
*     rescaled with the factor returned by SLARMM.  Every column
*     segment carries a local scale factor in WORK; these are reduced
*     to SCALE at the end of each block column.
*
*     .. Scalar Arguments ..
      CHARACTER          DIAG, TRANS, NORMIN, UPLO
      INTEGER            INFO, LDA, LWORK, LDX, N, NRHS
*     ..
*     .. Array Arguments ..
      COMPLEX            A( LDA, * ), X( LDX, * )
      REAL               CNORM( * ), SCALE( * ), WORK( * )
*     ..
*
*  =====================================================================
*
*     .. Parameters ..
      REAL               ZERO, ONE
      parameter( zero = 0.0e+0, one = 1.0e+0 )
      COMPLEX            CZERO, CONE
      parameter( czero = ( 0.0e+0, 0.0e+0 ) )
      parameter( cone = ( 1.0e+0, 0.0e+0 ) )
*     NRHSMIN: below this many right-hand sides the unblocked CLATRS
*              path is used.
*     NBRHS:   width of a block column of X (also the size of XNRM).
*     NBMIN, NBMAX: bounds on the block size NB (NBMAX also sizes W).
      INTEGER            NBMAX, NBMIN, NBRHS, NRHSMIN
      parameter( nrhsmin = 2, nbrhs = 32 )
      parameter( nbmin = 8, nbmax = 64 )
*     ..
*     .. Local Arrays ..
      REAL               W( NBMAX ), XNRM( NBRHS )
*     ..
*     .. Local Scalars ..
      LOGICAL            LQUERY, NOTRAN, NOUNIT, UPPER
      INTEGER            AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J,
     $                   JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2,
     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN
      REAL               ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC,
     $                   SCAMIN, SMLNUM, TMAX
*     ..
*     .. External Functions ..
      LOGICAL            LSAME
      INTEGER            ILAENV
      REAL               SLAMCH, CLANGE, SLARMM,
     $                   SROUNDUP_LWORK
      EXTERNAL           ilaenv, lsame, slamch,
     $                   clange, slarmm, sroundup_lwork
*     ..
*     .. External Subroutines ..
      EXTERNAL           cgemm, clatrs, csscal, xerbla
*     ..
*     .. Intrinsic Functions ..
      INTRINSIC          abs, max, min
*     ..
*     .. Executable Statements ..
*
      info = 0
      upper = lsame( uplo, 'U' )
      notran = lsame( trans, 'N' )
      nounit = lsame( diag, 'N' )
      lquery = ( lwork.EQ.-1 )
*
*     Partition A and X into blocks.
*     NB is clamped to [NBMIN, NBMAX]; NBA and NBX are the number of
*     block rows of A and block columns of X (at least 1 each).
*
      nb = max( nbmin, ilaenv( 1, 'CLATRS', '', n, n, -1, -1 ) )
      nb = min( nbmax, nb )
      nba = max( 1, (n + nb - 1) / nb )
      nbx = max( 1, (nrhs + nbrhs - 1) / nbrhs )
*
*     Compute the workspace
*
*     The workspace comprises two parts.
*     The first part stores the local scale factors. Each simultaneously
*     computed right-hand side requires one local scale factor per block
*     row. WORK( I + (KK-1) * LDS ) is the scale factor of the vector
*     segment associated with the I-th block row and the KK-th vector
*     in the block column. With KK = 1, ..., MIN( NRHS, NBRHS ) these
*     entries occupy WORK( 1 : NBA*MIN( NRHS, NBRHS ) ), which lies
*     inside the first LSCALE entries and therefore never overlaps the
*     second part.
*
      lscale = nba * max( nba, min( nrhs, nbrhs ) )
      lds = nba
*
*     The second part stores upper bounds of the triangular A. There are
*     a total of NBA x NBA blocks, of which only the upper triangular
*     part or the lower triangular part is referenced. The upper bound of
*     the block A( I, J ) is stored as WORK( AWRK + I + (J-1) * NBA ).
*
      lanrm = nba * nba
      awrk = lscale
*
      IF( min( n, nrhs ).EQ.0 ) THEN
         lwmin = 1
      ELSE
         lwmin = lscale + lanrm
      END IF
      work( 1 ) = sroundup_lwork( lwmin )
*
*     Test the input parameters.
*
      IF( .NOT.upper .AND. .NOT.lsame( uplo, 'L' ) ) THEN
         info = -1
      ELSE IF( .NOT.notran .AND. .NOT.lsame( trans, 'T' ) .AND. .NOT.
     $         lsame( trans, 'C' ) ) THEN
         info = -2
      ELSE IF( .NOT.nounit .AND. .NOT.lsame( diag, 'U' ) ) THEN
         info = -3
      ELSE IF( .NOT.lsame( normin, 'Y' ) .AND. .NOT.
     $         lsame( normin, 'N' ) ) THEN
         info = -4
      ELSE IF( n.LT.0 ) THEN
         info = -5
      ELSE IF( nrhs.LT.0 ) THEN
         info = -6
      ELSE IF( lda.LT.max( 1, n ) ) THEN
         info = -8
      ELSE IF( ldx.LT.max( 1, n ) ) THEN
         info = -10
      ELSE IF( .NOT.lquery .AND. lwork.LT.lwmin ) THEN
         info = -14
      END IF
      IF( info.NE.0 ) THEN
         CALL xerbla( 'CLATRS3', -info )
         RETURN
      ELSE IF( lquery ) THEN
         RETURN
      END IF
*
*     Initialize scaling factors
*
      DO kk = 1, nrhs
         scale( kk ) = one
      END DO
*
*     Quick return if possible
*
      IF( min( n, nrhs ).EQ.0 )
     $   RETURN
*
*     Determine machine dependent constant to control overflow.
*
      bignum = slamch( 'Overflow' )
      smlnum = slamch( 'Safe Minimum' )
*
*     Use unblocked code for small problems
*
      IF( nrhs.LT.nrhsmin ) THEN
         CALL clatrs( uplo, trans, diag, normin, n, a, lda, x( 1,
     $                1 ),
     $                scale( 1 ), cnorm, info )
*        CNORM has been set by the first call (or by the caller), so
*        the remaining columns reuse it.
         DO k = 2, nrhs
            CALL clatrs( uplo, trans, diag, 'Y', n, a, lda, x( 1,
     $                   k ),
     $                   scale( k ), cnorm, info )
         END DO
         RETURN
      END IF
*
*     Compute norms of blocks of A excluding diagonal blocks and find
*     the block with the largest norm TMAX.
*
      tmax = zero
      DO j = 1, nba
         j1 = (j-1)*nb + 1
         j2 = min( j*nb, n ) + 1
         IF ( upper ) THEN
            ifirst = 1
            ilast = j - 1
         ELSE
            ifirst = j + 1
            ilast = nba
         END IF
         DO i = ifirst, ilast
            i1 = (i-1)*nb + 1
            i2 = min( i*nb, n ) + 1
*
*           Compute upper bound of A( I1:I2-1, J1:J2-1 ).
*           In the (conjugate) transposed case the bound is stored at
*           the transposed position so that the solve phase can always
*           read WORK( AWRK + I + (J-1)*NBA ).
*
            IF( notran ) THEN
               anrm = clange( 'I', i2-i1, j2-j1, a( i1, j1 ), lda,
     $                        w )
               work( awrk + i+(j-1)*nba ) = anrm
            ELSE
               anrm = clange( '1', i2-i1, j2-j1, a( i1, j1 ), lda,
     $                        w )
               work( awrk + j+(i-1)*nba ) = anrm
            END IF
            tmax = max( tmax, anrm )
         END DO
      END DO
*
*     The negated comparison is also true when TMAX is NaN.
*
      IF( .NOT. tmax.LE.bignum ) THEN
*
*        Some matrix entries have huge absolute value. At least one upper
*        bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point
*        number, either due to overflow in LANGE or due to Inf in A.
*        Fall back to LATRS. Set normin = 'N' for every right-hand side to
*        force computation of TSCAL in LATRS to avoid the likely overflow
*        in the computation of the column norms CNORM.
*
         DO k = 1, nrhs
            CALL clatrs( uplo, trans, diag, 'N', n, a, lda, x( 1,
     $                   k ),
     $                   scale( k ), cnorm, info )
         END DO
         RETURN
      END IF
*
*     Every right-hand side requires workspace to store NBA local scale
*     factors. To save workspace, X is computed successively in block columns
*     of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient
*     workspace is available, larger values of NBRHS or NBRHS = NRHS are viable.
      DO k = 1, nbx
*        Loop over block columns (index = K) of X and, for column-wise scalings,
*        over individual columns (index = KK).
*        K1: column index of the first column in X( J, K )
*        K2: column index of the first column in X( J, K+1 )
*        so the K2 - K1 is the column count of the block X( J, K )
         k1 = (k-1)*nbrhs + 1
         k2 = min( k*nbrhs, nrhs ) + 1
*
*        Initialize local scaling factors of current block column X( J, K )
*
         DO kk = 1, k2-k1
            DO i = 1, nba
               work( i+(kk-1)*lds ) = one
            END DO
         END DO
*
*        Choose the traversal direction of the block rows: backward
*        substitution for an upper triangular op(A), forward otherwise.
*
         IF( notran ) THEN
*
*           Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
*
            IF( upper ) THEN
               jfirst = nba
               jlast = 1
               jinc = -1
            ELSE
               jfirst = 1
               jlast = nba
               jinc = 1
            END IF
         ELSE
*
*           Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
*           where op(A) = A**T or op(A) = A**H
*
            IF( upper ) THEN
               jfirst = 1
               jlast = nba
               jinc = 1
            ELSE
               jfirst = nba
               jlast = 1
               jinc = -1
            END IF
         END IF
 
         DO j = jfirst, jlast, jinc
*           J1: row index of the first row in A( J, J )
*           J2: row index of the first row in A( J+1, J+1 )
*           so that J2 - J1 is the row count of the block A( J, J )
            j1 = (j-1)*nb + 1
            j2 = min( j*nb, n ) + 1
*
*           Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS )
*
            DO kk = 1, k2-k1
*              RHS is the global column index in X and SCALE; KK is
*              the index local to the current block column and is only
*              valid for XNRM and the local scale factors in WORK.
               rhs = k1 + kk - 1
               IF( kk.EQ.1 ) THEN
                  CALL clatrs( uplo, trans, diag, 'N', j2-j1,
     $                         a( j1, j1 ), lda, x( j1, rhs ),
     $                         scaloc, cnorm, info )
               ELSE
                  CALL clatrs( uplo, trans, diag, 'Y', j2-j1,
     $                         a( j1, j1 ), lda, x( j1, rhs ),
     $                         scaloc, cnorm, info )
               END IF
*              Find largest absolute value entry in the vector segment
*              X( J1:J2-1, RHS ) as an upper bound for the worst case
*              growth in the linear updates.
               xnrm( kk ) = clange( 'I', j2-j1, 1, x( j1, rhs ),
     $                              ldx, w )
*
               IF( scaloc .EQ. zero ) THEN
*                 LATRS found that A is singular through A(j,j) = 0.
*                 Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0
*                 and compute op(A)*x = 0. Note that X(J1:J2-1, RHS) is
*                 set by LATRS. The column to clear is the global
*                 column RHS, not the local index KK.
                  scale( rhs ) = zero
                  DO ii = 1, j1-1
                     x( ii, rhs ) = czero
                  END DO
                  DO ii = j2, n
                     x( ii, rhs ) = czero
                  END DO
*                 Discard the local scale factors.
                  DO ii = 1, nba
                     work( ii+(kk-1)*lds ) = one
                  END DO
                  scaloc = one
               ELSE IF( scaloc*work( j+(kk-1)*lds ) .EQ. zero ) THEN
*                 LATRS computed a valid scale factor, but combined with
*                 the current scaling the solution does not have a
*                 scale factor > 0.
*
*                 Set WORK( J+(KK-1)*LDS ) to smallest valid scale
*                 factor and increase SCALOC accordingly.
                  scal = work( j+(kk-1)*lds ) / smlnum
                  scaloc = scaloc * scal
                  work( j+(kk-1)*lds ) = smlnum
*                 If LATRS overestimated the growth, x may be
*                 rescaled to preserve a valid combined scale
*                 factor WORK( J, KK ) > 0.
                  rscal = one / scaloc
                  IF( xnrm( kk )*rscal .LE. bignum ) THEN
                     xnrm( kk ) = xnrm( kk ) * rscal
                     CALL csscal( j2-j1, rscal, x( j1, rhs ), 1 )
                     scaloc = one
                  ELSE
*                    The system op(A) * x = b is badly scaled and its
*                    solution cannot be represented as (1/scale) * x.
*                    Set x to zero. This approach deviates from LATRS
*                    where a completely meaningless non-zero vector
*                    is returned that is not a solution to op(A) * x = b.
                     scale( rhs ) = zero
                     DO ii = 1, n
                        x( ii, rhs ) = czero
                     END DO
*                    Discard the local scale factors.
                     DO ii = 1, nba
                        work( ii+(kk-1)*lds ) = one
                     END DO
                     scaloc = one
                  END IF
               END IF
*              Fold the factor of this diagonal solve into the local
*              scale factor of segment ( J, KK ).
               scaloc = scaloc * work( j+(kk-1)*lds )
               work( j+(kk-1)*lds ) = scaloc
            END DO
*
*           Linear block updates
*
            IF( notran ) THEN
               IF( upper ) THEN
                  ifirst = j - 1
                  ilast = 1
                  iinc = -1
               ELSE
                  ifirst = j + 1
                  ilast = nba
                  iinc = 1
               END IF
            ELSE
               IF( upper ) THEN
                  ifirst = j + 1
                  ilast = nba
                  iinc = 1
               ELSE
                  ifirst = j - 1
                  ilast = 1
                  iinc = -1
               END IF
            END IF
*
            DO i = ifirst, ilast, iinc
*              I1: row index of the first column in X( I, K )
*              I2: row index of the first column in X( I+1, K )
*              so the I2 - I1 is the row count of the block X( I, K )
               i1 = (i-1)*nb + 1
               i2 = min( i*nb, n ) + 1
*
*              Prepare the linear update to be executed with GEMM.
*              For each column, compute a consistent scaling, a
*              scaling factor to survive the linear update, and
*              rescale the column segments, if necessary. Then
*              the linear update is safely executed.
*
               DO kk = 1, k2-k1
                  rhs = k1 + kk - 1
*                 Compute consistent scaling
                  scamin = min( work( i+(kk-1)*lds ),
     $                          work( j+(kk-1)*lds ) )
*
*                 Compute scaling factor to survive the linear update
*                 simulating consistent scaling.
*
                  bnrm = clange( 'I', i2-i1, 1, x( i1, rhs ), ldx,
     $                           w )
                  bnrm = bnrm*( scamin / work( i+(kk-1)*lds ) )
                  xnrm( kk ) = xnrm( kk )*
     $                         ( scamin / work( j+(kk-1)*lds ) )
                  anrm = work( awrk + i+(j-1)*nba )
                  scaloc = slarmm( anrm, xnrm( kk ), bnrm )
*
*                 Simultaneously apply the robust update factor and the
*                 consistency scaling factor to X( I, KK ) and X( J, KK ).
*
                  scal = ( scamin / work( i+(kk-1)*lds ) )*scaloc
                  IF( scal.NE.one ) THEN
                     CALL csscal( i2-i1, scal, x( i1, rhs ), 1 )
                     work( i+(kk-1)*lds ) = scamin*scaloc
                  END IF
*
                  scal = ( scamin / work( j+(kk-1)*lds ) )*scaloc
                  IF( scal.NE.one ) THEN
                     CALL csscal( j2-j1, scal, x( j1, rhs ), 1 )
                     work( j+(kk-1)*lds ) = scamin*scaloc
                  END IF
               END DO
*
               IF( notran ) THEN
*
*                 B( I, K ) := B( I, K ) - A( I, J ) * X( J, K )
*
                  CALL cgemm( 'N', 'N', i2-i1, k2-k1, j2-j1, -cone,
     $                        a( i1, j1 ), lda, x( j1, k1 ), ldx,
     $                        cone, x( i1, k1 ), ldx )
               ELSE IF( lsame( trans, 'T' ) ) THEN
*
*                 B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K )
*
                  CALL cgemm( 'T', 'N', i2-i1, k2-k1, j2-j1, -cone,
     $                        a( j1, i1 ), lda, x( j1, k1 ), ldx,
     $                        cone, x( i1, k1 ), ldx )
               ELSE
*
*                 B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K )
*
                  CALL cgemm( 'C', 'N', i2-i1, k2-k1, j2-j1, -cone,
     $                        a( j1, i1 ), lda, x( j1, k1 ), ldx,
     $                        cone, x( i1, k1 ), ldx )
               END IF
            END DO
         END DO
*
*        Reduce local scaling factors
*
         DO kk = 1, k2-k1
            rhs = k1 + kk - 1
            DO i = 1, nba
               scale( rhs ) = min( scale( rhs ),
     $                             work( i+(kk-1)*lds ) )
            END DO
         END DO
*
*        Realize consistent scaling
*
         DO kk = 1, k2-k1
            rhs = k1 + kk - 1
            IF( scale( rhs ).NE.one .AND. scale( rhs ).NE. zero ) THEN
               DO i = 1, nba
                  i1 = (i-1)*nb + 1
                  i2 = min( i*nb, n ) + 1
                  scal = scale( rhs ) / work( i+(kk-1)*lds )
                  IF( scal.NE.one )
     $               CALL csscal( i2-i1, scal, x( i1, rhs ), 1 )
               END DO
            END IF
         END DO
      END DO
*
*     WORK( 1 ) was overwritten by a local scale factor; restore the
*     workspace size before returning.
*
      work( 1 ) = sroundup_lwork( lwmin )
*
      RETURN
*
*     End of CLATRS3
*

Here is the call graph for this function:

Here is the caller graph for this function: