*     Source page: d8/d39/pzhettrd_8f_source.html

      SUBROUTINE pzhettrd( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK,

     $                     LWORK, INFO )

*

*  -- ScaLAPACK routine (version 2.0.2) --

*     Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver

*     May 1 2012

*

*     .. Scalar Arguments ..

      CHARACTER          UPLO

      INTEGER            IA, INFO, JA, LWORK, N

*     ..

*     .. Array Arguments ..

      INTEGER            DESCA( * )

      DOUBLE PRECISION   D( * ), E( * )

      COMPLEX*16         A( * ), TAU( * ), WORK( * )

*     ..

*

*     Purpose

*

*     =======

*

*     PZHETTRD reduces a complex Hermitian matrix sub( A ) to Hermitian

*     tridiagonal form T by a unitary similarity transformation:

*     Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1).

*

*     Notes

*     =====

*

*     Each global data object is described by an associated description

*     vector.  This vector stores the information required to establish

*     the mapping between an object element and its corresponding

*     process and memory location.

*

*     Let A be a generic term for any 2D block cyclically distributed

*     array.

*     Such a global array has an associated description vector DESCA.

*     In the following comments, the character _ should be read as

*     "of the global array".

*

*     NOTATION        STORED IN      EXPLANATION

*     --------------- -------------- -----------------------------------

*     DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,

*     DTYPE_A = 1.

*     CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle,

*     indicating the BLACS process grid A is distribu-

*     ted over. The context itself is glo-

*     bal, but the handle (the integer

*     value) may vary.

*     M_A    (global) DESCA( M_ )    The number of rows in the global

*     array A.

*     N_A    (global) DESCA( N_ )    The number of columns in the global

*     array A.

*     MB_A   (global) DESCA( MB_ )   The blocking factor used to

*     distribute the rows of the array.

*     NB_A   (global) DESCA( NB_ )   The blocking factor used to

*     distribute the columns of the array.

*     RSRC_A (global) DESCA( RSRC_ ) The process row over which the

*     first row of the array A is distributed.

*     CSRC_A (global) DESCA( CSRC_ ) The process column over which the

*     first column of the array A is

*     distributed.

*     LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local

*     array.  LLD_A >= MAX(1,LOCp(M_A)).

*

*     Let K be the number of rows or columns of a distributed matrix,

*     and assume that its process grid has dimension p x q.

*     LOCp( K ) denotes the number of elements of K that a process

*     would receive if K were distributed over the p processes of its

*     process column.

*     Similarly, LOCq( K ) denotes the number of elements of K that a

*     process would receive if K were distributed over the q processes

*     of its process row.

*     The values of LOCp() and LOCq() may be determined via a call to

*     the ScaLAPACK tool function, NUMROC:

*     LOCp( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),

*     LOCq( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).

*     An upper bound for these quantities may be computed by:

*     LOCp( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A

*     LOCq( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A

*

*     Arguments

*     =========

*

*     UPLO    (global input) CHARACTER

*     Specifies whether the upper or lower triangular part of the

*     Hermitian matrix sub( A ) is stored:

*     = 'U':  Upper triangular

*     = 'L':  Lower triangular

*

*     N       (global input) INTEGER

*     The number of rows and columns to be operated on, i.e. the

*     order of the distributed submatrix sub( A ). N >= 0.

*

*     A       (local input/local output) COMPLEX*16 pointer into the

*     local memory to an array of dimension (LLD_A,LOCq(JA+N-1)).

*     On entry, this array contains the local pieces of the

*     Hermitian distributed matrix sub( A ).  If UPLO = 'U', the

*     leading N-by-N upper triangular part of sub( A ) contains

*     the upper triangular part of the matrix, and its strictly

*     lower triangular part is not referenced. If UPLO = 'L', the

*     leading N-by-N lower triangular part of sub( A ) contains the

*     lower triangular part of the matrix, and its strictly upper

*     triangular part is not referenced. On exit, if UPLO = 'U',

*     the diagonal and first superdiagonal of sub( A ) are over-

*     written by the corresponding elements of the tridiagonal

*     matrix T, and the elements above the first superdiagonal,

*     with the array TAU, represent the unitary matrix Q as a

*     product of elementary reflectors; if UPLO = 'L', the diagonal

*     and first subdiagonal of sub( A ) are overwritten by the

*     corresponding elements of the tridiagonal matrix T, and the

*     elements below the first subdiagonal, with the array TAU,

*     represent the unitary matrix Q as a product of elementary

*     reflectors. See Further Details.

*

*     IA      (global input) INTEGER

*     The row index in the global array A indicating the first

*     row of sub( A ).

*

*     JA      (global input) INTEGER

*     The column index in the global array A indicating the

*     first column of sub( A ).

*

*     DESCA   (global and local input) INTEGER array of dimension DLEN_.

*     The array descriptor for the distributed matrix A.

*

*     D       (local output) DOUBLE PRECISION array, dim LOCq(JA+N-1)

*     The diagonal elements of the tridiagonal matrix T:

*     D(i) = A(i,i). D is tied to the distributed matrix A.

*

*     E       (local output) DOUBLE PRECISION array, dim LOCq(JA+N-1)

*     if UPLO = 'U', LOCq(JA+N-2) otherwise. The off-diagonal

*     elements of the tridiagonal matrix T: E(i) = A(i,i+1) if

*     UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the

*     distributed matrix A.

*

*     TAU     (local output) COMPLEX*16, array, dimension

*     LOCq(JA+N-1). This array contains the scalar factors TAU of

*     the elementary reflectors. TAU is tied to the distributed

*     matrix A.

*

*     WORK    (local workspace) COMPLEX*16 array, dimension (LWORK)

*     On exit, WORK( 1 ) returns the minimal and optimal workspace size.

*

*     LWORK   (local input) INTEGER

*     The dimension of the array WORK.

*     LWORK >= 2*( ANB+1 )*( 4*NPS+2 ) + NPS

*     Where:

*         NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB )

*         ANB = PJLAENV( DESCA( CTXT_ ), 3, 'PZHETTRD', 'L', 0, 0,

*           0, 0 )

*

*         NUMROC is a ScaLAPACK tool function;

*         PJLAENV is a ScaLAPACK environmental inquiry function

*         MYROW, MYCOL, NPROW and NPCOL can be determined by calling

*         the subroutine BLACS_GRIDINFO.

*

*     INFO    (global output) INTEGER

*     = 0:  successful exit

*     < 0:  If the i-th argument is an array and the j-entry had

*     an illegal value, then INFO = -(i*100+j), if the i-th

*     argument is a scalar and had an illegal value, then

*     INFO = -i.

*

*     Further Details

*     ===============

*

*     If UPLO = 'U', the matrix Q is represented as a product of

*     elementary reflectors

*

*     Q = H(n-1) . . . H(2) H(1).

*

*     Each H(i) has the form

*

*     H(i) = I - tau * v * v'

*

*     where tau is a complex scalar, and v is a complex vector with

*     v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in

*     A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1).

*

*     If UPLO = 'L', the matrix Q is represented as a product of

*     elementary reflectors

*

*     Q = H(1) H(2) . . . H(n-1).

*

*     Each H(i) has the form

*

*     H(i) = I - tau * v * v'

*

*     where tau is a complex scalar, and v is a complex vector with

*     v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in

*     A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1).

*

*     The contents of sub( A ) on exit are illustrated by the following

*     examples with n = 5:

*

*     if UPLO = 'U':                       if UPLO = 'L':

*

*     (  d   e   v2  v3  v4 )              (  d                  )

*     (      d   e   v3  v4 )              (  e   d              )

*     (          d   e   v4 )              (  v1  e   d          )

*     (              d   e  )              (  v1  v2  e   d      )

*     (                  d  )              (  v1  v2  v3  e   d  )

*

*     where d and e denote diagonal and off-diagonal elements of T, and

*     vi denotes an element of the vector defining H(i).

*

*     Data storage requirements

*     =========================

*

*     PZHETTRD is not intended to be called directly.  All users are

*     encouraged to call PZHETRD which will then call PZHETTRD if

*     appropriate.  A must be in cyclic format (i.e. MB = NB = 1),

*     the process grid must be square ( i.e. NPROW = NPCOL ) and

*     only lower triangular storage is supported.

*

*     Local variables

*     ===============

*

*     PZHETTRD uses five local arrays:

*       WORK ( InV ) dimension ( NP, ANB+1): array V

*       WORK ( InH ) dimension ( NP, ANB+1): array H

*       WORK ( InVT ) dimension ( NQ, ANB+1): transpose of the array V

*       WORK ( InHT ) dimension ( NQ, ANB+1): transpose of the array H

*       WORK ( InVTT ) dimension ( NQ, 1): transpose of the array VT

*

*     Arrays V and H are replicated across all processor columns.

*     Arrays V^T and H^T are replicated across all processor rows.

*

*         WORK ( InVT ), or V^T, is stored as a tall skinny

*         array ( NQ x ANB-1 ) for efficiency.  Since only the lower

*         triangular portion of A is updated, Av is computed as:

*         tril(A) * v + v^T * tril(A,-1).  This is performed as

*         two local triangular matrix-vector multiplications (both in

*         MVR2) followed by a transpose and a sum across the columns.

*         In the local computation, WORK( InVT ) is used to compute

*         tril(A) * v and WORK( InV ) is used to compute

*         v^T * tril(A,-1)

*

*     The following variables are global indices into A:

*       INDEX:  The current global row and column number.

*       MAXINDEX:  The global row and column for the first row and

*       column in the trailing block of A.

*       LIIB, LIJB:  The first local row, column in A past MAXINDEX
*         ( NUMROC( MAXINDEX, 1, MYROW or MYCOL, 0, NPROW or NPCOL ) + 1 )

*

*     The following variables point into the arrays A, V, H, V^T, H^T:

*       BINDEX  =INDEX-MININDEX: The column index in V, H, V^T, H^T.

*       LII:  local index I:  The local row number for row INDEX

*       LIJ:  local index J:  The local column number for column INDEX

*       LIIP1:  local index I+1:  The local row number for row INDEX+1

*       LIJP1:  local index J+1:  The local col number for col INDEX+1

*       LTLI: lower triangular local index I:  The local row for the

*         upper left entry in tril( A(INDEX, INDEX) )

*       LTLIP1: lower triangular local index I+1:  The local row for the

*         upper left entry in tril( A(INDEX+1, INDEX+1) )

*

*         Details:  The distinction between LII and LTLI (and between

*         LIIP1 and LTLIP1) is subtle.  Within the current processor

*         column (i.e. MYCOL .eq. CURCOL) they are the same.  However,

*         on some processors, A( LII, LIJ ) points to an element

*         above the diagonal, on these processors, LTLI = LII+1.

*

*     The following variables give the number of rows and/or columns

*     in various matrices:

*       NP:  The number of local rows in A( 1:N, 1:N )

*       NQ:  The number of local columns in A( 1:N, 1:N )

*       NPM0:  The number of local rows in A( INDEX:N, INDEX:N )

*       NQM0:  The number of local columns in A( INDEX:N, INDEX:N )

*       NPM1:  The number of local rows in A( INDEX+1:N, INDEX:N )

*       NQM1:  The number of local columns in A( INDEX+1:N, INDEX:N )

*       LTNM0:  The number of local rows & columns in

*         tril( A( INDEX:N, INDEX:N ) )

*       LTNM1:  The number of local rows & columns in

*         tril( A( INDEX+1:N, INDEX+1:N ) )

*         NOTE:  LTNM0 == LTNM1 on all processors except the diagonal

*         processors, i.e. those where MYCOL == MYROW.

*

*         Invariants:

*           NP = NPM0 + LII - 1

*           NQ = NQM0 + LIJ - 1

*           NP = NPM1 + LIIP1 - 1

*           NQ = NQM1 + LIJP1 - 1

*           NP = LTLI + LTNM0 - 1

*           NP = LTLIP1 + LTNM1 - 1

*

*       Temporary variables.  The following variables are used within

*       a few lines after they are set and do not hold state from one loop

*       iteration to the next:

*

*     The matrix A:

*       The matrix A does not hold the same values that it would

*       in an unblocked code nor the values that it would hold in

*       a blocked code.

*

*       The value of A is confusing.  It is easiest to state the

*       difference between trueA and A at the point that MVR2 is called,

*       so we will start there.

*

*       Let trueA be the value that A would

*       have at a given point in an unblocked code and A

*       be the value that A has in this code at the same point.

*

*       At the time of the call to MVR2,

*       trueA = A + V' * H + H' * V

*       where H = H( MAXINDEX:N, 1:BINDEX ) and

*       V = V( MAXINDEX:N, 1:BINDEX ).

*

*       At the bottom of the inner loop,

*       trueA = A +  V' * H + H' * V + v' * h + h' * v

*       where H = H( MAXINDEX:N, 1:BINDEX ) and

*       V = V( MAXINDEX:N, 1:BINDEX ) and

*       v = V( liip1:N, BINDEX+1 ) and

*       h = H( liip1:N, BINDEX+1 )

*

*       At the top of the loop, BINDEX gets incremented, hence:

*       trueA = A +  V' * H + H' * V + v' * h + h' * v

*       where H = H( MAXINDEX:N, 1:BINDEX-1 ) and

*       V = V( MAXINDEX:N, 1:BINDEX-1 ) and

*       v = V( liip1:N, BINDEX ) and

*       h = H( liip1:N, BINDEX )

*

*

*       A gets updated at the bottom of the outer loop

*       After this update, trueA = A + v' * h + h' * v

*       where v = V( liip1:N, BINDEX ) and

*       h = H( liip1:N, BINDEX ) and BINDEX = 0

*       Indeed, the previous loop invariant as stated above for the

*       top of the loop still holds, but with BINDEX = 0, H and V

*       are null matrices.

*

*       After the current column of A is updated,

*         trueA( INDEX, INDEX:N ) = A( INDEX, INDEX:N )

*       the rest of A is untouched.

*

*       After the current block column of A is updated,

*       trueA = A + V' * H + H' * V

*       where H = H( MAXINDEX:N, 1:BINDEX ) and

*       V = V( MAXINDEX:N, 1:BINDEX )

*

*       This brings us back to the point at which mvr2 is called.

*

*

*     Details of the parallelization:

*

*       We delay spreading v across to all processor columns (which

*       would naturally happen at the bottom of the loop) in order to

*       combine the spread of v( : , i-1 ) with the spread of h( : , i )

*

*       In order to compute h( :, i ), we must update A( :, i )

*       which means that the processor column owning A( :, i ) must

*       have: c, tau, v( i, i ) and h( i, i ).

*

*       The traditional

*       way of computing v (and the one used in pzlatrd.f and

*       zlatrd.f) is:

*         v = tau * v

*         c = v' * h

*         alpha = - tau * c / 2

*         v = v + alpha * h

*       However, the traditional way of computing v requires that tau

*       be broadcast to all processors in the current column (to compute

*       v = tau * v) and then a sum-to-all is required (to

*       compute v' * h ).  We use the following formula instead:

*         c = v' * h

*         v = tau * ( v - c * tau' * h / 2 )

*       The above formula allows tau to be spread down in the

*       same call to DGSUM2D which performs the sum-to-all of c.

*

*       The computation of v, which could be performed in any processor

*       column (or other processor subsets), is performed in the

*       processor column that owns A( :, i+1 ) so that A( :, i+1 )

*       can be updated prior to spreading v across.

*

*       We keep the block column of A up-to-date to minimize the

*       work required in updating the current column of A.  Updating

*       the block column of A is reasonably load balanced whereas

*       updating the current column of A is not (only the current

*       processor column is involved).

*

*     In the following overview of the steps performed, M in the

*     margin indicates message traffic and C indicates O(n^2 nb/sqrt(p))

*     or more flops per processor.

*

*     Inner loop:

*       A( index:n, index ) -= ( v * ht(bindex) + h * vt( bindex) )

*M      h = house( A(index:n, index) )

*M      Spread v, h across

*M      vt = v^T; ht = h^T

*       A( index+1:n, index+1:maxindex ) -=

*         ( v * ht(index+1:maxindex) + h *vt(index+1:maxindex) )

*C      v = tril(A) * h; vt = ht * tril(A,-1)

*MorC   v = v - H*V*h - V*H*h

*M      v = v + vt^T

*M      c = v' * h

*       v = tau * ( v - c * tau' * h / 2 )

*C    A = A - H*V - V*H

*

*

*

*     =================================================================

*

*     .. Parameters ..

      INTEGER            BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_,

     $                   mb_, nb_, rsrc_, csrc_, lld_

      parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,

     $                   ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,

     $                   rsrc_ = 7, csrc_ = 8, lld_ = 9 )

      DOUBLE PRECISION   ONE

      parameter( one = 1.0d0 )

      COMPLEX*16         Z_ONE, Z_NEGONE, Z_ZERO

      parameter( z_one = 1.0d0, z_negone = -1.0d0,

     $                   z_zero = 0.0d0 )

      DOUBLE PRECISION   ZERO

      parameter( zero = 0.0d+0 )

*     ..

*

*

*     .. Local Scalars ..

*

*

      LOGICAL            BALANCED, INTERLEAVE, TWOGEMMS, UPPER

      INTEGER            ANB, BINDEX, CURCOL, CURROW, I, ICTXT, INDEX,

     $                   indexa, indexinh, indexinv, inh, inhb, inht,

     $                   inhtb, intmp, inv, invb, invt, invtb, j, lda,

     $                   ldv, ldzg, lii, liib, liip1, lij, lijb, lijp1,

     $                   ltlip1, ltnm1, lwmin, maxindex, minindex,

     $                   mycol, myfirstrow, myrow, mysetnum, nbzg, np,

     $                   npb, npcol, npm0, npm1, nprow, nps, npset, nq,

     $                   nqb, nqm1, numrows, nxtcol, nxtrow, pbmax,

     $                   pbmin, pbsize, pnb, rowsperproc

      DOUBLE PRECISION   NORM, SAFMAX, SAFMIN

      COMPLEX*16         ALPHA, BETA, C, CONJTOPH, CONJTOPV,

     $                   oneoverbeta, toph, topnv, toptau, topv

*     ..

*     .. Local Arrays ..

*

*

*

*

      INTEGER            IDUM1( 1 ), IDUM2( 1 )

      DOUBLE PRECISION   DTMP( 5 )

      COMPLEX*16         CC( 3 )

*     ..

*     .. External Subroutines ..

      EXTERNAL           blacs_gridinfo, chk1mat, dcombnrm2, dgebr2d,

     $                   dgebs2d, dgsum2d, pchk1mat, pdtreecomb,

     $                   pxerbla, zgebr2d, zgebs2d, zgemm, zgemv,

     $                   zgerv2d, zgesd2d, zgsum2d, zlamov, zscal,

     $                   ztrmvt

*     ..

*     .. External Functions ..

*

      LOGICAL            LSAME

      INTEGER            ICEIL, NUMROC, PJLAENV

      DOUBLE PRECISION   DZNRM2, PDLAMCH

      EXTERNAL           lsame, iceil, numroc, pjlaenv, dznrm2, pdlamch

*     ..

*     .. Intrinsic Functions ..

      INTRINSIC          dble, dcmplx, dconjg, dimag, ichar, max, min,

     $                   mod, sign, sqrt

*     ..

*

*

*     .. Executable Statements ..

*       This is just to keep ftnchek and toolpack/1 happy

      IF( block_cyclic_2d*csrc_*ctxt_*dlen_*dtype_*lld_*mb_*m_*nb_*n_*

     $    rsrc_.LT.0 )RETURN

*

*

*

*     Further details

*     ===============

*

*     At the top of the loop, v and nh have been computed but not

*     spread across.  Hence, A is out-of-date even after the

*     rank 2k update.  Furthermore, we compute the next v before

*     nh is spread across.

*

*     I claim that if we used a sum-to-all on NV, by summing CC within

*     each column, that we could compute NV locally and could avoid

*     spreading V across.  Bruce claims that sum-to-all can be made

*     to cost no more than sum-to-one on the Paragon.  If that is

*     true, this would be a win.  But,

*     the BLACS sum-to-all is just a sum-to-one followed by a broadcast,

*     and hence the present scheme is better for now.

*

*     Get grid parameters

*

      ictxt = desca( ctxt_ )

      CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )

*

      safmax = sqrt( pdlamch( ictxt, 'O' ) ) / n

      safmin = sqrt( pdlamch( ictxt, 'S' ) )

*

*     Test the input parameters

*

      info = 0

      IF( nprow.EQ.-1 ) THEN

         info = -( 600+ctxt_ )

      ELSE

*

*     Here we set execution options for PZHETTRD

*

         pnb = pjlaenv( ictxt, 2, 'PZHETTRD', 'L', 0, 0, 0, 0 )

         anb = pjlaenv( ictxt, 3, 'PZHETTRD', 'L', 0, 0, 0, 0 )

*

         interleave = ( pjlaenv( ictxt, 4, 'PZHETTRD', 'L', 1, 0, 0,

     $                0 ).EQ.1 )

         twogemms = ( pjlaenv( ictxt, 4, 'PZHETTRD', 'L', 2, 0, 0,

     $              0 ).EQ.1 )

         balanced = ( pjlaenv( ictxt, 4, 'PZHETTRD', 'L', 3, 0, 0,

     $              0 ).EQ.1 )

*

         CALL chk1mat( n, 2, n, 2, ia, ja, desca, 6, info )

*

*

         upper = lsame( uplo, 'U' )

         IF( info.EQ.0 .AND. desca( nb_ ).NE.1 )

     $      info = 600 + nb_

         IF( info.EQ.0 ) THEN

*

*

*           Here is the arithmetic:

*             Let maxnpq = max( np, nq, 2 * ANB )

*             LDV = 4 * max( np, nq ) + 2

*             LWMIN = 2 * ( ANB + 1 ) * LDV + MAX( np, 2 * ANB )

*             = 2 * ( ANB + 1 ) * ( 4 * NPS + 2 ) + NPS

*

*           This overestimates memory requirements when ANB > NP/2

*           Memory requirements are lower when interleave = .false.

*           Hence, we could have two sets of memory requirements,

*           one for interleave = .true. and one for interleave = .false.

*

*

            nps = max( numroc( n, 1, 0, 0, nprow ), 2*anb )

            lwmin = 2*( anb+1 )*( 4*nps+2 ) + nps

*

            work( 1 ) = dcmplx( lwmin )

            IF( .NOT.lsame( uplo, 'L' ) ) THEN

               info = -1

            ELSE IF( ia.NE.1 ) THEN

               info = -4

            ELSE IF( ja.NE.1 ) THEN

               info = -5

            ELSE IF( nprow.NE.npcol ) THEN

               info = -( 600+ctxt_ )

            ELSE IF( desca( dtype_ ).NE.1 ) THEN

               info = -( 600+dtype_ )

            ELSE IF( desca( mb_ ).NE.1 ) THEN

               info = -( 600+mb_ )

            ELSE IF( desca( nb_ ).NE.1 ) THEN

               info = -( 600+nb_ )

            ELSE IF( desca( rsrc_ ).NE.0 ) THEN

               info = -( 600+rsrc_ )

            ELSE IF( desca( csrc_ ).NE.0 ) THEN

               info = -( 600+csrc_ )

            ELSE IF( lwork.LT.lwmin ) THEN

               info = -11

            END IF

         END IF

         IF( upper ) THEN

            idum1( 1 ) = ichar( 'U' )

         ELSE

            idum1( 1 ) = ichar( 'L' )

         END IF

         idum2( 1 ) = 1

*

         CALL pchk1mat( n, 2, n, 2, ia, ja, desca, 6, 1, idum1, idum2,

     $                  info )

      END IF

*

      IF( info.NE.0 ) THEN

         CALL pxerbla( ictxt, 'PZHETTRD', -info )

         RETURN

      END IF

*

*     Quick return if possible

*

      IF( n.EQ.0 )

     $   RETURN

*

*

*

*     Reduce the lower triangle of sub( A )

      np = numroc( n, 1, myrow, 0, nprow )

      nq = numroc( n, 1, mycol, 0, npcol )

*

      nxtrow = 0

      nxtcol = 0

*

      liip1 = 1

      lijp1 = 1

      npm1 = np

      nqm1 = nq

*

      lda = desca( lld_ )

      ictxt = desca( ctxt_ )

*

*

*

*     Miscellaneous details:

*     Put tau, D and E in the right places

*     Check signs

*     Place all the arrays in WORK, control their placement

*     in  memory.

*

*

*

*     Loop invariants

*     A(LIIP1, LIJ) points to the first element of A(I+1,J)

*     NPM1,NQM1 = the number of rows, cols in A( LII+1:N,LIJ+1:N )

*     A(LII:N,LIJ:N) is one step out of date.

*     proc( CURROW, CURCOL ) owns A(LII,LIJ)

*     proc( NXTROW, CURCOL ) owns A(LIIP1,LIJ)

*

      inh = 1

*

      IF( interleave ) THEN

*

*        H and V are interleaved to minimize memory movement

*        LDV has to be twice as large to accommodate interleaving.

*        In addition, LDV is doubled again to allow v, h and

*        toptau to be spread across and transposed in a

*        single communication operation with minimum memory

*        movement.

*

*        We could reduce LDV back to 2*MAX(NPM1,NQM1)

*        by increasing the memory movement required in

*        the spread and transpose of v, h and toptau.

*        However, since the non-interleaved path already

*        provides a near minimum memory requirement option,

*        we did not provide this additional path.

*

         ldv = 4*( max( npm1, nqm1 ) ) + 2

*

         inh = 1

*

         inv = inh + ldv / 2

         invt = inh + ( anb+1 )*ldv

*

         inht = invt + ldv / 2

         intmp = invt + ldv*( anb+1 )

*

      ELSE

         ldv = max( npm1, nqm1 )

*

         inht = inh + ldv*( anb+1 )

         inv = inht + ldv*( anb+1 )

*

*        The code works without this +1, but only because of a

*        coincidence.  Without the +1, WORK(INVT) gets trashed, but

*        WORK(INVT) is only used once and when it is used, it is

*        multiplied by WORK( INH ) which is zero.  Hence, the fact

*        that WORK(INVT) is trashed has no effect.

*

         invt = inv + ldv*( anb+1 ) + 1

         intmp = invt + ldv*( 2*anb )

*

      END IF

*

      IF( info.NE.0 ) THEN

         CALL pxerbla( ictxt, 'PZHETTRD', -info )

         work( 1 ) = dcmplx( lwmin )

         RETURN

      END IF

*

*

*        This satisfies the loop invariant: trueA = A - V * HT - H * VT,

*        (where V, H, VT and HT all have BINDEX+1 rows/columns)

*        the first ANB times through the loop.

*

*

*

*     Setting either ( InH and InHT ) or InV to Z_ZERO

*     is adequate except in the face of NaNs.

*

*

      DO 10 i = 1, np

         work( inh+i-1 ) = z_zero

         work( inv+i-1 ) = z_zero

   10 CONTINUE

      DO 20 i = 1, nq

         work( inht+i-1 ) = z_zero

   20 CONTINUE

*

*

*

      topnv = z_zero

*

      ltlip1 = lijp1

      ltnm1 = npm1

      IF( mycol.GT.myrow ) THEN

         ltlip1 = ltlip1 + 1

         ltnm1 = ltnm1 - 1

      END IF

*

*

      DO 210 minindex = 1, n - 1, anb

*

*

         maxindex = min( minindex+anb-1, n )

         lijb = numroc( maxindex, 1, mycol, 0, npcol ) + 1

         liib = numroc( maxindex, 1, myrow, 0, nprow ) + 1

*

         nqb = nq - lijb + 1

         npb = np - liib + 1

         inhtb = inht + lijb - 1

         invtb = invt + lijb - 1

         inhb = inh + liib - 1

         invb = inv + liib - 1

*

*

*

*

         DO 160 index = minindex, min( maxindex, n-1 )

*

            bindex = index - minindex

*

            currow = nxtrow

            curcol = nxtcol

*

            nxtrow = mod( currow+1, nprow )

            nxtcol = mod( curcol+1, npcol )

*

            lii = liip1

            lij = lijp1

            npm0 = npm1

*

            IF( myrow.EQ.currow ) THEN

               npm1 = npm1 - 1

               liip1 = liip1 + 1

            END IF

            IF( mycol.EQ.curcol ) THEN

               nqm1 = nqm1 - 1

               lijp1 = lijp1 + 1

               ltlip1 = ltlip1 + 1

               ltnm1 = ltnm1 - 1

            END IF

*

*

*

*

*     V = NV, VT = NVT, H = NH, HT = NHT

*

*

*     Update the current column of A

*

*

            IF( mycol.EQ.curcol ) THEN

*

               indexa = lii + ( lij-1 )*lda

               indexinv = inv + lii - 1 + ( bindex-1 )*ldv

               indexinh = inh + lii - 1 + ( bindex-1 )*ldv

               conjtoph = dconjg( work( inht+lij-1+bindex*ldv ) )

               conjtopv = dconjg( topnv )

*

               IF( index.GT.1 ) THEN

                  DO 30 i = 0, npm0 - 1

*                  A( INDEXA+I ) = A( INDEXA+I )

                     a( indexa+i ) = a( indexa+i ) -

     $                               work( indexinv+ldv+i )*conjtoph -

     $                               work( indexinh+ldv+i )*conjtopv

   30             CONTINUE

               END IF

*

*

            END IF

*

*

            IF( mycol.EQ.curcol ) THEN

*

*     Compute the householder vector

*

               IF( myrow.EQ.currow ) THEN

                  dtmp( 2 ) = dble( a( lii+( lij-1 )*lda ) )

               ELSE

                  dtmp( 2 ) = zero

               END IF

               IF( myrow.EQ.nxtrow ) THEN

                  dtmp( 3 ) = dble( a( liip1+( lij-1 )*lda ) )

                  dtmp( 4 ) = dimag( a( liip1+( lij-1 )*lda ) )

               ELSE

                  dtmp( 3 ) = zero

                  dtmp( 4 ) = zero

               END IF

*

               norm = dznrm2( npm1, a( liip1+( lij-1 )*lda ), 1 )

               dtmp( 1 ) = norm

*

*              IF DTMP(5) = 1.0, NORM is too large and might cause

*              overflow, hence PDTREECOMB must be called.  IF DTMP(5)

*              is zero on output, DTMP(1) can be trusted.

*

               dtmp( 5 ) = zero

               IF( dtmp( 1 ).GE.safmax .OR. dtmp( 1 ).LT.safmin ) THEN

                  dtmp( 5 ) = one

                  dtmp( 1 ) = zero

               END IF

*

               dtmp( 1 ) = dtmp( 1 )*dtmp( 1 )

               CALL dgsum2d( ictxt, 'C', ' ', 5, 1, dtmp, 5, -1,

     $                       curcol )

               IF( dtmp( 5 ).EQ.zero ) THEN

                  dtmp( 1 ) = sqrt( dtmp( 1 ) )

               ELSE

                  dtmp( 1 ) = norm

                  CALL pdtreecomb( ictxt, 'C', 1, dtmp, -1, mycol,

     $                             dcombnrm2 )

               END IF

*

               norm = dtmp( 1 )

*

               d( lij ) = dtmp( 2 )

               IF( myrow.EQ.currow .AND. mycol.EQ.curcol ) THEN

                  a( lii+( lij-1 )*lda ) = dcmplx( d( lij ), zero )

               END IF

*

*

               alpha = dcmplx( dtmp( 3 ), dtmp( 4 ) )

*

               norm = sign( norm, dble( alpha ) )

*

               IF( norm.EQ.zero ) THEN

                  toptau = zero

               ELSE

                  beta = norm + alpha

                  toptau = beta / norm

                  oneoverbeta = 1.0d0 / beta

*

                  CALL zscal( npm1, oneoverbeta,

     $                        a( liip1+( lij-1 )*lda ), 1 )

               END IF

*

               IF( myrow.EQ.nxtrow ) THEN

                  a( liip1+( lij-1 )*lda ) = z_one

               END IF

*

               tau( lij ) = toptau

               e( lij ) = -norm

*

            END IF

*

*

*     Spread v, nh, toptau across

*

            DO 40 i = 0, npm1 - 1

               work( inv+liip1-1+bindex*ldv+npm1+i ) = a( liip1+i+

     $            ( lij-1 )*lda )

   40       CONTINUE

*

            IF( mycol.EQ.curcol ) THEN

               work( inv+liip1-1+bindex*ldv+npm1+npm1 ) = toptau

               CALL zgebs2d( ictxt, 'R', ' ', npm1+npm1+1, 1,

     $                       work( inv+liip1-1+bindex*ldv ),

     $                       npm1+npm1+1 )

            ELSE

               CALL zgebr2d( ictxt, 'R', ' ', npm1+npm1+1, 1,

     $                       work( inv+liip1-1+bindex*ldv ),

     $                       npm1+npm1+1, myrow, curcol )

               toptau = work( inv+liip1-1+bindex*ldv+npm1+npm1 )

            END IF

            DO 50 i = 0, npm1 - 1

               work( inh+liip1-1+( bindex+1 )*ldv+i ) = work( inv+liip1-

     $            1+bindex*ldv+npm1+i )

   50       CONTINUE

*

            IF( index.LT.n ) THEN

               IF( myrow.EQ.nxtrow .AND. mycol.EQ.curcol )

     $            a( liip1+( lij-1 )*lda ) = e( lij )

            END IF

*

*     Transpose v, nh

*

*

            IF( myrow.EQ.mycol ) THEN

               DO 60 i = 0, npm1 + npm1

                  work( invt+lijp1-1+bindex*ldv+i ) = work( inv+liip1-1+

     $               bindex*ldv+i )

   60          CONTINUE

            ELSE

               CALL zgesd2d( ictxt, npm1+npm1, 1,

     $                       work( inv+liip1-1+bindex*ldv ), npm1+npm1,

     $                       mycol, myrow )

               CALL zgerv2d( ictxt, nqm1+nqm1, 1,

     $                       work( invt+lijp1-1+bindex*ldv ), nqm1+nqm1,

     $                       mycol, myrow )

            END IF

*

            DO 70 i = 0, nqm1 - 1

               work( inht+lijp1-1+( bindex+1 )*ldv+i ) = work( invt+

     $            lijp1-1+bindex*ldv+nqm1+i )

   70       CONTINUE

*

*

*           Update the current block column of A

*

            IF( index.GT.1 ) THEN

               DO 90 j = lijp1, lijb - 1

                  DO 80 i = 0, npm1 - 1

*

                     a( liip1+i+( j-1 )*lda ) = a( liip1+i+( j-1 )*lda )

     $                   - work( inv+liip1-1+bindex*ldv+i )*

     $                  dconjg( work( inht+j-1+bindex*ldv ) ) -

     $                  work( inh+liip1-1+bindex*ldv+i )*

     $                  dconjg( work( invt+j-1+bindex*ldv ) )

   80             CONTINUE

   90          CONTINUE

            END IF

*

*

*

*     Compute NV = A * NHT; NVT = A * NH

*

*           These two lines are necessary because these elements

*           are not always involved in the calls to ZTRMVT

*           for two reasons:

*           1)  On diagonal processors, the call to TRMVT

*               involves only LTNM1-1 elements

*           2)  On some processes, NQM1 < LTNM1 or  LIIP1 < LTLIP1

*               and when the results are combined across all processes,

*               uninitialized values may be included.

            work( inv+liip1-1+( bindex+1 )*ldv ) = z_zero

            work( invt+lijp1-1+( bindex+1 )*ldv+nqm1-1 ) = z_zero

*

*

            IF( myrow.EQ.mycol ) THEN

               IF( ltnm1.GT.1 ) THEN

                  CALL ztrmvt( 'L', ltnm1-1,

     $                         a( ltlip1+1+( lijp1-1 )*lda ), lda,

     $                         work( invt+lijp1-1+( bindex+1 )*ldv ), 1,

     $                         work( inh+ltlip1+1-1+( bindex+1 )*ldv ),

     $                         1, work( inv+ltlip1+1-1+( bindex+1 )*

     $                         ldv ), 1, work( inht+lijp1-1+( bindex+

     $                         1 )*ldv ), 1 )

               END IF

               DO 100 i = 1, ltnm1

                  work( invt+lijp1+i-1-1+( bindex+1 )*ldv )

     $               = work( invt+lijp1+i-1-1+( bindex+1 )*ldv ) +

     $               a( ltlip1+i-1+( lijp1+i-1-1 )*lda )*

     $               work( inh+ltlip1+i-1-1+( bindex+1 )*ldv )

  100          CONTINUE

            ELSE

               IF( ltnm1.GT.0 )

     $            CALL ztrmvt( 'L', ltnm1, a( ltlip1+( lijp1-1 )*lda ),

     $                         lda, work( invt+lijp1-1+( bindex+1 )*

     $                         ldv ), 1, work( inh+ltlip1-1+( bindex+

     $                         1 )*ldv ), 1, work( inv+ltlip1-1+

     $                         ( bindex+1 )*ldv ), 1,

     $                         work( inht+lijp1-1+( bindex+1 )*ldv ),

     $                         1 )

*

            END IF

*

*

*     We take advantage of the fact that:

*     A * sum( B ) = sum ( A * B ) for matrices A,B

*

*     trueA = A + V * HT + H * VT

*     hence:  (trueA)v = Av' + V * HT * v + H * VT * v

*     VT * v = sum_p_in_NPROW ( VTp * v )

*     H * VT * v = H * sum (VTp * v) = sum ( H * VTp * v )

*

*     v = v + V * HT * h + H * VT * h

*

*

*

*     tmp = HT * nh1

            DO 110 i = 1, 2*( bindex+1 )

               work( intmp-1+i ) = 0

  110       CONTINUE

*

            IF( balanced ) THEN

               npset = nprow

               mysetnum = myrow

               rowsperproc = iceil( nqb, npset )

               myfirstrow = min( nqb+1, 1+rowsperproc*mysetnum )

               numrows = min( rowsperproc, nqb-myfirstrow+1 )

*

*

*     tmp = HT * v

*

               CALL zgemv( 'C', numrows, bindex+1, z_one,

     $                     work( inhtb+myfirstrow-1 ), ldv,

     $                     work( inhtb+myfirstrow-1+( bindex+1 )*ldv ),

     $                     1, z_zero, work( intmp ), 1 )

*     tmp2 = VT * v

               CALL zgemv( 'C', numrows, bindex+1, z_one,

     $                     work( invtb+myfirstrow-1 ), ldv,

     $                     work( inhtb+myfirstrow-1+( bindex+1 )*ldv ),

     $                     1, z_zero, work( intmp+bindex+1 ), 1 )

*

*

               CALL zgsum2d( ictxt, 'C', ' ', 2*( bindex+1 ), 1,

     $                       work( intmp ), 2*( bindex+1 ), -1, -1 )

            ELSE

*     tmp = HT * v

*

               CALL zgemv( 'C', nqb, bindex+1, z_one, work( inhtb ),

     $                     ldv, work( inhtb+( bindex+1 )*ldv ), 1,

     $                     z_zero, work( intmp ), 1 )

*     tmp2 = VT * v

               CALL zgemv( 'C', nqb, bindex+1, z_one, work( invtb ),

     $                     ldv, work( inhtb+( bindex+1 )*ldv ), 1,

     $                     z_zero, work( intmp+bindex+1 ), 1 )

*

            END IF

*

*

*

            IF( balanced ) THEN

               mysetnum = mycol

*

               rowsperproc = iceil( npb, npset )

               myfirstrow = min( npb+1, 1+rowsperproc*mysetnum )

               numrows = min( rowsperproc, npb-myfirstrow+1 )

*

               CALL zgsum2d( ictxt, 'R', ' ', 2*( bindex+1 ), 1,

     $                       work( intmp ), 2*( bindex+1 ), -1, -1 )

*

*

*     v = v - V * tmp   (zgemv is called with alpha = z_negone)

               IF( index.GT.1. ) THEN

                  CALL zgemv( 'N', numrows, bindex+1, z_negone,

     $                        work( invb+myfirstrow-1 ), ldv,

     $                        work( intmp ), 1, z_one,

     $                        work( invb+myfirstrow-1+( bindex+1 )*

     $                        ldv ), 1 )

*

*     v = v - H * tmp2   (zgemv is called with alpha = z_negone)

                  CALL zgemv( 'N', numrows, bindex+1, z_negone,

     $                        work( inhb+myfirstrow-1 ), ldv,

     $                        work( intmp+bindex+1 ), 1, z_one,

     $                        work( invb+myfirstrow-1+( bindex+1 )*

     $                        ldv ), 1 )

               END IF

*

            ELSE

*     v = v - V * tmp   (zgemv is called with alpha = z_negone)

               CALL zgemv( 'N', npb, bindex+1, z_negone, work( invb ),

     $                     ldv, work( intmp ), 1, z_one,

     $                     work( invb+( bindex+1 )*ldv ), 1 )

*

*

*     v = v - H * tmp2   (zgemv is called with alpha = z_negone)

               CALL zgemv( 'N', npb, bindex+1, z_negone, work( inhb ),

     $                     ldv, work( intmp+bindex+1 ), 1, z_one,

     $                     work( invb+( bindex+1 )*ldv ), 1 )

*

            END IF

*

*

*     Transpose NV and add it back into NVT

*

            IF( myrow.EQ.mycol ) THEN

               DO 120 i = 0, nqm1 - 1

                  work( intmp+i ) = work( invt+lijp1-1+( bindex+1 )*ldv+

     $                              i )

  120          CONTINUE

            ELSE

               CALL zgesd2d( ictxt, nqm1, 1,

     $                       work( invt+lijp1-1+( bindex+1 )*ldv ),

     $                       nqm1, mycol, myrow )

               CALL zgerv2d( ictxt, npm1, 1, work( intmp ), npm1, mycol,

     $                       myrow )

*

            END IF

            DO 130 i = 0, npm1 - 1

               work( inv+liip1-1+( bindex+1 )*ldv+i ) = work( inv+liip1-

     $            1+( bindex+1 )*ldv+i ) + work( intmp+i )

  130       CONTINUE

*

*     Sum-to-one NV rowwise (within a row)

*

            CALL zgsum2d( ictxt, 'R', ' ', npm1, 1,

     $                    work( inv+liip1-1+( bindex+1 )*ldv ), npm1,

     $                    myrow, nxtcol )

*

*

*     Dot product c = NV' * NH  (NV is conjugated)

*     Sum-to-all c within next processor column

*

*

            IF( mycol.EQ.nxtcol ) THEN

               cc( 1 ) = z_zero

               DO 140 i = 0, npm1 - 1

                  cc( 1 ) = cc( 1 ) + dconjg( work( inv+liip1-1+

     $                      ( bindex+1 )*ldv+i ) )*

     $                      work( inh+liip1-1+( bindex+1 )*ldv+i )

  140          CONTINUE

               IF( myrow.EQ.nxtrow ) THEN

                  cc( 2 ) = work( inv+liip1-1+( bindex+1 )*ldv )

                  cc( 3 ) = work( inh+liip1-1+( bindex+1 )*ldv )

               ELSE

                  cc( 2 ) = z_zero

                  cc( 3 ) = z_zero

               END IF

               CALL zgsum2d( ictxt, 'C', ' ', 3, 1, cc, 3, -1, nxtcol )

*

               topv = cc( 2 )

               c = cc( 1 )

               toph = cc( 3 )

*

               topnv = toptau*( topv-c*dconjg( toptau ) / 2*toph )

*

*

*     Compute V = Tau * (V - C * Tau' / 2 * H )

*

*

               DO 150 i = 0, npm1 - 1

                  work( inv+liip1-1+( bindex+1 )*ldv+i ) = toptau*

     $               ( work( inv+liip1-1+( bindex+1 )*ldv+i )-c*

     $               dconjg( toptau ) / 2*work( inh+liip1-1+( bindex+

     $               1 )*ldv+i ) )

  150          CONTINUE

*

            END IF

*

*

  160    CONTINUE

*

*

*     Perform the rank2k update

*

         IF( maxindex.LT.n ) THEN

*

            DO 170 i = 0, npm1 - 1

               work( intmp+i ) = work( inh+liip1-1+anb*ldv+i )

  170       CONTINUE

*

*

*

            IF( .NOT.twogemms ) THEN

               IF( interleave ) THEN

                  ldzg = ldv / 2

               ELSE

                  CALL zlamov( 'A', ltnm1, anb, work( inht+lijp1-1 ),

     $                         ldv, work( invt+lijp1-1+anb*ldv ), ldv )

*

                  CALL zlamov( 'A', ltnm1, anb, work( inv+ltlip1-1 ),

     $                         ldv, work( inh+ltlip1-1+anb*ldv ), ldv )

                  ldzg = ldv

               END IF

               nbzg = anb*2

            ELSE

               ldzg = ldv

               nbzg = anb

            END IF

*

*

            DO 180 pbmin = 1, ltnm1, pnb

*

               pbsize = min( pnb, ltnm1-pbmin+1 )

               pbmax = min( ltnm1, pbmin+pnb-1 )

               CALL zgemm( 'N', 'C', pbsize, pbmax, nbzg, z_negone,

     $                     work( inh+ltlip1-1+pbmin-1 ), ldzg,

     $                     work( invt+lijp1-1 ), ldzg, z_one,

     $                     a( ltlip1+pbmin-1+( lijp1-1 )*lda ), lda )

               IF( twogemms ) THEN

                  CALL zgemm( 'N', 'C', pbsize, pbmax, anb, z_negone,

     $                        work( inv+ltlip1-1+pbmin-1 ), ldzg,

     $                        work( inht+lijp1-1 ), ldzg, z_one,

     $                        a( ltlip1+pbmin-1+( lijp1-1 )*lda ), lda )

               END IF

  180       CONTINUE

*

*

*

            DO 190 i = 0, npm1 - 1

               work( inv+liip1-1+i ) = work( inv+liip1-1+anb*ldv+i )

               work( inh+liip1-1+i ) = work( intmp+i )

  190       CONTINUE

            DO 200 i = 0, nqm1 - 1

               work( inht+lijp1-1+i ) = work( inht+lijp1-1+anb*ldv+i )

  200       CONTINUE

*

*

         END IF

*

*     End of the update A code

*

  210 CONTINUE

*

      IF( mycol.EQ.nxtcol ) THEN

         IF( myrow.EQ.nxtrow ) THEN

*

            d( nq ) = dble( a( np+( nq-1 )*lda ) )

            a( np+( nq-1 )*lda ) = d( nq )

*

            CALL dgebs2d( ictxt, 'C', ' ', 1, 1, d( nq ), 1 )

         ELSE

            CALL dgebr2d( ictxt, 'C', ' ', 1, 1, d( nq ), 1, nxtrow,

     $                    nxtcol )

         END IF

      END IF

*

*

*

*

      work( 1 ) = dcmplx( lwmin )

      RETURN

*

*     End of PZHETTRD

*

*


      END

chk1mat
subroutine chk1mat(ma, mapos0, na, napos0, ia, ja, desca, descapos0, info)
Definition chk1mat.f:3

max
#define max(A, B)
Definition pcgemr.c:180

min
#define min(A, B)
Definition pcgemr.c:181

pchk1mat
subroutine pchk1mat(ma, mapos0, na, napos0, ia, ja, desca, descapos0, nextra, ex, expos, info)
Definition pchkxmat.f:3

dcombnrm2
subroutine dcombnrm2(x, y)
Definition pdtreecomb.f:307

pdtreecomb
subroutine pdtreecomb(ictxt, scope, n, mine, rdest0, cdest0, subptr)
Definition pdtreecomb.f:3

pxerbla
subroutine pxerbla(ictxt, srname, info)
Definition pxerbla.f:2

pzhettrd
subroutine pzhettrd(uplo, n, a, ia, ja, desca, d, e, tau, work, lwork, info)
Definition pzhettrd.f:3

ztrmvt
subroutine ztrmvt(uplo, n, t, ldt, x, incx, y, incy, w, incw, z, incz)
Definition ztrmvt.f:3