SUBROUTINE PCLATRAN( N, NB, A, IA, JA, DESCA, WORK )
*
*  -- ScaLAPACK auxiliary routine (version 1.7) --
*     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
*     and University of California, Berkeley.
*     October 15, 1999
*
*     .. Scalar Arguments ..
      INTEGER            IA, JA, N, NB
*     ..
*     .. Array Arguments ..
      INTEGER            DESCA( * )
      COMPLEX            A( * ), WORK( * )
*     ..
*
*  Purpose
*
*  =======
*
*  PCLATRAN transpose a lower triangular matrix on to the upper
*  triangular portion of the same matrix.
*
*  This is an auxiliary routine called by PCHETRD.
*
*  Notes
*  =====
*
*  IA must equal 1
*  JA must equal 1
*  DESCA( MB_ ) must equal 1
*  DESCA( NB_ ) must equal 1
*  DESCA( RSRC_ ) must equal 1
*  DESCA( CSRC_ ) must equal 1
*
*
*  Arguments
*  =========
*
*  N       (global input) INTEGER
*          The size of the matrix to be transposed.
*
*  NB      (global input) INTEGER
*          The number of rows and columns to be transposed with each
*          message sent.  NB has no impact on the result, it is striclty
*          a performance tuning parameter.
*
*  A       (local input/local output) COMPLEX*16 pointer into the
*          local memory to an array of dimension (LLD_A,LOCc(JA+N-1)).
*          On entry, this array contains the local pieces of the
*          Hermitian distributed matrix sub( A ).  On entry, the
*          leading N-by-N upper triangular part of sub( A ) contains
*          the upper triangular part of the matrix. On exit, the
*          leading N-by-N lower triangular part of sub( A ) contains the
*          lower triangular part of the matrix, and its strictly upper
*          triangular part is undefined (and may have been modified).
*
*  IA      (global input) INTEGER
*          A's global row index, which points to the beginning of the
*          submatrix which is to be operated on.
*          Must be equal to 1.
*
*  JA      (global input) INTEGER
*          A's global column index, which points to the beginning of
*          the submatrix which is to be operated on.
*          Must be equal to 1.
*
*  DESCA   (global and local input) INTEGER array of dimension DLEN_.
*          The array descriptor for the distributed matrix A.
*          DESCA( MB_ ) must equal 1
*          DESCA( NB_ ) must equal 1
*          DESCA( ICTXT_ ) must point to a square process grid
*            i.e. one where NPROW is equal to NPCOL
*
*  WORK    (local workspace) COMPLEX*16 array, dimension ( LWORK )
*
*          Where:
*          LWORK >= NB * NUMROC( N, 1, 0, 0, NPROW )
*
*  =====================================================================
*
*     .. Parameters ..
      INTEGER            BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_,
     $                   MB_, NB_, RSRC_, CSRC_, LLD_
      PARAMETER          ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
     $                   CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
     $                   RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
*     ..
*     .. Local Scalars ..
      INTEGER            I, ICTXT, IRECV, ISEND, J, JJ, JRECV, JSEND,
     $                   LDA, MAXIRECV, MAXISEND, MAXJRECV, MAXJSEND,
     $                   MINIRECV, MINISEND, MINJRECV, MINJSEND, MYCOL,
     $                   MYROW, NP, NPCOL, NPROW, NQ, RECVNB, SENDNB,
     $                   STARTCOL, STARTROW
*     ..
*     .. External Subroutines ..
      EXTERNAL           BLACS_GRIDINFO, CTRRV2D, CTRSD2D
*     ..
*     .. External Functions ..
      INTEGER            NUMROC
      EXTERNAL           NUMROC
*     ..
*     .. Intrinsic Functions ..
      INTRINSIC          CONJG, MAX, MIN
*     ..
*     .. Executable Statements ..
*       This is just to keep ftnchek and toolpack/1 happy
      IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_*
     $    RSRC_.LT.0 )RETURN
*
*     Further details
*
*     Because the processor grid is square each process needs only send
*     data to its transpose process.  (Likewsie it need only receive
*     data from its transpose process.)  Because the data decomposition
*     is cyclic, the local portion of the array is triangular.
*
*     This routine requires that the data be buffered (i.e. copied)
*     on the sending process (because of the triangular shape) and
*     unbuffered on the receiving process.  Hence, two local memory to
*     memory copies are performed within the communications routines
*     followed by a memory to memory copy outside of the communications
*     routines.  It would be nice to avoid having back to back memory
*     to memory copies (as we do presently on the receiving processor).
*     This could be done by packaging the data ourselves in the sender
*     and then unpacking it directly into the matrix.  However, this
*     code seems cleaner and so since this routine is not a significant
*     performance bottleneck we have left it this way.
*
*
*
*
*     Quick return if possible
*
      IF( N.LE.0 )
     $   RETURN
*
      ICTXT = DESCA( CTXT_ )
      LDA = DESCA( LLD_ )
      CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
*
*
      NP = NUMROC( N, 1, MYROW, 0, NPROW )
      NQ = NUMROC( N, 1, MYCOL, 0, NPCOL )
*
*
      IF( MYROW.EQ.MYCOL ) THEN
*
         DO 20 J = 1, NP
            DO 10 I = J + 1, NQ
               A( J+( I-1 )*LDA ) = CONJG( A( I+( J-1 )*LDA ) )
   10       CONTINUE
   20    CONTINUE
*
      ELSE
         IF( MYROW.GT.MYCOL ) THEN
            STARTROW = 1
            STARTCOL = 2
         ELSE
            IF( MYROW.EQ.MYCOL ) THEN
               STARTROW = 2
               STARTCOL = 2
            ELSE
               STARTROW = 2
               STARTCOL = 1
            END IF
         END IF
*
         DO 50 JJ = 1, MAX( NP, NQ ), NB
            MINJSEND = STARTCOL + JJ - 1
            MINJRECV = STARTROW + JJ - 1
            MAXJSEND = MIN( MINJSEND+NB-1, NQ )
            MAXJRECV = MIN( MINJRECV+NB-1, NP )
*
            SENDNB = MAXJSEND - MINJSEND + 1
            RECVNB = MAXJRECV - MINJRECV + 1
*
            MINISEND = 1
            MINIRECV = 1
            MAXISEND = MIN( NP, JJ+SENDNB-1 )
            MAXIRECV = MIN( NQ, JJ+RECVNB-1 )
*
            ISEND = MAXISEND - MINISEND + 1
            IRECV = MAXIRECV - MINIRECV + 1
            JSEND = MAXJSEND - MINJSEND + 1
            JRECV = MAXJRECV - MINJRECV + 1
*
*
*
            DO 40 J = MINJRECV, MAXJRECV
               DO 30 I = MINIRECV, MAXIRECV + J - MAXJRECV
                  WORK( I+( J-MINJRECV )*IRECV )
     $               = CONJG( A( J+( I-1 )*LDA ) )
   30          CONTINUE
   40       CONTINUE
*
            IF( IRECV.GT.0 .AND. JRECV.GT.0 )
     $         CALL CTRSD2D( ICTXT, 'U', 'N', IRECV, JRECV, WORK, IRECV,
     $                       MYCOL, MYROW )
*
            IF( ISEND.GT.0 .AND. JSEND.GT.0 )
     $         CALL CTRRV2D( ICTXT, 'U', 'N', ISEND, JSEND,
     $                       A( MINISEND+( MINJSEND-1 )*LDA ), LDA,
     $                       MYCOL, MYROW )
*
*
   50    CONTINUE
*
      END IF
*
      RETURN
*
*     End of PCLATRD
*
      END