ScaLAPACK 2.1
ScaLAPACK: Scalable Linear Algebra PACKage
pclatran.f
      SUBROUTINE PCLATRAN( N, NB, A, IA, JA, DESCA, WORK )
*
*  -- ScaLAPACK auxiliary routine (version 1.7) --
*     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
*     and University of California, Berkeley.
*     October 15, 1999
*
*     .. Scalar Arguments ..
      INTEGER            IA, JA, N, NB
*     ..
*     .. Array Arguments ..
      INTEGER            DESCA( * )
      COMPLEX            A( * ), WORK( * )
*     ..
*
*  Purpose
*  =======
*
*  PCLATRAN transposes a lower triangular matrix onto the upper
*  triangular portion of the same matrix.
*
*  This is an auxiliary routine called by PCHETRD.
*
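*  In terms of the global matrix entries this amounts to the update
*  (an illustrative restatement, not part of the original interface):
*
*     A( I, J ) = CONJG( A( J, I ) ),   for all 1 <= I < J <= N
*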
*  Notes
*  =====
*
*  IA must equal 1
*  JA must equal 1
*  DESCA( MB_ ) must equal 1
*  DESCA( NB_ ) must equal 1
*  DESCA( RSRC_ ) must equal 0
*  DESCA( CSRC_ ) must equal 0
*
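*  As an illustrative sketch only (hypothetical names: a square
*  process grid in context ICTXT, and a local leading dimension LLDA
*  of at least NUMROC( N, 1, MYROW, 0, NPROW )), a descriptor meeting
*  these requirements could be built with:
*
*     CALL DESCINIT( DESCA, N, N, 1, 1, 0, 0, ICTXT, LLDA, INFO )
*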
*
*  Arguments
*  =========
*
*  N       (global input) INTEGER
*          The size of the matrix to be transposed.
*
*  NB      (global input) INTEGER
*          The number of rows and columns to be transposed with each
*          message sent.  NB has no impact on the result; it is
*          strictly a performance tuning parameter.
*
*  A       (local input/local output) COMPLEX pointer into the
*          local memory to an array of dimension (LLD_A,LOCc(JA+N-1)).
*          On entry, this array contains the local pieces of the
*          Hermitian distributed matrix sub( A ).  On entry, the
*          leading N-by-N lower triangular part of sub( A ) contains
*          the lower triangular part of the matrix.  On exit, the
*          strictly upper triangular part of sub( A ) is overwritten
*          with the conjugate transpose of the strictly lower
*          triangular part; the lower triangular part is unchanged.
*
*  IA      (global input) INTEGER
*          A's global row index, which points to the beginning of the
*          submatrix which is to be operated on.
*          Must be equal to 1.
*
*  JA      (global input) INTEGER
*          A's global column index, which points to the beginning of
*          the submatrix which is to be operated on.
*          Must be equal to 1.
*
*  DESCA   (global and local input) INTEGER array of dimension DLEN_.
*          The array descriptor for the distributed matrix A.
*          DESCA( MB_ ) must equal 1.
*          DESCA( NB_ ) must equal 1.
*          DESCA( CTXT_ ) must point to a square process grid,
*          i.e. one where NPROW is equal to NPCOL.
*
*  WORK    (local workspace) COMPLEX array, dimension ( LWORK )
*
*          Where:
*          LWORK >= NB * NUMROC( N, 1, 0, 0, NPROW )
*
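*          For example, a caller could size WORK as follows (an
*          illustrative sketch; NPROW is taken from the grid attached
*          to DESCA( CTXT_ )):
*
*             CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL,
*            $                     MYROW, MYCOL )
*             LWORK = NB * NUMROC( N, 1, 0, 0, NPROW )
*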
*  =====================================================================
*
*     .. Parameters ..
      INTEGER            BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_,
     $                   MB_, NB_, RSRC_, CSRC_, LLD_
      PARAMETER          ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
     $                   CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
     $                   RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
*     ..
*     .. Local Scalars ..
      INTEGER            I, ICTXT, IRECV, ISEND, J, JJ, JRECV, JSEND,
     $                   LDA, MAXIRECV, MAXISEND, MAXJRECV, MAXJSEND,
     $                   MINIRECV, MINISEND, MINJRECV, MINJSEND, MYCOL,
     $                   MYROW, NP, NPCOL, NPROW, NQ, RECVNB, SENDNB,
     $                   STARTCOL, STARTROW
*     ..
*     .. External Subroutines ..
      EXTERNAL           BLACS_GRIDINFO, CTRRV2D, CTRSD2D
*     ..
*     .. External Functions ..
      INTEGER            NUMROC
      EXTERNAL           NUMROC
*     ..
*     .. Intrinsic Functions ..
      INTRINSIC          CONJG, MAX, MIN
*     ..
*     .. Executable Statements ..
*     This is just to keep ftnchek and toolpack/1 happy
      IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_*
     $    RSRC_.LT.0 )RETURN
*
*     Further details
*     ===============
*
*     Because the process grid is square, each process needs only send
*     data to its transpose process.  (Likewise, it needs only receive
*     data from its transpose process.)  Because the data decomposition
*     is cyclic, the local portion of the array is triangular.
*
*     This routine requires that the data be buffered (i.e. copied)
*     on the sending process (because of the triangular shape) and
*     unbuffered on the receiving process.  Hence, two local memory-to-
*     memory copies are performed within the communications routines,
*     followed by a memory-to-memory copy outside of the communications
*     routines.  It would be nice to avoid the back-to-back memory-to-
*     memory copies (as we do presently on the receiving process).
*     This could be done by packing the data ourselves on the sender
*     and then unpacking it directly into the matrix.  However, this
*     code seems cleaner, and since this routine is not a significant
*     performance bottleneck, we have left it this way.
*
*     Quick return if possible
*
      IF( N.LE.0 )
     $   RETURN
*
      ICTXT = DESCA( CTXT_ )
      LDA = DESCA( LLD_ )
      CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
*
      NP = NUMROC( N, 1, MYROW, 0, NPROW )
      NQ = NUMROC( N, 1, MYCOL, 0, NPCOL )
*
      IF( MYROW.EQ.MYCOL ) THEN
*
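*        A diagonal process owns the transpose of its own local data,
*        so the conjugate transpose can be formed in place with no
*        communication.
*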
         DO 20 J = 1, NP
            DO 10 I = J + 1, NQ
               A( J+( I-1 )*LDA ) = CONJG( A( I+( J-1 )*LDA ) )
   10       CONTINUE
   20    CONTINUE
*
      ELSE
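*
*        Off the diagonal, pick the starting offsets of the local
*        trapezoids so that the pieces exchanged by transpose partner
*        processes line up.
*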
         IF( MYROW.GT.MYCOL ) THEN
            STARTROW = 1
            STARTCOL = 2
         ELSE
            STARTROW = 2
            STARTCOL = 1
         END IF
*
         DO 50 JJ = 1, MAX( NP, NQ ), NB
            MINJSEND = STARTCOL + JJ - 1
            MINJRECV = STARTROW + JJ - 1
            MAXJSEND = MIN( MINJSEND+NB-1, NQ )
            MAXJRECV = MIN( MINJRECV+NB-1, NP )
*
            SENDNB = MAXJSEND - MINJSEND + 1
            RECVNB = MAXJRECV - MINJRECV + 1
*
            MINISEND = 1
            MINIRECV = 1
            MAXISEND = MIN( NP, JJ+SENDNB-1 )
            MAXIRECV = MIN( NQ, JJ+RECVNB-1 )
*
            ISEND = MAXISEND - MINISEND + 1
            IRECV = MAXIRECV - MINIRECV + 1
            JSEND = MAXJSEND - MINJSEND + 1
            JRECV = MAXJRECV - MINJRECV + 1
*
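*           Pack (and conjugate) the trapezoidal piece to be sent into
*           WORK; the triangular shape of the local data is why it is
*           buffered before sending.
*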
            DO 40 J = MINJRECV, MAXJRECV
               DO 30 I = MINIRECV, MAXIRECV + J - MAXJRECV
                  WORK( I+( J-MINJRECV )*IRECV )
     $               = CONJG( A( J+( I-1 )*LDA ) )
   30          CONTINUE
   40       CONTINUE
*
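*           Exchange with the transpose partner at grid coordinates
*           ( MYCOL, MYROW ): send the packed buffer, then receive the
*           partner's block directly into A.
*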
            IF( IRECV.GT.0 .AND. JRECV.GT.0 )
     $         CALL CTRSD2D( ICTXT, 'U', 'N', IRECV, JRECV, WORK,
     $                       IRECV, MYCOL, MYROW )
*
            IF( ISEND.GT.0 .AND. JSEND.GT.0 )
     $         CALL CTRRV2D( ICTXT, 'U', 'N', ISEND, JSEND,
     $                       A( MINISEND+( MINJSEND-1 )*LDA ), LDA,
     $                       MYCOL, MYROW )
*
   50    CONTINUE
*
      END IF
*
      RETURN
*
*     End of PCLATRAN
*
      END