program example3 implicit none * * simple example to show how to generate a scalapack matrix * contribution from Ed d'Azevedo, ORNL, 2005 * integer BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ parameter ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * integer descA(DLEN_) integer descAtmp(DLEN_) * integer lwork parameter(lwork=10*1000) double precision work(lwork) * integer maxproc parameter(maxproc=8*1024) * integer iabegin(0:maxproc) integer ialast(0:maxproc) integer ia_begin,ia_last,ia_size integer local_ia, local_ja * integer krow parameter(krow=10) integer Atmpsize parameter(Atmpsize=4*1000*1000) * integer Asize parameter(Asize=32*1000*1000) * integer nout parameter(nout=16) * double precision aij double precision A(Asize) double precision Atmp(Atmpsize) * integer m,n,mb,nb integer info, ierr(1) integer iam,nprocs integer icontext, myprow,mypcol,nprow,npcol integer rsrc,csrc,lld,Locp,Locq,Aneed * integer ia,ja,irprnt,icprnt logical isroot, isok * integer prow,pcol logical do_print * integer iastart,iaend integer mm,nn * double precision t1,t2 double precision MPI_Wtime external MPI_Wtime * integer numroc, indxg2p external numroc, indxg2p external descinit * mb = 50 nb = 50 m = 400 n = 400 * * ----------------------- * setup blacs environment * ----------------------- * call blacs_pinfo( iam,nprocs) * do nprow=int( sqrt(real(nprocs)) )+1,1,-1 npcol = nprocs/nprow if (nprow*npcol.eq.nprocs) goto 11 enddo 11 continue * call blacs_get(-1,0,icontext) call blacs_gridinit( icontext, 'Col-major', nprow,npcol) * call blacs_gridinfo( icontext, nprow,npcol, myprow,mypcol) isroot = (myprow.eq.0).and.(mypcol.eq.0) * if (isroot) then write(*,*) 'nprow,npcol ', nprow,npcol write(*,*) 'm,n ', m,n write(*,*) 'mb,nb ', mb,nb endif * * --------------------------------------------------------- * compute local extent and allocate storage for local piece * --------------------------------------------------------- * csrc = 0 rsrc = 0 * Locq = numroc(n,nb,mypcol,csrc,npcol) Locq = max(1,Locq) Locp = numroc(m,mb,myprow,rsrc,nprow) * lld = max(Locp,1) Aneed = lld*Locq isok = (Aneed.le.Asize) if (.not.isok) then if (isroot) then write(*,*) 'increase Asize to ',Aneed + 1 endif goto 999 endif * * ---------------- * setup descriptor * ---------------- * call descinit(descA,m,n,mb,nb,rsrc,csrc,icontext,lld,info) ierr(1) = info call igsum2d( icontext, 'All', ' ',1,1,ierr,1,-1,-1) isok = (info.eq.0) if (.not.isok) then if (isroot) then write(*,*) 'descinit returns info = ',info endif goto 999 endif * * --------------------------------- * setup descriptor and local matrix * --------------------------------- * call descinit(descAtmp,krow,n,krow,n,myprow,mypcol, & icontext,krow,info) ierr(1) = info call igsum2d( icontext, 'All', ' ',1,1,ierr,1,-1,-1) isok = (info.eq.0) if (.not.isok) then if (isroot) then write(*,*) 'descinit(Atmp) returns info = ',info endif goto 999 endif * isok = (krow*n.le.Atmpsize) if (.not.isok) then if (isroot) then write(*,*) 'increase Atmpsize to ',krow*n+1 endif goto 999 endif * * ------------------------------ * better method to setup matrix * take advantage of the block 2D cyclic format * * (ia,ja) are global indices * ------------------------------ * call blacs_barrier( icontext, 'All') t1 = MPI_Wtime() * * ---------------------------------------------- * Each processor generates several complete rows * and then copies into global matrix * ---------------------------------------------- * isok = (nprow*npcol.le.maxproc) if (.not.isok) then if (isroot) then write(*,*) 'increase maxproc to ',nprow*npcol+1 endif goto 999 endif * do iastart=1,descA(M_),krow*nprow*npcol * iaend = min( descA(M_),iastart + krow*nprow*npcol-1) * * ---------------------------- * partition work to processors * ---------------------------- * ia_begin = iastart do pcol=0,npcol-1 do prow=0,nprow-1 ia_last = min(iaend, ia_begin+krow-1) iabegin( prow + pcol*nprow ) = ia_begin ialast( prow + pcol*nprow ) = ia_last * ia_begin = ia_last + 1 enddo enddo * * ------------------------------------------ * generate matrix entries on local processor * ------------------------------------------ * prow = myprow pcol = mypcol ia_begin = iabegin( prow + pcol*nprow ) ia_last = ialast( prow + pcol*nprow ) * do ja=1,descA(N_) do ia=ia_begin,ia_last * call generate_Aij( m,n, ia,ja, aij ) * local_ia = (ia-ia_begin) + 1 local_ja = ja Atmp( local_ia + (local_ja-1)*descAtmp(LLD_) ) = aij enddo enddo * * ----------------------------------------- * copy data to distributed block 2d format * rely on PBLAS for communication * ----------------------------------------- * do pcol=0,npcol-1 do prow=0,nprow-1 ia_begin = iabegin( prow + pcol*nprow ) ia_last = ialast( prow + pcol*nprow ) ia_size = ia_last - ia_begin + 1 * * ----------------- * adjust descriptor * ----------------- * descAtmp(RSRC_) = prow descAtmp(CSRC_) = pcol * mm = ia_size nn = descA(N_) ia = ia_begin ja = 1 call pdgecopy( mm,nn, & Atmp,1,1,descAtmp, & A,ia,ja,descA) enddo enddo enddo * call blacs_barrier( icontext, 'All') t2 = MPI_Wtime() if (isroot) then write(*,*) 'time to build matrix is ',t2-t1,' sec' endif * * -------------------------------------------- * if matrix is not too big, print out content * for debugging * -------------------------------------------- * do_print = ((m*n.le.200*1000).and.(lwork.ge.mb)) if (do_print) then * ia = 1 ja = 1 irprnt = 0 icprnt = 0 call pdlaprnt(m,n,A,ia,ja,descA,irprnt,icprnt,'A',nout,work) * endif * 999 continue * * --------------- * prepare to exit * --------------- * call blacs_barrier(icontext, 'All') call blacs_gridexit( icontext ) call blacs_exit(0) stop end * subroutine generate_Aij( m,n, ia,ja, aij ) implicit none integer m,n, ia,ja double precision aij * aij = dble(ia) + dble(ja-1)*dble(m) return end * subroutine pdgecopy( m,n,A,ia,ja,descA,C,ic,jc,descC ) implicit none integer m,n, ia,ja,descA(*), ic,jc, descC(*) double precision A(*), C(*) * double precision alpha, beta * * PxGEADD is new capability available in PBLAS Version 2 * * PDGEADD adds a matrix to another * * sub( C ) := beta*sub( C ) + alpha*op( sub( A ) ) * * beta = 0.0d0 alpha = 1.0d0 call pdgeadd( 'NoTrans', m,n, alpha, A,ia,ja,descA, & beta, C,ic,jc,descC ) return end * subroutine pdgecopy0( m,n,A,ia,ja,descA,C,ic,jc,descC ) implicit none integer BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ parameter ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * integer m,n, ia,ja,descA(*), ic,jc, descC(*) double precision A(*), C(*) * integer iia,jja, iic,jjc, ix,iy, i,j * * ----------------------------------------------- * if PBLAS version 2 is not available, use slower * PBLAS PxCOPY instead * ----------------------------------------------- * if( m.ge.n) then do j=1,n iia = ia jja = (ja-1) + j iic = ic jjc = (jc-1) + j ix = 1 iy = 1 call pdcopy( m, A, iia,jja, descA, ix, & C, iic,jjc, descC, iy ) enddo else do i=1,m iia = (ia-1) + i jja = ja iic = (ic-1) + i jjc = jc ix = descA(M_) iy = descC(M_) call pdcopy( n, A, iia, jja, descA, ix, & C, iic, jjc, descC, iy ) enddo endif return end *