ScaLAPACK 2.1
ScaLAPACK: Scalable Linear Algebra PACKage
pcrow2col.f
      SUBROUTINE PCROW2COL( ICTXT, M, N, NB, VS, LDVS, VD, LDVD,
     $                      RSRC, CSRC, RDEST, CDEST, WORK )
*
*  -- ScaLAPACK tools routine (version 1.7) --
*     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
*     and University of California, Berkeley.
*     May 1, 1997
*
*     .. Scalar Arguments ..
      INTEGER            CDEST, CSRC, ICTXT, LDVD, LDVS, M, N, NB,
     $                   RDEST, RSRC
*     ..
*     .. Array Arguments ..
      COMPLEX            VD( LDVD, * ), VS( LDVS, * ), WORK( * )
*     ..
*
*  Purpose
*  =======
*
*  Take a block of vectors with M total rows which are distributed over
*  a row of processes, and distribute those rows over a column of
*  processes.  This routine minimizes communication by sending all
*  information it has that a given process in process column CDEST
*  needs in a single message.  To do this it uses the least common
*  multiple (LCM) concept: if I have part of a vector split over a
*  process row consisting of Q processes, and I want to send all of
*  that vector that I own to a new vector distributed over P processes
*  within a process column, then after I find the process in column
*  CDEST that owns the row of the vector I am currently looking at,
*  that process will also want every ( LCM(P,Q)/Q )-th block of my
*  vector (each block being of size NB x N).
*
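*  As an illustration of the LCM concept (the numbers below are chosen
*  only for exposition and are not part of the interface): with P = 2
*  process rows and Q = 3 process columns, LCM(P,Q) = 6, so a source
*  process sends every ( 6/3 ) = 2nd one of its local NB x N blocks to
*  a given destination process, and therefore talks to LCM/Q = 2
*  distinct destinations, while each destination receives from
*  LCM/P = 3 distinct sources.
*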
*  Arguments
*  =========
*
*  Rem:  MP, resp. MQ, denotes the number of local rows necessary to
*  ====  store a global vector of dimension M distributed over the P
*        processes of a column, resp. the Q processes of a row.
*
*  ICTXT   (global input) INTEGER
*          The BLACS context handle, indicating the global context of
*          the operation. The context itself is global.
*
*  M       (global input) INTEGER
*          The number of global rows each vector has.
*
*  N       (global input) INTEGER
*          The number of vectors in the vector block.
*
*  NB      (global input) INTEGER
*          The blocking factor used to divide the rows of the vector
*          amongst the processes of a row.
*
*  VS      (local input) COMPLEX
*          Array of dimension (LDVS,N), the block of vectors stored on
*          process row RSRC to be copied into VD on process column
*          CDEST.
*
*  LDVS    (local input) INTEGER
*          The leading dimension of VS.
*
*  VD      (local output) COMPLEX
*          Array of dimension (LDVD,N).  On output, on process column
*          CDEST, it contains the redistributed block of vectors VS.
*
*  LDVD    (local input) INTEGER
*          The leading dimension of VD.
*
*  RSRC    (global input) INTEGER
*          The process row VS is distributed over.
*
*  CSRC    (global input) INTEGER
*          The process column the distributed block of vectors VS
*          begins on.
*
*  RDEST   (global input) INTEGER
*          The process row that VD begins on.
*
*  CDEST   (global input) INTEGER
*          The process column to distribute VD over.
*
*  WORK    (local workspace) COMPLEX
*          Array, dimension (LDW).  The required size of WORK varies:
*          if( NPROW.EQ.NPCOL ) then
*             LDW = 0; WORK not accessed.
*          else
*             LCM = least common multiple of the number of process
*                   rows and the number of process columns.
*             MQ  = number of rows of VS owned by my process.
*             NPCOL = number of process columns.
*             CEIL( ) = the ceiling of the given quotient.
*             LDW = NB*N*CEIL( CEIL( MQ/NB ) / ( LCM/NPCOL ) )
*          end if
*
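*  As a worked instance of the LDW formula above (the values are
*  hypothetical and only meant to show the arithmetic): on a 2 x 3
*  process grid, LCM = 6 and LCM/NPCOL = 2; with M = 100, NB = 8 and
*  N = 4, the process owning MQ = 36 local rows of VS needs
*             LDW = 8*4*CEIL( CEIL( 36/8 ) / 2 )
*                 = 8*4*CEIL( 5/2 ) = 8*4*3 = 96.
*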
*  =====================================================================
*
*     .. Local Scalars ..
      INTEGER            CBLKSKIP, ICPY, II, ISTART, ICSRC, IRDEST, JB,
     $                   JJ, K, LCM, MP, MQ, MYCOL, MYDIST, MYROW,
     $                   NBLOCKS, NPCOL, NPROW, RBLKSKIP
*     ..
*     .. External Subroutines ..
      EXTERNAL           BLACS_GRIDINFO, CGESD2D, CGERV2D, CLACPY
*     ..
*     .. External Functions ..
      INTEGER            ILCM, NUMROC
      EXTERNAL           ILCM, NUMROC
*     ..
*     .. Executable Statements ..
*
      ICPY = 0
*
      CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
*
*     If we are not in the special case NPROW = NPCOL, for which no
*     workspace or extra copying is required
*
      IF( NPROW.NE.NPCOL ) THEN
         LCM = ILCM( NPROW, NPCOL )
         RBLKSKIP = LCM / NPCOL
         CBLKSKIP = LCM / NPROW
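*
*        For instance (illustrative values only): on a 2 x 3 grid,
*        LCM = 6, so RBLKSKIP = 6/3 = 2 destination process rows are
*        addressed by each source, and CBLKSKIP = 6/2 = 3 source
*        process columns feed each destination.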
*
*        If I have part of VS, the source vector(s)
*
         IF( MYROW.EQ.RSRC ) THEN
*
            ISTART = 1
*
*           Figure my distance from CSRC: the process in CDEST the
*           same distance from RDEST will want my first block
*
            MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL )
            MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL )
            IRDEST = MOD( RDEST+MYDIST, NPROW )
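*
*           As a hypothetical example of the mapping above: with
*           NPROW = 2, NPCOL = 3, CSRC = 0 and RDEST = 0, the process
*           in column MYCOL = 2 has MYDIST = 2 and therefore sends its
*           first block to process row IRDEST = MOD( 0+2, 2 ) = 0.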
*
*           Loop over all possible destination processes
*
            DO 20 K = 1, RBLKSKIP
               JJ = 1
*
*              If I am not the destination process
*
               IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN
*
*                 Pack all data I own that the destination needs
*
                  DO 10 II = ISTART, MQ, NB*RBLKSKIP
                     JB = MIN( NB, MQ-II+1 )
                     CALL CLACPY( 'G', JB, N, VS(II,1), LDVS,
     $                            WORK(JJ), JB )
                     JJ = JJ + NB*N
   10             CONTINUE
*
*                 Figure how many rows are to be sent and send them if
*                 necessary.  NOTE: extra rows will be sent if NB > JB
*
                  JJ = JJ - 1
                  IF( JJ.GT.0 )
     $               CALL CGESD2D( ICTXT, JJ, 1, WORK, JJ, IRDEST,
     $                             CDEST )
*
*                 I am both source and destination, save where to start
*                 copying from for later use
*
               ELSE
                  ICPY = ISTART
               END IF
*
               ISTART = ISTART + NB
               IRDEST = MOD( IRDEST+NPCOL, NPROW )
   20       CONTINUE
         END IF
*
*        If I should receive info into VD
*
         IF( MYCOL.EQ.CDEST ) THEN
*
            ISTART = 1
*
*           Figure my distance from RDEST: the process in RSRC the
*           same distance from CSRC will have my first block
*
            MYDIST = MOD( NPROW+MYROW-RDEST, NPROW )
            MP = NUMROC( M, NB, MYROW, RDEST, NPROW )
            ICSRC = MOD( CSRC+MYDIST, NPCOL )
*
*           Loop over all sending processes
*
            DO 50 K = 1, CBLKSKIP
*
*              If I don't already possess the required data
*
               IF( (MYROW.NE.RSRC).OR.(MYCOL.NE.ICSRC) ) THEN
*
*                 Figure how many rows to receive, and receive them.
*                 NOTE: may receive too much -- NB instead of JB
*
                  NBLOCKS = (MP - ISTART + NB) / NB
                  JJ = ((NBLOCKS+CBLKSKIP-1) / CBLKSKIP)*NB
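*
*                 (Illustrative arithmetic only: with MP = 36,
*                 ISTART = 1, NB = 8 and CBLKSKIP = 3, NBLOCKS = 5 and
*                 JJ = CEIL( 5/3 )*8 = 16 rows are received from this
*                 sender.)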
                  IF( JJ.GT.0 )
     $               CALL CGERV2D( ICTXT, JJ, N, WORK, JJ, RSRC,
     $                             ICSRC )
*
*                 Copy data to destination vector
*
                  JJ = 1
                  DO 30 II = ISTART, MP, NB*CBLKSKIP
                     JB = MIN( NB, MP-II+1 )
                     CALL CLACPY( 'G', JB, N, WORK(JJ), JB, VD(II,1),
     $                            LDVD )
                     JJ = JJ + NB*N
   30             CONTINUE
*
*              If I am both source and destination
*
               ELSE
                  JJ = ICPY
                  DO 40 II = ISTART, MP, NB*CBLKSKIP
                     JB = MIN( NB, MP-II+1 )
                     CALL CLACPY( 'G', JB, N, VS(JJ,1), LDVS, VD(II,1),
     $                            LDVD )
                     JJ = JJ + NB*RBLKSKIP
   40             CONTINUE
               END IF
               ISTART = ISTART + NB
               ICSRC = MOD( ICSRC+NPROW, NPCOL )
   50       CONTINUE
         END IF
*
*     If NPROW = NPCOL, there is a one-to-one correspondence between
*     process rows and columns, so no workspace or packing is required
*
      ELSE
*
         IF( MYROW.EQ.RSRC ) THEN
*
*           Figure my distance from CSRC: the process in CDEST the
*           same distance from RDEST will want my piece of the vector
*
            MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL )
            MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL )
            IRDEST = MOD( RDEST+MYDIST, NPROW )
            IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN
               CALL CGESD2D( ICTXT, MQ, N, VS, LDVS, IRDEST, CDEST )
            ELSE
               CALL CLACPY( 'G', MQ, N, VS, LDVS, VD, LDVD )
            END IF
         END IF
         IF( MYCOL.EQ.CDEST ) THEN
*
*           Figure my distance from RDEST: the process in RSRC the
*           same distance from CSRC will have my piece of the vector
*
            MYDIST = MOD( NPROW+MYROW-RDEST, NPROW )
            MP = NUMROC( M, NB, MYROW, RDEST, NPROW )
            ICSRC = MOD( CSRC+MYDIST, NPCOL )
            IF( (MYCOL.NE.ICSRC).OR.(MYROW.NE.RSRC) )
     $         CALL CGERV2D( ICTXT, MP, N, VD, LDVD, RSRC, ICSRC )
         END IF
      END IF
*
      RETURN
*
*     End of PCROW2COL
*
      END