ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CpswapND.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18 
19 #ifdef __STDC__
20 void PB_CpswapND( PBTYP_T * TYPE, int N,
21  char * X, int IX, int JX, int * DESCX, int INCX,
22  char * Y, int IY, int JY, int * DESCY, int INCY )
23 #else
24 void PB_CpswapND( TYPE, N, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY )
25 /*
26 * .. Scalar Arguments ..
27 */
28  int INCX, INCY, IX, IY, JX, JY, N;
29  PBTYP_T * TYPE;
30 /*
31 * .. Array Arguments ..
32 */
33  int * DESCX, * DESCY;
34  char * X, * Y;
35 #endif
36 {
37 /*
38 * Purpose
39 * =======
40 *
41 * PB_CpswapND swaps two subvectors,
42 *
43 * sub( Y ) := sub( X ) and sub( X ) := sub( Y )
44 *
45 * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
46 * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X,
47 *
48 * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y,
49 * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y.
50 *
51 * sub( X ) is assumed to be not distributed, and sub( Y ) is assumed to
52 * be distributed.
53 *
54 * Notes
55 * =====
56 *
57 * A description vector is associated with each 2D block-cyclicly dis-
58 * tributed matrix. This vector stores the information required to
59 * establish the mapping between a matrix entry and its corresponding
60 * process and memory location.
61 *
62 * In the following comments, the character _ should be read as
63 * "of the distributed matrix". Let A be a generic term for any 2D
64 * block cyclicly distributed matrix. Its description vector is DESC_A:
65 *
66 * NOTATION STORED IN EXPLANATION
67 * ---------------- --------------- ------------------------------------
68 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
69 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
70 * the NPROW x NPCOL BLACS process grid
71 * A is distributed over. The context
72 * itself is global, but the handle
73 * (the integer value) may vary.
74 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
75 * ted matrix A, M_A >= 0.
76 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
77 * buted matrix A, N_A >= 0.
78 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
79 * block of the matrix A, IMB_A > 0.
80 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
81 * left block of the matrix A,
82 * INB_A > 0.
83 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
84 * bute the last M_A-IMB_A rows of A,
85 * MB_A > 0.
86 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
87 * bute the last N_A-INB_A columns of
88 * A, NB_A > 0.
89 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
90 * row of the matrix A is distributed,
91 * NPROW > RSRC_A >= 0.
92 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
93 * first column of A is distributed.
94 * NPCOL > CSRC_A >= 0.
95 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
96 * array storing the local blocks of
97 * the distributed matrix A,
98 * IF( Lc( 1, N_A ) > 0 )
99 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
100 * ELSE
101 * LLD_A >= 1.
102 *
103 * Let K be the number of rows of a matrix A starting at the global in-
104 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
105 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
106 * receive if these K rows were distributed over NPROW processes. If K
107 * is the number of columns of a matrix A starting at the global index
108 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
109 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
110 * these K columns were distributed over NPCOL processes.
111 *
112 * The values of Lr() and Lc() may be determined via a call to the func-
113 * tion PB_Cnumroc:
114 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
115 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
116 *
117 * Arguments
118 * =========
119 *
120 * TYPE (local input) pointer to a PBTYP_T structure
121 * On entry, TYPE is a pointer to a structure of type PBTYP_T,
122 * that contains type information (See pblas.h).
123 *
124 * N (global input) INTEGER
125 * On entry, N specifies the length of the subvectors to be
126 * swapped. N must be at least zero.
127 *
128 * X (local input/local output) pointer to CHAR
129 * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
130 * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and
131 * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least
132 * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise.
133 * Before entry, this array contains the local entries of the
134 * matrix X. On exit, sub( X ) is overwritten with sub( Y ).
135 *
136 * IX (global input) INTEGER
137 * On entry, IX specifies X's global row index, which points to
138 * the beginning of the submatrix sub( X ).
139 *
140 * JX (global input) INTEGER
141 * On entry, JX specifies X's global column index, which points
142 * to the beginning of the submatrix sub( X ).
143 *
144 * DESCX (global and local input) INTEGER array
145 * On entry, DESCX is an integer array of dimension DLEN_. This
146 * is the array descriptor for the matrix X.
147 *
148 * INCX (global input) INTEGER
149 * On entry, INCX specifies the global increment for the
150 * elements of X. Only two values of INCX are supported in
151 * this version, namely 1 and M_X. INCX must not be zero.
152 *
153 * Y (local input/local output) pointer to CHAR
154 * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y
155 * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and
156 * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least
157 * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise.
158 * Before entry, this array contains the local entries of the
159 * matrix Y. On exit, sub( Y ) is overwritten with sub( X ).
160 *
161 * IY (global input) INTEGER
162 * On entry, IY specifies Y's global row index, which points to
163 * the beginning of the submatrix sub( Y ).
164 *
165 * JY (global input) INTEGER
166 * On entry, JY specifies Y's global column index, which points
167 * to the beginning of the submatrix sub( Y ).
168 *
169 * DESCY (global and local input) INTEGER array
170 * On entry, DESCY is an integer array of dimension DLEN_. This
171 * is the array descriptor for the matrix Y.
172 *
173 * INCY (global input) INTEGER
174 * On entry, INCY specifies the global increment for the
175 * elements of Y. Only two values of INCY are supported in
176 * this version, namely 1 and M_Y. INCY must not be zero.
177 *
178 * -- Written on April 1, 1998 by
179 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
180 *
181 * ---------------------------------------------------------------------
182 */
183 /*
184 * .. Local Scalars ..
185 */
186  char scope, * top, * zero;
187  int RRorCC, Xcol, Xii, XisR, XisRow, Xjj, Xld, Xlinc, Xm,
188  XmyprocD, XmyprocR, Xn, XnprocsD, XnprocsR, XprocR, Xroc,
189  Xrow, Ycol, Yii, Yinb1D, YisR, YisRow, Yjj, Yld, Ylinc,
190  YmyprocD, YmyprocR, YnbD, YnpD, YnprocsD, YnprocsR, YprocD,
191  YprocR, Yroc, Yrow, ctxt, ione=1, k, kbb, kk, kn, ktmp, mycol,
192  mydist, myproc, myrow, npcol, nprow, p, size;
193 /*
194 * .. Local Arrays ..
195 */
196  char * buf = NULL;
197 /* ..
198 * .. Executable Statements ..
199 *
200 */
201 /*
202 * Retrieve process grid information
203 */
204  Cblacs_gridinfo( ( ctxt = DESCX[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
205 /*
206 * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol ...
207 */
208  PB_Cinfog2l( IX, JX, DESCX, nprow, npcol, myrow, mycol, &Xii, &Xjj,
209  &Xrow, &Xcol );
210  if( ( XisRow = ( INCX == DESCX[M_] ) ) != 0 )
211  {
212  Xld = DESCX[LLD_]; Xlinc = Xld;
213  XmyprocD = mycol; XnprocsD = npcol;
214  XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow;
215  XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) );
216  }
217  else
218  {
219  Xld = DESCX[LLD_]; Xlinc = 1;
220  XmyprocD = myrow; XnprocsD = nprow;
221  XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol;
222  XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) );
223  }
224 /*
225 * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol ...
226 */
227  PB_Cinfog2l( IY, JY, DESCY, nprow, npcol, myrow, mycol, &Yii, &Yjj,
228  &Yrow, &Ycol );
229  if( ( YisRow = ( INCY == DESCY[M_] ) ) != 0 )
230  {
231  YnbD = DESCY[NB_]; Yld = DESCY[LLD_]; Ylinc = Yld;
232  YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow;
233  YprocD = Ycol; YmyprocD = mycol; YnprocsD = npcol;
234  Yinb1D = PB_Cfirstnb( N, JY, DESCY[INB_], YnbD );
235  }
236  else
237  {
238  YnbD = DESCY[MB_]; Yld = DESCY[LLD_]; Ylinc = 1;
239  YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol;
240  YprocD = Yrow; YmyprocD = myrow; YnprocsD = nprow;
241  Yinb1D = PB_Cfirstnb( N, IY, DESCY[IMB_], YnbD );
242  }
243 
244  YisR = ( ( YprocR == -1 ) || ( YnprocsR == 1 ) );
245 /*
246 * Are sub( X ) and sub( Y ) both row or column vectors ?
247 */
248  RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) );
249 /*
250 * sub( X ) is not distributed and sub( Y ) is distributed
251 */
252  size = TYPE->size; zero = TYPE->zero;
253 
254  if( !( XisR ) )
255  {
256 /*
257 * sub( X ) is not replicated. Since this operation is local if sub( X ) and
258 * sub( Y ) are both row or column vectors, choose YprocR = XprocR when RRorCC,
259 * and YprocR = 0 otherwise.
260 */
261  if( YisR ) { YprocR = ( ( RRorCC ) ? XprocR : 0 ); }
262 /*
263 * Now, it is just like sub( Y ) is not replicated, this information however is
264 * kept in YisR for later use.
265 */
266  if( ( XmyprocR == XprocR ) || ( YmyprocR == YprocR ) )
267  {
268 /*
269 * sub( X ) and sub( Y ) are both row or column vectors
270 */
271  if( RRorCC )
272  {
273  YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD );
274 /*
275 * sub( X ) and sub( Y ) are in the same process row or column
276 */
277  if( XprocR == YprocR )
278  {
279 /*
280 * In a given process, sub( Y ) is swapped with the corresponding non distribu-
281 * ted part of sub( X ). In the other processes, this part of sub( X ) is set
282 * to zero for later reduction.
283 */
284  if( YnpD > 0 )
285  {
286  Yroc = YprocD;
287  if( XisRow ) { kk = Yjj; ktmp = JX + N; kn = JX + Yinb1D; }
288  else { kk = Yii; ktmp = IX + N; kn = IX + Yinb1D; }
289 
290  if( YmyprocD == Yroc )
291  {
292  TYPE->Fswap( &Yinb1D, Mptr( X, Xii, Xjj, Xld, size ),
293  &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ),
294  &Ylinc );
295  kk += Yinb1D;
296  }
297  else
298  {
299  TYPE->Fset( &Yinb1D, zero, Mptr( X, Xii, Xjj, Xld, size ),
300  &Xlinc );
301  }
302  Yroc = MModAdd1( Yroc, YnprocsD );
303 
304  for( k = kn; k < ktmp; k += YnbD )
305  {
306  kbb = ktmp - k; kbb = MIN( kbb, YnbD );
307  if( YmyprocD == Yroc )
308  {
309  if( XisRow )
310  TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld, size ),
311  &Xlinc, Mptr( Y, Yii, kk, Yld, size ),
312  &Ylinc );
313  else
314  TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld, size ),
315  &Xlinc, Mptr( Y, kk, Yjj, Yld, size ),
316  &Ylinc );
317  kk += kbb;
318  }
319  else
320  {
321  if( XisRow )
322  TYPE->Fset( &kbb, zero, Mptr( X, Xii, k, Xld, size ),
323  &Xlinc );
324  else
325  TYPE->Fset( &kbb, zero, Mptr( X, k, Xjj, Xld, size ),
326  &Xlinc );
327  }
328  Yroc = MModAdd1( Yroc, YnprocsD );
329  }
330  }
331  else
332  {
333 /*
334 * If I don't own any entries of sub( Y ), then zero the entire sub( X )
335 * residing in this process.
336 */
337  TYPE->Fset( &N, zero, Mptr( X, Xii, Xjj, Xld, size ),
338  &Xlinc );
339  }
340 /*
341 * Replicate locally scattered sub( X ) by reducing it
342 */
343  if( XisRow )
344  {
345  top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
346  TYPE->Cgsum2d( ctxt, ROW, top, 1, N, Mptr( X, Xii, Xjj, Xld,
347  size ), Xld, -1, 0 );
348  }
349  else
350  {
351  top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
352  TYPE->Cgsum2d( ctxt, COLUMN, top, N, 1, Mptr( X, Xii, Xjj,
353  Xld, size ), Xld, -1, 0 );
354  }
355  }
356  else
357  {
358 /*
359 * sub( X ) and sub( Y ) are in a different process row or column
360 */
361  if( YmyprocR == YprocR )
362  {
363 /*
364 * If I own a piece of sub( Y ), then send it to the process row or column where
365 * sub( X ) resides and receive back the sub( X ) data from the same process.
366 */
367  if( YnpD > 0 )
368  {
369  if( YisRow )
370  {
371  TYPE->Cgesd2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj, Yld,
372  size ), Yld, XprocR, YmyprocD );
373  TYPE->Cgerv2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj, Yld,
374  size ), Yld, XprocR, YmyprocD );
375  }
376  else
377  {
378  TYPE->Cgesd2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj, Yld,
379  size ), Yld, YmyprocD, XprocR );
380  TYPE->Cgerv2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj, Yld,
381  size ), Yld, YmyprocD, XprocR );
382  }
383  }
384  }
385 
386  if( XmyprocR == XprocR )
387  {
388 /*
389 * If I own a sub( X ), then receive the distributed part of sub( Y ) owned by
390 * the process where sub( Y ) resides in my row or column. Perform a local swap
391 * as if sub( Y ) would reside in the same process row or column as sub( X ).
392 * Send the result back and finally perform the reduction to replicate sub( X ).
393 */
394  if( YnpD > 0 )
395  {
396  buf = PB_Cmalloc( YnpD * size );
397  if( YisRow )
398  TYPE->Cgerv2d( ctxt, 1, YnpD, buf, 1, YprocR,
399  XmyprocD );
400  else
401  TYPE->Cgerv2d( ctxt, YnpD, 1, buf, YnpD, XmyprocD,
402  YprocR );
403 
404  Yroc = YprocD;
405  kk = 0;
406  if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; }
407  else { ktmp = IX + N; kn = IX + Yinb1D; }
408 
409  if( YmyprocD == Yroc )
410  {
411  TYPE->Fswap( &Yinb1D, Mptr( X, Xii, Xjj, Xld, size ),
412  &Xlinc, buf, &ione );
413  kk += Yinb1D;
414  }
415  else
416  {
417  TYPE->Fset( &Yinb1D, zero, Mptr( X, Xii, Xjj, Xld,
418  size ), &Xlinc );
419  }
420  Yroc = MModAdd1( Yroc, YnprocsD );
421 
422  for( k = kn; k < ktmp; k += YnbD )
423  {
424  kbb = ktmp - k; kbb = MIN( kbb, YnbD );
425 
426  if( YmyprocD == Yroc )
427  {
428  if( XisRow )
429  TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld, size ),
430  &Xlinc, buf+kk*size, &ione );
431  else
432  TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld, size ),
433  &Xlinc, buf+kk*size, &ione );
434  kk += kbb;
435  }
436  else
437  {
438  if( XisRow )
439  TYPE->Fset( &kbb, zero, Mptr( X, Xii, k, Xld,
440  size ), &Xlinc );
441  else
442  TYPE->Fset( &kbb, zero, Mptr( X, k, Xjj, Xld,
443  size ), &Xlinc );
444  }
445  Yroc = MModAdd1( Yroc, YnprocsD );
446  }
447  if( YisRow )
448  TYPE->Cgesd2d( ctxt, 1, YnpD, buf, 1, YprocR,
449  XmyprocD );
450  else
451  TYPE->Cgesd2d( ctxt, YnpD, 1, buf, YnpD, XmyprocD,
452  YprocR );
453  if( buf ) free( buf );
454  }
455  else
456  {
457  TYPE->Fset( &N, zero, Mptr( X, Xii, Xjj, Xld, size ),
458  &Xlinc );
459  }
460 /*
461 * Replicate locally scattered sub( X ) by reducing it
462 */
463  if( XisRow )
464  {
465  top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
466  TYPE->Cgsum2d( ctxt, ROW, top, 1, N, Mptr( X, Xii, Xjj,
467  Xld, size ), Xld, -1, 0 );
468  }
469  else
470  {
471  top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
472  TYPE->Cgsum2d( ctxt, COLUMN, top, N, 1, Mptr( X, Xii, Xjj,
473  Xld, size ), Xld, -1, 0 );
474  }
475  }
476  }
477  }
478  else
479  {
480 /*
481 * sub( X ) and sub( Y ) are not both row or column vectors
482 */
483  Xroc = 0;
484  if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; }
485  else { ktmp = IX + N; kn = IX + Yinb1D; }
486 /*
487 * Loop over the processes in which sub( Y ) resides, for each process find the
488 * next process Xroc and swap the data. After this, it will be needed to reduce
489 * sub( X ) as above.
490 */
491  for( p = 0; p < YnprocsD; p++ )
492  {
493  mydist = MModSub( p, YprocD, YnprocsD );
494  myproc = MModAdd( YprocD, mydist, YnprocsD );
495 
496  if( ( XprocR == p ) && ( YprocR == Xroc ) )
497  {
498 /*
499 * Swap locally at the intersection of the process cross
500 */
501  if( XmyprocR == p )
502  {
503  YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD,
504  YnprocsD );
505  if( YnpD > 0 )
506  {
507  Yroc = YprocD;
508  kk = ( XisRow ? Yii : Yjj );
509 
510  if( myproc == Yroc )
511  {
512  if( XmyprocD == Xroc )
513  {
514  TYPE->Fswap( &Yinb1D, Mptr( X, Xii, Xjj, Xld,
515  size ), &Xlinc, Mptr( Y, Yii, Yjj,
516  Yld, size ), &Ylinc );
517  kk += Yinb1D;
518  }
519  else
520  {
521  TYPE->Fset( &Yinb1D, zero, Mptr( X, Xii, Xjj, Xld,
522  size ), &Xlinc );
523  }
524  }
525  Yroc = MModAdd1( Yroc, YnprocsD );
526 
527  for( k = kn; k < ktmp; k += YnbD )
528  {
529  kbb = ktmp - k; kbb = MIN( kbb, YnbD );
530  if( myproc == Yroc )
531  {
532  if( XmyprocD == Xroc )
533  {
534  if( XisRow )
535  TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld,
536  size ), &Xlinc, Mptr( Y, kk,
537  Yjj, Yld, size ), &Ylinc );
538  else
539  TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld,
540  size ), &Xlinc, Mptr( Y, Yii,
541  kk, Yld, size ), &Ylinc );
542  kk += kbb;
543  }
544  else
545  {
546  if( XisRow )
547  TYPE->Fset( &kbb, zero, Mptr( X, Xii, k,
548  Xld, size ), &Xlinc );
549  else
550  TYPE->Fset( &kbb, zero, Mptr( X, k, Xjj,
551  Xld, size ), &Xlinc );
552  }
553  }
554  Yroc = MModAdd1( Yroc, YnprocsD );
555  }
556  }
557  }
558  }
559  else
560  {
561 /*
562 * Message exchange
563 */
564  if( ( YmyprocR == YprocR ) && ( YmyprocD == p ) )
565  {
566  YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD,
567  YnprocsD );
568  if( YnpD > 0 )
569  {
570  if( XisRow )
571  {
572  TYPE->Cgesd2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj,
573  Yld, size ), Yld, XprocR, Xroc );
574  TYPE->Cgerv2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj,
575  Yld, size ), Yld, XprocR, Xroc );
576  }
577  else
578  {
579  TYPE->Cgesd2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj,
580  Yld, size ), Yld, Xroc, XprocR );
581  TYPE->Cgerv2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj,
582  Yld, size ), Yld, Xroc, XprocR );
583  }
584  }
585  }
586 
587  if( XmyprocR == XprocR )
588  {
589  YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD,
590  YnprocsD );
591  if( YnpD > 0 )
592  {
593  Yroc = YprocD;
594  kk = 0;
595 /*
596 * Receive the piece of sub( Y ) that I should handle
597 */
598  if( XmyprocD == Xroc )
599  {
600  buf = PB_Cmalloc( YnpD * size );
601  if( XisRow )
602  TYPE->Cgerv2d( ctxt, YnpD, 1, buf, YnpD,
603  p, YprocR );
604  else
605  TYPE->Cgerv2d( ctxt, 1, YnpD, buf, 1,
606  YprocR, p );
607  }
608 
609  if( myproc == Yroc )
610  {
611  if( XmyprocD == Xroc )
612  {
613  TYPE->Fswap( &Yinb1D, Mptr( X, Xii, Xjj, Xld,
614  size ), &Xlinc, buf, &ione );
615  kk += Yinb1D;
616  }
617  else
618  {
619  TYPE->Fset( &Yinb1D, zero, Mptr( X, Xii, Xjj,
620  Xld, size ), &Xlinc );
621  }
622  }
623  Yroc = MModAdd1( Yroc, YnprocsD );
624 
625  for( k = kn; k < ktmp; k += YnbD )
626  {
627  kbb = ktmp - k; kbb = MIN( kbb, YnbD );
628  if( myproc == Yroc )
629  {
630  if( XmyprocD == Xroc )
631  {
632  if( XisRow )
633  TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld,
634  size ), &Xlinc, buf+kk*size,
635  &ione );
636  else
637  TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld,
638  size ), &Xlinc, buf+kk*size,
639  &ione );
640  kk += kbb;
641  }
642  else
643  {
644  if( XisRow )
645  TYPE->Fset( &kbb, zero, Mptr( X, Xii, k,
646  Xld, size ), &Xlinc );
647  else
648  TYPE->Fset( &kbb, zero, Mptr( X, k, Xjj,
649  Xld, size ), &Xlinc );
650  }
651  }
652  Yroc = MModAdd1( Yroc, YnprocsD );
653  }
654 
655  if( XmyprocD == Xroc )
656  {
657  if( XisRow )
658  TYPE->Cgesd2d( ctxt, YnpD, 1, buf, YnpD,
659  p, YprocR );
660  else
661  TYPE->Cgesd2d( ctxt, 1, YnpD, buf, 1,
662  YprocR, p );
663  if( buf ) free( buf );
664  }
665  }
666  }
667  }
668  Xroc = MModAdd1( Xroc, XnprocsD );
669  }
670 /*
671 * Replicate locally scattered sub( X ) by reducing it
672 */
673  if( XmyprocR == XprocR )
674  {
675  if( XisRow )
676  {
677  top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
678  TYPE->Cgsum2d( ctxt, ROW, top, 1, N, Mptr( X, Xii, Xjj,
679  Xld, size ), Xld, -1, 0 );
680  }
681  else
682  {
683  top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
684  TYPE->Cgsum2d( ctxt, COLUMN, top, N, 1, Mptr( X, Xii, Xjj,
685  Xld, size ), Xld, -1, 0 );
686  }
687  }
688  }
689  }
690 
691  if( YisR )
692  {
693 /*
694 * Replicate sub( Y )
695 */
696  YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD );
697  if( YnpD > 0 )
698  {
699  if( YisRow )
700  {
701  top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
702  if( YmyprocR == YprocR )
703  TYPE->Cgebs2d( ctxt, COLUMN, top, 1, YnpD, Mptr( Y, Yii, Yjj,
704  Yld, size ), Yld );
705  else
706  TYPE->Cgebr2d( ctxt, COLUMN, top, 1, YnpD, Mptr( Y, Yii, Yjj,
707  Yld, size ), Yld, YprocR, YmyprocD );
708  }
709  else
710  {
711  top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
712  if( YmyprocR == YprocR )
713  TYPE->Cgebs2d( ctxt, ROW, top, YnpD, 1, Mptr( Y, Yii, Yjj,
714  Yld, size ), Yld );
715  else
716  TYPE->Cgebr2d( ctxt, ROW, top, YnpD, 1, Mptr( Y, Yii, Yjj,
717  Yld, size ), Yld, YmyprocD, YprocR );
718  }
719  }
720  }
721  }
722  else
723  {
724 /*
725 * sub( X ) is replicated in every process. Swap the data in process row or
726 * column YprocR when sub( Y ) is not replicated and in every process otherwise.
727 */
728  if( YisR || ( YmyprocR == YprocR ) )
729  {
730  YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD );
731 
732  if( YnpD > 0 )
733  {
734  Yroc = YprocD;
735  kk = ( YisRow ? Yjj : Yii );
736 
737  if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; }
738  else { ktmp = IX + N; kn = IX + Yinb1D; }
739 
740  if( YmyprocD == Yroc )
741  {
742  TYPE->Fswap( &Yinb1D, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc,
743  Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc );
744  kk += Yinb1D;
745  }
746  else
747  {
748  TYPE->Fset( &Yinb1D, zero, Mptr( X, Xii, Xjj, Xld, size ),
749  &Xlinc );
750  }
751  Yroc = MModAdd1( Yroc, YnprocsD );
752 
753  for( k = kn; k < ktmp; k += YnbD )
754  {
755  kbb = ktmp - k; kbb = MIN( kbb, YnbD );
756  if( YmyprocD == Yroc )
757  {
758  if( YisRow )
759  {
760  if( XisRow )
761  TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld, size ), &Xlinc,
762  Mptr( Y, Yii, kk, Yld, size ), &Ylinc );
763  else
764  TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld, size ), &Xlinc,
765  Mptr( Y, Yii, kk, Yld, size ), &Ylinc );
766  }
767  else
768  {
769  if( XisRow )
770  TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld, size ), &Xlinc,
771  Mptr( Y, kk, Yjj, Yld, size ), &Ylinc );
772  else
773  TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld, size ), &Xlinc,
774  Mptr( Y, kk, Yjj, Yld, size ), &Ylinc );
775  }
776  kk += kbb;
777  }
778  else
779  {
780  if( XisRow )
781  TYPE->Fset( &kbb, zero, Mptr( X, Xii, k, Xld, size ),
782  &Xlinc );
783  else
784  TYPE->Fset( &kbb, zero, Mptr( X, k, Xjj, Xld, size ),
785  &Xlinc );
786  }
787  Yroc = MModAdd1( Yroc, YnprocsD );
788  }
789  }
790  else
791  {
792 /*
793 * If I don't own any of sub( Y ), then just zero sub( X )
794 */
795  TYPE->Fset( &N, zero, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc );
796  }
797 /*
798 * Replicate locally scattered sub( X ) by reducing it in the process scope of
799 * sub( Y )
800 */
801  scope = ( YisRow ? CROW : CCOLUMN );
802  top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET );
803  if( XisRow )
804  TYPE->Cgsum2d( ctxt, &scope, top, 1, N, Mptr( X, Xii, Xjj, Xld,
805  size ), Xld, -1, 0 );
806  else
807  TYPE->Cgsum2d( ctxt, &scope, top, N, 1, Mptr( X, Xii, Xjj, Xld,
808  size ), Xld, -1, 0 );
809  }
810 
811  if( !YisR )
812  {
813 /*
814 * If sub( Y ) is not replicated, then broadcast the result to the other pro-
815 * cesses that own a piece of sub( X ), but were not involved in the above swap
816 * operation.
817 */
818  if( XisRow ) { Xm = 1; Xn = N; }
819  else { Xm = N; Xn = 1; }
820 
821  if( YisRow )
822  {
823  top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
824  if( YmyprocR == YprocR )
825  TYPE->Cgebs2d( ctxt, COLUMN, top, Xm, Xn, Mptr( X, Xii, Xjj, Xld,
826  size ), Xld );
827  else
828  TYPE->Cgebr2d( ctxt, COLUMN, top, Xm, Xn, Mptr( X, Xii, Xjj, Xld,
829  size ), Xld, YprocR, YmyprocD );
830  }
831  else
832  {
833  top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
834  if( YmyprocR == YprocR )
835  TYPE->Cgebs2d( ctxt, ROW, top, Xm, Xn, Mptr( X, Xii, Xjj, Xld,
836  size ), Xld );
837  else
838  TYPE->Cgebr2d( ctxt, ROW, top, Xm, Xn, Mptr( X, Xii, Xjj, Xld,
839  size ), Xld, YmyprocD, YprocR );
840  }
841  }
842  }
843 /*
844 * End of PB_CpswapND
845 */
846 }
M_
#define M_
Definition: PBtools.h:39
TYPE
#define TYPE
Definition: clamov.c:7
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
PB_Cfirstnb
int PB_Cfirstnb()
LLD_
#define LLD_
Definition: PBtools.h:47
MModAdd
#define MModAdd(I1, I2, d)
Definition: PBtools.h:97
CROW
#define CROW
Definition: PBblacs.h:21
PB_CpswapND
void PB_CpswapND(PBTYP_T *TYPE, int N, char *X, int IX, int JX, int *DESCX, int INCX, char *Y, int IY, int JY, int *DESCY, int INCY)
Definition: PB_CpswapND.c:24
IMB_
#define IMB_
Definition: PBtools.h:41
MModSub
#define MModSub(I1, I2, d)
Definition: PBtools.h:102
MModAdd1
#define MModAdd1(I, d)
Definition: PBtools.h:100
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
PB_Ctop
char * PB_Ctop()
BCAST
#define BCAST
Definition: PBblacs.h:48
COMBINE
#define COMBINE
Definition: PBblacs.h:49
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cnumroc
int PB_Cnumroc()
PB_Cmalloc
char * PB_Cmalloc()
MIN
#define MIN(a_, b_)
Definition: PBtools.h:76
CCOLUMN
#define CCOLUMN
Definition: PBblacs.h:20
INB_
#define INB_
Definition: PBtools.h:42
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CTXT_
#define CTXT_
Definition: PBtools.h:38