ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
psamax_.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "pblas.h"
14 #include "PBpblas.h"
15 #include "PBtools.h"
16 #include "PBblacs.h"
17 #include "PBblas.h"
18 
19 #ifdef __STDC__
20 void psamax_( int * N, float * AMAX, int * INDX,
21  float * X, int * IX, int * JX, int * DESCX, int * INCX )
22 #else
23 void psamax_( N, AMAX, INDX, X, IX, JX, DESCX, INCX )
24 /*
25 * .. Scalar Arguments ..
26 */
27  int * INCX, * INDX, * IX, * JX, * N;
28  float * AMAX;
29 /*
30 * .. Array Arguments ..
31 */
32  int * DESCX;
33  float * X;
34 #endif
35 {
36 /*
37 * Purpose
38 * =======
39 *
40 * PSAMAX computes the global index of the maximum element in absolute
41 * value of a subvector sub( X ). The global index is returned in INDX
42 * and the value of that element is returned in AMAX,
43 *
44 * where
45 *
46 * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
47 * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X.
48 *
49 * Notes
50 * =====
51 *
52 * A description vector is associated with each 2D block-cyclicly dis-
53 * tributed matrix. This vector stores the information required to
54 * establish the mapping between a matrix entry and its corresponding
55 * process and memory location.
56 *
57 * In the following comments, the character _ should be read as
58 * "of the distributed matrix". Let A be a generic term for any 2D
59 * block cyclicly distributed matrix. Its description vector is DESC_A:
60 *
61 * NOTATION STORED IN EXPLANATION
62 * ---------------- --------------- ------------------------------------
63 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
64 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
65 * the NPROW x NPCOL BLACS process grid
66 * A is distributed over. The context
67 * itself is global, but the handle
68 * (the integer value) may vary.
69 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
70 * ted matrix A, M_A >= 0.
71 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
72 * buted matrix A, N_A >= 0.
73 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
74 * block of the matrix A, IMB_A > 0.
75 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
76 * left block of the matrix A,
77 * INB_A > 0.
78 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
79 * bute the last M_A-IMB_A rows of A,
80 * MB_A > 0.
81 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
82 * bute the last N_A-INB_A columns of
83 * A, NB_A > 0.
84 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
85 * row of the matrix A is distributed,
86 * NPROW > RSRC_A >= 0.
87 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
88 * first column of A is distributed.
89 * NPCOL > CSRC_A >= 0.
90 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
91 * array storing the local blocks of
92 * the distributed matrix A,
93 * IF( Lc( 1, N_A ) > 0 )
94 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
95 * ELSE
96 * LLD_A >= 1.
97 *
98 * Let K be the number of rows of a matrix A starting at the global in-
99 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
100 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
101 * receive if these K rows were distributed over NPROW processes. If K
102 * is the number of columns of a matrix A starting at the global index
103 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
104 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
105 * these K columns were distributed over NPCOL processes.
106 *
107 * The values of Lr() and Lc() may be determined via a call to the func-
108 * tion PB_Cnumroc:
109 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
110 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
111 *
112 * Arguments
113 * =========
114 *
115 * N (global input) INTEGER
116 * On entry, N specifies the length of the subvector sub( X ).
117 * N must be at least zero.
118 *
119 * AMAX (global output) REAL array
120 * On exit, AMAX specifies the largest entry in absolute value
121 * of the subvector sub( X ) only in its scope (See below for
122 * further details).
123 *
124 * INDX (global output) INTEGER
125 * On exit, INDX specifies the global index of the maximum ele-
126 * ment in absolute value of the subvector sub( X ) only in its
127 * scope (See below for further details).
128 *
129 * X (local input) REAL array
130 * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
131 * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and
132 * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least
133 * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise.
134 * Before entry, this array contains the local entries of the
135 * matrix X.
136 *
137 * IX (global input) INTEGER
138 * On entry, IX specifies X's global row index, which points to
139 * the beginning of the submatrix sub( X ).
140 *
141 * JX (global input) INTEGER
142 * On entry, JX specifies X's global column index, which points
143 * to the beginning of the submatrix sub( X ).
144 *
145 * DESCX (global and local input) INTEGER array
146 * On entry, DESCX is an integer array of dimension DLEN_. This
147 * is the array descriptor for the matrix X.
148 *
149 * INCX (global input) INTEGER
150 * On entry, INCX specifies the global increment for the
151 * elements of X. Only two values of INCX are supported in
152 * this version, namely 1 and M_X. INCX must not be zero.
153 *
154 * Further Details
155 * ===============
156 *
157 * When the result of a vector-oriented PBLAS call is a scalar, this
158 * scalar is set only within the process scope which owns the vector(s)
159 * being operated on. Let sub( X ) be a generic term for the input vec-
160 * tor(s). Then, the processes owning the correct the answer is determi-
161 * ned as follows: if an operation involves more than one vector, the
162 * processes receiving the result will be the union of the following set
163 * of processes for each vector:
164 *
165 * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro-
166 * cess row or process column owns the vector operand, therefore only
167 * the process owning sub( X ) receives the correct result;
168 *
169 * If INCX = M_X, then sub( X ) is a vector distributed over a process
170 * row. Each process in this row receives the result;
171 *
172 * If INCX = 1, then sub( X ) is a vector distributed over a process
173 * column. Each process in this column receives the result;
174 *
175 * -- Written on April 1, 1998 by
176 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
177 *
178 * ---------------------------------------------------------------------
179 */
180 /*
181 * .. Local Scalars ..
182 */
183  char cbtop, cctop, rbtop, rctop;
184  int Xcol, Xgindx, Xi, Xii, Ximb, Xinb, Xj, Xjj, Xlindx, Xld, Xmb,
185  Xnb, Xnp, Xnq, Xrow, Xsrc, ctxt, dist, dst, idumm, info, k,
186  maxpos, mycol, mydist, myrow, npcol, nprow, src;
187 /*
188 * .. Local Arrays ..
189 */
190  int Xd[DLEN_];
191  float work[4];
192 /* ..
193 * .. Executable Statements ..
194 *
195 */
196  PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd );
197 #ifndef NO_ARGCHK
198 /*
199 * Test the input parameters
200 */
201  Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
202  if( !( info = ( ( nprow == -1 ) ? -( 701 + CTXT_ ) : 0 ) ) )
203  PB_Cchkvec( ctxt, "PSAMAX", "X", *N, 1, Xi, Xj, Xd, *INCX, 7, &info );
204  if( info ) { PB_Cabort( ctxt, "PSAMAX", info ); return; }
205 #endif
206 /*
207 * Initialize INDX and AMAX
208 */
209  *INDX = 0; *AMAX = ZERO;
210 /*
211 * Quick return if possible
212 */
213  if( *N == 0 ) return;
214 /*
215 * Retrieve process grid information
216 */
217 #ifdef NO_ARGCHK
218  Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
219 #endif
220 /*
221 * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol
222 */
223  PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj,
224  &Xrow, &Xcol );
225 /*
226 * Handle degenerate case separately, sub( X )'s scope is just one process
227 */
228  if( ( *INCX == 1 ) && ( Xd[M_] == 1 ) && ( *N == 1 ) )
229  {
230 /*
231 * Make sure I own some data and compute INDX and AMAX
232 */
233  if( ( ( myrow == Xrow ) || ( Xrow < 0 ) ) &&
234  ( ( mycol == Xcol ) || ( Xcol < 0 ) ) )
235  {
236  *INDX = *JX; *AMAX = X[Xii+Xjj*Xd[LLD_]];
237  }
238  return;
239  }
240  else if( *INCX == Xd[M_] )
241  {
242 /*
243 * sub( X ) resides in (a) process row(s)
244 */
245  if( ( myrow == Xrow ) || ( Xrow < 0 ) )
246  {
247  rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
248 
249  if( ( rctop == CTOP_DEFAULT ) || ( rctop == CTOP_TREE1 ) )
250  {
251 /*
252 * Inline the 1-tree combine for communication savings
253 */
254  Xinb = Xd[INB_ ]; Xnb = Xd[NB_ ]; Xsrc = Xd[CSRC_];
255  Xnq = PB_Cnumroc( *N, Xj, Xinb, Xnb, mycol, Xsrc, npcol );
256 /*
257 * Make sure I own some data and compute local INDX and AMAX
258 */
259  if( Xnq > 0 )
260  {
261  Xld = Xd[LLD_];
262  Xlindx = Xjj - 1 +
263  isamax_( &Xnq, ((char*)(X+(Xii+Xjj*Xld))), &Xld );
264  Mindxl2g( Xgindx, Xlindx, Xinb, Xnb, mycol, Xsrc, npcol );
265  work[0] = X[Xii+Xlindx*Xld];
266  work[1] = ((float )( Xgindx+1 ));
267  }
268  else
269  {
270  work[0] = ZERO;
271  work[1] = ZERO;
272  }
273 /*
274 * Combine the local results using a 1-tree topology within process column 0
275 * if npcol > 1 or Xcol >= 0, i.e sub( X ) is distributed.
276 */
277  if( ( npcol >= 2 ) && ( Xcol >= 0 ) )
278  {
279  mydist = mycol;
280  k = 1;
281 l_10:
282  if( mydist & 1 )
283  {
284  dist = k * ( mydist - 1 );
285  dst = MPosMod( dist, npcol );
286  Csgesd2d( ctxt, 2, 1, ((char*)work), 2, myrow, dst );
287  goto l_20;
288  }
289  else
290  {
291  dist = mycol + k;
292  src = MPosMod( dist, npcol );
293 
294  if( mycol < src )
295  {
296  Csgerv2d( ctxt, 2, 1, ((char*) &work[2]), 2, myrow,
297  src );
298  if( ABS( work[0] ) < ABS( work[2] ) )
299  { work[0] = work[2]; work[1] = work[3]; }
300  }
301  mydist >>= 1;
302  }
303  k <<= 1;
304 
305  if( k < npcol ) goto l_10;
306 l_20:
307 /*
308 * Process column 0 broadcasts the combined values of INDX and AMAX within
309 * their process row.
310 */
311  rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
312  if( mycol == 0 )
313  {
314  Csgebs2d( ctxt, ROW, &rbtop, 2, 1, ((char*)work), 2 );
315  }
316  else
317  {
318  Csgebr2d( ctxt, ROW, &rbtop, 2, 1, ((char*)work), 2,
319  myrow, 0 );
320  }
321  }
322 /*
323 * Set INDX and AMAX to the replicated answers contained in work. If AMAX is
324 * zero, then select a coherent INDX.
325 */
326  *AMAX = work[0];
327  *INDX = ( ( *AMAX == ZERO ) ? ( *JX ) : ( (int)(work[1]) ) );
328  }
329  else
330  {
331 /*
332 * Otherwise use the current topology settings to combine the results
333 */
334  Xinb = Xd[INB_ ]; Xnb = Xd[NB_ ]; Xsrc = Xd[CSRC_];
335  Xnq = PB_Cnumroc( *N, Xj, Xinb, Xnb, mycol, Xsrc, npcol );
336 /*
337 * Make sure I own some data and compute local INDX and AMAX
338 */
339  if( Xnq > 0 )
340  {
341 /*
342 * Compute the local maximum and its corresponding local index
343 */
344  Xld = Xd[LLD_];
345  Xlindx = Xjj - 1 +
346  isamax_( &Xnq, ((char*)(X+(Xii+Xjj*Xld))), &Xld );
347  *AMAX = X[Xii+Xlindx*Xld];
348  }
349  else
350  {
351  *AMAX = ZERO;
352  }
353 
354  if( Xcol >= 0 )
355  {
356 /*
357 * Combine leave on all the local maximum if Xcol >= 0, i.e sub( X ) is
358 * distributed
359 */
360  Csgamx2d( ctxt, ROW, &rctop, 1, 1, ((char*)AMAX), 1,
361  &idumm, &maxpos, 1, -1, mycol );
362 /*
363 * Broadcast the corresponding global index
364 */
365  if( *AMAX != ZERO )
366  {
367  rbtop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
368  if( mycol == maxpos )
369  {
370  Mindxl2g( Xgindx, Xlindx, Xinb, Xnb, mycol, Xsrc, npcol );
371  *INDX = Xgindx + 1;
372  Cigebs2d( ctxt, ROW, &rbtop, 1, 1, ((char*)INDX), 1 );
373  }
374  else
375  {
376  Cigebr2d( ctxt, ROW, &rbtop, 1, 1, ((char*)INDX), 1,
377  myrow, maxpos );
378  }
379  }
380  else
381  {
382 /*
383 * If AMAX is zero, then select a coherent INDX.
384 */
385  *INDX = *JX;
386  }
387  }
388  else
389  {
390 /*
391 * sub( X ) is not distributed. If AMAX is zero, then select a coherent INDX.
392 */
393  *INDX = ( ( *AMAX == ZERO ) ? ( *JX ) : Xlindx + 1 );
394  }
395  }
396  }
397  return;
398  }
399  else
400  {
401 /*
402 * sub( X ) resides in (a) process column(s)
403 */
404  if( ( mycol == Xcol ) || ( Xcol < 0 ) )
405  {
406  cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
407 
408  if( ( cctop == CTOP_DEFAULT ) || ( cctop == CTOP_TREE1 ) )
409  {
410 /*
411 * Inline the 1-tree combine for communication savings
412 */
413  Ximb = Xd[IMB_ ]; Xmb = Xd[MB_ ]; Xsrc = Xd[RSRC_];
414  Xnp = PB_Cnumroc( *N, Xi, Ximb, Xmb, myrow, Xsrc, nprow );
415 /*
416 * Make sure I own some data and compute local INDX and AMAX
417 */
418  if( Xnp > 0 )
419  {
420  Xld = Xd[LLD_];
421  Xlindx = Xii - 1 +
422  isamax_( &Xnp, ((char*)(X+(Xii+Xjj*Xld))), INCX );
423  Mindxl2g( Xgindx, Xlindx, Ximb, Xmb, myrow, Xsrc, nprow );
424  work[0] = X[Xlindx+Xjj*Xld];
425  work[1] = ((float )( Xgindx+1 ));
426  }
427  else
428  {
429  work[0] = ZERO;
430  work[1] = ZERO;
431  }
432 /*
433 * Combine the local results using a 1-tree topology within process row 0
434 * if nprow > 1 or Xrow >= 0, i.e sub( X ) is distributed.
435 */
436  if( ( nprow >= 2 ) && ( Xrow >= 0 ) )
437  {
438  mydist = myrow;
439  k = 1;
440 l_30:
441  if( mydist & 1 )
442  {
443  dist = k * ( mydist - 1 );
444  dst = MPosMod( dist, nprow );
445  Csgesd2d( ctxt, 2, 1, ((char*)work), 2, dst, mycol );
446  goto l_40;
447  }
448  else
449  {
450  dist = myrow + k;
451  src = MPosMod( dist, nprow );
452 
453  if( myrow < src )
454  {
455  Csgerv2d( ctxt, 2, 1, ((char*) &work[2]), 2,
456  src, mycol );
457  if( ABS( work[0] ) < ABS( work[2] ) )
458  { work[0] = work[2]; work[1] = work[3]; }
459  }
460  mydist >>= 1;
461  }
462  k <<= 1;
463 
464  if( k < nprow ) goto l_30;
465 l_40:
466 /*
467 * Process row 0 broadcasts the combined values of INDX and AMAX within their
468 * process column.
469 */
470  cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
471  if( myrow == 0 )
472  {
473  Csgebs2d( ctxt, COLUMN, &cbtop, 2, 1, ((char*)work), 2 );
474  }
475  else
476  {
477  Csgebr2d( ctxt, COLUMN, &cbtop, 2, 1, ((char*)work), 2,
478  0, mycol );
479  }
480  }
481 /*
482 * Set INDX and AMAX to the replicated answers contained in work. If AMAX is
483 * zero, then select a coherent INDX.
484 */
485  *AMAX = work[0];
486  *INDX = ( ( *AMAX == ZERO ) ? ( *IX ) : ( (int)(work[1]) ) );
487  }
488  else
489  {
490 /*
491 * Otherwise use the current topology settings to combine the results
492 */
493  Ximb = Xd[IMB_ ]; Xmb = Xd[MB_ ]; Xsrc = Xd[RSRC_];
494  Xnp = PB_Cnumroc( *N, Xi, Ximb, Xmb, myrow, Xsrc, nprow );
495 /*
496 * Make sure I own some data and compute local INDX and AMAX
497 */
498 
499  if( Xnp > 0 )
500  {
501 /*
502 * Compute the local maximum and its corresponding local index
503 */
504  Xld = Xd[LLD_];
505  Xlindx = Xii - 1 +
506  isamax_( &Xnp, ((char*)(X+(Xii+Xjj*Xld))), INCX );
507  *AMAX = X[Xlindx+Xjj*Xld];
508  }
509  else
510  {
511  *AMAX = ZERO;
512  }
513 
514  if( Xrow >= 0 )
515  {
516 /*
517 * Combine leave on all the local maximum if Xrow >= 0, i.e sub( X ) is
518 * distributed.
519 */
520  Csgamx2d( ctxt, COLUMN, &cctop, 1, 1, ((char*)AMAX), 1,
521  &maxpos, &idumm, 1, -1, mycol );
522 /*
523 * Broadcast the corresponding global index
524 */
525  if( *AMAX != ZERO )
526  {
527  cbtop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
528  if( myrow == maxpos )
529  {
530  Mindxl2g( Xgindx, Xlindx, Ximb, Xmb, myrow, Xsrc, nprow );
531  *INDX = Xgindx + 1;
532  Cigebs2d( ctxt, COLUMN, &cbtop, 1, 1, ((char*)INDX), 1 );
533  }
534  else
535  {
536  Cigebr2d( ctxt, COLUMN, &cbtop, 1, 1, ((char*)INDX), 1,
537  maxpos, mycol );
538  }
539  }
540  else
541  {
542 /*
543 * If AMAX is zero, then select a coherent INDX.
544 */
545  *INDX = *IX;
546  }
547  }
548  else
549  {
550 /*
551 * sub( X ) is not distributed. If AMAX is zero, then select a coherent INDX.
552 */
553  *INDX = ( ( *AMAX == ZERO ) ? ( *IX ) : Xlindx + 1 );
554  }
555  }
556  }
557  return;
558  }
559 /*
560 * End of PSAMAX
561 */
562 }
M_
#define M_
Definition: PBtools.h:39
Mindxl2g
#define Mindxl2g(ig_, il_, inb_, nb_, proc_, srcproc_, nprocs_)
Definition: PBtools.h:170
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
Cigebr2d
void Cigebr2d()
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
isamax_
int isamax_()
PBblacs.h
PBtools.h
Csgesd2d
void Csgesd2d()
Csgebs2d
void Csgebs2d()
PBblas.h
Csgamx2d
void Csgamx2d()
PBpblas.h
DLEN_
#define DLEN_
Definition: PBtools.h:48
MPosMod
#define MPosMod(I, d)
Definition: PBtools.h:95
LLD_
#define LLD_
Definition: PBtools.h:47
Csgebr2d
void Csgebr2d()
Cigebs2d
void Cigebs2d()
ZERO
#define ZERO
Definition: PBtools.h:66
PB_Cchkvec
void PB_Cchkvec()
IMB_
#define IMB_
Definition: PBtools.h:41
PB_Cabort
void PB_Cabort()
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
Csgerv2d
void Csgerv2d()
PB_Ctop
char * PB_Ctop()
RSRC_
#define RSRC_
Definition: PBtools.h:45
PB_CargFtoC
void PB_CargFtoC()
BCAST
#define BCAST
Definition: PBblacs.h:48
COMBINE
#define COMBINE
Definition: PBblacs.h:49
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cnumroc
int PB_Cnumroc()
ABS
#define ABS(a_)
Definition: PBtools.h:75
CTOP_TREE1
#define CTOP_TREE1
Definition: PBblacs.h:34
INB_
#define INB_
Definition: PBtools.h:42
Cblacs_gridinfo
void Cblacs_gridinfo()
pblas.h
CTXT_
#define CTXT_
Definition: PBtools.h:38
CTOP_DEFAULT
#define CTOP_DEFAULT
Definition: PBblacs.h:26
psamax_
void psamax_(int *N, float *AMAX, int *INDX, float *X, int *IX, int *JX, int *DESCX, int *INCX)
Definition: psamax_.c:23