ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
psdot_.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "pblas.h"
14 #include "PBpblas.h"
15 #include "PBtools.h"
16 #include "PBblacs.h"
17 #include "PBblas.h"
18 
19 #ifdef __STDC__
20 void psdot_( int * N,
21  float * DOT,
22  float * X, int * IX, int * JX, int * DESCX, int * INCX,
23  float * Y, int * IY, int * JY, int * DESCY, int * INCY )
24 #else
25 void psdot_( N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY )
26 /*
27 * .. Scalar Arguments ..
28 */
29  int * INCX, * INCY, * IX, * IY, * JX, * JY, * N;
30  float * DOT;
31 /*
32 * .. Array Arguments ..
33 */
34  int * DESCX, * DESCY;
35  float * X, * Y;
36 #endif
37 {
38 /*
39 * Purpose
40 * =======
41 *
42 * PSDOT forms the dot product of two subvectors,
43 *
44 * DOT := sub( X )**T * sub( Y ),
45 *
46 * where
47 *
48 * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
49 * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and,
50 *
51 * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y,
52 * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y.
53 *
54 * Notes
55 * =====
56 *
57 * A description vector is associated with each 2D block-cyclicly dis-
58 * tributed matrix. This vector stores the information required to
59 * establish the mapping between a matrix entry and its corresponding
60 * process and memory location.
61 *
62 * In the following comments, the character _ should be read as
63 * "of the distributed matrix". Let A be a generic term for any 2D
64 * block cyclicly distributed matrix. Its description vector is DESC_A:
65 *
66 * NOTATION STORED IN EXPLANATION
67 * ---------------- --------------- ------------------------------------
68 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
69 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
70 * the NPROW x NPCOL BLACS process grid
71 * A is distributed over. The context
72 * itself is global, but the handle
73 * (the integer value) may vary.
74 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
75 * ted matrix A, M_A >= 0.
76 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
77 * buted matrix A, N_A >= 0.
78 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
79 * block of the matrix A, IMB_A > 0.
80 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
81 * left block of the matrix A,
82 * INB_A > 0.
83 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
84 * bute the last M_A-IMB_A rows of A,
85 * MB_A > 0.
86 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
87 * bute the last N_A-INB_A columns of
88 * A, NB_A > 0.
89 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
90 * row of the matrix A is distributed,
91 * NPROW > RSRC_A >= 0.
92 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
93 * first column of A is distributed.
94 * NPCOL > CSRC_A >= 0.
95 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
96 * array storing the local blocks of
97 * the distributed matrix A,
98 * IF( Lc( 1, N_A ) > 0 )
99 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
100 * ELSE
101 * LLD_A >= 1.
102 *
103 * Let K be the number of rows of a matrix A starting at the global in-
104 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
105 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
106 * receive if these K rows were distributed over NPROW processes. If K
107 * is the number of columns of a matrix A starting at the global index
108 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
109 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
110 * these K columns were distributed over NPCOL processes.
111 *
112 * The values of Lr() and Lc() may be determined via a call to the func-
113 * tion PB_Cnumroc:
114 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
115 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
116 *
117 * Arguments
118 * =========
119 *
120 * N (global input) INTEGER
121 * On entry, N specifies the length of the subvectors to be
122 * multiplied. N must be at least zero.
123 *
124 * DOT (local output) REAL array
125 * On exit, DOT specifies the dot product of the two subvectors
126 * sub( X ) and sub( Y ) only in their scope (See below for fur-
127 * ther details).
128 *
129 * X (local input) REAL array
130 * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
131 * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and
132 * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least
133 * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise.
134 * Before entry, this array contains the local entries of the
135 * matrix X.
136 *
137 * IX (global input) INTEGER
138 * On entry, IX specifies X's global row index, which points to
139 * the beginning of the submatrix sub( X ).
140 *
141 * JX (global input) INTEGER
142 * On entry, JX specifies X's global column index, which points
143 * to the beginning of the submatrix sub( X ).
144 *
145 * DESCX (global and local input) INTEGER array
146 * On entry, DESCX is an integer array of dimension DLEN_. This
147 * is the array descriptor for the matrix X.
148 *
149 * INCX (global input) INTEGER
150 * On entry, INCX specifies the global increment for the
151 * elements of X. Only two values of INCX are supported in
152 * this version, namely 1 and M_X. INCX must not be zero.
153 *
154 * Y (local input) REAL array
155 * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y
156 * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and
157 * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least
158 * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise.
159 * Before entry, this array contains the local entries of the
160 * matrix Y.
161 *
162 * IY (global input) INTEGER
163 * On entry, IY specifies Y's global row index, which points to
164 * the beginning of the submatrix sub( Y ).
165 *
166 * JY (global input) INTEGER
167 * On entry, JY specifies Y's global column index, which points
168 * to the beginning of the submatrix sub( Y ).
169 *
170 * DESCY (global and local input) INTEGER array
171 * On entry, DESCY is an integer array of dimension DLEN_. This
172 * is the array descriptor for the matrix Y.
173 *
174 * INCY (global input) INTEGER
175 * On entry, INCY specifies the global increment for the
176 * elements of Y. Only two values of INCY are supported in
177 * this version, namely 1 and M_Y. INCY must not be zero.
178 *
179 * Further Details
180 * ===============
181 *
182 * When the result of a vector-oriented PBLAS call is a scalar, this
183 * scalar is set only within the process scope which owns the vector(s)
184 * being operated on. Let sub( X ) be a generic term for the input vec-
185 * tor(s). Then, the processes owning the correct the answer is determi-
186 * ned as follows: if an operation involves more than one vector, the
187 * processes receiving the result will be the union of the following set
188 * of processes for each vector:
189 *
190 * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro-
191 * cess row or process column owns the vector operand, therefore only
192 * the process owning sub( X ) receives the correct result;
193 *
194 * If INCX = M_X, then sub( X ) is a vector distributed over a process
195 * row. Each process in this row receives the result;
196 *
197 * If INCX = 1, then sub( X ) is a vector distributed over a process
198 * column. Each process in this column receives the result;
199 *
200 * -- Written on April 1, 1998 by
201 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
202 *
203 * ---------------------------------------------------------------------
204 */
205 /*
206 * .. Local Scalars ..
207 */
208  char scope, * top;
209  int OneBlock, OneDgrid, RRorCC, Square, Xcol, Xi, Xii, XinbD,
210  Xinb1D, XisD, XisR, XisRow, Xj, Xjj, Xld, Xlinc, XmyprocD,
211  XmyprocR, XnbD, XnpD, XnprocsD, XnprocsR, XprocD, XprocR,
212  Xrow, Ycol, Yi, Yii, YinbD, Yinb1D, YisD, YisR, YisRow, Yj,
213  Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnbD, YnpD, YnprocsD,
214  YnprocsR, YprocD, YprocR, Yrow, cdst, csrc, ctxt, dst, info,
215  ione=1, mycol, myrow, npcol, nprow, rdst, rsrc, size, src;
216  PBTYP_T * type;
217  VVDOT_T dot;
218 /*
219 * .. Local Arrays ..
220 */
221  char * buf = NULL;
222  int Xd[DLEN_], Yd[DLEN_], dbuf[ DLEN_ ];
223 /* ..
224 * .. Executable Statements ..
225 *
226 */
227  PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd );
228  PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd );
229 #ifndef NO_ARGCHK
230 /*
231 * Test the input parameters
232 */
233  Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
234  if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) )
235  {
236  PB_Cchkvec( ctxt, "PSDOT", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info );
237  PB_Cchkvec( ctxt, "PSDOT", "Y", *N, 1, Yi, Yj, Yd, *INCY, 11, &info );
238  }
239  if( info ) { PB_Cabort( ctxt, "PSDOT", info ); return; }
240 #endif
241  DOT[REAL_PART] = ZERO;
242 /*
243 * Quick return if possible
244 */
245  if( *N == 0 ) return;
246 /*
247 * Handle degenerate case
248 */
249  if( ( *N == 1 ) && ( ( Xd[ M_ ] == 1 ) || ( Yd[ M_ ] == 1 ) ) )
250  {
251  type = PB_Cstypeset();
252  PB_Cpdot11( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX,
253  ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu );
254  return;
255  }
256 /*
257 * Start the operations
258 */
259 #ifdef NO_ARGCHK
260  Cblacs_gridinfo( ( ctxt = Xd[ CTXT_ ] ), &nprow, &npcol, &myrow, &mycol );
261 #endif
262 /*
263 * Determine if sub( X ) is distributed or not
264 */
265  if( ( XisRow = ( *INCX == Xd[M_] ) ) != 0 )
266  XisD = ( ( Xd[CSRC_] >= 0 ) && ( ( XnprocsD = npcol ) > 1 ) );
267  else
268  XisD = ( ( Xd[RSRC_] >= 0 ) && ( ( XnprocsD = nprow ) > 1 ) );
269 /*
270 * Determine if sub( Y ) is distributed or not
271 */
272  if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 )
273  YisD = ( ( Yd[CSRC_] >= 0 ) && ( ( YnprocsD = npcol ) > 1 ) );
274  else
275  YisD = ( ( Yd[RSRC_] >= 0 ) && ( ( YnprocsD = nprow ) > 1 ) );
276 /*
277 * Are sub( X ) and sub( Y ) both row or column vectors ?
278 */
279  RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) );
280 /*
281 * XisD && YisD <=> both vector operands are indeed distributed
282 */
283  if( XisD && YisD )
284  {
285 /*
286 * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol
287 */
288  PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj,
289  &Xrow, &Xcol );
290  if( XisRow )
291  {
292  XinbD = Xd[INB_]; XnbD = Xd[NB_];
293  Xld = Xd[LLD_]; Xlinc = Xld;
294  XprocD = Xcol; XmyprocD = mycol;
295  XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow;
296  XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) );
297  Mfirstnb( Xinb1D, *N, Xj, XinbD, XnbD );
298  }
299  else
300  {
301  XinbD = Xd[IMB_]; XnbD = Xd[MB_];
302  Xld = Xd[LLD_]; Xlinc = 1;
303  XprocD = Xrow; XmyprocD = myrow;
304  XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol;
305  XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) );
306  Mfirstnb( Xinb1D, *N, Xi, XinbD, XnbD );
307  }
308 /*
309 * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol
310 */
311  PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj,
312  &Yrow, &Ycol );
313  if( YisRow )
314  {
315  YinbD = Yd[INB_]; YnbD = Yd[NB_];
316  Yld = Yd[LLD_]; Ylinc = Yld;
317  YprocD = Ycol; YmyprocD = mycol;
318  YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow;
319  YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) );
320  Mfirstnb( Yinb1D, *N, Yj, YinbD, YnbD );
321  }
322  else
323  {
324  YinbD = Yd[IMB_]; YnbD = Yd[MB_];
325  Yld = Yd[LLD_]; Ylinc = 1;
326  YprocD = Yrow; YmyprocD = myrow;
327  YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol;
328  YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) );
329  Mfirstnb( Yinb1D, *N, Yi, YinbD, YnbD );
330  }
331 /*
332 * Do sub( X ) and sub( Y ) span more than one process ?
333 */
334  OneDgrid = ( ( XnprocsD == 1 ) && ( YnprocsD == 1 ) );
335  OneBlock = ( ( Xinb1D >= *N ) && ( Yinb1D >= *N ) );
336 /*
337 * Are sub( X ) and sub( Y ) distributed in the same manner ?
338 */
339  Square = ( ( Xinb1D == Yinb1D ) && ( XnbD == YnbD ) &&
340  ( XnprocsD == YnprocsD ) );
341 
342  if( !( XisR ) )
343  {
344 /*
345 * sub( X ) is not replicated
346 */
347  if( YisR )
348  {
349 /*
350 * If sub( X ) is not replicated, but sub( Y ) is, a process row or column
351 * YprocR need to be selected. It will contain the non-replicated vector used
352 * to perform the dot product computation.
353 */
354  if( RRorCC )
355  {
356 /*
357 * sub( X ) and sub( Y ) are both row or column vectors
358 */
359  if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) )
360  {
361 /*
362 * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD.
363 * Enforce a purely local operation by choosing YprocR to be equal to XprocR.
364 */
365  YprocR = XprocR;
366  }
367  else
368  {
369 /*
370 * Otherwise, communication has to occur, so choose the next process row or
371 * column for YprocR to maximize the number of links, i.e reduce contention.
372 */
373  YprocR = MModAdd1( XprocR, XnprocsR );
374  }
375  }
376  else
377  {
378 /*
379 * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is
380 * chosen for YprocR does not really matter. Select the process origin.
381 */
382  YprocR = XprocD;
383  }
384  }
385  else
386  {
387 /*
388 * Neither sub( X ) nor sub( Y ) are replicated. If I am not in process row or
389 * column XprocR and not in process row or column YprocR, then quick return.
390 */
391  if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) )
392  return;
393  }
394  }
395  else
396  {
397 /*
398 * sub( X ) is distributed and replicated (so no quick return possible)
399 */
400  if( YisR )
401  {
402 /*
403 * sub( Y ) is distributed and replicated as well
404 */
405  if( RRorCC )
406  {
407 /*
408 * sub( X ) and sub( Y ) are both row or column vectors
409 */
410  if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) )
411  {
412 /*
413 * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD.
414 * Enforce a purely local operation by choosing XprocR and YprocR to be equal
415 * to zero.
416 */
417  XprocR = YprocR = 0;
418  }
419  else
420  {
421 /*
422 * Otherwise, communication has to occur, so select YprocR to be zero and the
423 * next process row or column for XprocR in order to maximize the number of
424 * used links, i.e reduce contention.
425 */
426  YprocR = 0;
427  XprocR = MModAdd1( YprocR, YnprocsR );
428  }
429  }
430  else
431  {
432 /*
433 * sub( X ) and sub( Y ) are distributed in orthogonal directions, select the
434 * origin processes.
435 */
436  XprocR = YprocD;
437  YprocR = XprocD;
438  }
439  }
440  else
441  {
442 /*
443 * sub( Y ) is distributed, but not replicated
444 */
445  if( RRorCC )
446  {
447 /*
448 * sub( X ) and sub( Y ) are both row or column vectors
449 */
450  if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) )
451  {
452 /*
453 * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD.
454 * Enforce a purely local operation by choosing XprocR to be equal to YprocR.
455 */
456  XprocR = YprocR;
457  }
458  else
459  {
460 /*
461 * Otherwise, communication has to occur, so choose the next process row or
462 * column for XprocR to maximize the number of links, i.e reduce contention.
463 */
464  XprocR = MModAdd1( YprocR, YnprocsR );
465  }
466  }
467  else
468  {
469 /*
470 * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is
471 * chosen for XprocR does not really matter. Select the origin process.
472 */
473  XprocR = YprocD;
474  }
475  }
476  }
477 /*
478 * Even if sub( X ) and/or sub( Y ) are replicated, only two process row or
479 * column are active, namely XprocR and YprocR. If any of those operands is
480 * replicated, broadcast will occur (unless there is an easy way out).
481 */
482  type = PB_Cstypeset(); size = type->size; dot = type->Fvvdotu;
483 /*
484 * A purely operation occurs iff the operands start in the same process and if
485 * either the grid is mono-dimensional or there is a single local block to be
486 * operated with or if both operands are aligned.
487 */
488  if( ( ( RRorCC && ( XprocD == YprocD ) && ( XprocR == YprocR ) ) ||
489  ( !( RRorCC ) && ( XprocD == YprocR ) && ( XprocR == YprocD ) ) ) &&
490  ( OneDgrid || OneBlock || ( RRorCC && Square ) ) )
491  {
492  if( ( !XisR && ( XmyprocR == XprocR ) &&
493  !YisR && ( YmyprocR == YprocR ) ) ||
494  ( !XisR && YisR && ( YmyprocR == YprocR ) ) ||
495  ( !YisR && XisR && ( XmyprocR == XprocR ) ) ||
496  ( XisR && YisR ) )
497  {
498  XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD,
499  XnprocsD );
500  YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD,
501  YnprocsD );
502  if( ( XnpD > 0 ) && ( YnpD > 0 ) )
503  {
504  dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
505  size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ),
506  &Ylinc );
507  }
508  }
509 /*
510 * Combine the local results in sub( X )'s scope
511 */
512  if( ( XisR && YisR ) || ( XmyprocR == XprocR ) )
513  {
514  scope = ( XisRow ? CROW : CCOLUMN );
515  top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET );
516  Csgsum2d( ctxt, &scope, top, 1, 1, ((char *) DOT), 1, -1, 0 );
517  }
518  if( RRorCC && XisR && YisR ) return;
519  }
520  else if( ( RRorCC && OneDgrid ) || OneBlock || Square )
521  {
522 /*
523 * Otherwise, it may be possible to compute the desired dot-product in a single
524 * message exchange iff the grid is mono-dimensional and the operands are
525 * distributed in the same direction, or there is just one block to be exchanged
526 * or if both operands are similarly distributed in their respective direction.
527 */
528  if( ( YmyprocR == YprocR ) )
529  {
530 /*
531 * The processes owning a piece of sub( Y ) send it to the corresponding
532 * process owning s piece of sub ( X ).
533 */
534  YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD,
535  YnprocsD );
536  if( YnpD > 0 )
537  {
538  dst = XprocD + MModSub( YmyprocD, YprocD, YnprocsD );
539  dst = MPosMod( dst, XnprocsD );
540  if( XisRow ) { rdst = XprocR; cdst = dst; }
541  else { rdst = dst; cdst = XprocR; }
542 
543  if( ( myrow == rdst ) && ( mycol == cdst ) )
544  {
545  dot( &YnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
546  size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld,
547  size ), &Ylinc );
548  }
549  else
550  {
551  if( YisRow )
552  Csgesd2d( ctxt, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj,
553  Yld, size ), Yld, rdst, cdst );
554  else
555  Csgesd2d( ctxt, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj,
556  Yld, size ), Yld, rdst, cdst );
557  }
558  }
559  }
560  if( XmyprocR == XprocR )
561  {
562 /*
563 * The processes owning a piece of sub( X ) receive the corresponding local
564 * piece of sub( Y ), compute the local dot product and combine the results
565 * within sub( X )'s scope.
566 */
567  XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD,
568  XnprocsD );
569  if( XnpD > 0 )
570  {
571  src = YprocD + MModSub( XmyprocD, XprocD, XnprocsD );
572  src = MPosMod( src, YnprocsD );
573  if( YisRow ) { rsrc = YprocR; csrc = src; }
574  else { rsrc = src; csrc = YprocR; }
575  if( ( myrow != rsrc ) || ( mycol != csrc ) )
576  {
577  buf = PB_Cmalloc( XnpD * size );
578  if( YisRow )
579  Csgerv2d( ctxt, 1, XnpD, buf, 1, rsrc, csrc );
580  else
581  Csgerv2d( ctxt, XnpD, 1, buf, XnpD, rsrc, csrc );
582  dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
583  size ), &Xlinc, buf, &ione );
584  if( buf ) free( buf );
585  }
586  }
587  if( XisRow )
588  {
589  top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
590  Csgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 );
591  }
592  else
593  {
594  top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
595  Csgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 );
596  }
597  }
598  }
599  else
600  {
601 /*
602 * General case, copy sub( Y ) within sub( X )'s scope, compute the local
603 * results and combine them within sub( X )'s scope.
604 */
605  XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD );
606 
607  if( XisRow )
608  {
609  PB_Cdescset( dbuf, 1, *N, 1, Xinb1D, 1, XnbD, XprocR, XprocD, ctxt,
610  1 );
611  }
612  else
613  {
614  PB_Cdescset( dbuf, *N, 1, Xinb1D, 1, XnbD, 1, XprocD, XprocR, ctxt,
615  MAX( 1, XnpD ) );
616  }
617  if( ( XmyprocR == XprocR ) && ( XnpD > 0 ) )
618  buf = PB_Cmalloc( XnpD * size );
619 
620  if( YisRow )
621  {
622  PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, ((char *) Y), Yi, Yj,
623  Yd, ROW, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW :
624  COLUMN ) );
625  }
626  else
627  {
628  PB_Cpaxpby( type, NOCONJG, *N, 1, type->one, ((char *) Y), Yi, Yj,
629  Yd, COLUMN, type->zero, buf, 0, 0, dbuf, ( XisRow ?
630  ROW : COLUMN ) );
631  }
632 
633  if( XmyprocR == XprocR )
634  {
635  if( XnpD > 0 )
636  {
637  dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
638  size ), &Xlinc, buf, &ione );
639  if( buf ) free( buf );
640  }
641  if( XisRow )
642  {
643  top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
644  Csgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 );
645  }
646  else
647  {
648  top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
649  Csgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 );
650  }
651  }
652  }
653 /*
654 * Send the DOT product result within sub( Y )'s scope
655 */
656  if( XisR || YisR )
657  {
658 /*
659 * Either sub( X ) or sub( Y ) are replicated, so that every process should have
660 * the result -> broadcast it orthogonally from sub( X )'s direction.
661 */
662  if( XisRow )
663  {
664  top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
665  if( XmyprocR == XprocR )
666  Csgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 );
667  else
668  Csgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR,
669  XmyprocD );
670  }
671  else
672  {
673  top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
674  if( XmyprocR == XprocR )
675  Csgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 );
676  else
677  Csgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, XmyprocD,
678  XprocR );
679  }
680  }
681  else
682  {
683 /*
684 * Neither sub( X ) nor sub( Y ) are replicated
685 */
686  if( RRorCC )
687  {
688 /*
689 * Both sub( X ) are distributed in the same direction -> the process row or
690 * column XprocR sends the result to the process row or column YprocR.
691 */
692  if( XprocR != YprocR )
693  {
694  if( XmyprocR == XprocR )
695  {
696  if( XisRow )
697  Csgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YprocR,
698  YmyprocD );
699  else
700  Csgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YmyprocD,
701  YprocR );
702  }
703  else if( YmyprocR == YprocR )
704  {
705  if( XisRow )
706  Csgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XprocR,
707  XmyprocD );
708  else
709  Csgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XmyprocD,
710  XprocR );
711  }
712  }
713  }
714  else
715  {
716 /*
717 * Otherwise, the process at the intersection of sub( X )'s and sub( Y )'s
718 * scope, broadcast the result within sub( Y )'s scope.
719 */
720  if( YmyprocR == YprocR )
721  {
722  if( YisRow )
723  {
724  top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
725  if( YmyprocD == XprocR )
726  Csgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 );
727  else
728  Csgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1,
729  YprocR, XprocR );
730  }
731  else
732  {
733  top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
734  if( YmyprocD == XprocR )
735  Csgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 );
736  else
737  Csgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1,
738  XprocR, YprocR );
739  }
740  }
741  }
742  }
743  }
744  else if( !( XisD ) && YisD )
745  {
746 /*
747 * sub( X ) is not distributed and sub( Y ) is distributed.
748 */
749  type = PB_Cstypeset();
750  PB_CpdotND( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX,
751  ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu );
752  }
753  else if( XisD && !( YisD ) )
754  {
755 /*
756 * sub( X ) is distributed and sub( Y ) is not distributed.
757 */
758  type = PB_Cstypeset();
759  PB_CpdotND( type, *N, ((char *) DOT), ((char *) Y), Yi, Yj, Yd, *INCY,
760  ((char *) X), Xi, Xj, Xd, *INCX, type->Fvvdotu );
761  }
762  else
763  {
764 /*
765 * Neither sub( X ) nor sub( Y ) are distributed
766 */
767  type = PB_Cstypeset();
768  PB_CpdotNN( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX,
769  ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu );
770  }
771 /*
772 * End of PSDOT
773 */
774 }
M_
#define M_
Definition: PBtools.h:39
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
PB_Cpaxpby
void PB_Cpaxpby()
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
PB_CpdotND
void PB_CpdotND()
PBblacs.h
PBtools.h
Csgesd2d
void Csgesd2d()
Csgebs2d
void Csgebs2d()
PBblas.h
NOCONJG
#define NOCONJG
Definition: PBblas.h:45
REAL_PART
#define REAL_PART
Definition: pblas.h:135
PBpblas.h
psdot_
void psdot_(int *N, float *DOT, float *X, int *IX, int *JX, int *DESCX, int *INCX, float *Y, int *IY, int *JY, int *DESCY, int *INCY)
Definition: psdot_.c:25
DLEN_
#define DLEN_
Definition: PBtools.h:48
MPosMod
#define MPosMod(I, d)
Definition: PBtools.h:95
LLD_
#define LLD_
Definition: PBtools.h:47
Csgebr2d
void Csgebr2d()
CROW
#define CROW
Definition: PBblacs.h:21
ZERO
#define ZERO
Definition: PBtools.h:66
PB_Cchkvec
void PB_Cchkvec()
IMB_
#define IMB_
Definition: PBtools.h:41
MModSub
#define MModSub(I1, I2, d)
Definition: PBtools.h:102
PB_Cdescset
void PB_Cdescset()
PB_Cabort
void PB_Cabort()
MModAdd1
#define MModAdd1(I, d)
Definition: PBtools.h:100
PB_CpdotNN
void PB_CpdotNN()
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
Csgerv2d
void Csgerv2d()
Mfirstnb
#define Mfirstnb(inbt_, n_, i_, inb_, nb_)
Definition: PBtools.h:139
PB_Ctop
char * PB_Ctop()
RSRC_
#define RSRC_
Definition: PBtools.h:45
PBTYP_T::one
char * one
Definition: pblas.h:331
PB_Cpdot11
void PB_Cpdot11()
PB_CargFtoC
void PB_CargFtoC()
BCAST
#define BCAST
Definition: PBblacs.h:48
VVDOT_T
F_VOID_FCT(* VVDOT_T)()
Definition: pblas.h:286
COMBINE
#define COMBINE
Definition: PBblacs.h:49
PBTYP_T::size
int size
Definition: pblas.h:329
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cnumroc
int PB_Cnumroc()
PB_Cmalloc
char * PB_Cmalloc()
PB_Cstypeset
PBTYP_T * PB_Cstypeset()
Definition: PB_Cstypeset.c:19
CCOLUMN
#define CCOLUMN
Definition: PBblacs.h:20
INB_
#define INB_
Definition: PBtools.h:42
PBTYP_T::Fvvdotu
VVDOT_T Fvvdotu
Definition: pblas.h:353
MAX
#define MAX(a_, b_)
Definition: PBtools.h:77
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
pblas.h
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CTXT_
#define CTXT_
Definition: PBtools.h:38
PBTYP_T::zero
char * zero
Definition: pblas.h:331
Csgsum2d
void Csgsum2d()