ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CpdotND.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18 
19 #ifdef __STDC__
20 void PB_CpdotND( PBTYP_T * TYPE, int N, char * DOT,
21  char * X, int IX, int JX, int * DESCX, int INCX,
22  char * Y, int IY, int JY, int * DESCY, int INCY,
23  VVDOT_T FDOT )
24 #else
25 void PB_CpdotND( TYPE, N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY,
26  INCY, FDOT )
27 /*
28 * .. Scalar Arguments ..
29 */
30  int INCX, INCY, IX, IY, JX, JY, N;
31  char * DOT;
32  PBTYP_T * TYPE;
33  VVDOT_T FDOT;
34 /*
35 * .. Array Arguments ..
36 */
37  int * DESCX, * DESCY;
38  char * X, * Y;
39 #endif
40 {
41 /*
42 * Purpose
43 * =======
44 *
45 * PB_CpdotND forms the dot product of two subvectors,
46 *
47 * DOT := sub( X )**T * sub( Y ) or DOT := sub( X )**H * sub( Y ),
48 *
49 * where
50 *
51 * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
52 * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and,
53 *
54 * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y,
55 * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y.
56 *
57 * sub( X ) is assumed to be not distributed, and sub( Y ) is assumed to
58 * be distributed.
59 *
60 * Notes
61 * =====
62 *
63 * A description vector is associated with each 2D block-cyclicly dis-
64 * tributed matrix. This vector stores the information required to
65 * establish the mapping between a matrix entry and its corresponding
66 * process and memory location.
67 *
68 * In the following comments, the character _ should be read as
69 * "of the distributed matrix". Let A be a generic term for any 2D
70 * block cyclicly distributed matrix. Its description vector is DESC_A:
71 *
72 * NOTATION STORED IN EXPLANATION
73 * ---------------- --------------- ------------------------------------
74 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
75 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
76 * the NPROW x NPCOL BLACS process grid
77 * A is distributed over. The context
78 * itself is global, but the handle
79 * (the integer value) may vary.
80 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
81 * ted matrix A, M_A >= 0.
82 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
83 * buted matrix A, N_A >= 0.
84 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
85 * block of the matrix A, IMB_A > 0.
86 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
87 * left block of the matrix A,
88 * INB_A > 0.
89 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
90 * bute the last M_A-IMB_A rows of A,
91 * MB_A > 0.
92 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
93 * bute the last N_A-INB_A columns of
94 * A, NB_A > 0.
95 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
96 * row of the matrix A is distributed,
97 * NPROW > RSRC_A >= 0.
98 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
99 * first column of A is distributed.
100 * NPCOL > CSRC_A >= 0.
101 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
102 * array storing the local blocks of
103 * the distributed matrix A,
104 * IF( Lc( 1, N_A ) > 0 )
105 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
106 * ELSE
107 * LLD_A >= 1.
108 *
109 * Let K be the number of rows of a matrix A starting at the global in-
110 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
111 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
112 * receive if these K rows were distributed over NPROW processes. If K
113 * is the number of columns of a matrix A starting at the global index
114 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
115 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
116 * these K columns were distributed over NPCOL processes.
117 *
118 * The values of Lr() and Lc() may be determined via a call to the func-
119 * tion PB_Cnumroc:
120 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
121 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
122 *
123 * Arguments
124 * =========
125 *
126 * TYPE (local input) pointer to a PBTYP_T structure
127 * On entry, TYPE is a pointer to a structure of type PBTYP_T,
128 * that contains type information (See pblas.h).
129 *
130 * N (global input) INTEGER
131 * On entry, N specifies the length of the subvectors to be
132 * multiplied. N must be at least zero.
133 *
134 * DOT (local output) pointer to CHAR
135 * On exit, DOT specifies the dot product of the two subvectors
136 * sub( X ) and sub( Y ) only in their scope (See below for fur-
137 * ther details).
138 *
139 * X (local input) pointer to CHAR
140 * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
141 * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and
142 * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least
143 * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise.
144 * Before entry, this array contains the local entries of the
145 * matrix X.
146 *
147 * IX (global input) INTEGER
148 * On entry, IX specifies X's global row index, which points to
149 * the beginning of the submatrix sub( X ).
150 *
151 * JX (global input) INTEGER
152 * On entry, JX specifies X's global column index, which points
153 * to the beginning of the submatrix sub( X ).
154 *
155 * DESCX (global and local input) INTEGER array
156 * On entry, DESCX is an integer array of dimension DLEN_. This
157 * is the array descriptor for the matrix X.
158 *
159 * INCX (global input) INTEGER
160 * On entry, INCX specifies the global increment for the
161 * elements of X. Only two values of INCX are supported in
162 * this version, namely 1 and M_X. INCX must not be zero.
163 *
164 * Y (local input) pointer to CHAR
165 * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y
166 * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and
167 * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least
168 * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise.
169 * Before entry, this array contains the local entries of the
170 * matrix Y.
171 *
172 * IY (global input) INTEGER
173 * On entry, IY specifies Y's global row index, which points to
174 * the beginning of the submatrix sub( Y ).
175 *
176 * JY (global input) INTEGER
177 * On entry, JY specifies Y's global column index, which points
178 * to the beginning of the submatrix sub( Y ).
179 *
180 * DESCY (global and local input) INTEGER array
181 * On entry, DESCY is an integer array of dimension DLEN_. This
182 * is the array descriptor for the matrix Y.
183 *
184 * INCY (global input) INTEGER
185 * On entry, INCY specifies the global increment for the
186 * elements of Y. Only two values of INCY are supported in
187 * this version, namely 1 and M_Y. INCY must not be zero.
188 *
189 * FDOT (local input) pointer to a function of type VVDOT
190 * On entry, FDOT points to a subroutine that computes the local
191 * dot product of two vectors.
192 *
193 * Further Details
194 * ===============
195 *
196 * When the result of a vector-oriented PBLAS call is a scalar, this
197 * scalar is set only within the process scope which owns the vector(s)
198 * being operated on. Let sub( X ) be a generic term for the input vec-
199 * tor(s). Then, the processes owning the correct the answer is determi-
200 * ned as follows: if an operation involves more than one vector, the
201 * processes receiving the result will be the union of the following set
202 * of processes for each vector:
203 *
204 * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro-
205 * cess row or process column owns the vector operand, therefore only
206 * the process owning sub( X ) receives the correct result;
207 *
208 * If INCX = M_X, then sub( X ) is a vector distributed over a process
209 * row. Each process in this row receives the result;
210 *
211 * If INCX = 1, then sub( X ) is a vector distributed over a process
212 * column. Each process in this column receives the result;
213 *
214 * -- Written on April 1, 1998 by
215 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
216 *
217 * ---------------------------------------------------------------------
218 */
219 /*
220 * .. Local Scalars ..
221 */
222  char * top;
223  int RRorCC, Xcol, Xii, XisR, XisRow, Xjj, Xld, Xlinc, XmyprocD,
224  XmyprocR, XnprocsD, XnprocsR, XprocR, Xroc, Xrow, Ycol, Yii,
225  Yinb1D, YisR, YisRow, Yjj, Yld, Ylinc, YmyprocD, YmyprocR,
226  YnbD, YnpD, YnprocsD, YnprocsR, YprocD, YprocR, Yroc, Yrow,
227  ctxt, ione=1, k, kbb, kk, kn, ktmp, mycol, mydist, myproc,
228  myrow, npcol, nprow, p, size;
229 /*
230 * .. Local Arrays ..
231 */
232  char * Xptr = NULL, * Yptr = NULL, * buf = NULL;
233 /* ..
234 * .. Executable Statements ..
235 *
236 */
237 /*
238 * Retrieve process grid information
239 */
240  Cblacs_gridinfo( ( ctxt = DESCX[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
241 /*
242 * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol ...
243 */
244  PB_Cinfog2l( IX, JX, DESCX, nprow, npcol, myrow, mycol, &Xii, &Xjj,
245  &Xrow, &Xcol );
246  if( ( XisRow = ( INCX == DESCX[M_] ) ) != 0 )
247  {
248  Xld = DESCX[LLD_]; Xlinc = Xld;
249  XmyprocD = mycol; XnprocsD = npcol;
250  XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow;
251  XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) );
252  }
253  else
254  {
255  Xld = DESCX[LLD_]; Xlinc = 1;
256  XmyprocD = myrow; XnprocsD = nprow;
257  XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol;
258  XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) );
259  }
260 /*
261 * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol ...
262 */
263  PB_Cinfog2l( IY, JY, DESCY, nprow, npcol, myrow, mycol, &Yii, &Yjj,
264  &Yrow, &Ycol );
265  if( ( YisRow = ( INCY == DESCY[M_] ) ) != 0 )
266  {
267  YnbD = DESCY[NB_]; Yld = DESCY[LLD_]; Ylinc = Yld;
268  YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow;
269  YprocD = Ycol; YmyprocD = mycol; YnprocsD = npcol;
270  Yinb1D = PB_Cfirstnb( N, JY, DESCY[INB_], YnbD );
271  }
272  else
273  {
274  YnbD = DESCY[MB_]; Yld = DESCY[LLD_]; Ylinc = 1;
275  YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol;
276  YprocD = Yrow; YmyprocD = myrow; YnprocsD = nprow;
277  Yinb1D = PB_Cfirstnb( N, IY, DESCY[IMB_], YnbD );
278  }
279  YisR = ( ( YprocR == -1 ) || ( YnprocsR == 1 ) );
280 /*
281 * Are sub( X ) and sub( Y ) both row or column vectors ?
282 */
283  RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) );
284 /*
285 * sub( X ) is not distributed and sub( Y ) is distributed
286 */
287  if( !( XisR ) )
288  {
289 /*
290 * sub( X ) is not replicated. Since this operation is local if sub( X ) and
291 * sub( Y ) are both row or column vectors, choose YprocR = XprocR when RRorCC,
292 * and YprocR = 0 otherwise.
293 */
294  if( YisR ) { YprocR = ( ( RRorCC ) ? XprocR : 0 ); }
295 /*
296 * Now, it is just like sub( Y ) is not replicated, this information however is
297 * kept in YisR for later use.
298 */
299  if( RRorCC )
300  {
301 /*
302 * sub( X ) and sub( Y ) are both row or column vectors
303 */
304  if( XprocR == YprocR )
305  {
306 /*
307 * sub( X ) and sub( Y ) are in the same process row or column
308 */
309  if( ( XmyprocR == XprocR ) || ( YmyprocR == YprocR ) )
310  {
311  size = TYPE->size;
312  YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD,
313  YnprocsD );
314 /*
315 * In a given process, the dot product is computed with sub( Y ) and the cor-
316 * responding non distributed part of sub( X ). In the other processes, this
317 * part of sub( X ) is simply ignored.
318 */
319  if( YnpD > 0 )
320  {
321  Yroc = YprocD;
322  if( XisRow ) { kk = Yjj; ktmp = JX + N; kn = JX + Yinb1D; }
323  else { kk = Yii; ktmp = IX + N; kn = IX + Yinb1D; }
324 
325  if( YmyprocD == Yroc )
326  {
327  FDOT( &Yinb1D, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc,
328  Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc );
329  kk += Yinb1D;
330  }
331  Yroc = MModAdd1( Yroc, YnprocsD );
332 
333  for( k = kn; k < ktmp; k += YnbD )
334  {
335  kbb = ktmp - k; kbb = MIN( kbb, YnbD );
336  if( YmyprocD == Yroc )
337  {
338  if( XisRow )
339  FDOT( &kbb, DOT, Mptr( X, Xii, k, Xld, size ),
340  &Xlinc, Mptr( Y, Yii, kk, Yld, size ),
341  &Ylinc );
342  else
343  FDOT( &kbb, DOT, Mptr( X, k, Xjj, Xld, size ),
344  &Xlinc, Mptr( Y, kk, Yjj, Yld, size ),
345  &Ylinc );
346  kk += kbb;
347  }
348  Yroc = MModAdd1( Yroc, YnprocsD );
349  }
350  }
351 /*
352 * Replicate locally scattered dot product by reducing it
353 */
354  if( XisRow )
355  {
356  top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
357  TYPE->Cgsum2d( ctxt, ROW, top, 1, 1, DOT, 1, -1, 0 );
358  }
359  else
360  {
361  top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
362  TYPE->Cgsum2d( ctxt, COLUMN, top, 1, 1, DOT, 1, -1, 0 );
363  }
364  }
365  }
366  else
367  {
368 /*
369 * sub( X ) and sub( Y ) are in a different process row or column
370 */
371  if( YmyprocR == YprocR )
372  {
373  size = TYPE->size;
374  YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD,
375  YnprocsD );
376 /*
377 * If I own a piece of sub( Y ), then send it to the process row or column where
378 * sub( X ) resides and receive the dot product when sub( Y ) is not replicated.
379 */
380  if( YisRow )
381  {
382  if( YnpD > 0 )
383  TYPE->Cgesd2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj, Yld,
384  size ), Yld, XprocR, YmyprocD );
385  TYPE->Cgerv2d( ctxt, 1, 1, DOT, 1, XprocR, XmyprocD );
386  }
387  else
388  {
389  if( YnpD > 0 )
390  TYPE->Cgesd2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj, Yld,
391  size ), Yld, YmyprocD, XprocR );
392  TYPE->Cgerv2d( ctxt, 1, 1, DOT, 1, XmyprocD, XprocR );
393  }
394  }
395 
396  if( XmyprocR == XprocR )
397  {
398  size = TYPE->size;
399  YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD,
400  YnprocsD );
401 /*
402 * If I own sub( X ), then receive the distributed part of sub( Y ) owned by
403 * the process where sub( Y ) resides in my row or column. Compute the partial
404 * dot product as if sub( Y ) would reside in the same process row or column as
405 * sub( X ). Combine the local results.
406 */
407  if( YnpD > 0 )
408  {
409  buf = PB_Cmalloc( YnpD * size );
410  if( YisRow )
411  TYPE->Cgerv2d( ctxt, 1, YnpD, buf, 1, YprocR,
412  XmyprocD );
413  else
414  TYPE->Cgerv2d( ctxt, YnpD, 1, buf, YnpD, XmyprocD,
415  YprocR );
416 
417  Yroc = YprocD;
418  kk = 0;
419  if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; }
420  else { ktmp = IX + N; kn = IX + Yinb1D; }
421 
422  if( YmyprocD == Yroc )
423  {
424  FDOT( &Yinb1D, DOT, Mptr( X, Xii, Xjj, Xld, size ),
425  &Xlinc, buf, &ione );
426  kk += Yinb1D;
427  }
428  Yroc = MModAdd1( Yroc, YnprocsD );
429 
430  for( k = kn; k < ktmp; k += YnbD )
431  {
432  kbb = ktmp - k; kbb = MIN( kbb, YnbD );
433  if( YmyprocD == Yroc )
434  {
435  if( XisRow )
436  FDOT( &kbb, DOT, Mptr( X, Xii, k, Xld, size ),
437  &Xlinc, buf+kk*size, &ione );
438  else
439  FDOT( &kbb, DOT, Mptr( X, k, Xjj, Xld, size ),
440  &Xlinc, buf+kk*size, &ione );
441  kk += kbb;
442  }
443  Yroc = MModAdd1( Yroc, YnprocsD );
444  }
445  if( buf ) free( buf );
446  }
447 /*
448 * Combine the local results within the process row or column XprocR and
449 * send the result to the process row or column YprocR when sub( Y ) is not
450 * replicated.
451 */
452  if( XisRow )
453  {
454  top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
455  TYPE->Cgsum2d( ctxt, ROW, top, 1, 1, DOT, 1, -1, 0 );
456  if( !YisR )
457  TYPE->Cgesd2d( ctxt, 1, 1, DOT, 1, YprocR, YmyprocD );
458  }
459  else
460  {
461  top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
462  TYPE->Cgsum2d( ctxt, COLUMN, top, 1, 1, DOT, 1, -1, 0 );
463  if( !YisR )
464  TYPE->Cgesd2d( ctxt, 1, 1, DOT, 1, YmyprocD, YprocR );
465  }
466  }
467  }
468 
469  if( YisR )
470  {
471 /*
472 * If sub( Y ) is replicated, then bcast the result
473 */
474  if( XisRow )
475  {
476  top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
477  if( XmyprocR == XprocR )
478  TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 );
479  else
480  TYPE->Cgebr2d( ctxt, COLUMN, top, 1, 1, DOT, 1, XprocR,
481  XmyprocD );
482  }
483  else
484  {
485  top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
486  if( XmyprocR == XprocR )
487  TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 );
488  else
489  TYPE->Cgebr2d( ctxt, ROW, top, 1, 1, DOT, 1, XmyprocD,
490  XprocR );
491  }
492  }
493  }
494  else
495  {
496 /*
497 * sub( X ) and sub( Y ) are not both row or column vectors
498 */
499  if( ( XmyprocR == XprocR ) || ( YmyprocR == YprocR ) )
500  {
501  size = TYPE->size;
502  Xroc = 0;
503  if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; }
504  else { ktmp = IX + N; kn = IX + Yinb1D; }
505 /*
506 * Loop over the processes in which sub( Y ) resides, for each process find the
507 * next process Xroc and compute the dot product. After this, it will be needed
508 * to reduce the local dot produsts as above.
509 */
510  for( p = 0; p < YnprocsD; p++ )
511  {
512  mydist = MModSub( p, YprocD, YnprocsD );
513  myproc = MModAdd( YprocD, mydist, YnprocsD );
514 
515  if( ( XprocR == p ) && ( YprocR == Xroc ) )
516  {
517 /*
518 * Compute locally the partial dot product at the intersection of the process
519 * cross.
520 */
521  if( ( XmyprocR == p ) && ( XmyprocD == Xroc ) )
522  {
523  YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD,
524  YnprocsD );
525  if( YnpD > 0 )
526  {
527  Yroc = YprocD;
528  kk = ( XisRow ? Yii : Yjj );
529 
530  if( myproc == Yroc )
531  {
532  FDOT( &Yinb1D, DOT, Mptr( X, Xii, Xjj, Xld, size ),
533  &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ),
534  &Ylinc );
535  kk += Yinb1D;
536  }
537  Yroc = MModAdd1( Yroc, YnprocsD );
538 
539  for( k = kn; k < ktmp; k += YnbD )
540  {
541  kbb = ktmp - k; kbb = MIN( kbb, YnbD );
542  if( myproc == Yroc )
543  {
544  if( XisRow )
545  FDOT( &kbb, DOT, Mptr( X, Xii, k, Xld, size ),
546  &Xlinc, Mptr( Y, kk, Yjj, Yld, size ),
547  &Ylinc );
548  else
549  FDOT( &kbb, DOT, Mptr( X, k, Xjj, Xld, size ),
550  &Xlinc, Mptr( Y, Yii, kk, Yld, size ),
551  &Ylinc );
552  kk += kbb;
553  }
554  Yroc = MModAdd1( Yroc, YnprocsD );
555  }
556  }
557  }
558  }
559  else
560  {
561 /*
562 * Message exchange
563 */
564  if( ( YmyprocR == YprocR ) && ( YmyprocD == p ) )
565  {
566  YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD,
567  YnprocsD );
568  if( YnpD > 0 )
569  {
570  if( XisRow )
571  TYPE->Cgesd2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj, Yld,
572  size ), Yld, XprocR, Xroc );
573  else
574  TYPE->Cgesd2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj, Yld,
575  size ), Yld, Xroc, XprocR );
576  }
577  }
578 
579  if( ( XmyprocR == XprocR ) && ( XmyprocD == Xroc ) )
580  {
581  YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD,
582  YnprocsD );
583  if( YnpD > 0 )
584  {
585  buf = PB_Cmalloc( YnpD * size );
586  Yroc = YprocD;
587  kk = 0;
588 /*
589 * Receive the piece of sub( Y ) that I should handle
590 */
591  if( XisRow )
592  TYPE->Cgerv2d( ctxt, YnpD, 1, buf, YnpD, p, YprocR );
593  else
594  TYPE->Cgerv2d( ctxt, 1, YnpD, buf, 1, YprocR, p );
595 
596  if( myproc == Yroc )
597  {
598  FDOT( &Yinb1D, DOT, Mptr( X, Xii, Xjj, Xld, size ),
599  &Xlinc, buf, &ione );
600  kk += Yinb1D;
601  }
602  Yroc = MModAdd1( Yroc, YnprocsD );
603 
604  for( k = kn; k < ktmp; k += YnbD )
605  {
606  kbb = ktmp - k; kbb = MIN( kbb, YnbD );
607  if( myproc == Yroc )
608  {
609  if( XisRow )
610  FDOT( &kbb, DOT, Mptr( X, Xii, k, Xld, size ),
611  &Xlinc, buf+kk*size, &ione );
612  else
613  FDOT( &kbb, DOT, Mptr( X, k, Xjj, Xld, size ),
614  &Xlinc, buf+kk*size, &ione );
615  kk += kbb;
616  }
617  Yroc = MModAdd1( Yroc, YnprocsD );
618  }
619  if( buf ) free( buf );
620  }
621  }
622  }
623  Xroc = MModAdd1( Xroc, XnprocsD );
624  }
625 /*
626 * Combine the local results in sub( X )'s scope
627 */
628  if( XmyprocR == XprocR )
629  {
630  if( XisRow )
631  {
632  top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
633  TYPE->Cgsum2d( ctxt, ROW, top, 1, 1, DOT, 1, -1, 0 );
634  }
635  else
636  {
637  top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
638  TYPE->Cgsum2d( ctxt, COLUMN, top, 1, 1, DOT, 1, -1, 0 );
639  }
640  }
641  }
642 /*
643 * Broadcast the result in sub( Y )'s scope
644 */
645  if( YisR || ( YmyprocR == YprocR ) )
646  {
647  if( YisRow )
648  {
649  top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
650  if( XmyprocR == XprocR )
651  TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 );
652  else
653  TYPE->Cgebr2d( ctxt, ROW, top, 1, 1, DOT, 1, YmyprocR,
654  XprocR );
655  }
656  else
657  {
658  top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
659  if( XmyprocR == XprocR )
660  TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 );
661  else
662  TYPE->Cgebr2d( ctxt, COLUMN, top, 1, 1, DOT, 1, XprocR,
663  YmyprocR );
664  }
665  }
666  }
667  }
668  else
669  {
670 /*
671 * sub( X ) is replicated in every process. Compute the local dot product in
672 * process row or column YprocR when sub( Y ) is not replicated and in every
673 * process otherwise.
674 */
675  if( YisR || ( YmyprocR == YprocR ) )
676  {
677  size = TYPE->size;
678  Yroc = YprocD;
679  kk = ( YisRow ? Yjj : Yii );
680 
681  if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; }
682  else { ktmp = IX + N; kn = IX + Yinb1D; }
683 
684  if( YmyprocD == Yroc )
685  {
686  FDOT( &Yinb1D, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y,
687  Yii, Yjj, Yld, size ), &Ylinc );
688  kk += Yinb1D;
689  }
690  Yroc = MModAdd1( Yroc, YnprocsD );
691 
692  for( k = kn; k < ktmp; k += YnbD )
693  {
694  kbb = ktmp - k; kbb = MIN( kbb, YnbD );
695  if( YmyprocD == Yroc )
696  {
697  if( XisRow ) { Xptr = Mptr( X, Xii, k, Xld, size ); }
698  else { Xptr = Mptr( X, k, Xjj, Xld, size ); }
699  if( YisRow ) { Yptr = Mptr( Y, Yii, kk, Yld, size ); }
700  else { Yptr = Mptr( Y, kk, Yjj, Yld, size ); }
701  FDOT( &kbb, DOT, Xptr, &Xlinc, Yptr, &Ylinc );
702  kk += kbb;
703  }
704  Yroc = MModAdd1( Yroc, YnprocsD );
705  }
706  }
707 
708  if( YisR )
709  {
710 /*
711 * sub( Y ) is replicated, combine the results in each process row or column.
712 */
713  if( YisRow )
714  {
715  top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
716  TYPE->Cgsum2d( ctxt, ROW, top, 1, 1, DOT, 1, -1, 0 );
717  }
718  else
719  {
720  top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
721  TYPE->Cgsum2d( ctxt, COLUMN, top, 1, 1, DOT, 1, -1, 0 );
722  }
723  }
724  else
725  {
726 /*
727 * sub( Y ) is not replicated, combine the results in the entire grid at once.
728 */
729  top = PB_Ctop( &ctxt, COMBINE, ALL, TOP_GET );
730  TYPE->Cgsum2d( ctxt, ALL, top, 1, 1, DOT, 1, -1, 0 );
731  }
732  }
733 /*
734 * End of PB_CpdotND
735 */
736 }
M_
#define M_
Definition: PBtools.h:39
TYPE
#define TYPE
Definition: clamov.c:7
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
PB_CpdotND
void PB_CpdotND(PBTYP_T *TYPE, int N, char *DOT, char *X, int IX, int JX, int *DESCX, int INCX, char *Y, int IY, int JY, int *DESCY, int INCY, VVDOT_T FDOT)
Definition: PB_CpdotND.c:25
PB_Cfirstnb
int PB_Cfirstnb()
LLD_
#define LLD_
Definition: PBtools.h:47
MModAdd
#define MModAdd(I1, I2, d)
Definition: PBtools.h:97
IMB_
#define IMB_
Definition: PBtools.h:41
MModSub
#define MModSub(I1, I2, d)
Definition: PBtools.h:102
MModAdd1
#define MModAdd1(I, d)
Definition: PBtools.h:100
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
PB_Ctop
char * PB_Ctop()
BCAST
#define BCAST
Definition: PBblacs.h:48
VVDOT_T
F_VOID_FCT(* VVDOT_T)()
Definition: pblas.h:286
COMBINE
#define COMBINE
Definition: PBblacs.h:49
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cnumroc
int PB_Cnumroc()
PB_Cmalloc
char * PB_Cmalloc()
ALL
#define ALL
Definition: PBblas.h:50
MIN
#define MIN(a_, b_)
Definition: PBtools.h:76
INB_
#define INB_
Definition: PBtools.h:42
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CTXT_
#define CTXT_
Definition: PBtools.h:38