SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
All Classes Files Functions Variables Typedefs Macros

◆ psdot_()

void psdot_ ( Int N,
float *  DOT,
float *  X,
Int IX,
Int JX,
Int DESCX,
Int INCX,
float *  Y,
Int IY,
Int JY,
Int DESCY,
Int INCY 
)

Definition at line 25 of file psdot_.c.

37{
38/*
39* Purpose
40* =======
41*
42* PSDOT forms the dot product of two subvectors,
43*
44* DOT := sub( X )**T * sub( Y ),
45*
46* where
47*
48* sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
49* X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and,
50*
51* sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y,
52* Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y.
53*
54* Notes
55* =====
56*
57* A description vector is associated with each 2D block-cyclicly dis-
58* tributed matrix. This vector stores the information required to
59* establish the mapping between a matrix entry and its corresponding
60* process and memory location.
61*
62* In the following comments, the character _ should be read as
63* "of the distributed matrix". Let A be a generic term for any 2D
64* block cyclicly distributed matrix. Its description vector is DESC_A:
65*
66* NOTATION STORED IN EXPLANATION
67* ---------------- --------------- ------------------------------------
68* DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
69* CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
70* the NPROW x NPCOL BLACS process grid
71* A is distributed over. The context
72* itself is global, but the handle
73* (the integer value) may vary.
74* M_A (global) DESCA[ M_ ] The number of rows in the distribu-
75* ted matrix A, M_A >= 0.
76* N_A (global) DESCA[ N_ ] The number of columns in the distri-
77* buted matrix A, N_A >= 0.
78* IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
79* block of the matrix A, IMB_A > 0.
80* INB_A (global) DESCA[ INB_ ] The number of columns of the upper
81* left block of the matrix A,
82* INB_A > 0.
83* MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
84* bute the last M_A-IMB_A rows of A,
85* MB_A > 0.
86* NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
87* bute the last N_A-INB_A columns of
88* A, NB_A > 0.
89* RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
90* row of the matrix A is distributed,
91* NPROW > RSRC_A >= 0.
92* CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
93* first column of A is distributed.
94* NPCOL > CSRC_A >= 0.
95* LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
96* array storing the local blocks of
97* the distributed matrix A,
98* IF( Lc( 1, N_A ) > 0 )
99* LLD_A >= MAX( 1, Lr( 1, M_A ) )
100* ELSE
101* LLD_A >= 1.
102*
103* Let K be the number of rows of a matrix A starting at the global in-
104* dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
105* that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
106* receive if these K rows were distributed over NPROW processes. If K
107* is the number of columns of a matrix A starting at the global index
108* JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
109* lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
110* these K columns were distributed over NPCOL processes.
111*
112* The values of Lr() and Lc() may be determined via a call to the func-
113* tion PB_Cnumroc:
114* Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
115* Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
116*
117* Arguments
118* =========
119*
120* N (global input) INTEGER
121* On entry, N specifies the length of the subvectors to be
122* multiplied. N must be at least zero.
123*
124* DOT (local output) REAL array
125* On exit, DOT specifies the dot product of the two subvectors
126* sub( X ) and sub( Y ) only in their scope (See below for fur-
127* ther details).
128*
129* X (local input) REAL array
130* On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
131* is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and
132* MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least
133* Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise.
134* Before entry, this array contains the local entries of the
135* matrix X.
136*
137* IX (global input) INTEGER
138* On entry, IX specifies X's global row index, which points to
139* the beginning of the submatrix sub( X ).
140*
141* JX (global input) INTEGER
142* On entry, JX specifies X's global column index, which points
143* to the beginning of the submatrix sub( X ).
144*
145* DESCX (global and local input) INTEGER array
146* On entry, DESCX is an integer array of dimension DLEN_. This
147* is the array descriptor for the matrix X.
148*
149* INCX (global input) INTEGER
150* On entry, INCX specifies the global increment for the
151* elements of X. Only two values of INCX are supported in
152* this version, namely 1 and M_X. INCX must not be zero.
153*
154* Y (local input) REAL array
155* On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y
156* is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and
157* MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least
158* Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise.
159* Before entry, this array contains the local entries of the
160* matrix Y.
161*
162* IY (global input) INTEGER
163* On entry, IY specifies Y's global row index, which points to
164* the beginning of the submatrix sub( Y ).
165*
166* JY (global input) INTEGER
167* On entry, JY specifies Y's global column index, which points
168* to the beginning of the submatrix sub( Y ).
169*
170* DESCY (global and local input) INTEGER array
171* On entry, DESCY is an integer array of dimension DLEN_. This
172* is the array descriptor for the matrix Y.
173*
174* INCY (global input) INTEGER
175* On entry, INCY specifies the global increment for the
176* elements of Y. Only two values of INCY are supported in
177* this version, namely 1 and M_Y. INCY must not be zero.
178*
179* Further Details
180* ===============
181*
182* When the result of a vector-oriented PBLAS call is a scalar, this
183* scalar is set only within the process scope which owns the vector(s)
184* being operated on. Let sub( X ) be a generic term for the input vec-
185* tor(s). Then, the processes owning the correct the answer is determi-
186* ned as follows: if an operation involves more than one vector, the
187* processes receiving the result will be the union of the following set
188* of processes for each vector:
189*
190* If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro-
191* cess row or process column owns the vector operand, therefore only
192* the process owning sub( X ) receives the correct result;
193*
194* If INCX = M_X, then sub( X ) is a vector distributed over a process
195* row. Each process in this row receives the result;
196*
197* If INCX = 1, then sub( X ) is a vector distributed over a process
198* column. Each process in this column receives the result;
199*
200* -- Written on April 1, 1998 by
201* Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
202*
203* ---------------------------------------------------------------------
204*/
205/*
206* .. Local Scalars ..
207*/
208 char scope, * top;
209 Int OneBlock, OneDgrid, RRorCC, Square, Xcol, Xi, Xii, XinbD,
210 Xinb1D, XisD, XisR, XisRow, Xj, Xjj, Xld, Xlinc, XmyprocD,
211 XmyprocR, XnbD, XnpD, XnprocsD, XnprocsR, XprocD, XprocR,
212 Xrow, Ycol, Yi, Yii, YinbD, Yinb1D, YisD, YisR, YisRow, Yj,
213 Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnbD, YnpD, YnprocsD,
214 YnprocsR, YprocD, YprocR, Yrow, cdst, csrc, ctxt, dst, info,
215 ione=1, mycol, myrow, npcol, nprow, rdst, rsrc, size, src;
216 PBTYP_T * type;
217 VVDOT_T dot;
218/*
219* .. Local Arrays ..
220*/
221 char * buf = NULL;
222 Int Xd[DLEN_], Yd[DLEN_], dbuf[ DLEN_ ];
223/* ..
224* .. Executable Statements ..
225*
226*/
227 PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd );
228 PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd );
229#ifndef NO_ARGCHK
230/*
231* Test the input parameters
232*/
233 Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
234 if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) )
235 {
236 PB_Cchkvec( ctxt, "PSDOT", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info );
237 PB_Cchkvec( ctxt, "PSDOT", "Y", *N, 1, Yi, Yj, Yd, *INCY, 11, &info );
238 }
239 if( info ) { PB_Cabort( ctxt, "PSDOT", info ); return; }
240#endif
241 DOT[REAL_PART] = ZERO;
242/*
243* Quick return if possible
244*/
245 if( *N == 0 ) return;
246/*
247* Handle degenerate case
248*/
249 if( ( *N == 1 ) && ( ( Xd[ M_ ] == 1 ) || ( Yd[ M_ ] == 1 ) ) )
250 {
251 type = PB_Cstypeset();
252 PB_Cpdot11( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX,
253 ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu );
254 return;
255 }
256/*
257* Start the operations
258*/
259#ifdef NO_ARGCHK
260 Cblacs_gridinfo( ( ctxt = Xd[ CTXT_ ] ), &nprow, &npcol, &myrow, &mycol );
261#endif
262/*
263* Determine if sub( X ) is distributed or not
264*/
265 if( ( XisRow = ( *INCX == Xd[M_] ) ) != 0 )
266 XisD = ( ( Xd[CSRC_] >= 0 ) && ( ( XnprocsD = npcol ) > 1 ) );
267 else
268 XisD = ( ( Xd[RSRC_] >= 0 ) && ( ( XnprocsD = nprow ) > 1 ) );
269/*
270* Determine if sub( Y ) is distributed or not
271*/
272 if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 )
273 YisD = ( ( Yd[CSRC_] >= 0 ) && ( ( YnprocsD = npcol ) > 1 ) );
274 else
275 YisD = ( ( Yd[RSRC_] >= 0 ) && ( ( YnprocsD = nprow ) > 1 ) );
276/*
277* Are sub( X ) and sub( Y ) both row or column vectors ?
278*/
279 RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) );
280/*
281* XisD && YisD <=> both vector operands are indeed distributed
282*/
283 if( XisD && YisD )
284 {
285/*
286* Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol
287*/
288 PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj,
289 &Xrow, &Xcol );
290 if( XisRow )
291 {
292 XinbD = Xd[INB_]; XnbD = Xd[NB_];
293 Xld = Xd[LLD_]; Xlinc = Xld;
294 XprocD = Xcol; XmyprocD = mycol;
295 XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow;
296 XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) );
297 Mfirstnb( Xinb1D, *N, Xj, XinbD, XnbD );
298 }
299 else
300 {
301 XinbD = Xd[IMB_]; XnbD = Xd[MB_];
302 Xld = Xd[LLD_]; Xlinc = 1;
303 XprocD = Xrow; XmyprocD = myrow;
304 XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol;
305 XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) );
306 Mfirstnb( Xinb1D, *N, Xi, XinbD, XnbD );
307 }
308/*
309* Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol
310*/
311 PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj,
312 &Yrow, &Ycol );
313 if( YisRow )
314 {
315 YinbD = Yd[INB_]; YnbD = Yd[NB_];
316 Yld = Yd[LLD_]; Ylinc = Yld;
317 YprocD = Ycol; YmyprocD = mycol;
318 YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow;
319 YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) );
320 Mfirstnb( Yinb1D, *N, Yj, YinbD, YnbD );
321 }
322 else
323 {
324 YinbD = Yd[IMB_]; YnbD = Yd[MB_];
325 Yld = Yd[LLD_]; Ylinc = 1;
326 YprocD = Yrow; YmyprocD = myrow;
327 YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol;
328 YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) );
329 Mfirstnb( Yinb1D, *N, Yi, YinbD, YnbD );
330 }
331/*
332* Do sub( X ) and sub( Y ) span more than one process ?
333*/
334 OneDgrid = ( ( XnprocsD == 1 ) && ( YnprocsD == 1 ) );
335 OneBlock = ( ( Xinb1D >= *N ) && ( Yinb1D >= *N ) );
336/*
337* Are sub( X ) and sub( Y ) distributed in the same manner ?
338*/
339 Square = ( ( Xinb1D == Yinb1D ) && ( XnbD == YnbD ) &&
340 ( XnprocsD == YnprocsD ) );
341
342 if( !( XisR ) )
343 {
344/*
345* sub( X ) is not replicated
346*/
347 if( YisR )
348 {
349/*
350* If sub( X ) is not replicated, but sub( Y ) is, a process row or column
351* YprocR need to be selected. It will contain the non-replicated vector used
352* to perform the dot product computation.
353*/
354 if( RRorCC )
355 {
356/*
357* sub( X ) and sub( Y ) are both row or column vectors
358*/
359 if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) )
360 {
361/*
362* sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD.
363* Enforce a purely local operation by choosing YprocR to be equal to XprocR.
364*/
365 YprocR = XprocR;
366 }
367 else
368 {
369/*
370* Otherwise, communication has to occur, so choose the next process row or
371* column for YprocR to maximize the number of links, i.e reduce contention.
372*/
373 YprocR = MModAdd1( XprocR, XnprocsR );
374 }
375 }
376 else
377 {
378/*
379* sub( X ) and sub( Y ) are distributed in orthogonal directions, what is
380* chosen for YprocR does not really matter. Select the process origin.
381*/
382 YprocR = XprocD;
383 }
384 }
385 else
386 {
387/*
388* Neither sub( X ) nor sub( Y ) are replicated. If I am not in process row or
389* column XprocR and not in process row or column YprocR, then quick return.
390*/
391 if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) )
392 return;
393 }
394 }
395 else
396 {
397/*
398* sub( X ) is distributed and replicated (so no quick return possible)
399*/
400 if( YisR )
401 {
402/*
403* sub( Y ) is distributed and replicated as well
404*/
405 if( RRorCC )
406 {
407/*
408* sub( X ) and sub( Y ) are both row or column vectors
409*/
410 if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) )
411 {
412/*
413* sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD.
414* Enforce a purely local operation by choosing XprocR and YprocR to be equal
415* to zero.
416*/
417 XprocR = YprocR = 0;
418 }
419 else
420 {
421/*
422* Otherwise, communication has to occur, so select YprocR to be zero and the
423* next process row or column for XprocR in order to maximize the number of
424* used links, i.e reduce contention.
425*/
426 YprocR = 0;
427 XprocR = MModAdd1( YprocR, YnprocsR );
428 }
429 }
430 else
431 {
432/*
433* sub( X ) and sub( Y ) are distributed in orthogonal directions, select the
434* origin processes.
435*/
436 XprocR = YprocD;
437 YprocR = XprocD;
438 }
439 }
440 else
441 {
442/*
443* sub( Y ) is distributed, but not replicated
444*/
445 if( RRorCC )
446 {
447/*
448* sub( X ) and sub( Y ) are both row or column vectors
449*/
450 if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) )
451 {
452/*
453* sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD.
454* Enforce a purely local operation by choosing XprocR to be equal to YprocR.
455*/
456 XprocR = YprocR;
457 }
458 else
459 {
460/*
461* Otherwise, communication has to occur, so choose the next process row or
462* column for XprocR to maximize the number of links, i.e reduce contention.
463*/
464 XprocR = MModAdd1( YprocR, YnprocsR );
465 }
466 }
467 else
468 {
469/*
470* sub( X ) and sub( Y ) are distributed in orthogonal directions, what is
471* chosen for XprocR does not really matter. Select the origin process.
472*/
473 XprocR = YprocD;
474 }
475 }
476 }
477/*
478* Even if sub( X ) and/or sub( Y ) are replicated, only two process row or
479* column are active, namely XprocR and YprocR. If any of those operands is
480* replicated, broadcast will occur (unless there is an easy way out).
481*/
482 type = PB_Cstypeset(); size = type->size; dot = type->Fvvdotu;
483/*
484* A purely operation occurs iff the operands start in the same process and if
485* either the grid is mono-dimensional or there is a single local block to be
486* operated with or if both operands are aligned.
487*/
488 if( ( ( RRorCC && ( XprocD == YprocD ) && ( XprocR == YprocR ) ) ||
489 ( !( RRorCC ) && ( XprocD == YprocR ) && ( XprocR == YprocD ) ) ) &&
490 ( OneDgrid || OneBlock || ( RRorCC && Square ) ) )
491 {
492 if( ( !XisR && ( XmyprocR == XprocR ) &&
493 !YisR && ( YmyprocR == YprocR ) ) ||
494 ( !XisR && YisR && ( YmyprocR == YprocR ) ) ||
495 ( !YisR && XisR && ( XmyprocR == XprocR ) ) ||
496 ( XisR && YisR ) )
497 {
498 XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD,
499 XnprocsD );
500 YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD,
501 YnprocsD );
502 if( ( XnpD > 0 ) && ( YnpD > 0 ) )
503 {
504 dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
505 size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ),
506 &Ylinc );
507 }
508 }
509/*
510* Combine the local results in sub( X )'s scope
511*/
512 if( ( XisR && YisR ) || ( XmyprocR == XprocR ) )
513 {
514 scope = ( XisRow ? CROW : CCOLUMN );
515 top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET );
516 Csgsum2d( ctxt, &scope, top, 1, 1, ((char *) DOT), 1, -1, 0 );
517 }
518 if( RRorCC && XisR && YisR ) return;
519 }
520 else if( ( RRorCC && OneDgrid ) || OneBlock || Square )
521 {
522/*
523* Otherwise, it may be possible to compute the desired dot-product in a single
524* message exchange iff the grid is mono-dimensional and the operands are
525* distributed in the same direction, or there is just one block to be exchanged
526* or if both operands are similarly distributed in their respective direction.
527*/
528 if( ( YmyprocR == YprocR ) )
529 {
530/*
531* The processes owning a piece of sub( Y ) send it to the corresponding
532* process owning s piece of sub ( X ).
533*/
534 YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD,
535 YnprocsD );
536 if( YnpD > 0 )
537 {
538 dst = XprocD + MModSub( YmyprocD, YprocD, YnprocsD );
539 dst = MPosMod( dst, XnprocsD );
540 if( XisRow ) { rdst = XprocR; cdst = dst; }
541 else { rdst = dst; cdst = XprocR; }
542
543 if( ( myrow == rdst ) && ( mycol == cdst ) )
544 {
545 dot( &YnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
546 size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld,
547 size ), &Ylinc );
548 }
549 else
550 {
551 if( YisRow )
552 Csgesd2d( ctxt, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj,
553 Yld, size ), Yld, rdst, cdst );
554 else
555 Csgesd2d( ctxt, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj,
556 Yld, size ), Yld, rdst, cdst );
557 }
558 }
559 }
560 if( XmyprocR == XprocR )
561 {
562/*
563* The processes owning a piece of sub( X ) receive the corresponding local
564* piece of sub( Y ), compute the local dot product and combine the results
565* within sub( X )'s scope.
566*/
567 XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD,
568 XnprocsD );
569 if( XnpD > 0 )
570 {
571 src = YprocD + MModSub( XmyprocD, XprocD, XnprocsD );
572 src = MPosMod( src, YnprocsD );
573 if( YisRow ) { rsrc = YprocR; csrc = src; }
574 else { rsrc = src; csrc = YprocR; }
575 if( ( myrow != rsrc ) || ( mycol != csrc ) )
576 {
577 buf = PB_Cmalloc( XnpD * size );
578 if( YisRow )
579 Csgerv2d( ctxt, 1, XnpD, buf, 1, rsrc, csrc );
580 else
581 Csgerv2d( ctxt, XnpD, 1, buf, XnpD, rsrc, csrc );
582 dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
583 size ), &Xlinc, buf, &ione );
584 if( buf ) free( buf );
585 }
586 }
587 if( XisRow )
588 {
589 top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
590 Csgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 );
591 }
592 else
593 {
594 top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
595 Csgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 );
596 }
597 }
598 }
599 else
600 {
601/*
602* General case, copy sub( Y ) within sub( X )'s scope, compute the local
603* results and combine them within sub( X )'s scope.
604*/
605 XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD );
606
607 if( XisRow )
608 {
609 PB_Cdescset( dbuf, 1, *N, 1, Xinb1D, 1, XnbD, XprocR, XprocD, ctxt,
610 1 );
611 }
612 else
613 {
614 PB_Cdescset( dbuf, *N, 1, Xinb1D, 1, XnbD, 1, XprocD, XprocR, ctxt,
615 MAX( 1, XnpD ) );
616 }
617 if( ( XmyprocR == XprocR ) && ( XnpD > 0 ) )
618 buf = PB_Cmalloc( XnpD * size );
619
620 if( YisRow )
621 {
622 PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, ((char *) Y), Yi, Yj,
623 Yd, ROW, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW :
624 COLUMN ) );
625 }
626 else
627 {
628 PB_Cpaxpby( type, NOCONJG, *N, 1, type->one, ((char *) Y), Yi, Yj,
629 Yd, COLUMN, type->zero, buf, 0, 0, dbuf, ( XisRow ?
630 ROW : COLUMN ) );
631 }
632
633 if( XmyprocR == XprocR )
634 {
635 if( XnpD > 0 )
636 {
637 dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
638 size ), &Xlinc, buf, &ione );
639 if( buf ) free( buf );
640 }
641 if( XisRow )
642 {
643 top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
644 Csgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 );
645 }
646 else
647 {
648 top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
649 Csgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 );
650 }
651 }
652 }
653/*
654* Send the DOT product result within sub( Y )'s scope
655*/
656 if( XisR || YisR )
657 {
658/*
659* Either sub( X ) or sub( Y ) are replicated, so that every process should have
660* the result -> broadcast it orthogonally from sub( X )'s direction.
661*/
662 if( XisRow )
663 {
664 top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
665 if( XmyprocR == XprocR )
666 Csgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 );
667 else
668 Csgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR,
669 XmyprocD );
670 }
671 else
672 {
673 top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
674 if( XmyprocR == XprocR )
675 Csgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 );
676 else
677 Csgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, XmyprocD,
678 XprocR );
679 }
680 }
681 else
682 {
683/*
684* Neither sub( X ) nor sub( Y ) are replicated
685*/
686 if( RRorCC )
687 {
688/*
689* Both sub( X ) are distributed in the same direction -> the process row or
690* column XprocR sends the result to the process row or column YprocR.
691*/
692 if( XprocR != YprocR )
693 {
694 if( XmyprocR == XprocR )
695 {
696 if( XisRow )
697 Csgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YprocR,
698 YmyprocD );
699 else
700 Csgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YmyprocD,
701 YprocR );
702 }
703 else if( YmyprocR == YprocR )
704 {
705 if( XisRow )
706 Csgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XprocR,
707 XmyprocD );
708 else
709 Csgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XmyprocD,
710 XprocR );
711 }
712 }
713 }
714 else
715 {
716/*
717* Otherwise, the process at the intersection of sub( X )'s and sub( Y )'s
718* scope, broadcast the result within sub( Y )'s scope.
719*/
720 if( YmyprocR == YprocR )
721 {
722 if( YisRow )
723 {
724 top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
725 if( YmyprocD == XprocR )
726 Csgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 );
727 else
728 Csgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1,
729 YprocR, XprocR );
730 }
731 else
732 {
733 top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
734 if( YmyprocD == XprocR )
735 Csgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 );
736 else
737 Csgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1,
738 XprocR, YprocR );
739 }
740 }
741 }
742 }
743 }
744 else if( !( XisD ) && YisD )
745 {
746/*
747* sub( X ) is not distributed and sub( Y ) is distributed.
748*/
749 type = PB_Cstypeset();
750 PB_CpdotND( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX,
751 ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu );
752 }
753 else if( XisD && !( YisD ) )
754 {
755/*
756* sub( X ) is distributed and sub( Y ) is not distributed.
757*/
758 type = PB_Cstypeset();
759 PB_CpdotND( type, *N, ((char *) DOT), ((char *) Y), Yi, Yj, Yd, *INCY,
760 ((char *) X), Xi, Xj, Xd, *INCX, type->Fvvdotu );
761 }
762 else
763 {
764/*
765* Neither sub( X ) nor sub( Y ) are distributed
766*/
767 type = PB_Cstypeset();
768 PB_CpdotNN( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX,
769 ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu );
770 }
771/*
772* End of PSDOT
773*/
774}
#define Int
Definition Bconfig.h:22
#define REAL_PART
Definition pblas.h:139
F_VOID_FCT(* VVDOT_T)()
Definition pblas.h:290
#define CCOLUMN
Definition PBblacs.h:20
#define TOP_GET
Definition PBblacs.h:50
void Csgerv2d()
void Csgebr2d()
#define COLUMN
Definition PBblacs.h:45
#define COMBINE
Definition PBblacs.h:49
#define CROW
Definition PBblacs.h:21
void Csgebs2d()
#define ROW
Definition PBblacs.h:46
void Cblacs_gridinfo()
#define BCAST
Definition PBblacs.h:48
void Csgsum2d()
void Csgesd2d()
#define NOCONJG
Definition PBblas.h:45
#define CTXT_
Definition PBtools.h:38
#define MAX(a_, b_)
Definition PBtools.h:77
#define MB_
Definition PBtools.h:43
void PB_Cabort()
void PB_Cchkvec()
char * PB_Cmalloc()
void PB_Cinfog2l()
#define MModSub(I1, I2, d)
Definition PBtools.h:102
void PB_Cpdot11()
#define Mptr(a_, i_, j_, lda_, siz_)
Definition PBtools.h:132
#define LLD_
Definition PBtools.h:47
Int PB_Cnumroc()
char * PB_Ctop()
void PB_CpdotNN()
PBTYP_T * PB_Cstypeset()
#define RSRC_
Definition PBtools.h:45
void PB_Cdescset()
#define MModAdd1(I, d)
Definition PBtools.h:100
#define Mfirstnb(inbt_, n_, i_, inb_, nb_)
Definition PBtools.h:139
#define M_
Definition PBtools.h:39
#define INB_
Definition PBtools.h:42
#define MPosMod(I, d)
Definition PBtools.h:95
void PB_CpdotND()
void PB_CargFtoC()
#define CSRC_
Definition PBtools.h:46
#define IMB_
Definition PBtools.h:41
#define ZERO
Definition PBtools.h:66
#define DLEN_
Definition PBtools.h:48
#define NB_
Definition PBtools.h:44
void PB_Cpaxpby()
VVDOT_T Fvvdotu
Definition pblas.h:357
Int size
Definition pblas.h:333
char * zero
Definition pblas.h:335
char * one
Definition pblas.h:336
Here is the call graph for this function: