SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
Loading...
Searching...
No Matches
PB_CpdotND.c
Go to the documentation of this file.
1/* ---------------------------------------------------------------------
2*
3* -- PBLAS auxiliary routine (version 2.0) --
4* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5* and University of California, Berkeley.
6* April 1, 1998
7*
8* ---------------------------------------------------------------------
9*/
10/*
11* Include files
12*/
13#include "../pblas.h"
14#include "../PBpblas.h"
15#include "../PBtools.h"
16#include "../PBblacs.h"
17#include "../PBblas.h"
18
19#ifdef __STDC__
20void PB_CpdotND( PBTYP_T * TYPE, Int N, char * DOT,
21 char * X, Int IX, Int JX, Int * DESCX, Int INCX,
22 char * Y, Int IY, Int JY, Int * DESCY, Int INCY,
23 VVDOT_T FDOT )
24#else
25void PB_CpdotND( TYPE, N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY,
26 INCY, FDOT )
27/*
28* .. Scalar Arguments ..
29*/
30 Int INCX, INCY, IX, IY, JX, JY, N;
31 char * DOT;
32 PBTYP_T * TYPE;
33 VVDOT_T FDOT;
34/*
35* .. Array Arguments ..
36*/
37 Int * DESCX, * DESCY;
38 char * X, * Y;
39#endif
40{
41/*
42* Purpose
43* =======
44*
45* PB_CpdotND forms the dot product of two subvectors,
46*
47* DOT := sub( X )**T * sub( Y ) or DOT := sub( X )**H * sub( Y ),
48*
49* where
50*
51* sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
52* X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and,
53*
54* sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y,
55* Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y.
56*
57* sub( X ) is assumed to be not distributed, and sub( Y ) is assumed to
58* be distributed.
59*
60* Notes
61* =====
62*
63* A description vector is associated with each 2D block-cyclicly dis-
64* tributed matrix. This vector stores the information required to
65* establish the mapping between a matrix entry and its corresponding
66* process and memory location.
67*
68* In the following comments, the character _ should be read as
69* "of the distributed matrix". Let A be a generic term for any 2D
70* block cyclicly distributed matrix. Its description vector is DESC_A:
71*
72* NOTATION STORED IN EXPLANATION
73* ---------------- --------------- ------------------------------------
74* DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
75* CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
76* the NPROW x NPCOL BLACS process grid
77* A is distributed over. The context
78* itself is global, but the handle
79* (the integer value) may vary.
80* M_A (global) DESCA[ M_ ] The number of rows in the distribu-
81* ted matrix A, M_A >= 0.
82* N_A (global) DESCA[ N_ ] The number of columns in the distri-
83* buted matrix A, N_A >= 0.
84* IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
85* block of the matrix A, IMB_A > 0.
86* INB_A (global) DESCA[ INB_ ] The number of columns of the upper
87* left block of the matrix A,
88* INB_A > 0.
89* MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
90* bute the last M_A-IMB_A rows of A,
91* MB_A > 0.
92* NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
93* bute the last N_A-INB_A columns of
94* A, NB_A > 0.
95* RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
96* row of the matrix A is distributed,
97* NPROW > RSRC_A >= 0.
98* CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
99* first column of A is distributed.
100* NPCOL > CSRC_A >= 0.
101* LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
102* array storing the local blocks of
103* the distributed matrix A,
104* IF( Lc( 1, N_A ) > 0 )
105* LLD_A >= MAX( 1, Lr( 1, M_A ) )
106* ELSE
107* LLD_A >= 1.
108*
109* Let K be the number of rows of a matrix A starting at the global in-
110* dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
111* that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
112* receive if these K rows were distributed over NPROW processes. If K
113* is the number of columns of a matrix A starting at the global index
114* JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
115* lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
116* these K columns were distributed over NPCOL processes.
117*
118* The values of Lr() and Lc() may be determined via a call to the func-
119* tion PB_Cnumroc:
120* Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
121* Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
122*
123* Arguments
124* =========
125*
126* TYPE (local input) pointer to a PBTYP_T structure
127* On entry, TYPE is a pointer to a structure of type PBTYP_T,
128* that contains type information (See pblas.h).
129*
130* N (global input) INTEGER
131* On entry, N specifies the length of the subvectors to be
132* multiplied. N must be at least zero.
133*
134* DOT (local output) pointer to CHAR
135* On exit, DOT specifies the dot product of the two subvectors
136* sub( X ) and sub( Y ) only in their scope (See below for fur-
137* ther details).
138*
139* X (local input) pointer to CHAR
140* On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
141* is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and
142* MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least
143* Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise.
144* Before entry, this array contains the local entries of the
145* matrix X.
146*
147* IX (global input) INTEGER
148* On entry, IX specifies X's global row index, which points to
149* the beginning of the submatrix sub( X ).
150*
151* JX (global input) INTEGER
152* On entry, JX specifies X's global column index, which points
153* to the beginning of the submatrix sub( X ).
154*
155* DESCX (global and local input) INTEGER array
156* On entry, DESCX is an integer array of dimension DLEN_. This
157* is the array descriptor for the matrix X.
158*
159* INCX (global input) INTEGER
160* On entry, INCX specifies the global increment for the
161* elements of X. Only two values of INCX are supported in
162* this version, namely 1 and M_X. INCX must not be zero.
163*
164* Y (local input) pointer to CHAR
165* On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y
166* is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and
167* MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least
168* Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise.
169* Before entry, this array contains the local entries of the
170* matrix Y.
171*
172* IY (global input) INTEGER
173* On entry, IY specifies Y's global row index, which points to
174* the beginning of the submatrix sub( Y ).
175*
176* JY (global input) INTEGER
177* On entry, JY specifies Y's global column index, which points
178* to the beginning of the submatrix sub( Y ).
179*
180* DESCY (global and local input) INTEGER array
181* On entry, DESCY is an integer array of dimension DLEN_. This
182* is the array descriptor for the matrix Y.
183*
184* INCY (global input) INTEGER
185* On entry, INCY specifies the global increment for the
186* elements of Y. Only two values of INCY are supported in
187* this version, namely 1 and M_Y. INCY must not be zero.
188*
189* FDOT (local input) pointer to a function of type VVDOT
190* On entry, FDOT points to a subroutine that computes the local
191* dot product of two vectors.
192*
193* Further Details
194* ===============
195*
196* When the result of a vector-oriented PBLAS call is a scalar, this
197* scalar is set only within the process scope which owns the vector(s)
198* being operated on. Let sub( X ) be a generic term for the input vec-
199* tor(s). Then, the processes owning the correct the answer is determi-
200* ned as follows: if an operation involves more than one vector, the
201* processes receiving the result will be the union of the following set
202* of processes for each vector:
203*
204* If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro-
205* cess row or process column owns the vector operand, therefore only
206* the process owning sub( X ) receives the correct result;
207*
208* If INCX = M_X, then sub( X ) is a vector distributed over a process
209* row. Each process in this row receives the result;
210*
211* If INCX = 1, then sub( X ) is a vector distributed over a process
212* column. Each process in this column receives the result;
213*
214* -- Written on April 1, 1998 by
215* Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
216*
217* ---------------------------------------------------------------------
218*/
219/*
220* .. Local Scalars ..
221*/
222 char * top;
223 Int RRorCC, Xcol, Xii, XisR, XisRow, Xjj, Xld, Xlinc, XmyprocD,
224 XmyprocR, XnprocsD, XnprocsR, XprocR, Xroc, Xrow, Ycol, Yii,
225 Yinb1D, YisR, YisRow, Yjj, Yld, Ylinc, YmyprocD, YmyprocR,
226 YnbD, YnpD, YnprocsD, YnprocsR, YprocD, YprocR, Yroc, Yrow,
227 ctxt, ione=1, k, kbb, kk, kn, ktmp, mycol, mydist, myproc,
228 myrow, npcol, nprow, p, size;
229/*
230* .. Local Arrays ..
231*/
232 char * Xptr = NULL, * Yptr = NULL, * buf = NULL;
233/* ..
234* .. Executable Statements ..
235*
236*/
237/*
238* Retrieve process grid information
239*/
240 Cblacs_gridinfo( ( ctxt = DESCX[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
241/*
242* Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol ...
243*/
244 PB_Cinfog2l( IX, JX, DESCX, nprow, npcol, myrow, mycol, &Xii, &Xjj,
245 &Xrow, &Xcol );
246 if( ( XisRow = ( INCX == DESCX[M_] ) ) != 0 )
247 {
248 Xld = DESCX[LLD_]; Xlinc = Xld;
249 XmyprocD = mycol; XnprocsD = npcol;
250 XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow;
251 XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) );
252 }
253 else
254 {
255 Xld = DESCX[LLD_]; Xlinc = 1;
256 XmyprocD = myrow; XnprocsD = nprow;
257 XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol;
258 XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) );
259 }
260/*
261* Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol ...
262*/
263 PB_Cinfog2l( IY, JY, DESCY, nprow, npcol, myrow, mycol, &Yii, &Yjj,
264 &Yrow, &Ycol );
265 if( ( YisRow = ( INCY == DESCY[M_] ) ) != 0 )
266 {
267 YnbD = DESCY[NB_]; Yld = DESCY[LLD_]; Ylinc = Yld;
268 YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow;
269 YprocD = Ycol; YmyprocD = mycol; YnprocsD = npcol;
270 Yinb1D = PB_Cfirstnb( N, JY, DESCY[INB_], YnbD );
271 }
272 else
273 {
274 YnbD = DESCY[MB_]; Yld = DESCY[LLD_]; Ylinc = 1;
275 YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol;
276 YprocD = Yrow; YmyprocD = myrow; YnprocsD = nprow;
277 Yinb1D = PB_Cfirstnb( N, IY, DESCY[IMB_], YnbD );
278 }
279 YisR = ( ( YprocR == -1 ) || ( YnprocsR == 1 ) );
280/*
281* Are sub( X ) and sub( Y ) both row or column vectors ?
282*/
283 RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) );
284/*
285* sub( X ) is not distributed and sub( Y ) is distributed
286*/
287 if( !( XisR ) )
288 {
289/*
290* sub( X ) is not replicated. Since this operation is local if sub( X ) and
291* sub( Y ) are both row or column vectors, choose YprocR = XprocR when RRorCC,
292* and YprocR = 0 otherwise.
293*/
294 if( YisR ) { YprocR = ( ( RRorCC ) ? XprocR : 0 ); }
295/*
296* Now, it is just like sub( Y ) is not replicated, this information however is
297* kept in YisR for later use.
298*/
299 if( RRorCC )
300 {
301/*
302* sub( X ) and sub( Y ) are both row or column vectors
303*/
304 if( XprocR == YprocR )
305 {
306/*
307* sub( X ) and sub( Y ) are in the same process row or column
308*/
309 if( ( XmyprocR == XprocR ) || ( YmyprocR == YprocR ) )
310 {
311 size = TYPE->size;
312 YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD,
313 YnprocsD );
314/*
315* In a given process, the dot product is computed with sub( Y ) and the cor-
316* responding non distributed part of sub( X ). In the other processes, this
317* part of sub( X ) is simply ignored.
318*/
319 if( YnpD > 0 )
320 {
321 Yroc = YprocD;
322 if( XisRow ) { kk = Yjj; ktmp = JX + N; kn = JX + Yinb1D; }
323 else { kk = Yii; ktmp = IX + N; kn = IX + Yinb1D; }
324
325 if( YmyprocD == Yroc )
326 {
327 FDOT( &Yinb1D, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc,
328 Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc );
329 kk += Yinb1D;
330 }
331 Yroc = MModAdd1( Yroc, YnprocsD );
332
333 for( k = kn; k < ktmp; k += YnbD )
334 {
335 kbb = ktmp - k; kbb = MIN( kbb, YnbD );
336 if( YmyprocD == Yroc )
337 {
338 if( XisRow )
339 FDOT( &kbb, DOT, Mptr( X, Xii, k, Xld, size ),
340 &Xlinc, Mptr( Y, Yii, kk, Yld, size ),
341 &Ylinc );
342 else
343 FDOT( &kbb, DOT, Mptr( X, k, Xjj, Xld, size ),
344 &Xlinc, Mptr( Y, kk, Yjj, Yld, size ),
345 &Ylinc );
346 kk += kbb;
347 }
348 Yroc = MModAdd1( Yroc, YnprocsD );
349 }
350 }
351/*
352* Replicate locally scattered dot product by reducing it
353*/
354 if( XisRow )
355 {
356 top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
357 TYPE->Cgsum2d( ctxt, ROW, top, 1, 1, DOT, 1, -1, 0 );
358 }
359 else
360 {
361 top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
362 TYPE->Cgsum2d( ctxt, COLUMN, top, 1, 1, DOT, 1, -1, 0 );
363 }
364 }
365 }
366 else
367 {
368/*
369* sub( X ) and sub( Y ) are in a different process row or column
370*/
371 if( YmyprocR == YprocR )
372 {
373 size = TYPE->size;
374 YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD,
375 YnprocsD );
376/*
377* If I own a piece of sub( Y ), then send it to the process row or column where
378* sub( X ) resides and receive the dot product when sub( Y ) is not replicated.
379*/
380 if( YisRow )
381 {
382 if( YnpD > 0 )
383 TYPE->Cgesd2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj, Yld,
384 size ), Yld, XprocR, YmyprocD );
385 TYPE->Cgerv2d( ctxt, 1, 1, DOT, 1, XprocR, XmyprocD );
386 }
387 else
388 {
389 if( YnpD > 0 )
390 TYPE->Cgesd2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj, Yld,
391 size ), Yld, YmyprocD, XprocR );
392 TYPE->Cgerv2d( ctxt, 1, 1, DOT, 1, XmyprocD, XprocR );
393 }
394 }
395
396 if( XmyprocR == XprocR )
397 {
398 size = TYPE->size;
399 YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD,
400 YnprocsD );
401/*
402* If I own sub( X ), then receive the distributed part of sub( Y ) owned by
403* the process where sub( Y ) resides in my row or column. Compute the partial
404* dot product as if sub( Y ) would reside in the same process row or column as
405* sub( X ). Combine the local results.
406*/
407 if( YnpD > 0 )
408 {
409 buf = PB_Cmalloc( YnpD * size );
410 if( YisRow )
411 TYPE->Cgerv2d( ctxt, 1, YnpD, buf, 1, YprocR,
412 XmyprocD );
413 else
414 TYPE->Cgerv2d( ctxt, YnpD, 1, buf, YnpD, XmyprocD,
415 YprocR );
416
417 Yroc = YprocD;
418 kk = 0;
419 if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; }
420 else { ktmp = IX + N; kn = IX + Yinb1D; }
421
422 if( YmyprocD == Yroc )
423 {
424 FDOT( &Yinb1D, DOT, Mptr( X, Xii, Xjj, Xld, size ),
425 &Xlinc, buf, &ione );
426 kk += Yinb1D;
427 }
428 Yroc = MModAdd1( Yroc, YnprocsD );
429
430 for( k = kn; k < ktmp; k += YnbD )
431 {
432 kbb = ktmp - k; kbb = MIN( kbb, YnbD );
433 if( YmyprocD == Yroc )
434 {
435 if( XisRow )
436 FDOT( &kbb, DOT, Mptr( X, Xii, k, Xld, size ),
437 &Xlinc, buf+kk*size, &ione );
438 else
439 FDOT( &kbb, DOT, Mptr( X, k, Xjj, Xld, size ),
440 &Xlinc, buf+kk*size, &ione );
441 kk += kbb;
442 }
443 Yroc = MModAdd1( Yroc, YnprocsD );
444 }
445 if( buf ) free( buf );
446 }
447/*
448* Combine the local results within the process row or column XprocR and
449* send the result to the process row or column YprocR when sub( Y ) is not
450* replicated.
451*/
452 if( XisRow )
453 {
454 top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
455 TYPE->Cgsum2d( ctxt, ROW, top, 1, 1, DOT, 1, -1, 0 );
456 if( !YisR )
457 TYPE->Cgesd2d( ctxt, 1, 1, DOT, 1, YprocR, YmyprocD );
458 }
459 else
460 {
461 top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
462 TYPE->Cgsum2d( ctxt, COLUMN, top, 1, 1, DOT, 1, -1, 0 );
463 if( !YisR )
464 TYPE->Cgesd2d( ctxt, 1, 1, DOT, 1, YmyprocD, YprocR );
465 }
466 }
467 }
468
469 if( YisR )
470 {
471/*
472* If sub( Y ) is replicated, then bcast the result
473*/
474 if( XisRow )
475 {
476 top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
477 if( XmyprocR == XprocR )
478 TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 );
479 else
480 TYPE->Cgebr2d( ctxt, COLUMN, top, 1, 1, DOT, 1, XprocR,
481 XmyprocD );
482 }
483 else
484 {
485 top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
486 if( XmyprocR == XprocR )
487 TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 );
488 else
489 TYPE->Cgebr2d( ctxt, ROW, top, 1, 1, DOT, 1, XmyprocD,
490 XprocR );
491 }
492 }
493 }
494 else
495 {
496/*
497* sub( X ) and sub( Y ) are not both row or column vectors
498*/
499 if( ( XmyprocR == XprocR ) || ( YmyprocR == YprocR ) )
500 {
501 size = TYPE->size;
502 Xroc = 0;
503 if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; }
504 else { ktmp = IX + N; kn = IX + Yinb1D; }
505/*
506* Loop over the processes in which sub( Y ) resides, for each process find the
507* next process Xroc and compute the dot product. After this, it will be needed
508* to reduce the local dot produsts as above.
509*/
510 for( p = 0; p < YnprocsD; p++ )
511 {
512 mydist = MModSub( p, YprocD, YnprocsD );
513 myproc = MModAdd( YprocD, mydist, YnprocsD );
514
515 if( ( XprocR == p ) && ( YprocR == Xroc ) )
516 {
517/*
518* Compute locally the partial dot product at the intersection of the process
519* cross.
520*/
521 if( ( XmyprocR == p ) && ( XmyprocD == Xroc ) )
522 {
523 YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD,
524 YnprocsD );
525 if( YnpD > 0 )
526 {
527 Yroc = YprocD;
528 kk = ( XisRow ? Yii : Yjj );
529
530 if( myproc == Yroc )
531 {
532 FDOT( &Yinb1D, DOT, Mptr( X, Xii, Xjj, Xld, size ),
533 &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ),
534 &Ylinc );
535 kk += Yinb1D;
536 }
537 Yroc = MModAdd1( Yroc, YnprocsD );
538
539 for( k = kn; k < ktmp; k += YnbD )
540 {
541 kbb = ktmp - k; kbb = MIN( kbb, YnbD );
542 if( myproc == Yroc )
543 {
544 if( XisRow )
545 FDOT( &kbb, DOT, Mptr( X, Xii, k, Xld, size ),
546 &Xlinc, Mptr( Y, kk, Yjj, Yld, size ),
547 &Ylinc );
548 else
549 FDOT( &kbb, DOT, Mptr( X, k, Xjj, Xld, size ),
550 &Xlinc, Mptr( Y, Yii, kk, Yld, size ),
551 &Ylinc );
552 kk += kbb;
553 }
554 Yroc = MModAdd1( Yroc, YnprocsD );
555 }
556 }
557 }
558 }
559 else
560 {
561/*
562* Message exchange
563*/
564 if( ( YmyprocR == YprocR ) && ( YmyprocD == p ) )
565 {
566 YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD,
567 YnprocsD );
568 if( YnpD > 0 )
569 {
570 if( XisRow )
571 TYPE->Cgesd2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj, Yld,
572 size ), Yld, XprocR, Xroc );
573 else
574 TYPE->Cgesd2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj, Yld,
575 size ), Yld, Xroc, XprocR );
576 }
577 }
578
579 if( ( XmyprocR == XprocR ) && ( XmyprocD == Xroc ) )
580 {
581 YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD,
582 YnprocsD );
583 if( YnpD > 0 )
584 {
585 buf = PB_Cmalloc( YnpD * size );
586 Yroc = YprocD;
587 kk = 0;
588/*
589* Receive the piece of sub( Y ) that I should handle
590*/
591 if( XisRow )
592 TYPE->Cgerv2d( ctxt, YnpD, 1, buf, YnpD, p, YprocR );
593 else
594 TYPE->Cgerv2d( ctxt, 1, YnpD, buf, 1, YprocR, p );
595
596 if( myproc == Yroc )
597 {
598 FDOT( &Yinb1D, DOT, Mptr( X, Xii, Xjj, Xld, size ),
599 &Xlinc, buf, &ione );
600 kk += Yinb1D;
601 }
602 Yroc = MModAdd1( Yroc, YnprocsD );
603
604 for( k = kn; k < ktmp; k += YnbD )
605 {
606 kbb = ktmp - k; kbb = MIN( kbb, YnbD );
607 if( myproc == Yroc )
608 {
609 if( XisRow )
610 FDOT( &kbb, DOT, Mptr( X, Xii, k, Xld, size ),
611 &Xlinc, buf+kk*size, &ione );
612 else
613 FDOT( &kbb, DOT, Mptr( X, k, Xjj, Xld, size ),
614 &Xlinc, buf+kk*size, &ione );
615 kk += kbb;
616 }
617 Yroc = MModAdd1( Yroc, YnprocsD );
618 }
619 if( buf ) free( buf );
620 }
621 }
622 }
623 Xroc = MModAdd1( Xroc, XnprocsD );
624 }
625/*
626* Combine the local results in sub( X )'s scope
627*/
628 if( XmyprocR == XprocR )
629 {
630 if( XisRow )
631 {
632 top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
633 TYPE->Cgsum2d( ctxt, ROW, top, 1, 1, DOT, 1, -1, 0 );
634 }
635 else
636 {
637 top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
638 TYPE->Cgsum2d( ctxt, COLUMN, top, 1, 1, DOT, 1, -1, 0 );
639 }
640 }
641 }
642/*
643* Broadcast the result in sub( Y )'s scope
644*/
645 if( YisR || ( YmyprocR == YprocR ) )
646 {
647 if( YisRow )
648 {
649 top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
650 if( XmyprocR == XprocR )
651 TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 );
652 else
653 TYPE->Cgebr2d( ctxt, ROW, top, 1, 1, DOT, 1, YmyprocR,
654 XprocR );
655 }
656 else
657 {
658 top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
659 if( XmyprocR == XprocR )
660 TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 );
661 else
662 TYPE->Cgebr2d( ctxt, COLUMN, top, 1, 1, DOT, 1, XprocR,
663 YmyprocR );
664 }
665 }
666 }
667 }
668 else
669 {
670/*
671* sub( X ) is replicated in every process. Compute the local dot product in
672* process row or column YprocR when sub( Y ) is not replicated and in every
673* process otherwise.
674*/
675 if( YisR || ( YmyprocR == YprocR ) )
676 {
677 size = TYPE->size;
678 Yroc = YprocD;
679 kk = ( YisRow ? Yjj : Yii );
680
681 if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; }
682 else { ktmp = IX + N; kn = IX + Yinb1D; }
683
684 if( YmyprocD == Yroc )
685 {
686 FDOT( &Yinb1D, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y,
687 Yii, Yjj, Yld, size ), &Ylinc );
688 kk += Yinb1D;
689 }
690 Yroc = MModAdd1( Yroc, YnprocsD );
691
692 for( k = kn; k < ktmp; k += YnbD )
693 {
694 kbb = ktmp - k; kbb = MIN( kbb, YnbD );
695 if( YmyprocD == Yroc )
696 {
697 if( XisRow ) { Xptr = Mptr( X, Xii, k, Xld, size ); }
698 else { Xptr = Mptr( X, k, Xjj, Xld, size ); }
699 if( YisRow ) { Yptr = Mptr( Y, Yii, kk, Yld, size ); }
700 else { Yptr = Mptr( Y, kk, Yjj, Yld, size ); }
701 FDOT( &kbb, DOT, Xptr, &Xlinc, Yptr, &Ylinc );
702 kk += kbb;
703 }
704 Yroc = MModAdd1( Yroc, YnprocsD );
705 }
706 }
707
708 if( YisR )
709 {
710/*
711* sub( Y ) is replicated, combine the results in each process row or column.
712*/
713 if( YisRow )
714 {
715 top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
716 TYPE->Cgsum2d( ctxt, ROW, top, 1, 1, DOT, 1, -1, 0 );
717 }
718 else
719 {
720 top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
721 TYPE->Cgsum2d( ctxt, COLUMN, top, 1, 1, DOT, 1, -1, 0 );
722 }
723 }
724 else
725 {
726/*
727* sub( Y ) is not replicated, combine the results in the entire grid at once.
728*/
729 top = PB_Ctop( &ctxt, COMBINE, ALL, TOP_GET );
730 TYPE->Cgsum2d( ctxt, ALL, top, 1, 1, DOT, 1, -1, 0 );
731 }
732 }
733/*
734* End of PB_CpdotND
735*/
736}
#define Int
Definition Bconfig.h:22
F_VOID_FCT(* VVDOT_T)()
Definition pblas.h:290
#define TOP_GET
Definition PBblacs.h:50
#define COLUMN
Definition PBblacs.h:45
#define COMBINE
Definition PBblacs.h:49
#define ROW
Definition PBblacs.h:46
void Cblacs_gridinfo()
#define BCAST
Definition PBblacs.h:48
#define ALL
Definition PBblas.h:50
#define CTXT_
Definition PBtools.h:38
Int PB_Cfirstnb()
#define MB_
Definition PBtools.h:43
char * PB_Cmalloc()
void PB_Cinfog2l()
#define MModSub(I1, I2, d)
Definition PBtools.h:102
#define MIN(a_, b_)
Definition PBtools.h:76
#define Mptr(a_, i_, j_, lda_, siz_)
Definition PBtools.h:132
#define LLD_
Definition PBtools.h:47
Int PB_Cnumroc()
char * PB_Ctop()
#define MModAdd1(I, d)
Definition PBtools.h:100
#define M_
Definition PBtools.h:39
#define MModAdd(I1, I2, d)
Definition PBtools.h:97
#define INB_
Definition PBtools.h:42
void PB_CpdotND()
#define IMB_
Definition PBtools.h:41
#define NB_
Definition PBtools.h:44
#define TYPE
Definition clamov.c:7