SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
Loading...
Searching...
No Matches
PB_CpaxpbyDN.c
Go to the documentation of this file.
1/* ---------------------------------------------------------------------
2*
3* -- PBLAS auxiliary routine (version 2.0) --
4* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5* and University of California, Berkeley.
6* April 1, 1998
7*
8* ---------------------------------------------------------------------
9*/
10/*
11* Include files
12*/
13#include "../pblas.h"
14#include "../PBpblas.h"
15#include "../PBtools.h"
16#include "../PBblacs.h"
17#include "../PBblas.h"
18
19#ifdef __STDC__
20void PB_CpaxpbyDN( PBTYP_T * TYPE, char * CONJUG, Int M, Int N,
21 char * ALPHA,
22 char * A, Int IA, Int JA, Int * DESCA, char * AROC,
23 char * BETA,
24 char * B, Int IB, Int JB, Int * DESCB, char * BROC )
25#else
26void PB_CpaxpbyDN( TYPE, CONJUG, M, N, ALPHA, A, IA, JA, DESCA, AROC,
27 BETA, B, IB, JB, DESCB, BROC )
28/*
29* .. Scalar Arguments ..
30*/
31 char * AROC, * BROC, * CONJUG;
32 Int IA, IB, JA, JB, M, N;
33 char * ALPHA, * BETA;
34 PBTYP_T * TYPE;
35/*
36* .. Array Arguments ..
37*/
38 Int * DESCA, * DESCB;
39 char * A, * B;
40#endif
41{
42/*
43* Purpose
44* =======
45*
46* PB_CpaxpbyDN adds one submatrix to another,
47*
48* sub( B ) := beta * sub( B ) + alpha * sub( A ), or,
49*
50* sub( B ) := beta * sub( B ) + alpha * conjg( sub( A ) ),
51*
52* where sub( A ) is distributed and sub( B ) is not distributed.
53*
54* sub( A ) always denotes A(IA:IA+M-1,JA:JA+N-1). When AROC is 'R' or
55* 'r' sub( A ) resides in a process row, otherwise sub( A ) resides in
56* a process column. When sub( A ) resides in a process row and BROC is
57* 'R' or 'r' or sub( A ) resides in a process column and BROC is 'C' or
58* 'c', then sub( B ) denotes B( IB:IB+M-1, JB:JB+N-1 ), and otherwise
59* sub( B ) denotes B(IB:IB+N-1,JB:JB+M-1).
60* otherwise.
61*
62* Notes
63* =====
64*
65* A description vector is associated with each 2D block-cyclicly dis-
66* tributed matrix. This vector stores the information required to
67* establish the mapping between a matrix entry and its corresponding
68* process and memory location.
69*
70* In the following comments, the character _ should be read as
71* "of the distributed matrix". Let A be a generic term for any 2D
72* block cyclicly distributed matrix. Its description vector is DESC_A:
73*
74* NOTATION STORED IN EXPLANATION
75* ---------------- --------------- ------------------------------------
76* DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
77* CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
78* the NPROW x NPCOL BLACS process grid
79* A is distributed over. The context
80* itself is global, but the handle
81* (the integer value) may vary.
82* M_A (global) DESCA[ M_ ] The number of rows in the distribu-
83* ted matrix A, M_A >= 0.
84* N_A (global) DESCA[ N_ ] The number of columns in the distri-
85* buted matrix A, N_A >= 0.
86* IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
87* block of the matrix A, IMB_A > 0.
88* INB_A (global) DESCA[ INB_ ] The number of columns of the upper
89* left block of the matrix A,
90* INB_A > 0.
91* MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
92* bute the last M_A-IMB_A rows of A,
93* MB_A > 0.
94* NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
95* bute the last N_A-INB_A columns of
96* A, NB_A > 0.
97* RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
98* row of the matrix A is distributed,
99* NPROW > RSRC_A >= 0.
100* CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
101* first column of A is distributed.
102* NPCOL > CSRC_A >= 0.
103* LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
104* array storing the local blocks of
105* the distributed matrix A,
106* IF( Lc( 1, N_A ) > 0 )
107* LLD_A >= MAX( 1, Lr( 1, M_A ) )
108* ELSE
109* LLD_A >= 1.
110*
111* Let K be the number of rows of a matrix A starting at the global in-
112* dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
113* that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
114* receive if these K rows were distributed over NPROW processes. If K
115* is the number of columns of a matrix A starting at the global index
116* JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
117* lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
118* these K columns were distributed over NPCOL processes.
119*
120* The values of Lr() and Lc() may be determined via a call to the func-
121* tion PB_Cnumroc:
122* Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
123* Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
124*
125* Arguments
126* =========
127*
128* TYPE (local input) pointer to a PBTYP_T structure
129* On entry, TYPE is a pointer to a structure of type PBTYP_T,
130* that contains type information (See pblas.h).
131*
132* CONJUG (global input) pointer to CHAR
133* On entry, CONJUG specifies whether conjg( sub( A ) ) or
134* sub( A ) should be added to sub( B ) as follows:
135* CONJUG = 'N' or 'n':
136* sub( B ) := beta*sub( B ) + alpha*sub( A ),
137* otherwise
138* sub( B ) := beta*sub( B ) + alpha*conjg( sub( A ) ).
139*
140* M (global input) INTEGER
141* On entry, M specifies the number of rows of the submatrix
142* sub( A ). M must be at least zero.
143*
144* N (global input) INTEGER
145* On entry, N specifies the number of columns of the submatrix
146* sub( A ). N must be at least zero.
147*
148* ALPHA (global input) pointer to CHAR
149* On entry, ALPHA specifies the scalar alpha. When ALPHA is
150* supplied as zero then the local entries of the array A cor-
151* responding to the entries of the submatrix sub( A ) need not
152* be set on input.
153*
154* A (local input) pointer to CHAR
155* On entry, A is an array of dimension (LLD_A, Ka), where LLD_A
156* is at least MAX( 1, Lr( 1, IA+M-1 ) ), and, Ka is at least
157* Lc( 1, JA+N-1 ). Before entry, this array contains the local
158* entries of the matrix A.
159*
160* IA (global input) INTEGER
161* On entry, IA specifies A's global row index, which points to
162* the beginning of the submatrix sub( A ).
163*
164* JA (global input) INTEGER
165* On entry, JA specifies A's global column index, which points
166* to the beginning of the submatrix sub( A ).
167*
168* DESCA (global and local input) INTEGER array
169* On entry, DESCA is an integer array of dimension DLEN_. This
170* is the array descriptor for the matrix A.
171*
172* AROC (global input) pointer to CHAR
173* On entry, AROC specifies the orientation of the subvector
174* sub( A ). When AROC is 'R' or 'r', sub( A ) is a row vector,
175* and a column vector otherwise.
176*
177* BETA (global input) pointer to CHAR
178* On entry, BETA specifies the scalar beta. When BETA is sup-
179* plied as zero then the local entries of the array B corres-
180* ponding to the entries of the submatrix sub( B ) need not be
181* set on input.
182*
183* B (local input/local output) pointer to CHAR
184* On entry, B is an array of dimension (LLD_B, Kb), where LLD_B
185* is at least MAX( 1, Lr( 1, IB+M-1 ) ) when sub( A ) and
186* sub( B ) are both distributed along a process column or a
187* process row. In that case, Kb is at least Lc( 1, JB+N-1 ).
188* Otherwise, LLD_B is at least MAX( 1, Lr( 1, IB+N-1 ) ) and
189* Kb is at least Lc( 1, JB+M-1 ). Before entry, this array
190* contains the local entries of the matrix B. On exit, sub( B )
191* is overwritten with the updated submatrix.
192*
193* IB (global input) INTEGER
194* On entry, IB specifies B's global row index, which points to
195* the beginning of the submatrix sub( B ).
196*
197* JB (global input) INTEGER
198* On entry, JB specifies B's global column index, which points
199* to the beginning of the submatrix sub( B ).
200*
201* DESCB (global and local input) INTEGER array
202* On entry, DESCB is an integer array of dimension DLEN_. This
203* is the array descriptor for the matrix B.
204*
205* BROC (global input) pointer to CHAR
206* On entry, BROC specifies the orientation of the subvector
207* sub( B ). When BROC is 'R' or 'r', sub( B ) is a row vector,
208* and a column vector otherwise.
209*
210* -- Written on April 1, 1998 by
211* Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
212*
213* ---------------------------------------------------------------------
214*/
215/*
216* .. Local Scalars ..
217*/
218 char scope, * top, * zero;
219 Int Acol, Aii, Ainb1D, AisR, AisRow, Ajj, Ald, AmyprocD, AmyprocR,
220 AnD, AnbD, AnpD, AnprocsD, AprocD, AprocR, Aroc, Arow, Bcol,
221 Bii, BisR, BisRow, Bjj, Bld, Bm, BmyprocD, BmyprocR, Bn,
222 BnprocsD, BprocR, Broc, Brow, RRorCC, ctxt, izero=0, k, kbb,
223 kk, kn, ktmp, mycol, mydist, myproc, myrow, npcol, nprow, p,
224 size;
225 MMADD_T add;
226 TZPAD_T pad;
227/*
228* .. Local Arrays ..
229*/
230 char * buf = NULL;
231/* ..
232* .. Executable Statements ..
233*
234*/
235/*
236* Retrieve process grid information
237*/
238 Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
239/*
240* Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ...
241*/
242 PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj,
243 &Arow, &Acol );
244 if( ( AisRow = ( Mupcase( AROC[0] ) == CROW ) ) != 0 )
245 {
246 AnD = N; AnbD = DESCA[NB_]; Ald = DESCA[LLD_];
247 AprocD = Acol; AprocR = Arow;
248 AmyprocD = mycol; AmyprocR = myrow; AnprocsD = npcol;
249 AisR = ( ( Arow == -1 ) || ( nprow == 1 ) );
250 Ainb1D = PB_Cfirstnb( AnD, JA, DESCA[INB_], AnbD );
251 }
252 else
253 {
254 AnD = M; AnbD = DESCA[MB_]; Ald = DESCA[LLD_];
255 AprocD = Arow; AprocR = Acol;
256 AmyprocD = myrow; AmyprocR = mycol; AnprocsD = nprow;
257 AisR = ( ( Acol == -1 ) || ( npcol == 1 ) );
258 Ainb1D = PB_Cfirstnb( AnD, IA, DESCA[IMB_], AnbD );
259 }
260/*
261* Retrieve sub( B )'s local information: Bii, Bjj, Brow, Bcol ...
262*/
263 PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj,
264 &Brow, &Bcol );
265 if( ( BisRow = ( Mupcase( BROC[0] ) == CROW ) ) != 0 )
266 {
267 Bld = DESCB[LLD_];
268 BmyprocD = mycol; BnprocsD = npcol;
269 BprocR = Brow; BmyprocR = myrow;
270 BisR = ( ( BprocR == -1 ) || ( nprow == 1 ) );
271 }
272 else
273 {
274 Bld = DESCB[LLD_];
275 BmyprocD = myrow; BnprocsD = nprow;
276 BprocR = Bcol; BmyprocR = mycol;
277 BisR = ( ( BprocR == -1 ) || ( npcol == 1 ) );
278 }
279/*
280* Are sub( A ) and sub( B ) both row or column vectors ?
281*/
282 RRorCC = ( ( AisRow && BisRow ) || ( !( AisRow ) && !( BisRow ) ) );
283/*
284* Select the local add routine accordingly
285*/
286 size = TYPE->size;
287/*
288* sub( A ) is distributed and sub( B ) is not distributed
289*/
290 if( !( BisR ) )
291 {
292/*
293* sub( B ) is not replicated. Since this operation is local if sub( B ) and
294* sub( A ) are both row or column vectors, choose AprocR = BprocR when RRorCC,
295* and AprocR = 0 otherwise.
296*/
297 if( AisR ) { AprocR = ( ( RRorCC ) ? BprocR : 0 ); }
298/*
299* Now, it is just like sub( A ) is not replicated, this information however is
300* kept in AisR for later use.
301*/
302 if( ( AmyprocR == AprocR ) || ( BmyprocR == BprocR ) )
303 {
304 if( RRorCC )
305 {
306/*
307* sub( A ) and sub( B ) are both row or column vectors
308*/
309 zero = TYPE->zero;
310 if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmcadd;
311 else add = TYPE->Fmmadd;
312 pad = TYPE->Ftzpad;
313
314 AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, AmyprocD, AprocD,
315 AnprocsD );
316/*
317* sub( A ) and sub( B ) are in the same process row or column
318*/
319 if( AprocR == BprocR )
320 {
321/*
322* In each process, the distributed part of sub( A ) is added to sub( B ). In
323* the other processes, this replicated of sub( B ) is set to zero for later
324* reduction.
325*/
326 if( AnpD > 0 )
327 {
328 Aroc = AprocD;
329 if( BisRow ) { kk = Ajj; ktmp = JB + N; kn = JB + Ainb1D; }
330 else { kk = Aii; ktmp = IB + M; kn = IB + Ainb1D; }
331
332 if( AmyprocD == Aroc )
333 {
334 if( BisRow )
335 add( &M, &Ainb1D, ALPHA, Mptr( A, Aii, Ajj, Ald, size ),
336 &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
337 else
338 add( &Ainb1D, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ),
339 &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
340 kk += Ainb1D;
341 }
342 else
343 {
344 if( BisRow )
345 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M, &Ainb1D,
346 &izero, zero, zero, Mptr( B, Bii, Bjj, Bld, size ),
347 &Bld );
348 else
349 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Ainb1D, &N,
350 &izero, zero, zero, Mptr( B, Bii, Bjj, Bld, size ),
351 &Bld );
352 }
353 Aroc = MModAdd1( Aroc, AnprocsD );
354
355 for( k = kn; k < ktmp; k += AnbD )
356 {
357 kbb = ktmp - k; kbb = MIN( kbb, AnbD );
358
359 if( AmyprocD == Aroc )
360 {
361 if( BisRow )
362 add( &M, &kbb, ALPHA, Mptr( A, Aii, kk, Ald, size ),
363 &Ald, BETA, Mptr( B, Bii, k, Bld, size ),
364 &Bld );
365 else
366 add( &kbb, &N, ALPHA, Mptr( A, kk, Ajj, Ald, size ),
367 &Ald, BETA, Mptr( B, k, Bjj, Bld, size ),
368 &Bld );
369 kk += kbb;
370 }
371 else
372 {
373 if( BisRow )
374 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M, &kbb,
375 &izero, zero, zero, Mptr( B, Bii, k, Bld,
376 size ), &Bld );
377 else
378 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &kbb, &N,
379 &izero, zero, zero, Mptr( B, k, Bjj, Bld,
380 size ), &Bld );
381 }
382 Aroc = MModAdd1( Aroc, AnprocsD );
383 }
384 }
385 else
386 {
387/*
388* If I don't own any entries of sub( A ), then zero the entire sub( B )
389* residing in this process.
390*/
391 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M, &N, &izero,
392 zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
393 }
394/*
395* Replicate locally scattered sub( B ) by reducing it
396*/
397 scope = ( BisRow ? CROW : CCOLUMN );
398 top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET );
399 TYPE->Cgsum2d( ctxt, &scope, top, M, N, Mptr( B, Bii, Bjj, Bld,
400 size ), Bld, -1, 0 );
401 }
402 else
403 {
404/*
405* sub( A ) and sub( B ) are in a different process row or column
406*/
407 if( AmyprocR == AprocR )
408 {
409/*
410* If I own a piece of sub( A ), then send it to the corresponding process row
411* or column where sub( B ) resides.
412*/
413 if( AnpD > 0 )
414 {
415 if( AisRow )
416 TYPE->Cgesd2d( ctxt, M, AnpD, Mptr( A, Aii, Ajj, Ald,
417 size ), Ald, BprocR, BmyprocD );
418 else
419 TYPE->Cgesd2d( ctxt, AnpD, N, Mptr( A, Aii, Ajj, Ald,
420 size ), Ald, BmyprocD, BprocR );
421 }
422 }
423
424 if( BmyprocR == BprocR )
425 {
426/*
427* If I own sub( B ), then receive and unpack distributed part of sub( A ) that
428* should be added to sub( B ). Combine the results.
429*/
430 if( AnpD > 0 )
431 {
432 if( BisRow )
433 {
434 ktmp = JB + N;
435 kn = JB + Ainb1D;
436 buf = PB_Cmalloc( M * AnpD * size );
437 TYPE->Cgerv2d( ctxt, M, AnpD, buf, M, AprocR,
438 AmyprocD );
439 }
440 else
441 {
442 ktmp = IB + M;
443 kn = IB + Ainb1D;
444 buf = PB_Cmalloc( AnpD * N * size );
445 TYPE->Cgerv2d( ctxt, AnpD, N, buf, AnpD, AmyprocD,
446 AprocR );
447 }
448 Aroc = AprocD;
449 kk = 0;
450
451 if( AmyprocD == Aroc )
452 {
453 if( BisRow )
454 add( &M, &Ainb1D, ALPHA, buf, &M, BETA, Mptr( B,
455 Bii, Bjj, Bld, size ), &Bld );
456 else
457 add( &Ainb1D, &N, ALPHA, buf, &AnpD, BETA, Mptr( B,
458 Bii, Bjj, Bld, size ), &Bld );
459 kk += Ainb1D;
460 }
461 else
462 {
463 if( BisRow )
464 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M,
465 &Ainb1D, &izero, zero, zero, Mptr( B, Bii, Bjj,
466 Bld, size ), &Bld );
467 else
468 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Ainb1D,
469 &N, &izero, zero, zero, Mptr( B, Bii, Bjj, Bld,
470 size ), &Bld );
471 }
472 Aroc = MModAdd1( Aroc, AnprocsD );
473
474 for( k = kn; k < ktmp; k += AnbD )
475 {
476 kbb = ktmp - k; kbb = MIN( kbb, AnbD );
477
478 if( AmyprocD == Aroc )
479 {
480 if( BisRow )
481 add( &M, &kbb, ALPHA, Mptr( buf, 0, kk, M, size ),
482 &M, BETA, Mptr( B, Bii, k, Bld, size ),
483 &Bld );
484 else
485 add( &kbb, &N, ALPHA, Mptr( buf, kk, 0, AnpD,
486 size ), &AnpD, BETA, Mptr( B, k, Bjj, Bld,
487 size ), &Bld );
488 kk += kbb;
489 }
490 else
491 {
492 if( BisRow )
493 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M,
494 &kbb, &izero, zero, zero, Mptr( B, Bii, k,
495 Bld, size ), &Bld );
496 else
497 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &kbb,
498 &N, &izero, zero, zero, Mptr( B, k, Bjj, Bld,
499 size ), &Bld );
500 }
501 Aroc = MModAdd1( Aroc, AnprocsD );
502 }
503 if( buf ) free( buf );
504 }
505 else
506 {
507/*
508* If I don't own any entries of sub( A ), then zero the entire sub( B )
509* residing in this process.
510*/
511 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M, &N, &izero,
512 zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
513 }
514/*
515* Replicate locally scattered sub( B ) by reducing it
516*/
517 scope = ( BisRow ? CROW : CCOLUMN );
518 top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET );
519 TYPE->Cgsum2d( ctxt, &scope, top, M, N, Mptr( B, Bii, Bjj,
520 Bld, size ), Bld, -1, 0 );
521 }
522 }
523 }
524 else
525 {
526/*
527* sub( A ) and sub( B ) are not both row or column vectors
528*/
529 zero = TYPE->zero;
530 if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmtcadd;
531 else add = TYPE->Fmmtadd;
532 pad = TYPE->Ftzpad;
533
534 Broc = 0;
535 if( BisRow ) { ktmp = JB + M; kn = JB + Ainb1D; }
536 else { ktmp = IB + N; kn = IB + Ainb1D; }
537/*
538* Loop over the processes in which sub( A ) resides, for each process find the
539* next process Xroc. Exchange and add the data.
540*/
541 for( p = 0; p < AnprocsD; p++ )
542 {
543 mydist = MModSub( p, AprocD, AnprocsD );
544 myproc = MModAdd( AprocD, mydist, AnprocsD );
545
546 if( ( BprocR == p ) && ( AprocR == Broc ) )
547 {
548 if( BmyprocR == p )
549 {
550/*
551* local add at the intersection of the process cross
552*/
553 AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, p, AprocD,
554 AnprocsD );
555 if( AnpD > 0 )
556 {
557 Aroc = AprocD;
558 kk = ( BisRow ? Aii : Ajj );
559
560 if( myproc == Aroc )
561 {
562 if( BmyprocD == Broc )
563 {
564 if( AisRow )
565 add( &M, &Ainb1D, ALPHA, Mptr( A, Aii, Ajj,
566 Ald, size ), &Ald, BETA, Mptr( B, Bii,
567 Bjj, Bld, size ), &Bld );
568 else
569 add( &Ainb1D, &N, ALPHA, Mptr( A, Aii, Ajj,
570 Ald, size ), &Ald, BETA, Mptr( B, Bii,
571 Bjj, Bld, size ), &Bld );
572 kk += Ainb1D;
573 }
574 else
575 {
576 if( BisRow )
577 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &N,
578 &Ainb1D, &izero, zero, zero, Mptr( B, Bii,
579 Bjj, Bld, size ), &Bld );
580 else
581 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ),
582 &Ainb1D, &M, &izero, zero, zero, Mptr( B,
583 Bii, Bjj, Bld, size ), &Bld );
584 }
585 }
586 Aroc = MModAdd1( Aroc, AnprocsD );
587
588 for( k = kn; k < ktmp; k += AnbD )
589 {
590 kbb = ktmp - k; kbb = MIN( kbb, AnbD );
591 if( myproc == Aroc )
592 {
593 if( BmyprocD == Broc )
594 {
595 if( AisRow )
596 add( &M, &kbb, ALPHA, Mptr( A, Aii, kk, Ald,
597 size ), &Ald, BETA, Mptr( B, k, Bjj,
598 Bld, size ), &Bld );
599 else
600 add( &kbb, &N, ALPHA, Mptr( A, kk, Ajj, Ald,
601 size ), &Ald, BETA, Mptr( B, Bii, k,
602 Bld, size ), &Bld );
603 kk += kbb;
604 }
605 else
606 {
607 if( BisRow )
608 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ),
609 &N, &kbb, &izero, zero, zero, Mptr( B,
610 Bii, k, Bld, size ), &Bld );
611 else
612 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ),
613 &kbb, &M, &izero, zero, zero, Mptr( B,
614 k, Bjj, Bld, size ), &Bld );
615 }
616 }
617 Aroc = MModAdd1( Aroc, AnprocsD );
618 }
619 }
620 }
621 }
622 else
623 {
624/*
625* Message exchange
626*/
627 if( ( AmyprocR == AprocR ) && ( AmyprocD == p ) )
628 {
629 AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, p, AprocD,
630 AnprocsD );
631 if( AnpD > 0 )
632 {
633 if( AisRow )
634 TYPE->Cgesd2d( ctxt, M, AnpD, Mptr( A, Aii, Ajj, Ald,
635 size ), Ald, Broc, BprocR );
636 else
637 TYPE->Cgesd2d( ctxt, AnpD, N, Mptr( A, Aii, Ajj, Ald,
638 size ), Ald, BprocR, Broc );
639 }
640 }
641
642 if( BmyprocR == BprocR )
643 {
644 AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, p, AprocD,
645 AnprocsD );
646 if( AnpD > 0 )
647 {
648 Aroc = AprocD;
649 kk = 0;
650
651 if( BmyprocD == Broc )
652 {
653 if( AisRow )
654 {
655 buf = PB_Cmalloc( M * AnpD * size );
656 TYPE->Cgerv2d( ctxt, M, AnpD, buf, M, AprocR, p );
657 }
658 else
659 {
660 buf = PB_Cmalloc( AnpD * N * size );
661 TYPE->Cgerv2d( ctxt, AnpD, N, buf, AnpD, p,
662 AprocR );
663 }
664 }
665
666 if( myproc == Aroc )
667 {
668 if( BmyprocD == Broc )
669 {
670 if( AisRow )
671 add( &M, &Ainb1D, ALPHA, buf, &M, BETA,
672 Mptr( B, Bii, Bjj, Bld, size ), &Bld );
673 else
674 add( &Ainb1D, &N, ALPHA, buf, &AnpD, BETA,
675 Mptr( B, Bii, Bjj, Bld, size ), &Bld );
676 kk += Ainb1D;
677 }
678 else
679 {
680 if( BisRow )
681 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &N,
682 &Ainb1D, &izero, zero, zero, Mptr( B, Bii,
683 Bjj, Bld, size ), &Bld );
684 else
685 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ),
686 &Ainb1D, &M, &izero, zero, zero, Mptr( B,
687 Bii, Bjj, Bld, size ), &Bld );
688 }
689 }
690 Aroc = MModAdd1( Aroc, AnprocsD );
691
692 for( k = kn; k < ktmp; k += AnbD )
693 {
694 kbb = ktmp - k; kbb = MIN( kbb, AnbD );
695 if( myproc == Aroc )
696 {
697 if( BmyprocD == Broc )
698 {
699 if( AisRow )
700 add( &M, &kbb, ALPHA, Mptr( buf, 0, kk, M,
701 size ), &M, BETA, Mptr( B, k, Bjj,
702 Bld, size ), &Bld );
703 else
704 add( &kbb, &N, ALPHA, Mptr( buf, kk, 0,
705 AnpD, size ), &AnpD, BETA, Mptr( B,
706 Bii, k, Bld, size ), &Bld );
707 kk += kbb;
708 }
709 else
710 {
711 if( BisRow )
712 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ),
713 &N, &kbb, &izero, zero, zero, Mptr( B,
714 Bii, k, Bld, size ), &Bld );
715 else
716 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ),
717 &kbb, &M, &izero, zero, zero, Mptr( B,
718 k, Bjj, Bld, size ), &Bld );
719 }
720 }
721 Aroc = MModAdd1( Aroc, AnprocsD );
722 }
723 if( ( BmyprocD == Broc ) && ( buf ) ) free( buf );
724 }
725 }
726 }
727 Broc = MModAdd1( Broc, BnprocsD );
728 }
729
730 if( BmyprocR == BprocR )
731 {
732/*
733* Replicate locally scattered sub( B ) by reducing it
734*/
735 scope = ( BisRow ? CROW : CCOLUMN );
736 top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET );
737 TYPE->Cgsum2d( ctxt, &scope, top, N, M, Mptr( B, Bii, Bjj, Bld,
738 size ), Bld, -1, 0 );
739 }
740 }
741 }
742
743 if( BisR )
744 {
745/*
746* Replicate sub( B )
747*/
748 if( BisRow )
749 {
750 if( AisRow ) { Bm = M; Bn = N; }
751 else { Bm = N; Bn = M; }
752 top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
753 if( BmyprocR == BprocR )
754 TYPE->Cgebs2d( ctxt, COLUMN, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld,
755 size ), Bld );
756 else
757 TYPE->Cgebr2d( ctxt, COLUMN, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld,
758 size ), Bld, BprocR, BmyprocD );
759 }
760 else
761 {
762 if( AisRow ) { Bm = N; Bn = M; }
763 else { Bm = M; Bn = N; }
764 top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
765 if( BmyprocR == BprocR )
766 TYPE->Cgebs2d( ctxt, ROW, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld,
767 size ), Bld );
768 else
769 TYPE->Cgebr2d( ctxt, ROW, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld,
770 size ), Bld, BmyprocD, BprocR );
771 }
772 }
773 }
774 else
775 {
776/*
777* sub( B ) is replicated in every process. Add the data in process row or
778* column AprocR when sub( A ) is not replicated and in every process otherwise.
779*/
780 if( AisR || ( AmyprocR == AprocR ) )
781 {
782 zero = TYPE->zero;
783 if( RRorCC )
784 {
785 if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmcadd;
786 else add = TYPE->Fmmadd;
787 }
788 else
789 {
790 if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmtcadd;
791 else add = TYPE->Fmmtadd;
792 }
793 pad = TYPE->Ftzpad;
794
795 AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, AmyprocD, AprocD, AnprocsD );
796 if( AnpD > 0 )
797 {
798 Aroc = AprocD;
799 kk = ( AisRow ? Ajj : Aii );
800
801 if( BisRow ) { ktmp = JB + ( RRorCC ? N : M ); kn = JB + Ainb1D; }
802 else { ktmp = IB + ( RRorCC ? M : N ); kn = IB + Ainb1D; }
803
804 if( AmyprocD == Aroc )
805 {
806 if( AisRow )
807 add( &M, &Ainb1D, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald,
808 BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
809 else
810 add( &Ainb1D, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald,
811 BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
812 kk += Ainb1D;
813 }
814 else
815 {
816 if( RRorCC )
817 {
818 if( AisRow ) { Bm = M; Bn = Ainb1D; }
819 else { Bm = Ainb1D; Bn = N; }
820 }
821 else
822 {
823 if( AisRow ) { Bm = Ainb1D; Bn = M; }
824 else { Bm = N; Bn = Ainb1D; }
825 }
826 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Bm, &Bn, &izero,
827 zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
828 }
829 Aroc = MModAdd1( Aroc, AnprocsD );
830
831 for( k = kn; k < ktmp; k += AnbD )
832 {
833 kbb = ktmp - k; kbb = MIN( kbb, AnbD );
834
835 if( BisRow ) { buf = Mptr( B, Bii, k, Bld, size ); }
836 else { buf = Mptr( B, k, Bjj, Bld, size ); }
837
838 if( AmyprocD == Aroc )
839 {
840 if( AisRow )
841 add( &M, &kbb, ALPHA, Mptr( A, Aii, kk, Ald, size ), &Ald,
842 BETA, buf, &Bld );
843 else
844 add( &kbb, &N, ALPHA, Mptr( A, kk, Ajj, Ald, size ), &Ald,
845 BETA, buf, &Bld );
846 kk += kbb;
847 }
848 else
849 {
850 if( RRorCC )
851 {
852 if( AisRow ) { Bm = M; Bn = kbb; }
853 else { Bm = kbb; Bn = N; }
854 }
855 else
856 {
857 if( AisRow ) { Bm = kbb; Bn = M; }
858 else { Bm = N; Bn = kbb; }
859 }
860 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Bm, &Bn, &izero,
861 zero, zero, buf, &Bld );
862 }
863 Aroc = MModAdd1( Aroc, AnprocsD );
864 }
865 }
866 else
867 {
868 if( RRorCC )
869 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M, &N, &izero, zero,
870 zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
871 else
872 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &N, &M, &izero, zero,
873 zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
874 }
875/*
876* Replicate locally scattered sub( B ) by reducing it in the process scope of
877* sub( A )
878*/
879 scope = ( AisRow ? CROW : CCOLUMN );
880 top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET );
881 if( RRorCC )
882 TYPE->Cgsum2d( ctxt, &scope, top, M, N, Mptr( B, Bii, Bjj, Bld,
883 size ), Bld, -1, 0 );
884 else
885 TYPE->Cgsum2d( ctxt, &scope, top, N, M, Mptr( B, Bii, Bjj, Bld,
886 size ), Bld, -1, 0 );
887 }
888
889 if( !AisR )
890 {
891/*
892* If sub( A ) is not replicated, then broadcast the result to the other pro-
893* cesses that own a piece of sub( B ), but were not involved in the above
894* addition operation.
895*/
896 if( RRorCC ) { Bm = M; Bn = N; }
897 else { Bm = N; Bn = M; }
898
899 if( AisRow )
900 {
901 top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
902 if( AmyprocR == AprocR )
903 TYPE->Cgebs2d( ctxt, COLUMN, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld,
904 size ), Bld );
905 else
906 TYPE->Cgebr2d( ctxt, COLUMN, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld,
907 size ), Bld, AprocR, AmyprocD );
908 }
909 else
910 {
911 top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
912 if( AmyprocR == AprocR )
913 TYPE->Cgebs2d( ctxt, ROW, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld,
914 size ), Bld );
915 else
916 TYPE->Cgebr2d( ctxt, ROW, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld,
917 size ), Bld, AmyprocD, AprocR );
918 }
919 }
920 }
921/*
922* End of PB_CpaxpbyDN
923*/
924}
#define Int
Definition Bconfig.h:22
F_VOID_FCT(* MMADD_T)()
Definition pblas.h:288
#define C2F_CHAR(a)
Definition pblas.h:125
F_VOID_FCT(* TZPAD_T)()
Definition pblas.h:292
#define CCOLUMN
Definition PBblacs.h:20
#define TOP_GET
Definition PBblacs.h:50
#define COLUMN
Definition PBblacs.h:45
#define COMBINE
Definition PBblacs.h:49
#define CROW
Definition PBblacs.h:21
#define ROW
Definition PBblacs.h:46
void Cblacs_gridinfo()
#define BCAST
Definition PBblacs.h:48
#define ALL
Definition PBblas.h:50
#define CNOCONJG
Definition PBblas.h:19
#define NOCONJG
Definition PBblas.h:45
#define CTXT_
Definition PBtools.h:38
Int PB_Cfirstnb()
#define MB_
Definition PBtools.h:43
void PB_CpaxpbyDN()
char * PB_Cmalloc()
void PB_Cinfog2l()
#define MModSub(I1, I2, d)
Definition PBtools.h:102
#define MIN(a_, b_)
Definition PBtools.h:76
#define Mptr(a_, i_, j_, lda_, siz_)
Definition PBtools.h:132
#define LLD_
Definition PBtools.h:47
Int PB_Cnumroc()
char * PB_Ctop()
#define MModAdd1(I, d)
Definition PBtools.h:100
#define MModAdd(I1, I2, d)
Definition PBtools.h:97
#define INB_
Definition PBtools.h:42
#define IMB_
Definition PBtools.h:41
#define Mupcase(C)
Definition PBtools.h:83
#define NB_
Definition PBtools.h:44
#define TYPE
Definition clamov.c:7