ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CpaxpbyDN.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18 
19 #ifdef __STDC__
20 void PB_CpaxpbyDN( PBTYP_T * TYPE, char * CONJUG, int M, int N,
21  char * ALPHA,
22  char * A, int IA, int JA, int * DESCA, char * AROC,
23  char * BETA,
24  char * B, int IB, int JB, int * DESCB, char * BROC )
25 #else
26 void PB_CpaxpbyDN( TYPE, CONJUG, M, N, ALPHA, A, IA, JA, DESCA, AROC,
27  BETA, B, IB, JB, DESCB, BROC )
28 /*
29 * .. Scalar Arguments ..
30 */
31  char * AROC, * BROC, * CONJUG;
32  int IA, IB, JA, JB, M, N;
33  char * ALPHA, * BETA;
34  PBTYP_T * TYPE;
35 /*
36 * .. Array Arguments ..
37 */
38  int * DESCA, * DESCB;
39  char * A, * B;
40 #endif
41 {
42 /*
43 * Purpose
44 * =======
45 *
46 * PB_CpaxpbyDN adds one submatrix to another,
47 *
48 * sub( B ) := beta * sub( B ) + alpha * sub( A ), or,
49 *
50 * sub( B ) := beta * sub( B ) + alpha * conjg( sub( A ) ),
51 *
52 * where sub( A ) is distributed and sub( B ) is not distributed.
53 *
54 * sub( A ) always denotes A(IA:IA+M-1,JA:JA+N-1). When AROC is 'R' or
55 * 'r' sub( A ) resides in a process row, otherwise sub( A ) resides in
56 * a process column. When sub( A ) resides in a process row and BROC is
57 * 'R' or 'r' or sub( A ) resides in a process column and BROC is 'C' or
58 * 'c', then sub( B ) denotes B( IB:IB+M-1, JB:JB+N-1 ), and otherwise
59 * sub( B ) denotes B(IB:IB+N-1,JB:JB+M-1).
60 * otherwise.
61 *
62 * Notes
63 * =====
64 *
65 * A description vector is associated with each 2D block-cyclicly dis-
66 * tributed matrix. This vector stores the information required to
67 * establish the mapping between a matrix entry and its corresponding
68 * process and memory location.
69 *
70 * In the following comments, the character _ should be read as
71 * "of the distributed matrix". Let A be a generic term for any 2D
72 * block cyclicly distributed matrix. Its description vector is DESC_A:
73 *
74 * NOTATION STORED IN EXPLANATION
75 * ---------------- --------------- ------------------------------------
76 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
77 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
78 * the NPROW x NPCOL BLACS process grid
79 * A is distributed over. The context
80 * itself is global, but the handle
81 * (the integer value) may vary.
82 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
83 * ted matrix A, M_A >= 0.
84 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
85 * buted matrix A, N_A >= 0.
86 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
87 * block of the matrix A, IMB_A > 0.
88 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
89 * left block of the matrix A,
90 * INB_A > 0.
91 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
92 * bute the last M_A-IMB_A rows of A,
93 * MB_A > 0.
94 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
95 * bute the last N_A-INB_A columns of
96 * A, NB_A > 0.
97 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
98 * row of the matrix A is distributed,
99 * NPROW > RSRC_A >= 0.
100 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
101 * first column of A is distributed.
102 * NPCOL > CSRC_A >= 0.
103 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
104 * array storing the local blocks of
105 * the distributed matrix A,
106 * IF( Lc( 1, N_A ) > 0 )
107 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
108 * ELSE
109 * LLD_A >= 1.
110 *
111 * Let K be the number of rows of a matrix A starting at the global in-
112 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
113 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
114 * receive if these K rows were distributed over NPROW processes. If K
115 * is the number of columns of a matrix A starting at the global index
116 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
117 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
118 * these K columns were distributed over NPCOL processes.
119 *
120 * The values of Lr() and Lc() may be determined via a call to the func-
121 * tion PB_Cnumroc:
122 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
123 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
124 *
125 * Arguments
126 * =========
127 *
128 * TYPE (local input) pointer to a PBTYP_T structure
129 * On entry, TYPE is a pointer to a structure of type PBTYP_T,
130 * that contains type information (See pblas.h).
131 *
132 * CONJUG (global input) pointer to CHAR
133 * On entry, CONJUG specifies whether conjg( sub( A ) ) or
134 * sub( A ) should be added to sub( B ) as follows:
135 * CONJUG = 'N' or 'n':
136 * sub( B ) := beta*sub( B ) + alpha*sub( A ),
137 * otherwise
138 * sub( B ) := beta*sub( B ) + alpha*conjg( sub( A ) ).
139 *
140 * M (global input) INTEGER
141 * On entry, M specifies the number of rows of the submatrix
142 * sub( A ). M must be at least zero.
143 *
144 * N (global input) INTEGER
145 * On entry, N specifies the number of columns of the submatrix
146 * sub( A ). N must be at least zero.
147 *
148 * ALPHA (global input) pointer to CHAR
149 * On entry, ALPHA specifies the scalar alpha. When ALPHA is
150 * supplied as zero then the local entries of the array A cor-
151 * responding to the entries of the submatrix sub( A ) need not
152 * be set on input.
153 *
154 * A (local input) pointer to CHAR
155 * On entry, A is an array of dimension (LLD_A, Ka), where LLD_A
156 * is at least MAX( 1, Lr( 1, IA+M-1 ) ), and, Ka is at least
157 * Lc( 1, JA+N-1 ). Before entry, this array contains the local
158 * entries of the matrix A.
159 *
160 * IA (global input) INTEGER
161 * On entry, IA specifies A's global row index, which points to
162 * the beginning of the submatrix sub( A ).
163 *
164 * JA (global input) INTEGER
165 * On entry, JA specifies A's global column index, which points
166 * to the beginning of the submatrix sub( A ).
167 *
168 * DESCA (global and local input) INTEGER array
169 * On entry, DESCA is an integer array of dimension DLEN_. This
170 * is the array descriptor for the matrix A.
171 *
172 * AROC (global input) pointer to CHAR
173 * On entry, AROC specifies the orientation of the subvector
174 * sub( A ). When AROC is 'R' or 'r', sub( A ) is a row vector,
175 * and a column vector otherwise.
176 *
177 * BETA (global input) pointer to CHAR
178 * On entry, BETA specifies the scalar beta. When BETA is sup-
179 * plied as zero then the local entries of the array B corres-
180 * ponding to the entries of the submatrix sub( B ) need not be
181 * set on input.
182 *
183 * B (local input/local output) pointer to CHAR
184 * On entry, B is an array of dimension (LLD_B, Kb), where LLD_B
185 * is at least MAX( 1, Lr( 1, IB+M-1 ) ) when sub( A ) and
186 * sub( B ) are both distributed along a process column or a
187 * process row. In that case, Kb is at least Lc( 1, JB+N-1 ).
188 * Otherwise, LLD_B is at least MAX( 1, Lr( 1, IB+N-1 ) ) and
189 * Kb is at least Lc( 1, JB+M-1 ). Before entry, this array
190 * contains the local entries of the matrix B. On exit, sub( B )
191 * is overwritten with the updated submatrix.
192 *
193 * IB (global input) INTEGER
194 * On entry, IB specifies B's global row index, which points to
195 * the beginning of the submatrix sub( B ).
196 *
197 * JB (global input) INTEGER
198 * On entry, JB specifies B's global column index, which points
199 * to the beginning of the submatrix sub( B ).
200 *
201 * DESCB (global and local input) INTEGER array
202 * On entry, DESCB is an integer array of dimension DLEN_. This
203 * is the array descriptor for the matrix B.
204 *
205 * BROC (global input) pointer to CHAR
206 * On entry, BROC specifies the orientation of the subvector
207 * sub( B ). When BROC is 'R' or 'r', sub( B ) is a row vector,
208 * and a column vector otherwise.
209 *
210 * -- Written on April 1, 1998 by
211 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
212 *
213 * ---------------------------------------------------------------------
214 */
215 /*
216 * .. Local Scalars ..
217 */
218  char scope, * top, * zero;
219  int Acol, Aii, Ainb1D, AisR, AisRow, Ajj, Ald, AmyprocD, AmyprocR,
220  AnD, AnbD, AnpD, AnprocsD, AprocD, AprocR, Aroc, Arow, Bcol,
221  Bii, BisR, BisRow, Bjj, Bld, Bm, BmyprocD, BmyprocR, Bn,
222  BnprocsD, BprocR, Broc, Brow, RRorCC, ctxt, izero=0, k, kbb,
223  kk, kn, ktmp, mycol, mydist, myproc, myrow, npcol, nprow, p,
224  size;
225  MMADD_T add;
226  TZPAD_T pad;
227 /*
228 * .. Local Arrays ..
229 */
230  char * buf = NULL;
231 /* ..
232 * .. Executable Statements ..
233 *
234 */
235 /*
236 * Retrieve process grid information
237 */
238  Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
239 /*
240 * Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ...
241 */
242  PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj,
243  &Arow, &Acol );
244  if( ( AisRow = ( Mupcase( AROC[0] ) == CROW ) ) != 0 )
245  {
246  AnD = N; AnbD = DESCA[NB_]; Ald = DESCA[LLD_];
247  AprocD = Acol; AprocR = Arow;
248  AmyprocD = mycol; AmyprocR = myrow; AnprocsD = npcol;
249  AisR = ( ( Arow == -1 ) || ( nprow == 1 ) );
250  Ainb1D = PB_Cfirstnb( AnD, JA, DESCA[INB_], AnbD );
251  }
252  else
253  {
254  AnD = M; AnbD = DESCA[MB_]; Ald = DESCA[LLD_];
255  AprocD = Arow; AprocR = Acol;
256  AmyprocD = myrow; AmyprocR = mycol; AnprocsD = nprow;
257  AisR = ( ( Acol == -1 ) || ( npcol == 1 ) );
258  Ainb1D = PB_Cfirstnb( AnD, IA, DESCA[IMB_], AnbD );
259  }
260 /*
261 * Retrieve sub( B )'s local information: Bii, Bjj, Brow, Bcol ...
262 */
263  PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj,
264  &Brow, &Bcol );
265  if( ( BisRow = ( Mupcase( BROC[0] ) == CROW ) ) != 0 )
266  {
267  Bld = DESCB[LLD_];
268  BmyprocD = mycol; BnprocsD = npcol;
269  BprocR = Brow; BmyprocR = myrow;
270  BisR = ( ( BprocR == -1 ) || ( nprow == 1 ) );
271  }
272  else
273  {
274  Bld = DESCB[LLD_];
275  BmyprocD = myrow; BnprocsD = nprow;
276  BprocR = Bcol; BmyprocR = mycol;
277  BisR = ( ( BprocR == -1 ) || ( npcol == 1 ) );
278  }
279 /*
280 * Are sub( A ) and sub( B ) both row or column vectors ?
281 */
282  RRorCC = ( ( AisRow && BisRow ) || ( !( AisRow ) && !( BisRow ) ) );
283 /*
284 * Select the local add routine accordingly
285 */
286  size = TYPE->size;
287 /*
288 * sub( A ) is distributed and sub( B ) is not distributed
289 */
290  if( !( BisR ) )
291  {
292 /*
293 * sub( B ) is not replicated. Since this operation is local if sub( B ) and
294 * sub( A ) are both row or column vectors, choose AprocR = BprocR when RRorCC,
295 * and AprocR = 0 otherwise.
296 */
297  if( AisR ) { AprocR = ( ( RRorCC ) ? BprocR : 0 ); }
298 /*
299 * Now, it is just like sub( A ) is not replicated, this information however is
300 * kept in AisR for later use.
301 */
302  if( ( AmyprocR == AprocR ) || ( BmyprocR == BprocR ) )
303  {
304  if( RRorCC )
305  {
306 /*
307 * sub( A ) and sub( B ) are both row or column vectors
308 */
309  zero = TYPE->zero;
310  if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmcadd;
311  else add = TYPE->Fmmadd;
312  pad = TYPE->Ftzpad;
313 
314  AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, AmyprocD, AprocD,
315  AnprocsD );
316 /*
317 * sub( A ) and sub( B ) are in the same process row or column
318 */
319  if( AprocR == BprocR )
320  {
321 /*
322 * In each process, the distributed part of sub( A ) is added to sub( B ). In
323 * the other processes, this replicated of sub( B ) is set to zero for later
324 * reduction.
325 */
326  if( AnpD > 0 )
327  {
328  Aroc = AprocD;
329  if( BisRow ) { kk = Ajj; ktmp = JB + N; kn = JB + Ainb1D; }
330  else { kk = Aii; ktmp = IB + M; kn = IB + Ainb1D; }
331 
332  if( AmyprocD == Aroc )
333  {
334  if( BisRow )
335  add( &M, &Ainb1D, ALPHA, Mptr( A, Aii, Ajj, Ald, size ),
336  &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
337  else
338  add( &Ainb1D, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ),
339  &Ald, BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
340  kk += Ainb1D;
341  }
342  else
343  {
344  if( BisRow )
345  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M, &Ainb1D,
346  &izero, zero, zero, Mptr( B, Bii, Bjj, Bld, size ),
347  &Bld );
348  else
349  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Ainb1D, &N,
350  &izero, zero, zero, Mptr( B, Bii, Bjj, Bld, size ),
351  &Bld );
352  }
353  Aroc = MModAdd1( Aroc, AnprocsD );
354 
355  for( k = kn; k < ktmp; k += AnbD )
356  {
357  kbb = ktmp - k; kbb = MIN( kbb, AnbD );
358 
359  if( AmyprocD == Aroc )
360  {
361  if( BisRow )
362  add( &M, &kbb, ALPHA, Mptr( A, Aii, kk, Ald, size ),
363  &Ald, BETA, Mptr( B, Bii, k, Bld, size ),
364  &Bld );
365  else
366  add( &kbb, &N, ALPHA, Mptr( A, kk, Ajj, Ald, size ),
367  &Ald, BETA, Mptr( B, k, Bjj, Bld, size ),
368  &Bld );
369  kk += kbb;
370  }
371  else
372  {
373  if( BisRow )
374  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M, &kbb,
375  &izero, zero, zero, Mptr( B, Bii, k, Bld,
376  size ), &Bld );
377  else
378  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &kbb, &N,
379  &izero, zero, zero, Mptr( B, k, Bjj, Bld,
380  size ), &Bld );
381  }
382  Aroc = MModAdd1( Aroc, AnprocsD );
383  }
384  }
385  else
386  {
387 /*
388 * If I don't own any entries of sub( A ), then zero the entire sub( B )
389 * residing in this process.
390 */
391  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M, &N, &izero,
392  zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
393  }
394 /*
395 * Replicate locally scattered sub( B ) by reducing it
396 */
397  scope = ( BisRow ? CROW : CCOLUMN );
398  top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET );
399  TYPE->Cgsum2d( ctxt, &scope, top, M, N, Mptr( B, Bii, Bjj, Bld,
400  size ), Bld, -1, 0 );
401  }
402  else
403  {
404 /*
405 * sub( A ) and sub( B ) are in a different process row or column
406 */
407  if( AmyprocR == AprocR )
408  {
409 /*
410 * If I own a piece of sub( A ), then send it to the corresponding process row
411 * or column where sub( B ) resides.
412 */
413  if( AnpD > 0 )
414  {
415  if( AisRow )
416  TYPE->Cgesd2d( ctxt, M, AnpD, Mptr( A, Aii, Ajj, Ald,
417  size ), Ald, BprocR, BmyprocD );
418  else
419  TYPE->Cgesd2d( ctxt, AnpD, N, Mptr( A, Aii, Ajj, Ald,
420  size ), Ald, BmyprocD, BprocR );
421  }
422  }
423 
424  if( BmyprocR == BprocR )
425  {
426 /*
427 * If I own sub( B ), then receive and unpack distributed part of sub( A ) that
428 * should be added to sub( B ). Combine the results.
429 */
430  if( AnpD > 0 )
431  {
432  if( BisRow )
433  {
434  ktmp = JB + N;
435  kn = JB + Ainb1D;
436  buf = PB_Cmalloc( M * AnpD * size );
437  TYPE->Cgerv2d( ctxt, M, AnpD, buf, M, AprocR,
438  AmyprocD );
439  }
440  else
441  {
442  ktmp = IB + M;
443  kn = IB + Ainb1D;
444  buf = PB_Cmalloc( AnpD * N * size );
445  TYPE->Cgerv2d( ctxt, AnpD, N, buf, AnpD, AmyprocD,
446  AprocR );
447  }
448  Aroc = AprocD;
449  kk = 0;
450 
451  if( AmyprocD == Aroc )
452  {
453  if( BisRow )
454  add( &M, &Ainb1D, ALPHA, buf, &M, BETA, Mptr( B,
455  Bii, Bjj, Bld, size ), &Bld );
456  else
457  add( &Ainb1D, &N, ALPHA, buf, &AnpD, BETA, Mptr( B,
458  Bii, Bjj, Bld, size ), &Bld );
459  kk += Ainb1D;
460  }
461  else
462  {
463  if( BisRow )
464  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M,
465  &Ainb1D, &izero, zero, zero, Mptr( B, Bii, Bjj,
466  Bld, size ), &Bld );
467  else
468  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Ainb1D,
469  &N, &izero, zero, zero, Mptr( B, Bii, Bjj, Bld,
470  size ), &Bld );
471  }
472  Aroc = MModAdd1( Aroc, AnprocsD );
473 
474  for( k = kn; k < ktmp; k += AnbD )
475  {
476  kbb = ktmp - k; kbb = MIN( kbb, AnbD );
477 
478  if( AmyprocD == Aroc )
479  {
480  if( BisRow )
481  add( &M, &kbb, ALPHA, Mptr( buf, 0, kk, M, size ),
482  &M, BETA, Mptr( B, Bii, k, Bld, size ),
483  &Bld );
484  else
485  add( &kbb, &N, ALPHA, Mptr( buf, kk, 0, AnpD,
486  size ), &AnpD, BETA, Mptr( B, k, Bjj, Bld,
487  size ), &Bld );
488  kk += kbb;
489  }
490  else
491  {
492  if( BisRow )
493  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M,
494  &kbb, &izero, zero, zero, Mptr( B, Bii, k,
495  Bld, size ), &Bld );
496  else
497  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &kbb,
498  &N, &izero, zero, zero, Mptr( B, k, Bjj, Bld,
499  size ), &Bld );
500  }
501  Aroc = MModAdd1( Aroc, AnprocsD );
502  }
503  if( buf ) free( buf );
504  }
505  else
506  {
507 /*
508 * If I don't own any entries of sub( A ), then zero the entire sub( B )
509 * residing in this process.
510 */
511  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M, &N, &izero,
512  zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
513  }
514 /*
515 * Replicate locally scattered sub( B ) by reducing it
516 */
517  scope = ( BisRow ? CROW : CCOLUMN );
518  top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET );
519  TYPE->Cgsum2d( ctxt, &scope, top, M, N, Mptr( B, Bii, Bjj,
520  Bld, size ), Bld, -1, 0 );
521  }
522  }
523  }
524  else
525  {
526 /*
527 * sub( A ) and sub( B ) are not both row or column vectors
528 */
529  zero = TYPE->zero;
530  if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmtcadd;
531  else add = TYPE->Fmmtadd;
532  pad = TYPE->Ftzpad;
533 
534  Broc = 0;
535  if( BisRow ) { ktmp = JB + M; kn = JB + Ainb1D; }
536  else { ktmp = IB + N; kn = IB + Ainb1D; }
537 /*
538 * Loop over the processes in which sub( A ) resides, for each process find the
539 * next process Xroc. Exchange and add the data.
540 */
541  for( p = 0; p < AnprocsD; p++ )
542  {
543  mydist = MModSub( p, AprocD, AnprocsD );
544  myproc = MModAdd( AprocD, mydist, AnprocsD );
545 
546  if( ( BprocR == p ) && ( AprocR == Broc ) )
547  {
548  if( BmyprocR == p )
549  {
550 /*
551 * local add at the intersection of the process cross
552 */
553  AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, p, AprocD,
554  AnprocsD );
555  if( AnpD > 0 )
556  {
557  Aroc = AprocD;
558  kk = ( BisRow ? Aii : Ajj );
559 
560  if( myproc == Aroc )
561  {
562  if( BmyprocD == Broc )
563  {
564  if( AisRow )
565  add( &M, &Ainb1D, ALPHA, Mptr( A, Aii, Ajj,
566  Ald, size ), &Ald, BETA, Mptr( B, Bii,
567  Bjj, Bld, size ), &Bld );
568  else
569  add( &Ainb1D, &N, ALPHA, Mptr( A, Aii, Ajj,
570  Ald, size ), &Ald, BETA, Mptr( B, Bii,
571  Bjj, Bld, size ), &Bld );
572  kk += Ainb1D;
573  }
574  else
575  {
576  if( BisRow )
577  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &N,
578  &Ainb1D, &izero, zero, zero, Mptr( B, Bii,
579  Bjj, Bld, size ), &Bld );
580  else
581  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ),
582  &Ainb1D, &M, &izero, zero, zero, Mptr( B,
583  Bii, Bjj, Bld, size ), &Bld );
584  }
585  }
586  Aroc = MModAdd1( Aroc, AnprocsD );
587 
588  for( k = kn; k < ktmp; k += AnbD )
589  {
590  kbb = ktmp - k; kbb = MIN( kbb, AnbD );
591  if( myproc == Aroc )
592  {
593  if( BmyprocD == Broc )
594  {
595  if( AisRow )
596  add( &M, &kbb, ALPHA, Mptr( A, Aii, kk, Ald,
597  size ), &Ald, BETA, Mptr( B, k, Bjj,
598  Bld, size ), &Bld );
599  else
600  add( &kbb, &N, ALPHA, Mptr( A, kk, Ajj, Ald,
601  size ), &Ald, BETA, Mptr( B, Bii, k,
602  Bld, size ), &Bld );
603  kk += kbb;
604  }
605  else
606  {
607  if( BisRow )
608  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ),
609  &N, &kbb, &izero, zero, zero, Mptr( B,
610  Bii, k, Bld, size ), &Bld );
611  else
612  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ),
613  &kbb, &M, &izero, zero, zero, Mptr( B,
614  k, Bjj, Bld, size ), &Bld );
615  }
616  }
617  Aroc = MModAdd1( Aroc, AnprocsD );
618  }
619  }
620  }
621  }
622  else
623  {
624 /*
625 * Message exchange
626 */
627  if( ( AmyprocR == AprocR ) && ( AmyprocD == p ) )
628  {
629  AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, p, AprocD,
630  AnprocsD );
631  if( AnpD > 0 )
632  {
633  if( AisRow )
634  TYPE->Cgesd2d( ctxt, M, AnpD, Mptr( A, Aii, Ajj, Ald,
635  size ), Ald, Broc, BprocR );
636  else
637  TYPE->Cgesd2d( ctxt, AnpD, N, Mptr( A, Aii, Ajj, Ald,
638  size ), Ald, BprocR, Broc );
639  }
640  }
641 
642  if( BmyprocR == BprocR )
643  {
644  AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, p, AprocD,
645  AnprocsD );
646  if( AnpD > 0 )
647  {
648  Aroc = AprocD;
649  kk = 0;
650 
651  if( BmyprocD == Broc )
652  {
653  if( AisRow )
654  {
655  buf = PB_Cmalloc( M * AnpD * size );
656  TYPE->Cgerv2d( ctxt, M, AnpD, buf, M, AprocR, p );
657  }
658  else
659  {
660  buf = PB_Cmalloc( AnpD * N * size );
661  TYPE->Cgerv2d( ctxt, AnpD, N, buf, AnpD, p,
662  AprocR );
663  }
664  }
665 
666  if( myproc == Aroc )
667  {
668  if( BmyprocD == Broc )
669  {
670  if( AisRow )
671  add( &M, &Ainb1D, ALPHA, buf, &M, BETA,
672  Mptr( B, Bii, Bjj, Bld, size ), &Bld );
673  else
674  add( &Ainb1D, &N, ALPHA, buf, &AnpD, BETA,
675  Mptr( B, Bii, Bjj, Bld, size ), &Bld );
676  kk += Ainb1D;
677  }
678  else
679  {
680  if( BisRow )
681  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &N,
682  &Ainb1D, &izero, zero, zero, Mptr( B, Bii,
683  Bjj, Bld, size ), &Bld );
684  else
685  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ),
686  &Ainb1D, &M, &izero, zero, zero, Mptr( B,
687  Bii, Bjj, Bld, size ), &Bld );
688  }
689  }
690  Aroc = MModAdd1( Aroc, AnprocsD );
691 
692  for( k = kn; k < ktmp; k += AnbD )
693  {
694  kbb = ktmp - k; kbb = MIN( kbb, AnbD );
695  if( myproc == Aroc )
696  {
697  if( BmyprocD == Broc )
698  {
699  if( AisRow )
700  add( &M, &kbb, ALPHA, Mptr( buf, 0, kk, M,
701  size ), &M, BETA, Mptr( B, k, Bjj,
702  Bld, size ), &Bld );
703  else
704  add( &kbb, &N, ALPHA, Mptr( buf, kk, 0,
705  AnpD, size ), &AnpD, BETA, Mptr( B,
706  Bii, k, Bld, size ), &Bld );
707  kk += kbb;
708  }
709  else
710  {
711  if( BisRow )
712  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ),
713  &N, &kbb, &izero, zero, zero, Mptr( B,
714  Bii, k, Bld, size ), &Bld );
715  else
716  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ),
717  &kbb, &M, &izero, zero, zero, Mptr( B,
718  k, Bjj, Bld, size ), &Bld );
719  }
720  }
721  Aroc = MModAdd1( Aroc, AnprocsD );
722  }
723  if( ( BmyprocD == Broc ) && ( buf ) ) free( buf );
724  }
725  }
726  }
727  Broc = MModAdd1( Broc, BnprocsD );
728  }
729 
730  if( BmyprocR == BprocR )
731  {
732 /*
733 * Replicate locally scattered sub( B ) by reducing it
734 */
735  scope = ( BisRow ? CROW : CCOLUMN );
736  top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET );
737  TYPE->Cgsum2d( ctxt, &scope, top, N, M, Mptr( B, Bii, Bjj, Bld,
738  size ), Bld, -1, 0 );
739  }
740  }
741  }
742 
743  if( BisR )
744  {
745 /*
746 * Replicate sub( B )
747 */
748  if( BisRow )
749  {
750  if( AisRow ) { Bm = M; Bn = N; }
751  else { Bm = N; Bn = M; }
752  top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
753  if( BmyprocR == BprocR )
754  TYPE->Cgebs2d( ctxt, COLUMN, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld,
755  size ), Bld );
756  else
757  TYPE->Cgebr2d( ctxt, COLUMN, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld,
758  size ), Bld, BprocR, BmyprocD );
759  }
760  else
761  {
762  if( AisRow ) { Bm = N; Bn = M; }
763  else { Bm = M; Bn = N; }
764  top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
765  if( BmyprocR == BprocR )
766  TYPE->Cgebs2d( ctxt, ROW, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld,
767  size ), Bld );
768  else
769  TYPE->Cgebr2d( ctxt, ROW, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld,
770  size ), Bld, BmyprocD, BprocR );
771  }
772  }
773  }
774  else
775  {
776 /*
777 * sub( B ) is replicated in every process. Add the data in process row or
778 * column AprocR when sub( A ) is not replicated and in every process otherwise.
779 */
780  if( AisR || ( AmyprocR == AprocR ) )
781  {
782  zero = TYPE->zero;
783  if( RRorCC )
784  {
785  if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmcadd;
786  else add = TYPE->Fmmadd;
787  }
788  else
789  {
790  if( Mupcase( CONJUG[0] ) != CNOCONJG ) add = TYPE->Fmmtcadd;
791  else add = TYPE->Fmmtadd;
792  }
793  pad = TYPE->Ftzpad;
794 
795  AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, AmyprocD, AprocD, AnprocsD );
796  if( AnpD > 0 )
797  {
798  Aroc = AprocD;
799  kk = ( AisRow ? Ajj : Aii );
800 
801  if( BisRow ) { ktmp = JB + ( RRorCC ? N : M ); kn = JB + Ainb1D; }
802  else { ktmp = IB + ( RRorCC ? M : N ); kn = IB + Ainb1D; }
803 
804  if( AmyprocD == Aroc )
805  {
806  if( AisRow )
807  add( &M, &Ainb1D, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald,
808  BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
809  else
810  add( &Ainb1D, &N, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald,
811  BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
812  kk += Ainb1D;
813  }
814  else
815  {
816  if( RRorCC )
817  {
818  if( AisRow ) { Bm = M; Bn = Ainb1D; }
819  else { Bm = Ainb1D; Bn = N; }
820  }
821  else
822  {
823  if( AisRow ) { Bm = Ainb1D; Bn = M; }
824  else { Bm = N; Bn = Ainb1D; }
825  }
826  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Bm, &Bn, &izero,
827  zero, zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
828  }
829  Aroc = MModAdd1( Aroc, AnprocsD );
830 
831  for( k = kn; k < ktmp; k += AnbD )
832  {
833  kbb = ktmp - k; kbb = MIN( kbb, AnbD );
834 
835  if( BisRow ) { buf = Mptr( B, Bii, k, Bld, size ); }
836  else { buf = Mptr( B, k, Bjj, Bld, size ); }
837 
838  if( AmyprocD == Aroc )
839  {
840  if( AisRow )
841  add( &M, &kbb, ALPHA, Mptr( A, Aii, kk, Ald, size ), &Ald,
842  BETA, buf, &Bld );
843  else
844  add( &kbb, &N, ALPHA, Mptr( A, kk, Ajj, Ald, size ), &Ald,
845  BETA, buf, &Bld );
846  kk += kbb;
847  }
848  else
849  {
850  if( RRorCC )
851  {
852  if( AisRow ) { Bm = M; Bn = kbb; }
853  else { Bm = kbb; Bn = N; }
854  }
855  else
856  {
857  if( AisRow ) { Bm = kbb; Bn = M; }
858  else { Bm = N; Bn = kbb; }
859  }
860  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &Bm, &Bn, &izero,
861  zero, zero, buf, &Bld );
862  }
863  Aroc = MModAdd1( Aroc, AnprocsD );
864  }
865  }
866  else
867  {
868  if( RRorCC )
869  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &M, &N, &izero, zero,
870  zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
871  else
872  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &N, &M, &izero, zero,
873  zero, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
874  }
875 /*
876 * Replicate locally scattered sub( B ) by reducing it in the process scope of
877 * sub( A )
878 */
879  scope = ( AisRow ? CROW : CCOLUMN );
880  top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET );
881  if( RRorCC )
882  TYPE->Cgsum2d( ctxt, &scope, top, M, N, Mptr( B, Bii, Bjj, Bld,
883  size ), Bld, -1, 0 );
884  else
885  TYPE->Cgsum2d( ctxt, &scope, top, N, M, Mptr( B, Bii, Bjj, Bld,
886  size ), Bld, -1, 0 );
887  }
888 
889  if( !AisR )
890  {
891 /*
892 * If sub( A ) is not replicated, then broadcast the result to the other pro-
893 * cesses that own a piece of sub( B ), but were not involved in the above
894 * addition operation.
895 */
896  if( RRorCC ) { Bm = M; Bn = N; }
897  else { Bm = N; Bn = M; }
898 
899  if( AisRow )
900  {
901  top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
902  if( AmyprocR == AprocR )
903  TYPE->Cgebs2d( ctxt, COLUMN, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld,
904  size ), Bld );
905  else
906  TYPE->Cgebr2d( ctxt, COLUMN, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld,
907  size ), Bld, AprocR, AmyprocD );
908  }
909  else
910  {
911  top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
912  if( AmyprocR == AprocR )
913  TYPE->Cgebs2d( ctxt, ROW, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld,
914  size ), Bld );
915  else
916  TYPE->Cgebr2d( ctxt, ROW, top, Bm, Bn, Mptr( B, Bii, Bjj, Bld,
917  size ), Bld, AmyprocD, AprocR );
918  }
919  }
920  }
921 /*
922 * End of PB_CpaxpbyDN
923 */
924 }
TYPE
#define TYPE
Definition: clamov.c:7
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
NOCONJG
#define NOCONJG
Definition: PBblas.h:45
PB_Cfirstnb
int PB_Cfirstnb()
LLD_
#define LLD_
Definition: PBtools.h:47
CNOCONJG
#define CNOCONJG
Definition: PBblas.h:19
MModAdd
#define MModAdd(I1, I2, d)
Definition: PBtools.h:97
CROW
#define CROW
Definition: PBblacs.h:21
PB_CpaxpbyDN
void PB_CpaxpbyDN(PBTYP_T *TYPE, char *CONJUG, int M, int N, char *ALPHA, char *A, int IA, int JA, int *DESCA, char *AROC, char *BETA, char *B, int IB, int JB, int *DESCB, char *BROC)
Definition: PB_CpaxpbyDN.c:26
TZPAD_T
F_VOID_FCT(* TZPAD_T)()
Definition: pblas.h:288
IMB_
#define IMB_
Definition: PBtools.h:41
MModSub
#define MModSub(I1, I2, d)
Definition: PBtools.h:102
MModAdd1
#define MModAdd1(I, d)
Definition: PBtools.h:100
MMADD_T
F_VOID_FCT(* MMADD_T)()
Definition: pblas.h:284
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
PB_Ctop
char * PB_Ctop()
BCAST
#define BCAST
Definition: PBblacs.h:48
COMBINE
#define COMBINE
Definition: PBblacs.h:49
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cnumroc
int PB_Cnumroc()
PB_Cmalloc
char * PB_Cmalloc()
ALL
#define ALL
Definition: PBblas.h:50
MIN
#define MIN(a_, b_)
Definition: PBtools.h:76
CCOLUMN
#define CCOLUMN
Definition: PBblacs.h:20
INB_
#define INB_
Definition: PBtools.h:42
C2F_CHAR
#define C2F_CHAR(a)
Definition: pblas.h:121
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mupcase
#define Mupcase(C)
Definition: PBtools.h:83
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CTXT_
#define CTXT_
Definition: PBtools.h:38