ScaLAPACK 2.1  2.1 ScaLAPACK: Scalable Linear Algebra PACKage
PB_Cpaxpby.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18
19 #ifdef __STDC__
20 void PB_Cpaxpby( PBTYP_T * TYPE, char * CONJUG, int M, int N,
21  char * ALPHA,
22  char * A, int IA, int JA, int * DESCA, char * AROC,
23  char * BETA,
24  char * B, int IB, int JB, int * DESCB, char * BROC )
25 #else
26 void PB_Cpaxpby( TYPE, CONJUG, M, N, ALPHA, A, IA, JA, DESCA, AROC,
27  BETA, B, IB, JB, DESCB, BROC )
28 /*
29 * .. Scalar Arguments ..
30 */
31  char * AROC, * BROC, * CONJUG;
32  int IA, IB, JA, JB, M, N;
33  char * ALPHA, * BETA;
34  PBTYP_T * TYPE;
35 /*
36 * .. Array Arguments ..
37 */
38  int * DESCA, * DESCB;
39  char * A, * B;
40 #endif
41 {
42 /*
43 * Purpose
44 * =======
45 *
46 * PB_Cpaxpby adds one submatrix to another,
47 *
48 * sub( B ) := beta * sub( B ) + alpha * sub( A ), or,
49 *
50 * sub( B ) := beta * sub( B ) + alpha * conjg( sub( A ) ),
51 *
52 * where both submatrices are distributed along one dimension; sub( A )
53 * always denotes A(IA:IA+M-1,JA:JA+N-1). When AROC is 'R' or 'r'
54 * sub( A ) is distributed along a process row, otherwise sub( A )
55 * is distributed along a process column. When sub( A ) is distributed
56 * along a process row and BROC is 'R' or 'r' or sub( A ) is distributed
57 * along a process column and BROC is 'C' or 'c', then sub( B ) denotes
58 * B(IB:IB+M-1,JB:JB+N-1), and B(IB:IB+N-1,JB:JB+M-1) otherwise.
59 *
60 * Notes
61 * =====
62 *
63 * A description vector is associated with each 2D block-cyclicly dis-
64 * tributed matrix. This vector stores the information required to
65 * establish the mapping between a matrix entry and its corresponding
66 * process and memory location.
67 *
68 * In the following comments, the character _ should be read as
69 * "of the distributed matrix". Let A be a generic term for any 2D
70 * block cyclicly distributed matrix. Its description vector is DESC_A:
71 *
72 * NOTATION STORED IN EXPLANATION
73 * ---------------- --------------- ------------------------------------
74 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
75 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
76 * the NPROW x NPCOL BLACS process grid
77 * A is distributed over. The context
78 * itself is global, but the handle
79 * (the integer value) may vary.
80 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
81 * ted matrix A, M_A >= 0.
82 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
83 * buted matrix A, N_A >= 0.
84 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
85 * block of the matrix A, IMB_A > 0.
86 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
87 * left block of the matrix A,
88 * INB_A > 0.
89 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
90 * bute the last M_A-IMB_A rows of A,
91 * MB_A > 0.
92 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
93 * bute the last N_A-INB_A columns of
94 * A, NB_A > 0.
95 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
96 * row of the matrix A is distributed,
97 * NPROW > RSRC_A >= 0.
98 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
99 * first column of A is distributed.
100 * NPCOL > CSRC_A >= 0.
101 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
102 * array storing the local blocks of
103 * the distributed matrix A,
104 * IF( Lc( 1, N_A ) > 0 )
105 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
106 * ELSE
107 * LLD_A >= 1.
108 *
109 * Let K be the number of rows of a matrix A starting at the global in-
110 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
111 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
112 * receive if these K rows were distributed over NPROW processes. If K
113 * is the number of columns of a matrix A starting at the global index
114 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
115 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
116 * these K columns were distributed over NPCOL processes.
117 *
118 * The values of Lr() and Lc() may be determined via a call to the func-
119 * tion PB_Cnumroc:
120 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
121 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
122 *
123 * Arguments
124 * =========
125 *
126 * TYPE (local input) pointer to a PBTYP_T structure
127 * On entry, TYPE is a pointer to a structure of type PBTYP_T,
128 * that contains type information (See pblas.h).
129 *
130 * CONJUG (global input) pointer to CHAR
131 * On entry, CONJUG specifies whether conjg( sub( A ) ) or
132 * sub( A ) should be added to sub( B ) as follows:
133 * CONJUG = 'N' or 'n':
134 * sub( B ) := beta*sub( B ) + alpha*sub( A ),
135 * otherwise
136 * sub( B ) := beta*sub( B ) + alpha*conjg( sub( A ) ).
137 *
138 * M (global input) INTEGER
139 * On entry, M specifies the number of rows of the submatrix
140 * sub( A ). M must be at least zero.
141 *
142 * N (global input) INTEGER
143 * On entry, N specifies the number of columns of the submatrix
144 * sub( A ). N must be at least zero.
145 *
146 * ALPHA (global input) pointer to CHAR
147 * On entry, ALPHA specifies the scalar alpha. When ALPHA is
148 * supplied as zero then the local entries of the array A cor-
149 * responding to the entries of the submatrix sub( A ) need not
150 * be set on input.
151 *
152 * A (local input) pointer to CHAR
153 * On entry, A is an array of dimension (LLD_A, Ka), where LLD_A
154 * is at least MAX( 1, Lr( 1, IA+M-1 ) ), and, Ka is at least
155 * Lc( 1, JA+N-1 ). Before entry, this array contains the local
156 * entries of the matrix A.
157 *
158 * IA (global input) INTEGER
159 * On entry, IA specifies A's global row index, which points to
160 * the beginning of the submatrix sub( A ).
161 *
162 * JA (global input) INTEGER
163 * On entry, JA specifies A's global column index, which points
164 * to the beginning of the submatrix sub( A ).
165 *
166 * DESCA (global and local input) INTEGER array
167 * On entry, DESCA is an integer array of dimension DLEN_. This
168 * is the array descriptor for the matrix A.
169 *
170 * AROC (global input) pointer to CHAR
171 * On entry, AROC specifies the orientation of the subvector
172 * sub( A ). When AROC is 'R' or 'r', sub( A ) is a row vector,
173 * and a column vector otherwise.
174 *
175 * BETA (global input) pointer to CHAR
176 * On entry, BETA specifies the scalar beta. When BETA is sup-
177 * plied as zero then the local entries of the array B corres-
178 * ponding to the entries of the submatrix sub( B ) need not be
179 * set on input.
180 *
181 * B (local input/local output) pointer to CHAR
182 * On entry, B is an array of dimension (LLD_B, Kb), where LLD_B
183 * is at least MAX( 1, Lr( 1, IB+M-1 ) ) when sub( A ) and
184 * sub( B ) are both distributed along a process column or a
185 * process row. In that case, Kb is at least Lc( 1, JB+N-1 ).
186 * Otherwise, LLD_B is at least MAX( 1, Lr( 1, IB+N-1 ) ) and
187 * Kb is at least Lc( 1, JB+M-1 ). Before entry, this array
188 * contains the local entries of the matrix B. On exit, sub( B )
189 * is overwritten with the updated submatrix.
190 *
191 * IB (global input) INTEGER
192 * On entry, IB specifies B's global row index, which points to
193 * the beginning of the submatrix sub( B ).
194 *
195 * JB (global input) INTEGER
196 * On entry, JB specifies B's global column index, which points
197 * to the beginning of the submatrix sub( B ).
198 *
199 * DESCB (global and local input) INTEGER array
200 * On entry, DESCB is an integer array of dimension DLEN_. This
201 * is the array descriptor for the matrix B.
202 *
203 * BROC (global input) pointer to CHAR
204 * On entry, BROC specifies the orientation of the subvector
205 * sub( B ). When BROC is 'R' or 'r', sub( B ) is a row vector,
206 * and a column vector otherwise.
207 *
208 * -- Written on April 1, 1998 by
209 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
210 *
211 * ---------------------------------------------------------------------
212 */
213 /*
214 * .. Local Scalars ..
215 */
216  char ascope, bscope, * buf = NULL, * one, * top, tran, * zero;
217  int Acol, Aii, AinbD, Ainb1D, AisD, AisR, AisRow, AiD, Ajj, Ald,
218  AmyprocD, AmyprocR, AnbD, AnD, AnR, AnpD, AnprocsD, AnprocsR,
219  AprocD, AprocR, Aroc, Arow, Bcol, Bii, BinbD, Binb1D, BisD,
220  BisR, BisRow, BiD, Bjj, Bld, BmyprocD, BmyprocR, BnbD, BnD,
221  BnR, BnpD, BnprocsD, BnprocsR, BprocD, BprocR, Broc, Brow,
222  BsrcD, OneBlock, OneDgrid, RRorCC, Square, cdst, csrc, ctxt,
223  dst, gcdPQ, k, l, lcmPQ, lcmb, ma, mb, mycol, myrow, na, nb,
224  npcol, npq, nprow, p, q, rdst, rsrc, size, src;
225  PB_VM_T VM;
227 /* ..
228 * .. Executable Statements ..
229 *
230 */
231 /*
232 * Quick return if possible
233 */
234  if( ( M <= 0 ) || ( N <= 0 ) ) return;
235 /*
236 * Retrieve process grid information
237 */
238  Cblacs_gridinfo( ( ctxt = DESCA[ CTXT_ ] ), &nprow, &npcol, &myrow, &mycol );
239 /*
240 * Determine if sub( A ) is distributed or not
241 */
242  if( ( AisRow = ( Mupcase( AROC[0] ) == CROW ) ) != 0 )
243  AisD = ( ( DESCA[CSRC_] >= 0 ) && ( ( AnprocsD = npcol ) > 1 ) );
244  else
245  AisD = ( ( DESCA[RSRC_] >= 0 ) && ( ( AnprocsD = nprow ) > 1 ) );
246 /*
247 * Determine if sub( B ) is distributed or not
248 */
249  if( ( BisRow = ( Mupcase( BROC[0] ) == CROW ) ) != 0 )
250  BisD = ( ( DESCB[CSRC_] >= 0 ) && ( ( BnprocsD = npcol ) > 1 ) );
251  else
252  BisD = ( ( DESCB[RSRC_] >= 0 ) && ( ( BnprocsD = nprow ) > 1 ) );
253 /*
254 * AisD && BisD <=> both operands are indeed distributed
255 */
256  if( AisD && BisD )
257  {
258 /*
259 * Retrieve sub( A )'s local information: Aii, Ajj, Arow, Acol ...
260 */
261  PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj, &Arow,
262  &Acol );
263  if( AisRow )
264  {
265  AinbD = DESCA[INB_]; AnbD = DESCA[NB_]; Ald = DESCA[LLD_];
266  AiD = JA; AnD = N; AnR = M;
267  AprocD = Acol; AmyprocD = mycol;
268  AprocR = Arow; AmyprocR = myrow; AnprocsR = nprow;
269  AisR = ( ( DESCA[ RSRC_ ] == -1 ) || ( AnprocsR == 1 ) );
270  }
271  else
272  {
273  AinbD = DESCA[IMB_]; AnbD = DESCA[MB_]; Ald = DESCA[LLD_];
274  AiD = IA; AnD = M; AnR = N;
275  AprocD = Arow; AmyprocD = myrow;
276  AprocR = Acol; AmyprocR = mycol; AnprocsR = npcol;
277  AisR = ( ( DESCA[ CSRC_ ] == -1 ) || ( AnprocsR == 1 ) );
278  }
279  Ainb1D = PB_Cfirstnb( AnD, AiD, AinbD, AnbD );
280 /*
281 * Retrieve sub( B )'s local information: Bii, Bjj, Brow, Bcol ...
282 */
283  PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj, &Brow,
284  &Bcol );
285  if( BisRow )
286  {
287  BinbD = DESCB[ INB_ ]; BnbD = DESCB[ NB_ ];
288  BsrcD = DESCB[ CSRC_ ]; Bld = DESCB[ LLD_ ];
289  BiD = JB;
290  if( AisRow ) { BnD = N; BnR = M; } else { BnD = M; BnR = N; }
291  BprocD = Bcol; BmyprocD = mycol;
292  BprocR = Brow; BmyprocR = myrow; BnprocsR = nprow;
293  BisR = ( ( DESCB[ RSRC_ ] == -1 ) || ( BnprocsR == 1 ) );
294  }
295  else
296  {
297  BinbD = DESCB[ IMB_ ]; BnbD = DESCB[ MB_ ];
298  BsrcD = DESCB[ RSRC_ ]; Bld = DESCB[ LLD_ ];
299  BiD = IB;
300  if( AisRow ) { BnD = N; BnR = M; } else { BnD = M; BnR = N; }
301  BprocD = Brow; BmyprocD = myrow;
302  BprocR = Bcol; BmyprocR = mycol; BnprocsR = npcol;
303  BisR = ( ( DESCB[ CSRC_ ] == -1 ) || ( BnprocsR == 1 ) );
304  }
305  Binb1D = PB_Cfirstnb( BnD, BiD, BinbD, BnbD );
306 /*
307 * Are sub( A ) and sub( B ) both row or column vectors ?
308 */
309  RRorCC = ( ( AisRow && BisRow ) || ( !( AisRow ) && !( BisRow ) ) );
310 /*
311 * Do sub( A ) and sub( B ) span more than one process ?
312 */
313  OneDgrid = ( ( AnprocsD == 1 ) && ( BnprocsD == 1 ) );
314  OneBlock = ( ( Ainb1D >= AnD ) && ( Binb1D >= BnD ) );
315 /*
316 * Are sub( A ) and sub( B ) distributed in the same manner ?
317 */
318  Square = ( ( Ainb1D == Binb1D ) && ( AnbD == BnbD ) &&
319  ( AnprocsD == BnprocsD ) );
320
321  if( !( AisR ) )
322  {
323 /*
324 * sub( A ) is distributed but not replicated
325 */
326  if( BisR )
327  {
328 /*
329 * If sub( A ) is not replicated, but sub( B ) is, a process row or column
330 * BprocR need to be selected. It will contain the non-replicated vector to
331 * add sub( A ) to.
332 */
333  if( RRorCC )
334  {
335 /*
336 * sub( A ) and sub( B ) are both row or column vectors
337 */
338  if( ( OneDgrid || OneBlock || Square ) && ( AprocD == BprocD ) )
339  {
340 /*
341 * sub( A ) and sub( B ) start in the same process row or column AprocD=BprocD.
342 * Enforce a purely local operation by choosing BprocR to be equal to AprocR.
343 */
344  BprocR = AprocR;
345  }
346  else
347  {
348 /*
349 * Otherwise, communication has to occur, so choose the next process row or
350 * column for BprocR to maximize the number of links, i.e reduce contention.
351 */
352  BprocR = MModAdd1( AprocR, AnprocsR );
353  }
354  }
355  else
356  {
357 /*
358 * sub( A ) and sub( B ) are distributed in orthogonal directions, what is
359 * chosen for YprocR does not really matter. Select the process origin.
360 */
361  BprocR = AprocD;
362  }
363  }
364  else
365  {
366 /*
367 * Neither sub( A ) nor sub( B ) are replicated. If I am not in process row or
368 * column AprocR and not in process row or column BprocR, then quick return.
369 */
370  if( ( AmyprocR != AprocR ) && ( BmyprocR != BprocR ) )
371  return;
372  }
373  }
374  else
375  {
376 /*
377 * sub( A ) is distributed and replicated (so no quick return possible)
378 */
379  if( BisR )
380  {
381 /*
382 * sub( B ) is distributed and replicated as well
383 */
384  if( RRorCC )
385  {
386 /*
387 * sub( A ) and sub( B ) are both row or column vectors
388 */
389  if( ( OneDgrid || OneBlock || Square ) && ( AprocD == BprocD ) )
390  {
391 /*
392 * sub( A ) and sub( B ) start in the same process row or column AprocD=BprocD.
393 * Enforce a purely local operation by choosing AprocR and BprocR to be equal
394 * to zero.
395 */
396  AprocR = BprocR = 0;
397  }
398  else
399  {
400 /*
401 * Otherwise, communication has to occur, so select BprocR to be zero and the
402 * next process row or column for AprocR in order to maximize the number of
403 * used links, i.e reduce contention.
404 */
405  BprocR = 0;
406  AprocR = MModAdd1( BprocR, BnprocsR );
407  }
408  }
409  else
410  {
411 /*
412 * sub( A ) and sub( B ) are distributed in orthogonal directions, select the
413 * origin processes.
414 */
415  AprocR = BprocD;
416  BprocR = AprocD;
417  }
418  }
419  else
420  {
421 /*
422 * sub( B ) is distributed, but not replicated
423 */
424  if( RRorCC )
425  {
426 /*
427 * sub( A ) and sub( B ) are both row or column vectors
428 */
429  if( ( OneDgrid || OneBlock || Square ) && ( AprocD == BprocD ) )
430  {
431 /*
432 * sub( A ) and sub( B ) start in the same process row or column AprocD=BprocD.
433 * Enforce a purely local operation by choosing AprocR to be equal to BprocR.
434 */
435  AprocR = BprocR;
436  if( ( AmyprocR != AprocR ) && ( BmyprocR != BprocR ) ) return;
437  }
438  else
439  {
440 /*
441 * Otherwise, communication has to occur, so choose the next process row or
442 * column for AprocR to maximize the number of links, i.e reduce contention.
443 */
444  AprocR = MModAdd1( BprocR, BnprocsR );
445  }
446  }
447  else
448  {
449 /*
450 * sub( A ) and sub( B ) are distributed in orthogonal directions, what is
451 * chosen for AprocR does not really matter. Select the origin process.
452 */
453  AprocR = BprocD;
454  if( ( OneDgrid || OneBlock || Square ) &&
455  ( AmyprocR != AprocR ) && ( BmyprocR != BprocR ) ) return;
456  }
457  }
458  }
459 /*
460 * Even if sub( A ) and/or sub( B ) are replicated, only two process row or
461 * column are active, namely AprocR and BprocR. If any of those operands is
462 * replicated, broadcast will occur (unless there is an easy way out).
463 */
464  size = TYPE->size;
465 /*
466 * A purely local operation occurs iff the operands start in the same process
467 * and, if either the grid is mono-dimensional or there is a single local block
468 * to be added or if both operands are aligned.
469 */
470  if( ( ( RRorCC && ( AprocD == BprocD ) &&
471  ( AisR || BisR || ( AprocR == BprocR ) ) ) ||
472  ( !( RRorCC ) && ( BisR || ( AprocD == BprocR ) ) &&
473  ( AisR || ( AprocR == BprocD ) ) ) ) &&
474  ( OneDgrid || OneBlock || ( RRorCC && Square ) ) )
475  {
476  if( ( !AisR && ( AmyprocR == AprocR ) ) ||
477  ( AisR && ( BisR || BmyprocR == BprocR ) ) )
478  {
479  AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, AmyprocD, AprocD,
480  AnprocsD );
481  BnpD = PB_Cnumroc( BnD, 0, Binb1D, BnbD, BmyprocD, BprocD,
482  BnprocsD );
483  if( ( AnpD > 0 ) && ( BnpD > 0 ) )
484  {
485 /*
486 * Select the local add routine accordingly to RRorCC
487 */
488  if( RRorCC )
489  {
492  }
493  else
494  {
497  }
498 /*
500 */
501  if( AisRow )
502  add( &AnR, &AnpD, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald,
503  BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
504  else
505  add( &AnpD, &AnR, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald,
506  BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
507  }
508  }
509  if( RRorCC && AisR && BisR ) return;
510  }
511  else if( ( RRorCC && OneDgrid ) || OneBlock || Square )
512  {
513 /*
514 * Otherwise, it may be possible to add the distributed vectors in a single
515 * message exchange iff the grid is mono-dimensional and the operands are
516 * distributed in the same direction, or there is just one block to be exchanged
517 * or if both operands are similarly distributed in their respective direction.
518 */
519  if( RRorCC )
520  {
523  }
524  else
525  {
528  }
529
530  if( ( AisR && BisR ) || ( AmyprocR == AprocR ) )
531  {
532
533  AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, AmyprocD, AprocD,
534  AnprocsD );
535  if( AnpD > 0 )
536  {
537  dst = BprocD + MModSub( AmyprocD, AprocD, AnprocsD );
538  dst = MPosMod( dst, BnprocsD );
539  if( AisRow ) { ma = AnR; na = AnpD; }
540  else { ma = AnpD; na = AnR; }
541  if( !( AisR && BisR ) )
542  {
543  if( BisRow ) { rdst = BprocR; cdst = dst; }
544  else { rdst = dst; cdst = BprocR; }
545  }
546  else
547  {
548  if( BisRow )
549  {
550  if( !AisRow ) { rdst = AmyprocR; }
551  else { rdst = MModAdd1( BmyprocR, BnprocsR ); }
552  cdst = dst;
553  }
554  else
555  {
556  rdst = dst;
557  if( AisRow ) { cdst = AmyprocR; }
558  else { cdst = MModAdd1( BmyprocR, BnprocsR ); }
559  }
560  }
561  if( ( myrow == rdst ) && ( mycol == cdst ) )
562  {
563  add( &ma, &na, ALPHA, Mptr( A, Aii, Ajj, Ald, size ), &Ald,
564  BETA, Mptr( B, Bii, Bjj, Bld, size ), &Bld );
565  }
566  else
567  {
568  TYPE->Cgesd2d( ctxt, ma, na, Mptr( A, Aii, Ajj, Ald, size ),
569  Ald, rdst, cdst );
570  }
571  }
572  }
573
574  if( ( AisR && BisR ) || ( BmyprocR == BprocR ) )
575  {
576  BnpD = PB_Cnumroc( BnD, 0, Binb1D, BnbD, BmyprocD, BprocD,
577  BnprocsD );
578  if( BnpD > 0 )
579  {
580  src = AprocD + MModSub( BmyprocD, BprocD, BnprocsD );
581  src = MPosMod( src, AnprocsD );
582  if( AisRow ) { ma = BnR; na = BnpD; }
583  else { ma = BnpD; na = BnR; }
584  if( !( AisR && BisR ) )
585  {
586  if( AisRow ) { rsrc = AprocR; csrc = src; }
587  else { rsrc = src; csrc = AprocR; }
588  }
589  else
590  {
591  if( AisRow )
592  {
593  if( !BisRow ) { rsrc = BmyprocR; }
594  else { rsrc = MModSub1( AmyprocR, AnprocsR ); }
595  csrc = src;
596  }
597  else
598  {
599  rsrc = src;
600  if( BisRow ) { csrc = BmyprocR; }
601  else { csrc = MModSub1( AmyprocR, AnprocsR ); }
602  }
603  }
604  if( ( myrow != rsrc ) || ( mycol != csrc ) )
605  {
606  buf = PB_Cmalloc( BnpD * BnR * size );
607  TYPE->Cgerv2d( ctxt, ma, na, buf, ma, rsrc, csrc );
608  add( &ma, &na, ALPHA, buf, &ma, BETA, Mptr( B, Bii, Bjj, Bld,
609  size ), &Bld );
610  if( buf ) free( buf );
611  }
612  }
613  }
614  if( AisR && BisR ) return;
615  }
616  else
617  {
618 /*
619 * General case
620 */
621  if( RRorCC )
622  {
623  if( Mupcase( CONJUG[0] ) != CNOCONJG ) tran = CCONJG;
624  else tran = CNOTRAN;
625  }
626  else
627  {
628  if( Mupcase( CONJUG[0] ) != CNOCONJG ) tran = CCOTRAN;
629  else tran = CTRAN;
630  }
631
632  if( AisRow ) { ascope = CCOLUMN; ma = AnR; }
633  else { ascope = CROW; na = AnR; }
634  bscope = ( BisRow ? CCOLUMN : CROW );
635  lcmb = PB_Clcm( AnprocsD * AnbD, BnprocsD * BnbD );
636  one = TYPE->one; zero = TYPE->zero;
637  gcdPQ = PB_Cgcd( AnprocsD, BnprocsD );
638  lcmPQ = ( AnprocsD / gcdPQ ) * BnprocsD;
639
640  for( k = 0; k < gcdPQ; k++ )
641  {
642  p = 0; q = k;
643
644  for( l = 0; l < lcmPQ; l++ )
645  {
646  Aroc = MModAdd( AprocD, p, AnprocsD );
647  Broc = MModAdd( BprocD, q, BnprocsD );
648
649  if( ( AmyprocD == Aroc ) || ( BmyprocD == Broc ) )
650  {
651  AnpD = PB_Cnumroc( AnD, 0, Ainb1D, AnbD, Aroc, AprocD,
652  AnprocsD );
653  BnpD = PB_Cnumroc( BnD, 0, Binb1D, BnbD, Broc, BprocD,
654  BnprocsD );
655  PB_CVMinit( &VM, 0, AnpD, BnpD, Ainb1D, Binb1D, AnbD, BnbD,
656  p, q, AnprocsD, BnprocsD, lcmb );
657  if( npq = PB_CVMnpq( &VM ) )
658  {
659  if( ( RRorCC && ( Aroc == Broc ) &&
660  ( AisR || ( AprocR == BprocR ) ) ) ||
661  ( !( RRorCC ) && ( Aroc == BprocR ) &&
662  ( AisR || ( AprocR == Broc ) ) ) )
663  {
664  if( ( BmyprocD == Broc ) && ( BmyprocR == BprocR ) )
665  {
666  PB_CVMloc( TYPE, &VM, ROW, &ascope, PACKING, &tran,
667  npq, AnR, ALPHA, Mptr( A, Aii, Ajj, Ald,
668  size ), Ald, BETA, Mptr( B, Bii, Bjj, Bld,
669  size ), Bld );
670  }
671  }
672  else
673  {
674  if( ( AmyprocR == AprocR ) && ( AmyprocD == Aroc ) )
675  {
676  if( AisRow ) { na = npq; } else { ma = npq; }
677  buf = PB_Cmalloc( ma * na * size );
678  PB_CVMpack( TYPE, &VM, ROW, &ascope, PACKING, NOTRAN,
679  npq, AnR, one, Mptr( A, Aii, Ajj, Ald,
680  size ), Ald, zero, buf, ma );
681  if( BisRow ) { rdst = BprocR; cdst = Broc; }
682  else { rdst = Broc; cdst = BprocR; }
683  TYPE->Cgesd2d( ctxt, ma, na, buf, ma, rdst, cdst );
684  if( buf ) free ( buf );
685  }
686  if( ( BmyprocR == BprocR ) && ( BmyprocD == Broc ) )
687  {
688  if( AisRow )
689  { na = npq; rsrc = AprocR; csrc = Aroc; }
690  else
691  { ma = npq; rsrc = Aroc; csrc = AprocR; }
692  buf = PB_Cmalloc( ma * na * size );
693  TYPE->Cgerv2d( ctxt, ma, na, buf, ma, rsrc, csrc );
694  PB_CVMpack( TYPE, &VM, COLUMN, &bscope, UNPACKING,
695  &tran, npq, AnR, BETA, Mptr( B, Bii, Bjj,
696  Bld, size ), Bld, ALPHA, buf, ma );
697  if( buf ) free ( buf );
698  }
699  }
700  }
701  }
702  p = MModAdd1( p, AnprocsD );
703  q = MModAdd1( q, BnprocsD );
704  }
705  if( AisR ) AprocR = MModAdd1( AprocR, AnprocsR );
706  }
707  }
708
709  if( BisR )
710  {
711 /*
712 * Replicate sub( B )
713 */
714  BnpD = PB_Cnumroc( BnD, BiD, BinbD, BnbD, BmyprocD, BsrcD, BnprocsD );
715  if( BnpD > 0 )
716  {
717  if( BisRow )
718  {
719  bscope = CCOLUMN; mb = BnR; nb = BnpD;
720  rsrc = BprocR; csrc = BmyprocD;
721  }
722  else
723  {
724  bscope = CROW; mb = BnpD; nb = BnR;
725  rsrc = BmyprocD; csrc = BprocR;
726  }
727  top = PB_Ctop( &ctxt, BCAST, &bscope, TOP_GET );
728  if( BmyprocR == BprocR )
729  {
730  TYPE->Cgebs2d( ctxt, &bscope, top, mb, nb, Mptr( B, Bii, Bjj,
731  Bld, size ), Bld );
732  }
733  else
734  {
735  TYPE->Cgebr2d( ctxt, &bscope, top, mb, nb, Mptr( B, Bii, Bjj,
736  Bld, size ), Bld, rsrc, csrc );
737  }
738  }
739  }
740  }
741  else if( !( AisD ) && BisD )
742  {
743 /*
744 * sub( A ) is not distributed and sub( B ) is distributed.
745 */
746  PB_CpaxpbyND( TYPE, CONJUG, M, N, ALPHA, A, IA, JA, DESCA, AROC, BETA, B,
747  IB, JB, DESCB, BROC );
748  }
749  else if( AisD && !( BisD ) )
750  {
751 /*
752 * sub( A ) is distributed and sub( B ) is not distributed.
753 */
754  PB_CpaxpbyDN( TYPE, CONJUG, M, N, ALPHA, A, IA, JA, DESCA, AROC, BETA, B,
755  IB, JB, DESCB, BROC );
756  }
757  else
758  {
759 /*
760 * Neither sub( A ) nor sub( B ) are distributed.
761 */
762  PB_CpaxpbyNN( TYPE, CONJUG, M, N, ALPHA, A, IA, JA, DESCA, AROC, BETA, B,
763  IB, JB, DESCB, BROC );
764  }
765 /*
766 * End of PB_Cpaxpby
767 */
768 }
CCONJG
#define CCONJG
Definition: PBblas.h:21
TYPE
#define TYPE
Definition: clamov.c:7
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
PB_CVMloc
int PB_CVMloc()
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
UNPACKING
#define UNPACKING
Definition: PBtools.h:54
CCOTRAN
#define CCOTRAN
Definition: PBblas.h:22
PB_CpaxpbyDN
void PB_CpaxpbyDN()
PB_Cfirstnb
int PB_Cfirstnb()
MPosMod
#define MPosMod(I, d)
Definition: PBtools.h:95
NOTRAN
#define NOTRAN
Definition: PBblas.h:44
LLD_
#define LLD_
Definition: PBtools.h:47
PB_CVMinit
void PB_CVMinit()
PB_CVMpack
int PB_CVMpack()
CNOCONJG
#define CNOCONJG
Definition: PBblas.h:19
Definition: PBtools.h:97
CROW
#define CROW
Definition: PBblacs.h:21
IMB_
#define IMB_
Definition: PBtools.h:41
MModSub
#define MModSub(I1, I2, d)
Definition: PBtools.h:102
Definition: PBtools.h:100
PB_Cgcd
int PB_Cgcd()
Definition: pblas.h:284
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
PB_Ctop
char * PB_Ctop()
RSRC_
#define RSRC_
Definition: PBtools.h:45
CNOTRAN
#define CNOTRAN
Definition: PBblas.h:18
BCAST
#define BCAST
Definition: PBblacs.h:48
PB_CpaxpbyNN
void PB_CpaxpbyNN()
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cnumroc
int PB_Cnumroc()
PB_Cmalloc
char * PB_Cmalloc()
PB_VM_T
Definition: pblas.h:432
PB_CVMnpq
int PB_CVMnpq()
CCOLUMN
#define CCOLUMN
Definition: PBblacs.h:20
PACKING
#define PACKING
Definition: PBtools.h:53
INB_
#define INB_
Definition: PBtools.h:42
MModSub1
#define MModSub1(I, d)
Definition: PBtools.h:105
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mupcase
#define Mupcase(C)
Definition: PBtools.h:83
CTRAN
#define CTRAN
Definition: PBblas.h:20
PB_Cpaxpby
void PB_Cpaxpby(PBTYP_T *TYPE, char *CONJUG, int M, int N, char *ALPHA, char *A, int IA, int JA, int *DESCA, char *AROC, char *BETA, char *B, int IB, int JB, int *DESCB, char *BROC)
Definition: PB_Cpaxpby.c:26
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CTXT_
#define CTXT_
Definition: PBtools.h:38
PB_CpaxpbyND
void PB_CpaxpbyND()
PB_Clcm
int PB_Clcm()