ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CpgemmAB.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18 
19 #ifdef __STDC__
20 void PB_CpgemmAB( PBTYP_T * TYPE, char * DIRECA, char * DIRECB,
21  char * TRANSA, char * TRANSB, int M, int N, int K,
22  char * ALPHA, char * A, int IA, int JA, int * DESCA,
23  char * B, int IB, int JB, int * DESCB, char * BETA,
24  char * C, int IC, int JC, int * DESCC )
25 #else
26 void PB_CpgemmAB( TYPE, DIRECA, DIRECB, TRANSA, TRANSB, M, N, K, ALPHA,
27  A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC,
28  DESCC )
29 /*
30 * .. Scalar Arguments ..
31 */
32  char * DIRECA, * DIRECB, * TRANSA, * TRANSB;
33  int IA, IB, IC, JA, JB, JC, K, M, N;
34  char * ALPHA, * BETA;
35  PBTYP_T * TYPE;
36 /*
37 * .. Array Arguments ..
38 */
39  int * DESCA, * DESCB, * DESCC;
40  char * A, * B, * C;
41 #endif
42 {
43 /*
44 * Purpose
45 * =======
46 *
47 * PB_CpgemmAB performs one of the matrix-matrix operations
48 *
49 * sub( C ) := alpha*op( sub( A ) )*op( sub( B ) ) + beta*sub( C ),
50 *
51 * where
52 *
53 * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of
54 * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ).
55 *
56 * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+K-1) if TRANSA = 'N',
57 * A(IA:IA+K-1,JA:JA+M-1)' if TRANSA = 'T',
58 * conjg(A(IA:IA+K-1,JA:JA+M-1)') if TRANSA = 'C',
59 *
60 * and, op( sub( B ) ) denotes B(IB:IB+K-1,JB:JB+N-1) if TRANSB = 'N',
61 * B(IB:IB+N-1,JB:JB+K-1)' if TRANSB = 'T',
62 * conjg(B(IB:IB+N-1,JB:JB+K-1)') if TRANSB = 'C'.
63 *
64 * Alpha and beta are scalars. A, B and C are matrices; op( sub( A ) )
65 * is an m by k submatrix, op( sub( B ) ) is an k by n submatrix and
66 * sub( C ) is an m by n submatrix.
67 *
68 * This is the outer-product algorithm using the logical LCM hybrid
69 * algorithmic blocking technique. The submatrix operand sub( C ) stays
70 * in place.
71 *
72 * Notes
73 * =====
74 *
75 * A description vector is associated with each 2D block-cyclicly dis-
76 * tributed matrix. This vector stores the information required to
77 * establish the mapping between a matrix entry and its corresponding
78 * process and memory location.
79 *
80 * In the following comments, the character _ should be read as
81 * "of the distributed matrix". Let A be a generic term for any 2D
82 * block cyclicly distributed matrix. Its description vector is DESC_A:
83 *
84 * NOTATION STORED IN EXPLANATION
85 * ---------------- --------------- ------------------------------------
86 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
87 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
88 * the NPROW x NPCOL BLACS process grid
89 * A is distributed over. The context
90 * itself is global, but the handle
91 * (the integer value) may vary.
92 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
93 * ted matrix A, M_A >= 0.
94 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
95 * buted matrix A, N_A >= 0.
96 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
97 * block of the matrix A, IMB_A > 0.
98 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
99 * left block of the matrix A,
100 * INB_A > 0.
101 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
102 * bute the last M_A-IMB_A rows of A,
103 * MB_A > 0.
104 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
105 * bute the last N_A-INB_A columns of
106 * A, NB_A > 0.
107 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
108 * row of the matrix A is distributed,
109 * NPROW > RSRC_A >= 0.
110 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
111 * first column of A is distributed.
112 * NPCOL > CSRC_A >= 0.
113 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
114 * array storing the local blocks of
115 * the distributed matrix A,
116 * IF( Lc( 1, N_A ) > 0 )
117 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
118 * ELSE
119 * LLD_A >= 1.
120 *
121 * Let K be the number of rows of a matrix A starting at the global in-
122 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
123 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
124 * receive if these K rows were distributed over NPROW processes. If K
125 * is the number of columns of a matrix A starting at the global index
126 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
127 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
128 * these K columns were distributed over NPCOL processes.
129 *
130 * The values of Lr() and Lc() may be determined via a call to the func-
131 * tion PB_Cnumroc:
132 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
133 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
134 *
135 * Arguments
136 * =========
137 *
138 * TYPE (local input) pointer to a PBTYP_T structure
139 * On entry, TYPE is a pointer to a structure of type PBTYP_T,
140 * that contains type information (See pblas.h).
141 *
142 * DIRECA (global input) pointer to CHAR
143 * On entry, DIRECA specifies the direction in which the rows
144 * or columns of sub( A ) should be looped over as follows:
145 * DIRECA = 'F' or 'f' forward or increasing,
146 * DIRECA = 'B' or 'b' backward or decreasing.
147 *
148 * DIRECB (global input) pointer to CHAR
149 * On entry, DIRECB specifies the direction in which the rows
150 * or columns of sub( B ) should be looped over as follows:
151 * DIRECB = 'F' or 'f' forward or increasing,
152 * DIRECB = 'B' or 'b' backward or decreasing.
153 *
154 * TRANSA (global input) pointer to CHAR
155 * On entry, TRANSA specifies the form of op( sub( A ) ) to be
156 * used in the matrix multiplication as follows:
157 *
158 * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ),
159 * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )',
160 * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ).
161 *
162 * TRANSB (global input) pointer to CHAR
163 * On entry, TRANSB specifies the form of op( sub( B ) ) to be
164 * used in the matrix multiplication as follows:
165 *
166 * TRANSB = 'N' or 'n' op( sub( B ) ) = sub( B ),
167 * TRANSB = 'T' or 't' op( sub( B ) ) = sub( B )',
168 * TRANSB = 'C' or 'c' op( sub( B ) ) = conjg( sub( B )' ).
169 *
170 * M (global input) INTEGER
171 * On entry, M specifies the number of rows of the submatrix
172 * op( sub( A ) ) and of the submatrix sub( C ). M must be at
173 * least zero.
174 *
175 * N (global input) INTEGER
176 * On entry, N specifies the number of columns of the submatrix
177 * op( sub( B ) ) and the number of columns of the submatrix
178 * sub( C ). N must be at least zero.
179 *
180 * K (global input) INTEGER
181 * On entry, K specifies the number of columns of the submatrix
182 * op( sub( A ) ) and the number of rows of the submatrix
183 * op( sub( B ) ). K must be at least zero.
184 *
185 * ALPHA (global input) pointer to CHAR
186 * On entry, ALPHA specifies the scalar alpha. When ALPHA is
187 * supplied as zero then the local entries of the arrays A and
188 * B corresponding to the entries of the submatrices sub( A )
189 * and sub( B ) respectively need not be set on input.
190 *
191 * A (local input) pointer to CHAR
192 * On entry, A is an array of dimension (LLD_A, Ka), where Ka is
193 * at least Lc( 1, JA+K-1 ) when TRANSA = 'N' or 'n', and is at
194 * least Lc( 1, JA+M-1 ) otherwise. Before entry, this array
195 * contains the local entries of the matrix A.
196 *
197 * IA (global input) INTEGER
198 * On entry, IA specifies A's global row index, which points to
199 * the beginning of the submatrix sub( A ).
200 *
201 * JA (global input) INTEGER
202 * On entry, JA specifies A's global column index, which points
203 * to the beginning of the submatrix sub( A ).
204 *
205 * DESCA (global and local input) INTEGER array
206 * On entry, DESCA is an integer array of dimension DLEN_. This
207 * is the array descriptor for the matrix A.
208 *
209 * B (local input) pointer to CHAR
210 * On entry, B is an array of dimension (LLD_B, Kb), where Kb is
211 * at least Lc( 1, JB+N-1 ) when TRANSB = 'N' or 'n', and is at
212 * least Lc( 1, JB+K-1 ) otherwise. Before entry, this array
213 * contains the local entries of the matrix B.
214 *
215 * IB (global input) INTEGER
216 * On entry, IB specifies B's global row index, which points to
217 * the beginning of the submatrix sub( B ).
218 *
219 * JB (global input) INTEGER
220 * On entry, JB specifies B's global column index, which points
221 * to the beginning of the submatrix sub( B ).
222 *
223 * DESCB (global and local input) INTEGER array
224 * On entry, DESCB is an integer array of dimension DLEN_. This
225 * is the array descriptor for the matrix B.
226 *
227 * BETA (global input) pointer to CHAR
228 * On entry, BETA specifies the scalar beta. When BETA is
229 * supplied as zero then the local entries of the array C
230 * corresponding to the entries of the submatrix sub( C ) need
231 * not be set on input.
232 *
233 * C (local input/local output) pointer to CHAR
234 * On entry, C is an array of dimension (LLD_C, Kc), where Kc is
235 * at least Lc( 1, JC+N-1 ). Before entry, this array contains
236 * the local entries of the matrix C.
237 * On exit, the entries of this array corresponding to the local
238 * entries of the submatrix sub( C ) are overwritten by the
239 * local entries of the m by n updated submatrix.
240 *
241 * IC (global input) INTEGER
242 * On entry, IC specifies C's global row index, which points to
243 * the beginning of the submatrix sub( C ).
244 *
245 * JC (global input) INTEGER
246 * On entry, JC specifies C's global column index, which points
247 * to the beginning of the submatrix sub( C ).
248 *
249 * DESCC (global and local input) INTEGER array
250 * On entry, DESCC is an integer array of dimension DLEN_. This
251 * is the array descriptor for the matrix C.
252 *
253 * -- Written on April 1, 1998 by
254 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
255 *
256 * ---------------------------------------------------------------------
257 */
258 /*
259 * .. Local Scalars ..
260 */
261  char Aroc, Broc, TrA, TrB, * one, * tbeta, * zero;
262  int ABrocs, Abufld, AcurrocR, Afr, Afwd, AiD, AiR, AiiD, AiiR,
263  AinbD, AinbR, Ainb1D, Ainb1R, AisR, AkkR, Ald, AmyprocD,
264  AmyprocR, AnbD, AnbR, AnpD, AnpR, AnprocsD, AnprocsR, Aoff,
265  ArocD, ArocR, AsrcR, Bbufld, BcurrocR, Bfr, Bfwd, BiD, BiR,
266  BiiD, BiiR, BinbD, BinbR, Binb1D, Binb1R, BisR, BkkR, Bld,
267  BmyprocD, BmyprocR, BnbD, BnbR, BnpD, BnpR, BnprocsD,
268  BnprocsR, Boff, BrocD, BrocR, BsrcR, Ccol, Cii, Cimb1, Cinb1,
269  Cjj, Cld, Cmb, Cmp, Cnb, Cnq, Crow, WAfr, WAsum, WBfr, WBsum,
270  Wkbb=0, ctxt, k, kb, kbb, lcmb, maxp, maxpm1, maxq, mycol,
271  myrow, ncpq, nota, notb, npcol, npq=0, nprow, nrpq, p=0, q=0,
272  size, tmp;
273  GEMM_T gemm;
274 /*
275 * .. Local Arrays ..
276 */
277  PB_VM_T VM;
278  int Cd0[DLEN_], DBUFA[DLEN_], DBUFB[DLEN_], WAd0[DLEN_],
279  WBd0[DLEN_];
280  char * Abuf = NULL, * Bbuf = NULL, * Cptr = NULL, * WA = NULL,
281  * WB = NULL;
282 /* ..
283 * .. Executable Statements ..
284 *
285 */
286 /*
287 * Retrieve process grid information
288 */
289  Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
290 
291  nota = ( ( TrA = Mupcase( TRANSA[0] ) ) == CNOTRAN );
292  notb = ( ( TrB = Mupcase( TRANSB[0] ) ) == CNOTRAN );
293  TrA = ( ( TrA == CCOTRAN ) ? CCONJG : CNOCONJG );
294  TrB = ( ( TrB == CCOTRAN ) ? CCONJG : CNOCONJG );
295 
296  size = TYPE->size;
297 /*
298 * Retrieve local information for sub( A ), sub( B ) and sub( C )
299 */
300  if( nota )
301  {
302  AiR = JA; Aroc = CCOLUMN; AnprocsR = npcol;
303  AinbR = DESCA[INB_]; AnbR = DESCA[NB_ ]; AsrcR = DESCA[CSRC_];
304  }
305  else
306  {
307  AiR = IA; Aroc = CROW; AnprocsR = nprow;
308  AinbR = DESCA[IMB_]; AnbR = DESCA[MB_ ]; AsrcR = DESCA[RSRC_];
309  }
310 
311  if( notb )
312  {
313  BiR = IB; Broc = CROW; BnprocsR = nprow;
314  BinbR = DESCB[IMB_]; BnbR = DESCB[MB_ ]; BsrcR = DESCB[RSRC_];
315  }
316  else
317  {
318  BiR = JB; Broc = CCOLUMN; BnprocsR = npcol;
319  BinbR = DESCB[INB_]; BnbR = DESCB[NB_ ]; BsrcR = DESCB[CSRC_];
320  }
321 /*
322 * Retrieve sub( C )'s local information: Aii, Ajj, Arow, Acol ...
323 */
324  PB_Cdescribe( M, N, IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj,
325  &Cld, &Cimb1, &Cinb1, &Cmb, &Cnb, &Crow, &Ccol, Cd0 );
326 
327  Cmp = PB_Cnumroc( M, 0, Cimb1, Cmb, myrow, Crow, nprow );
328  Cnq = PB_Cnumroc( N, 0, Cinb1, Cnb, mycol, Ccol, npcol );
329 /*
330 * When sub( A ) and sub( B ) do not span more than one process row or column,
331 * there is no need to pack the data.
332 */
333  if( !( PB_Cspan( K, AiR, AinbR, AnbR, AsrcR, AnprocsR ) ) &&
334  !( PB_Cspan( K, BiR, BinbR, BnbR, BsrcR, BnprocsR ) ) )
335  {
336  PB_CInV( TYPE, &TrA, COLUMN, M, N, Cd0, K, A, IA, JA, DESCA, &Aroc, &WA,
337  WAd0, &WAfr );
338  PB_CInV( TYPE, &TrB, ROW, M, N, Cd0, K, B, IB, JB, DESCB, &Broc, &WB,
339  WBd0, &WBfr );
340  if( ( Cmp > 0 ) && ( Cnq > 0 ) )
341  {
342 /*
343 * Perform the local update if I own some of sub( C )
344 */
345  TYPE->Fgemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp, &Cnq, &K,
346  ALPHA, WA, &WAd0[LLD_], WB, &WBd0[LLD_], BETA, Mptr( C,
347  Cii, Cjj, Cld, size ), &Cld );
348  }
349  if( WAfr ) free( WA );
350  if( WBfr ) free( WB );
351  return;
352  }
353 /*
354 * sub( A ) and sub( B ) span more than one process row or column.
355 */
356  Afwd = ( Mupcase( DIRECA[0] ) == CFORWARD );
357  Bfwd = ( Mupcase( DIRECB[0] ) == CFORWARD );
358 
359  one = TYPE->one; zero = TYPE->zero; tbeta = BETA; gemm = TYPE->Fgemm;
360  kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
361 /*
362 * Compute local information for sub( A ) and sub( B )
363 */
364  if( nota )
365  {
366  AiD = IA; AinbD = DESCA[IMB_]; AnbD = DESCA[MB_];
367  Ald = DESCA[LLD_]; AmyprocD = myrow; AmyprocR = mycol;
368  AnprocsD = nprow;
369  PB_Cinfog2l( IA, JA, DESCA, AnprocsD, AnprocsR, AmyprocD, AmyprocR,
370  &AiiD, &AiiR, &ArocD, &ArocR );
371  }
372  else
373  {
374  AiD = JA; AinbD = DESCA[INB_]; AnbD = DESCA[NB_];
375  Ald = DESCA[LLD_]; AmyprocD = mycol; AmyprocR = myrow;
376  AnprocsD = npcol;
377  PB_Cinfog2l( IA, JA, DESCA, AnprocsR, AnprocsD, AmyprocR, AmyprocD,
378  &AiiR, &AiiD, &ArocR, &ArocD );
379  }
380  Ainb1D = PB_Cfirstnb( M, AiD, AinbD, AnbD );
381  AnpD = PB_Cnumroc( M, 0, Ainb1D, AnbD, AmyprocD, ArocD, AnprocsD );
382  Ainb1R = PB_Cfirstnb( K, AiR, AinbR, AnbR );
383  AisR = ( ( AsrcR < 0 ) || ( AnprocsR == 1 ) );
384 
385  if( notb )
386  {
387  BiD = JB; BinbD = DESCB[INB_]; BnbD = DESCB[NB_];
388  Bld = DESCB[LLD_]; BmyprocD = mycol; BmyprocR = myrow;
389  BnprocsD = npcol;
390  PB_Cinfog2l( IB, JB, DESCB, BnprocsR, BnprocsD, BmyprocR, BmyprocD,
391  &BiiR, &BiiD, &BrocR, &BrocD );
392  }
393  else
394  {
395  BiD = IB; BinbD = DESCB[IMB_]; BnbD = DESCB[MB_];
396  Bld = DESCB[LLD_]; BmyprocD = myrow; BmyprocR = mycol;
397  BnprocsD = nprow;
398  PB_Cinfog2l( IB, JB, DESCB, BnprocsD, BnprocsR, BmyprocD, BmyprocR,
399  &BiiD, &BiiR, &BrocD, &BrocR );
400  }
401  Binb1D = PB_Cfirstnb( N, BiD, BinbD, BnbD );
402  BnpD = PB_Cnumroc( N, 0, Binb1D, BnbD, BmyprocD, BrocD, BnprocsD );
403  Binb1R = PB_Cfirstnb( K, BiR, BinbR, BnbR );
404  BisR = ( ( BsrcR < 0 ) || ( BnprocsR == 1 ) );
405 /*
406 * When sub( A ) is not replicated and backward pass on sub( A ), find the
407 * virtual process q owning the last row or column of sub( A ).
408 */
409  if( !( AisR ) && !( Afwd ) )
410  {
411  tmp = PB_Cindxg2p( K - 1, Ainb1R, AnbR, ArocR, ArocR, AnprocsR );
412  q = MModSub( tmp, ArocR, AnprocsR );
413  }
414 /*
415 * When sub( B ) is not replicated and backward pass on sub( B ), find the
416 * virtual process p owning the last row or column of sub( B ).
417 */
418  if( !( BisR ) && !( Bfwd ) )
419  {
420  tmp = PB_Cindxg2p( K - 1, Binb1R, BnbR, BrocR, BrocR, BnprocsR );
421  p = MModSub( tmp, BrocR, BnprocsR );
422  }
423 
424  if( Cmp > 0 && Cnq > 0 ) Cptr = Mptr( C, Cii, Cjj, Cld, size );
425 /*
426 * Allocate work space in process rows and columns spanned by sub( C )
427 */
428  PB_COutV( TYPE, COLUMN, NOINIT, M, N, Cd0, kb, &WA, WAd0, &WAfr, &WAsum );
429  PB_COutV( TYPE, ROW, NOINIT, M, N, Cd0, kb, &WB, WBd0, &WBfr, &WBsum );
430 /*
431 * Loop over the virtual process grid induced by the sub( A ) and sub( B )
432 */
433  lcmb = PB_Clcm( ( maxp = ( BisR ? 1 : BnprocsR ) ) * BnbR,
434  ( maxq = ( AisR ? 1 : AnprocsR ) ) * AnbR );
435  maxpm1 = maxp - 1;
436 /*
437 * Find out process coordinates corresponding to first virtual process (p,q)
438 */
439  AcurrocR = ( AisR ? -1 : MModAdd( ArocR, q, AnprocsR ) );
440  AkkR = PB_Cg2lrem( AiR, AinbR, AnbR, AcurrocR, AsrcR, AnprocsR );
441  AnpR = PB_Cnumroc( K, 0, Ainb1R, AnbR, AcurrocR, ArocR, AnprocsR );
442 
443  BcurrocR = ( BisR ? -1 : MModAdd( BrocR, p, BnprocsR ) );
444  BkkR = PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR, BnprocsR );
445  BnpR = PB_Cnumroc( K, 0, Binb1R, BnbR, BcurrocR, BrocR, BnprocsR );
446 /*
447 * Find out how many diagonals this virtual process (p,q) has
448 */
449  PB_CVMinit( &VM, 0, BnpR, AnpR, Binb1R, Ainb1R, BnbR, AnbR, p, q,
450  maxp, maxq, lcmb );
451  npq = PB_CVMnpq( &VM );
452 
453  for( k = 0; k < K; k += kb )
454  {
455  kbb = K - k; kbb = MIN( kbb, kb );
456 
457  while( Wkbb != kbb )
458  {
459 /*
460 * Ensure that the current virtual process (p,q) has something to contribute
461 * to the replicated buffers WA and WB.
462 */
463  while( npq == 0 )
464  {
465  if( ( Bfwd && ( p == maxpm1 ) ) ||
466  ( !( Bfwd ) && ( p == 0 ) ) )
467  q = ( Afwd ? MModAdd1( q, maxq ) : MModSub1( q, maxq ) );
468  p = ( Bfwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) );
469 
470  AcurrocR = ( AisR ? -1 : MModAdd( ArocR, q, AnprocsR ) );
471  AkkR = PB_Cg2lrem( AiR, AinbR, AnbR, AcurrocR, AsrcR,
472  AnprocsR );
473  AnpR = PB_Cnumroc( K, 0, Ainb1R, AnbR, AcurrocR, ArocR,
474  AnprocsR );
475 
476  BcurrocR = ( BisR ? -1 : MModAdd( BrocR, p, BnprocsR ) );
477  BkkR = PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR,
478  BnprocsR );
479  BnpR = PB_Cnumroc( K, 0, Binb1R, BnbR, BcurrocR, BrocR,
480  BnprocsR );
481 
482  PB_CVMinit( &VM, 0, BnpR, AnpR, Binb1R, Ainb1R, BnbR, AnbR,
483  p, q, maxp, maxq, lcmb );
484  npq = PB_CVMnpq( &VM );
485  }
486 /*
487 * Current virtual process (p,q) has something, find out how many rows or
488 * columns could be used: ABrocs.
489 */
490  if( Wkbb == 0 ) { ABrocs = ( npq < kbb ? npq : kbb ); }
491  else { ABrocs = kbb - Wkbb; ABrocs = MIN( ABrocs, npq ); }
492 /*
493 * Find out how many rows or columns of sub( A ) and sub( B ) are contiguous
494 */
495  PB_CVMcontig( &VM, &nrpq, &ncpq, &Boff, &Aoff );
496 
497  if( nota )
498  {
499 /*
500 * Compute the descriptor DBUFA for the buffer that will contained the packed
501 * columns of sub( A ).
502 */
503  if( ( Afr = ( ncpq < ABrocs ) ) != 0 )
504  {
505 /*
506 * If columns of sub( A ) are not contiguous, then allocate the buffer and
507 * pack the ABrocs columns of sub( A ).
508 */
509  Abufld = MAX( 1, AnpD );
510  if( AisR || ( AmyprocR == AcurrocR ) )
511  {
512  Abuf = PB_Cmalloc( AnpD * ABrocs * size );
513  PB_CVMpack( TYPE, &VM, COLUMN, &Aroc, PACKING, NOTRAN,
514  ABrocs, AnpD, one, Mptr( A, AiiD, AkkR, Ald,
515  size ), Ald, zero, Abuf, Abufld );
516  }
517  }
518  else
519  {
520 /*
521 * Otherwise, re-use sub( A ) directly.
522 */
523  Abufld = Ald;
524  if( AisR || ( AmyprocR == AcurrocR ) )
525  Abuf = Mptr( A, AiiD, AkkR + Aoff, Ald, size );
526  }
527  PB_Cdescset( DBUFA, M, ABrocs, Ainb1D, ABrocs, AnbD, ABrocs,
528  ArocD, AcurrocR, ctxt, Abufld );
529  }
530  else
531  {
532 /*
533 * Compute the descriptor DBUFA for the buffer that will contained the packed
534 * rows of sub( A ).
535 */
536  if( ( Afr = ( ncpq < ABrocs ) ) != 0 )
537  {
538 /*
539 * If rows of sub( A ) are not contiguous, then allocate the buffer and
540 * pack the ABrocs rows of sub( A ).
541 */
542  Abufld = ABrocs;
543  if( AisR || ( AmyprocR == AcurrocR ) )
544  {
545  Abuf = PB_Cmalloc( AnpD * ABrocs * size );
546  PB_CVMpack( TYPE, &VM, COLUMN, &Aroc, PACKING, NOTRAN,
547  ABrocs, AnpD, one, Mptr( A, AkkR, AiiD, Ald,
548  size ), Ald, zero, Abuf, Abufld );
549  }
550  }
551  else
552  {
553 /*
554 * Otherwise, re-use sub( A ) directly.
555 */
556  Abufld = Ald;
557  if( AisR || ( AmyprocR == AcurrocR ) )
558  Abuf = Mptr( A, AkkR + Aoff, AiiD, Ald, size );
559  }
560  PB_Cdescset( DBUFA, ABrocs, M, ABrocs, Ainb1D, ABrocs, AnbD,
561  AcurrocR, ArocD, ctxt, Abufld );
562  }
563 
564  if( notb )
565  {
566 /*
567 * Compute the descriptor DBUFB for the buffer that will contained the packed
568 * rows of sub( B ).
569 */
570  if( ( Bfr = ( nrpq < ABrocs ) ) != 0 )
571  {
572 /*
573 * If rows of sub( B ) are not contiguous, then allocate the buffer and
574 * pack the ABrocs rows of sub( B ).
575 */
576  Bbufld = ABrocs;
577  if( BisR || ( BmyprocR == BcurrocR ) )
578  {
579  Bbuf = PB_Cmalloc( BnpD * ABrocs * size );
580  PB_CVMpack( TYPE, &VM, ROW, &Broc, PACKING, NOTRAN,
581  ABrocs, BnpD, one, Mptr( B, BkkR, BiiD, Bld,
582  size ), Bld, zero, Bbuf, Bbufld );
583  }
584  }
585  else
586  {
587 /*
588 * Otherwise, re-use sub( B ) directly.
589 */
590  Bbufld = Bld;
591  if( BisR || ( BmyprocR == BcurrocR ) )
592  Bbuf = Mptr( B, BkkR + Boff, BiiD, Bld, size );
593  }
594  PB_Cdescset( DBUFB, ABrocs, N, ABrocs, Binb1D, ABrocs, BnbD,
595  BcurrocR, BrocD, ctxt, Bbufld );
596  }
597  else
598  {
599 /*
600 * Compute the descriptor DBUFB for the buffer that will contained the packed
601 * columns of sub( B ).
602 */
603  if( ( Bfr = ( nrpq < ABrocs ) ) != 0 )
604  {
605 /*
606 * If columns of sub( B ) are not contiguous, then allocate the buffer and
607 * pack the ABrocs columns of sub( B ).
608 */
609  Bbufld = MAX( 1, BnpD );
610  if( BisR || ( BmyprocR == BcurrocR ) )
611  {
612  Bbuf = PB_Cmalloc( BnpD * ABrocs * size );
613  PB_CVMpack( TYPE, &VM, ROW, &Broc, PACKING, NOTRAN,
614  ABrocs, BnpD, one, Mptr( B, BiiD, BkkR, Bld,
615  size ), Bld, zero, Bbuf, Bbufld );
616  }
617  }
618  else
619  {
620 /*
621 * Otherwise, re-use sub( B ) directly.
622 */
623  Bbufld = Bld;
624  if( BisR || ( BmyprocR == BcurrocR ) )
625  Bbuf = Mptr( B, BiiD, BkkR + Boff, Bld, size );
626  }
627  PB_Cdescset( DBUFB, N, ABrocs, Binb1D, ABrocs, BnbD, ABrocs,
628  BrocD, BcurrocR, ctxt, Bbufld );
629  }
630 /*
631 * Update the local indexes of sub( A ) and sub( B )
632 */
633  PB_CVMupdate( &VM, ABrocs, &BkkR, &AkkR );
634 /*
635 * Replicate panels of rows or columns of sub( A ) and sub( B ) over sub( C )
636 * -> WA, WB
637 */
638  PB_CInV2( TYPE, &TrA, COLUMN, M, N, Cd0, ABrocs, Abuf, 0, 0,
639  DBUFA, &Aroc, WA, Wkbb, WAd0 );
640  PB_CInV2( TYPE, &TrB, ROW, M, N, Cd0, ABrocs, Bbuf, 0, 0,
641  DBUFB, &Broc, WB, Wkbb, WBd0 );
642 
643  if( Afr & ( AisR || ( AmyprocR == AcurrocR ) ) )
644  if( Abuf ) free( Abuf );
645  if( Bfr & ( BisR || ( BmyprocR == BcurrocR ) ) )
646  if( Bbuf ) free( Bbuf );
647 /*
648 * ABrocs rows or columns of sub( A ) and sub( B ) have been replicated,
649 * update the number of diagonals in this virtual process as well as the
650 * number of rows or columns of sub( A ) and sub( B ) that are in WA, WB.
651 */
652  npq -= ABrocs;
653  Wkbb += ABrocs;
654  }
655 /*
656 * Perform local update
657 */
658  if( Cmp > 0 && Cnq > 0 )
659  {
660  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp, &Cnq, &kbb,
661  ALPHA, WA, &WAd0[LLD_], WB, &WBd0[LLD_], tbeta, Cptr, &Cld );
662  tbeta = one;
663  }
664 
665  Wkbb = 0;
666  }
667 
668  if( WAfr ) free( WA );
669  if( WBfr ) free( WB );
670 /*
671 * End of PB_CpgemmAB
672 */
673 }
CCONJG
#define CCONJG
Definition: PBblas.h:21
NOINIT
#define NOINIT
Definition: PBblas.h:62
TYPE
#define TYPE
Definition: clamov.c:7
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
PB_CVMcontig
void PB_CVMcontig()
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
PB_Cg2lrem
int PB_Cg2lrem()
CCOTRAN
#define CCOTRAN
Definition: PBblas.h:22
PB_Cfirstnb
int PB_Cfirstnb()
DLEN_
#define DLEN_
Definition: PBtools.h:48
GEMM_T
F_VOID_FCT(* GEMM_T)()
Definition: pblas.h:313
NOTRAN
#define NOTRAN
Definition: PBblas.h:44
LLD_
#define LLD_
Definition: PBtools.h:47
PB_Cdescribe
void PB_Cdescribe()
PB_CVMinit
void PB_CVMinit()
PB_CVMpack
int PB_CVMpack()
CNOCONJG
#define CNOCONJG
Definition: PBblas.h:19
MModAdd
#define MModAdd(I1, I2, d)
Definition: PBtools.h:97
CROW
#define CROW
Definition: PBblacs.h:21
IMB_
#define IMB_
Definition: PBtools.h:41
MModSub
#define MModSub(I1, I2, d)
Definition: PBtools.h:102
pilaenv_
int pilaenv_()
PB_Cdescset
void PB_Cdescset()
PB_CVMupdate
void PB_CVMupdate()
MModAdd1
#define MModAdd1(I, d)
Definition: PBtools.h:100
PB_Cindxg2p
int PB_Cindxg2p()
RSRC_
#define RSRC_
Definition: PBtools.h:45
CNOTRAN
#define CNOTRAN
Definition: PBblas.h:18
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cnumroc
int PB_Cnumroc()
PB_Cmalloc
char * PB_Cmalloc()
CFORWARD
#define CFORWARD
Definition: PBblas.h:38
PB_CInV
void PB_CInV()
PB_VM_T
Definition: pblas.h:432
PB_CVMnpq
int PB_CVMnpq()
MIN
#define MIN(a_, b_)
Definition: PBtools.h:76
CCOLUMN
#define CCOLUMN
Definition: PBblacs.h:20
PACKING
#define PACKING
Definition: PBtools.h:53
INB_
#define INB_
Definition: PBtools.h:42
PB_COutV
void PB_COutV()
C2F_CHAR
#define C2F_CHAR(a)
Definition: pblas.h:121
PB_Cspan
int PB_Cspan()
MModSub1
#define MModSub1(I, d)
Definition: PBtools.h:105
MAX
#define MAX(a_, b_)
Definition: PBtools.h:77
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mupcase
#define Mupcase(C)
Definition: PBtools.h:83
PB_CInV2
void PB_CInV2()
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
PB_CpgemmAB
void PB_CpgemmAB(PBTYP_T *TYPE, char *DIRECA, char *DIRECB, char *TRANSA, char *TRANSB, int M, int N, int K, char *ALPHA, char *A, int IA, int JA, int *DESCA, char *B, int IB, int JB, int *DESCB, char *BETA, char *C, int IC, int JC, int *DESCC)
Definition: PB_CpgemmAB.c:26
CTXT_
#define CTXT_
Definition: PBtools.h:38
PB_Clcm
int PB_Clcm()