SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
Loading...
Searching...
No Matches
PB_CpgemmAB.c
Go to the documentation of this file.
1/* ---------------------------------------------------------------------
2*
3* -- PBLAS auxiliary routine (version 2.0) --
4* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5* and University of California, Berkeley.
6* April 1, 1998
7*
8* ---------------------------------------------------------------------
9*/
10/*
11* Include files
12*/
13#include "../pblas.h"
14#include "../PBpblas.h"
15#include "../PBtools.h"
16#include "../PBblacs.h"
17#include "../PBblas.h"
18
19#ifdef __STDC__
20void PB_CpgemmAB( PBTYP_T * TYPE, char * DIRECA, char * DIRECB,
21 char * TRANSA, char * TRANSB, Int M, Int N, Int K,
22 char * ALPHA, char * A, Int IA, Int JA, Int * DESCA,
23 char * B, Int IB, Int JB, Int * DESCB, char * BETA,
24 char * C, Int IC, Int JC, Int * DESCC )
25#else
26void PB_CpgemmAB( TYPE, DIRECA, DIRECB, TRANSA, TRANSB, M, N, K, ALPHA,
27 A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC,
28 DESCC )
29/*
30* .. Scalar Arguments ..
31*/
32 char * DIRECA, * DIRECB, * TRANSA, * TRANSB;
33 Int IA, IB, IC, JA, JB, JC, K, M, N;
34 char * ALPHA, * BETA;
35 PBTYP_T * TYPE;
36/*
37* .. Array Arguments ..
38*/
39 Int * DESCA, * DESCB, * DESCC;
40 char * A, * B, * C;
41#endif
42{
43/*
44* Purpose
45* =======
46*
47* PB_CpgemmAB performs one of the matrix-matrix operations
48*
49* sub( C ) := alpha*op( sub( A ) )*op( sub( B ) ) + beta*sub( C ),
50*
51* where
52*
53* sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of
54* op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ).
55*
56* Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+K-1) if TRANSA = 'N',
57* A(IA:IA+K-1,JA:JA+M-1)' if TRANSA = 'T',
58* conjg(A(IA:IA+K-1,JA:JA+M-1)') if TRANSA = 'C',
59*
60* and, op( sub( B ) ) denotes B(IB:IB+K-1,JB:JB+N-1) if TRANSB = 'N',
61* B(IB:IB+N-1,JB:JB+K-1)' if TRANSB = 'T',
62* conjg(B(IB:IB+N-1,JB:JB+K-1)') if TRANSB = 'C'.
63*
64* Alpha and beta are scalars. A, B and C are matrices; op( sub( A ) )
65* is an m by k submatrix, op( sub( B ) ) is an k by n submatrix and
66* sub( C ) is an m by n submatrix.
67*
68* This is the outer-product algorithm using the logical LCM hybrid
69* algorithmic blocking technique. The submatrix operand sub( C ) stays
70* in place.
71*
72* Notes
73* =====
74*
75* A description vector is associated with each 2D block-cyclicly dis-
76* tributed matrix. This vector stores the information required to
77* establish the mapping between a matrix entry and its corresponding
78* process and memory location.
79*
80* In the following comments, the character _ should be read as
81* "of the distributed matrix". Let A be a generic term for any 2D
82* block cyclicly distributed matrix. Its description vector is DESC_A:
83*
84* NOTATION STORED IN EXPLANATION
85* ---------------- --------------- ------------------------------------
86* DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
87* CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
88* the NPROW x NPCOL BLACS process grid
89* A is distributed over. The context
90* itself is global, but the handle
91* (the integer value) may vary.
92* M_A (global) DESCA[ M_ ] The number of rows in the distribu-
93* ted matrix A, M_A >= 0.
94* N_A (global) DESCA[ N_ ] The number of columns in the distri-
95* buted matrix A, N_A >= 0.
96* IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
97* block of the matrix A, IMB_A > 0.
98* INB_A (global) DESCA[ INB_ ] The number of columns of the upper
99* left block of the matrix A,
100* INB_A > 0.
101* MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
102* bute the last M_A-IMB_A rows of A,
103* MB_A > 0.
104* NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
105* bute the last N_A-INB_A columns of
106* A, NB_A > 0.
107* RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
108* row of the matrix A is distributed,
109* NPROW > RSRC_A >= 0.
110* CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
111* first column of A is distributed.
112* NPCOL > CSRC_A >= 0.
113* LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
114* array storing the local blocks of
115* the distributed matrix A,
116* IF( Lc( 1, N_A ) > 0 )
117* LLD_A >= MAX( 1, Lr( 1, M_A ) )
118* ELSE
119* LLD_A >= 1.
120*
121* Let K be the number of rows of a matrix A starting at the global in-
122* dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
123* that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
124* receive if these K rows were distributed over NPROW processes. If K
125* is the number of columns of a matrix A starting at the global index
126* JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
127* lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
128* these K columns were distributed over NPCOL processes.
129*
130* The values of Lr() and Lc() may be determined via a call to the func-
131* tion PB_Cnumroc:
132* Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
133* Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
134*
135* Arguments
136* =========
137*
138* TYPE (local input) pointer to a PBTYP_T structure
139* On entry, TYPE is a pointer to a structure of type PBTYP_T,
140* that contains type information (See pblas.h).
141*
142* DIRECA (global input) pointer to CHAR
143* On entry, DIRECA specifies the direction in which the rows
144* or columns of sub( A ) should be looped over as follows:
145* DIRECA = 'F' or 'f' forward or increasing,
146* DIRECA = 'B' or 'b' backward or decreasing.
147*
148* DIRECB (global input) pointer to CHAR
149* On entry, DIRECB specifies the direction in which the rows
150* or columns of sub( B ) should be looped over as follows:
151* DIRECB = 'F' or 'f' forward or increasing,
152* DIRECB = 'B' or 'b' backward or decreasing.
153*
154* TRANSA (global input) pointer to CHAR
155* On entry, TRANSA specifies the form of op( sub( A ) ) to be
156* used in the matrix multiplication as follows:
157*
158* TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ),
159* TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )',
160* TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ).
161*
162* TRANSB (global input) pointer to CHAR
163* On entry, TRANSB specifies the form of op( sub( B ) ) to be
164* used in the matrix multiplication as follows:
165*
166* TRANSB = 'N' or 'n' op( sub( B ) ) = sub( B ),
167* TRANSB = 'T' or 't' op( sub( B ) ) = sub( B )',
168* TRANSB = 'C' or 'c' op( sub( B ) ) = conjg( sub( B )' ).
169*
170* M (global input) INTEGER
171* On entry, M specifies the number of rows of the submatrix
172* op( sub( A ) ) and of the submatrix sub( C ). M must be at
173* least zero.
174*
175* N (global input) INTEGER
176* On entry, N specifies the number of columns of the submatrix
177* op( sub( B ) ) and the number of columns of the submatrix
178* sub( C ). N must be at least zero.
179*
180* K (global input) INTEGER
181* On entry, K specifies the number of columns of the submatrix
182* op( sub( A ) ) and the number of rows of the submatrix
183* op( sub( B ) ). K must be at least zero.
184*
185* ALPHA (global input) pointer to CHAR
186* On entry, ALPHA specifies the scalar alpha. When ALPHA is
187* supplied as zero then the local entries of the arrays A and
188* B corresponding to the entries of the submatrices sub( A )
189* and sub( B ) respectively need not be set on input.
190*
191* A (local input) pointer to CHAR
192* On entry, A is an array of dimension (LLD_A, Ka), where Ka is
193* at least Lc( 1, JA+K-1 ) when TRANSA = 'N' or 'n', and is at
194* least Lc( 1, JA+M-1 ) otherwise. Before entry, this array
195* contains the local entries of the matrix A.
196*
197* IA (global input) INTEGER
198* On entry, IA specifies A's global row index, which points to
199* the beginning of the submatrix sub( A ).
200*
201* JA (global input) INTEGER
202* On entry, JA specifies A's global column index, which points
203* to the beginning of the submatrix sub( A ).
204*
205* DESCA (global and local input) INTEGER array
206* On entry, DESCA is an integer array of dimension DLEN_. This
207* is the array descriptor for the matrix A.
208*
209* B (local input) pointer to CHAR
210* On entry, B is an array of dimension (LLD_B, Kb), where Kb is
211* at least Lc( 1, JB+N-1 ) when TRANSB = 'N' or 'n', and is at
212* least Lc( 1, JB+K-1 ) otherwise. Before entry, this array
213* contains the local entries of the matrix B.
214*
215* IB (global input) INTEGER
216* On entry, IB specifies B's global row index, which points to
217* the beginning of the submatrix sub( B ).
218*
219* JB (global input) INTEGER
220* On entry, JB specifies B's global column index, which points
221* to the beginning of the submatrix sub( B ).
222*
223* DESCB (global and local input) INTEGER array
224* On entry, DESCB is an integer array of dimension DLEN_. This
225* is the array descriptor for the matrix B.
226*
227* BETA (global input) pointer to CHAR
228* On entry, BETA specifies the scalar beta. When BETA is
229* supplied as zero then the local entries of the array C
230* corresponding to the entries of the submatrix sub( C ) need
231* not be set on input.
232*
233* C (local input/local output) pointer to CHAR
234* On entry, C is an array of dimension (LLD_C, Kc), where Kc is
235* at least Lc( 1, JC+N-1 ). Before entry, this array contains
236* the local entries of the matrix C.
237* On exit, the entries of this array corresponding to the local
238* entries of the submatrix sub( C ) are overwritten by the
239* local entries of the m by n updated submatrix.
240*
241* IC (global input) INTEGER
242* On entry, IC specifies C's global row index, which points to
243* the beginning of the submatrix sub( C ).
244*
245* JC (global input) INTEGER
246* On entry, JC specifies C's global column index, which points
247* to the beginning of the submatrix sub( C ).
248*
249* DESCC (global and local input) INTEGER array
250* On entry, DESCC is an integer array of dimension DLEN_. This
251* is the array descriptor for the matrix C.
252*
253* -- Written on April 1, 1998 by
254* Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
255*
256* ---------------------------------------------------------------------
257*/
258/*
259* .. Local Scalars ..
260*/
261 char Aroc, Broc, TrA, TrB, * one, * tbeta, * zero;
262 Int ABrocs, Abufld, AcurrocR, Afr, Afwd, AiD, AiR, AiiD, AiiR,
263 AinbD, AinbR, Ainb1D, Ainb1R, AisR, AkkR, Ald, AmyprocD,
264 AmyprocR, AnbD, AnbR, AnpD, AnpR, AnprocsD, AnprocsR, Aoff,
265 ArocD, ArocR, AsrcR, Bbufld, BcurrocR, Bfr, Bfwd, BiD, BiR,
266 BiiD, BiiR, BinbD, BinbR, Binb1D, Binb1R, BisR, BkkR, Bld,
267 BmyprocD, BmyprocR, BnbD, BnbR, BnpD, BnpR, BnprocsD,
268 BnprocsR, Boff, BrocD, BrocR, BsrcR, Ccol, Cii, Cimb1, Cinb1,
269 Cjj, Cld, Cmb, Cmp, Cnb, Cnq, Crow, WAfr, WAsum, WBfr, WBsum,
270 Wkbb=0, ctxt, k, kb, kbb, lcmb, maxp, maxpm1, maxq, mycol,
271 myrow, ncpq, nota, notb, npcol, npq=0, nprow, nrpq, p=0, q=0,
272 size, tmp;
273 GEMM_T gemm;
274/*
275* .. Local Arrays ..
276*/
277 PB_VM_T VM;
278 Int Cd0[DLEN_], DBUFA[DLEN_], DBUFB[DLEN_], WAd0[DLEN_],
279 WBd0[DLEN_];
280 char * Abuf = NULL, * Bbuf = NULL, * Cptr = NULL, * WA = NULL,
281 * WB = NULL;
282/* ..
283* .. Executable Statements ..
284*
285*/
286/*
287* Retrieve process grid information
288*/
289 Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
290
291 nota = ( ( TrA = Mupcase( TRANSA[0] ) ) == CNOTRAN );
292 notb = ( ( TrB = Mupcase( TRANSB[0] ) ) == CNOTRAN );
293 TrA = ( ( TrA == CCOTRAN ) ? CCONJG : CNOCONJG );
294 TrB = ( ( TrB == CCOTRAN ) ? CCONJG : CNOCONJG );
295
296 size = TYPE->size;
297/*
298* Retrieve local information for sub( A ), sub( B ) and sub( C )
299*/
300 if( nota )
301 {
302 AiR = JA; Aroc = CCOLUMN; AnprocsR = npcol;
303 AinbR = DESCA[INB_]; AnbR = DESCA[NB_ ]; AsrcR = DESCA[CSRC_];
304 }
305 else
306 {
307 AiR = IA; Aroc = CROW; AnprocsR = nprow;
308 AinbR = DESCA[IMB_]; AnbR = DESCA[MB_ ]; AsrcR = DESCA[RSRC_];
309 }
310
311 if( notb )
312 {
313 BiR = IB; Broc = CROW; BnprocsR = nprow;
314 BinbR = DESCB[IMB_]; BnbR = DESCB[MB_ ]; BsrcR = DESCB[RSRC_];
315 }
316 else
317 {
318 BiR = JB; Broc = CCOLUMN; BnprocsR = npcol;
319 BinbR = DESCB[INB_]; BnbR = DESCB[NB_ ]; BsrcR = DESCB[CSRC_];
320 }
321/*
322* Retrieve sub( C )'s local information: Aii, Ajj, Arow, Acol ...
323*/
324 PB_Cdescribe( M, N, IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj,
325 &Cld, &Cimb1, &Cinb1, &Cmb, &Cnb, &Crow, &Ccol, Cd0 );
326
327 Cmp = PB_Cnumroc( M, 0, Cimb1, Cmb, myrow, Crow, nprow );
328 Cnq = PB_Cnumroc( N, 0, Cinb1, Cnb, mycol, Ccol, npcol );
329/*
330* When sub( A ) and sub( B ) do not span more than one process row or column,
331* there is no need to pack the data.
332*/
333 if( !( PB_Cspan( K, AiR, AinbR, AnbR, AsrcR, AnprocsR ) ) &&
334 !( PB_Cspan( K, BiR, BinbR, BnbR, BsrcR, BnprocsR ) ) )
335 {
336 PB_CInV( TYPE, &TrA, COLUMN, M, N, Cd0, K, A, IA, JA, DESCA, &Aroc, &WA,
337 WAd0, &WAfr );
338 PB_CInV( TYPE, &TrB, ROW, M, N, Cd0, K, B, IB, JB, DESCB, &Broc, &WB,
339 WBd0, &WBfr );
340 if( ( Cmp > 0 ) && ( Cnq > 0 ) )
341 {
342/*
343* Perform the local update if I own some of sub( C )
344*/
345 TYPE->Fgemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp, &Cnq, &K,
346 ALPHA, WA, &WAd0[LLD_], WB, &WBd0[LLD_], BETA, Mptr( C,
347 Cii, Cjj, Cld, size ), &Cld );
348 }
349 if( WAfr ) free( WA );
350 if( WBfr ) free( WB );
351 return;
352 }
353/*
354* sub( A ) and sub( B ) span more than one process row or column.
355*/
356 Afwd = ( Mupcase( DIRECA[0] ) == CFORWARD );
357 Bfwd = ( Mupcase( DIRECB[0] ) == CFORWARD );
358
359 one = TYPE->one; zero = TYPE->zero; tbeta = BETA; gemm = TYPE->Fgemm;
360 kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
361/*
362* Compute local information for sub( A ) and sub( B )
363*/
364 if( nota )
365 {
366 AiD = IA; AinbD = DESCA[IMB_]; AnbD = DESCA[MB_];
367 Ald = DESCA[LLD_]; AmyprocD = myrow; AmyprocR = mycol;
368 AnprocsD = nprow;
369 PB_Cinfog2l( IA, JA, DESCA, AnprocsD, AnprocsR, AmyprocD, AmyprocR,
370 &AiiD, &AiiR, &ArocD, &ArocR );
371 }
372 else
373 {
374 AiD = JA; AinbD = DESCA[INB_]; AnbD = DESCA[NB_];
375 Ald = DESCA[LLD_]; AmyprocD = mycol; AmyprocR = myrow;
376 AnprocsD = npcol;
377 PB_Cinfog2l( IA, JA, DESCA, AnprocsR, AnprocsD, AmyprocR, AmyprocD,
378 &AiiR, &AiiD, &ArocR, &ArocD );
379 }
380 Ainb1D = PB_Cfirstnb( M, AiD, AinbD, AnbD );
381 AnpD = PB_Cnumroc( M, 0, Ainb1D, AnbD, AmyprocD, ArocD, AnprocsD );
382 Ainb1R = PB_Cfirstnb( K, AiR, AinbR, AnbR );
383 AisR = ( ( AsrcR < 0 ) || ( AnprocsR == 1 ) );
384
385 if( notb )
386 {
387 BiD = JB; BinbD = DESCB[INB_]; BnbD = DESCB[NB_];
388 Bld = DESCB[LLD_]; BmyprocD = mycol; BmyprocR = myrow;
389 BnprocsD = npcol;
390 PB_Cinfog2l( IB, JB, DESCB, BnprocsR, BnprocsD, BmyprocR, BmyprocD,
391 &BiiR, &BiiD, &BrocR, &BrocD );
392 }
393 else
394 {
395 BiD = IB; BinbD = DESCB[IMB_]; BnbD = DESCB[MB_];
396 Bld = DESCB[LLD_]; BmyprocD = myrow; BmyprocR = mycol;
397 BnprocsD = nprow;
398 PB_Cinfog2l( IB, JB, DESCB, BnprocsD, BnprocsR, BmyprocD, BmyprocR,
399 &BiiD, &BiiR, &BrocD, &BrocR );
400 }
401 Binb1D = PB_Cfirstnb( N, BiD, BinbD, BnbD );
402 BnpD = PB_Cnumroc( N, 0, Binb1D, BnbD, BmyprocD, BrocD, BnprocsD );
403 Binb1R = PB_Cfirstnb( K, BiR, BinbR, BnbR );
404 BisR = ( ( BsrcR < 0 ) || ( BnprocsR == 1 ) );
405/*
406* When sub( A ) is not replicated and backward pass on sub( A ), find the
407* virtual process q owning the last row or column of sub( A ).
408*/
409 if( !( AisR ) && !( Afwd ) )
410 {
411 tmp = PB_Cindxg2p( K - 1, Ainb1R, AnbR, ArocR, ArocR, AnprocsR );
412 q = MModSub( tmp, ArocR, AnprocsR );
413 }
414/*
415* When sub( B ) is not replicated and backward pass on sub( B ), find the
416* virtual process p owning the last row or column of sub( B ).
417*/
418 if( !( BisR ) && !( Bfwd ) )
419 {
420 tmp = PB_Cindxg2p( K - 1, Binb1R, BnbR, BrocR, BrocR, BnprocsR );
421 p = MModSub( tmp, BrocR, BnprocsR );
422 }
423
424 if( Cmp > 0 && Cnq > 0 ) Cptr = Mptr( C, Cii, Cjj, Cld, size );
425/*
426* Allocate work space in process rows and columns spanned by sub( C )
427*/
428 PB_COutV( TYPE, COLUMN, NOINIT, M, N, Cd0, kb, &WA, WAd0, &WAfr, &WAsum );
429 PB_COutV( TYPE, ROW, NOINIT, M, N, Cd0, kb, &WB, WBd0, &WBfr, &WBsum );
430/*
431* Loop over the virtual process grid induced by the sub( A ) and sub( B )
432*/
433 lcmb = PB_Clcm( ( maxp = ( BisR ? 1 : BnprocsR ) ) * BnbR,
434 ( maxq = ( AisR ? 1 : AnprocsR ) ) * AnbR );
435 maxpm1 = maxp - 1;
436/*
437* Find out process coordinates corresponding to first virtual process (p,q)
438*/
439 AcurrocR = ( AisR ? -1 : MModAdd( ArocR, q, AnprocsR ) );
440 AkkR = PB_Cg2lrem( AiR, AinbR, AnbR, AcurrocR, AsrcR, AnprocsR );
441 AnpR = PB_Cnumroc( K, 0, Ainb1R, AnbR, AcurrocR, ArocR, AnprocsR );
442
443 BcurrocR = ( BisR ? -1 : MModAdd( BrocR, p, BnprocsR ) );
444 BkkR = PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR, BnprocsR );
445 BnpR = PB_Cnumroc( K, 0, Binb1R, BnbR, BcurrocR, BrocR, BnprocsR );
446/*
447* Find out how many diagonals this virtual process (p,q) has
448*/
449 PB_CVMinit( &VM, 0, BnpR, AnpR, Binb1R, Ainb1R, BnbR, AnbR, p, q,
450 maxp, maxq, lcmb );
451 npq = PB_CVMnpq( &VM );
452
453 for( k = 0; k < K; k += kb )
454 {
455 kbb = K - k; kbb = MIN( kbb, kb );
456
457 while( Wkbb != kbb )
458 {
459/*
460* Ensure that the current virtual process (p,q) has something to contribute
461* to the replicated buffers WA and WB.
462*/
463 while( npq == 0 )
464 {
465 if( ( Bfwd && ( p == maxpm1 ) ) ||
466 ( !( Bfwd ) && ( p == 0 ) ) )
467 q = ( Afwd ? MModAdd1( q, maxq ) : MModSub1( q, maxq ) );
468 p = ( Bfwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) );
469
470 AcurrocR = ( AisR ? -1 : MModAdd( ArocR, q, AnprocsR ) );
471 AkkR = PB_Cg2lrem( AiR, AinbR, AnbR, AcurrocR, AsrcR,
472 AnprocsR );
473 AnpR = PB_Cnumroc( K, 0, Ainb1R, AnbR, AcurrocR, ArocR,
474 AnprocsR );
475
476 BcurrocR = ( BisR ? -1 : MModAdd( BrocR, p, BnprocsR ) );
477 BkkR = PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR,
478 BnprocsR );
479 BnpR = PB_Cnumroc( K, 0, Binb1R, BnbR, BcurrocR, BrocR,
480 BnprocsR );
481
482 PB_CVMinit( &VM, 0, BnpR, AnpR, Binb1R, Ainb1R, BnbR, AnbR,
483 p, q, maxp, maxq, lcmb );
484 npq = PB_CVMnpq( &VM );
485 }
486/*
487* Current virtual process (p,q) has something, find out how many rows or
488* columns could be used: ABrocs.
489*/
490 if( Wkbb == 0 ) { ABrocs = ( npq < kbb ? npq : kbb ); }
491 else { ABrocs = kbb - Wkbb; ABrocs = MIN( ABrocs, npq ); }
492/*
493* Find out how many rows or columns of sub( A ) and sub( B ) are contiguous
494*/
495 PB_CVMcontig( &VM, &nrpq, &ncpq, &Boff, &Aoff );
496
497 if( nota )
498 {
499/*
500* Compute the descriptor DBUFA for the buffer that will contained the packed
501* columns of sub( A ).
502*/
503 if( ( Afr = ( ncpq < ABrocs ) ) != 0 )
504 {
505/*
506* If columns of sub( A ) are not contiguous, then allocate the buffer and
507* pack the ABrocs columns of sub( A ).
508*/
509 Abufld = MAX( 1, AnpD );
510 if( AisR || ( AmyprocR == AcurrocR ) )
511 {
512 Abuf = PB_Cmalloc( AnpD * ABrocs * size );
513 PB_CVMpack( TYPE, &VM, COLUMN, &Aroc, PACKING, NOTRAN,
514 ABrocs, AnpD, one, Mptr( A, AiiD, AkkR, Ald,
515 size ), Ald, zero, Abuf, Abufld );
516 }
517 }
518 else
519 {
520/*
521* Otherwise, re-use sub( A ) directly.
522*/
523 Abufld = Ald;
524 if( AisR || ( AmyprocR == AcurrocR ) )
525 Abuf = Mptr( A, AiiD, AkkR + Aoff, Ald, size );
526 }
527 PB_Cdescset( DBUFA, M, ABrocs, Ainb1D, ABrocs, AnbD, ABrocs,
528 ArocD, AcurrocR, ctxt, Abufld );
529 }
530 else
531 {
532/*
533* Compute the descriptor DBUFA for the buffer that will contained the packed
534* rows of sub( A ).
535*/
536 if( ( Afr = ( ncpq < ABrocs ) ) != 0 )
537 {
538/*
539* If rows of sub( A ) are not contiguous, then allocate the buffer and
540* pack the ABrocs rows of sub( A ).
541*/
542 Abufld = ABrocs;
543 if( AisR || ( AmyprocR == AcurrocR ) )
544 {
545 Abuf = PB_Cmalloc( AnpD * ABrocs * size );
546 PB_CVMpack( TYPE, &VM, COLUMN, &Aroc, PACKING, NOTRAN,
547 ABrocs, AnpD, one, Mptr( A, AkkR, AiiD, Ald,
548 size ), Ald, zero, Abuf, Abufld );
549 }
550 }
551 else
552 {
553/*
554* Otherwise, re-use sub( A ) directly.
555*/
556 Abufld = Ald;
557 if( AisR || ( AmyprocR == AcurrocR ) )
558 Abuf = Mptr( A, AkkR + Aoff, AiiD, Ald, size );
559 }
560 PB_Cdescset( DBUFA, ABrocs, M, ABrocs, Ainb1D, ABrocs, AnbD,
561 AcurrocR, ArocD, ctxt, Abufld );
562 }
563
564 if( notb )
565 {
566/*
567* Compute the descriptor DBUFB for the buffer that will contained the packed
568* rows of sub( B ).
569*/
570 if( ( Bfr = ( nrpq < ABrocs ) ) != 0 )
571 {
572/*
573* If rows of sub( B ) are not contiguous, then allocate the buffer and
574* pack the ABrocs rows of sub( B ).
575*/
576 Bbufld = ABrocs;
577 if( BisR || ( BmyprocR == BcurrocR ) )
578 {
579 Bbuf = PB_Cmalloc( BnpD * ABrocs * size );
580 PB_CVMpack( TYPE, &VM, ROW, &Broc, PACKING, NOTRAN,
581 ABrocs, BnpD, one, Mptr( B, BkkR, BiiD, Bld,
582 size ), Bld, zero, Bbuf, Bbufld );
583 }
584 }
585 else
586 {
587/*
588* Otherwise, re-use sub( B ) directly.
589*/
590 Bbufld = Bld;
591 if( BisR || ( BmyprocR == BcurrocR ) )
592 Bbuf = Mptr( B, BkkR + Boff, BiiD, Bld, size );
593 }
594 PB_Cdescset( DBUFB, ABrocs, N, ABrocs, Binb1D, ABrocs, BnbD,
595 BcurrocR, BrocD, ctxt, Bbufld );
596 }
597 else
598 {
599/*
600* Compute the descriptor DBUFB for the buffer that will contained the packed
601* columns of sub( B ).
602*/
603 if( ( Bfr = ( nrpq < ABrocs ) ) != 0 )
604 {
605/*
606* If columns of sub( B ) are not contiguous, then allocate the buffer and
607* pack the ABrocs columns of sub( B ).
608*/
609 Bbufld = MAX( 1, BnpD );
610 if( BisR || ( BmyprocR == BcurrocR ) )
611 {
612 Bbuf = PB_Cmalloc( BnpD * ABrocs * size );
613 PB_CVMpack( TYPE, &VM, ROW, &Broc, PACKING, NOTRAN,
614 ABrocs, BnpD, one, Mptr( B, BiiD, BkkR, Bld,
615 size ), Bld, zero, Bbuf, Bbufld );
616 }
617 }
618 else
619 {
620/*
621* Otherwise, re-use sub( B ) directly.
622*/
623 Bbufld = Bld;
624 if( BisR || ( BmyprocR == BcurrocR ) )
625 Bbuf = Mptr( B, BiiD, BkkR + Boff, Bld, size );
626 }
627 PB_Cdescset( DBUFB, N, ABrocs, Binb1D, ABrocs, BnbD, ABrocs,
628 BrocD, BcurrocR, ctxt, Bbufld );
629 }
630/*
631* Update the local indexes of sub( A ) and sub( B )
632*/
633 PB_CVMupdate( &VM, ABrocs, &BkkR, &AkkR );
634/*
635* Replicate panels of rows or columns of sub( A ) and sub( B ) over sub( C )
636* -> WA, WB
637*/
638 PB_CInV2( TYPE, &TrA, COLUMN, M, N, Cd0, ABrocs, Abuf, 0, 0,
639 DBUFA, &Aroc, WA, Wkbb, WAd0 );
640 PB_CInV2( TYPE, &TrB, ROW, M, N, Cd0, ABrocs, Bbuf, 0, 0,
641 DBUFB, &Broc, WB, Wkbb, WBd0 );
642
643 if( Afr & ( AisR || ( AmyprocR == AcurrocR ) ) )
644 if( Abuf ) free( Abuf );
645 if( Bfr & ( BisR || ( BmyprocR == BcurrocR ) ) )
646 if( Bbuf ) free( Bbuf );
647/*
648* ABrocs rows or columns of sub( A ) and sub( B ) have been replicated,
649* update the number of diagonals in this virtual process as well as the
650* number of rows or columns of sub( A ) and sub( B ) that are in WA, WB.
651*/
652 npq -= ABrocs;
653 Wkbb += ABrocs;
654 }
655/*
656* Perform local update
657*/
658 if( Cmp > 0 && Cnq > 0 )
659 {
660 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp, &Cnq, &kbb,
661 ALPHA, WA, &WAd0[LLD_], WB, &WBd0[LLD_], tbeta, Cptr, &Cld );
662 tbeta = one;
663 }
664
665 Wkbb = 0;
666 }
667
668 if( WAfr ) free( WA );
669 if( WBfr ) free( WB );
670/*
671* End of PB_CpgemmAB
672*/
673}
#define Int
Definition Bconfig.h:22
F_VOID_FCT(* GEMM_T)()
Definition pblas.h:317
#define C2F_CHAR(a)
Definition pblas.h:125
#define CCOLUMN
Definition PBblacs.h:20
#define COLUMN
Definition PBblacs.h:45
#define CROW
Definition PBblacs.h:21
#define ROW
Definition PBblacs.h:46
void Cblacs_gridinfo()
#define NOTRAN
Definition PBblas.h:44
#define CCONJG
Definition PBblas.h:21
#define NOINIT
Definition PBblas.h:62
#define CNOCONJG
Definition PBblas.h:19
#define CNOTRAN
Definition PBblas.h:18
#define CCOTRAN
Definition PBblas.h:22
#define CFORWARD
Definition PBblas.h:38
#define pilaenv_
Definition PBpblas.h:44
#define CTXT_
Definition PBtools.h:38
void PB_CVMinit()
Int PB_Cfirstnb()
#define MAX(a_, b_)
Definition PBtools.h:77
#define MB_
Definition PBtools.h:43
void PB_CInV2()
char * PB_Cmalloc()
void PB_Cinfog2l()
#define MModSub(I1, I2, d)
Definition PBtools.h:102
#define PACKING
Definition PBtools.h:53
#define MIN(a_, b_)
Definition PBtools.h:76
#define Mptr(a_, i_, j_, lda_, siz_)
Definition PBtools.h:132
#define LLD_
Definition PBtools.h:47
Int PB_Cnumroc()
Int PB_CVMpack()
void PB_CInV()
void PB_CVMupdate()
#define RSRC_
Definition PBtools.h:45
void PB_Cdescset()
void PB_COutV()
#define MModAdd1(I, d)
Definition PBtools.h:100
#define MModAdd(I1, I2, d)
Definition PBtools.h:97
#define INB_
Definition PBtools.h:42
Int PB_CVMnpq()
#define MModSub1(I, d)
Definition PBtools.h:105
#define CSRC_
Definition PBtools.h:46
Int PB_Clcm()
#define IMB_
Definition PBtools.h:41
void PB_CpgemmAB()
Int PB_Cindxg2p()
Int PB_Cg2lrem()
#define Mupcase(C)
Definition PBtools.h:83
#define DLEN_
Definition PBtools.h:48
#define NB_
Definition PBtools.h:44
void PB_CVMcontig()
Int PB_Cspan()
void PB_Cdescribe()
#define TYPE
Definition clamov.c:7