SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
All Classes Files Functions Variables Typedefs Macros
PB_CpsymmBC.c
Go to the documentation of this file.
1/* ---------------------------------------------------------------------
2*
3* -- PBLAS auxiliary routine (version 2.0) --
4* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5* and University of California, Berkeley.
6* April 1, 1998
7*
8* ---------------------------------------------------------------------
9*/
10/*
11* Include files
12*/
13#include "../pblas.h"
14#include "../PBpblas.h"
15#include "../PBtools.h"
16#include "../PBblacs.h"
17#include "../PBblas.h"
18
19#ifdef __STDC__
20void PB_CpsymmBC( PBTYP_T * TYPE, char * DIRECAB, char * CONJUG,
21 char * SIDE, char * UPLO, Int M, Int N, char * ALPHA,
22 char * A, Int IA, Int JA, Int * DESCA, char * B,
23 Int IB, Int JB, Int * DESCB, char * BETA, char * C,
24 Int IC, Int JC, Int * DESCC )
25#else
26void PB_CpsymmBC( TYPE, DIRECAB, CONJUG, SIDE, UPLO, M, N, ALPHA, A, IA,
27 JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC )
28/*
29* .. Scalar Arguments ..
30*/
31 char * CONJUG, * DIRECAB, * SIDE, * UPLO;
32 Int IA, IB, IC, JA, JB, JC, M, N;
33 char * ALPHA, * BETA;
34 PBTYP_T * TYPE;
35/*
36* .. Array Arguments ..
37*/
38 Int * DESCA, * DESCB, * DESCC;
39 char * A, * B, * C;
40#endif
41{
42/*
43* Purpose
44* =======
45*
46* PB_CpsymmBC performs one of the matrix-matrix operations
47*
48* sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ),
49*
50* or
51*
52* sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ),
53*
54* where
55*
56* sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1),
57*
58* sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L',
59* A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and,
60*
61* sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1).
62*
63* Alpha and beta are scalars, sub( A ) is a symmetric or Hermitian
64* submatrix and sub( B ) and sub( C ) are m by n submatrices.
65*
66* This is the inner-product algorithm using the logical LCM hybrid
67* and static blocking techniques. The submatrix operand sub( A ) stays
68* in place.
69*
70* Notes
71* =====
72*
73* A description vector is associated with each 2D block-cyclicly dis-
74* tributed matrix. This vector stores the information required to
75* establish the mapping between a matrix entry and its corresponding
76* process and memory location.
77*
78* In the following comments, the character _ should be read as
79* "of the distributed matrix". Let A be a generic term for any 2D
80* block cyclicly distributed matrix. Its description vector is DESC_A:
81*
82* NOTATION STORED IN EXPLANATION
83* ---------------- --------------- ------------------------------------
84* DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
85* CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
86* the NPROW x NPCOL BLACS process grid
87* A is distributed over. The context
88* itself is global, but the handle
89* (the integer value) may vary.
90* M_A (global) DESCA[ M_ ] The number of rows in the distribu-
91* ted matrix A, M_A >= 0.
92* N_A (global) DESCA[ N_ ] The number of columns in the distri-
93* buted matrix A, N_A >= 0.
94* IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
95* block of the matrix A, IMB_A > 0.
96* INB_A (global) DESCA[ INB_ ] The number of columns of the upper
97* left block of the matrix A,
98* INB_A > 0.
99* MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
100* bute the last M_A-IMB_A rows of A,
101* MB_A > 0.
102* NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
103* bute the last N_A-INB_A columns of
104* A, NB_A > 0.
105* RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
106* row of the matrix A is distributed,
107* NPROW > RSRC_A >= 0.
108* CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
109* first column of A is distributed.
110* NPCOL > CSRC_A >= 0.
111* LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
112* array storing the local blocks of
113* the distributed matrix A,
114* IF( Lc( 1, N_A ) > 0 )
115* LLD_A >= MAX( 1, Lr( 1, M_A ) )
116* ELSE
117* LLD_A >= 1.
118*
119* Let K be the number of rows of a matrix A starting at the global in-
120* dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
121* that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
122* receive if these K rows were distributed over NPROW processes. If K
123* is the number of columns of a matrix A starting at the global index
124* JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
125* lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
126* these K columns were distributed over NPCOL processes.
127*
128* The values of Lr() and Lc() may be determined via a call to the func-
129* tion PB_Cnumroc:
130* Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
131* Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
132*
133* Arguments
134* =========
135*
136* TYPE (local input) pointer to a PBTYP_T structure
137* On entry, TYPE is a pointer to a structure of type PBTYP_T,
138* that contains type information (See pblas.h).
139*
140* DIRECAB (global input) pointer to CHAR
141* On entry, DIRECAB specifies the direction in which the rows
142* or columns of sub( B ) should be looped over as follows:
143* DIRECAB = 'F' or 'f' forward or increasing,
144* DIRECAB = 'B' or 'b' backward or decreasing.
145*
146* CONJUG (global input) pointer to CHAR
147* On entry, CONJUG specifies whether sub( A ) is a symmetric or
148* Hermitian submatrix operand as follows:
149* CONJUG = 'N' or 'n' sub( A ) is symmetric,
150* CONJUG = 'Z' or 'z' sub( A ) is Hermitian.
151*
152* SIDE (global input) pointer to CHAR
153* On entry, SIDE specifies whether the symmetric or Hermitian
154* submatrix sub( A ) appears on the left or right in the opera-
155* tion as follows:
156*
157* SIDE = 'L' or 'l'
158* sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ),
159*
160* SIDE = 'R' or 'r'
161* sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ).
162*
163* UPLO (global input) pointer to CHAR
164* On entry, UPLO specifies whether the local pieces of
165* the array A containing the upper or lower triangular part
166* of the submatrix sub( A ) are to be referenced as follows:
167* UPLO = 'U' or 'u' Only the local pieces corresponding to
168* the upper triangular part of the
169* submatrix sub( A ) are referenced,
170* UPLO = 'L' or 'l' Only the local pieces corresponding to
171* the lower triangular part of the
172* submatrix sub( A ) are referenced.
173*
174* M (global input) INTEGER
175* On entry, M specifies the number of rows of the submatrix
176* sub( C ). M must be at least zero.
177*
178* N (global input) INTEGER
179* On entry, N specifies the number of columns of the submatrix
180* sub( C ). N must be at least zero.
181*
182* ALPHA (global input) pointer to CHAR
183* On entry, ALPHA specifies the scalar alpha. When ALPHA is
184* supplied as zero then the local entries of the arrays A and
185* B corresponding to the entries of the submatrices sub( A )
186* and sub( B ) respectively need not be set on input.
187*
188* A (local input) pointer to CHAR
189* On entry, A is an array of dimension (LLD_A, Ka), where Ka is
190* at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at
191* at least Lc( 1, JA+N-1 ) otherwise. Before entry, this array
192* contains the local entries of the matrix A.
193* Before entry with SIDE = 'L' or 'l', this array contains
194* the local entries corresponding to the entries of the m by m
195* symmetric or Hermitian submatrix sub( A ), such that when
196* UPLO = 'U' or 'u', this array contains the local entries of
197* the upper triangular part of the submatrix sub( A ), and the
198* local entries of the strictly lower triangular of sub( A )
199* are not referenced, and when UPLO = 'L' or 'l', this array
200* contains the local entries of the lower triangular part of
201* the symmetric or Hermitian submatrix sub( A ), and the local
202* entries of the strictly upper triangular of sub( A ) are not
203* referenced.
204* Before entry with SIDE = 'R' or 'r', this array contains
205* the local entries corresponding to the entries of the n by n
206* symmetric or Hermitian submatrix sub( A ), such that when
207* UPLO = 'U' or 'u', this array contains the local entries of
208* the upper triangular part of the submatrix sub( A ), and the
209* local entries of the strictly lower triangular of sub( A )
210* are not referenced, and when UPLO = 'L' or 'l', this array
211* contains the local entries of the lower triangular part of
212* the symmetric or Hermitian submatrix sub( A ), and the local
213* entries of the strictly upper triangular of sub( A ) are not
214* referenced.
215* Note that the imaginary parts of the local entries corres-
216* ponding to the diagonal elements of sub( A ) need not be
217* set and assumed to be zero.
218*
219* IA (global input) INTEGER
220* On entry, IA specifies A's global row index, which points to
221* the beginning of the submatrix sub( A ).
222*
223* JA (global input) INTEGER
224* On entry, JA specifies A's global column index, which points
225* to the beginning of the submatrix sub( A ).
226*
227* DESCA (global and local input) INTEGER array
228* On entry, DESCA is an integer array of dimension DLEN_. This
229* is the array descriptor for the matrix A.
230*
231* B (local input) pointer to CHAR
232* On entry, B is an array of dimension (LLD_B, Kb), where Kb is
233* at least Lc( 1, JB+N-1 ). Before entry, this array contains
234* the local entries of the matrix B.
235*
236* IB (global input) INTEGER
237* On entry, IB specifies B's global row index, which points to
238* the beginning of the submatrix sub( B ).
239*
240* JB (global input) INTEGER
241* On entry, JB specifies B's global column index, which points
242* to the beginning of the submatrix sub( B ).
243*
244* DESCB (global and local input) INTEGER array
245* On entry, DESCB is an integer array of dimension DLEN_. This
246* is the array descriptor for the matrix B.
247*
248* BETA (global input) pointer to CHAR
249* On entry, BETA specifies the scalar beta. When BETA is
250* supplied as zero then the local entries of the array C
251* corresponding to the entries of the submatrix sub( C ) need
252* not be set on input.
253*
254* C (local input/local output) pointer to CHAR
255* On entry, C is an array of dimension (LLD_C, Kc), where Kc is
256* at least Lc( 1, JC+N-1 ). Before entry, this array contains
257* the local entries of the matrix C.
258* On exit, the entries of this array corresponding to the local
259* entries of the submatrix sub( C ) are overwritten by the
260* local entries of the m by n updated submatrix.
261*
262* IC (global input) INTEGER
263* On entry, IC specifies C's global row index, which points to
264* the beginning of the submatrix sub( C ).
265*
266* JC (global input) INTEGER
267* On entry, JC specifies C's global column index, which points
268* to the beginning of the submatrix sub( C ).
269*
270* DESCC (global and local input) INTEGER array
271* On entry, DESCC is an integer array of dimension DLEN_. This
272* is the array descriptor for the matrix C.
273*
274* -- Written on April 1, 1998 by
275* Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
276*
277* ---------------------------------------------------------------------
278*/
279/*
280* .. Local Scalars ..
281*/
282 char GemmTa, GemmTb, cctop, * one, rctop, * talphaCR, * talphaRC,
283 * tbeta, * zero;
284 Int Acol, Aii, Aimb1, Ainb1, Ajj, Alcmb, Ald, Alp, Alp0, Alq,
285 Alq0, Amb, Amp, An, Anb, Anq, Arow, BCfwd, BCmyprocD,
286 BCmyprocR, BCnD, BCnR, BCnprocsD, BCnprocsR, Bbufld, BcurrocR,
287 Bfr, BiD, BiR, BiiD, BiiR, BinbD, BinbR, Binb1D, Binb1R, BisR,
288 Bkk, Bld, BnbD, BnbR, BnpD, BnpR, Boff, BrocD, BrocR, BsrcR,
289 Cfr, CiD, CiR, CiiD, CiiR, CinbD, CinbR, Cinb1D, Cinb1R, Ckk,
290 CnbD, CnbR, CnpD, CnpR, Coff, CrocD, CrocR, CsrcR, Cbufld,
291 CcurrocR, CisR, Cld, WBCfr, WBCld, WBRfr, WBRld, WCCfr, WCCld,
292 WCCsum, WCRfr, WCRld, WCRsum, conjg, ctxt, l, lb, lcmb, lside,
293 ltmp, maxp, maxpm1, maxq, mycol, myrow, n, nb, nbb, ncpq,
294 npcol, npq=0, nprow, nrpq, p=0, q=0, size, tmp, upper;
295 TZSYM_T tzsymm;
296 GEMM_T gemm;
297 GSUM2D_T gsum2d;
298/*
299* .. Local Arrays ..
300*/
301 PB_VM_T VM;
302 Int Ad0 [DLEN_], DBUFB[DLEN_], DBUFC[DLEN_], WBCd[DLEN_],
303 WBRd[DLEN_], WCCd [DLEN_], WCRd [DLEN_];
304 char * Aptr = NULL, * Bbuf = NULL, * Cbuf = NULL, * WBC = NULL,
305 * WBR = NULL, * WCC = NULL, * WCR = NULL;
306/* ..
307* .. Executable Statements ..
308*
309*/
310 Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
311
312 BCfwd = ( Mupcase( DIRECAB[0] ) == CFORWARD );
313 conjg = ( Mupcase( CONJUG [0] ) == CCONJG );
314 lside = ( Mupcase( SIDE [0] ) == CLEFT );
315 upper = ( Mupcase( UPLO [0] ) == CUPPER );
316
317 size = TYPE->size; one = TYPE->one; zero = TYPE->zero;
318 gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d;
319 nb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
320/*
321* Compute local information for sub( A ), sub( B ) and sub( C )
322*/
323 if( lside )
324 {
325 BCnD = An = M; BCnR = N;
326 BCmyprocD = myrow; BCnprocsD = nprow;
327 BCmyprocR = mycol; BCnprocsR = npcol;
328
329 BiD = IB; BiR = JB;
330 BinbD = DESCB[IMB_ ]; BinbR = DESCB[INB_];
331 BnbD = DESCB[MB_ ]; BnbR = DESCB[NB_ ];
332 BsrcR = DESCB[CSRC_]; Bld = DESCB[LLD_];
333 PB_Cinfog2l( IB, JB, DESCB, BCnprocsD, BCnprocsR, BCmyprocD, BCmyprocR,
334 &BiiD, &BiiR, &BrocD, &BrocR );
335
336 CiD = IC; CiR = JC;
337 CinbD = DESCC[IMB_ ]; CinbR = DESCC[INB_];
338 CnbD = DESCC[MB_ ]; CnbR = DESCC[NB_ ];
339 CsrcR = DESCC[CSRC_]; Cld = DESCC[LLD_];
340 PB_Cinfog2l( IC, JC, DESCC, BCnprocsD, BCnprocsR, BCmyprocD, BCmyprocR,
341 &CiiD, &CiiR, &CrocD, &CrocR );
342 }
343 else
344 {
345 BCnD = An = N; BCnR = M;
346 BCmyprocD = mycol; BCnprocsD = npcol;
347 BCmyprocR = myrow; BCnprocsR = nprow;
348
349 BiD = JB; BiR = IB;
350 BinbR = DESCB[IMB_ ]; BinbD = DESCB[INB_];
351 BnbR = DESCB[MB_ ]; BnbD = DESCB[NB_ ];
352 BsrcR = DESCB[RSRC_]; Bld = DESCB[LLD_];
353 PB_Cinfog2l( IB, JB, DESCB, BCnprocsR, BCnprocsD, BCmyprocR, BCmyprocD,
354 &BiiR, &BiiD, &BrocR, &BrocD );
355
356 CiD = JC; CiR = IC;
357 CinbR = DESCC[IMB_ ]; CinbD = DESCC[INB_];
358 CnbR = DESCC[MB_ ]; CnbD = DESCC[NB_ ];
359 CsrcR = DESCC[RSRC_]; Cld = DESCC[LLD_];
360 PB_Cinfog2l( IC, JC, DESCC, BCnprocsR, BCnprocsD, BCmyprocR, BCmyprocD,
361 &CiiR, &CiiD, &CrocR, &CrocD );
362 }
363
364 Binb1D = PB_Cfirstnb( BCnD, BiD, BinbD, BnbD );
365 BnpD = PB_Cnumroc( BCnD, 0, Binb1D, BnbD, BCmyprocD, BrocD, BCnprocsD );
366 Binb1R = PB_Cfirstnb( BCnR, BiR, BinbR, BnbR );
367 BisR = ( ( BsrcR < 0 ) || ( BCnprocsR == 1 ) );
368
369 Cinb1D = PB_Cfirstnb( BCnD, CiD, CinbD, CnbD );
370 CnpD = PB_Cnumroc( BCnD, 0, Cinb1D, CnbD, BCmyprocD, CrocD, BCnprocsD );
371 Cinb1R = PB_Cfirstnb( BCnR, CiR, CinbR, CnbR );
372 CisR = ( ( CsrcR < 0 ) || ( BCnprocsR == 1 ) );
373/*
374* Compute descriptor Ad0 for sub( A )
375*/
376 PB_Cdescribe( An, An, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj,
377 &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 );
378
379 Amp = PB_Cnumroc( An, 0, Aimb1, Amb, myrow, Arow, nprow );
380 Anq = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol );
381 if( ( Amp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( A, Aii, Ajj, Ald, size );
382/*
383* Retrieve the BLACS combine topologies, compute conjugate of alpha for the
384* Hermitian case and set the transpose parameters to be passed to the BLAS
385* matrix multiply routine.
386*/
387 rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
388 cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
389
390 if( conjg )
391 {
392 tzsymm = PB_Ctzhemm;
393 if( lside )
394 {
395 talphaRC = ALPHA; GemmTa = CCOTRAN; GemmTb = CTRAN;
396 talphaCR = PB_Cmalloc( size );
397 PB_Cconjg( TYPE, ALPHA, talphaCR );
398 }
399 else
400 {
401 talphaCR = ALPHA; GemmTa = CTRAN; GemmTb = CCOTRAN;
402 talphaRC = PB_Cmalloc( size );
403 PB_Cconjg( TYPE, ALPHA, talphaRC );
404 }
405 }
406 else
407 {
408 tzsymm = PB_Ctzsymm; talphaCR = talphaRC = ALPHA;
409 GemmTa = CTRAN; GemmTb = CTRAN;
410 }
411/*
412* Computational partitioning size is computed as the product of the logical
413* value returned by pilaenv_ and 2 * lcm( nprow, npcol ).
414*/
415 Alcmb = 2 * nb * PB_Clcm( ( Arow >= 0 ? nprow : 1 ),
416 ( Acol >= 0 ? npcol : 1 ) );
417/*
418* When sub( B ) is not replicated and backward pass on sub( B ), find the
419* virtual process q owning the last row or column of sub( B ).
420*/
421 if( !( BisR ) && !( BCfwd ) )
422 {
423 tmp = PB_Cindxg2p( BCnR - 1, Binb1R, BnbR, BrocR, BrocR, BCnprocsR );
424 q = MModSub( tmp, BrocR, BCnprocsR );
425 }
426/*
427* When sub( C ) is not replicated and backward pass on sub( C ), find the
428* virtual process p owning the last row or column of sub( C ).
429*/
430 if( !( CisR ) && !( BCfwd ) )
431 {
432 tmp = PB_Cindxg2p( BCnR - 1, Cinb1R, CnbR, CrocR, CrocR, BCnprocsR );
433 p = MModSub( tmp, CrocR, BCnprocsR );
434 }
435/*
436* Loop over the virtual process grid induced by the rows or columns of
437* sub( B ) and sub( C ).
438*/
439 lcmb = PB_Clcm( ( maxp = ( CisR ? 1 : BCnprocsR ) ) * CnbR,
440 ( maxq = ( BisR ? 1 : BCnprocsR ) ) * BnbR );
441 n = BCnR;
442 maxpm1 = maxp - 1;
443
444 while( n > 0 )
445 {
446/*
447* Initialize local virtual matrix in process (p,q)
448*/
449 BcurrocR = ( BisR ? -1 : MModAdd( BrocR, q, BCnprocsR ) );
450 Bkk = PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR, BCnprocsR );
451 BnpR = PB_Cnumroc( BCnR, 0, Binb1R, BnbR, BcurrocR, BrocR, BCnprocsR );
452
453 CcurrocR = ( CisR ? -1 : MModAdd( CrocR, p, BCnprocsR ) );
454 Ckk = PB_Cg2lrem( CiR, CinbR, CnbR, CcurrocR, CsrcR, BCnprocsR );
455 CnpR = PB_Cnumroc( BCnR, 0, Cinb1R, CnbR, CcurrocR, CrocR, BCnprocsR );
456
457 PB_CVMinit( &VM, 0, CnpR, BnpR, Cinb1R, Binb1R, CnbR, BnbR, p, q,
458 maxp, maxq, lcmb );
459/*
460* Find how many diagonals in this virtual process
461*/
462 npq = PB_CVMnpq( &VM );
463
464 n -= npq;
465/*
466* Re-adjust the number of rows or columns to be (un)packed, in order to
467* average the message sizes.
468*/
469 if( npq ) nbb = npq / ( ( npq - 1 ) / nb + 1 );
470
471 while( npq )
472 {
473 nbb = MIN( nbb, npq );
474/*
475* Find out how many rows or columns of sub( B ) and sub( C ) are contiguous
476*/
477 PB_CVMcontig( &VM, &nrpq, &ncpq, &Coff, &Boff );
478
479 if( lside )
480 {
481/*
482* Compute the descriptor DBUFB for the buffer that will contained the packed
483* columns of sub( B ).
484*/
485 if( ( Bfr = ( ncpq < nbb ) ) != 0 )
486 {
487/*
488* If columns of sub( B ) are not contiguous, then allocate the buffer and
489* pack the kbb columns of sub( B ).
490*/
491 Bbufld = MAX( 1, BnpD );
492 if( BisR || ( BCmyprocR == BcurrocR ) )
493 {
494 Bbuf = PB_Cmalloc( BnpD * nbb * size );
495 PB_CVMpack( TYPE, &VM, COLUMN, COLUMN, PACKING, NOTRAN, nbb,
496 BnpD, one, Mptr( B, BiiD, Bkk, Bld, size ), Bld,
497 zero, Bbuf, Bbufld );
498 }
499 }
500 else
501 {
502/*
503* Otherwise, re-use sub( B ) directly.
504*/
505 Bbufld = Bld;
506 if( BisR || ( BCmyprocR == BcurrocR ) )
507 Bbuf = Mptr( B, BiiD, Bkk+Boff, Bld, size );
508 }
509 PB_Cdescset( DBUFB, BCnD, nbb, Binb1D, nbb, BnbD, nbb, BrocD,
510 BcurrocR, ctxt, Bbufld );
511/*
512* Replicate this panel of columns of sub( B ) as well as its transposed
513* over sub( A ) -> WBC, WBR
514*/
515 PB_CInV( TYPE, NOCONJG, COLUMN, An, An, Ad0, nbb, Bbuf, 0, 0,
516 DBUFB, COLUMN, &WBC, WBCd, &WBCfr );
517 PB_CInV( TYPE, NOCONJG, ROW, An, An, Ad0, nbb, WBC, 0, 0,
518 WBCd, COLUMN, &WBR, WBRd, &WBRfr );
519 }
520 else
521 {
522/*
523* Compute the descriptor DBUFB for the buffer that will contained the packed
524* rows of sub( B ).
525*/
526 if( ( Bfr = ( ncpq < nbb ) ) != 0 )
527 {
528/*
529* If rows of sub( B ) are not contiguous, then allocate the buffer and pack
530* the kbb rows of sub( B ).
531*/
532 Bbufld = nbb;
533 if( BisR || ( BCmyprocR == BcurrocR ) )
534 {
535 Bbuf = PB_Cmalloc( BnpD * nbb * size );
536 PB_CVMpack( TYPE, &VM, COLUMN, ROW, PACKING, NOTRAN, nbb,
537 BnpD, one, Mptr( B, Bkk, BiiD, Bld, size ), Bld,
538 zero, Bbuf, Bbufld );
539 }
540 }
541 else
542 {
543/*
544* Otherwise, re-use sub( B ) directly.
545*/
546 Bbufld = Bld;
547 if( BisR || ( BCmyprocR == BcurrocR ) )
548 Bbuf = Mptr( B, Bkk+Boff, BiiD, Bld, size );
549 }
550 PB_Cdescset( DBUFB, nbb, BCnD, nbb, Binb1D, nbb, BnbD, BcurrocR,
551 BrocD, ctxt, Bbufld );
552/*
553* Replicate this panel of rows of sub( B ) as well as its transposed
554* over sub( A ) -> WBR, WBC
555*/
556 PB_CInV( TYPE, NOCONJG, ROW, An, An, Ad0, nbb, Bbuf, 0, 0,
557 DBUFB, ROW, &WBR, WBRd, &WBRfr );
558 PB_CInV( TYPE, NOCONJG, COLUMN, An, An, Ad0, nbb, WBR, 0, 0,
559 WBRd, ROW, &WBC, WBCd, &WBCfr );
560 }
561/*
562* Allocate space for temporary results in scope of sub( A ) -> WCC, WCR
563*/
564 PB_COutV( TYPE, COLUMN, INIT, An, An, Ad0, nbb, &WCC, WCCd, &WCCfr,
565 &WCCsum );
566 PB_COutV( TYPE, ROW, INIT, An, An, Ad0, nbb, &WCR, WCRd, &WCRfr,
567 &WCRsum );
568/*
569* Local matrix-matrix multiply iff I own some data
570*/
571 WBCld = WBCd[LLD_]; WBRld = WBRd[LLD_];
572 WCCld = WCCd[LLD_]; WCRld = WCRd[LLD_];
573
574 if( ( Amp > 0 ) && ( Anq > 0 ) )
575 {
576 if( upper )
577 {
578/*
579* sub( A ) is upper triangular
580*/
581 for( l = 0; l < An; l += Alcmb )
582 {
583 lb = An - l; lb = MIN( lb, Alcmb );
584 Alp = PB_Cnumroc( l, 0, Aimb1, Amb, myrow, Arow, nprow );
585 Alq = PB_Cnumroc( l, 0, Ainb1, Anb, mycol, Acol, npcol );
586 Alq0 = PB_Cnumroc( lb, l, Ainb1, Anb, mycol, Acol, npcol );
587 if( Alp > 0 && Alq0 > 0 )
588 {
589 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &GemmTb ), &Alp, &nbb,
590 &Alq0, talphaRC, Mptr( Aptr, 0, Alq, Ald, size ),
591 &Ald, Mptr( WBR, 0, Alq, WBRld, size ), &WBRld, one,
592 WCC, &WCCld );
593 gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( NOTRAN ), &nbb, &Alq0,
594 &Alp, talphaCR, WBC, &WBCld, Mptr( Aptr, 0, Alq, Ald,
595 size ), &Ald, one, Mptr( WCR, 0, Alq, WCRld, size ),
596 &WCRld );
597 }
598 PB_Cpsym( TYPE, TYPE, SIDE, UPPER, lb, nbb, ALPHA, Aptr, l, l,
599 Ad0, Mptr( WBC, Alp, 0, WBCld, size ), WBCld,
600 Mptr( WBR, 0, Alq, WBRld, size ), WBRld, Mptr( WCC,
601 Alp, 0, WCCld, size ), WCCld, Mptr( WCR, 0, Alq,
602 WCRld, size ), WCRld, tzsymm );
603 }
604 }
605 else
606 {
607/*
608* sub( A ) is lower triangular
609*/
610 for( l = 0; l < An; l += Alcmb )
611 {
612 lb = An - l; ltmp = l + ( lb = MIN( lb, Alcmb ) );
613 Alp = PB_Cnumroc( l, 0, Aimb1, Amb, myrow, Arow, nprow );
614 Alq = PB_Cnumroc( l, 0, Ainb1, Anb, mycol, Acol, npcol );
615 PB_Cpsym( TYPE, TYPE, SIDE, LOWER, lb, nbb, ALPHA, Aptr, l, l,
616 Ad0, Mptr( WBC, Alp, 0, WBCld, size ), WBCld,
617 Mptr( WBR, 0, Alq, WBRld, size ), WBRld, Mptr( WCC,
618 Alp, 0, WCCld, size ), WCCld, Mptr( WCR, 0, Alq,
619 WCRld, size ), WCRld, tzsymm );
620 Alp = PB_Cnumroc( ltmp, 0, Aimb1, Amb, myrow, Arow, nprow );
621 Alp0 = Amp - Alp;
622 Alq0 = PB_Cnumroc( lb, l, Ainb1, Anb, mycol, Acol, npcol );
623 if( Alp0 > 0 && Alq0 > 0 )
624 {
625 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &GemmTb ), &Alp0, &nbb,
626 &Alq0, talphaRC, Mptr( Aptr, Alp, Alq, Ald, size ),
627 &Ald, Mptr( WBR, 0, Alq, WBRld, size ), &WBRld, one,
628 Mptr( WCC, Alp, 0, WCCld, size ), &WCCld );
629 gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( NOTRAN ), &nbb, &Alq0,
630 &Alp0, talphaCR, Mptr( WBC, Alp, 0, WBCld, size ),
631 &WBCld, Mptr( Aptr, Alp, Alq, Ald, size ), &Ald, one,
632 Mptr( WCR, 0, Alq, WCRld, size ), &WCRld );
633 }
634 }
635 }
636 }
637 if( WBCfr ) free( WBC );
638 if( WBRfr ) free( WBR );
639
640 if( Bfr && ( BisR || ( BCmyprocR == BcurrocR ) ) )
641 if( Bbuf ) free( Bbuf );
642
643 if( lside )
644 {
645/*
646* Accumulate the intermediate results in WCC and WCR
647*/
648 if( WCCsum )
649 {
650 WCCd[CSRC_] = CcurrocR;
651 if( Amp > 0 )
652 gsum2d( ctxt, ROW, &rctop, Amp, nbb, WCC, WCCld, myrow,
653 WCCd[CSRC_] );
654 }
655
656 if( WCRsum )
657 {
658 WCRd[RSRC_] = 0;
659 if( Anq > 0 )
660 gsum2d( ctxt, COLUMN, &cctop, nbb, Anq, WCR, WCRld,
661 WCRd[RSRC_], mycol );
662 }
663/*
664* WCC := WCC + WCR'
665*/
666 PB_Cpaxpby( TYPE, CONJUG, nbb, An, one, WCR, 0, 0, WCRd, ROW, one,
667 WCC, 0, 0, WCCd, COLUMN );
668 if( WCRfr ) free( WCR );
669/*
670* Compute the descriptor DBUFC for the buffer that will contained the packed
671* columns of sub( C ). Allocate it.
672*/
673 if( ( Cfr = ( nrpq < nbb ) ) != 0 )
674 {
675/*
676* If columns of sub( C ) are not contiguous, then allocate the buffer
677*/
678 Cbufld = MAX( 1, CnpD ); tbeta = zero;
679 if( CisR || ( BCmyprocR == CcurrocR ) )
680 Cbuf = PB_Cmalloc( CnpD * nbb * size );
681 }
682 else
683 {
684/*
685* Otherwise re-use sub( C )
686*/
687 Cbufld = Cld; tbeta = BETA;
688 if( CisR || ( BCmyprocR == CcurrocR ) )
689 Cbuf = Mptr( C, CiiD, Ckk+Coff, Cld, size );
690 }
691 PB_Cdescset( DBUFC, BCnD, nbb, Cinb1D, nbb, CnbD, nbb, CrocD,
692 CcurrocR, ctxt, Cbufld );
693/*
694* sub( C ) := beta * sub( C ) + WCC
695*/
696 PB_Cpaxpby( TYPE, NOCONJG, An, nbb, one, WCC, 0, 0, WCCd, COLUMN,
697 tbeta, Cbuf, 0, 0, DBUFC, COLUMN );
698 if( WCCfr ) free( WCC );
699/*
700* Unpack the kbb columns of sub( C ) and release the buffer containing them.
701*/
702 if( Cfr && ( CisR || ( BCmyprocR == CcurrocR ) ) )
703 {
704 PB_CVMpack( TYPE, &VM, ROW, COLUMN, UNPACKING, NOTRAN, nbb,
705 CnpD, BETA, Mptr( C, CiiD, Ckk, Cld, size ), Cld,
706 one, Cbuf, Cbufld );
707 if( Cbuf ) free( Cbuf );
708 }
709 }
710 else
711 {
712/*
713* Accumulate the intermediate results in WCC and WCR
714*/
715 if( WCCsum )
716 {
717 WCCd[CSRC_] = 0;
718 if( Amp > 0 )
719 gsum2d( ctxt, ROW, &rctop, Amp, nbb, WCC, WCCld, myrow,
720 WCCd[CSRC_] );
721 }
722
723 if( WCRsum )
724 {
725 WCRd[RSRC_] = CcurrocR;
726 if( Anq > 0 )
727 gsum2d( ctxt, COLUMN, &cctop, nbb, Anq, WCR, WCRld,
728 WCRd[RSRC_], mycol );
729 }
730/*
731* WCR := WCR + WCC'
732*/
733 PB_Cpaxpby( TYPE, CONJUG, An, nbb, one, WCC, 0, 0, WCCd, COLUMN,
734 one, WCR, 0, 0, WCRd, ROW );
735 if( WCCfr ) free( WCC );
736/*
737* Compute the descriptor DBUFC for the buffer that will contained the packed
738* rows of sub( C ). Allocate it.
739*/
740 if( ( Cfr = ( nrpq < nbb ) ) != 0 )
741 {
742/*
743* If rows of sub( C ) are not contiguous, then allocate receiving buffer.
744*/
745 Cbufld = nbb; tbeta = zero;
746 if( CisR || ( BCmyprocR == CcurrocR ) )
747 Cbuf = PB_Cmalloc( CnpD * nbb * size );
748 }
749 else
750 {
751/*
752* Otherwise re-use sub( C )
753*/
754 Cbufld = Cld; tbeta = BETA;
755 if( CisR || ( BCmyprocR == CcurrocR ) )
756 Cbuf = Mptr( C, Ckk+Coff, CiiD, Cld, size );
757 }
758 PB_Cdescset( DBUFC, nbb, BCnD, nbb, Cinb1D, nbb, CnbD, CcurrocR,
759 CrocD, ctxt, Cbufld );
760/*
761* sub( C ) := beta * sub( C ) + WCR
762*/
763 PB_Cpaxpby( TYPE, NOCONJG, nbb, An, one, WCR, 0, 0, WCRd, ROW,
764 tbeta, Cbuf, 0, 0, DBUFC, ROW );
765
766 if( WCRfr ) free( WCR );
767/*
768* Unpack the kbb rows of sub( C ) and release the buffer containing them.
769*/
770 if( Cfr && ( CisR || ( BCmyprocR == CcurrocR ) ) )
771 {
772 PB_CVMpack( TYPE, &VM, ROW, ROW, UNPACKING, NOTRAN, nbb,
773 CnpD, BETA, Mptr( C, Ckk, CiiD, Cld, size ), Cld,
774 one, Cbuf, Cbufld );
775 if( Cbuf ) free( Cbuf );
776 }
777 }
778/*
779* Update the local indexes of sub( B ) and sub( C )
780*/
781 PB_CVMupdate( &VM, nbb, &Ckk, &Bkk );
782
783 npq -= nbb;
784 }
785/*
786* Go to next or previous virtual process row or column
787*/
788 if( ( BCfwd && ( p == maxpm1 ) ) ||
789 ( !( BCfwd ) && ( p == 0 ) ) )
790 q = ( BCfwd ? MModAdd1( q, maxq ) : MModSub1( q, maxq ) );
791 p = ( BCfwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) );
792 }
793
794 if( conjg ) free( ( lside ? talphaCR : talphaRC ) );
795/*
796* End of PB_CpsymmBC
797*/
798}
#define Int
Definition Bconfig.h:22
F_VOID_FCT(* GEMM_T)()
Definition pblas.h:317
void(* GSUM2D_T)()
Definition pblas.h:286
#define C2F_CHAR(a)
Definition pblas.h:125
void(* TZSYM_T)()
Definition pblas.h:432
#define TOP_GET
Definition PBblacs.h:50
#define COLUMN
Definition PBblacs.h:45
#define COMBINE
Definition PBblacs.h:49
#define ROW
Definition PBblacs.h:46
void Cblacs_gridinfo()
#define NOTRAN
Definition PBblas.h:44
#define CCONJG
Definition PBblas.h:21
#define CLEFT
Definition PBblas.h:29
#define NOCONJG
Definition PBblas.h:45
#define CUPPER
Definition PBblas.h:26
#define CTRAN
Definition PBblas.h:20
#define LOWER
Definition PBblas.h:51
#define CCOTRAN
Definition PBblas.h:22
#define INIT
Definition PBblas.h:61
#define UPPER
Definition PBblas.h:52
#define CFORWARD
Definition PBblas.h:38
#define pilaenv_
Definition PBpblas.h:44
#define CTXT_
Definition PBtools.h:38
#define UNPACKING
Definition PBtools.h:54
void PB_CVMinit()
Int PB_Cfirstnb()
#define MAX(a_, b_)
Definition PBtools.h:77
#define MB_
Definition PBtools.h:43
void PB_Cpsym()
void PB_Ctzsymm()
char * PB_Cmalloc()
void PB_Cinfog2l()
#define MModSub(I1, I2, d)
Definition PBtools.h:102
#define PACKING
Definition PBtools.h:53
void PB_Ctzhemm()
#define MIN(a_, b_)
Definition PBtools.h:76
#define Mptr(a_, i_, j_, lda_, siz_)
Definition PBtools.h:132
#define LLD_
Definition PBtools.h:47
Int PB_Cnumroc()
Int PB_CVMpack()
char * PB_Ctop()
void PB_CInV()
void PB_CVMupdate()
#define RSRC_
Definition PBtools.h:45
void PB_Cdescset()
void PB_COutV()
#define MModAdd1(I, d)
Definition PBtools.h:100
void PB_CpsymmBC()
#define MModAdd(I1, I2, d)
Definition PBtools.h:97
#define INB_
Definition PBtools.h:42
Int PB_CVMnpq()
#define MModSub1(I, d)
Definition PBtools.h:105
#define CSRC_
Definition PBtools.h:46
Int PB_Clcm()
#define IMB_
Definition PBtools.h:41
Int PB_Cindxg2p()
Int PB_Cg2lrem()
#define Mupcase(C)
Definition PBtools.h:83
#define DLEN_
Definition PBtools.h:48
#define NB_
Definition PBtools.h:44
void PB_Cconjg()
void PB_CVMcontig()
void PB_Cpaxpby()
void PB_Cdescribe()
#define TYPE
Definition clamov.c:7