SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
Loading...
Searching...
No Matches
PB_CpsymmAB.c
Go to the documentation of this file.
1/* ---------------------------------------------------------------------
2*
3* -- PBLAS auxiliary routine (version 2.0) --
4* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5* and University of California, Berkeley.
6* April 1, 1998
7*
8* ---------------------------------------------------------------------
9*/
10/*
11* Include files
12*/
13#include "../pblas.h"
14#include "../PBpblas.h"
15#include "../PBtools.h"
16#include "../PBblacs.h"
17#include "../PBblas.h"
18
19#ifdef __STDC__
20void PB_CpsymmAB( PBTYP_T * TYPE, char * DIRECAB, char * CONJUG,
21 char * SIDE, char * UPLO, Int M, Int N, char * ALPHA,
22 char * A, Int IA, Int JA, Int * DESCA, char * B,
23 Int IB, Int JB, Int * DESCB, char * BETA, char * C,
24 Int IC, Int JC, Int * DESCC )
25#else
26void PB_CpsymmAB( TYPE, DIRECAB, CONJUG, SIDE, UPLO, M, N, ALPHA, A, IA,
27 JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC )
28/*
29* .. Scalar Arguments ..
30*/
31 char * CONJUG, * DIRECAB, * SIDE, * UPLO;
32 Int IA, IB, IC, JA, JB, JC, M, N;
33 char * ALPHA, * BETA;
34 PBTYP_T * TYPE;
35/*
36* .. Array Arguments ..
37*/
38 Int * DESCA, * DESCB, * DESCC;
39 char * A, * B, * C;
40#endif
41{
42/*
43* Purpose
44* =======
45*
46* PB_CpsymmAB performs one of the matrix-matrix operations
47*
48* sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ),
49*
50* or
51*
52* sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ),
53*
54* where
55*
56* sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1),
57*
58* sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L',
59* A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and,
60*
61* sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1).
62*
63* Alpha and beta are scalars, sub( A ) is a symmetric or Hermitian
64* submatrix and sub( B ) and sub( C ) are m by n submatrices.
65*
66* This is the outer-product algorithm using the logical aggregation
67* blocking technique. The submatrix operand sub( C ) stays in place.
68*
69* Notes
70* =====
71*
72* A description vector is associated with each 2D block-cyclicly dis-
73* tributed matrix. This vector stores the information required to
74* establish the mapping between a matrix entry and its corresponding
75* process and memory location.
76*
77* In the following comments, the character _ should be read as
78* "of the distributed matrix". Let A be a generic term for any 2D
79* block cyclicly distributed matrix. Its description vector is DESC_A:
80*
81* NOTATION STORED IN EXPLANATION
82* ---------------- --------------- ------------------------------------
83* DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
84* CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
85* the NPROW x NPCOL BLACS process grid
86* A is distributed over. The context
87* itself is global, but the handle
88* (the integer value) may vary.
89* M_A (global) DESCA[ M_ ] The number of rows in the distribu-
90* ted matrix A, M_A >= 0.
91* N_A (global) DESCA[ N_ ] The number of columns in the distri-
92* buted matrix A, N_A >= 0.
93* IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
94* block of the matrix A, IMB_A > 0.
95* INB_A (global) DESCA[ INB_ ] The number of columns of the upper
96* left block of the matrix A,
97* INB_A > 0.
98* MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
99* bute the last M_A-IMB_A rows of A,
100* MB_A > 0.
101* NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
102* bute the last N_A-INB_A columns of
103* A, NB_A > 0.
104* RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
105* row of the matrix A is distributed,
106* NPROW > RSRC_A >= 0.
107* CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
108* first column of A is distributed.
109* NPCOL > CSRC_A >= 0.
110* LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
111* array storing the local blocks of
112* the distributed matrix A,
113* IF( Lc( 1, N_A ) > 0 )
114* LLD_A >= MAX( 1, Lr( 1, M_A ) )
115* ELSE
116* LLD_A >= 1.
117*
118* Let K be the number of rows of a matrix A starting at the global in-
119* dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
120* that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
121* receive if these K rows were distributed over NPROW processes. If K
122* is the number of columns of a matrix A starting at the global index
123* JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
124* lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
125* these K columns were distributed over NPCOL processes.
126*
127* The values of Lr() and Lc() may be determined via a call to the func-
128* tion PB_Cnumroc:
129* Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
130* Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
131*
132* Arguments
133* =========
134*
135* TYPE (local input) pointer to a PBTYP_T structure
136* On entry, TYPE is a pointer to a structure of type PBTYP_T,
137* that contains type information (See pblas.h).
138*
139* DIRECAB (global input) pointer to CHAR
140* On entry, DIRECAB specifies the direction in which the rows
141* or columns of sub( A ) and sub( B ) should be looped over as
142* follows:
143* DIRECAB = 'F' or 'f' forward or increasing,
144* DIRECAB = 'B' or 'b' backward or decreasing.
145*
146* CONJUG (global input) pointer to CHAR
147* On entry, CONJUG specifies whether sub( A ) is a symmetric or
148* Hermitian submatrix operand as follows:
149* CONJUG = 'N' or 'n' sub( A ) is symmetric,
150* CONJUG = 'Z' or 'z' sub( A ) is Hermitian.
151*
152* SIDE (global input) pointer to CHAR
153* On entry, SIDE specifies whether the symmetric or Hermitian
154* submatrix sub( A ) appears on the left or right in the opera-
155* tion as follows:
156*
157* SIDE = 'L' or 'l'
158* sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ),
159*
160* SIDE = 'R' or 'r'
161* sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ).
162*
163* UPLO (global input) pointer to CHAR
164* On entry, UPLO specifies whether the local pieces of
165* the array A containing the upper or lower triangular part
166* of the submatrix sub( A ) are to be referenced as follows:
167* UPLO = 'U' or 'u' Only the local pieces corresponding to
168* the upper triangular part of the
169* submatrix sub( A ) are referenced,
170* UPLO = 'L' or 'l' Only the local pieces corresponding to
171* the lower triangular part of the
172* submatrix sub( A ) are referenced.
173*
174* M (global input) INTEGER
175* On entry, M specifies the number of rows of the submatrix
176* sub( C ). M must be at least zero.
177*
178* N (global input) INTEGER
179* On entry, N specifies the number of columns of the submatrix
180* sub( C ). N must be at least zero.
181*
182* ALPHA (global input) pointer to CHAR
183* On entry, ALPHA specifies the scalar alpha. When ALPHA is
184* supplied as zero then the local entries of the arrays A and
185* B corresponding to the entries of the submatrices sub( A )
186* and sub( B ) respectively need not be set on input.
187*
188* A (local input) pointer to CHAR
189* On entry, A is an array of dimension (LLD_A, Ka), where Ka is
190* at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at
191* at least Lc( 1, JA+N-1 ) otherwise. Before entry, this array
192* contains the local entries of the matrix A.
193* Before entry with SIDE = 'L' or 'l', this array contains
194* the local entries corresponding to the entries of the m by m
195* symmetric or Hermitian submatrix sub( A ), such that when
196* UPLO = 'U' or 'u', this array contains the local entries of
197* the upper triangular part of the submatrix sub( A ), and the
198* local entries of the strictly lower triangular of sub( A )
199* are not referenced, and when UPLO = 'L' or 'l', this array
200* contains the local entries of the lower triangular part of
201* the symmetric or Hermitian submatrix sub( A ), and the local
202* entries of the strictly upper triangular of sub( A ) are not
203* referenced.
204* Before entry with SIDE = 'R' or 'r', this array contains
205* the local entries corresponding to the entries of the n by n
206* symmetric or Hermitian submatrix sub( A ), such that when
207* UPLO = 'U' or 'u', this array contains the local entries of
208* the upper triangular part of the submatrix sub( A ), and the
209* local entries of the strictly lower triangular of sub( A )
210* are not referenced, and when UPLO = 'L' or 'l', this array
211* contains the local entries of the lower triangular part of
212* the symmetric or Hermitian submatrix sub( A ), and the local
213* entries of the strictly upper triangular of sub( A ) are not
214* referenced.
215* Note that the imaginary parts of the local entries corres-
216* ponding to the diagonal elements of sub( A ) need not be
217* set and assumed to be zero.
218*
219* IA (global input) INTEGER
220* On entry, IA specifies A's global row index, which points to
221* the beginning of the submatrix sub( A ).
222*
223* JA (global input) INTEGER
224* On entry, JA specifies A's global column index, which points
225* to the beginning of the submatrix sub( A ).
226*
227* DESCA (global and local input) INTEGER array
228* On entry, DESCA is an integer array of dimension DLEN_. This
229* is the array descriptor for the matrix A.
230*
231* B (local input) pointer to CHAR
232* On entry, B is an array of dimension (LLD_B, Kb), where Kb is
233* at least Lc( 1, JB+N-1 ). Before entry, this array contains
234* the local entries of the matrix B.
235*
236* IB (global input) INTEGER
237* On entry, IB specifies B's global row index, which points to
238* the beginning of the submatrix sub( B ).
239*
240* JB (global input) INTEGER
241* On entry, JB specifies B's global column index, which points
242* to the beginning of the submatrix sub( B ).
243*
244* DESCB (global and local input) INTEGER array
245* On entry, DESCB is an integer array of dimension DLEN_. This
246* is the array descriptor for the matrix B.
247*
248* BETA (global input) pointer to CHAR
249* On entry, BETA specifies the scalar beta. When BETA is
250* supplied as zero then the local entries of the array C
251* corresponding to the entries of the submatrix sub( C ) need
252* not be set on input.
253*
254* C (local input/local output) pointer to CHAR
255* On entry, C is an array of dimension (LLD_C, Kc), where Kc is
256* at least Lc( 1, JC+N-1 ). Before entry, this array contains
257* the local entries of the matrix C.
258* On exit, the entries of this array corresponding to the local
259* entries of the submatrix sub( C ) are overwritten by the
260* local entries of the m by n updated submatrix.
261*
262* IC (global input) INTEGER
263* On entry, IC specifies C's global row index, which points to
264* the beginning of the submatrix sub( C ).
265*
266* JC (global input) INTEGER
267* On entry, JC specifies C's global column index, which points
268* to the beginning of the submatrix sub( C ).
269*
270* DESCC (global and local input) INTEGER array
271* On entry, DESCC is an integer array of dimension DLEN_. This
272* is the array descriptor for the matrix C.
273*
274* -- Written on April 1, 1998 by
275* Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
276*
277* ---------------------------------------------------------------------
278*/
279/*
280* .. Local Scalars ..
281*/
282 char GatherDir, ScatterDir, * one, top, tran, * zero;
283 Int Afr, An, Bcol, Bcurcol, Bcurimb1, Bcurinb1, Bcurrow, Bfr, Bii,
284 Bimb, Bimb1, Binb, Binb1, Bjj, Bld, Bmb, Bmp, Bmp0, Bnb, Bnq,
285 Bnq0, Brow, Ccol, Ccurcol, Ccurimb1, Ccurinb1, Ccurrow, Cii,
286 Cimb, Cimb1, Cinb, Cinb1, Cjj, Cld, Cmb, Cmp, Cmp0, Cnb, Cnq,
287 Cnq0, Crow, WABfr, WACfr, WBCfr, WBCsum, conjg, ctxt, fwd, k,
288 kb, kbb, kend, kstart, kstep, ktmp, lside, mycol, myrow,
289 npcol, nprow, size, upper;
290 GEMM_T gemm;
291 GSUM2D_T gsum2d;
292/*
293* .. Local Arrays ..
294*/
295 Int Bd0 [DLEN_], Cd0 [DLEN_], DBUFA[DLEN_], DBUFB[DLEN_],
296 WABd[DLEN_], WACd[DLEN_], WBCd [DLEN_];
297 char * Aptr = NULL, * Bptr = NULL, * Bptr0 = NULL, * Cptr0 = NULL,
298 * WAB = NULL, * WAC = NULL, * WBC = NULL;
299/* ..
300* .. Executable Statements ..
301*
302*/
303/*
304* sub( C ) = beta * sub( C )
305*/
306 PB_Cplascal( TYPE, ALL, NOCONJG, M, N, BETA, C, IC, JC, DESCC );
307/*
308* Retrieve process grid information
309*/
310 Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
311
312 An = ( ( lside = ( Mupcase( SIDE[0] ) == CLEFT ) ) ? M : N );
313 upper = ( Mupcase( UPLO[0] ) == CUPPER );
314 tran = ( ( conjg = ( Mupcase( CONJUG[0] ) == CCONJG ) ) ? CCOTRAN : CTRAN );
315
316 size = TYPE->size; one = TYPE->one; zero = TYPE->zero;
317 gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d;
318/*
319* Figure out the loop bounds accordingly to DIRECAB
320*/
321 kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
322 if( ( fwd = ( Mupcase( DIRECAB[0] ) == CFORWARD ) ) != 0 )
323 {
324 kstart = 0; kend = ( ( An - 1 ) / kb + 1 ) * kb; kstep = kb;
325 GatherDir = ScatterDir = CFORWARD;
326 }
327 else
328 {
329 kstart = ( ( An - 1 ) / kb ) * kb; kend = kstep = -kb;
330 GatherDir = ScatterDir = CBACKWARD;
331 }
332/*
333* Compute local information for sub( B ) and sub( C )
334*/
335 PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj,
336 &Brow, &Bcol );
337 Bimb = DESCB[IMB_]; Binb = DESCB[INB_];
338 Bmb = DESCB[MB_ ]; Bnb = DESCB[NB_ ]; Bld = DESCB[LLD_];
339 Bimb1 = PB_Cfirstnb( M, IB, Bimb, Bmb );
340 Bmp0 = PB_Cnumroc( M, 0, Bimb1, Bmb, myrow, Brow, nprow );
341 Binb1 = PB_Cfirstnb( N, JB, Binb, Bnb );
342 Bnq0 = PB_Cnumroc( N, 0, Binb1, Bnb, mycol, Bcol, npcol );
343 if( ( Bmp0 > 0 ) && ( Bnq0 > 0 ) ) Bptr0 = Mptr( B, Bii, Bjj, Bld, size );
344
345 PB_Cinfog2l( IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj,
346 &Crow, &Ccol );
347 Cimb = DESCC[IMB_]; Cinb = DESCC[INB_];
348 Cmb = DESCC[MB_ ]; Cnb = DESCC[NB_ ]; Cld = DESCC[LLD_];
349 Cimb1 = PB_Cfirstnb( M, IC, Cimb, Cmb );
350 Cmp0 = PB_Cnumroc( M, 0, Cimb1, Cmb, myrow, Crow, nprow );
351 Cinb1 = PB_Cfirstnb( N, JC, Cinb, Cnb );
352 Cnq0 = PB_Cnumroc( N, 0, Cinb1, Cnb, mycol, Ccol, npcol );
353 if( ( Cmp0 > 0 ) && ( Cnq0 > 0 ) ) Cptr0 = Mptr( C, Cii, Cjj, Cld, size );
354
355 if( lside )
356 {
357 top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
358
359 if( upper )
360 {
361 for( k = kstart; k != kend; k += kstep )
362 {
363 kbb = An - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
364/*
365* Accumulate A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 )
366*/
367 PB_CGatherV( TYPE, ALLOCATE, &GatherDir, ktmp, kbb, A, IA, JA+k,
368 DESCA, COLUMN, &Aptr, DBUFA, &Afr );
369/*
370* Replicate A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) over
371* C( IC:IC+k+kbb-1, JC:JC+N-1 ) -> WAC
372*/
373 PB_Cdescset( Cd0, ktmp, N, Cimb1, Cinb1, Cmb, Cnb, Crow, Ccol,
374 ctxt, Cld );
375 PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Cd0, kbb, Aptr, 0, 0,
376 DBUFA, COLUMN, &WAC, WACd, &WACfr );
377/*
378* Zero lower triangle of WAC( k:k+kbb-1, 0:kbb-1 )
379*/
380 if( conjg )
381 PB_Cplapad( TYPE, LOWER, CONJUG, kbb, kbb, zero,
382 zero, WAC, k, 0, WACd );
383 else if( kbb > 1 )
384 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero,
385 zero, WAC, k+1, 0, WACd );
386/*
387* Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
388*/
389 PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, N, B, IB+k, JB, DESCB,
390 ROW, &Bptr, DBUFB, &Bfr );
391/*
392* Replicate B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over C( IC:IC+k+kbb-1, JC:JC+N-1 )
393*/
394 PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Cd0, kbb, Bptr, 0, 0, DBUFB,
395 ROW, &WBC, WBCd, &WBCfr );
396/*
397* C( IC:IC+k+kbb-1, JC:JC+N-1 ) += ALPHA * WAC * WBC
398*/
399 Cmp = PB_Cnumroc( ktmp, 0, Cimb1, Cmb, myrow, Crow, nprow );
400 if( ( Cmp > 0 ) && ( Cnq0 > 0 ) )
401 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp, &Cnq0, &kbb,
402 ALPHA, WAC, &WACd[LLD_], WBC, &WBCd[LLD_], one, Cptr0,
403 &Cld );
404 if( WBCfr ) free( WBC );
405 if( Bfr ) free( Bptr );
406/*
407* Replicate WAC = A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) over
408* B( IB:IB+k+kbb-1, JB:JB+N-1 ) -> WAB
409*/
410 PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
411 ctxt, Bld );
412 PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, WAC, 0, 0, WACd,
413 COLUMN, &WAB, WABd, &WABfr );
414/*
415* Zero lower triangle of WAB( k:k+kbb-1, 0:kbb-1 )
416*/
417 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, zero, WAB, k, 0,
418 WABd );
419/*
420* WBC := ALPHA*A(IA:IA+k+kbb-1, JA+k:JA+k+kbb-1)'*B( IB:IB+k+kbb-1, JB:JB+N-1 )
421*/
422 PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WBC, WBCd, &WBCfr,
423 &WBCsum );
424 Bmp = PB_Cnumroc( ktmp, 0, Bimb1, Bmb, myrow, Brow, nprow );
425 if( ( Bnq0 > 0 ) && ( Bmp > 0 ) )
426 gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0, &Bmp,
427 ALPHA, WAB, &WABd[LLD_], Bptr0, &Bld, zero, WBC,
428 &WBCd[LLD_] );
429 if( WABfr ) free( WAB );
430 if( WACfr ) free( WAC );
431 if( Afr ) free( Aptr );
432
433 if( WBCsum )
434 {
435 WBCd[RSRC_] = PB_Cindxg2p( ( fwd ? k : k + kbb - 1 ), Cimb1,
436 Cmb, Crow, Crow, nprow );
437 if( Bnq0 > 0 )
438 gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WBC, WBCd[LLD_],
439 WBCd[RSRC_], mycol );
440 }
441/*
442* C( IC+k:IC+k+kbb-1, JC:JC+N-1 ) := C( IC+k:IC+k+kbb-1, JC:JC+N-1 ) + WBC
443*/
444 PB_CScatterV( TYPE, &ScatterDir, kbb, N, WBC, 0, 0, WBCd, ROW, one,
445 C, IC+k, JC, DESCC, ROW );
446 if( WBCfr ) free( WBC );
447 }
448 }
449 else
450 {
451 for( k = kstart; k != kend; k += kstep )
452 {
453 ktmp = An - k; kbb = MIN( ktmp, kb );
454/*
455* Accumulate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 )
456*/
457 PB_CGatherV( TYPE, ALLOCATE, &GatherDir, ktmp, kbb, A, IA+k, JA+k,
458 DESCA, COLUMN, &Aptr, DBUFA, &Afr );
459/*
460* Replicate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over C( IC+k:IC+M-1, JC:JC+N-1 )
461*/
462 Ccurimb1 = PB_Cfirstnb( ktmp, IC+k, Cimb, Cmb );
463 Ccurrow = PB_Cindxg2p( k, Cimb1, Cmb, Crow, Crow, nprow );
464 PB_Cdescset( Cd0, ktmp, N, Ccurimb1, Cinb1, Cmb, Cnb, Ccurrow,
465 Ccol, ctxt, Cld );
466 PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Cd0, kbb, Aptr, 0, 0,
467 DBUFA, COLUMN, &WAC, WACd, &WACfr );
468/*
469* Zero upper triangle of WAC( 0:kbb-1, 0:kbb-1 )
470*/
471 if( conjg )
472 PB_Cplapad( TYPE, UPPER, CONJUG, kbb, kbb, zero, zero, WAC,
473 0, 0, WACd );
474 else if( kbb > 1 )
475 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC,
476 0, 1, WACd );
477/*
478* Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
479*/
480 PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, N, B, IB+k, JB, DESCB,
481 ROW, &Bptr, DBUFB, &Bfr );
482/*
483* Replicate B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over C( IC+k:IC+M-1, JC:JC+N-1 )
484*/
485 PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Cd0, kbb, Bptr, 0, 0, DBUFB,
486 ROW, &WBC, WBCd, &WBCfr );
487/*
488* C( IC+k:IC+M-1, JC:JC+N-1 ) += ALPHA * WAC * WBC
489*/
490 Cmp = PB_Cnumroc( ktmp, k, Cimb1, Cmb, myrow, Crow, nprow );
491 if( ( Cmp > 0 ) && ( Cnq0 > 0 ) )
492 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp, &Cnq0, &kbb,
493 ALPHA, WAC, &WACd[LLD_], WBC, &WBCd[LLD_], one,
494 Mptr( Cptr0, Cmp0-Cmp, 0, Cld, size ), &Cld );
495 if( WBCfr ) free( WBC );
496 if( Bfr ) free( Bptr );
497/*
498* Replicate WAC = A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over
499* B( IB+k:IB+M-1, JB:JB+N-1 ) -> WAB
500*/
501 Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb );
502 Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
503 PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
504 Bcol, ctxt, Bld );
505 PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, WAC, 0, 0, WACd,
506 COLUMN, &WAB, WABd, &WABfr );
507/*
508* Zero upper triangle of WAB( 0:kbb-1, 0:kbb-1 )
509*/
510 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, zero, WAB, 0, 0,
511 WABd );
512/*
513* WBC := ALPHA*A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 )'*B( IB+k:IB+M-1, JB:JB+N-1 )
514*/
515 PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WBC, WBCd, &WBCfr,
516 &WBCsum );
517 Bmp = PB_Cnumroc( ktmp, k, Bimb1, Bmb, myrow, Brow, nprow );
518 if( ( Bnq0 > 0 ) && ( Bmp > 0 ) )
519 gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0, &Bmp,
520 ALPHA, WAB, &WABd[LLD_], Mptr( Bptr0, Bmp0-Bmp, 0, Bld,
521 size ), &Bld, zero, WBC, &WBCd[LLD_] );
522 if( WABfr ) free( WAB );
523 if( WACfr ) free( WAC );
524 if( Afr ) free( Aptr );
525
526 if( WBCsum )
527 {
528 WBCd[RSRC_] = PB_Cindxg2p( ( fwd ? k : k + kbb - 1 ), Cimb1,
529 Cmb, Crow, Crow, nprow );
530 if( Bnq0 > 0 )
531 gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WBC, WBCd[LLD_],
532 WBCd[RSRC_], mycol );
533 }
534/*
535* C( IC+k:IC+k+kbb-1, JC:JC+N-1 ) := C( IC+k:IC+k+kbb-1, JC:JC+N-1 ) + WBC
536*/
537 PB_CScatterV( TYPE, &ScatterDir, kbb, N, WBC, 0, 0, WBCd, ROW, one,
538 C, IC+k, JC, DESCC, ROW );
539 if( WBCfr ) free( WBC );
540 }
541 }
542 }
543 else
544 {
545 top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
546
547 if( upper )
548 {
549 for( k = kstart; k != kend; k += kstep )
550 {
551 ktmp = An - k; kbb = MIN( ktmp, kb );
552/*
553* Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )
554*/
555 PB_CGatherV( TYPE, ALLOCATE, &GatherDir, kbb, ktmp, A, IA+k, JA+k,
556 DESCA, ROW, &Aptr, DBUFA, &Afr );
557/*
558* Replicate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over C( IC:IC+M-1, JC+k:JC+N-1 )
559*/
560 Ccurinb1 = PB_Cfirstnb( ktmp, JC+k, Cinb, Cnb );
561 Ccurcol = PB_Cindxg2p( k, Cinb1, Cnb, Ccol, Ccol, npcol );
562 PB_Cdescset( Cd0, M, ktmp, Cimb1, Ccurinb1, Cmb, Cnb, Crow, Ccurcol,
563 ctxt, Cld );
564 PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Cd0, kbb, Aptr, 0, 0, DBUFA,
565 ROW, &WAC, WACd, &WACfr );
566/*
567* Zero lower triangle of WAC( 0:kbb-1, 0:kbb-1 )
568*/
569 if( conjg )
570 PB_Cplapad( TYPE, LOWER, CONJUG, kbb, kbb, zero,
571 zero, WAC, 0, 0, WACd );
572 else if( kbb > 1 )
573 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero,
574 zero, WAC, 1, 0, WACd );
575/*
576* Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
577*/
578 PB_CGatherV( TYPE, REUSE, &GatherDir, M, kbb, B, IB, JB+k, DESCB,
579 COLUMN, &Bptr, DBUFB, &Bfr );
580/*
581* Replicate B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over C( IC:IC+M-1, JC+k:JC+N-1 )
582*/
583 PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Cd0, kbb, Bptr, 0, 0,
584 DBUFB, COLUMN, &WBC, WBCd, &WBCfr );
585/*
586* C( IC:IC+M-1, JC+k:JC+N-1 ) += ALPHA * WBC * WAC
587*/
588 Cnq = PB_Cnumroc( ktmp, k, Cinb1, Cnb, mycol, Ccol, npcol );
589 if( ( Cmp0 > 0 ) && ( Cnq > 0 ) )
590 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq, &kbb,
591 ALPHA, WBC, &WBCd[LLD_], WAC, &WACd[LLD_], one,
592 Mptr( Cptr0, 0, Cnq0-Cnq, Cld, size ), &Cld );
593 if( WBCfr ) free( WBC );
594 if( Bfr ) free( Bptr );
595/*
596* Replicate WAC = A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over
597* B( IB:IB+M-1, JB+k:JB+N-1 ) -> WAB
598*/
599 Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb );
600 Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
601 PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow, Bcurcol,
602 ctxt, Bld );
603 PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, WAC, 0, 0, WACd,
604 ROW, &WAB, WABd, &WABfr );
605/*
606* Zero lower triangle of WAB( 0:kbb-1, 0:kbb-1 )
607*/
608 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, zero, WAB, 0, 0,
609 WABd );
610/*
611* WBC := ALPHA*B( IB:IB+M-1, JB+k:JB+N-1 )*A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )'
612*/
613 PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WBC, WBCd,
614 &WBCfr, &WBCsum );
615 Bnq = PB_Cnumroc( ktmp, k, Binb1, Bnb, mycol, Bcol, npcol );
616 if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
617 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Bmp0, &kbb, &Bnq,
618 ALPHA, Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ), &Bld, WAB,
619 &WABd[LLD_], zero, WBC, &WBCd[LLD_] );
620 if( WABfr ) free( WAB );
621 if( WACfr ) free( WAC );
622 if( Afr ) free( Aptr );
623
624 if( WBCsum )
625 {
626 WBCd[CSRC_] = PB_Cindxg2p( ( fwd ? k : k + kbb - 1 ), Cinb1,
627 Cnb, Ccol, Ccol, npcol );
628 if( Bmp0 > 0 )
629 gsum2d( ctxt, ROW, &top, Bmp0, kbb, WBC, WBCd[LLD_], myrow,
630 WBCd[CSRC_] );
631 }
632/*
633* C( IC:IC+M-1, JC+k:JC+k+kbb-1 ) := C( IC:IC+M-1, JC+k:JC+k+kbb-1 ) + WBC
634*/
635 PB_CScatterV( TYPE, &ScatterDir, M, kbb, WBC, 0, 0, WBCd, COLUMN,
636 one, C, IC, JC+k, DESCC, COLUMN );
637 if( WBCfr ) free( WBC );
638 }
639 }
640 else
641 {
642 for( k = kstart; k != kend; k += kstep )
643 {
644 kbb = An - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
645/*
646* Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )
647*/
648 PB_CGatherV( TYPE, ALLOCATE, &GatherDir, kbb, ktmp, A, IA+k, JA,
649 DESCA, ROW, &Aptr, DBUFA, &Afr );
650/*
651* Replicate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over
652* C( IC:IC+M-1, JC:JC+k+kbb-1 ) -> WAC
653*/
654 PB_Cdescset( Cd0, M, ktmp, Cimb1, Cinb1, Cmb, Cnb, Crow, Ccol, ctxt,
655 Cld );
656 PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Cd0, kbb, Aptr, 0, 0, DBUFA,
657 ROW, &WAC, WACd, &WACfr );
658/*
659* Zero upper triangle of WAC( 0:kbb-1, k:k+kbb-1 )
660*/
661 if( conjg )
662 PB_Cplapad( TYPE, UPPER, CONJUG, kbb, kbb, zero, zero, WAC,
663 0, k, WACd );
664 else if( kbb > 1 )
665 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC,
666 0, k+1, WACd );
667/*
668* Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
669*/
670 PB_CGatherV( TYPE, REUSE, &GatherDir, M, kbb, B, IB, JB+k, DESCB,
671 COLUMN, &Bptr, DBUFB, &Bfr );
672/*
673* Replicate B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over C( IC:IC+M-1, JC:JC+k+kbb-1 )
674*/
675 PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Cd0, kbb, Bptr, 0, 0,
676 DBUFB, COLUMN, &WBC, WBCd, &WBCfr );
677/*
678* C( IC:IC+M-1, JC:JC+k+kbb-1 ) += ALPHA * WBC * WAC
679*/
680 Cnq = PB_Cnumroc( ktmp, 0, Cinb1, Cnb, mycol, Ccol, npcol );
681 if( ( Cmp0 > 0 ) && ( Cnq > 0 ) )
682 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq, &kbb,
683 ALPHA, WBC, &WBCd[LLD_], WAC, &WACd[LLD_], one, Cptr0,
684 &Cld );
685 if( WBCfr ) free( WBC );
686 if( Bfr ) free( Bptr );
687/*
688* Replicate WAC = A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over
689* B( IB:IB+M-1, JB:JB+k+kbb-1 ) -> WAB
690*/
691 PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt,
692 Bld );
693 PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, WAC, 0, 0, WACd,
694 ROW, &WAB, WABd, &WABfr );
695/*
696* Zero upper triangle of WAB( 0:kbb-1, k:k+kbb-1 )
697*/
698 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, zero, WAB, 0, k,
699 WABd );
700/*
701* WBC := ALPHA*B( IB:IB+M-1, JB:JB+k+kbb-1 )*A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )'
702*/
703 PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WBC, WBCd, &WBCfr,
704 &WBCsum );
705 Bnq = PB_Cnumroc( ktmp, 0, Binb1, Bnb, mycol, Bcol, npcol );
706 if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
707 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Bmp0, &kbb, &Bnq,
708 ALPHA, Bptr0, &Bld, WAB, &WABd[LLD_], zero, WBC,
709 &WBCd[LLD_] );
710 if( WABfr ) free( WAB );
711 if( WACfr ) free( WAC );
712 if( Afr ) free( Aptr );
713
714 if( WBCsum )
715 {
716 WBCd[CSRC_] = PB_Cindxg2p( ( fwd ? k : k + kbb - 1 ), Cinb1,
717 Cnb, Ccol, Ccol, npcol );
718 if( Bmp0 > 0 )
719 gsum2d( ctxt, ROW, &top, Bmp0, kbb, WBC, WBCd[LLD_], myrow,
720 WBCd[CSRC_] );
721 }
722/*
723* C( IC:IC+M-1, JC+k:JC+k+kbb-1 ) := C( IC:IC+M-1, JC+k:JC+k+kbb-1 ) + WBC
724*/
725 PB_CScatterV( TYPE, &ScatterDir, M, kbb, WBC, 0, 0, WBCd, COLUMN,
726 one, C, IC, JC+k, DESCC, COLUMN );
727 if( WBCfr ) free( WBC );
728 }
729 }
730 }
731/*
732* End of PB_CpsymmAB
733*/
734}
#define Int
Definition Bconfig.h:22
F_VOID_FCT(* GEMM_T)()
Definition pblas.h:317
void(* GSUM2D_T)()
Definition pblas.h:286
#define C2F_CHAR(a)
Definition pblas.h:125
#define TOP_GET
Definition PBblacs.h:50
#define COLUMN
Definition PBblacs.h:45
#define COMBINE
Definition PBblacs.h:49
#define ROW
Definition PBblacs.h:46
void Cblacs_gridinfo()
#define NOTRAN
Definition PBblas.h:44
#define CCONJG
Definition PBblas.h:21
#define ALL
Definition PBblas.h:50
#define CLEFT
Definition PBblas.h:29
#define CBACKWARD
Definition PBblas.h:39
#define NOCONJG
Definition PBblas.h:45
#define CUPPER
Definition PBblas.h:26
#define CTRAN
Definition PBblas.h:20
#define LOWER
Definition PBblas.h:51
#define REUSE
Definition PBblas.h:67
#define ALLOCATE
Definition PBblas.h:68
#define CCOTRAN
Definition PBblas.h:22
#define INIT
Definition PBblas.h:61
#define UPPER
Definition PBblas.h:52
#define CFORWARD
Definition PBblas.h:38
#define pilaenv_
Definition PBpblas.h:44
#define CTXT_
Definition PBtools.h:38
Int PB_Cfirstnb()
#define MB_
Definition PBtools.h:43
void PB_Cinfog2l()
#define MIN(a_, b_)
Definition PBtools.h:76
#define Mptr(a_, i_, j_, lda_, siz_)
Definition PBtools.h:132
#define LLD_
Definition PBtools.h:47
void PB_CGatherV()
Int PB_Cnumroc()
char * PB_Ctop()
void PB_CInV()
void PB_CScatterV()
void PB_Cplapad()
void PB_Cplascal()
#define RSRC_
Definition PBtools.h:45
void PB_Cdescset()
void PB_COutV()
#define INB_
Definition PBtools.h:42
void PB_CpsymmAB()
#define CSRC_
Definition PBtools.h:46
#define IMB_
Definition PBtools.h:41
Int PB_Cindxg2p()
#define Mupcase(C)
Definition PBtools.h:83
#define DLEN_
Definition PBtools.h:48
#define NB_
Definition PBtools.h:44
#define TYPE
Definition clamov.c:7