ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CpsymmAB.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18 
19 #ifdef __STDC__
20 void PB_CpsymmAB( PBTYP_T * TYPE, char * DIRECAB, char * CONJUG,
21  char * SIDE, char * UPLO, int M, int N, char * ALPHA,
22  char * A, int IA, int JA, int * DESCA, char * B,
23  int IB, int JB, int * DESCB, char * BETA, char * C,
24  int IC, int JC, int * DESCC )
25 #else
26 void PB_CpsymmAB( TYPE, DIRECAB, CONJUG, SIDE, UPLO, M, N, ALPHA, A, IA,
27  JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC )
28 /*
29 * .. Scalar Arguments ..
30 */
31  char * CONJUG, * DIRECAB, * SIDE, * UPLO;
32  int IA, IB, IC, JA, JB, JC, M, N;
33  char * ALPHA, * BETA;
34  PBTYP_T * TYPE;
35 /*
36 * .. Array Arguments ..
37 */
38  int * DESCA, * DESCB, * DESCC;
39  char * A, * B, * C;
40 #endif
41 {
42 /*
43 * Purpose
44 * =======
45 *
46 * PB_CpsymmAB performs one of the matrix-matrix operations
47 *
48 * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ),
49 *
50 * or
51 *
52 * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ),
53 *
54 * where
55 *
56 * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1),
57 *
58 * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L',
59 * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and,
60 *
61 * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1).
62 *
63 * Alpha and beta are scalars, sub( A ) is a symmetric or Hermitian
64 * submatrix and sub( B ) and sub( C ) are m by n submatrices.
65 *
66 * This is the outer-product algorithm using the logical aggregation
67 * blocking technique. The submatrix operand sub( C ) stays in place.
68 *
69 * Notes
70 * =====
71 *
72 * A description vector is associated with each 2D block-cyclicly dis-
73 * tributed matrix. This vector stores the information required to
74 * establish the mapping between a matrix entry and its corresponding
75 * process and memory location.
76 *
77 * In the following comments, the character _ should be read as
78 * "of the distributed matrix". Let A be a generic term for any 2D
79 * block cyclicly distributed matrix. Its description vector is DESC_A:
80 *
81 * NOTATION STORED IN EXPLANATION
82 * ---------------- --------------- ------------------------------------
83 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
84 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
85 * the NPROW x NPCOL BLACS process grid
86 * A is distributed over. The context
87 * itself is global, but the handle
88 * (the integer value) may vary.
89 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
90 * ted matrix A, M_A >= 0.
91 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
92 * buted matrix A, N_A >= 0.
93 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
94 * block of the matrix A, IMB_A > 0.
95 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
96 * left block of the matrix A,
97 * INB_A > 0.
98 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
99 * bute the last M_A-IMB_A rows of A,
100 * MB_A > 0.
101 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
102 * bute the last N_A-INB_A columns of
103 * A, NB_A > 0.
104 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
105 * row of the matrix A is distributed,
106 * NPROW > RSRC_A >= 0.
107 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
108 * first column of A is distributed.
109 * NPCOL > CSRC_A >= 0.
110 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
111 * array storing the local blocks of
112 * the distributed matrix A,
113 * IF( Lc( 1, N_A ) > 0 )
114 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
115 * ELSE
116 * LLD_A >= 1.
117 *
118 * Let K be the number of rows of a matrix A starting at the global in-
119 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
120 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
121 * receive if these K rows were distributed over NPROW processes. If K
122 * is the number of columns of a matrix A starting at the global index
123 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
124 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
125 * these K columns were distributed over NPCOL processes.
126 *
127 * The values of Lr() and Lc() may be determined via a call to the func-
128 * tion PB_Cnumroc:
129 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
130 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
131 *
132 * Arguments
133 * =========
134 *
135 * TYPE (local input) pointer to a PBTYP_T structure
136 * On entry, TYPE is a pointer to a structure of type PBTYP_T,
137 * that contains type information (See pblas.h).
138 *
139 * DIRECAB (global input) pointer to CHAR
140 * On entry, DIRECAB specifies the direction in which the rows
141 * or columns of sub( A ) and sub( B ) should be looped over as
142 * follows:
143 * DIRECAB = 'F' or 'f' forward or increasing,
144 * DIRECAB = 'B' or 'b' backward or decreasing.
145 *
146 * CONJUG (global input) pointer to CHAR
147 * On entry, CONJUG specifies whether sub( A ) is a symmetric or
148 * Hermitian submatrix operand as follows:
149 * CONJUG = 'N' or 'n' sub( A ) is symmetric,
150 * CONJUG = 'Z' or 'z' sub( A ) is Hermitian.
151 *
152 * SIDE (global input) pointer to CHAR
153 * On entry, SIDE specifies whether the symmetric or Hermitian
154 * submatrix sub( A ) appears on the left or right in the opera-
155 * tion as follows:
156 *
157 * SIDE = 'L' or 'l'
158 * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ),
159 *
160 * SIDE = 'R' or 'r'
161 * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ).
162 *
163 * UPLO (global input) pointer to CHAR
164 * On entry, UPLO specifies whether the local pieces of
165 * the array A containing the upper or lower triangular part
166 * of the submatrix sub( A ) are to be referenced as follows:
167 * UPLO = 'U' or 'u' Only the local pieces corresponding to
168 * the upper triangular part of the
169 * submatrix sub( A ) are referenced,
170 * UPLO = 'L' or 'l' Only the local pieces corresponding to
171 * the lower triangular part of the
172 * submatrix sub( A ) are referenced.
173 *
174 * M (global input) INTEGER
175 * On entry, M specifies the number of rows of the submatrix
176 * sub( C ). M must be at least zero.
177 *
178 * N (global input) INTEGER
179 * On entry, N specifies the number of columns of the submatrix
180 * sub( C ). N must be at least zero.
181 *
182 * ALPHA (global input) pointer to CHAR
183 * On entry, ALPHA specifies the scalar alpha. When ALPHA is
184 * supplied as zero then the local entries of the arrays A and
185 * B corresponding to the entries of the submatrices sub( A )
186 * and sub( B ) respectively need not be set on input.
187 *
188 * A (local input) pointer to CHAR
189 * On entry, A is an array of dimension (LLD_A, Ka), where Ka is
190 * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at
191 * at least Lc( 1, JA+N-1 ) otherwise. Before entry, this array
192 * contains the local entries of the matrix A.
193 * Before entry with SIDE = 'L' or 'l', this array contains
194 * the local entries corresponding to the entries of the m by m
195 * symmetric or Hermitian submatrix sub( A ), such that when
196 * UPLO = 'U' or 'u', this array contains the local entries of
197 * the upper triangular part of the submatrix sub( A ), and the
198 * local entries of the strictly lower triangular of sub( A )
199 * are not referenced, and when UPLO = 'L' or 'l', this array
200 * contains the local entries of the lower triangular part of
201 * the symmetric or Hermitian submatrix sub( A ), and the local
202 * entries of the strictly upper triangular of sub( A ) are not
203 * referenced.
204 * Before entry with SIDE = 'R' or 'r', this array contains
205 * the local entries corresponding to the entries of the n by n
206 * symmetric or Hermitian submatrix sub( A ), such that when
207 * UPLO = 'U' or 'u', this array contains the local entries of
208 * the upper triangular part of the submatrix sub( A ), and the
209 * local entries of the strictly lower triangular of sub( A )
210 * are not referenced, and when UPLO = 'L' or 'l', this array
211 * contains the local entries of the lower triangular part of
212 * the symmetric or Hermitian submatrix sub( A ), and the local
213 * entries of the strictly upper triangular of sub( A ) are not
214 * referenced.
215 * Note that the imaginary parts of the local entries corres-
216 * ponding to the diagonal elements of sub( A ) need not be
217 * set and assumed to be zero.
218 *
219 * IA (global input) INTEGER
220 * On entry, IA specifies A's global row index, which points to
221 * the beginning of the submatrix sub( A ).
222 *
223 * JA (global input) INTEGER
224 * On entry, JA specifies A's global column index, which points
225 * to the beginning of the submatrix sub( A ).
226 *
227 * DESCA (global and local input) INTEGER array
228 * On entry, DESCA is an integer array of dimension DLEN_. This
229 * is the array descriptor for the matrix A.
230 *
231 * B (local input) pointer to CHAR
232 * On entry, B is an array of dimension (LLD_B, Kb), where Kb is
233 * at least Lc( 1, JB+N-1 ). Before entry, this array contains
234 * the local entries of the matrix B.
235 *
236 * IB (global input) INTEGER
237 * On entry, IB specifies B's global row index, which points to
238 * the beginning of the submatrix sub( B ).
239 *
240 * JB (global input) INTEGER
241 * On entry, JB specifies B's global column index, which points
242 * to the beginning of the submatrix sub( B ).
243 *
244 * DESCB (global and local input) INTEGER array
245 * On entry, DESCB is an integer array of dimension DLEN_. This
246 * is the array descriptor for the matrix B.
247 *
248 * BETA (global input) pointer to CHAR
249 * On entry, BETA specifies the scalar beta. When BETA is
250 * supplied as zero then the local entries of the array C
251 * corresponding to the entries of the submatrix sub( C ) need
252 * not be set on input.
253 *
254 * C (local input/local output) pointer to CHAR
255 * On entry, C is an array of dimension (LLD_C, Kc), where Kc is
256 * at least Lc( 1, JC+N-1 ). Before entry, this array contains
257 * the local entries of the matrix C.
258 * On exit, the entries of this array corresponding to the local
259 * entries of the submatrix sub( C ) are overwritten by the
260 * local entries of the m by n updated submatrix.
261 *
262 * IC (global input) INTEGER
263 * On entry, IC specifies C's global row index, which points to
264 * the beginning of the submatrix sub( C ).
265 *
266 * JC (global input) INTEGER
267 * On entry, JC specifies C's global column index, which points
268 * to the beginning of the submatrix sub( C ).
269 *
270 * DESCC (global and local input) INTEGER array
271 * On entry, DESCC is an integer array of dimension DLEN_. This
272 * is the array descriptor for the matrix C.
273 *
274 * -- Written on April 1, 1998 by
275 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
276 *
277 * ---------------------------------------------------------------------
278 */
279 /*
280 * .. Local Scalars ..
281 */
282  char GatherDir, ScatterDir, * one, top, tran, * zero;
283  int Afr, An, Bcol, Bcurcol, Bcurimb1, Bcurinb1, Bcurrow, Bfr, Bii,
284  Bimb, Bimb1, Binb, Binb1, Bjj, Bld, Bmb, Bmp, Bmp0, Bnb, Bnq,
285  Bnq0, Brow, Ccol, Ccurcol, Ccurimb1, Ccurinb1, Ccurrow, Cii,
286  Cimb, Cimb1, Cinb, Cinb1, Cjj, Cld, Cmb, Cmp, Cmp0, Cnb, Cnq,
287  Cnq0, Crow, WABfr, WACfr, WBCfr, WBCsum, conjg, ctxt, fwd, k,
288  kb, kbb, kend, kstart, kstep, ktmp, lside, mycol, myrow,
289  npcol, nprow, size, upper;
290  GEMM_T gemm;
291  GSUM2D_T gsum2d;
292 /*
293 * .. Local Arrays ..
294 */
295  int Bd0 [DLEN_], Cd0 [DLEN_], DBUFA[DLEN_], DBUFB[DLEN_],
296  WABd[DLEN_], WACd[DLEN_], WBCd [DLEN_];
297  char * Aptr = NULL, * Bptr = NULL, * Bptr0 = NULL, * Cptr0 = NULL,
298  * WAB = NULL, * WAC = NULL, * WBC = NULL;
299 /* ..
300 * .. Executable Statements ..
301 *
302 */
303 /*
304 * sub( C ) = beta * sub( C )
305 */
306  PB_Cplascal( TYPE, ALL, NOCONJG, M, N, BETA, C, IC, JC, DESCC );
307 /*
308 * Retrieve process grid information
309 */
310  Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
311 
312  An = ( ( lside = ( Mupcase( SIDE[0] ) == CLEFT ) ) ? M : N );
313  upper = ( Mupcase( UPLO[0] ) == CUPPER );
314  tran = ( ( conjg = ( Mupcase( CONJUG[0] ) == CCONJG ) ) ? CCOTRAN : CTRAN );
315 
316  size = TYPE->size; one = TYPE->one; zero = TYPE->zero;
317  gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d;
318 /*
319 * Figure out the loop bounds accordingly to DIRECAB
320 */
321  kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
322  if( ( fwd = ( Mupcase( DIRECAB[0] ) == CFORWARD ) ) != 0 )
323  {
324  kstart = 0; kend = ( ( An - 1 ) / kb + 1 ) * kb; kstep = kb;
325  GatherDir = ScatterDir = CFORWARD;
326  }
327  else
328  {
329  kstart = ( ( An - 1 ) / kb ) * kb; kend = kstep = -kb;
330  GatherDir = ScatterDir = CBACKWARD;
331  }
332 /*
333 * Compute local information for sub( B ) and sub( C )
334 */
335  PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj,
336  &Brow, &Bcol );
337  Bimb = DESCB[IMB_]; Binb = DESCB[INB_];
338  Bmb = DESCB[MB_ ]; Bnb = DESCB[NB_ ]; Bld = DESCB[LLD_];
339  Bimb1 = PB_Cfirstnb( M, IB, Bimb, Bmb );
340  Bmp0 = PB_Cnumroc( M, 0, Bimb1, Bmb, myrow, Brow, nprow );
341  Binb1 = PB_Cfirstnb( N, JB, Binb, Bnb );
342  Bnq0 = PB_Cnumroc( N, 0, Binb1, Bnb, mycol, Bcol, npcol );
343  if( ( Bmp0 > 0 ) && ( Bnq0 > 0 ) ) Bptr0 = Mptr( B, Bii, Bjj, Bld, size );
344 
345  PB_Cinfog2l( IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj,
346  &Crow, &Ccol );
347  Cimb = DESCC[IMB_]; Cinb = DESCC[INB_];
348  Cmb = DESCC[MB_ ]; Cnb = DESCC[NB_ ]; Cld = DESCC[LLD_];
349  Cimb1 = PB_Cfirstnb( M, IC, Cimb, Cmb );
350  Cmp0 = PB_Cnumroc( M, 0, Cimb1, Cmb, myrow, Crow, nprow );
351  Cinb1 = PB_Cfirstnb( N, JC, Cinb, Cnb );
352  Cnq0 = PB_Cnumroc( N, 0, Cinb1, Cnb, mycol, Ccol, npcol );
353  if( ( Cmp0 > 0 ) && ( Cnq0 > 0 ) ) Cptr0 = Mptr( C, Cii, Cjj, Cld, size );
354 
355  if( lside )
356  {
357  top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
358 
359  if( upper )
360  {
361  for( k = kstart; k != kend; k += kstep )
362  {
363  kbb = An - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
364 /*
365 * Accumulate A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 )
366 */
367  PB_CGatherV( TYPE, ALLOCATE, &GatherDir, ktmp, kbb, A, IA, JA+k,
368  DESCA, COLUMN, &Aptr, DBUFA, &Afr );
369 /*
370 * Replicate A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) over
371 * C( IC:IC+k+kbb-1, JC:JC+N-1 ) -> WAC
372 */
373  PB_Cdescset( Cd0, ktmp, N, Cimb1, Cinb1, Cmb, Cnb, Crow, Ccol,
374  ctxt, Cld );
375  PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Cd0, kbb, Aptr, 0, 0,
376  DBUFA, COLUMN, &WAC, WACd, &WACfr );
377 /*
378 * Zero lower triangle of WAC( k:k+kbb-1, 0:kbb-1 )
379 */
380  if( conjg )
381  PB_Cplapad( TYPE, LOWER, CONJUG, kbb, kbb, zero,
382  zero, WAC, k, 0, WACd );
383  else if( kbb > 1 )
384  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero,
385  zero, WAC, k+1, 0, WACd );
386 /*
387 * Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
388 */
389  PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, N, B, IB+k, JB, DESCB,
390  ROW, &Bptr, DBUFB, &Bfr );
391 /*
392 * Replicate B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over C( IC:IC+k+kbb-1, JC:JC+N-1 )
393 */
394  PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Cd0, kbb, Bptr, 0, 0, DBUFB,
395  ROW, &WBC, WBCd, &WBCfr );
396 /*
397 * C( IC:IC+k+kbb-1, JC:JC+N-1 ) += ALPHA * WAC * WBC
398 */
399  Cmp = PB_Cnumroc( ktmp, 0, Cimb1, Cmb, myrow, Crow, nprow );
400  if( ( Cmp > 0 ) && ( Cnq0 > 0 ) )
401  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp, &Cnq0, &kbb,
402  ALPHA, WAC, &WACd[LLD_], WBC, &WBCd[LLD_], one, Cptr0,
403  &Cld );
404  if( WBCfr ) free( WBC );
405  if( Bfr ) free( Bptr );
406 /*
407 * Replicate WAC = A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) over
408 * B( IB:IB+k+kbb-1, JB:JB+N-1 ) -> WAB
409 */
410  PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
411  ctxt, Bld );
412  PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, WAC, 0, 0, WACd,
413  COLUMN, &WAB, WABd, &WABfr );
414 /*
415 * Zero lower triangle of WAB( k:k+kbb-1, 0:kbb-1 )
416 */
417  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, zero, WAB, k, 0,
418  WABd );
419 /*
420 * WBC := ALPHA*A(IA:IA+k+kbb-1, JA+k:JA+k+kbb-1)'*B( IB:IB+k+kbb-1, JB:JB+N-1 )
421 */
422  PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WBC, WBCd, &WBCfr,
423  &WBCsum );
424  Bmp = PB_Cnumroc( ktmp, 0, Bimb1, Bmb, myrow, Brow, nprow );
425  if( ( Bnq0 > 0 ) && ( Bmp > 0 ) )
426  gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0, &Bmp,
427  ALPHA, WAB, &WABd[LLD_], Bptr0, &Bld, zero, WBC,
428  &WBCd[LLD_] );
429  if( WABfr ) free( WAB );
430  if( WACfr ) free( WAC );
431  if( Afr ) free( Aptr );
432 
433  if( WBCsum )
434  {
435  WBCd[RSRC_] = PB_Cindxg2p( ( fwd ? k : k + kbb - 1 ), Cimb1,
436  Cmb, Crow, Crow, nprow );
437  if( Bnq0 > 0 )
438  gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WBC, WBCd[LLD_],
439  WBCd[RSRC_], mycol );
440  }
441 /*
442 * C( IC+k:IC+k+kbb-1, JC:JC+N-1 ) := C( IC+k:IC+k+kbb-1, JC:JC+N-1 ) + WBC
443 */
444  PB_CScatterV( TYPE, &ScatterDir, kbb, N, WBC, 0, 0, WBCd, ROW, one,
445  C, IC+k, JC, DESCC, ROW );
446  if( WBCfr ) free( WBC );
447  }
448  }
449  else
450  {
451  for( k = kstart; k != kend; k += kstep )
452  {
453  ktmp = An - k; kbb = MIN( ktmp, kb );
454 /*
455 * Accumulate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 )
456 */
457  PB_CGatherV( TYPE, ALLOCATE, &GatherDir, ktmp, kbb, A, IA+k, JA+k,
458  DESCA, COLUMN, &Aptr, DBUFA, &Afr );
459 /*
460 * Replicate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over C( IC+k:IC+M-1, JC:JC+N-1 )
461 */
462  Ccurimb1 = PB_Cfirstnb( ktmp, IC+k, Cimb, Cmb );
463  Ccurrow = PB_Cindxg2p( k, Cimb1, Cmb, Crow, Crow, nprow );
464  PB_Cdescset( Cd0, ktmp, N, Ccurimb1, Cinb1, Cmb, Cnb, Ccurrow,
465  Ccol, ctxt, Cld );
466  PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Cd0, kbb, Aptr, 0, 0,
467  DBUFA, COLUMN, &WAC, WACd, &WACfr );
468 /*
469 * Zero upper triangle of WAC( 0:kbb-1, 0:kbb-1 )
470 */
471  if( conjg )
472  PB_Cplapad( TYPE, UPPER, CONJUG, kbb, kbb, zero, zero, WAC,
473  0, 0, WACd );
474  else if( kbb > 1 )
475  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC,
476  0, 1, WACd );
477 /*
478 * Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
479 */
480  PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, N, B, IB+k, JB, DESCB,
481  ROW, &Bptr, DBUFB, &Bfr );
482 /*
483 * Replicate B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over C( IC+k:IC+M-1, JC:JC+N-1 )
484 */
485  PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Cd0, kbb, Bptr, 0, 0, DBUFB,
486  ROW, &WBC, WBCd, &WBCfr );
487 /*
488 * C( IC+k:IC+M-1, JC:JC+N-1 ) += ALPHA * WAC * WBC
489 */
490  Cmp = PB_Cnumroc( ktmp, k, Cimb1, Cmb, myrow, Crow, nprow );
491  if( ( Cmp > 0 ) && ( Cnq0 > 0 ) )
492  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp, &Cnq0, &kbb,
493  ALPHA, WAC, &WACd[LLD_], WBC, &WBCd[LLD_], one,
494  Mptr( Cptr0, Cmp0-Cmp, 0, Cld, size ), &Cld );
495  if( WBCfr ) free( WBC );
496  if( Bfr ) free( Bptr );
497 /*
498 * Replicate WAC = A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over
499 * B( IB+k:IB+M-1, JB:JB+N-1 ) -> WAB
500 */
501  Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb );
502  Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
503  PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
504  Bcol, ctxt, Bld );
505  PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, WAC, 0, 0, WACd,
506  COLUMN, &WAB, WABd, &WABfr );
507 /*
508 * Zero upper triangle of WAB( 0:kbb-1, 0:kbb-1 )
509 */
510  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, zero, WAB, 0, 0,
511  WABd );
512 /*
513 * WBC := ALPHA*A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 )'*B( IB+k:IB+M-1, JB:JB+N-1 )
514 */
515  PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WBC, WBCd, &WBCfr,
516  &WBCsum );
517  Bmp = PB_Cnumroc( ktmp, k, Bimb1, Bmb, myrow, Brow, nprow );
518  if( ( Bnq0 > 0 ) && ( Bmp > 0 ) )
519  gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0, &Bmp,
520  ALPHA, WAB, &WABd[LLD_], Mptr( Bptr0, Bmp0-Bmp, 0, Bld,
521  size ), &Bld, zero, WBC, &WBCd[LLD_] );
522  if( WABfr ) free( WAB );
523  if( WACfr ) free( WAC );
524  if( Afr ) free( Aptr );
525 
526  if( WBCsum )
527  {
528  WBCd[RSRC_] = PB_Cindxg2p( ( fwd ? k : k + kbb - 1 ), Cimb1,
529  Cmb, Crow, Crow, nprow );
530  if( Bnq0 > 0 )
531  gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WBC, WBCd[LLD_],
532  WBCd[RSRC_], mycol );
533  }
534 /*
535 * C( IC+k:IC+k+kbb-1, JC:JC+N-1 ) := C( IC+k:IC+k+kbb-1, JC:JC+N-1 ) + WBC
536 */
537  PB_CScatterV( TYPE, &ScatterDir, kbb, N, WBC, 0, 0, WBCd, ROW, one,
538  C, IC+k, JC, DESCC, ROW );
539  if( WBCfr ) free( WBC );
540  }
541  }
542  }
543  else
544  {
545  top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
546 
547  if( upper )
548  {
549  for( k = kstart; k != kend; k += kstep )
550  {
551  ktmp = An - k; kbb = MIN( ktmp, kb );
552 /*
553 * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )
554 */
555  PB_CGatherV( TYPE, ALLOCATE, &GatherDir, kbb, ktmp, A, IA+k, JA+k,
556  DESCA, ROW, &Aptr, DBUFA, &Afr );
557 /*
558 * Replicate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over C( IC:IC+M-1, JC+k:JC+N-1 )
559 */
560  Ccurinb1 = PB_Cfirstnb( ktmp, JC+k, Cinb, Cnb );
561  Ccurcol = PB_Cindxg2p( k, Cinb1, Cnb, Ccol, Ccol, npcol );
562  PB_Cdescset( Cd0, M, ktmp, Cimb1, Ccurinb1, Cmb, Cnb, Crow, Ccurcol,
563  ctxt, Cld );
564  PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Cd0, kbb, Aptr, 0, 0, DBUFA,
565  ROW, &WAC, WACd, &WACfr );
566 /*
567 * Zero lower triangle of WAC( 0:kbb-1, 0:kbb-1 )
568 */
569  if( conjg )
570  PB_Cplapad( TYPE, LOWER, CONJUG, kbb, kbb, zero,
571  zero, WAC, 0, 0, WACd );
572  else if( kbb > 1 )
573  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero,
574  zero, WAC, 1, 0, WACd );
575 /*
576 * Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
577 */
578  PB_CGatherV( TYPE, REUSE, &GatherDir, M, kbb, B, IB, JB+k, DESCB,
579  COLUMN, &Bptr, DBUFB, &Bfr );
580 /*
581 * Replicate B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over C( IC:IC+M-1, JC+k:JC+N-1 )
582 */
583  PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Cd0, kbb, Bptr, 0, 0,
584  DBUFB, COLUMN, &WBC, WBCd, &WBCfr );
585 /*
586 * C( IC:IC+M-1, JC+k:JC+N-1 ) += ALPHA * WBC * WAC
587 */
588  Cnq = PB_Cnumroc( ktmp, k, Cinb1, Cnb, mycol, Ccol, npcol );
589  if( ( Cmp0 > 0 ) && ( Cnq > 0 ) )
590  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq, &kbb,
591  ALPHA, WBC, &WBCd[LLD_], WAC, &WACd[LLD_], one,
592  Mptr( Cptr0, 0, Cnq0-Cnq, Cld, size ), &Cld );
593  if( WBCfr ) free( WBC );
594  if( Bfr ) free( Bptr );
595 /*
596 * Replicate WAC = A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over
597 * B( IB:IB+M-1, JB+k:JB+N-1 ) -> WAB
598 */
599  Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb );
600  Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
601  PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow, Bcurcol,
602  ctxt, Bld );
603  PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, WAC, 0, 0, WACd,
604  ROW, &WAB, WABd, &WABfr );
605 /*
606 * Zero lower triangle of WAB( 0:kbb-1, 0:kbb-1 )
607 */
608  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, zero, WAB, 0, 0,
609  WABd );
610 /*
611 * WBC := ALPHA*B( IB:IB+M-1, JB+k:JB+N-1 )*A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )'
612 */
613  PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WBC, WBCd,
614  &WBCfr, &WBCsum );
615  Bnq = PB_Cnumroc( ktmp, k, Binb1, Bnb, mycol, Bcol, npcol );
616  if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
617  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Bmp0, &kbb, &Bnq,
618  ALPHA, Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ), &Bld, WAB,
619  &WABd[LLD_], zero, WBC, &WBCd[LLD_] );
620  if( WABfr ) free( WAB );
621  if( WACfr ) free( WAC );
622  if( Afr ) free( Aptr );
623 
624  if( WBCsum )
625  {
626  WBCd[CSRC_] = PB_Cindxg2p( ( fwd ? k : k + kbb - 1 ), Cinb1,
627  Cnb, Ccol, Ccol, npcol );
628  if( Bmp0 > 0 )
629  gsum2d( ctxt, ROW, &top, Bmp0, kbb, WBC, WBCd[LLD_], myrow,
630  WBCd[CSRC_] );
631  }
632 /*
633 * C( IC:IC+M-1, JC+k:JC+k+kbb-1 ) := C( IC:IC+M-1, JC+k:JC+k+kbb-1 ) + WBC
634 */
635  PB_CScatterV( TYPE, &ScatterDir, M, kbb, WBC, 0, 0, WBCd, COLUMN,
636  one, C, IC, JC+k, DESCC, COLUMN );
637  if( WBCfr ) free( WBC );
638  }
639  }
640  else
641  {
642  for( k = kstart; k != kend; k += kstep )
643  {
644  kbb = An - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
645 /*
646 * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )
647 */
648  PB_CGatherV( TYPE, ALLOCATE, &GatherDir, kbb, ktmp, A, IA+k, JA,
649  DESCA, ROW, &Aptr, DBUFA, &Afr );
650 /*
651 * Replicate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over
652 * C( IC:IC+M-1, JC:JC+k+kbb-1 ) -> WAC
653 */
654  PB_Cdescset( Cd0, M, ktmp, Cimb1, Cinb1, Cmb, Cnb, Crow, Ccol, ctxt,
655  Cld );
656  PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Cd0, kbb, Aptr, 0, 0, DBUFA,
657  ROW, &WAC, WACd, &WACfr );
658 /*
659 * Zero upper triangle of WAC( 0:kbb-1, k:k+kbb-1 )
660 */
661  if( conjg )
662  PB_Cplapad( TYPE, UPPER, CONJUG, kbb, kbb, zero, zero, WAC,
663  0, k, WACd );
664  else if( kbb > 1 )
665  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC,
666  0, k+1, WACd );
667 /*
668 * Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
669 */
670  PB_CGatherV( TYPE, REUSE, &GatherDir, M, kbb, B, IB, JB+k, DESCB,
671  COLUMN, &Bptr, DBUFB, &Bfr );
672 /*
673 * Replicate B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over C( IC:IC+M-1, JC:JC+k+kbb-1 )
674 */
675  PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Cd0, kbb, Bptr, 0, 0,
676  DBUFB, COLUMN, &WBC, WBCd, &WBCfr );
677 /*
678 * C( IC:IC+M-1, JC:JC+k+kbb-1 ) += ALPHA * WBC * WAC
679 */
680  Cnq = PB_Cnumroc( ktmp, 0, Cinb1, Cnb, mycol, Ccol, npcol );
681  if( ( Cmp0 > 0 ) && ( Cnq > 0 ) )
682  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq, &kbb,
683  ALPHA, WBC, &WBCd[LLD_], WAC, &WACd[LLD_], one, Cptr0,
684  &Cld );
685  if( WBCfr ) free( WBC );
686  if( Bfr ) free( Bptr );
687 /*
688 * Replicate WAC = A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over
689 * B( IB:IB+M-1, JB:JB+k+kbb-1 ) -> WAB
690 */
691  PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt,
692  Bld );
693  PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, WAC, 0, 0, WACd,
694  ROW, &WAB, WABd, &WABfr );
695 /*
696 * Zero upper triangle of WAB( 0:kbb-1, k:k+kbb-1 )
697 */
698  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, zero, WAB, 0, k,
699  WABd );
700 /*
701 * WBC := ALPHA*B( IB:IB+M-1, JB:JB+k+kbb-1 )*A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )'
702 */
703  PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WBC, WBCd, &WBCfr,
704  &WBCsum );
705  Bnq = PB_Cnumroc( ktmp, 0, Binb1, Bnb, mycol, Bcol, npcol );
706  if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
707  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Bmp0, &kbb, &Bnq,
708  ALPHA, Bptr0, &Bld, WAB, &WABd[LLD_], zero, WBC,
709  &WBCd[LLD_] );
710  if( WABfr ) free( WAB );
711  if( WACfr ) free( WAC );
712  if( Afr ) free( Aptr );
713 
714  if( WBCsum )
715  {
716  WBCd[CSRC_] = PB_Cindxg2p( ( fwd ? k : k + kbb - 1 ), Cinb1,
717  Cnb, Ccol, Ccol, npcol );
718  if( Bmp0 > 0 )
719  gsum2d( ctxt, ROW, &top, Bmp0, kbb, WBC, WBCd[LLD_], myrow,
720  WBCd[CSRC_] );
721  }
722 /*
723 * C( IC:IC+M-1, JC+k:JC+k+kbb-1 ) := C( IC:IC+M-1, JC+k:JC+k+kbb-1 ) + WBC
724 */
725  PB_CScatterV( TYPE, &ScatterDir, M, kbb, WBC, 0, 0, WBCd, COLUMN,
726  one, C, IC, JC+k, DESCC, COLUMN );
727  if( WBCfr ) free( WBC );
728  }
729  }
730  }
731 /*
732 * End of PB_CpsymmAB
733 */
734 }
CCONJG
#define CCONJG
Definition: PBblas.h:21
TYPE
#define TYPE
Definition: clamov.c:7
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
ALLOCATE
#define ALLOCATE
Definition: PBblas.h:68
PB_CScatterV
void PB_CScatterV()
CCOTRAN
#define CCOTRAN
Definition: PBblas.h:22
NOCONJG
#define NOCONJG
Definition: PBblas.h:45
PB_Cfirstnb
int PB_Cfirstnb()
DLEN_
#define DLEN_
Definition: PBtools.h:48
GEMM_T
F_VOID_FCT(* GEMM_T)()
Definition: pblas.h:313
NOTRAN
#define NOTRAN
Definition: PBblas.h:44
LLD_
#define LLD_
Definition: PBtools.h:47
UPPER
#define UPPER
Definition: PBblas.h:52
IMB_
#define IMB_
Definition: PBtools.h:41
pilaenv_
int pilaenv_()
PB_Cplascal
void PB_Cplascal()
PB_CGatherV
void PB_CGatherV()
INIT
#define INIT
Definition: PBblas.h:61
PB_Cdescset
void PB_Cdescset()
PB_Cplapad
void PB_Cplapad()
PB_Cindxg2p
int PB_Cindxg2p()
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
GSUM2D_T
void(* GSUM2D_T)()
Definition: pblas.h:282
PB_Ctop
char * PB_Ctop()
RSRC_
#define RSRC_
Definition: PBtools.h:45
COMBINE
#define COMBINE
Definition: PBblacs.h:49
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cnumroc
int PB_Cnumroc()
CFORWARD
#define CFORWARD
Definition: PBblas.h:38
PB_CInV
void PB_CInV()
ALL
#define ALL
Definition: PBblas.h:50
MIN
#define MIN(a_, b_)
Definition: PBtools.h:76
PB_CpsymmAB
void PB_CpsymmAB(PBTYP_T *TYPE, char *DIRECAB, char *CONJUG, char *SIDE, char *UPLO, int M, int N, char *ALPHA, char *A, int IA, int JA, int *DESCA, char *B, int IB, int JB, int *DESCB, char *BETA, char *C, int IC, int JC, int *DESCC)
Definition: PB_CpsymmAB.c:26
INB_
#define INB_
Definition: PBtools.h:42
LOWER
#define LOWER
Definition: PBblas.h:51
PB_COutV
void PB_COutV()
REUSE
#define REUSE
Definition: PBblas.h:67
C2F_CHAR
#define C2F_CHAR(a)
Definition: pblas.h:121
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mupcase
#define Mupcase(C)
Definition: PBtools.h:83
CUPPER
#define CUPPER
Definition: PBblas.h:26
CTRAN
#define CTRAN
Definition: PBblas.h:20
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CLEFT
#define CLEFT
Definition: PBblas.h:29
CBACKWARD
#define CBACKWARD
Definition: PBblas.h:39
CTXT_
#define CTXT_
Definition: PBtools.h:38