ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CpsymmBC.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18 
19 #ifdef __STDC__
20 void PB_CpsymmBC( PBTYP_T * TYPE, char * DIRECAB, char * CONJUG,
21  char * SIDE, char * UPLO, int M, int N, char * ALPHA,
22  char * A, int IA, int JA, int * DESCA, char * B,
23  int IB, int JB, int * DESCB, char * BETA, char * C,
24  int IC, int JC, int * DESCC )
25 #else
26 void PB_CpsymmBC( TYPE, DIRECAB, CONJUG, SIDE, UPLO, M, N, ALPHA, A, IA,
27  JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC )
28 /*
29 * .. Scalar Arguments ..
30 */
31  char * CONJUG, * DIRECAB, * SIDE, * UPLO;
32  int IA, IB, IC, JA, JB, JC, M, N;
33  char * ALPHA, * BETA;
34  PBTYP_T * TYPE;
35 /*
36 * .. Array Arguments ..
37 */
38  int * DESCA, * DESCB, * DESCC;
39  char * A, * B, * C;
40 #endif
41 {
42 /*
43 * Purpose
44 * =======
45 *
46 * PB_CpsymmBC performs one of the matrix-matrix operations
47 *
48 * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ),
49 *
50 * or
51 *
52 * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ),
53 *
54 * where
55 *
56 * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1),
57 *
58 * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L',
59 * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and,
60 *
61 * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1).
62 *
63 * Alpha and beta are scalars, sub( A ) is a symmetric or Hermitian
64 * submatrix and sub( B ) and sub( C ) are m by n submatrices.
65 *
66 * This is the inner-product algorithm using the logical LCM hybrid
67 * and static blocking techniques. The submatrix operand sub( A ) stays
68 * in place.
69 *
70 * Notes
71 * =====
72 *
73 * A description vector is associated with each 2D block-cyclicly dis-
74 * tributed matrix. This vector stores the information required to
75 * establish the mapping between a matrix entry and its corresponding
76 * process and memory location.
77 *
78 * In the following comments, the character _ should be read as
79 * "of the distributed matrix". Let A be a generic term for any 2D
80 * block cyclicly distributed matrix. Its description vector is DESC_A:
81 *
82 * NOTATION STORED IN EXPLANATION
83 * ---------------- --------------- ------------------------------------
84 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
85 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
86 * the NPROW x NPCOL BLACS process grid
87 * A is distributed over. The context
88 * itself is global, but the handle
89 * (the integer value) may vary.
90 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
91 * ted matrix A, M_A >= 0.
92 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
93 * buted matrix A, N_A >= 0.
94 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
95 * block of the matrix A, IMB_A > 0.
96 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
97 * left block of the matrix A,
98 * INB_A > 0.
99 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
100 * bute the last M_A-IMB_A rows of A,
101 * MB_A > 0.
102 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
103 * bute the last N_A-INB_A columns of
104 * A, NB_A > 0.
105 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
106 * row of the matrix A is distributed,
107 * NPROW > RSRC_A >= 0.
108 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
109 * first column of A is distributed.
110 * NPCOL > CSRC_A >= 0.
111 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
112 * array storing the local blocks of
113 * the distributed matrix A,
114 * IF( Lc( 1, N_A ) > 0 )
115 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
116 * ELSE
117 * LLD_A >= 1.
118 *
119 * Let K be the number of rows of a matrix A starting at the global in-
120 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
121 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
122 * receive if these K rows were distributed over NPROW processes. If K
123 * is the number of columns of a matrix A starting at the global index
124 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
125 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
126 * these K columns were distributed over NPCOL processes.
127 *
128 * The values of Lr() and Lc() may be determined via a call to the func-
129 * tion PB_Cnumroc:
130 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
131 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
132 *
133 * Arguments
134 * =========
135 *
136 * TYPE (local input) pointer to a PBTYP_T structure
137 * On entry, TYPE is a pointer to a structure of type PBTYP_T,
138 * that contains type information (See pblas.h).
139 *
140 * DIRECAB (global input) pointer to CHAR
141 * On entry, DIRECAB specifies the direction in which the rows
142 * or columns of sub( B ) should be looped over as follows:
143 * DIRECAB = 'F' or 'f' forward or increasing,
144 * DIRECAB = 'B' or 'b' backward or decreasing.
145 *
146 * CONJUG (global input) pointer to CHAR
147 * On entry, CONJUG specifies whether sub( A ) is a symmetric or
148 * Hermitian submatrix operand as follows:
149 * CONJUG = 'N' or 'n' sub( A ) is symmetric,
150 * CONJUG = 'Z' or 'z' sub( A ) is Hermitian.
151 *
152 * SIDE (global input) pointer to CHAR
153 * On entry, SIDE specifies whether the symmetric or Hermitian
154 * submatrix sub( A ) appears on the left or right in the opera-
155 * tion as follows:
156 *
157 * SIDE = 'L' or 'l'
158 * sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ),
159 *
160 * SIDE = 'R' or 'r'
161 * sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ).
162 *
163 * UPLO (global input) pointer to CHAR
164 * On entry, UPLO specifies whether the local pieces of
165 * the array A containing the upper or lower triangular part
166 * of the submatrix sub( A ) are to be referenced as follows:
167 * UPLO = 'U' or 'u' Only the local pieces corresponding to
168 * the upper triangular part of the
169 * submatrix sub( A ) are referenced,
170 * UPLO = 'L' or 'l' Only the local pieces corresponding to
171 * the lower triangular part of the
172 * submatrix sub( A ) are referenced.
173 *
174 * M (global input) INTEGER
175 * On entry, M specifies the number of rows of the submatrix
176 * sub( C ). M must be at least zero.
177 *
178 * N (global input) INTEGER
179 * On entry, N specifies the number of columns of the submatrix
180 * sub( C ). N must be at least zero.
181 *
182 * ALPHA (global input) pointer to CHAR
183 * On entry, ALPHA specifies the scalar alpha. When ALPHA is
184 * supplied as zero then the local entries of the arrays A and
185 * B corresponding to the entries of the submatrices sub( A )
186 * and sub( B ) respectively need not be set on input.
187 *
188 * A (local input) pointer to CHAR
189 * On entry, A is an array of dimension (LLD_A, Ka), where Ka is
190 * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at
191 * at least Lc( 1, JA+N-1 ) otherwise. Before entry, this array
192 * contains the local entries of the matrix A.
193 * Before entry with SIDE = 'L' or 'l', this array contains
194 * the local entries corresponding to the entries of the m by m
195 * symmetric or Hermitian submatrix sub( A ), such that when
196 * UPLO = 'U' or 'u', this array contains the local entries of
197 * the upper triangular part of the submatrix sub( A ), and the
198 * local entries of the strictly lower triangular of sub( A )
199 * are not referenced, and when UPLO = 'L' or 'l', this array
200 * contains the local entries of the lower triangular part of
201 * the symmetric or Hermitian submatrix sub( A ), and the local
202 * entries of the strictly upper triangular of sub( A ) are not
203 * referenced.
204 * Before entry with SIDE = 'R' or 'r', this array contains
205 * the local entries corresponding to the entries of the n by n
206 * symmetric or Hermitian submatrix sub( A ), such that when
207 * UPLO = 'U' or 'u', this array contains the local entries of
208 * the upper triangular part of the submatrix sub( A ), and the
209 * local entries of the strictly lower triangular of sub( A )
210 * are not referenced, and when UPLO = 'L' or 'l', this array
211 * contains the local entries of the lower triangular part of
212 * the symmetric or Hermitian submatrix sub( A ), and the local
213 * entries of the strictly upper triangular of sub( A ) are not
214 * referenced.
215 * Note that the imaginary parts of the local entries corres-
216 * ponding to the diagonal elements of sub( A ) need not be
217 * set and assumed to be zero.
218 *
219 * IA (global input) INTEGER
220 * On entry, IA specifies A's global row index, which points to
221 * the beginning of the submatrix sub( A ).
222 *
223 * JA (global input) INTEGER
224 * On entry, JA specifies A's global column index, which points
225 * to the beginning of the submatrix sub( A ).
226 *
227 * DESCA (global and local input) INTEGER array
228 * On entry, DESCA is an integer array of dimension DLEN_. This
229 * is the array descriptor for the matrix A.
230 *
231 * B (local input) pointer to CHAR
232 * On entry, B is an array of dimension (LLD_B, Kb), where Kb is
233 * at least Lc( 1, JB+N-1 ). Before entry, this array contains
234 * the local entries of the matrix B.
235 *
236 * IB (global input) INTEGER
237 * On entry, IB specifies B's global row index, which points to
238 * the beginning of the submatrix sub( B ).
239 *
240 * JB (global input) INTEGER
241 * On entry, JB specifies B's global column index, which points
242 * to the beginning of the submatrix sub( B ).
243 *
244 * DESCB (global and local input) INTEGER array
245 * On entry, DESCB is an integer array of dimension DLEN_. This
246 * is the array descriptor for the matrix B.
247 *
248 * BETA (global input) pointer to CHAR
249 * On entry, BETA specifies the scalar beta. When BETA is
250 * supplied as zero then the local entries of the array C
251 * corresponding to the entries of the submatrix sub( C ) need
252 * not be set on input.
253 *
254 * C (local input/local output) pointer to CHAR
255 * On entry, C is an array of dimension (LLD_C, Kc), where Kc is
256 * at least Lc( 1, JC+N-1 ). Before entry, this array contains
257 * the local entries of the matrix C.
258 * On exit, the entries of this array corresponding to the local
259 * entries of the submatrix sub( C ) are overwritten by the
260 * local entries of the m by n updated submatrix.
261 *
262 * IC (global input) INTEGER
263 * On entry, IC specifies C's global row index, which points to
264 * the beginning of the submatrix sub( C ).
265 *
266 * JC (global input) INTEGER
267 * On entry, JC specifies C's global column index, which points
268 * to the beginning of the submatrix sub( C ).
269 *
270 * DESCC (global and local input) INTEGER array
271 * On entry, DESCC is an integer array of dimension DLEN_. This
272 * is the array descriptor for the matrix C.
273 *
274 * -- Written on April 1, 1998 by
275 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
276 *
277 * ---------------------------------------------------------------------
278 */
279 /*
280 * .. Local Scalars ..
281 */
282  char GemmTa, GemmTb, cctop, * one, rctop, * talphaCR, * talphaRC,
283  * tbeta, * zero;
284  int Acol, Aii, Aimb1, Ainb1, Ajj, Alcmb, Ald, Alp, Alp0, Alq,
285  Alq0, Amb, Amp, An, Anb, Anq, Arow, BCfwd, BCmyprocD,
286  BCmyprocR, BCnD, BCnR, BCnprocsD, BCnprocsR, Bbufld, BcurrocR,
287  Bfr, BiD, BiR, BiiD, BiiR, BinbD, BinbR, Binb1D, Binb1R, BisR,
288  Bkk, Bld, BnbD, BnbR, BnpD, BnpR, Boff, BrocD, BrocR, BsrcR,
289  Cfr, CiD, CiR, CiiD, CiiR, CinbD, CinbR, Cinb1D, Cinb1R, Ckk,
290  CnbD, CnbR, CnpD, CnpR, Coff, CrocD, CrocR, CsrcR, Cbufld,
291  CcurrocR, CisR, Cld, WBCfr, WBCld, WBRfr, WBRld, WCCfr, WCCld,
292  WCCsum, WCRfr, WCRld, WCRsum, conjg, ctxt, l, lb, lcmb, lside,
293  ltmp, maxp, maxpm1, maxq, mycol, myrow, n, nb, nbb, ncpq,
294  npcol, npq=0, nprow, nrpq, p=0, q=0, size, tmp, upper;
295  TZSYM_T tzsymm;
296  GEMM_T gemm;
297  GSUM2D_T gsum2d;
298 /*
299 * .. Local Arrays ..
300 */
301  PB_VM_T VM;
302  int Ad0 [DLEN_], DBUFB[DLEN_], DBUFC[DLEN_], WBCd[DLEN_],
303  WBRd[DLEN_], WCCd [DLEN_], WCRd [DLEN_];
304  char * Aptr = NULL, * Bbuf = NULL, * Cbuf = NULL, * WBC = NULL,
305  * WBR = NULL, * WCC = NULL, * WCR = NULL;
306 /* ..
307 * .. Executable Statements ..
308 *
309 */
310  Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
311 
312  BCfwd = ( Mupcase( DIRECAB[0] ) == CFORWARD );
313  conjg = ( Mupcase( CONJUG [0] ) == CCONJG );
314  lside = ( Mupcase( SIDE [0] ) == CLEFT );
315  upper = ( Mupcase( UPLO [0] ) == CUPPER );
316 
317  size = TYPE->size; one = TYPE->one; zero = TYPE->zero;
318  gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d;
319  nb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
320 /*
321 * Compute local information for sub( A ), sub( B ) and sub( C )
322 */
323  if( lside )
324  {
325  BCnD = An = M; BCnR = N;
326  BCmyprocD = myrow; BCnprocsD = nprow;
327  BCmyprocR = mycol; BCnprocsR = npcol;
328 
329  BiD = IB; BiR = JB;
330  BinbD = DESCB[IMB_ ]; BinbR = DESCB[INB_];
331  BnbD = DESCB[MB_ ]; BnbR = DESCB[NB_ ];
332  BsrcR = DESCB[CSRC_]; Bld = DESCB[LLD_];
333  PB_Cinfog2l( IB, JB, DESCB, BCnprocsD, BCnprocsR, BCmyprocD, BCmyprocR,
334  &BiiD, &BiiR, &BrocD, &BrocR );
335 
336  CiD = IC; CiR = JC;
337  CinbD = DESCC[IMB_ ]; CinbR = DESCC[INB_];
338  CnbD = DESCC[MB_ ]; CnbR = DESCC[NB_ ];
339  CsrcR = DESCC[CSRC_]; Cld = DESCC[LLD_];
340  PB_Cinfog2l( IC, JC, DESCC, BCnprocsD, BCnprocsR, BCmyprocD, BCmyprocR,
341  &CiiD, &CiiR, &CrocD, &CrocR );
342  }
343  else
344  {
345  BCnD = An = N; BCnR = M;
346  BCmyprocD = mycol; BCnprocsD = npcol;
347  BCmyprocR = myrow; BCnprocsR = nprow;
348 
349  BiD = JB; BiR = IB;
350  BinbR = DESCB[IMB_ ]; BinbD = DESCB[INB_];
351  BnbR = DESCB[MB_ ]; BnbD = DESCB[NB_ ];
352  BsrcR = DESCB[RSRC_]; Bld = DESCB[LLD_];
353  PB_Cinfog2l( IB, JB, DESCB, BCnprocsR, BCnprocsD, BCmyprocR, BCmyprocD,
354  &BiiR, &BiiD, &BrocR, &BrocD );
355 
356  CiD = JC; CiR = IC;
357  CinbR = DESCC[IMB_ ]; CinbD = DESCC[INB_];
358  CnbR = DESCC[MB_ ]; CnbD = DESCC[NB_ ];
359  CsrcR = DESCC[RSRC_]; Cld = DESCC[LLD_];
360  PB_Cinfog2l( IC, JC, DESCC, BCnprocsR, BCnprocsD, BCmyprocR, BCmyprocD,
361  &CiiR, &CiiD, &CrocR, &CrocD );
362  }
363 
364  Binb1D = PB_Cfirstnb( BCnD, BiD, BinbD, BnbD );
365  BnpD = PB_Cnumroc( BCnD, 0, Binb1D, BnbD, BCmyprocD, BrocD, BCnprocsD );
366  Binb1R = PB_Cfirstnb( BCnR, BiR, BinbR, BnbR );
367  BisR = ( ( BsrcR < 0 ) || ( BCnprocsR == 1 ) );
368 
369  Cinb1D = PB_Cfirstnb( BCnD, CiD, CinbD, CnbD );
370  CnpD = PB_Cnumroc( BCnD, 0, Cinb1D, CnbD, BCmyprocD, CrocD, BCnprocsD );
371  Cinb1R = PB_Cfirstnb( BCnR, CiR, CinbR, CnbR );
372  CisR = ( ( CsrcR < 0 ) || ( BCnprocsR == 1 ) );
373 /*
374 * Compute descriptor Ad0 for sub( A )
375 */
376  PB_Cdescribe( An, An, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj,
377  &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 );
378 
379  Amp = PB_Cnumroc( An, 0, Aimb1, Amb, myrow, Arow, nprow );
380  Anq = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol );
381  if( ( Amp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( A, Aii, Ajj, Ald, size );
382 /*
383 * Retrieve the BLACS combine topologies, compute conjugate of alpha for the
384 * Hermitian case and set the transpose parameters to be passed to the BLAS
385 * matrix multiply routine.
386 */
387  rctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
388  cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
389 
390  if( conjg )
391  {
392  tzsymm = PB_Ctzhemm;
393  if( lside )
394  {
395  talphaRC = ALPHA; GemmTa = CCOTRAN; GemmTb = CTRAN;
396  talphaCR = PB_Cmalloc( size );
397  PB_Cconjg( TYPE, ALPHA, talphaCR );
398  }
399  else
400  {
401  talphaCR = ALPHA; GemmTa = CTRAN; GemmTb = CCOTRAN;
402  talphaRC = PB_Cmalloc( size );
403  PB_Cconjg( TYPE, ALPHA, talphaRC );
404  }
405  }
406  else
407  {
408  tzsymm = PB_Ctzsymm; talphaCR = talphaRC = ALPHA;
409  GemmTa = CTRAN; GemmTb = CTRAN;
410  }
411 /*
412 * Computational partitioning size is computed as the product of the logical
413 * value returned by pilaenv_ and 2 * lcm( nprow, npcol ).
414 */
415  Alcmb = 2 * nb * PB_Clcm( ( Arow >= 0 ? nprow : 1 ),
416  ( Acol >= 0 ? npcol : 1 ) );
417 /*
418 * When sub( B ) is not replicated and backward pass on sub( B ), find the
419 * virtual process q owning the last row or column of sub( B ).
420 */
421  if( !( BisR ) && !( BCfwd ) )
422  {
423  tmp = PB_Cindxg2p( BCnR - 1, Binb1R, BnbR, BrocR, BrocR, BCnprocsR );
424  q = MModSub( tmp, BrocR, BCnprocsR );
425  }
426 /*
427 * When sub( C ) is not replicated and backward pass on sub( C ), find the
428 * virtual process p owning the last row or column of sub( C ).
429 */
430  if( !( CisR ) && !( BCfwd ) )
431  {
432  tmp = PB_Cindxg2p( BCnR - 1, Cinb1R, CnbR, CrocR, CrocR, BCnprocsR );
433  p = MModSub( tmp, CrocR, BCnprocsR );
434  }
435 /*
436 * Loop over the virtual process grid induced by the rows or columns of
437 * sub( B ) and sub( C ).
438 */
439  lcmb = PB_Clcm( ( maxp = ( CisR ? 1 : BCnprocsR ) ) * CnbR,
440  ( maxq = ( BisR ? 1 : BCnprocsR ) ) * BnbR );
441  n = BCnR;
442  maxpm1 = maxp - 1;
443 
444  while( n > 0 )
445  {
446 /*
447 * Initialize local virtual matrix in process (p,q)
448 */
449  BcurrocR = ( BisR ? -1 : MModAdd( BrocR, q, BCnprocsR ) );
450  Bkk = PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR, BCnprocsR );
451  BnpR = PB_Cnumroc( BCnR, 0, Binb1R, BnbR, BcurrocR, BrocR, BCnprocsR );
452 
453  CcurrocR = ( CisR ? -1 : MModAdd( CrocR, p, BCnprocsR ) );
454  Ckk = PB_Cg2lrem( CiR, CinbR, CnbR, CcurrocR, CsrcR, BCnprocsR );
455  CnpR = PB_Cnumroc( BCnR, 0, Cinb1R, CnbR, CcurrocR, CrocR, BCnprocsR );
456 
457  PB_CVMinit( &VM, 0, CnpR, BnpR, Cinb1R, Binb1R, CnbR, BnbR, p, q,
458  maxp, maxq, lcmb );
459 /*
460 * Find how many diagonals in this virtual process
461 */
462  npq = PB_CVMnpq( &VM );
463 
464  n -= npq;
465 /*
466 * Re-adjust the number of rows or columns to be (un)packed, in order to
467 * average the message sizes.
468 */
469  if( npq ) nbb = npq / ( ( npq - 1 ) / nb + 1 );
470 
471  while( npq )
472  {
473  nbb = MIN( nbb, npq );
474 /*
475 * Find out how many rows or columns of sub( B ) and sub( C ) are contiguous
476 */
477  PB_CVMcontig( &VM, &nrpq, &ncpq, &Coff, &Boff );
478 
479  if( lside )
480  {
481 /*
482 * Compute the descriptor DBUFB for the buffer that will contained the packed
483 * columns of sub( B ).
484 */
485  if( ( Bfr = ( ncpq < nbb ) ) != 0 )
486  {
487 /*
488 * If columns of sub( B ) are not contiguous, then allocate the buffer and
489 * pack the kbb columns of sub( B ).
490 */
491  Bbufld = MAX( 1, BnpD );
492  if( BisR || ( BCmyprocR == BcurrocR ) )
493  {
494  Bbuf = PB_Cmalloc( BnpD * nbb * size );
495  PB_CVMpack( TYPE, &VM, COLUMN, COLUMN, PACKING, NOTRAN, nbb,
496  BnpD, one, Mptr( B, BiiD, Bkk, Bld, size ), Bld,
497  zero, Bbuf, Bbufld );
498  }
499  }
500  else
501  {
502 /*
503 * Otherwise, re-use sub( B ) directly.
504 */
505  Bbufld = Bld;
506  if( BisR || ( BCmyprocR == BcurrocR ) )
507  Bbuf = Mptr( B, BiiD, Bkk+Boff, Bld, size );
508  }
509  PB_Cdescset( DBUFB, BCnD, nbb, Binb1D, nbb, BnbD, nbb, BrocD,
510  BcurrocR, ctxt, Bbufld );
511 /*
512 * Replicate this panel of columns of sub( B ) as well as its transposed
513 * over sub( A ) -> WBC, WBR
514 */
515  PB_CInV( TYPE, NOCONJG, COLUMN, An, An, Ad0, nbb, Bbuf, 0, 0,
516  DBUFB, COLUMN, &WBC, WBCd, &WBCfr );
517  PB_CInV( TYPE, NOCONJG, ROW, An, An, Ad0, nbb, WBC, 0, 0,
518  WBCd, COLUMN, &WBR, WBRd, &WBRfr );
519  }
520  else
521  {
522 /*
523 * Compute the descriptor DBUFB for the buffer that will contained the packed
524 * rows of sub( B ).
525 */
526  if( ( Bfr = ( ncpq < nbb ) ) != 0 )
527  {
528 /*
529 * If rows of sub( B ) are not contiguous, then allocate the buffer and pack
530 * the kbb rows of sub( B ).
531 */
532  Bbufld = nbb;
533  if( BisR || ( BCmyprocR == BcurrocR ) )
534  {
535  Bbuf = PB_Cmalloc( BnpD * nbb * size );
536  PB_CVMpack( TYPE, &VM, COLUMN, ROW, PACKING, NOTRAN, nbb,
537  BnpD, one, Mptr( B, Bkk, BiiD, Bld, size ), Bld,
538  zero, Bbuf, Bbufld );
539  }
540  }
541  else
542  {
543 /*
544 * Otherwise, re-use sub( B ) directly.
545 */
546  Bbufld = Bld;
547  if( BisR || ( BCmyprocR == BcurrocR ) )
548  Bbuf = Mptr( B, Bkk+Boff, BiiD, Bld, size );
549  }
550  PB_Cdescset( DBUFB, nbb, BCnD, nbb, Binb1D, nbb, BnbD, BcurrocR,
551  BrocD, ctxt, Bbufld );
552 /*
553 * Replicate this panel of rows of sub( B ) as well as its transposed
554 * over sub( A ) -> WBR, WBC
555 */
556  PB_CInV( TYPE, NOCONJG, ROW, An, An, Ad0, nbb, Bbuf, 0, 0,
557  DBUFB, ROW, &WBR, WBRd, &WBRfr );
558  PB_CInV( TYPE, NOCONJG, COLUMN, An, An, Ad0, nbb, WBR, 0, 0,
559  WBRd, ROW, &WBC, WBCd, &WBCfr );
560  }
561 /*
562 * Allocate space for temporary results in scope of sub( A ) -> WCC, WCR
563 */
564  PB_COutV( TYPE, COLUMN, INIT, An, An, Ad0, nbb, &WCC, WCCd, &WCCfr,
565  &WCCsum );
566  PB_COutV( TYPE, ROW, INIT, An, An, Ad0, nbb, &WCR, WCRd, &WCRfr,
567  &WCRsum );
568 /*
569 * Local matrix-matrix multiply iff I own some data
570 */
571  WBCld = WBCd[LLD_]; WBRld = WBRd[LLD_];
572  WCCld = WCCd[LLD_]; WCRld = WCRd[LLD_];
573 
574  if( ( Amp > 0 ) && ( Anq > 0 ) )
575  {
576  if( upper )
577  {
578 /*
579 * sub( A ) is upper triangular
580 */
581  for( l = 0; l < An; l += Alcmb )
582  {
583  lb = An - l; lb = MIN( lb, Alcmb );
584  Alp = PB_Cnumroc( l, 0, Aimb1, Amb, myrow, Arow, nprow );
585  Alq = PB_Cnumroc( l, 0, Ainb1, Anb, mycol, Acol, npcol );
586  Alq0 = PB_Cnumroc( lb, l, Ainb1, Anb, mycol, Acol, npcol );
587  if( Alp > 0 && Alq0 > 0 )
588  {
589  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &GemmTb ), &Alp, &nbb,
590  &Alq0, talphaRC, Mptr( Aptr, 0, Alq, Ald, size ),
591  &Ald, Mptr( WBR, 0, Alq, WBRld, size ), &WBRld, one,
592  WCC, &WCCld );
593  gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( NOTRAN ), &nbb, &Alq0,
594  &Alp, talphaCR, WBC, &WBCld, Mptr( Aptr, 0, Alq, Ald,
595  size ), &Ald, one, Mptr( WCR, 0, Alq, WCRld, size ),
596  &WCRld );
597  }
598  PB_Cpsym( TYPE, TYPE, SIDE, UPPER, lb, nbb, ALPHA, Aptr, l, l,
599  Ad0, Mptr( WBC, Alp, 0, WBCld, size ), WBCld,
600  Mptr( WBR, 0, Alq, WBRld, size ), WBRld, Mptr( WCC,
601  Alp, 0, WCCld, size ), WCCld, Mptr( WCR, 0, Alq,
602  WCRld, size ), WCRld, tzsymm );
603  }
604  }
605  else
606  {
607 /*
608 * sub( A ) is lower triangular
609 */
610  for( l = 0; l < An; l += Alcmb )
611  {
612  lb = An - l; ltmp = l + ( lb = MIN( lb, Alcmb ) );
613  Alp = PB_Cnumroc( l, 0, Aimb1, Amb, myrow, Arow, nprow );
614  Alq = PB_Cnumroc( l, 0, Ainb1, Anb, mycol, Acol, npcol );
615  PB_Cpsym( TYPE, TYPE, SIDE, LOWER, lb, nbb, ALPHA, Aptr, l, l,
616  Ad0, Mptr( WBC, Alp, 0, WBCld, size ), WBCld,
617  Mptr( WBR, 0, Alq, WBRld, size ), WBRld, Mptr( WCC,
618  Alp, 0, WCCld, size ), WCCld, Mptr( WCR, 0, Alq,
619  WCRld, size ), WCRld, tzsymm );
620  Alp = PB_Cnumroc( ltmp, 0, Aimb1, Amb, myrow, Arow, nprow );
621  Alp0 = Amp - Alp;
622  Alq0 = PB_Cnumroc( lb, l, Ainb1, Anb, mycol, Acol, npcol );
623  if( Alp0 > 0 && Alq0 > 0 )
624  {
625  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &GemmTb ), &Alp0, &nbb,
626  &Alq0, talphaRC, Mptr( Aptr, Alp, Alq, Ald, size ),
627  &Ald, Mptr( WBR, 0, Alq, WBRld, size ), &WBRld, one,
628  Mptr( WCC, Alp, 0, WCCld, size ), &WCCld );
629  gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( NOTRAN ), &nbb, &Alq0,
630  &Alp0, talphaCR, Mptr( WBC, Alp, 0, WBCld, size ),
631  &WBCld, Mptr( Aptr, Alp, Alq, Ald, size ), &Ald, one,
632  Mptr( WCR, 0, Alq, WCRld, size ), &WCRld );
633  }
634  }
635  }
636  }
637  if( WBCfr ) free( WBC );
638  if( WBRfr ) free( WBR );
639 
640  if( Bfr && ( BisR || ( BCmyprocR == BcurrocR ) ) )
641  if( Bbuf ) free( Bbuf );
642 
643  if( lside )
644  {
645 /*
646 * Accumulate the intermediate results in WCC and WCR
647 */
648  if( WCCsum )
649  {
650  WCCd[CSRC_] = CcurrocR;
651  if( Amp > 0 )
652  gsum2d( ctxt, ROW, &rctop, Amp, nbb, WCC, WCCld, myrow,
653  WCCd[CSRC_] );
654  }
655 
656  if( WCRsum )
657  {
658  WCRd[RSRC_] = 0;
659  if( Anq > 0 )
660  gsum2d( ctxt, COLUMN, &cctop, nbb, Anq, WCR, WCRld,
661  WCRd[RSRC_], mycol );
662  }
663 /*
664 * WCC := WCC + WCR'
665 */
666  PB_Cpaxpby( TYPE, CONJUG, nbb, An, one, WCR, 0, 0, WCRd, ROW, one,
667  WCC, 0, 0, WCCd, COLUMN );
668  if( WCRfr ) free( WCR );
669 /*
670 * Compute the descriptor DBUFC for the buffer that will contained the packed
671 * columns of sub( C ). Allocate it.
672 */
673  if( ( Cfr = ( nrpq < nbb ) ) != 0 )
674  {
675 /*
676 * If columns of sub( C ) are not contiguous, then allocate the buffer
677 */
678  Cbufld = MAX( 1, CnpD ); tbeta = zero;
679  if( CisR || ( BCmyprocR == CcurrocR ) )
680  Cbuf = PB_Cmalloc( CnpD * nbb * size );
681  }
682  else
683  {
684 /*
685 * Otherwise re-use sub( C )
686 */
687  Cbufld = Cld; tbeta = BETA;
688  if( CisR || ( BCmyprocR == CcurrocR ) )
689  Cbuf = Mptr( C, CiiD, Ckk+Coff, Cld, size );
690  }
691  PB_Cdescset( DBUFC, BCnD, nbb, Cinb1D, nbb, CnbD, nbb, CrocD,
692  CcurrocR, ctxt, Cbufld );
693 /*
694 * sub( C ) := beta * sub( C ) + WCC
695 */
696  PB_Cpaxpby( TYPE, NOCONJG, An, nbb, one, WCC, 0, 0, WCCd, COLUMN,
697  tbeta, Cbuf, 0, 0, DBUFC, COLUMN );
698  if( WCCfr ) free( WCC );
699 /*
700 * Unpack the kbb columns of sub( C ) and release the buffer containing them.
701 */
702  if( Cfr && ( CisR || ( BCmyprocR == CcurrocR ) ) )
703  {
704  PB_CVMpack( TYPE, &VM, ROW, COLUMN, UNPACKING, NOTRAN, nbb,
705  CnpD, BETA, Mptr( C, CiiD, Ckk, Cld, size ), Cld,
706  one, Cbuf, Cbufld );
707  if( Cbuf ) free( Cbuf );
708  }
709  }
710  else
711  {
712 /*
713 * Accumulate the intermediate results in WCC and WCR
714 */
715  if( WCCsum )
716  {
717  WCCd[CSRC_] = 0;
718  if( Amp > 0 )
719  gsum2d( ctxt, ROW, &rctop, Amp, nbb, WCC, WCCld, myrow,
720  WCCd[CSRC_] );
721  }
722 
723  if( WCRsum )
724  {
725  WCRd[RSRC_] = CcurrocR;
726  if( Anq > 0 )
727  gsum2d( ctxt, COLUMN, &cctop, nbb, Anq, WCR, WCRld,
728  WCRd[RSRC_], mycol );
729  }
730 /*
731 * WCR := WCR + WCC'
732 */
733  PB_Cpaxpby( TYPE, CONJUG, An, nbb, one, WCC, 0, 0, WCCd, COLUMN,
734  one, WCR, 0, 0, WCRd, ROW );
735  if( WCCfr ) free( WCC );
736 /*
737 * Compute the descriptor DBUFC for the buffer that will contained the packed
738 * rows of sub( C ). Allocate it.
739 */
740  if( ( Cfr = ( nrpq < nbb ) ) != 0 )
741  {
742 /*
743 * If rows of sub( C ) are not contiguous, then allocate receiving buffer.
744 */
745  Cbufld = nbb; tbeta = zero;
746  if( CisR || ( BCmyprocR == CcurrocR ) )
747  Cbuf = PB_Cmalloc( CnpD * nbb * size );
748  }
749  else
750  {
751 /*
752 * Otherwise re-use sub( C )
753 */
754  Cbufld = Cld; tbeta = BETA;
755  if( CisR || ( BCmyprocR == CcurrocR ) )
756  Cbuf = Mptr( C, Ckk+Coff, CiiD, Cld, size );
757  }
758  PB_Cdescset( DBUFC, nbb, BCnD, nbb, Cinb1D, nbb, CnbD, CcurrocR,
759  CrocD, ctxt, Cbufld );
760 /*
761 * sub( C ) := beta * sub( C ) + WCR
762 */
763  PB_Cpaxpby( TYPE, NOCONJG, nbb, An, one, WCR, 0, 0, WCRd, ROW,
764  tbeta, Cbuf, 0, 0, DBUFC, ROW );
765 
766  if( WCRfr ) free( WCR );
767 /*
768 * Unpack the kbb rows of sub( C ) and release the buffer containing them.
769 */
770  if( Cfr && ( CisR || ( BCmyprocR == CcurrocR ) ) )
771  {
772  PB_CVMpack( TYPE, &VM, ROW, ROW, UNPACKING, NOTRAN, nbb,
773  CnpD, BETA, Mptr( C, Ckk, CiiD, Cld, size ), Cld,
774  one, Cbuf, Cbufld );
775  if( Cbuf ) free( Cbuf );
776  }
777  }
778 /*
779 * Update the local indexes of sub( B ) and sub( C )
780 */
781  PB_CVMupdate( &VM, nbb, &Ckk, &Bkk );
782 
783  npq -= nbb;
784  }
785 /*
786 * Go to next or previous virtual process row or column
787 */
788  if( ( BCfwd && ( p == maxpm1 ) ) ||
789  ( !( BCfwd ) && ( p == 0 ) ) )
790  q = ( BCfwd ? MModAdd1( q, maxq ) : MModSub1( q, maxq ) );
791  p = ( BCfwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) );
792  }
793 
794  if( conjg ) free( ( lside ? talphaCR : talphaRC ) );
795 /*
796 * End of PB_CpsymmBC
797 */
798 }
CCONJG
#define CCONJG
Definition: PBblas.h:21
TYPE
#define TYPE
Definition: clamov.c:7
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
PB_Cpaxpby
void PB_Cpaxpby()
PB_CVMcontig
void PB_CVMcontig()
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
PB_Ctzhemm
void PB_Ctzhemm()
UNPACKING
#define UNPACKING
Definition: PBtools.h:54
PB_Cpsym
void PB_Cpsym()
PB_Cg2lrem
int PB_Cg2lrem()
PB_Cconjg
void PB_Cconjg()
CCOTRAN
#define CCOTRAN
Definition: PBblas.h:22
NOCONJG
#define NOCONJG
Definition: PBblas.h:45
PB_Cfirstnb
int PB_Cfirstnb()
DLEN_
#define DLEN_
Definition: PBtools.h:48
GEMM_T
F_VOID_FCT(* GEMM_T)()
Definition: pblas.h:313
TZSYM_T
void(* TZSYM_T)()
Definition: pblas.h:428
NOTRAN
#define NOTRAN
Definition: PBblas.h:44
LLD_
#define LLD_
Definition: PBtools.h:47
PB_Cdescribe
void PB_Cdescribe()
PB_CVMinit
void PB_CVMinit()
PB_CpsymmBC
void PB_CpsymmBC(PBTYP_T *TYPE, char *DIRECAB, char *CONJUG, char *SIDE, char *UPLO, int M, int N, char *ALPHA, char *A, int IA, int JA, int *DESCA, char *B, int IB, int JB, int *DESCB, char *BETA, char *C, int IC, int JC, int *DESCC)
Definition: PB_CpsymmBC.c:26
PB_CVMpack
int PB_CVMpack()
MModAdd
#define MModAdd(I1, I2, d)
Definition: PBtools.h:97
UPPER
#define UPPER
Definition: PBblas.h:52
IMB_
#define IMB_
Definition: PBtools.h:41
MModSub
#define MModSub(I1, I2, d)
Definition: PBtools.h:102
pilaenv_
int pilaenv_()
INIT
#define INIT
Definition: PBblas.h:61
PB_Cdescset
void PB_Cdescset()
PB_CVMupdate
void PB_CVMupdate()
MModAdd1
#define MModAdd1(I, d)
Definition: PBtools.h:100
PB_Cindxg2p
int PB_Cindxg2p()
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
GSUM2D_T
void(* GSUM2D_T)()
Definition: pblas.h:282
PB_Ctop
char * PB_Ctop()
RSRC_
#define RSRC_
Definition: PBtools.h:45
COMBINE
#define COMBINE
Definition: PBblacs.h:49
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cnumroc
int PB_Cnumroc()
PB_Cmalloc
char * PB_Cmalloc()
CFORWARD
#define CFORWARD
Definition: PBblas.h:38
PB_CInV
void PB_CInV()
PB_VM_T
Definition: pblas.h:432
PB_CVMnpq
int PB_CVMnpq()
MIN
#define MIN(a_, b_)
Definition: PBtools.h:76
PACKING
#define PACKING
Definition: PBtools.h:53
INB_
#define INB_
Definition: PBtools.h:42
LOWER
#define LOWER
Definition: PBblas.h:51
PB_COutV
void PB_COutV()
PB_Ctzsymm
void PB_Ctzsymm()
C2F_CHAR
#define C2F_CHAR(a)
Definition: pblas.h:121
MModSub1
#define MModSub1(I, d)
Definition: PBtools.h:105
MAX
#define MAX(a_, b_)
Definition: PBtools.h:77
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mupcase
#define Mupcase(C)
Definition: PBtools.h:83
CUPPER
#define CUPPER
Definition: PBblas.h:26
CTRAN
#define CTRAN
Definition: PBblas.h:20
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CLEFT
#define CLEFT
Definition: PBblas.h:29
CTXT_
#define CTXT_
Definition: PBtools.h:38
PB_Clcm
int PB_Clcm()