ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CptrmmAB.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18 
19 #ifdef __STDC__
20 void PB_CptrmmAB( PBTYP_T * TYPE, char * VARIANT, char * SIDE, char * UPLO,
21  char * TRANSA, char * DIAG, int M, int N, char * ALPHA,
22  char * A, int IA, int JA, int * DESCA, char * B, int IB,
23  int JB, int * DESCB )
24 #else
25 void PB_CptrmmAB( TYPE, VARIANT, SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, IA,
26  JA, DESCA, B, IB, JB, DESCB )
27 /*
28 * .. Scalar Arguments ..
29 */
30  char * DIAG, * SIDE, * TRANSA, * UPLO, * VARIANT;
31  int IA, IB, JA, JB, M, N;
32  char * ALPHA;
33  PBTYP_T * TYPE;
34 /*
35 * .. Array Arguments ..
36 */
37  int * DESCA, * DESCB;
38  char * A, * B;
39 #endif
40 {
41 /*
42 * Purpose
43 * =======
44 *
45 * PB_CptrmmAB performs one of the matrix-matrix operations
46 *
47 * sub( B ) := alpha * op( sub( A ) ) * sub( B ),
48 *
49 * or
50 *
51 * sub( B ) := alpha * sub( B ) * op( sub( A ) ),
52 *
53 * where
54 *
55 * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L',
56 * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and,
57 *
58 * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1).
59 *
60 * Alpha is a scalar, sub( B ) is an m by n submatrix, sub( A ) is a
61 * unit, or non-unit, upper or lower triangular submatrix and op( X ) is
62 * one of
63 *
64 * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ).
65 *
66 * This is the outer-product algorithm using the logical aggregation
67 * blocking technique.
68 *
69 * Notes
70 * =====
71 *
72 * A description vector is associated with each 2D block-cyclicly dis-
73 * tributed matrix. This vector stores the information required to
74 * establish the mapping between a matrix entry and its corresponding
75 * process and memory location.
76 *
77 * In the following comments, the character _ should be read as
78 * "of the distributed matrix". Let A be a generic term for any 2D
79 * block cyclicly distributed matrix. Its description vector is DESC_A:
80 *
81 * NOTATION STORED IN EXPLANATION
82 * ---------------- --------------- ------------------------------------
83 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
84 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
85 * the NPROW x NPCOL BLACS process grid
86 * A is distributed over. The context
87 * itself is global, but the handle
88 * (the integer value) may vary.
89 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
90 * ted matrix A, M_A >= 0.
91 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
92 * buted matrix A, N_A >= 0.
93 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
94 * block of the matrix A, IMB_A > 0.
95 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
96 * left block of the matrix A,
97 * INB_A > 0.
98 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
99 * bute the last M_A-IMB_A rows of A,
100 * MB_A > 0.
101 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
102 * bute the last N_A-INB_A columns of
103 * A, NB_A > 0.
104 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
105 * row of the matrix A is distributed,
106 * NPROW > RSRC_A >= 0.
107 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
108 * first column of A is distributed.
109 * NPCOL > CSRC_A >= 0.
110 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
111 * array storing the local blocks of
112 * the distributed matrix A,
113 * IF( Lc( 1, N_A ) > 0 )
114 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
115 * ELSE
116 * LLD_A >= 1.
117 *
118 * Let K be the number of rows of a matrix A starting at the global in-
119 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
120 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
121 * receive if these K rows were distributed over NPROW processes. If K
122 * is the number of columns of a matrix A starting at the global index
123 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
124 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
125 * these K columns were distributed over NPCOL processes.
126 *
127 * The values of Lr() and Lc() may be determined via a call to the func-
128 * tion PB_Cnumroc:
129 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
130 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
131 *
132 * Arguments
133 * =========
134 *
135 * TYPE (local input) pointer to a PBTYP_T structure
136 * On entry, TYPE is a pointer to a structure of type PBTYP_T,
137 * that contains type information (See pblas.h).
138 *
139 * VARIANT (global input) pointer to CHAR
140 * On entry, VARIANT specifies whether the left- or right-loo-
141 * king variant of the algorithm should be used for the transpo-
142 * se cases only, that is TRANSA is not 'N' or 'n'. When VARIANT
143 * is 'L' or 'l', the left-looking variant is used, otherwise
144 * the right-looking algorithm is selected.
145 *
146 * SIDE (global input) pointer to CHAR
147 * On entry, SIDE specifies whether op( sub( A ) ) multiplies
148 * sub( B ) from the left or right as follows:
149 *
150 * SIDE = 'L' or 'l' sub( B ) := alpha*op( sub( A ) )*sub( B ),
151 *
152 * SIDE = 'R' or 'r' sub( B ) := alpha*sub( B )*op( sub( A ) ).
153 *
154 * UPLO (global input) pointer to CHAR
155 * On entry, UPLO specifies whether the submatrix sub( A ) is
156 * an upper or lower triangular submatrix as follows:
157 *
158 * UPLO = 'U' or 'u' sub( A ) is an upper triangular
159 * submatrix,
160 *
161 * UPLO = 'L' or 'l' sub( A ) is a lower triangular
162 * submatrix.
163 *
164 * TRANSA (global input) pointer to CHAR
165 * On entry, TRANSA specifies the form of op( sub( A ) ) to be
166 * used in the matrix multiplication as follows:
167 *
168 * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ),
169 *
170 * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )',
171 *
172 * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ).
173 *
174 * DIAG (global input) pointer to CHAR
175 * On entry, DIAG specifies whether or not sub( A ) is unit
176 * triangular as follows:
177 *
178 * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian-
179 * gular,
180 *
181 * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri-
182 * angular.
183 *
184 * M (global input) INTEGER
185 * On entry, M specifies the number of rows of the submatrix
186 * sub( B ). M must be at least zero.
187 *
188 * N (global input) INTEGER
189 * On entry, N specifies the number of columns of the submatrix
190 * sub( B ). N must be at least zero.
191 *
192 * ALPHA (global input) pointer to CHAR
193 * On entry, ALPHA specifies the scalar alpha. When ALPHA is
194 * supplied as zero then the local entries of the array B
195 * corresponding to the entries of the submatrix sub( B ) need
196 * not be set on input.
197 *
198 * A (local input) pointer to CHAR
199 * On entry, A is an array of dimension (LLD_A, Ka), where Ka is
200 * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at
201 * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array
202 * contains the local entries of the matrix A.
203 * Before entry with UPLO = 'U' or 'u', this array contains the
204 * local entries corresponding to the entries of the upper tri-
205 * angular submatrix sub( A ), and the local entries correspon-
206 * ding to the entries of the strictly lower triangular part of
207 * the submatrix sub( A ) are not referenced.
208 * Before entry with UPLO = 'L' or 'l', this array contains the
209 * local entries corresponding to the entries of the lower tri-
210 * angular submatrix sub( A ), and the local entries correspon-
211 * ding to the entries of the strictly upper triangular part of
212 * the submatrix sub( A ) are not referenced.
213 * Note that when DIAG = 'U' or 'u', the local entries corres-
214 * ponding to the diagonal elements of the submatrix sub( A )
215 * are not referenced either, but are assumed to be unity.
216 *
217 * IA (global input) INTEGER
218 * On entry, IA specifies A's global row index, which points to
219 * the beginning of the submatrix sub( A ).
220 *
221 * JA (global input) INTEGER
222 * On entry, JA specifies A's global column index, which points
223 * to the beginning of the submatrix sub( A ).
224 *
225 * DESCA (global and local input) INTEGER array
226 * On entry, DESCA is an integer array of dimension DLEN_. This
227 * is the array descriptor for the matrix A.
228 *
229 * B (local input/local output) pointer to CHAR
230 * On entry, B is an array of dimension (LLD_B, Kb), where Kb is
231 * at least Lc( 1, JB+N-1 ). Before entry, this array contains
232 * the local entries of the matrix B.
233 * On exit, the local entries of this array corresponding to the
234 * to the entries of the submatrix sub( B ) are overwritten by
235 * the local entries of the m by n transformed submatrix.
236 *
237 * IB (global input) INTEGER
238 * On entry, IB specifies B's global row index, which points to
239 * the beginning of the submatrix sub( B ).
240 *
241 * JB (global input) INTEGER
242 * On entry, JB specifies B's global column index, which points
243 * to the beginning of the submatrix sub( B ).
244 *
245 * DESCB (global and local input) INTEGER array
246 * On entry, DESCB is an integer array of dimension DLEN_. This
247 * is the array descriptor for the matrix B.
248 *
249 * -- Written on April 1, 1998 by
250 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
251 *
252 * ---------------------------------------------------------------------
253 */
254 /*
255 * .. Local Scalars ..
256 */
257  char conjg, * one, top, * zero;
258  int Afr, Bcol, Bcurcol, Bcurimb1, Bcurinb1, Bcurrow, Bfr, Bii,
259  Bimb, Bimb1, Binb, Binb1, Bjj, Bld, Bmb, Bmp, Bmp0, Bnb, Bnq,
260  Bnq0, Brow, WAfr, WBfr, WBsum, ctxt, k, kb, kbb, ktmp, lside,
261  mycol, myrow, notran, npcol, nprow, size, unit, upper;
262 /*
263 * .. Local Arrays ..
264 */
265  int Bd0[DLEN_], DBUFA[DLEN_], DBUFB[DLEN_], WAd[DLEN_],
266  WBd[DLEN_];
267  char * Aptr = NULL, * Bptr = NULL, * Bptr0 = NULL, * WA = NULL,
268  * WB = NULL;
269  GEMM_T gemm;
270  GSUM2D_T gsum2d;
271 /* ..
272 * .. Executable Statements ..
273 *
274 */
275  lside = ( Mupcase( SIDE [0] ) == CLEFT );
276  upper = ( Mupcase( UPLO [0] ) == CUPPER );
277  notran = ( Mupcase( TRANSA[0] ) == CNOTRAN );
278  unit = ( Mupcase( DIAG [0] ) == CUNIT );
279  size = TYPE->size; one = TYPE->one; zero = TYPE->zero;
280  gsum2d = TYPE->Cgsum2d; gemm = TYPE->Fgemm;
281  kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
282 /*
283 * Retrieve process grid information
284 */
285  Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
286 /*
287 * Retrieve sub( B )'s local information: Bii, Bjj, Brow, Bcol, Bld ...
288 */
289  Bimb = DESCB[IMB_]; Binb = DESCB[INB_];
290  Bmb = DESCB[MB_ ]; Bnb = DESCB[NB_ ]; Bld = DESCB[LLD_];
291  PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj, &Brow,
292  &Bcol );
293  Bimb1 = PB_Cfirstnb( M, IB, Bimb, Bmb );
294  Bmp0 = PB_Cnumroc( M, 0, Bimb1, Bmb, myrow, Brow, nprow );
295  Binb1 = PB_Cfirstnb( N, JB, Binb, Bnb );
296  Bnq0 = PB_Cnumroc( N, 0, Binb1, Bnb, mycol, Bcol, npcol );
297  if( ( Bmp0 > 0 ) && ( Bnq0 > 0 ) ) Bptr0 = Mptr( B, Bii, Bjj, Bld, size );
298 
299  if( notran )
300  {
301  if( lside )
302  {
303  if( upper )
304  {
305  for( k = 0; k < M; k += kb )
306  {
307  kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
308 /*
309 * Accumulate A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 )
310 */
311  PB_CGatherV( TYPE, ALLOCATE, FORWARD, ktmp, kbb, A, IA, JA+k,
312  DESCA, COLUMN, &Aptr, DBUFA, &Afr );
313 /*
314 * Replicate WA := A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) over
315 * B( IB:IB+k+kbb-1, JB:JB+N-1 )
316 */
317  PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
318  ctxt, Bld );
319  PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
320  DBUFA, COLUMN, &WA, WAd, &WAfr );
321 /*
322 * Zero lower triangle of WA( k:k+kbb-1, 0:kbb-1 )
323 */
324  if( unit )
325  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one,
326  WA, k, 0, WAd );
327  else if( kbb > 1 )
328  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero,
329  WA, k+1, 0, WAd );
330 /*
331 * Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
332 */
333  PB_CGatherV( TYPE, ALLOCATE, FORWARD, kbb, N, B, IB+k, JB, DESCB,
334  ROW, &Bptr, DBUFB, &Bfr );
335 /*
336 * Replicate WB := B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over
337 * B( IB:IB+k+kbb-1, JB:JB+N-1)
338 */
339  PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0,
340  DBUFB, ROW, &WB, WBd, &WBfr );
341 /*
342 * Zero B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
343 */
344  PB_Cplapad( TYPE, ALL, NOCONJG, kbb, N, zero, zero, B, IB+k, JB,
345  DESCB );
346 /*
347 * B( IB:IB+k+kbb-1, JB:JB+N-1 ) := ALPHA * WA * WB
348 */
349  Bmp = PB_Cnumroc( ktmp, 0, Bimb1, Bmb, myrow, Brow, nprow );
350  if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
351  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0,
352  &kbb, ALPHA, WA, &WAd[LLD_], WB, &WBd[LLD_], one, Bptr0,
353  &Bld );
354 
355  if( WBfr ) free( WB );
356  if( WAfr ) free( WA );
357  if( Bfr ) free( Bptr );
358  if( Afr ) free( Aptr );
359  }
360  }
361  else
362  {
363  for( k = ( ( M - 1 ) / kb ) * kb; k >= 0; k -= kb )
364  {
365  ktmp = M - k; kbb = MIN( ktmp, kb );
366 /*
367 * Accumulate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 )
368 */
369  PB_CGatherV( TYPE, ALLOCATE, BACKWARD, ktmp, kbb, A, IA+k, JA+k,
370  DESCA, COLUMN, &Aptr, DBUFA, &Afr );
371 /*
372 * Replicate WA := A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over
373 * B( IB+k:IB+M-1, JB:JB+N-1 )
374 */
375  Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb );
376  Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
377  PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
378  Bcol, ctxt, Bld );
379  PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
380  DBUFA, COLUMN, &WA, WAd, &WAfr );
381 /*
382 * Zero upper triangle of WA( 0:kbb-1, 0:kbb-1 )
383 */
384  if( unit )
385  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one,
386  WA, 0, 0, WAd );
387  else if( kbb > 1 )
388  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero,
389  WA, 0, 1, WAd );
390 /*
391 * Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
392 */
393  PB_CGatherV( TYPE, ALLOCATE, BACKWARD, kbb, N, B, IB+k, JB,
394  DESCB, ROW, &Bptr, DBUFB, &Bfr );
395 /*
396 * Replicate WB := B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over
397 * B( IB+k:IB+M-1, JB:JB+N-1 )
398 */
399  PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0,
400  DBUFB, ROW, &WB, WBd, &WBfr );
401 /*
402 * Zero B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
403 */
404  PB_Cplapad( TYPE, ALL, NOCONJG, kbb, N, zero, zero, B, IB+k, JB,
405  DESCB );
406 /*
407 * B( IB+k:IB+M-1, JB:JB+N-1 ) := ALPHA * WA * WB
408 */
409  Bmp = PB_Cnumroc( ktmp, k, Bimb1, Bmb, myrow, Brow, nprow );
410  if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
411  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0,
412  &kbb, ALPHA, WA, &WAd[LLD_], WB, &WBd[LLD_], one,
413  Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ), &Bld );
414 
415  if( WBfr ) free( WB );
416  if( WAfr ) free( WA );
417  if( Bfr ) free( Bptr );
418  if( Afr ) free( Aptr );
419  }
420  }
421  }
422  else
423  {
424  if( upper )
425  {
426  for( k = ( ( N - 1 ) / kb ) * kb; k >= 0; k -= kb )
427  {
428  ktmp = N - k; kbb = MIN( ktmp, kb );
429 /*
430 * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )
431 */
432  PB_CGatherV( TYPE, ALLOCATE, BACKWARD, kbb, ktmp, A, IA+k, JA+k,
433  DESCA, ROW, &Aptr, DBUFA, &Afr );
434 /*
435 * Replicate WA := A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over
436 * B( IB:IB+M-1, JB+k:JB+N-1 )
437 */
438  Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb );
439  Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
440  PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
441  Bcurcol, ctxt, Bld );
442  PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
443  DBUFA, ROW, &WA, WAd, &WAfr );
444 /*
445 * Zero lower triangle of WA( 0:kbb-1, 0:kbb-1 )
446 */
447  if( unit )
448  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one,
449  WA, 0, 0, WAd );
450  else if( kbb > 1 )
451  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero,
452  WA, 1, 0, WAd );
453 /*
454 * Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
455 */
456  PB_CGatherV( TYPE, ALLOCATE, BACKWARD, M, kbb, B, IB, JB+k,
457  DESCB, COLUMN, &Bptr, DBUFB, &Bfr );
458 /*
459 * Replicate WB := B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over
460 * B( IB:IB+M-1, JB+k:JB+N-1 )
461 */
462  PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr, 0, 0,
463  DBUFB, COLUMN, &WB, WBd, &WBfr );
464 /*
465 * Zero B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
466 */
467  PB_Cplapad( TYPE, ALL, NOCONJG, M, kbb, zero, zero, B, IB, JB+k,
468  DESCB );
469 /*
470 * B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := ALPHA * WB * WA
471 */
472  Bnq = PB_Cnumroc( ktmp, k, Binb1, Bnb, mycol, Bcol, npcol );
473  if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
474  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq,
475  &kbb, ALPHA, WB, &WBd[LLD_], WA, &WAd[LLD_], one,
476  Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ), &Bld );
477 
478  if( WBfr ) free( WB );
479  if( WAfr ) free( WA );
480  if( Bfr ) free( Bptr );
481  if( Afr ) free( Aptr );
482  }
483  }
484  else
485  {
486  for( k = 0; k < N; k += kb )
487  {
488  kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
489 /*
490 * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 )
491 */
492  PB_CGatherV( TYPE, ALLOCATE, FORWARD, kbb, ktmp, A, IA+k, JA,
493  DESCA, ROW, &Aptr, DBUFA, &Afr );
494 /*
495 * Replicate WA := A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 ) over
496 * B( IB:IB+M-1, JB:JB+k+kbb-1 )
497 */
498  PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
499  ctxt, Bld );
500  PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
501  DBUFA, ROW, &WA, WAd, &WAfr );
502 /*
503 * Zero upper triangle of WA( 0:kbb-1, k:k+kbb-1 )
504 */
505  if( unit )
506  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one,
507  WA, 0, k, WAd );
508  else if( kbb > 1 )
509  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero,
510  WA, 0, k+1, WAd );
511 /*
512 * Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
513 */
514  PB_CGatherV( TYPE, ALLOCATE, FORWARD, M, kbb, B, IB, JB+k, DESCB,
515  COLUMN, &Bptr, DBUFB, &Bfr );
516 /*
517 * Replicate WB := B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over
518 * B( IB:IB+M-1, JB:JB+k+kbb-1 )
519 */
520  PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr, 0, 0,
521  DBUFB, COLUMN, &WB, WBd, &WBfr );
522 /*
523 * Zero B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
524 */
525  PB_Cplapad( TYPE, ALL, NOCONJG, M, kbb, zero, zero, B, IB, JB+k,
526  DESCB );
527 /*
528 * B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := ALPHA * WB * WA
529 */
530  Bnq = PB_Cnumroc( ktmp, 0, Binb1, Bnb, mycol, Bcol, npcol );
531  if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
532  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq,
533  &kbb, ALPHA, WB, &WBd[LLD_], WA, &WAd[LLD_], one, Bptr0,
534  &Bld );
535 
536  if( WBfr ) free( WB );
537  if( WAfr ) free( WA );
538  if( Bfr ) free( Bptr );
539  if( Afr ) free( Aptr );
540  }
541  }
542  }
543  }
544  else
545  {
546  if( Mupcase( VARIANT[0] ) == CRIGHT )
547  {
548 /*
549 * Right looking variant for the transpose cases
550 */
551  conjg = ( ( Mupcase( TRANSA[0] ) == CCOTRAN ) ? CCONJG : CNOCONJG );
552 
553  if( lside )
554  {
555  if( !upper )
556  {
557  for( k = 0; k < M; k += kb )
558  {
559  kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
560 /*
561 * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 )
562 */
563  PB_CGatherV( TYPE, ALLOCATE, FORWARD, kbb, ktmp, A, IA+k, JA,
564  DESCA, ROW, &Aptr, DBUFA, &Afr );
565 /*
566 * Replicate WA := A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 )' over
567 * B( IB:IB+k+kbb-1, JB:JB+N-1 )
568 */
569  PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
570  ctxt, Bld );
571  PB_CInV( TYPE, &conjg, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
572  DBUFA, ROW, &WA, WAd, &WAfr );
573 /*
574 * Zero lower triangle of WA( k:k+kbb-1, 0:kbb-1 )
575 */
576  if( unit )
577  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one,
578  WA, k, 0, WAd );
579  else if( kbb > 1 )
580  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero,
581  WA, k+1, 0, WAd );
582 /*
583 * Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
584 */
585  PB_CGatherV( TYPE, ALLOCATE, FORWARD, kbb, N, B, IB+k, JB,
586  DESCB, ROW, &Bptr, DBUFB, &Bfr );
587 /*
588 * Replicate WB := B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over
589 * B( IB:IB+k+kbb-1, JB:JB+N-1)
590 */
591  PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0,
592  DBUFB, ROW, &WB, WBd, &WBfr );
593 /*
594 * Zero B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
595 */
596  PB_Cplapad( TYPE, ALL, NOCONJG, kbb, N, zero, zero, B, IB+k,
597  JB, DESCB );
598 /*
599 * B( IB:IB+k+kbb-1, JB:JB+N-1 ) := ALPHA * WA * WB
600 */
601  Bmp = PB_Cnumroc( ktmp, 0, Bimb1, Bmb, myrow, Brow, nprow );
602  if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
603  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0,
604  &kbb, ALPHA, WA, &WAd[LLD_], WB, &WBd[LLD_], one,
605  Bptr0, &Bld );
606 
607  if( WBfr ) free( WB );
608  if( WAfr ) free( WA );
609  if( Bfr ) free( Bptr );
610  if( Afr ) free( Aptr );
611  }
612  }
613  else
614  {
615  for( k = ( ( M - 1 ) / kb ) * kb; k >= 0; k -= kb )
616  {
617  ktmp = M - k; kbb = MIN( ktmp, kb );
618 /*
619 * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+M-1 )
620 */
621  PB_CGatherV( TYPE, ALLOCATE, BACKWARD, kbb, ktmp, A, IA+k,
622  JA+k, DESCA, ROW, &Aptr, DBUFA, &Afr );
623 /*
624 * Replicate WA := A( IA+k:IA+k+kbb-1, JA+k:JA+M-1 )' over
625 * B( IB+k:IB+M-1, JB:JB+N-1 )
626 */
627  Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb );
628  Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
629  PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
630  Bcol, ctxt, Bld );
631  PB_CInV( TYPE, &conjg, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
632  DBUFA, ROW, &WA, WAd, &WAfr );
633 /*
634 * Zero upper triangle of WA( 0:kbb-1, 0:kbb-1 )
635 */
636  if( unit )
637  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one,
638  WA, 0, 0, WAd );
639  else if( kbb > 1 )
640  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero,
641  WA, 0, 1, WAd );
642 /*
643 * Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
644 */
645  PB_CGatherV( TYPE, ALLOCATE, BACKWARD, kbb, N, B, IB+k, JB,
646  DESCB, ROW, &Bptr, DBUFB, &Bfr );
647 /*
648 * Replicate WB := B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over
649 * B( IB+k:IB+M-1, JB:JB+N-1 )
650 */
651  PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0,
652  DBUFB, ROW, &WB, WBd, &WBfr );
653 /*
654 * Zero B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
655 */
656  PB_Cplapad( TYPE, ALL, NOCONJG, kbb, N, zero, zero, B, IB+k,
657  JB, DESCB );
658 /*
659 * B( IB+k:IB+M-1, JB:JB+N-1 ) := ALPHA * WA * WB
660 */
661  Bmp = PB_Cnumroc( ktmp, k, Bimb1, Bmb, myrow, Brow, nprow );
662  if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
663  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0,
664  &kbb, ALPHA, WA, &WAd[LLD_], WB, &WBd[LLD_], one,
665  Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ), &Bld );
666 
667  if( WBfr ) free( WB );
668  if( WAfr ) free( WA );
669  if( Bfr ) free( Bptr );
670  if( Afr ) free( Aptr );
671  }
672  }
673  }
674  else
675  {
676  if( !upper )
677  {
678  for( k = ( ( N - 1 ) / kb ) * kb; k >= 0; k -= kb )
679  {
680  ktmp = N - k; kbb = MIN( ktmp, kb );
681 /*
682 * Accumulate A( IA+k:IA+N-1, JA+k:JA+k+kbb-1 )
683 */
684  PB_CGatherV( TYPE, ALLOCATE, BACKWARD, ktmp, kbb, A, IA+k,
685  JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr );
686 /*
687 * Replicate WA := A( IA+k:IA+N-1, JA+k:JA+k+kbb-1 )' over
688 * B( IB:IB+M-1, JB+k:JB+N-1 )
689 */
690  Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb );
691  Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
692  PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
693  Bcurcol, ctxt, Bld );
694  PB_CInV( TYPE, &conjg, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
695  DBUFA, COLUMN, &WA, WAd, &WAfr );
696 /*
697 * Zero lower triangle of WA( 0:kbb-1, 0:kbb-1 )
698 */
699  if( unit )
700  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one,
701  WA, 0, 0, WAd );
702  else if( kbb > 1 )
703  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero,
704  WA, 1, 0, WAd );
705 /*
706 * Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
707 */
708  PB_CGatherV( TYPE, ALLOCATE, BACKWARD, M, kbb, B, IB, JB+k,
709  DESCB, COLUMN, &Bptr, DBUFB, &Bfr );
710 /*
711 * Replicate WB := B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over
712 * B( IB:IB+M-1, JB+k:JB+N-1 )
713 */
714  PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr, 0, 0,
715  DBUFB, COLUMN, &WB, WBd, &WBfr );
716 /*
717 * Zero B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
718 */
719  PB_Cplapad( TYPE, ALL, NOCONJG, M, kbb, zero, zero, B, IB,
720  JB+k, DESCB );
721 /*
722 * B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := ALPHA * WB * WA
723 */
724  Bnq = PB_Cnumroc( ktmp, k, Binb1, Bnb, mycol, Bcol, npcol );
725  if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
726  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq,
727  &kbb, ALPHA, WB, &WBd[LLD_], WA, &WAd[LLD_], one,
728  Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ), &Bld );
729 
730  if( WBfr ) free( WB );
731  if( WAfr ) free( WA );
732  if( Bfr ) free( Bptr );
733  if( Afr ) free( Aptr );
734  }
735  }
736  else
737  {
738  for( k = 0; k < N; k += kb )
739  {
740  kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
741 /*
742 * Accumulate A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 )
743 */
744  PB_CGatherV( TYPE, ALLOCATE, FORWARD, ktmp, kbb, A, IA, JA+k,
745  DESCA, COLUMN, &Aptr, DBUFA, &Afr );
746 /*
747 * Replicate WA := A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 )' over
748 * B( IB:IB+M-1, JB:JB+k+kbb-1 )
749 */
750  PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
751  ctxt, Bld );
752  PB_CInV( TYPE, &conjg, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
753  DBUFA, COLUMN, &WA, WAd, &WAfr );
754 /*
755 * Zero upper triangle of WA( 0:kbb-1, k:k+kbb-1 )
756 */
757  if( unit )
758  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one,
759  WA, 0, k, WAd );
760  else if( kbb > 1 )
761  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero,
762  WA, 0, k+1, WAd );
763 /*
764 * Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
765 */
766  PB_CGatherV( TYPE, ALLOCATE, FORWARD, M, kbb, B, IB, JB+k,
767  DESCB, COLUMN, &Bptr, DBUFB, &Bfr );
768 /*
769 * Replicate WB := B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over
770 * B( IB:IB+M-1, JB:JB+k+kbb-1 )
771 */
772  PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr, 0, 0,
773  DBUFB, COLUMN, &WB, WBd, &WBfr );
774 /*
775 * Zero B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
776 */
777  PB_Cplapad( TYPE, ALL, NOCONJG, M, kbb, zero, zero, B, IB,
778  JB+k, DESCB );
779 /*
780 * B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := ALPHA * WB * WA
781 */
782  Bnq = PB_Cnumroc( ktmp, 0, Binb1, Bnb, mycol, Bcol, npcol );
783  if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
784  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq,
785  &kbb, ALPHA, WB, &WBd[LLD_], WA, &WAd[LLD_], one,
786  Bptr0, &Bld );
787 
788  if( WBfr ) free( WB );
789  if( WAfr ) free( WA );
790  if( Bfr ) free( Bptr );
791  if( Afr ) free( Aptr );
792  }
793  }
794  }
795  }
796  else
797  {
798 /*
799 * Left looking variant for the transpose cases
800 */
801  if( lside )
802  {
803  top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
804 
805  if( upper )
806  {
807  for( k = ( ( M - 1 ) / kb ) * kb; k >= 0; k -= kb )
808  {
809  kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
810 /*
811 * Accumulate A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 )
812 */
813  PB_CGatherV( TYPE, ALLOCATE, BACKWARD, ktmp, kbb, A, IA, JA+k,
814  DESCA, COLUMN, &Aptr, DBUFA, &Afr );
815 /*
816 * Replicate WA := A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) over
817 * B( IB:IB+k+kbb-1, JB:JB+N-1 )
818 */
819  PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
820  ctxt, Bld );
821  PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
822  DBUFA, COLUMN, &WA, WAd, &WAfr );
823 /*
824 * Zero lower triangle of WA( k:k+kbb-1, 0:kbb-1 )
825 */
826  if( unit )
827  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one,
828  WA, k, 0, WAd );
829  else if( kbb > 1 )
830  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero,
831  WA, k+1, 0, WAd );
832 /*
833 * WB := A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 )' * B( IB:IB+k+kbb-1, JB:JB+N-1 )
834 */
835  PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WB, WBd, &WBfr,
836  &WBsum );
837  Bmp = PB_Cnumroc( ktmp, 0, Bimb1, Bmb, myrow, Brow, nprow );
838  if( ( Bnq0 > 0 ) && ( Bmp > 0 ) )
839  gemm( C2F_CHAR( TRANSA ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0,
840  &Bmp, ALPHA, WA, &WAd[LLD_], Bptr0, &Bld, zero, WB,
841  &WBd[LLD_] );
842  if( WBsum )
843  {
844  WBd[RSRC_] = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow,
845  nprow );
846  if( Bnq0 > 0 )
847  gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WB, WBd[LLD_],
848  WBd[RSRC_], mycol );
849  }
850  if( WAfr ) free( WA );
851  if( Afr ) free( Aptr );
852 /*
853 * B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) := WB
854 */
855  PB_CScatterV( TYPE, FORWARD, kbb, N, WB, 0, 0, WBd, ROW, zero,
856  B, IB+k, JB, DESCB, ROW );
857  if( WBfr ) free( WB );
858  }
859  }
860  else
861  {
862  for( k = 0; k < M; k += kb )
863  {
864  ktmp = M - k; kbb = MIN( ktmp, kb );
865 /*
866 * Accumulate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 )
867 */
868  PB_CGatherV( TYPE, ALLOCATE, FORWARD, ktmp, kbb, A, IA+k,
869  JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr );
870 /*
871 * Replicate WA := A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over
872 * B( IB+k:IB+M-1, JB:JB+N-1 )
873 */
874  Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb );
875  Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
876  PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
877  Bcol, ctxt, Bld );
878  PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
879  DBUFA, COLUMN, &WA, WAd, &WAfr );
880 /*
881 * Zero upper triangle of WA( 0:kbb-1, 0:kbb-1 )
882 */
883  if( unit )
884  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one,
885  WA, 0, 0, WAd );
886  else if( kbb > 1 )
887  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero,
888  WA, 0, 1, WAd );
889 /*
890 * WB := A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 )' * B( IB+k:IB+M-1, JB:JB+N-1 )
891 */
892  PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WB, WBd, &WBfr,
893  &WBsum );
894  Bmp = PB_Cnumroc( ktmp, k, Bimb1, Bmb, myrow, Brow, nprow );
895  if( ( Bnq0 > 0 ) && ( Bmp > 0 ) )
896  gemm( C2F_CHAR( TRANSA ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0,
897  &Bmp, ALPHA, WA, &WAd[LLD_], Mptr( Bptr0, Bmp0-Bmp,
898  0, Bld, size ), &Bld, zero, WB, &WBd[LLD_] );
899  if( WBsum )
900  {
901  WBd[RSRC_] = PB_Cindxg2p( k + kbb - 1, Bimb1, Bmb, Brow,
902  Brow, nprow );
903  if( Bnq0 > 0 )
904  gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WB, WBd[LLD_],
905  WBd[RSRC_], mycol );
906 
907  }
908  if( WAfr ) free( WA );
909  if( Afr ) free( Aptr );
910 /*
911 * B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) := WB
912 */
913  PB_CScatterV( TYPE, BACKWARD, kbb, N, WB, 0, 0, WBd, ROW,
914  zero, B, IB+k, JB, DESCB, ROW );
915  if( WBfr ) free( WB );
916  }
917  }
918  }
919  else
920  {
921  top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
922 
923  if( upper )
924  {
925  for( k = 0; k < N; k += kb )
926  {
927  ktmp = N - k; kbb = MIN( ktmp, kb );
928 /*
929 * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )
930 */
931  PB_CGatherV( TYPE, ALLOCATE, FORWARD, kbb, ktmp, A, IA+k,
932  JA+k, DESCA, ROW, &Aptr, DBUFA, &Afr );
933 /*
934 * Replicate WA := A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over
935 * B( IB:IB+M-1, JB+k:JB+N-1 )
936 */
937  Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb );
938  Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
939  PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
940  Bcurcol, ctxt, Bld );
941  PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
942  DBUFA, ROW, &WA, WAd, &WAfr );
943 /*
944 * Zero lower triangle of WA( 0:kbb-1, 0:kbb-1 )
945 */
946  if( unit )
947  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one,
948  WA, 0, 0, WAd );
949  else if( kbb > 1 )
950  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero,
951  WA, 1, 0, WAd );
952 /*
953 * WB := B( IB:IB+M-1, JB+k:JB+N-1 ) * A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )'
954 */
955  PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WB, WBd,
956  &WBfr, &WBsum );
957  Bnq = PB_Cnumroc( ktmp, k, Binb1, Bnb, mycol, Bcol, npcol );
958  if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
959  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRANSA ), &Bmp0, &kbb,
960  &Bnq, ALPHA, Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ),
961  &Bld, WA, &WAd[LLD_], zero, WB, &WBd[LLD_] );
962  if( WBsum )
963  {
964  WBd[CSRC_] = PB_Cindxg2p( k + kbb - 1, Binb1, Bnb, Bcol,
965  Bcol, npcol );
966  if( Bmp0 > 0 )
967  gsum2d( ctxt, ROW, &top, Bmp0, kbb, WB, WBd[LLD_],
968  myrow, WBd[CSRC_] );
969  }
970  if( WAfr ) free( WA );
971  if( Afr ) free( Aptr );
972 /*
973 * B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := WB
974 */
975  PB_CScatterV( TYPE, BACKWARD, M, kbb, WB, 0, 0, WBd, COLUMN,
976  zero, B, IB, JB+k, DESCB, COLUMN );
977  if( WBfr ) free( WB );
978  }
979  }
980  else
981  {
982  for( k = ( ( N - 1 ) / kb ) * kb; k >= 0; k -= kb )
983  {
984  kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
985 /*
986 * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 )
987 */
988  PB_CGatherV( TYPE, ALLOCATE, BACKWARD, kbb, ktmp, A, IA+k, JA,
989  DESCA, ROW, &Aptr, DBUFA, &Afr );
990 /*
991 * Replicate WA := A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 ) over
992 * B( IB:IB+M-1, JB:JB+k+kbb-1 )
993 */
994  PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
995  ctxt, Bld );
996  PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
997  DBUFA, ROW, &WA, WAd, &WAfr );
998 /*
999 * Zero upper triangle of WA( 0:kbb-1, k:k+kbb-1 )
1000 */
1001  if( unit )
1002  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one,
1003  WA, 0, k, WAd );
1004  else if( kbb > 1 )
1005  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero,
1006  WA, 0, k+1, WAd );
1007 /*
1008 * WB := B( IB:IB+M-1, JB:JB+k+kbb-1 ) * A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 )'
1009 */
1010  PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WB, WBd,
1011  &WBfr, &WBsum );
1012  Bnq = PB_Cnumroc( ktmp, 0, Binb1, Bnb, mycol, Bcol, npcol );
1013  if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
1014  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRANSA ), &Bmp0, &kbb,
1015  &Bnq, ALPHA, Bptr0, &Bld, WA, &WAd[LLD_], zero, WB,
1016  &WBd[LLD_] );
1017  if( WBsum )
1018  {
1019  WBd[CSRC_] = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol,
1020  npcol );
1021  if( Bmp0 > 0 )
1022  gsum2d( ctxt, ROW, &top, Bmp0, kbb, WB, WBd[LLD_],
1023  myrow, WBd[CSRC_] );
1024  }
1025  if( WAfr ) free( WA );
1026  if( Afr ) free( Aptr );
1027 /*
1028 * B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := WB
1029 */
1030  PB_CScatterV( TYPE, FORWARD, M, kbb, WB, 0, 0, WBd, COLUMN,
1031  zero, B, IB, JB+k, DESCB, COLUMN );
1032  if( WBfr ) free( WB );
1033  }
1034  }
1035  }
1036  }
1037  }
1038 /*
1039 * End of PB_CptrmmAB
1040 */
1041 }
CCONJG
#define CCONJG
Definition: PBblas.h:21
TYPE
#define TYPE
Definition: clamov.c:7
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
ALLOCATE
#define ALLOCATE
Definition: PBblas.h:68
PB_CScatterV
void PB_CScatterV()
CRIGHT
#define CRIGHT
Definition: PBblas.h:30
BACKWARD
#define BACKWARD
Definition: PBblas.h:65
CCOTRAN
#define CCOTRAN
Definition: PBblas.h:22
NOCONJG
#define NOCONJG
Definition: PBblas.h:45
PB_Cfirstnb
int PB_Cfirstnb()
DLEN_
#define DLEN_
Definition: PBtools.h:48
CUNIT
#define CUNIT
Definition: PBblas.h:32
GEMM_T
F_VOID_FCT(* GEMM_T)()
Definition: pblas.h:313
PB_CptrmmAB
void PB_CptrmmAB(PBTYP_T *TYPE, char *VARIANT, char *SIDE, char *UPLO, char *TRANSA, char *DIAG, int M, int N, char *ALPHA, char *A, int IA, int JA, int *DESCA, char *B, int IB, int JB, int *DESCB)
Definition: PB_CptrmmAB.c:25
NOTRAN
#define NOTRAN
Definition: PBblas.h:44
LLD_
#define LLD_
Definition: PBtools.h:47
CNOCONJG
#define CNOCONJG
Definition: PBblas.h:19
UPPER
#define UPPER
Definition: PBblas.h:52
IMB_
#define IMB_
Definition: PBtools.h:41
pilaenv_
int pilaenv_()
PB_CGatherV
void PB_CGatherV()
INIT
#define INIT
Definition: PBblas.h:61
PB_Cdescset
void PB_Cdescset()
PB_Cplapad
void PB_Cplapad()
PB_Cindxg2p
int PB_Cindxg2p()
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
GSUM2D_T
void(* GSUM2D_T)()
Definition: pblas.h:282
PB_Ctop
char * PB_Ctop()
RSRC_
#define RSRC_
Definition: PBtools.h:45
CNOTRAN
#define CNOTRAN
Definition: PBblas.h:18
FORWARD
#define FORWARD
Definition: PBblas.h:64
COMBINE
#define COMBINE
Definition: PBblacs.h:49
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cnumroc
int PB_Cnumroc()
PB_CInV
void PB_CInV()
ALL
#define ALL
Definition: PBblas.h:50
MIN
#define MIN(a_, b_)
Definition: PBtools.h:76
INB_
#define INB_
Definition: PBtools.h:42
LOWER
#define LOWER
Definition: PBblas.h:51
PB_COutV
void PB_COutV()
C2F_CHAR
#define C2F_CHAR(a)
Definition: pblas.h:121
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mupcase
#define Mupcase(C)
Definition: PBtools.h:83
CUPPER
#define CUPPER
Definition: PBblas.h:26
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CLEFT
#define CLEFT
Definition: PBblas.h:29
CTXT_
#define CTXT_
Definition: PBtools.h:38