ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CptrsmAB.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18 
19 #ifdef __STDC__
20 void PB_CptrsmAB( PBTYP_T * TYPE, char * VARIANT, char * SIDE, char * UPLO,
21  char * TRANSA, char * DIAG, int M, int N, char * ALPHA,
22  char * A, int IA, int JA, int * DESCA, char * B, int IB,
23  int JB, int * DESCB )
24 #else
25 void PB_CptrsmAB( TYPE, VARIANT, SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A,
26  IA, JA, DESCA, B, IB, JB, DESCB )
27 /*
28 * .. Scalar Arguments ..
29 */
30  char * DIAG, * SIDE, * TRANSA, * UPLO, * VARIANT;
31  int IA, IB, JA, JB, M, N;
32  char * ALPHA;
33  PBTYP_T * TYPE;
34 /*
35 * .. Array Arguments ..
36 */
37  int * DESCA, * DESCB;
38  char * A, * B;
39 #endif
40 {
41 /*
42 * Purpose
43 * =======
44 *
45 * PB_CptrsmAB solves one of the matrix equations
46 *
47 * op( sub( A ) )*X = alpha*sub( B ), or
48 *
49 * X*op( sub( A ) ) = alpha*sub( B ),
50 *
51 * where
52 *
53 * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L',
54 * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and,
55 *
56 * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1).
57 *
58 * Alpha is a scalar, X and sub( B ) are m by n submatrices, sub( A ) is
59 * a unit, or non-unit, upper or lower triangular submatrix and op( Y )
60 * is one of
61 *
62 * op( Y ) = Y or op( Y ) = Y' or op( Y ) = conjg( Y' ).
63 *
64 * The submatrix X is overwritten on sub( B ).
65 *
66 * This is the outer-product algorithm using the logical aggregation
67 * blocking technique.
68 *
69 * Notes
70 * =====
71 *
72 * A description vector is associated with each 2D block-cyclicly dis-
73 * tributed matrix. This vector stores the information required to
74 * establish the mapping between a matrix entry and its corresponding
75 * process and memory location.
76 *
77 * In the following comments, the character _ should be read as
78 * "of the distributed matrix". Let A be a generic term for any 2D
79 * block cyclicly distributed matrix. Its description vector is DESC_A:
80 *
81 * NOTATION STORED IN EXPLANATION
82 * ---------------- --------------- ------------------------------------
83 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
84 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
85 * the NPROW x NPCOL BLACS process grid
86 * A is distributed over. The context
87 * itself is global, but the handle
88 * (the integer value) may vary.
89 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
90 * ted matrix A, M_A >= 0.
91 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
92 * buted matrix A, N_A >= 0.
93 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
94 * block of the matrix A, IMB_A > 0.
95 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
96 * left block of the matrix A,
97 * INB_A > 0.
98 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
99 * bute the last M_A-IMB_A rows of A,
100 * MB_A > 0.
101 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
102 * bute the last N_A-INB_A columns of
103 * A, NB_A > 0.
104 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
105 * row of the matrix A is distributed,
106 * NPROW > RSRC_A >= 0.
107 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
108 * first column of A is distributed.
109 * NPCOL > CSRC_A >= 0.
110 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
111 * array storing the local blocks of
112 * the distributed matrix A,
113 * IF( Lc( 1, N_A ) > 0 )
114 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
115 * ELSE
116 * LLD_A >= 1.
117 *
118 * Let K be the number of rows of a matrix A starting at the global in-
119 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
120 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
121 * receive if these K rows were distributed over NPROW processes. If K
122 * is the number of columns of a matrix A starting at the global index
123 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
124 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
125 * these K columns were distributed over NPCOL processes.
126 *
127 * The values of Lr() and Lc() may be determined via a call to the func-
128 * tion PB_Cnumroc:
129 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
130 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
131 *
132 * Arguments
133 * =========
134 *
135 * TYPE (local input) pointer to a PBTYP_T structure
136 * On entry, TYPE is a pointer to a structure of type PBTYP_T,
137 * that contains type information (See pblas.h).
138 *
139 * VARIANT (global input) pointer to CHAR
140 * On entry, VARIANT specifies whether the left- or right-loo-
141 * king variant of the algorithm should be used for the transpo-
142 * se cases only, that is TRANSA is not 'N' or 'n'. When VARIANT
143 * is 'L' or 'l', the left-looking variant is used, otherwise
144 * the right-looking algorithm is selected.
145 *
146 * SIDE (global input) pointer to CHAR
147 * On entry, SIDE specifies whether op( sub( A ) ) appears on
148 * the left or right of X as follows:
149 *
150 * SIDE = 'L' or 'l' op( sub( A ) )*X = alpha*sub( B ),
151 *
152 * SIDE = 'R' or 'r' X*op( sub( A ) ) = alpha*sub( B ).
153 *
154 * UPLO (global input) pointer to CHAR
155 * On entry, UPLO specifies whether the submatrix sub( A ) is
156 * an upper or lower triangular submatrix as follows:
157 *
158 * UPLO = 'U' or 'u' sub( A ) is an upper triangular
159 * submatrix,
160 *
161 * UPLO = 'L' or 'l' sub( A ) is a lower triangular
162 * submatrix.
163 *
164 * TRANSA (global input) pointer to CHAR
165 * On entry, TRANSA specifies the form of op( sub( A ) ) to be
166 * used in the matrix multiplication as follows:
167 *
168 * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ),
169 *
170 * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )',
171 *
172 * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ).
173 *
174 * DIAG (global input) pointer to CHAR
175 * On entry, DIAG specifies whether or not sub( A ) is unit
176 * triangular as follows:
177 *
178 * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian-
179 * gular,
180 *
181 * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri-
182 * angular.
183 *
184 * M (global input) INTEGER
185 * On entry, M specifies the number of rows of the submatrix
186 * sub( B ). M must be at least zero.
187 *
188 * N (global input) INTEGER
189 * On entry, N specifies the number of columns of the submatrix
190 * sub( B ). N must be at least zero.
191 *
192 * ALPHA (global input) pointer to CHAR
193 * On entry, ALPHA specifies the scalar alpha. When ALPHA is
194 * supplied as zero then the local entries of the array B
195 * corresponding to the entries of the submatrix sub( B ) need
196 * not be set on input.
197 *
198 * A (local input) pointer to CHAR
199 * On entry, A is an array of dimension (LLD_A, Ka), where Ka is
200 * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at
201 * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array
202 * contains the local entries of the matrix A.
203 * Before entry with UPLO = 'U' or 'u', this array contains the
204 * local entries corresponding to the entries of the upper tri-
205 * angular submatrix sub( A ), and the local entries correspon-
206 * ding to the entries of the strictly lower triangular part of
207 * the submatrix sub( A ) are not referenced.
208 * Before entry with UPLO = 'L' or 'l', this array contains the
209 * local entries corresponding to the entries of the lower tri-
210 * angular submatrix sub( A ), and the local entries correspon-
211 * ding to the entries of the strictly upper triangular part of
212 * the submatrix sub( A ) are not referenced.
213 * Note that when DIAG = 'U' or 'u', the local entries corres-
214 * ponding to the diagonal elements of the submatrix sub( A )
215 * are not referenced either, but are assumed to be unity.
216 *
217 * IA (global input) INTEGER
218 * On entry, IA specifies A's global row index, which points to
219 * the beginning of the submatrix sub( A ).
220 *
221 * JA (global input) INTEGER
222 * On entry, JA specifies A's global column index, which points
223 * to the beginning of the submatrix sub( A ).
224 *
225 * DESCA (global and local input) INTEGER array
226 * On entry, DESCA is an integer array of dimension DLEN_. This
227 * is the array descriptor for the matrix A.
228 *
229 * B (local input/local output) pointer to CHAR
230 * On entry, B is an array of dimension (LLD_B, Kb), where Kb is
231 * at least Lc( 1, JB+N-1 ). Before entry, this array contains
232 * the local entries of the matrix B.
233 * On exit, the local entries of this array corresponding to the
234 * to the entries of the submatrix sub( B ) are overwritten by
235 * the local entries of the m by n solution submatrix.
236 *
237 * IB (global input) INTEGER
238 * On entry, IB specifies B's global row index, which points to
239 * the beginning of the submatrix sub( B ).
240 *
241 * JB (global input) INTEGER
242 * On entry, JB specifies B's global column index, which points
243 * to the beginning of the submatrix sub( B ).
244 *
245 * DESCB (global and local input) INTEGER array
246 * On entry, DESCB is an integer array of dimension DLEN_. This
247 * is the array descriptor for the matrix B.
248 *
249 * -- Written on April 1, 1998 by
250 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
251 *
252 * ---------------------------------------------------------------------
253 */
254 /*
255 * .. Local Scalars ..
256 */
257  char conjg, * negone, * one, * talph, top, * zero;
258  int Afr, Bcol, Bcurcol, Bcurimb1, Bcurinb1, Bcurrow, Bfr, Bii,
259  Bimb, Bimb1, Binb, Binb1, Bjj, Bld, Bmb, Bmp, Bmp0, Bnb, Bnq,
260  Bnq0, Brow, WAfr, WAoff, WBfr, WBsum, ctxt, k, kb, kbb, kmax,
261  ktmp, lside, mn, mycol, myrow, notran, npcol, nprow, size,
262  upper;
263  GEMM_T gemm;
264  GSUM2D_T gsum2d;
265 /*
266 * .. Local Arrays ..
267 */
268  int Bd0[DLEN_], DBUFA[DLEN_], DBUFB[DLEN_], WAd[DLEN_],
269  WBd[DLEN_];
270  char * Aptr = NULL, * Bptr = NULL, * Bptr0 = NULL, * WA = NULL,
271  * WB = NULL;
272 /* ..
273 * .. Executable Statements ..
274 *
275 */
276  lside = ( Mupcase( SIDE [0] ) == CLEFT );
277  upper = ( Mupcase( UPLO [0] ) == CUPPER );
278  notran = ( Mupcase( TRANSA[0] ) == CNOTRAN );
279  size = TYPE->size; negone = TYPE->negone; one = TYPE->one;
280  zero = TYPE->zero; gsum2d = TYPE->Cgsum2d; gemm = TYPE->Fgemm;
281  talph = ALPHA;
282  kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
283 /*
284 * Retrieve process grid information
285 */
286  Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
287 /*
288 * Retrieve sub( B )'s local information: Bii, Bjj, Brow, Bcol, Bld ...
289 */
290  Bimb = DESCB[IMB_]; Binb = DESCB[INB_];
291  Bmb = DESCB[MB_ ]; Bnb = DESCB[NB_ ]; Bld = DESCB[LLD_];
292  PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj, &Brow,
293  &Bcol );
294 
295  Bimb1 = PB_Cfirstnb( M, IB, Bimb, Bmb );
296  Bmp0 = PB_Cnumroc( M, 0, Bimb1, Bmb, myrow, Brow, nprow );
297  Binb1 = PB_Cfirstnb( N, JB, Binb, Bnb );
298  Bnq0 = PB_Cnumroc( N, 0, Binb1, Bnb, mycol, Bcol, npcol );
299  if( ( Bmp0 > 0 ) && ( Bnq0 > 0 ) ) Bptr0 = Mptr( B, Bii, Bjj, Bld, size );
300 
301  if( notran )
302  {
303  if( lside )
304  {
305  if( upper )
306  {
307  kmax = ( ( M - 1 ) / kb ) * kb;
308 
309  for( k = kmax; k >= 0; k -= kb )
310  {
311  kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
312 /*
313 * Accumulate A( IA:IA+ktmp-1, JA+k:JA+k+kbb-1 )
314 */
315  PB_CGatherV( TYPE, REUSE, BACKWARD, ktmp, kbb, A, IA, JA+k,
316  DESCA, COLUMN, &Aptr, DBUFA, &Afr );
317 /*
318 * Replicate A(IA:IA+ktmp-1, JA+k:JA+k+kbb-1) over B(IB:IB+ktmp-1, JB:JB+N-1)
319 */
320  PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
321  ctxt, Bld );
322  PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
323  DBUFA, COLUMN, &WA, WAd, &WAfr );
324 /*
325 * Solve B( IB+k:IB+ktmp-1, JB:JB+N-1 ) with talph
326 */
327  PB_CptrsmAB0( TYPE, SIDE, UPLO, DIAG, kbb, N, talph, WA, k, 0,
328  WAd, B, IB+k, JB, DESCB, &Bptr, DBUFB, &Bfr );
329 /*
330 * Update B( IB:IB+k-1, JB:JB+N-1 )
331 */
332  if( k > 0 )
333  {
334 /*
335 * Replicate B( IB+k:IB+ktmp-1, JB:JB+N-1 ) over B( IB:IB+k-1, JB:JB+N-1 )
336 */
337  PB_Cdescset( Bd0, k, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
338  ctxt, Bld );
339  PB_CInV( TYPE, NOCONJG, ROW, k, N, Bd0, kbb, Bptr, 0, 0,
340  DBUFB, ROW, &WB, WBd, &WBfr );
341 /*
342 * Local update
343 */
344  Bmp = PB_Cnumroc( k, 0, Bimb1, Bmb, myrow, Brow, nprow );
345  if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
346  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0,
347  &kbb, negone, WA, &WAd[LLD_], WB, &WBd[LLD_], talph,
348  Bptr0, &Bld );
349  if( WBfr ) free( WB );
350  talph = one;
351  }
352  if( WAfr ) free( WA );
353  if( Bfr ) free( Bptr );
354  if( Afr ) free( Aptr );
355  }
356  }
357  else
358  {
359  for( k = 0; k < M; k += kb )
360  {
361  ktmp = M - k; kbb = MIN( ktmp, kb );
362 /*
363 * Accumulate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 )
364 */
365  PB_CGatherV( TYPE, REUSE, FORWARD, ktmp, kbb, A, IA+k, JA+k,
366  DESCA, COLUMN, &Aptr, DBUFA, &Afr );
367 /*
368 * Replicate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over B( IB+k:IB+M-1, JB:JB+N-1 )
369 */
370  Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb );
371  Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
372  PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
373  Bcol, ctxt, Bld );
374  PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
375  DBUFA, COLUMN, &WA, WAd, &WAfr );
376 /*
377 * Solve B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) with talph
378 */
379  PB_CptrsmAB0( TYPE, SIDE, UPLO, DIAG, kbb, N, talph, WA, 0, 0,
380  WAd, B, IB+k, JB, DESCB, &Bptr, DBUFB, &Bfr );
381 /*
382 * Update B( IB+k+kbb:IB+M-1, JB:JB+N-1 )
383 */
384  if( ( ktmp = ktmp - kbb ) > 0 )
385  {
386 /*
387 * Replicate B(IB+k:IB+k+kbb-1, JB:JB+N-1) over B(IB+k+kbb:IB+M-1, JB:JB+N-1)
388 */
389  Bcurimb1 = PB_Cfirstnb( ktmp, IB+k+kbb, Bimb, Bmb );
390  Bcurrow = PB_Cindxg2p( k+kbb, Bimb1, Bmb, Brow, Brow,
391  nprow );
392  PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
393  Bcol, ctxt, Bld );
394  PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0,
395  DBUFB, ROW, &WB, WBd, &WBfr );
396 /*
397 * Local update
398 */
399  Bmp = PB_Cnumroc( ktmp, k+kbb, Bimb1, Bmb, myrow, Brow,
400  nprow );
401  if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
402  {
403  WAoff = PB_Cnumroc( kbb, 0, WAd[IMB_], WAd[MB_], myrow,
404  WAd[RSRC_], nprow );
405  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0,
406  &kbb, negone, Mptr( WA, WAoff, 0, WAd[LLD_], size ),
407  &WAd[LLD_], WB, &WBd[LLD_], talph, Mptr( Bptr0,
408  Bmp0-Bmp, 0, Bld, size ), &Bld );
409  }
410  if( WBfr ) free( WB );
411  talph = one;
412  }
413  if( WAfr ) free( WA );
414  if( Bfr ) free( Bptr );
415  if( Afr ) free( Aptr );
416  }
417  }
418  }
419  else
420  {
421  if( upper )
422  {
423  for( k = 0; k < N; k += kb )
424  {
425  ktmp = N - k; kbb = MIN( ktmp, kb );
426 /*
427 * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )
428 */
429  PB_CGatherV( TYPE, REUSE, FORWARD, kbb, ktmp, A, IA+k, JA+k,
430  DESCA, ROW, &Aptr, DBUFA, &Afr );
431 /*
432 * Replicate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over B( IB:IB+M-1, JB+k:JB+N-1 )
433 */
434  Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb );
435  Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
436  PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
437  Bcurcol, ctxt, Bld );
438  PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
439  DBUFA, ROW, &WA, WAd, &WAfr );
440 /*
441 * Solve B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) with talph
442 */
443  PB_CptrsmAB0( TYPE, SIDE, UPLO, DIAG, M, kbb, talph, WA, 0, 0,
444  WAd, B, IB, JB+k, DESCB, &Bptr, DBUFB, &Bfr );
445 /*
446 * Update B( IB:IB+M-1, JB+k+kbb:JB+N-1 )
447 */
448  if( ( ktmp = ktmp - kbb ) > 0 )
449  {
450 /*
451 * Replicate B(IB:IB+M-1, JB+k:JB+k+kbb-1) over B(IB:IB+M-1, JB+k+kbb:JB+N-1)
452 */
453  Bcurinb1 = PB_Cfirstnb( ktmp, JB+k+kbb, Binb, Bnb );
454  Bcurcol = PB_Cindxg2p( k+kbb, Binb1, Bnb, Bcol, Bcol,
455  npcol );
456  PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
457  Bcurcol, ctxt, Bld );
458  PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr, 0, 0,
459  DBUFB, COLUMN, &WB, WBd, &WBfr );
460 /*
461 * Local update
462 */
463  Bnq = PB_Cnumroc( ktmp, k+kbb, Binb1, Bnb, mycol, Bcol,
464  npcol );
465  if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
466  {
467  WAoff = PB_Cnumroc( kbb, 0, WAd[INB_], WAd[NB_], mycol,
468  WAd[CSRC_], npcol );
469  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq,
470  &kbb, negone, WB, &WBd[LLD_], Mptr( WA, 0, WAoff,
471  WAd[LLD_], size ), &WAd[LLD_], talph, Mptr( Bptr0,
472  0, Bnq0-Bnq, Bld, size ), &Bld );
473  }
474  if( WBfr ) free( WB );
475  talph = one;
476  }
477  if( WAfr ) free( WA );
478  if( Bfr ) free( Bptr );
479  if( Afr ) free( Aptr );
480  }
481  }
482  else
483  {
484  kmax = ( ( N - 1 ) / kb ) * kb;
485 
486  for( k = kmax; k >= 0; k -= kb )
487  {
488  kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
489 /*
490 * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+ktmp-1 )
491 */
492  PB_CGatherV( TYPE, REUSE, BACKWARD, kbb, ktmp, A, IA+k, JA,
493  DESCA, ROW, &Aptr, DBUFA, &Afr );
494 /*
495 * Replicate A( IA+k:IA+k+kbb-1, JA:JA+ktmp-1 ) over B(IB:IB+M-1, JB:JB+ktmp-1)
496 */
497  PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
498  ctxt, Bld );
499  PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
500  DBUFA, ROW, &WA, WAd, &WAfr );
501 /*
502 * Solve B( IB:IB+M-1, JB+k:JB+ktmp-1 ) with talph
503 */
504  PB_CptrsmAB0( TYPE, SIDE, UPLO, DIAG, M, kbb, talph, WA, 0, k,
505  WAd, B, IB, JB+k, DESCB, &Bptr, DBUFB, &Bfr );
506 /*
507 * Update B( IB:IB+M-1, JB:JB+k-1 )
508 */
509  if( k > 0 )
510  {
511 /*
512 * Replicate B( IB:IB+M-1, JB+k:JB+ktmp-1 ) over B( IB:IB+M-1, JB:JB+k-1 )
513 */
514  PB_Cdescset( Bd0, M, k, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
515  ctxt, Bld );
516  PB_CInV( TYPE, NOCONJG, COLUMN, M, k, Bd0, kbb, Bptr, 0, 0,
517  DBUFB, COLUMN, &WB, WBd, &WBfr );
518 /*
519 * Local update
520 */
521  Bnq = PB_Cnumroc( k, 0, Binb1, Bnb, mycol, Bcol, npcol );
522  if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
523  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq,
524  &kbb, negone, WB, &WBd[LLD_], WA, &WAd[LLD_], talph,
525  Bptr0, &Bld );
526  if( WBfr ) free( WB );
527  talph = one;
528  }
529  if( WAfr ) free( WA );
530  if( Bfr ) free( Bptr );
531  if( Afr ) free( Aptr );
532  }
533  }
534  }
535  }
536  else
537  {
538  if( Mupcase( VARIANT[0] ) == CRIGHT )
539  {
540 /*
541 * Right looking variant for the transpose cases
542 */
543  conjg = ( ( Mupcase( TRANSA[0] ) == CCOTRAN ) ? CCONJG : CNOCONJG );
544 
545  if( lside )
546  {
547  if( !upper )
548  {
549 /*
550 * Left Lower (Conjugate) Transpose
551 */
552  kmax = ( ( M - 1 ) / kb ) * kb;
553 
554  for( k = kmax; k >= 0; k -= kb )
555  {
556  kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
557 /*
558 * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+ktmp-1 )
559 */
560  PB_CGatherV( TYPE, REUSE, BACKWARD, kbb, ktmp, A, IA+k, JA,
561  DESCA, ROW, &Aptr, DBUFA, &Afr );
562 /*
563 * Replicate A( IA+k:IA+k+kbb-1, JA:JA+ktmp-1 )' over B(IB:IB+ktmp-1, JB:JB+N-1)
564 */
565  PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
566  ctxt, Bld );
567  PB_CInV( TYPE, &conjg, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
568  DBUFA, ROW, &WA, WAd, &WAfr );
569 /*
570 * Solve B( IB+k:IB+ktmp-1, JB:JB+N-1 ) with talph
571 */
572  PB_CptrsmAB0( TYPE, SIDE, UPPER, DIAG, kbb, N, talph, WA, k,
573  0, WAd, B, IB+k, JB, DESCB, &Bptr, DBUFB,
574  &Bfr );
575 /*
576 * Update B( IB:IB+k-1, JB:JB+N-1 )
577 */
578  if( k > 0 )
579  {
580 /*
581 * Replicate B( IB+k:IB+ktmp-1, JB:JB+N-1 ) over B( IB:IB+k-1, JB:JB+N-1 )
582 */
583  PB_Cdescset( Bd0, k, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
584  ctxt, Bld );
585  PB_CInV( TYPE, NOCONJG, ROW, k, N, Bd0, kbb, Bptr, 0, 0,
586  DBUFB, ROW, &WB, WBd, &WBfr );
587 /*
588 * Local update
589 */
590  Bmp = PB_Cnumroc( k, 0, Bimb1, Bmb, myrow, Brow, nprow );
591  if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
592  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp,
593  &Bnq0, &kbb, negone, WA, &WAd[LLD_], WB,
594  &WBd[LLD_], talph, Bptr0, &Bld );
595  if( WBfr ) free( WB );
596  talph = one;
597  }
598  if( WAfr ) free( WA );
599  if( Bfr ) free( Bptr );
600  if( Afr ) free( Aptr );
601  }
602  }
603  else
604  {
605 /*
606 * Left Upper (Conjugate) Transpose
607 */
608  for( k = 0; k < M; k += kb )
609  {
610  ktmp = M - k; kbb = MIN( ktmp, kb );
611 /*
612 * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+M-1 )
613 */
614  PB_CGatherV( TYPE, REUSE, FORWARD, kbb, ktmp, A, IA+k, JA+k,
615  DESCA, ROW, &Aptr, DBUFA, &Afr );
616 /*
617 * Replicate A( IA+k:IA+k+kbb-1, JA+k:JA+M-1 )' over B( IB+k:IB+M-1, JB:JB+N-1 )
618 */
619  Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb );
620  Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
621  PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
622  Bcol, ctxt, Bld );
623  PB_CInV( TYPE, &conjg, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
624  DBUFA, ROW, &WA, WAd, &WAfr );
625 /*
626 * Solve B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) with talph
627 */
628  PB_CptrsmAB0( TYPE, SIDE, LOWER, DIAG, kbb, N, talph, WA, 0,
629  0, WAd, B, IB+k, JB, DESCB, &Bptr, DBUFB,
630  &Bfr );
631 /*
632 * Update B( IB+k+kbb:IB+M-1, JB:JB+N-1 )
633 */
634  if( ( ktmp = ktmp - kbb ) > 0 )
635  {
636 /*
637 * Replicate B(IB+k:IB+k+kbb-1, JB:JB+N-1) over B(IB+k+kbb:IB+M-1, JB:JB+N-1)
638 */
639  Bcurimb1 = PB_Cfirstnb( ktmp, IB+k+kbb, Bimb, Bmb );
640  Bcurrow = PB_Cindxg2p( k+kbb, Bimb1, Bmb, Brow, Brow,
641  nprow );
642  PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb,
643  Bcurrow, Bcol, ctxt, Bld );
644  PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0,
645  DBUFB, ROW, &WB, WBd, &WBfr );
646 /*
647 * Local update
648 */
649  Bmp = PB_Cnumroc( ktmp, k+kbb, Bimb1, Bmb, myrow, Brow,
650  nprow );
651  if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
652  {
653  WAoff = PB_Cnumroc( kbb, 0, WAd[IMB_], WAd[MB_], myrow,
654  WAd[RSRC_], nprow );
655  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp,
656  &Bnq0, &kbb, negone, Mptr( WA, WAoff, 0,
657  WAd[LLD_], size ), &WAd[LLD_], WB, &WBd[LLD_],
658  talph, Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ),
659  &Bld );
660  }
661  if( WBfr ) free( WB );
662  talph = one;
663  }
664  if( WAfr ) free( WA );
665  if( Bfr ) free( Bptr );
666  if( Afr ) free( Aptr );
667  }
668  }
669  }
670  else
671  {
672  if( !upper )
673  {
674 /*
675 * Right Lower (Conjugate) Transpose
676 */
677  for( k = 0; k < N; k += kb )
678  {
679  ktmp = N - k; kbb = MIN( ktmp, kb );
680 /*
681 * Accumulate A( IA+k:IA+N-1, JA+k:JA+k+kbb-1 )
682 */
683  PB_CGatherV( TYPE, REUSE, FORWARD, ktmp, kbb, A, IA+k, JA+k,
684  DESCA, COLUMN, &Aptr, DBUFA, &Afr );
685 /*
686 * Replicate A( IA+k:IA+N-1, JA+k:JA+k+kbb-1 )' over B( IB:IB+M-1, JB+k:JB+N-1 )
687 */
688  Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb );
689  Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
690  PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
691  Bcurcol, ctxt, Bld );
692  PB_CInV( TYPE, &conjg, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
693  DBUFA, COLUMN, &WA, WAd, &WAfr );
694 /*
695 * Solve B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) with talph
696 */
697  PB_CptrsmAB0( TYPE, SIDE, UPPER, DIAG, M, kbb, talph, WA, 0,
698  0, WAd, B, IB, JB+k, DESCB, &Bptr, DBUFB,
699  &Bfr );
700 /*
701 * Update B( IB:IB+M-1, JB+k+kbb:JB+N-1 )
702 */
703  if( ( ktmp = ktmp - kbb ) > 0 )
704  {
705 /*
706 * Replicate B(IB:IB+M-1, JB+k:JB+k+kbb-1) over B(IB:IB+M-1, JB+k+kbb:JB+N-1)
707 */
708  Bcurinb1 = PB_Cfirstnb( ktmp, JB+k+kbb, Binb, Bnb );
709  Bcurcol = PB_Cindxg2p( k+kbb, Binb1, Bnb, Bcol, Bcol,
710  npcol );
711  PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
712  Bcurcol, ctxt, Bld );
713  PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr,
714  0, 0, DBUFB, COLUMN, &WB, WBd, &WBfr );
715 /*
716 * Local update
717 */
718  Bnq = PB_Cnumroc( ktmp, k+kbb, Binb1, Bnb, mycol, Bcol,
719  npcol );
720  if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
721  {
722  WAoff = PB_Cnumroc( kbb, 0, WAd[INB_], WAd[NB_], mycol,
723  WAd[CSRC_], npcol );
724  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0,
725  &Bnq, &kbb, negone, WB, &WBd[LLD_], Mptr( WA, 0,
726  WAoff, WAd[LLD_], size ), &WAd[LLD_], talph,
727  Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ), &Bld );
728  }
729  if( WBfr ) free( WB );
730  talph = one;
731  }
732  if( WAfr ) free( WA );
733  if( Bfr ) free( Bptr );
734  if( Afr ) free( Aptr );
735  }
736  }
737  else
738  {
739 /*
740 * Right Upper (Conjugate) Transpose
741 */
742  kmax = ( ( N - 1 ) / kb ) * kb;
743 
744  for( k = kmax; k >= 0; k -= kb )
745  {
746  kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
747 /*
748 * Accumulate A( IA:IA+ktmp-1, JA+k:JA+k+kbb-1 )
749 */
750  PB_CGatherV( TYPE, REUSE, BACKWARD, ktmp, kbb, A, IA, JA+k,
751  DESCA, COLUMN, &Aptr, DBUFA, &Afr );
752 /*
753 * Replicate A( IA:IA+ktmp-1, JA+k:JA+k+kbb-1 )' over B(IB:IB+M-1, JB:JB+ktmp-1)
754 */
755  PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
756  ctxt, Bld );
757  PB_CInV( TYPE, &conjg, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
758  DBUFA, COLUMN, &WA, WAd, &WAfr );
759 /*
760 * Solve B( IB:IB+M-1, JB+k:JB+ktmp-1 ) with talph
761 */
762  PB_CptrsmAB0( TYPE, SIDE, LOWER, DIAG, M, kbb, talph, WA, 0,
763  k, WAd, B, IB, JB+k, DESCB, &Bptr, DBUFB,
764  &Bfr );
765 /*
766 * Update B( IB:IB+M-1, JB:JB+k-1 )
767 */
768  if( k > 0 )
769  {
770 /*
771 * Replicate B( IB:IB+M-1, JB+k:JB+ktmp-1 ) over B( IB:IB+M-1, JB:JB+k-1 )
772 */
773  PB_Cdescset( Bd0, M, k, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
774  ctxt, Bld );
775  PB_CInV( TYPE, NOCONJG, COLUMN, M, k, Bd0, kbb, Bptr, 0, 0,
776  DBUFB, COLUMN, &WB, WBd, &WBfr );
777 /*
778 * Local update
779 */
780  Bnq = PB_Cnumroc( k, 0, Binb1, Bnb, mycol, Bcol, npcol );
781  if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
782  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0,
783  &Bnq, &kbb, negone, WB, &WBd[LLD_], WA,
784  &WAd[LLD_], talph, Bptr0, &Bld );
785  if( WBfr ) free( WB );
786  talph = one;
787  }
788  if( WAfr ) free( WA );
789  if( Bfr ) free( Bptr );
790  if( Afr ) free( Aptr );
791  }
792  }
793  }
794  }
795  else
796  {
797 /*
798 * Left looking variant for the transpose cases
799 */
800  if( lside )
801  {
802  top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
803 
804  if( upper )
805  {
806 /*
807 * Accumulate A( IA:IA+Bimb1-1, JA:JA+Bimb1-1 )
808 */
809  PB_CGatherV( TYPE, REUSE, FORWARD, Bimb1, Bimb1, A, IA, JA,
810  DESCA, COLUMN, &Aptr, DBUFA, &Afr );
811 /*
812 * Replicate A( IA:IA+Bimb1-1, JA:JA+Bimb1-1 ) over B(IB:IB+Bimb1-1, JB:JB+N-1)
813 */
814  PB_Cdescset( Bd0, Bimb1, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
815  ctxt, Bld );
816  PB_CInV( TYPE, NOCONJG, COLUMN, Bimb1, N, Bd0, Bimb1, Aptr, 0, 0,
817  DBUFA, COLUMN, &WA, WAd, &WAfr );
818 /*
819 * Solve B( IB:IB+Bimb1-1, JB:JB+N-1 )
820 */
821  if( ( ( Brow < 0 ) || ( myrow == Brow ) ) && ( Bnq0 > 0 ) )
822  TYPE->Ftrsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ),
823  C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ),
824  &Bimb1, &Bnq0, ALPHA, WA, &WAd[LLD_],
825  Bptr0, &Bld );
826  if( WAfr ) free( WA );
827  if( Afr ) free( Aptr );
828 /*
829 * Update and solve remaining rows of sub( B )
830 */
831  for( k = Bimb1; k < M; k += kb )
832  {
833  kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
834 /*
835 * Accumulate A( IA:IA+ktmp-1, JA+k:JA+ktmp-1 )
836 */
837  PB_CGatherV( TYPE, REUSE, FORWARD, ktmp, kbb, A, IA, JA+k,
838  DESCA, COLUMN, &Aptr, DBUFA, &Afr );
839 /*
840 * Replicate A( IA:IA+ktmp-1, JA+k:JA+ktmp-1 ) over B(IB:IB+ktmp-1, JB:JB+N-1)
841 */
842  PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
843  ctxt, Bld );
844  PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
845  DBUFA, COLUMN, &WA, WAd, &WAfr );
846 /*
847 * WB := A( IA:IA+k-1, JA+k:JA+ktmp-1 )' * B( IB:IB+k-1, JB:JB+N-1 )
848 */
849  PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WB, WBd, &WBfr,
850  &WBsum );
851  Bmp = PB_Cnumroc( k, 0, Bimb1, Bmb, myrow, Brow, nprow );
852  if( ( Bnq0 > 0 ) && ( Bmp > 0 ) )
853  gemm( C2F_CHAR( TRANSA ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0,
854  &Bmp, one, WA, &WAd[LLD_], Bptr0, &Bld, zero, WB,
855  &WBd[LLD_] );
856  if( WBsum )
857  {
858  WBd[RSRC_] = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow,
859  nprow );
860  if( Bnq0 > 0 )
861  gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WB, WBd[LLD_],
862  WBd[RSRC_], mycol );
863  }
864 /*
865 * Add WB to B( IB+k:IB+ktmp-1, JB:JB+N-1 ) and solve it with
866 * A( IA+k:IA+ktmp-1, JA+k:JA+ktmp-1 )
867 */
868  PB_CptrsmAB1( TYPE, SIDE, UPLO, TRANSA, DIAG, kbb, N, ALPHA,
869  WA, k, 0, WAd, B, IB+k, JB, DESCB, WB, WBd );
870  if( WBfr ) free( WB );
871  if( WAfr ) free( WA );
872  if( Afr ) free( Aptr );
873  }
874  }
875  else
876  {
877 /*
878 * Solve last block of rows of sub( B )
879 */
880  Bcurimb1 = PB_Clastnb( M, IB, Bimb, Bmb );
881  k = M - Bcurimb1;
882 /*
883 * Accumulate A( IA+k:IA+M-1, JA+k:JA+M-1 )
884 */
885  PB_CGatherV( TYPE, REUSE, BACKWARD, Bcurimb1, Bcurimb1, A, IA+k,
886  JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr );
887 /*
888 * Replicate A( IA+k:IA+M-1, JA+k:JA+M-1 ) over B( IB+k:IB+M-1, JB:JB+N-1 )
889 */
890  Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
891  PB_Cdescset( Bd0, Bcurimb1, N, Bcurimb1, Binb1, Bmb, Bnb,
892  Bcurrow, Bcol, ctxt, Bld );
893  PB_CInV( TYPE, NOCONJG, COLUMN, Bcurimb1, N, Bd0, Bcurimb1, Aptr,
894  0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr );
895 /*
896 * Solve B( IB+k:IB+M-1, JB:JB+N-1 )
897 */
898  if( ( ( Brow < 0 ) || ( myrow == Bcurrow ) ) && ( Bnq0 > 0 ) )
899  TYPE->Ftrsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ),
900  C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ),
901  &Bcurimb1, &Bnq0, ALPHA, WA, &WAd[LLD_],
902  Mptr( Bptr0, Bmp0-Bcurimb1, 0, Bld, size ),
903  &Bld );
904  if( WAfr ) free( WA );
905  if( Afr ) free( Aptr );
906  if( ( mn = M - Bcurimb1 ) <= 0 ) return;
907 /*
908 * Update and solve remaining rows of sub( B )
909 */
910  kmax = ( ( mn - 1 ) / kb ) * kb;
911 
912  for( k = kmax; k >= 0; k -= kb )
913  {
914  ktmp = M - k; kbb = mn - k; kbb = MIN( kbb, kb );
915 /*
916 * Accumulate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 )
917 */
918  PB_CGatherV( TYPE, REUSE, BACKWARD, ktmp, kbb, A, IA+k, JA+k,
919  DESCA, COLUMN, &Aptr, DBUFA, &Afr );
920 /*
921 * Replicate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over B( IB+k:IB+M-1, JB:JB+N-1 )
922 */
923  Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb );
924  Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
925  PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
926  Bcol, ctxt, Bld );
927  PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
928  DBUFA, COLUMN, &WA, WAd, &WAfr );
929 /*
930 * WB := A( IA+k+kbb:IA+M-1, JA+k:JA+k+kbb-1 )'* B( IB+k+kbb:IB+M-1, JB:JB+N-1 )
931 */
932  PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WB, WBd, &WBfr,
933  &WBsum );
934  Bmp = PB_Cnumroc( ktmp-kbb, k+kbb, Bimb1, Bmb, myrow, Brow,
935  nprow );
936  if( ( Bnq0 > 0 ) && ( Bmp > 0 ) )
937  {
938  WAoff = PB_Cnumroc( kbb, 0, WAd[IMB_], WAd[MB_], myrow,
939  WAd[RSRC_], nprow );
940  gemm( C2F_CHAR( TRANSA ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0,
941  &Bmp, one, Mptr( WA, WAoff, 0, WAd[LLD_], size ),
942  &WAd[LLD_], Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ),
943  &Bld, zero, WB, &WBd[LLD_] );
944  }
945  if( WBsum )
946  {
947  WBd[RSRC_] = PB_Cindxg2p( k + kbb - 1, Bimb1, Bmb, Brow,
948  Brow, nprow );
949  if( Bnq0 > 0 )
950  gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WB, WBd[LLD_],
951  WBd[RSRC_], mycol );
952  }
953 /*
954 * Add WB to B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) and solve it with
955 * A( IA+k:IA+k+kbb-1, JA+k:JA+k+kbb-1 )
956 */
957  PB_CptrsmAB1( TYPE, SIDE, UPLO, TRANSA, DIAG, kbb, N, ALPHA,
958  WA, 0, 0, WAd, B, IB+k, JB, DESCB, WB, WBd );
959  if( WBfr ) free( WB );
960  if( WAfr ) free( WA );
961  if( Afr ) free( Aptr );
962  }
963  }
964  }
965  else
966  {
967  top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
968 
969  if( upper )
970  {
971 /*
972 * Solve last block of columns of sub( B )
973 */
974  Bcurinb1 = PB_Clastnb( N, JB, Binb, Bnb );
975  k = N - Bcurinb1;
976 /*
977 * Accumulate A( IA+k:IA+N-1, JA+k:JA+N-1 )
978 */
979  PB_CGatherV( TYPE, REUSE, BACKWARD, Bcurinb1, Bcurinb1, A, IA+k,
980  JA+k, DESCA, ROW, &Aptr, DBUFA, &Afr );
981 /*
982 * Replicate A( IA+k:IA+N-1, JA+k:JA+N-1 ) over B( IB:IB+M-1, JB+k:JB+N-1 )
983 */
984  Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
985  PB_Cdescset( Bd0, M, Bcurinb1, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
986  Bcurcol, ctxt, Bld );
987  PB_CInV( TYPE, NOCONJG, ROW, M, Bcurinb1, Bd0, Bcurinb1, Aptr,
988  0, 0, DBUFA, ROW, &WA, WAd, &WAfr );
989 /*
990 * Solve B( IB:IB+M-1, JB+k:JB+N-1 )
991 */
992  if( ( ( Bcol < 0 ) || ( mycol == Bcurcol ) ) && ( Bmp0 > 0 ) )
993  TYPE->Ftrsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ),
994  C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ),
995  &Bmp0, &Bcurinb1, ALPHA, WA, &WAd[LLD_],
996  Mptr( Bptr0, 0, Bnq0-Bcurinb1, Bld, size ),
997  &Bld );
998  if( WAfr ) free( WA );
999  if( Afr ) free( Aptr );
1000  if( ( mn = N - Bcurinb1 ) <= 0 ) return;
1001 /*
1002 * Update and solve remaining columns of sub( B )
1003 */
1004  kmax = ( ( mn - 1 ) / kb ) * kb;
1005 
1006  for( k = kmax; k >= 0; k -= kb )
1007  {
1008  ktmp = N - k; kbb = mn - k; kbb = MIN( kbb, kb );
1009 /*
1010 * Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )
1011 */
1012  PB_CGatherV( TYPE, REUSE, BACKWARD, kbb, ktmp, A, IA+k, JA+k,
1013  DESCA, ROW, &Aptr, DBUFA, &Afr );
1014 /*
1015 * Replicate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over B( IB:IB+M-1, JB+k:JB+N-1 )
1016 */
1017  Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb );
1018  Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
1019  PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
1020  Bcurcol, ctxt, Bld );
1021  PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
1022  DBUFA, ROW, &WA, WAd, &WAfr );
1023 /*
1024 * WB := B( IB:IB+M-1, JB+k+kbb:JB+N-1 ) * A(IA+k:IA+k+kbb-1, JA+k+kbb:JA+N-1)'
1025 */
1026  PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WB, WBd,
1027  &WBfr, &WBsum );
1028  Bnq = PB_Cnumroc( ktmp-kbb, k+kbb, Binb1, Bnb, mycol, Bcol,
1029  npcol );
1030  if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
1031  {
1032  WAoff = PB_Cnumroc( kbb, 0, WAd[INB_], WAd[NB_], mycol,
1033  WAd[CSRC_], npcol );
1034  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRANSA ), &Bmp0, &kbb,
1035  &Bnq, one, Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ),
1036  &Bld, Mptr( WA, 0, WAoff, WAd[LLD_], size ),
1037  &WAd[LLD_], zero, WB, &WBd[LLD_] );
1038  }
1039  if( WBsum )
1040  {
1041  WBd[CSRC_] = PB_Cindxg2p( k + kbb - 1, Binb1, Bnb, Bcol,
1042  Bcol, npcol );
1043  if( Bmp0 > 0 )
1044  gsum2d( ctxt, ROW, &top, Bmp0, kbb, WB, WBd[LLD_],
1045  myrow, WBd[CSRC_] );
1046  }
1047 /*
1048 * Add WB to B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) and solve it with
1049 * A( IA+k:IA+k+kbb-1, JA+k:JA+k+kbb-1 )
1050 */
1051  PB_CptrsmAB1( TYPE, SIDE, UPLO, TRANSA, DIAG, M, kbb, ALPHA,
1052  WA, 0, 0, WAd, B, IB, JB+k, DESCB, WB, WBd );
1053  if( WBfr ) free( WB );
1054  if( WAfr ) free( WA );
1055  if( Afr ) free( Aptr );
1056  }
1057  }
1058  else
1059  {
1060 /*
1061 * Accumulate A( IA:IA+Binb1-1, JA:JA+Binb1-1 )
1062 */
1063  PB_CGatherV( TYPE, REUSE, FORWARD, Binb1, Binb1, A, IA, JA,
1064  DESCA, ROW, &Aptr, DBUFA, &Afr );
1065 /*
1066 * Replicate A( IA:IA+Binb1-1, JA:JA+Binb1-1 ) over B(IB:IB+M-1, JB:JB+Binb1-1)
1067 */
1068  PB_Cdescset( Bd0, M, Binb1, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
1069  ctxt, Bld );
1070  PB_CInV( TYPE, NOCONJG, ROW, M, Binb1, Bd0, Binb1, Aptr, 0, 0,
1071  DBUFA, ROW, &WA, WAd, &WAfr );
1072 /*
1073 * Solve B( IB:IB+M-1, JB:JB+Binb1-1 )
1074 */
1075  if( ( ( Bcol < 0 ) || ( mycol == Bcol ) ) && ( Bmp0 > 0 ) )
1076  TYPE->Ftrsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ),
1077  C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ),
1078  &Bmp0, &Binb1, ALPHA, WA, &WAd[LLD_],
1079  Bptr0, &Bld );
1080  if( WAfr ) free( WA );
1081  if( Afr ) free( Aptr );
1082 /*
1083 * Update and solve remaining columns of sub( B )
1084 */
1085  for( k = Binb1; k < N; k += kb )
1086  {
1087  kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
1088 /*
1089 * Accumulate A( IA+k:IA+ktmp-1, JA:JA+ktmp-1 )
1090 */
1091  PB_CGatherV( TYPE, REUSE, FORWARD, kbb, ktmp, A, IA+k, JA,
1092  DESCA, ROW, &Aptr, DBUFA, &Afr );
1093 /*
1094 * Replicate A( IA+k:IA+ktmp-1, JA:JA+ktmp-1 ) over B( IB:IB+M-1, JB:JB+ktmp-1 )
1095 */
1096  PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
1097  ctxt, Bld );
1098  PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
1099  DBUFA, ROW, &WA, WAd, &WAfr );
1100 /*
1101 * WB := B( IB:IB+M-1, JB:JB+k-1 ) * A( IA+k:IA+ktmp-1, JA:JA+k-1 )'
1102 */
1103  PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WB, WBd,
1104  &WBfr, &WBsum );
1105  Bnq = PB_Cnumroc( k, 0, Binb1, Bnb, mycol, Bcol, npcol );
1106  if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
1107  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRANSA ), &Bmp0, &kbb,
1108  &Bnq, one, Bptr0, &Bld, WA, &WAd[LLD_], zero, WB,
1109  &WBd[LLD_] );
1110  if( WBsum )
1111  {
1112  WBd[CSRC_] = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol,
1113  npcol );
1114  if( Bmp0 > 0 )
1115  gsum2d( ctxt, ROW, &top, Bmp0, kbb, WB, WBd[LLD_],
1116  myrow, WBd[CSRC_] );
1117  }
1118 /*
1119 * Add WB to B( IB:IB+M-1, JB+k:JB+ktmp-1 ) and solve it with
1120 * A( IA+k:IA+ktmp-1, JA+k:JA+ktmp-1 )
1121 */
1122  PB_CptrsmAB1( TYPE, SIDE, UPLO, TRANSA, DIAG, M, kbb, ALPHA,
1123  WA, 0, k, WAd, B, IB, JB+k, DESCB, WB, WBd );
1124  if( WAfr ) free( WA );
1125  if( Afr ) free( Aptr );
1126  if( WBfr ) free( WB );
1127  }
1128  }
1129  }
1130  }
1131  }
1132 /*
1133 * End of PB_CptrsmAB
1134 */
1135 }
PB_CptrsmAB
void PB_CptrsmAB(PBTYP_T *TYPE, char *VARIANT, char *SIDE, char *UPLO, char *TRANSA, char *DIAG, int M, int N, char *ALPHA, char *A, int IA, int JA, int *DESCA, char *B, int IB, int JB, int *DESCB)
Definition: PB_CptrsmAB.c:25
CCONJG
#define CCONJG
Definition: PBblas.h:21
TYPE
#define TYPE
Definition: clamov.c:7
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
CRIGHT
#define CRIGHT
Definition: PBblas.h:30
BACKWARD
#define BACKWARD
Definition: PBblas.h:65
CCOTRAN
#define CCOTRAN
Definition: PBblas.h:22
NOCONJG
#define NOCONJG
Definition: PBblas.h:45
PB_Cfirstnb
int PB_Cfirstnb()
DLEN_
#define DLEN_
Definition: PBtools.h:48
PB_Clastnb
int PB_Clastnb()
GEMM_T
F_VOID_FCT(* GEMM_T)()
Definition: pblas.h:313
PB_CptrsmAB1
void PB_CptrsmAB1()
NOTRAN
#define NOTRAN
Definition: PBblas.h:44
LLD_
#define LLD_
Definition: PBtools.h:47
CNOCONJG
#define CNOCONJG
Definition: PBblas.h:19
PB_CptrsmAB0
void PB_CptrsmAB0()
UPPER
#define UPPER
Definition: PBblas.h:52
IMB_
#define IMB_
Definition: PBtools.h:41
pilaenv_
int pilaenv_()
PB_CGatherV
void PB_CGatherV()
INIT
#define INIT
Definition: PBblas.h:61
PB_Cdescset
void PB_Cdescset()
PB_Cindxg2p
int PB_Cindxg2p()
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
GSUM2D_T
void(* GSUM2D_T)()
Definition: pblas.h:282
PB_Ctop
char * PB_Ctop()
RSRC_
#define RSRC_
Definition: PBtools.h:45
CNOTRAN
#define CNOTRAN
Definition: PBblas.h:18
FORWARD
#define FORWARD
Definition: PBblas.h:64
COMBINE
#define COMBINE
Definition: PBblacs.h:49
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cnumroc
int PB_Cnumroc()
PB_CInV
void PB_CInV()
MIN
#define MIN(a_, b_)
Definition: PBtools.h:76
INB_
#define INB_
Definition: PBtools.h:42
LOWER
#define LOWER
Definition: PBblas.h:51
PB_COutV
void PB_COutV()
REUSE
#define REUSE
Definition: PBblas.h:67
C2F_CHAR
#define C2F_CHAR(a)
Definition: pblas.h:121
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mupcase
#define Mupcase(C)
Definition: PBtools.h:83
CUPPER
#define CUPPER
Definition: PBblas.h:26
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CLEFT
#define CLEFT
Definition: PBblas.h:29
CTXT_
#define CTXT_
Definition: PBtools.h:38