ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CptrsmB.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18 
19 #ifdef __STDC__
20 void PB_CptrsmB( PBTYP_T * TYPE, char * DIRECB, char * SIDE,
21  char * UPLO, char * TRANSA, char * DIAG, int M, int N,
22  char * ALPHA, char * A, int IA, int JA, int * DESCA,
23  char * B, int IB, int JB, int * DESCB )
24 #else
25 void PB_CptrsmB( TYPE, DIRECB, SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A,
26  IA, JA, DESCA, B, IB, JB, DESCB )
27 /*
28 * .. Scalar Arguments ..
29 */
30  char * DIAG, * DIRECB, * SIDE, * TRANSA, * UPLO;
31  int IA, IB, JA, JB, M, N;
32  char * ALPHA;
33  PBTYP_T * TYPE;
34 /*
35 * .. Array Arguments ..
36 */
37  int * DESCA, * DESCB;
38  char * A, * B;
39 #endif
40 {
41 /*
42 * Purpose
43 * =======
44 *
45 * PB_CptrsmB solves one of the matrix equations
46 *
47 * op( sub( A ) )*X = alpha*sub( B ), or
48 *
49 * X*op( sub( A ) ) = alpha*sub( B ),
50 *
51 * where
52 *
53 * sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L',
54 * A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and,
55 *
56 * sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1).
57 *
58 * Alpha is a scalar, X and sub( B ) are m by n submatrices, sub( A ) is
59 * a unit, or non-unit, upper or lower triangular submatrix and op( Y )
60 * is one of
61 *
62 * op( Y ) = Y or op( Y ) = Y' or op( Y ) = conjg( Y' ).
63 *
64 * The submatrix X is overwritten on sub( B ).
65 *
66 * This is the inner-product algorithm using the logical LCM hybrid
67 * and static blocking techniques. The submatrix operand sub( A ) stays
68 * in place.
69 *
70 * Notes
71 * =====
72 *
73 * A description vector is associated with each 2D block-cyclicly dis-
74 * tributed matrix. This vector stores the information required to
75 * establish the mapping between a matrix entry and its corresponding
76 * process and memory location.
77 *
78 * In the following comments, the character _ should be read as
79 * "of the distributed matrix". Let A be a generic term for any 2D
80 * block cyclicly distributed matrix. Its description vector is DESC_A:
81 *
82 * NOTATION STORED IN EXPLANATION
83 * ---------------- --------------- ------------------------------------
84 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
85 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
86 * the NPROW x NPCOL BLACS process grid
87 * A is distributed over. The context
88 * itself is global, but the handle
89 * (the integer value) may vary.
90 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
91 * ted matrix A, M_A >= 0.
92 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
93 * buted matrix A, N_A >= 0.
94 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
95 * block of the matrix A, IMB_A > 0.
96 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
97 * left block of the matrix A,
98 * INB_A > 0.
99 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
100 * bute the last M_A-IMB_A rows of A,
101 * MB_A > 0.
102 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
103 * bute the last N_A-INB_A columns of
104 * A, NB_A > 0.
105 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
106 * row of the matrix A is distributed,
107 * NPROW > RSRC_A >= 0.
108 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
109 * first column of A is distributed.
110 * NPCOL > CSRC_A >= 0.
111 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
112 * array storing the local blocks of
113 * the distributed matrix A,
114 * IF( Lc( 1, N_A ) > 0 )
115 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
116 * ELSE
117 * LLD_A >= 1.
118 *
119 * Let K be the number of rows of a matrix A starting at the global in-
120 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
121 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
122 * receive if these K rows were distributed over NPROW processes. If K
123 * is the number of columns of a matrix A starting at the global index
124 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
125 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
126 * these K columns were distributed over NPCOL processes.
127 *
128 * The values of Lr() and Lc() may be determined via a call to the func-
129 * tion PB_Cnumroc:
130 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
131 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
132 *
133 * Arguments
134 * =========
135 *
136 * TYPE (local input) pointer to a PBTYP_T structure
137 * On entry, TYPE is a pointer to a structure of type PBTYP_T,
138 * that contains type information (See pblas.h).
139 *
140 * DIRECB (global input) pointer to CHAR
141 * On entry, DIRECB specifies the direction in which the rows
142 * or columns of sub( B ) should be looped over as follows:
143 * DIRECB = 'F' or 'f' forward or increasing,
144 * DIRECB = 'B' or 'b' backward or decreasing.
145 *
146 * SIDE (global input) pointer to CHAR
147 * On entry, SIDE specifies whether op( sub( A ) ) appears on
148 * the left or right of X as follows:
149 *
150 * SIDE = 'L' or 'l' op( sub( A ) )*X = alpha*sub( B ),
151 *
152 * SIDE = 'R' or 'r' X*op( sub( A ) ) = alpha*sub( B ).
153 *
154 * UPLO (global input) pointer to CHAR
155 * On entry, UPLO specifies whether the submatrix sub( A ) is
156 * an upper or lower triangular submatrix as follows:
157 *
158 * UPLO = 'U' or 'u' sub( A ) is an upper triangular
159 * submatrix,
160 *
161 * UPLO = 'L' or 'l' sub( A ) is a lower triangular
162 * submatrix.
163 *
164 * TRANSA (global input) pointer to CHAR
165 * On entry, TRANSA specifies the form of op( sub( A ) ) to be
166 * used in the matrix multiplication as follows:
167 *
168 * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ),
169 *
170 * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )',
171 *
172 * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ).
173 *
174 * DIAG (global input) pointer to CHAR
175 * On entry, DIAG specifies whether or not sub( A ) is unit
176 * triangular as follows:
177 *
178 * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian-
179 * gular,
180 *
181 * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri-
182 * angular.
183 *
184 * M (global input) INTEGER
185 * On entry, M specifies the number of rows of the submatrix
186 * sub( B ). M must be at least zero.
187 *
188 * N (global input) INTEGER
189 * On entry, N specifies the number of columns of the submatrix
190 * sub( B ). N must be at least zero.
191 *
192 * ALPHA (global input) pointer to CHAR
193 * On entry, ALPHA specifies the scalar alpha. When ALPHA is
194 * supplied as zero then the local entries of the array B
195 * corresponding to the entries of the submatrix sub( B ) need
196 * not be set on input.
197 *
198 * A (local input) pointer to CHAR
199 * On entry, A is an array of dimension (LLD_A, Ka), where Ka is
200 * at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at
201 * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array
202 * contains the local entries of the matrix A.
203 * Before entry with UPLO = 'U' or 'u', this array contains the
204 * local entries corresponding to the entries of the upper tri-
205 * angular submatrix sub( A ), and the local entries correspon-
206 * ding to the entries of the strictly lower triangular part of
207 * the submatrix sub( A ) are not referenced.
208 * Before entry with UPLO = 'L' or 'l', this array contains the
209 * local entries corresponding to the entries of the lower tri-
210 * angular submatrix sub( A ), and the local entries correspon-
211 * ding to the entries of the strictly upper triangular part of
212 * the submatrix sub( A ) are not referenced.
213 * Note that when DIAG = 'U' or 'u', the local entries corres-
214 * ponding to the diagonal elements of the submatrix sub( A )
215 * are not referenced either, but are assumed to be unity.
216 *
217 * IA (global input) INTEGER
218 * On entry, IA specifies A's global row index, which points to
219 * the beginning of the submatrix sub( A ).
220 *
221 * JA (global input) INTEGER
222 * On entry, JA specifies A's global column index, which points
223 * to the beginning of the submatrix sub( A ).
224 *
225 * DESCA (global and local input) INTEGER array
226 * On entry, DESCA is an integer array of dimension DLEN_. This
227 * is the array descriptor for the matrix A.
228 *
229 * B (local input/local output) pointer to CHAR
230 * On entry, B is an array of dimension (LLD_B, Kb), where Kb is
231 * at least Lc( 1, JB+N-1 ). Before entry, this array contains
232 * the local entries of the matrix B.
233 * On exit, the local entries of this array corresponding to the
234 * to the entries of the submatrix sub( B ) are overwritten by
235 * the local entries of the m by n solution submatrix.
236 *
237 * IB (global input) INTEGER
238 * On entry, IB specifies B's global row index, which points to
239 * the beginning of the submatrix sub( B ).
240 *
241 * JB (global input) INTEGER
242 * On entry, JB specifies B's global column index, which points
243 * to the beginning of the submatrix sub( B ).
244 *
245 * DESCB (global and local input) INTEGER array
246 * On entry, DESCB is an integer array of dimension DLEN_. This
247 * is the array descriptor for the matrix B.
248 *
249 * -- Written on April 1, 1998 by
250 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
251 *
252 * ---------------------------------------------------------------------
253 */
254 /*
255 * .. Local Scalars ..
256 */
257  char Broc, TranOp, conjg, * negone, * one, * talpha, * talph0, top,
258  * zero;
259  int Acol, Aii, Aimb1, Ainb1, Ajj, Akp, Akq, Alcmb, Ald, Amb, An,
260  Anb, Anp, Anp0, Anq, Anq0, Arow, Asrc, Astart, BcurrocR, Bfwd,
261  BiiD, BiiR, Binb1D, Binb1R, BisR, Bld, BmyprocD, BmyprocR,
262  BnD, BnR, BnbD, BnbR, BnpR, BnprocsD, BnprocsR, BrocD, BrocR,
263  BsrcR, LNorRT, WBCfr, WBCld, WBCapbX, WBCsum, WBRfr, WBRld,
264  WBRapbX, WBRsum, ctxt, izero=0, k, kb, kbnext, kbprev, ktmp,
265  lside, mycol, myrow, n, nb, nbb, notran, npcol, nprow, p=0,
266  size, tmp, upper;
267  TZPAD_T pad;
268  GEMM_T gemm;
269  GSUM2D_T gsum2d;
270 /*
271 * .. Local Arrays ..
272 */
273  int Ad0[DLEN_], DBUFB[DLEN_], WBCd[DLEN_], WBRd[DLEN_];
274  char * Aptr = NULL, * Bptr = NULL, * WBC = NULL, * WBR = NULL;
275 /* ..
276 * .. Executable Statements ..
277 *
278 */
279 /*
280 * Retrieve process grid information
281 */
282  Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
283 
284  Bfwd = ( Mupcase( DIRECB[0] ) == CFORWARD );
285  lside = ( Mupcase( SIDE [0] ) == CLEFT );
286  upper = ( Mupcase( UPLO [0] ) == CUPPER );
287  notran = ( ( TranOp = Mupcase( TRANSA[0] ) ) == CNOTRAN );
288  LNorRT = ( lside && notran ) || ( !( lside ) && !( notran ) );
289 
290  size = TYPE->size;
291  one = TYPE->one; zero = TYPE->zero; negone = TYPE->negone;
292  pad = TYPE->Ftzpad; gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d;
293  nb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
294 /*
295 * Compute local information for sub( A ) and sub( B )
296 */
297  if( lside )
298  {
299  BnD = An = M; BnR = N; Broc = CCOLUMN;
300  BmyprocD = myrow; BnprocsD = nprow;
301  BmyprocR = mycol; BnprocsR = npcol;
302  BnbD = DESCB[MB_ ]; BnbR = DESCB[NB_ ];
303  BsrcR = DESCB[CSRC_]; Bld = DESCB[LLD_];
304  PB_Cinfog2l( IB, JB, DESCB, BnprocsD, BnprocsR, BmyprocD, BmyprocR,
305  &BiiD, &BiiR, &BrocD, &BrocR );
306  Binb1D = PB_Cfirstnb( BnD, IB, DESCB[IMB_], BnbD );
307  Binb1R = PB_Cfirstnb( BnR, JB, DESCB[INB_], BnbR );
308  }
309  else
310  {
311  BnD = An = N; BnR = M; Broc = CROW;
312  BmyprocD = mycol; BnprocsD = npcol;
313  BmyprocR = myrow; BnprocsR = nprow;
314  BnbR = DESCB[MB_ ]; BnbD = DESCB[NB_ ];
315  BsrcR = DESCB[RSRC_]; Bld = DESCB[LLD_];
316  PB_Cinfog2l( IB, JB, DESCB, BnprocsR, BnprocsD, BmyprocR, BmyprocD,
317  &BiiR, &BiiD, &BrocR, &BrocD );
318  Binb1D = PB_Cfirstnb( BnD, JB, DESCB[INB_], BnbD );
319  Binb1R = PB_Cfirstnb( BnR, IB, DESCB[IMB_], BnbR );
320  }
321 /*
322 * Compute descriptor Ad0 for sub( A )
323 */
324  PB_Cdescribe( An, An, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj,
325  &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 );
326 /*
327 * Compute conjugate of alpha for the conjugate transpose cases
328 */
329  if( TranOp == CCOTRAN )
330  {
331  conjg = CCONJG; talpha = PB_Cmalloc( size );
332  PB_Cconjg( TYPE, ALPHA, talpha );
333  }
334  else { conjg = CNOCONJG; talpha = ALPHA; }
335 /*
336 * Retrieve BLACS combine topology, select backward ot forward substitution.
337 */
338  if( LNorRT )
339  {
340  top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
341  Astart = ( upper ? An - 1 : 0 );
342  }
343  else
344  {
345  top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
346  Astart = ( upper ? 0 : An - 1 );
347  }
348 /*
349 * Computational partitioning size is computed as the product of the logical
350 * value returned by pilaenv_ and 2 * lcm( nprow, npcol ).
351 */
352  Alcmb = 2 * nb * PB_Clcm( ( Arow >= 0 ? nprow : 1 ),
353  ( Acol >= 0 ? npcol : 1 ) );
354 /*
355 * When sub( B ) is not replicated and backward pass on sub( B ), find the
356 * virtual process p owning the last row or column of sub( B ).
357 */
358  if( !( BisR = ( ( BsrcR < 0 ) || ( BnprocsR == 1 ) ) ) && !( Bfwd ) )
359  {
360  tmp = PB_Cindxg2p( BnR - 1, Binb1R, BnbR, BrocR, BrocR, BnprocsR );
361  p = MModSub( tmp, BrocR, BnprocsR );
362  }
363 /*
364 * Loop over the processes rows or columns owning the BnR rows or columns of
365 * sub( B ) to be processed.
366 */
367  n = BnR;
368 
369  while( n > 0 )
370  {
371 /*
372 * Find out who is the active process row or column as well as the number of
373 * rows or columns of sub( B ) it owns.
374 */
375  BcurrocR = ( BisR ? -1 : MModAdd( BrocR, p, BnprocsR ) );
376  BnpR = PB_Cnumroc( BnR, 0, Binb1R, BnbR, BcurrocR, BrocR, BnprocsR );
377 
378  n -= BnpR;
379 /*
380 * Re-adjust the number of rows or columns to be handled at each step, in order
381 * to average the message sizes and the computational granularity.
382 */
383  if( BnpR ) nbb = BnpR / ( ( BnpR - 1 ) / nb + 1 );
384 
385  while( BnpR )
386  {
387  nbb = MIN( nbb, BnpR );
388 /*
389 * Describe the local contiguous panel of sub( B )
390 */
391  if( lside )
392  {
393  PB_Cdescset( DBUFB, BnD, nbb, Binb1D, nbb, BnbD, BnbR, BrocD,
394  BcurrocR, ctxt, Bld );
395  if( BisR || ( BmyprocR == BcurrocR ) )
396  Bptr = Mptr( B, BiiD, BiiR, Bld, size );
397  }
398  else
399  {
400  PB_Cdescset( DBUFB, nbb, BnD, nbb, Binb1D, BnbR, BnbD, BcurrocR,
401  BrocD, ctxt, Bld );
402  if( BisR || ( BmyprocR == BcurrocR ) )
403  Bptr = Mptr( B, BiiR, BiiD, Bld, size );
404  }
405 
406  talph0 = talpha;
407 
408  if( LNorRT )
409  {
410 /*
411 * Reuse sub( B ) and/or create vector WBC in process column owning the first
412 * or last column of sub( A )
413 */
414  PB_CInOutV2( TYPE, &conjg, COLUMN, An, An, Astart, Ad0, nbb, Bptr,
415  0, 0, DBUFB, &Broc, &WBC, WBCd, &WBCfr, &WBCsum,
416  &WBCapbX );
417 /*
418 * Create WBR in process rows spanned by sub( A )
419 */
420  PB_COutV( TYPE, ROW, INIT, An, An, Ad0, nbb, &WBR, WBRd, &WBRfr,
421  &WBRsum );
422 /*
423 * Retrieve local quantities related to sub( A ) -> Ad0
424 */
425  Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
426  Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
427  Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
428 
429  Anp = PB_Cnumroc( An, 0, Aimb1, Amb, myrow, Arow, nprow );
430  Anq = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol );
431  if( ( Anp > 0 ) && ( Anq > 0 ) )
432  Aptr = Mptr( A, Aii, Ajj, Ald, size );
433 
434  WBCld = WBCd[LLD_]; WBRld = WBRd[LLD_];
435 
436  if( upper )
437  {
438 /*
439 * sub( A ) is upper triangular
440 */
441  for( k = ( Astart / Alcmb ) * Alcmb; k >= 0; k -= Alcmb )
442  {
443  ktmp = An - k; kb = MIN( ktmp, Alcmb );
444 /*
445 * Solve logical diagonal block, WBC contains the solution scattered in multiple
446 * process columns and WBR contains the solution replicated in the process rows.
447 */
448  Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
449  Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
450  PB_Cptrsm( TYPE, WBRsum, SIDE, UPLO, TRANSA, DIAG, kb, nbb,
451  talph0, Aptr, k, k, Ad0, Mptr( WBC, Akp, 0, WBCld,
452  size ), WBCld, Mptr( WBR, 0, Akq, WBRld, size ),
453  WBRld );
454 /*
455 * Update: only the part of sub( B ) to be solved at the next step is locally
456 * updated and combined, the remaining part of the matrix to be solved later
457 * is only locally updated.
458 */
459  if( Akp > 0 )
460  {
461  Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
462  if( WBCsum )
463  {
464  kbprev = MIN( k, Alcmb );
465  ktmp = PB_Cnumroc( kbprev, k-kbprev, Aimb1, Amb,
466  myrow, Arow, nprow );
467  Akp -= ktmp;
468 
469  if( ktmp > 0 )
470  {
471  if( Anq0 > 0 )
472  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &ktmp,
473  &nbb, &Anq0, negone, Mptr( Aptr, Akp, Akq,
474  Ald, size ), &Ald, Mptr( WBR, 0, Akq, WBRld,
475  size ), &WBRld, talph0, Mptr( WBC, Akp, 0,
476  WBCld, size ), &WBCld );
477  Asrc = PB_Cindxg2p( k-1, Ainb1, Anb, Acol, Acol,
478  npcol );
479  gsum2d( ctxt, ROW, &top, ktmp, nbb, Mptr( WBC, Akp,
480  0, WBCld, size ), WBCld, myrow, Asrc );
481  if( mycol != Asrc )
482  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &ktmp,
483  &nbb, &izero, zero, zero, Mptr( WBC, Akp, 0,
484  WBCld, size ), &WBCld );
485  }
486  if( ( Akp > 0 ) && ( Anq0 > 0 ) )
487  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &Akp,
488  &nbb, &Anq0, negone, Mptr( Aptr, 0, Akq, Ald,
489  size ), &Ald, Mptr( WBR, 0, Akq, WBRld, size ),
490  &WBRld, talph0, WBC, &WBCld );
491  }
492  else
493  {
494  if( Anq0 > 0 )
495  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &Akp,
496  &nbb, &Anq0, negone, Mptr( Aptr, 0, Akq, Ald,
497  size ), &Ald, Mptr( WBR, 0, Akq, WBRld, size ),
498  &WBRld, talph0, WBC, &WBCld );
499  }
500  }
501  talph0 = one;
502  }
503  }
504  else
505  {
506 /*
507 * sub( A ) is lower triangular
508 */
509  for( k = 0; k < An; k += Alcmb )
510  {
511  ktmp = An - k; kb = MIN( ktmp, Alcmb );
512 /*
513 * Solve logical diagonal block, WBC contains the solution scattered in multiple
514 * process columns and WBR contains the solution replicated in the process rows.
515 */
516  Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
517  Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
518  PB_Cptrsm( TYPE, WBRsum, SIDE, UPLO, TRANSA, DIAG, kb, nbb,
519  talph0, Aptr, k, k, Ad0, Mptr( WBC, Akp, 0, WBCld,
520  size ), WBCld, Mptr( WBR, 0, Akq, WBRld, size ),
521  WBRld );
522 /*
523 * Update: only the part of sub( B ) to be solved at the next step is locally
524 * updated and combined, the remaining part of the matrix to be solved later is
525 * only locally updated.
526 */
527  Akp = PB_Cnumroc( k+kb, 0, Aimb1, Amb, myrow, Arow, nprow );
528  if( ( Anp0 = Anp - Akp ) > 0 )
529  {
530  Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
531  if( WBCsum )
532  {
533  kbnext = ktmp - kb;
534  kbnext = MIN( kbnext, Alcmb );
535  ktmp = PB_Cnumroc( kbnext, k+kb, Aimb1, Amb, myrow,
536  Arow, nprow );
537  Anp0 -= ktmp;
538 
539  if( ktmp > 0 )
540  {
541  if( Anq0 > 0 )
542  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &ktmp,
543  &nbb, &Anq0, negone, Mptr( Aptr, Akp, Akq,
544  Ald, size ), &Ald, Mptr( WBR, 0, Akq, WBRld,
545  size ), &WBRld, talph0, Mptr( WBC, Akp, 0,
546  WBCld, size ), &WBCld );
547  Asrc = PB_Cindxg2p( k+kb, Ainb1, Anb, Acol, Acol,
548  npcol );
549  gsum2d( ctxt, ROW, &top, ktmp, nbb, Mptr( WBC, Akp,
550  0, WBCld, size ), WBCld, myrow, Asrc );
551  if( mycol != Asrc )
552  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &ktmp,
553  &nbb, &izero, zero, zero, Mptr( WBC, Akp, 0,
554  WBCld, size ), &WBCld );
555  }
556  if( ( Anp0 > 0 ) && ( Anq0 > 0 ) )
557  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &Anp0,
558  &nbb, &Anq0, negone, Mptr( Aptr, Akp+ktmp, Akq,
559  Ald, size ), &Ald, Mptr( WBR, 0, Akq, WBRld,
560  size ), &WBRld, talph0, Mptr( WBC, Akp+ktmp, 0,
561  WBCld, size ), &WBCld );
562  }
563  else
564  {
565  if( Anq0 > 0 )
566  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &Anp0,
567  &nbb, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald,
568  size ), &Ald, Mptr( WBR, 0, Akq, WBRld, size ),
569  &WBRld, talph0, Mptr( WBC, Akp, 0, WBCld,
570  size ), &WBCld );
571  }
572  }
573  talph0 = one;
574  }
575  }
576 /*
577 * Combine the scattered resulting matrix WBC
578 */
579  if( WBCsum && ( Anp > 0 ) )
580  gsum2d( ctxt, ROW, &top, Anp, nbb, WBC, WBCld, myrow,
581  WBCd[CSRC_] );
582 /*
583 * sub( B ) := WBC (if necessary)
584 */
585  if( WBCapbX )
586  PB_Cpaxpby( TYPE, &conjg, An, nbb, one, WBC, 0, 0, WBCd, COLUMN,
587  zero, Bptr, 0, 0, DBUFB, &Broc );
588  }
589  else
590  {
591 /*
592 * Reuse sub( B ) and/or create vector WBR in process row owning the first or
593 * last row of sub( A )
594 */
595  PB_CInOutV2( TYPE, &conjg, ROW, An, An, Astart, Ad0, nbb, Bptr,
596  0, 0, DBUFB, &Broc, &WBR, WBRd, &WBRfr, &WBRsum,
597  &WBRapbX );
598 /*
599 * Create WBC in process columns spanned by sub( A )
600 */
601  PB_COutV( TYPE, COLUMN, INIT, An, An, Ad0, nbb, &WBC, WBCd, &WBCfr,
602  &WBCsum );
603 /*
604 * Retrieve local quantities related to Ad0 -> sub( A )
605 */
606  Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
607  Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
608  Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
609 
610  Anp = PB_Cnumroc( An, 0, Aimb1, Amb, myrow, Arow, nprow );
611  Anq = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol );
612  if( ( Anp > 0 ) && ( Anq > 0 ) )
613  Aptr = Mptr( A, Aii, Ajj, Ald, size );
614 
615  WBCld = WBCd[LLD_]; WBRld = WBRd[LLD_];
616 
617  if( upper )
618  {
619 /*
620 * sub( A ) is upper triangular
621 */
622  for( k = 0; k < An; k += Alcmb )
623  {
624  ktmp = An - k; kb = MIN( ktmp, Alcmb );
625 /*
626 * Solve logical diagonal block, WBR contains the solution scattered in multiple
627 * process rows and WBC contains the solution replicated in the process columns.
628 */
629  Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
630  Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
631  PB_Cptrsm( TYPE, WBCsum, SIDE, UPLO, TRANSA, DIAG, nbb, kb,
632  talph0, Aptr, k, k, Ad0, Mptr( WBC, Akp, 0, WBCld,
633  size ), WBCld, Mptr( WBR, 0, Akq, WBRld, size ),
634  WBRld );
635 /*
636 * Update: only the part of sub( B ) to be solved at the next step is locally
637 * updated and combined, the remaining part of the matrix to be solved later is
638 * only locally updated.
639 */
640  Akq = PB_Cnumroc( k+kb, 0, Ainb1, Anb, mycol, Acol, npcol );
641  if( ( Anq0 = Anq - Akq ) > 0 )
642  {
643  Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow );
644  if( WBRsum )
645  {
646  kbnext = ktmp - kb;
647  kbnext = MIN( kbnext, Alcmb );
648  ktmp = PB_Cnumroc( kbnext, k+kb, Ainb1, Anb, mycol,
649  Acol, npcol );
650  Anq0 -= ktmp;
651 
652  if( ktmp > 0 )
653  {
654  if( Anp0 > 0 )
655  gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
656  &ktmp, &Anp0, negone, Mptr( WBC, Akp, 0,
657  WBCld, size ), &WBCld, Mptr( Aptr, Akp, Akq,
658  Ald, size ), &Ald, talph0, Mptr( WBR, 0,
659  Akq, WBRld, size ), &WBRld );
660  Asrc = PB_Cindxg2p( k+kb, Aimb1, Amb, Arow, Arow,
661  nprow );
662  gsum2d( ctxt, COLUMN, &top, nbb, ktmp, Mptr( WBR, 0,
663  Akq, WBRld, size ), WBRld, Asrc, mycol );
664  if( myrow != Asrc )
665  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &nbb,
666  &ktmp, &izero, zero, zero, Mptr( WBR, 0, Akq,
667  WBRld, size ), &WBRld );
668  }
669  if( ( Anp0 > 0 ) && ( Anq0 > 0 ) )
670  gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
671  &Anq0, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld,
672  size ), &WBCld, Mptr( Aptr, Akp, Akq+ktmp, Ald,
673  size ), &Ald, talph0, Mptr( WBR, 0, Akq+ktmp,
674  WBRld, size ), &WBRld );
675  }
676  else
677  {
678  if( Anp0 > 0 )
679  gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
680  &Anq0, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld,
681  size ), &WBCld, Mptr( Aptr, Akp, Akq, Ald,
682  size ), &Ald, talph0, Mptr( WBR, 0, Akq, WBRld,
683  size ), &WBRld );
684  }
685  }
686  talph0 = one;
687  }
688  }
689  else
690  {
691 /*
692 * sub( A ) is lower triangular
693 */
694  for( k = ( Astart / Alcmb ) * Alcmb; k >= 0; k -= Alcmb )
695  {
696  ktmp = An - k; kb = MIN( ktmp, Alcmb );
697 /*
698 * Solve logical diagonal block, WBR contains the solution scattered in multiple
699 * process rows and WBC contains the solution replicated in the process columns.
700 */
701  Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
702  Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
703  PB_Cptrsm( TYPE, WBCsum, SIDE, UPLO, TRANSA, DIAG, nbb, kb,
704  talph0, Aptr, k, k, Ad0, Mptr( WBC, Akp, 0, WBCld,
705  size ), WBCld, Mptr( WBR, 0, Akq, WBRld, size ),
706  WBRld );
707 /*
708 * Update: only the part of sub( B ) to be solved at the next step is locally
709 * updated and combined, the remaining part of the matrix to be solved later
710 * is only locally updated.
711 */
712  if( Akq > 0 )
713  {
714  Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow );
715  if( WBRsum )
716  {
717  kbprev = MIN( k, Alcmb );
718  ktmp = PB_Cnumroc( kbprev, k-kbprev, Ainb1, Anb,
719  mycol, Acol, npcol );
720  Akq -= ktmp;
721 
722  if( ktmp > 0 )
723  {
724  if( Anp0 > 0 )
725  gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
726  &ktmp, &Anp0, negone, Mptr( WBC, Akp, 0,
727  WBCld, size ), &WBCld, Mptr( Aptr, Akp, Akq,
728  Ald, size ), &Ald, talph0, Mptr( WBR, 0,
729  Akq, WBRld, size ), &WBRld );
730  Asrc = PB_Cindxg2p( k-1, Aimb1, Amb, Arow, Arow,
731  nprow );
732  gsum2d( ctxt, COLUMN, &top, nbb, ktmp, Mptr( WBR, 0,
733  Akq, WBRld, size ), WBRld, Asrc, mycol );
734  if( myrow != Asrc )
735  pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &nbb,
736  &ktmp, &izero, zero, zero, Mptr( WBR, 0, Akq,
737  WBRld, size ), &WBRld );
738  }
739  if( ( Anp0 > 0 ) && ( Akq > 0 ) )
740  gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
741  &Akq, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld,
742  size ), &WBCld, Mptr( Aptr, Akp, 0, Ald,
743  size ), &Ald, talph0, WBR, &WBRld );
744  }
745  else
746  {
747  if( Anp0 > 0 )
748  gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
749  &Akq, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld,
750  size ), &WBCld, Mptr( Aptr, Akp, 0, Ald,
751  size ), &Ald, talph0, WBR, &WBRld );
752  }
753  }
754  talph0 = one;
755  }
756  }
757 /*
758 * Combine the scattered resulting matrix WBR
759 */
760  if( WBRsum && ( Anq > 0 ) )
761  gsum2d( ctxt, COLUMN, &top, nbb, Anq, WBR, WBRld, WBRd[RSRC_],
762  mycol );
763 /*
764 * sub( B ) := WBR (if necessary)
765 */
766  if( WBRapbX )
767  PB_Cpaxpby( TYPE, &conjg, nbb, An, one, WBR, 0, 0, WBRd, ROW,
768  zero, Bptr, 0, 0, DBUFB, &Broc );
769  }
770 
771  if( WBCfr ) free( WBC );
772  if( WBRfr ) free( WBR );
773 /*
774 * Go to the next contiguous panel if any residing in this process row or column
775 */
776  BnpR -= nbb;
777 
778  if( BisR || ( BmyprocR == BcurrocR ) ) BiiR += nbb;
779  }
780 /*
781 * Go to next or previous process row or column owning some of sub( B )
782 */
783  if( !( BisR ) )
784  p = ( Bfwd ? MModAdd1( p, BnprocsR ) : MModSub1( p, BnprocsR ) );
785  }
786 
787  if( TranOp == CCOTRAN ) free( talpha );
788 /*
789 * End of PB_CptrsmB
790 */
791 }
CCONJG
#define CCONJG
Definition: PBblas.h:21
TYPE
#define TYPE
Definition: clamov.c:7
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
PB_Cpaxpby
void PB_Cpaxpby()
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
PB_Cconjg
void PB_Cconjg()
CCOTRAN
#define CCOTRAN
Definition: PBblas.h:22
NOCONJG
#define NOCONJG
Definition: PBblas.h:45
PB_Cfirstnb
int PB_Cfirstnb()
DLEN_
#define DLEN_
Definition: PBtools.h:48
GEMM_T
F_VOID_FCT(* GEMM_T)()
Definition: pblas.h:313
TRAN
#define TRAN
Definition: PBblas.h:46
PB_CptrsmB
void PB_CptrsmB(PBTYP_T *TYPE, char *DIRECB, char *SIDE, char *UPLO, char *TRANSA, char *DIAG, int M, int N, char *ALPHA, char *A, int IA, int JA, int *DESCA, char *B, int IB, int JB, int *DESCB)
Definition: PB_CptrsmB.c:25
NOTRAN
#define NOTRAN
Definition: PBblas.h:44
LLD_
#define LLD_
Definition: PBtools.h:47
PB_Cdescribe
void PB_Cdescribe()
CNOCONJG
#define CNOCONJG
Definition: PBblas.h:19
MModAdd
#define MModAdd(I1, I2, d)
Definition: PBtools.h:97
CROW
#define CROW
Definition: PBblacs.h:21
TZPAD_T
F_VOID_FCT(* TZPAD_T)()
Definition: pblas.h:288
IMB_
#define IMB_
Definition: PBtools.h:41
MModSub
#define MModSub(I1, I2, d)
Definition: PBtools.h:102
pilaenv_
int pilaenv_()
INIT
#define INIT
Definition: PBblas.h:61
PB_Cdescset
void PB_Cdescset()
MModAdd1
#define MModAdd1(I, d)
Definition: PBtools.h:100
PB_Cptrsm
void PB_Cptrsm()
PB_Cindxg2p
int PB_Cindxg2p()
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
GSUM2D_T
void(* GSUM2D_T)()
Definition: pblas.h:282
PB_Ctop
char * PB_Ctop()
RSRC_
#define RSRC_
Definition: PBtools.h:45
CNOTRAN
#define CNOTRAN
Definition: PBblas.h:18
COMBINE
#define COMBINE
Definition: PBblacs.h:49
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cnumroc
int PB_Cnumroc()
PB_Cmalloc
char * PB_Cmalloc()
CFORWARD
#define CFORWARD
Definition: PBblas.h:38
ALL
#define ALL
Definition: PBblas.h:50
MIN
#define MIN(a_, b_)
Definition: PBtools.h:76
CCOLUMN
#define CCOLUMN
Definition: PBblacs.h:20
INB_
#define INB_
Definition: PBtools.h:42
PB_COutV
void PB_COutV()
C2F_CHAR
#define C2F_CHAR(a)
Definition: pblas.h:121
PB_CInOutV2
void PB_CInOutV2()
MModSub1
#define MModSub1(I, d)
Definition: PBtools.h:105
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mupcase
#define Mupcase(C)
Definition: PBtools.h:83
CUPPER
#define CUPPER
Definition: PBblas.h:26
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CLEFT
#define CLEFT
Definition: PBblas.h:29
CTXT_
#define CTXT_
Definition: PBtools.h:38
PB_Clcm
int PB_Clcm()