ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
pctrsv_.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "pblas.h"
14 #include "PBpblas.h"
15 #include "PBtools.h"
16 #include "PBblacs.h"
17 #include "PBblas.h"
18 
19 #ifdef __STDC__
20 void pctrsv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * N,
21  float * A, int * IA, int * JA, int * DESCA,
22  float * X, int * IX, int * JX, int * DESCX,
23  int * INCX )
24 #else
25 void pctrsv_( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX,
26  DESCX, INCX )
27 /*
28 * .. Scalar Arguments ..
29 */
30  F_CHAR_T DIAG, TRANS, UPLO;
31  int * IA, * INCX, * IX, * JA, * JX, * N;
32 /*
33 * .. Array Arguments ..
34 */
35  int * DESCA, * DESCX;
36  float * A, * X;
37 #endif
38 {
39 /*
40 * Purpose
41 * =======
42 *
43 * PCTRSV solves one of the systems of equations
44 *
45 * sub( A )*sub( X ) = b, or sub( A )'*sub( X ) = b, or
46 *
47 * conjg( sub( A )' )*sub( X ) = b,
48 *
49 * where
50 *
51 * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and,
52 *
53 * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
54 * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X.
55 *
56 * b and sub( X ) are n element subvectors and sub( A ) is an n by n
57 * unit, or non-unit, upper or lower triangular submatrix.
58 *
59 * No test for singularity or near-singularity is included in this
60 * routine. Such tests must be performed before calling this routine.
61 *
62 * Notes
63 * =====
64 *
65 * A description vector is associated with each 2D block-cyclicly dis-
66 * tributed matrix. This vector stores the information required to
67 * establish the mapping between a matrix entry and its corresponding
68 * process and memory location.
69 *
70 * In the following comments, the character _ should be read as
71 * "of the distributed matrix". Let A be a generic term for any 2D
72 * block cyclicly distributed matrix. Its description vector is DESC_A:
73 *
74 * NOTATION STORED IN EXPLANATION
75 * ---------------- --------------- ------------------------------------
76 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
77 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
78 * the NPROW x NPCOL BLACS process grid
79 * A is distributed over. The context
80 * itself is global, but the handle
81 * (the integer value) may vary.
82 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
83 * ted matrix A, M_A >= 0.
84 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
85 * buted matrix A, N_A >= 0.
86 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
87 * block of the matrix A, IMB_A > 0.
88 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
89 * left block of the matrix A,
90 * INB_A > 0.
91 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
92 * bute the last M_A-IMB_A rows of A,
93 * MB_A > 0.
94 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
95 * bute the last N_A-INB_A columns of
96 * A, NB_A > 0.
97 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
98 * row of the matrix A is distributed,
99 * NPROW > RSRC_A >= 0.
100 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
101 * first column of A is distributed.
102 * NPCOL > CSRC_A >= 0.
103 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
104 * array storing the local blocks of
105 * the distributed matrix A,
106 * IF( Lc( 1, N_A ) > 0 )
107 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
108 * ELSE
109 * LLD_A >= 1.
110 *
111 * Let K be the number of rows of a matrix A starting at the global in-
112 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
113 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
114 * receive if these K rows were distributed over NPROW processes. If K
115 * is the number of columns of a matrix A starting at the global index
116 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
117 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
118 * these K columns were distributed over NPCOL processes.
119 *
120 * The values of Lr() and Lc() may be determined via a call to the func-
121 * tion PB_Cnumroc:
122 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
123 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
124 *
125 * Arguments
126 * =========
127 *
128 * UPLO (global input) CHARACTER*1
129 * On entry, UPLO specifies whether the submatrix sub( A ) is
130 * an upper or lower triangular submatrix as follows:
131 *
132 * UPLO = 'U' or 'u' sub( A ) is an upper triangular
133 * submatrix,
134 *
135 * UPLO = 'L' or 'l' sub( A ) is a lower triangular
136 * submatrix.
137 *
138 * TRANS (global input) CHARACTER*1
139 * On entry, TRANS specifies the operation to be performed as
140 * follows:
141 *
142 * TRANS = 'N' or 'n' sub( A ) * sub( X ) = b.
143 *
144 * TRANS = 'T' or 't' sub( A )' * sub( X ) = b.
145 *
146 * TRANS = 'C' or 'c' conjg( sub( A )' ) * sub( X ) = b.
147 *
148 * DIAG (global input) CHARACTER*1
149 * On entry, DIAG specifies whether or not sub( A ) is unit
150 * triangular as follows:
151 *
152 * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian-
153 * gular,
154 *
155 * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri-
156 * angular.
157 *
158 * N (global input) INTEGER
159 * On entry, N specifies the order of the submatrix sub( A ).
160 * N must be at least zero.
161 *
162 * A (local input) COMPLEX array
163 * On entry, A is an array of dimension (LLD_A, Ka), where Ka is
164 * at least Lc( 1, JA+N-1 ). Before entry, this array contains
165 * the local entries of the matrix A.
166 * Before entry with UPLO = 'U' or 'u', this array contains the
167 * local entries corresponding to the entries of the upper tri-
168 * angular submatrix sub( A ), and the local entries correspon-
169 * ding to the entries of the strictly lower triangular part of
170 * the submatrix sub( A ) are not referenced.
171 * Before entry with UPLO = 'L' or 'l', this array contains the
172 * local entries corresponding to the entries of the lower tri-
173 * angular submatrix sub( A ), and the local entries correspon-
174 * ding to the entries of the strictly upper triangular part of
175 * the submatrix sub( A ) are not referenced.
176 * Note that when DIAG = 'U' or 'u', the local entries corres-
177 * ponding to the diagonal elements of the submatrix sub( A )
178 * are not referenced either, but are assumed to be unity.
179 *
180 * IA (global input) INTEGER
181 * On entry, IA specifies A's global row index, which points to
182 * the beginning of the submatrix sub( A ).
183 *
184 * JA (global input) INTEGER
185 * On entry, JA specifies A's global column index, which points
186 * to the beginning of the submatrix sub( A ).
187 *
188 * DESCA (global and local input) INTEGER array
189 * On entry, DESCA is an integer array of dimension DLEN_. This
190 * is the array descriptor for the matrix A.
191 *
192 * X (local input/local output) COMPLEX array
193 * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
194 * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and
195 * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least
196 * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise.
197 * Before entry, this array contains the local entries of the
198 * matrix X. On entry, sub( X ) is the n element right-hand side
199 * b. On exit, sub( X ) is overwritten with the solution subvec-
200 * tor.
201 *
202 * IX (global input) INTEGER
203 * On entry, IX specifies X's global row index, which points to
204 * the beginning of the submatrix sub( X ).
205 *
206 * JX (global input) INTEGER
207 * On entry, JX specifies X's global column index, which points
208 * to the beginning of the submatrix sub( X ).
209 *
210 * DESCX (global and local input) INTEGER array
211 * On entry, DESCX is an integer array of dimension DLEN_. This
212 * is the array descriptor for the matrix X.
213 *
214 * INCX (global input) INTEGER
215 * On entry, INCX specifies the global increment for the
216 * elements of X. Only two values of INCX are supported in
217 * this version, namely 1 and M_X. INCX must not be zero.
218 *
219 * -- Written on April 1, 1998 by
220 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
221 *
222 * ---------------------------------------------------------------------
223 */
224 /*
225 * .. Local Scalars ..
226 */
227  char DiagA, TranOp, UploA, Xroc, btop, ctop, * negone, * one,
228  * zero;
229  int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Anb,
230  Anp, Anp0, Anq, Anq0, Arow, Asrc, XACapbX, XACfr, XACld,
231  XACsum, XARapbX, XARfr, XARld, XARsum, Xi, Xj, ctxt, info,
232  ione=1, k, kb, kbnext, kbprev, ktmp, mycol, myrow, nb, notran,
233  nounit, npcol, nprow, size, upper;
234  PBTYP_T * type;
235 /*
236 * .. Local Arrays ..
237 */
238  int Ad[DLEN_], Ad0[DLEN_], XACd[DLEN_], XARd[DLEN_], Xd[DLEN_];
239  char * Aptr = NULL, * XAC = NULL, * XAR = NULL;
240 /* ..
241 * .. Executable Statements ..
242 *
243 */
244  upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER );
245  notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN );
246  nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT );
247  PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad );
248  PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd );
249 #ifndef NO_ARGCHK
250 /*
251 * Test the input parameters
252 */
253  Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
254  if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) )
255  {
256  if( ( !upper ) && ( UploA != CLOWER ) )
257  {
258  PB_Cwarn( ctxt, __LINE__, "PCTRSV", "Illegal UPLO = %c\n", UploA );
259  info = -1;
260  }
261  else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) )
262  {
263  PB_Cwarn( ctxt, __LINE__, "PCTRSV", "Illegal TRANS = %c\n", TranOp );
264  info = -2;
265  }
266  else if( ( !nounit ) && ( DiagA != CUNIT ) )
267  {
268  PB_Cwarn( ctxt, __LINE__, "PCTRSV", "Illegal DIAG = %c\n", DiagA );
269  info = -3;
270  }
271  PB_Cchkmat( ctxt, "PCTRSV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 8, &info );
272  PB_Cchkvec( ctxt, "PCTRSV", "X", *N, 4, Xi, Xj, Xd, *INCX, 12, &info );
273  }
274  if( info ) { PB_Cabort( ctxt, "PCTRSV", info ); return; }
275 #endif
276 /*
277 * Quick return if possible
278 */
279  if( *N == 0 ) return;
280 /*
281 * Retrieve process grid information
282 */
283 #ifdef NO_ARGCHK
284  Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
285 #endif
286 /*
287 * Get type structure
288 */
289  type = PB_Cctypeset();
290  size = type->size; one = type->one; zero = type->zero; negone = type->negone;
291 /*
292 * Compute descriptor Ad0 for sub( A )
293 */
294  PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj,
295  &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 );
296 /*
297 * Computational partitioning size is computed as the product of the logical
298 * value returned by pilaenv_ and 2 * lcm( nprow, npcol )
299 */
300  nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) *
301  PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) );
302 
303  Xroc = ( *INCX == Xd[M_] ? CROW : CCOLUMN );
304 
305  if( notran )
306  {
307  if( upper )
308  {
309 /*
310 * Save current and enforce ring BLACS topologies
311 */
312  btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
313  ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
314  (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_DRING );
315  (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DRING );
316 /*
317 * Remove next line when BLACS combine operations support ring topologies.
318 */
319  (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT );
320 /*
321 * Reuse sub( X ) and/or create vector XAC in process column owning the last
322 * column of sub( A )
323 */
324  PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, *N-1, Ad0, 1,
325  ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd,
326  &XACfr, &XACsum, &XACapbX );
327 /*
328 * Create vector XAR in process rows spanned by sub( A )
329 */
330  PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr,
331  &XARsum );
332 /*
333 * Retrieve local quantities related to Ad0 -> sub( A )
334 */
335  Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
336  Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
337  Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
338  Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
339  Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
340  if( ( Anp > 0 ) && ( Anq > 0 ) )
341  Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
342 
343  XACld = XACd[LLD_]; XARld = XARd[LLD_];
344 
345  for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb )
346  {
347  ktmp = *N - k; kb = MIN( ktmp, nb );
348 /*
349 * Solve logical diagonal block, XAC contains the solution scattered in multiple
350 * process columns and XAR contains the solution replicated in the process rows.
351 */
352  Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
353  Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
354  PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
355  Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
356  Akq, XARld, size ), XARld );
357 /*
358 * Update: only the part of sub( X ) to be solved at the next step is locally
359 * updated and combined, the remaining part of the vector to be solved later is
360 * only locally updated.
361 */
362  if( Akp > 0 )
363  {
364  Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
365  if( XACsum )
366  {
367  kbprev = MIN( k, nb );
368  ktmp = PB_Cnumroc( kbprev, k-kbprev, Aimb1, Amb, myrow,
369  Arow, nprow );
370  Akp -= ktmp;
371 
372  if( ktmp > 0 )
373  {
374  if( Anq0 > 0 )
375  cgemv_( TRANS, &ktmp, &Anq0, negone,
376  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
377  Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
378  Mptr( XAC, Akp, 0, XACld, size ), &ione );
379  Asrc = PB_Cindxg2p( k-1, Ainb1, Anb, Acol, Acol, npcol );
380  Ccgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp,
381  0, XACld, size ), XACld, myrow, Asrc );
382  if( mycol != Asrc )
383  cset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld,
384  size ), &ione );
385  }
386  if( Akp > 0 && Anq0 > 0 )
387  cgemv_( TRANS, &Akp, &Anq0, negone,
388  Mptr( Aptr, 0, Akq, Ald, size ), &Ald,
389  Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
390  XAC, &ione );
391  }
392  else
393  {
394  if( Anq0 > 0 )
395  cgemv_( TRANS, &Akp, &Anq0, negone,
396  Mptr( Aptr, 0, Akq, Ald, size ), &Ald,
397  Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
398  XAC, &ione );
399  }
400  }
401  }
402 /*
403 * Combine the scattered resulting vector XAC
404 */
405  if( XACsum && ( Anp > 0 ) )
406  {
407  Ccgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow,
408  XACd[CSRC_] );
409  }
410 /*
411 * sub( X ) := XAC (if necessary)
412 */
413  if( XACapbX )
414  {
415  PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd, COLUMN,
416  zero, ((char *) X), Xi, Xj, Xd, &Xroc );
417  }
418 /*
419 * Restore BLACS topologies
420 */
421  (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop );
422  (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop );
423  }
424  else
425  {
426 /*
427 * Save current and enforce ring BLACS topologies
428 */
429  btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
430  ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
431  (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING );
432  (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING );
433 /*
434 * Remove next line when BLACS combine operations support ring topologies.
435 */
436  (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT );
437 /*
438 * Reuse sub( X ) and/or create vector XAC in process column owning the first
439 * column of sub( A )
440 */
441  PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, 0, Ad0, 1,
442  ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd,
443  &XACfr, &XACsum, &XACapbX );
444 /*
445 * Create vector XAR in process rows spanned by sub( A )
446 */
447  PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr,
448  &XARsum );
449 /*
450 * Retrieve local quantities related to Ad0 -> sub( A )
451 */
452  Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
453  Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
454  Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
455  Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
456  Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
457  if( ( Anp > 0 ) && ( Anq > 0 ) )
458  Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
459 
460  XACld = XACd[LLD_]; XARld = XARd[LLD_];
461 
462  for( k = 0; k < *N; k += nb )
463  {
464  ktmp = *N - k; kb = MIN( ktmp, nb );
465 /*
466 * Solve logical diagonal block, XAC contains the solution scattered in multiple
467 * process columns and XAR contains the solution replicated in the process rows.
468 */
469  Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
470  Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
471  PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
472  Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
473  Akq, XARld, size ), XARld );
474 /*
475 * Update: only the part of sub( X ) to be solved at the next step is locally
476 * updated and combined, the remaining part of the vector to be solved later is
477 * only locally updated.
478 */
479  Akp = PB_Cnumroc( k+kb, 0, Aimb1, Amb, myrow, Arow, nprow );
480  if( ( Anp0 = Anp - Akp ) > 0 )
481  {
482  Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
483  if( XACsum )
484  {
485  kbnext = ktmp - kb; kbnext = MIN( kbnext, nb );
486  ktmp = PB_Cnumroc( kbnext, k+kb, Aimb1, Amb, myrow, Arow,
487  nprow );
488  Anp0 -= ktmp;
489 
490  if( ktmp > 0 )
491  {
492  if( Anq0 > 0 )
493  cgemv_( TRANS, &ktmp, &Anq0, negone,
494  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
495  Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
496  Mptr( XAC, Akp, 0, XACld, size ), &ione );
497  Asrc = PB_Cindxg2p( k+kb, Ainb1, Anb, Acol, Acol, npcol );
498  Ccgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp,
499  0, XACld, size ), XACld, myrow, Asrc );
500  if( mycol != Asrc )
501  cset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld,
502  size ), &ione );
503  }
504  if( Anp0 > 0 && Anq0 > 0 )
505  cgemv_( TRANS, &Anp0, &Anq0, negone,
506  Mptr( Aptr, Akp+ktmp, Akq, Ald, size ), &Ald,
507  Mptr( XAR, 0, Akq, XARld, size ), &XARld,
508  one,
509  Mptr( XAC, Akp+ktmp, 0, XACld, size ), &ione );
510  }
511  else
512  {
513  if( Anq0 > 0 )
514  cgemv_( TRANS, &Anp0, &Anq0, negone,
515  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
516  Mptr( XAR, 0, Akq, XARld, size ), &XARld,
517  one,
518  Mptr( XAC, Akp, 0, XACld, size ), &ione );
519  }
520  }
521  }
522 /*
523 * Combine the scattered resulting vector XAC
524 */
525  if( XACsum && ( Anp > 0 ) )
526  {
527  Ccgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow,
528  XACd[CSRC_] );
529  }
530 /*
531 * sub( X ) := XAC (if necessary)
532 */
533  if( XACapbX )
534  {
535  PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd,
536  COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc );
537  }
538 /*
539 * Restore BLACS topologies
540 */
541  (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop );
542  (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop );
543  }
544  }
545  else
546  {
547  if( upper )
548  {
549 /*
550 * Save current and enforce ring BLACS topologies
551 */
552  btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
553  ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
554  (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING );
555  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING );
556 /*
557 * Remove next line when BLACS combine operations support ring topologies.
558 */
559  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT );
560 /*
561 * Reuse sub( X ) and/or create vector XAR in process row owning the first row
562 * of sub( A )
563 */
564  PB_CInOutV2( type, NOCONJG, ROW, *N, *N, 0, Ad0, 1,
565  ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd,
566  &XARfr, &XARsum, &XARapbX );
567 /*
568 * Create vector XAC in process columns spanned by sub( A )
569 */
570  PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr,
571  &XACsum );
572 /*
573 * Retrieve local quantities related to Ad0 -> sub( A )
574 */
575  Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
576  Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
577  Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
578  Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
579  Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
580  if( ( Anp > 0 ) && ( Anq > 0 ) )
581  Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
582 
583  XACld = XACd[LLD_]; XARld = XARd[LLD_];
584 
585  for( k = 0; k < *N; k += nb )
586  {
587  ktmp = *N - k; kb = MIN( ktmp, nb );
588 /*
589 * Solve logical diagonal block, XAR contains the solution scattered in multiple
590 * process rows and XAC contains the solution replicated in the process columns.
591 */
592  Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
593  Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
594  PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
595  Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
596  Akq, XARld, size ), XARld );
597 /*
598 * Update: only the part of sub( X ) to be solved at the next step is locally
599 * updated and combined, the remaining part of the vector to be solved later is
600 * only locally updated.
601 */
602  Akq = PB_Cnumroc( k+kb, 0, Ainb1, Anb, mycol, Acol, npcol );
603  if( ( Anq0 = Anq - Akq ) > 0 )
604  {
605  Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow );
606  if( XARsum )
607  {
608  kbnext = ktmp - kb; kbnext = MIN( kbnext, nb );
609  ktmp = PB_Cnumroc( kbnext, k+kb, Ainb1, Anb, mycol, Acol,
610  npcol );
611  Anq0 -= ktmp;
612 
613  if( ktmp > 0 )
614  {
615  if( Anp0 > 0 )
616  cgemv_( TRANS, &Anp0, &ktmp, negone,
617  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
618  Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
619  Mptr( XAR, 0, Akq, XARld, size ), &XARld );
620  Asrc = PB_Cindxg2p( k+kb, Aimb1, Amb, Arow, Arow, nprow );
621  Ccgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0,
622  Akq, XARld, size ), XARld, Asrc, mycol );
623  if( myrow != Asrc )
624  cset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld,
625  size ), &XARld );
626  }
627  if( Anp0 > 0 && Anq0 > 0 )
628  cgemv_( TRANS, &Anp0, &Anq0, negone,
629  Mptr( Aptr, Akp, Akq+ktmp, Ald, size ), &Ald,
630  Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
631  Mptr( XAR, 0, Akq+ktmp, XARld, size ), &XARld );
632  }
633  else
634  {
635  if( Anp0 > 0 )
636  cgemv_( TRANS, &Anp0, &Anq0, negone,
637  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
638  Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
639  Mptr( XAR, 0, Akq, XARld, size ), &XARld );
640  }
641  }
642  }
643 /*
644 * Combine the scattered resulting vector XAR
645 */
646  if( XARsum && ( Anq > 0 ) )
647  {
648  Ccgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_],
649  mycol );
650  }
651 /*
652 * sub( X ) := XAR (if necessary)
653 */
654  if( XARapbX )
655  {
656  PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero,
657  ((char *) X), Xi, Xj, Xd, &Xroc );
658  }
659 /*
660 * Restore BLACS topologies
661 */
662  (void) PB_Ctop( &ctxt, BCAST, ROW, &btop );
663  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop );
664  }
665  else
666  {
667 /*
668 * Save current and enforce ring BLACS topologies
669 */
670  btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
671  ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
672  (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_DRING );
673  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DRING );
674 /*
675 * Remove next line when BLACS combine operations support ring topologies.
676 */
677  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT );
678 /*
679 * Reuse sub( X ) and/or create vector XAC in process row owning the last row
680 * of sub( A )
681 */
682  PB_CInOutV2( type, NOCONJG, ROW, *N, *N, *N-1, Ad0, 1,
683  ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd,
684  &XARfr, &XARsum, &XARapbX );
685 /*
686 * Create vector XAC in process columns spanned by sub( A )
687 */
688  PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr,
689  &XACsum );
690 /*
691 * Retrieve local quantities related to Ad0 -> sub( A )
692 */
693  Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
694  Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
695  Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
696  Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
697  Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
698  if( ( Anp > 0 ) && ( Anq > 0 ) )
699  Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
700 
701  XACld = XACd[LLD_]; XARld = XARd[LLD_];
702 
703  for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb )
704  {
705  ktmp = *N - k; kb = MIN( ktmp, nb );
706 /*
707 * Solve logical diagonal block, XAR contains the solution scattered in multiple
708 * process rows and XAC contains the solution replicated in the process columns.
709 */
710  Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
711  Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
712  PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
713  Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
714  Akq, XARld, size ), XARld );
715 /*
716 * Update: only the part of sub( X ) to be solved at the next step is locally
717 * updated and combined, the remaining part of the vector to be solved later
718 * is only locally updated.
719 */
720  if( Akq > 0 )
721  {
722  Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow );
723  if( XARsum )
724  {
725  kbprev = MIN( k, nb );
726  ktmp = PB_Cnumroc( kbprev, k-kbprev, Ainb1, Anb, mycol,
727  Acol, npcol );
728  Akq -= ktmp;
729 
730  if( ktmp > 0 )
731  {
732  if( Anp0 > 0 )
733  cgemv_( TRANS, &Anp0, &ktmp, negone,
734  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
735  Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
736  Mptr( XAR, 0, Akq, XARld, size ), &XARld );
737  Asrc = PB_Cindxg2p( k-1, Aimb1, Amb, Arow, Arow, nprow );
738  Ccgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0,
739  Akq, XARld, size ), XARld, Asrc, mycol );
740  if( myrow != Asrc )
741  cset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld,
742  size ), &XARld );
743  }
744  if( Anp0 > 0 && Akq > 0 )
745  cgemv_( TRANS, &Anp0, &Akq, negone,
746  Mptr( Aptr, Akp, 0, Ald, size ), &Ald,
747  Mptr( XAC, Akp, 0, XACld, size ), &ione,
748  one, XAR, &XARld );
749  }
750  else
751  {
752  if( Anp0 > 0 )
753  cgemv_( TRANS, &Anp0, &Akq, negone,
754  Mptr( Aptr, Akp, 0, Ald, size ), &Ald,
755  Mptr( XAC, Akp, 0, XACld, size ), &ione,
756  one, XAR, &XARld );
757  }
758  }
759  }
760 /*
761 * Combine the scattered resulting vector XAR
762 */
763  if( XARsum && ( Anq > 0 ) )
764  {
765  Ccgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_],
766  mycol );
767  }
768 /*
769 * sub( X ) := XAR (if necessary)
770 */
771  if( XARapbX )
772  {
773  PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero,
774  ((char *) X), Xi, Xj, Xd, &Xroc );
775  }
776 /*
777 * Restore BLACS topologies
778 */
779  (void) PB_Ctop( &ctxt, BCAST, ROW, &btop );
780  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop );
781  }
782  }
783  if( XACfr ) free( XAC );
784  if( XARfr ) free( XAR );
785 /*
786 * End of PCTRSV
787 */
788 }
M_
#define M_
Definition: PBtools.h:39
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
TOP_DEFAULT
#define TOP_DEFAULT
Definition: PBblacs.h:51
PB_Cpaxpby
void PB_Cpaxpby()
Ccgsum2d
void Ccgsum2d()
PB_Cwarn
void PB_Cwarn()
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
PBblacs.h
TOP_DRING
#define TOP_DRING
Definition: PBblacs.h:53
PBtools.h
PBblas.h
CCOTRAN
#define CCOTRAN
Definition: PBblas.h:22
NOCONJG
#define NOCONJG
Definition: PBblas.h:45
PBTYP_T::type
char type
Definition: pblas.h:327
PBpblas.h
DLEN_
#define DLEN_
Definition: PBtools.h:48
CUNIT
#define CUNIT
Definition: PBblas.h:32
LLD_
#define LLD_
Definition: PBtools.h:47
PB_Cdescribe
void PB_Cdescribe()
PB_Cptrsv
void PB_Cptrsv()
F_CHAR_T
char * F_CHAR_T
Definition: pblas.h:118
CROW
#define CROW
Definition: PBblacs.h:21
PB_Cchkvec
void PB_Cchkvec()
IMB_
#define IMB_
Definition: PBtools.h:41
TOP_IRING
#define TOP_IRING
Definition: PBblacs.h:52
pilaenv_
int pilaenv_()
INIT
#define INIT
Definition: PBblas.h:61
PB_Cabort
void PB_Cabort()
CLOWER
#define CLOWER
Definition: PBblas.h:25
pctrsv_
void pctrsv_(F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int *N, float *A, int *IA, int *JA, int *DESCA, float *X, int *IX, int *JX, int *DESCX, int *INCX)
Definition: pctrsv_.c:25
PB_Cindxg2p
int PB_Cindxg2p()
F2C_CHAR
#define F2C_CHAR(a)
Definition: pblas.h:120
PBTYP_T::negone
char * negone
Definition: pblas.h:331
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
PB_Ctop
char * PB_Ctop()
RSRC_
#define RSRC_
Definition: PBtools.h:45
CNOTRAN
#define CNOTRAN
Definition: PBblas.h:18
PBTYP_T::one
char * one
Definition: pblas.h:331
PB_CargFtoC
void PB_CargFtoC()
BCAST
#define BCAST
Definition: PBblacs.h:48
cgemv_
F_VOID_FCT cgemv_()
COMBINE
#define COMBINE
Definition: PBblacs.h:49
PBTYP_T::size
int size
Definition: pblas.h:329
PB_Cchkmat
void PB_Cchkmat()
PB_Cnumroc
int PB_Cnumroc()
cset_
F_VOID_FCT cset_()
MIN
#define MIN(a_, b_)
Definition: PBtools.h:76
CCOLUMN
#define CCOLUMN
Definition: PBblacs.h:20
INB_
#define INB_
Definition: PBtools.h:42
PB_COutV
void PB_COutV()
C2F_CHAR
#define C2F_CHAR(a)
Definition: pblas.h:121
PB_CInOutV2
void PB_CInOutV2()
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mupcase
#define Mupcase(C)
Definition: PBtools.h:83
pblas.h
CUPPER
#define CUPPER
Definition: PBblas.h:26
CTRAN
#define CTRAN
Definition: PBblas.h:20
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CTXT_
#define CTXT_
Definition: PBtools.h:38
PB_Cctypeset
PBTYP_T * PB_Cctypeset()
Definition: PB_Cctypeset.c:19
PBTYP_T::zero
char * zero
Definition: pblas.h:331
CNOUNIT
#define CNOUNIT
Definition: PBblas.h:33
PB_Clcm
int PB_Clcm()