ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
pstrsv_.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "pblas.h"
14 #include "PBpblas.h"
15 #include "PBtools.h"
16 #include "PBblacs.h"
17 #include "PBblas.h"
18 
19 #ifdef __STDC__
20 void pstrsv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * N,
21  float * A, int * IA, int * JA, int * DESCA,
22  float * X, int * IX, int * JX, int * DESCX,
23  int * INCX )
24 #else
25 void pstrsv_( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX,
26  DESCX, INCX )
27 /*
28 * .. Scalar Arguments ..
29 */
30  F_CHAR_T DIAG, TRANS, UPLO;
31  int * IA, * INCX, * IX, * JA, * JX, * N;
32 /*
33 * .. Array Arguments ..
34 */
35  int * DESCA, * DESCX;
36  float * A, * X;
37 #endif
38 {
39 /*
40 * Purpose
41 * =======
42 *
43 * PSTRSV solves one of the systems of equations
44 *
45 * sub( A )*sub( X ) = b, or sub( A )'*sub( X ) = b,
46 *
47 * where
48 *
49 * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and,
50 *
51 * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
52 * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X.
53 *
54 * b and sub( X ) are n element subvectors and sub( A ) is an n by n
55 * unit, or non-unit, upper or lower triangular submatrix.
56 *
57 * No test for singularity or near-singularity is included in this
58 * routine. Such tests must be performed before calling this routine.
59 *
60 * Notes
61 * =====
62 *
63 * A description vector is associated with each 2D block-cyclicly dis-
64 * tributed matrix. This vector stores the information required to
65 * establish the mapping between a matrix entry and its corresponding
66 * process and memory location.
67 *
68 * In the following comments, the character _ should be read as
69 * "of the distributed matrix". Let A be a generic term for any 2D
70 * block cyclicly distributed matrix. Its description vector is DESC_A:
71 *
72 * NOTATION STORED IN EXPLANATION
73 * ---------------- --------------- ------------------------------------
74 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
75 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
76 * the NPROW x NPCOL BLACS process grid
77 * A is distributed over. The context
78 * itself is global, but the handle
79 * (the integer value) may vary.
80 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
81 * ted matrix A, M_A >= 0.
82 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
83 * buted matrix A, N_A >= 0.
84 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
85 * block of the matrix A, IMB_A > 0.
86 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
87 * left block of the matrix A,
88 * INB_A > 0.
89 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
90 * bute the last M_A-IMB_A rows of A,
91 * MB_A > 0.
92 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
93 * bute the last N_A-INB_A columns of
94 * A, NB_A > 0.
95 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
96 * row of the matrix A is distributed,
97 * NPROW > RSRC_A >= 0.
98 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
99 * first column of A is distributed.
100 * NPCOL > CSRC_A >= 0.
101 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
102 * array storing the local blocks of
103 * the distributed matrix A,
104 * IF( Lc( 1, N_A ) > 0 )
105 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
106 * ELSE
107 * LLD_A >= 1.
108 *
109 * Let K be the number of rows of a matrix A starting at the global in-
110 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
111 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
112 * receive if these K rows were distributed over NPROW processes. If K
113 * is the number of columns of a matrix A starting at the global index
114 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
115 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
116 * these K columns were distributed over NPCOL processes.
117 *
118 * The values of Lr() and Lc() may be determined via a call to the func-
119 * tion PB_Cnumroc:
120 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
121 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
122 *
123 * Arguments
124 * =========
125 *
126 * UPLO (global input) CHARACTER*1
127 * On entry, UPLO specifies whether the submatrix sub( A ) is
128 * an upper or lower triangular submatrix as follows:
129 *
130 * UPLO = 'U' or 'u' sub( A ) is an upper triangular
131 * submatrix,
132 *
133 * UPLO = 'L' or 'l' sub( A ) is a lower triangular
134 * submatrix.
135 *
136 * TRANS (global input) CHARACTER*1
137 * On entry, TRANS specifies the operation to be performed as
138 * follows:
139 *
140 * TRANS = 'N' or 'n' sub( A ) * sub( X ) = b.
141 *
142 * TRANS = 'T' or 't' sub( A )' * sub( X ) = b.
143 *
144 * TRANS = 'C' or 'c' sub( A )' * sub( X ) = b.
145 *
146 * DIAG (global input) CHARACTER*1
147 * On entry, DIAG specifies whether or not sub( A ) is unit
148 * triangular as follows:
149 *
150 * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian-
151 * gular,
152 *
153 * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri-
154 * angular.
155 *
156 * N (global input) INTEGER
157 * On entry, N specifies the order of the submatrix sub( A ).
158 * N must be at least zero.
159 *
160 * A (local input) REAL array
161 * On entry, A is an array of dimension (LLD_A, Ka), where Ka is
162 * at least Lc( 1, JA+N-1 ). Before entry, this array contains
163 * the local entries of the matrix A.
164 * Before entry with UPLO = 'U' or 'u', this array contains the
165 * local entries corresponding to the entries of the upper tri-
166 * angular submatrix sub( A ), and the local entries correspon-
167 * ding to the entries of the strictly lower triangular part of
168 * the submatrix sub( A ) are not referenced.
169 * Before entry with UPLO = 'L' or 'l', this array contains the
170 * local entries corresponding to the entries of the lower tri-
171 * angular submatrix sub( A ), and the local entries correspon-
172 * ding to the entries of the strictly upper triangular part of
173 * the submatrix sub( A ) are not referenced.
174 * Note that when DIAG = 'U' or 'u', the local entries corres-
175 * ponding to the diagonal elements of the submatrix sub( A )
176 * are not referenced either, but are assumed to be unity.
177 *
178 * IA (global input) INTEGER
179 * On entry, IA specifies A's global row index, which points to
180 * the beginning of the submatrix sub( A ).
181 *
182 * JA (global input) INTEGER
183 * On entry, JA specifies A's global column index, which points
184 * to the beginning of the submatrix sub( A ).
185 *
186 * DESCA (global and local input) INTEGER array
187 * On entry, DESCA is an integer array of dimension DLEN_. This
188 * is the array descriptor for the matrix A.
189 *
190 * X (local input/local output) REAL array
191 * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
192 * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and
193 * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least
194 * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise.
195 * Before entry, this array contains the local entries of the
196 * matrix X. On entry, sub( X ) is the n element right-hand side
197 * b. On exit, sub( X ) is overwritten with the solution subvec-
198 * tor.
199 *
200 * IX (global input) INTEGER
201 * On entry, IX specifies X's global row index, which points to
202 * the beginning of the submatrix sub( X ).
203 *
204 * JX (global input) INTEGER
205 * On entry, JX specifies X's global column index, which points
206 * to the beginning of the submatrix sub( X ).
207 *
208 * DESCX (global and local input) INTEGER array
209 * On entry, DESCX is an integer array of dimension DLEN_. This
210 * is the array descriptor for the matrix X.
211 *
212 * INCX (global input) INTEGER
213 * On entry, INCX specifies the global increment for the
214 * elements of X. Only two values of INCX are supported in
215 * this version, namely 1 and M_X. INCX must not be zero.
216 *
217 * -- Written on April 1, 1998 by
218 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
219 *
220 * ---------------------------------------------------------------------
221 */
222 /*
223 * .. Local Scalars ..
224 */
225  char DiagA, TranOp, UploA, Xroc, btop, ctop, * negone, * one,
226  * zero;
227  int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Anb,
228  Anp, Anp0, Anq, Anq0, Arow, Asrc, XACapbX, XACfr, XACld,
229  XACsum, XARapbX, XARfr, XARld, XARsum, Xi, Xj, ctxt, info,
230  ione=1, k, kb, kbnext, kbprev, ktmp, mycol, myrow, nb, notran,
231  nounit, npcol, nprow, size, upper;
232  PBTYP_T * type;
233 /*
234 * .. Local Arrays ..
235 */
236  int Ad[DLEN_], Ad0[DLEN_], XACd[DLEN_], XARd[DLEN_], Xd[DLEN_];
237  char * Aptr = NULL, * XAC = NULL, * XAR = NULL;
238 /* ..
239 * .. Executable Statements ..
240 *
241 */
242  upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER );
243  notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN );
244  nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT );
245  PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad );
246  PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd );
247 #ifndef NO_ARGCHK
248 /*
249 * Test the input parameters
250 */
251  Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
252  if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) )
253  {
254  if( ( !upper ) && ( UploA != CLOWER ) )
255  {
256  PB_Cwarn( ctxt, __LINE__, "PSTRSV", "Illegal UPLO = %c\n", UploA );
257  info = -1;
258  }
259  else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) )
260  {
261  PB_Cwarn( ctxt, __LINE__, "PSTRSV", "Illegal TRANS = %c\n", TranOp );
262  info = -2;
263  }
264  else if( ( !nounit ) && ( DiagA != CUNIT ) )
265  {
266  PB_Cwarn( ctxt, __LINE__, "PSTRSV", "Illegal DIAG = %c\n", DiagA );
267  info = -3;
268  }
269  PB_Cchkmat( ctxt, "PSTRSV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 8, &info );
270  PB_Cchkvec( ctxt, "PSTRSV", "X", *N, 4, Xi, Xj, Xd, *INCX, 12, &info );
271  }
272  if( info ) { PB_Cabort( ctxt, "PSTRSV", info ); return; }
273 #endif
274 /*
275 * Quick return if possible
276 */
277  if( *N == 0 ) return;
278 /*
279 * Retrieve process grid information
280 */
281 #ifdef NO_ARGCHK
282  Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
283 #endif
284 /*
285 * Get type structure
286 */
287  type = PB_Cstypeset();
288  size = type->size; one = type->one; zero = type->zero; negone = type->negone;
289 /*
290 * Compute descriptor Ad0 for sub( A )
291 */
292  PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj,
293  &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 );
294 /*
295 * Computational partitioning size is computed as the product of the logical
296 * value returned by pilaenv_ and 2 * lcm( nprow, npcol )
297 */
298  nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) *
299  PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) );
300 
301  Xroc = ( *INCX == Xd[M_] ? CROW : CCOLUMN );
302 
303  if( notran )
304  {
305  if( upper )
306  {
307 /*
308 * Save current and enforce ring BLACS topologies
309 */
310  btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
311  ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
312  (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_DRING );
313  (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DRING );
314 /*
315 * Remove next line when BLACS combine operations support ring topologies.
316 */
317  (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT );
318 /*
319 * Reuse sub( X ) and/or create vector XAC in process column owning the last
320 * column of sub( A )
321 */
322  PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, *N-1, Ad0, 1,
323  ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd,
324  &XACfr, &XACsum, &XACapbX );
325 /*
326 * Create vector XAR in process rows spanned by sub( A )
327 */
328  PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr,
329  &XARsum );
330 /*
331 * Retrieve local quantities related to Ad0 -> sub( A )
332 */
333  Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
334  Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
335  Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
336  Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
337  Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
338  if( ( Anp > 0 ) && ( Anq > 0 ) )
339  Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
340 
341  XACld = XACd[LLD_]; XARld = XARd[LLD_];
342 
343  for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb )
344  {
345  ktmp = *N - k; kb = MIN( ktmp, nb );
346 /*
347 * Solve logical diagonal block, XAC contains the solution scattered in multiple
348 * process columns and XAR contains the solution replicated in the process rows.
349 */
350  Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
351  Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
352  PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
353  Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
354  Akq, XARld, size ), XARld );
355 /*
356 * Update: only the part of sub( X ) to be solved at the next step is locally
357 * updated and combined, the remaining part of the vector to be solved later is
358 * only locally updated.
359 */
360  if( Akp > 0 )
361  {
362  Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
363  if( XACsum )
364  {
365  kbprev = MIN( k, nb );
366  ktmp = PB_Cnumroc( kbprev, k-kbprev, Aimb1, Amb, myrow,
367  Arow, nprow );
368  Akp -= ktmp;
369 
370  if( ktmp > 0 )
371  {
372  if( Anq0 > 0 )
373  sgemv_( TRANS, &ktmp, &Anq0, negone,
374  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
375  Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
376  Mptr( XAC, Akp, 0, XACld, size ), &ione );
377  Asrc = PB_Cindxg2p( k-1, Ainb1, Anb, Acol, Acol, npcol );
378  Csgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp,
379  0, XACld, size ), XACld, myrow, Asrc );
380  if( mycol != Asrc )
381  sset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld,
382  size ), &ione );
383  }
384  if( Akp > 0 && Anq0 > 0 )
385  sgemv_( TRANS, &Akp, &Anq0, negone,
386  Mptr( Aptr, 0, Akq, Ald, size ), &Ald,
387  Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
388  XAC, &ione );
389  }
390  else
391  {
392  if( Anq0 > 0 )
393  sgemv_( TRANS, &Akp, &Anq0, negone,
394  Mptr( Aptr, 0, Akq, Ald, size ), &Ald,
395  Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
396  XAC, &ione );
397  }
398  }
399  }
400 /*
401 * Combine the scattered resulting vector XAC
402 */
403  if( XACsum && ( Anp > 0 ) )
404  {
405  Csgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow,
406  XACd[CSRC_] );
407  }
408 /*
409 * sub( X ) := XAC (if necessary)
410 */
411  if( XACapbX )
412  {
413  PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd, COLUMN,
414  zero, ((char *) X), Xi, Xj, Xd, &Xroc );
415  }
416 /*
417 * Restore BLACS topologies
418 */
419  (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop );
420  (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop );
421  }
422  else
423  {
424 /*
425 * Save current and enforce ring BLACS topologies
426 */
427  btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
428  ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
429  (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING );
430  (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING );
431 /*
432 * Remove next line when BLACS combine operations support ring topologies.
433 */
434  (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT );
435 /*
436 * Reuse sub( X ) and/or create vector XAC in process column owning the first
437 * column of sub( A )
438 */
439  PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, 0, Ad0, 1,
440  ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd,
441  &XACfr, &XACsum, &XACapbX );
442 /*
443 * Create vector XAR in process rows spanned by sub( A )
444 */
445  PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr,
446  &XARsum );
447 /*
448 * Retrieve local quantities related to Ad0 -> sub( A )
449 */
450  Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
451  Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
452  Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
453  Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
454  Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
455  if( ( Anp > 0 ) && ( Anq > 0 ) )
456  Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
457 
458  XACld = XACd[LLD_]; XARld = XARd[LLD_];
459 
460  for( k = 0; k < *N; k += nb )
461  {
462  ktmp = *N - k; kb = MIN( ktmp, nb );
463 /*
464 * Solve logical diagonal block, XAC contains the solution scattered in multiple
465 * process columns and XAR contains the solution replicated in the process rows.
466 */
467  Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
468  Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
469  PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
470  Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
471  Akq, XARld, size ), XARld );
472 /*
473 * Update: only the part of sub( X ) to be solved at the next step is locally
474 * updated and combined, the remaining part of the vector to be solved later is
475 * only locally updated.
476 */
477  Akp = PB_Cnumroc( k+kb, 0, Aimb1, Amb, myrow, Arow, nprow );
478  if( ( Anp0 = Anp - Akp ) > 0 )
479  {
480  Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
481  if( XACsum )
482  {
483  kbnext = ktmp - kb; kbnext = MIN( kbnext, nb );
484  ktmp = PB_Cnumroc( kbnext, k+kb, Aimb1, Amb, myrow, Arow,
485  nprow );
486  Anp0 -= ktmp;
487 
488  if( ktmp > 0 )
489  {
490  if( Anq0 > 0 )
491  sgemv_( TRANS, &ktmp, &Anq0, negone,
492  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
493  Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
494  Mptr( XAC, Akp, 0, XACld, size ), &ione );
495  Asrc = PB_Cindxg2p( k+kb, Ainb1, Anb, Acol, Acol, npcol );
496  Csgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp,
497  0, XACld, size ), XACld, myrow, Asrc );
498  if( mycol != Asrc )
499  sset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld,
500  size ), &ione );
501  }
502  if( Anp0 > 0 && Anq0 > 0 )
503  sgemv_( TRANS, &Anp0, &Anq0, negone,
504  Mptr( Aptr, Akp+ktmp, Akq, Ald, size ), &Ald,
505  Mptr( XAR, 0, Akq, XARld, size ), &XARld,
506  one,
507  Mptr( XAC, Akp+ktmp, 0, XACld, size ), &ione );
508  }
509  else
510  {
511  if( Anq0 > 0 )
512  sgemv_( TRANS, &Anp0, &Anq0, negone,
513  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
514  Mptr( XAR, 0, Akq, XARld, size ), &XARld,
515  one,
516  Mptr( XAC, Akp, 0, XACld, size ), &ione );
517  }
518  }
519  }
520 /*
521 * Combine the scattered resulting vector XAC
522 */
523  if( XACsum && ( Anp > 0 ) )
524  {
525  Csgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow,
526  XACd[CSRC_] );
527  }
528 /*
529 * sub( X ) := XAC (if necessary)
530 */
531  if( XACapbX )
532  {
533  PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd,
534  COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc );
535  }
536 /*
537 * Restore BLACS topologies
538 */
539  (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop );
540  (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop );
541  }
542  }
543  else
544  {
545  if( upper )
546  {
547 /*
548 * Save current and enforce ring BLACS topologies
549 */
550  btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
551  ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
552  (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING );
553  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING );
554 /*
555 * Remove next line when BLACS combine operations support ring topologies.
556 */
557  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT );
558 /*
559 * Reuse sub( X ) and/or create vector XAR in process row owning the first row
560 * of sub( A )
561 */
562  PB_CInOutV2( type, NOCONJG, ROW, *N, *N, 0, Ad0, 1,
563  ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd,
564  &XARfr, &XARsum, &XARapbX );
565 /*
566 * Create vector XAC in process columns spanned by sub( A )
567 */
568  PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr,
569  &XACsum );
570 /*
571 * Retrieve local quantities related to Ad0 -> sub( A )
572 */
573  Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
574  Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
575  Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
576  Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
577  Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
578  if( ( Anp > 0 ) && ( Anq > 0 ) )
579  Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
580 
581  XACld = XACd[LLD_]; XARld = XARd[LLD_];
582 
583  for( k = 0; k < *N; k += nb )
584  {
585  ktmp = *N - k; kb = MIN( ktmp, nb );
586 /*
587 * Solve logical diagonal block, XAR contains the solution scattered in multiple
588 * process rows and XAC contains the solution replicated in the process columns.
589 */
590  Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
591  Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
592  PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
593  Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
594  Akq, XARld, size ), XARld );
595 /*
596 * Update: only the part of sub( X ) to be solved at the next step is locally
597 * updated and combined, the remaining part of the vector to be solved later is
598 * only locally updated.
599 */
600  Akq = PB_Cnumroc( k+kb, 0, Ainb1, Anb, mycol, Acol, npcol );
601  if( ( Anq0 = Anq - Akq ) > 0 )
602  {
603  Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow );
604  if( XARsum )
605  {
606  kbnext = ktmp - kb; kbnext = MIN( kbnext, nb );
607  ktmp = PB_Cnumroc( kbnext, k+kb, Ainb1, Anb, mycol, Acol,
608  npcol );
609  Anq0 -= ktmp;
610 
611  if( ktmp > 0 )
612  {
613  if( Anp0 > 0 )
614  sgemv_( TRANS, &Anp0, &ktmp, negone,
615  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
616  Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
617  Mptr( XAR, 0, Akq, XARld, size ), &XARld );
618  Asrc = PB_Cindxg2p( k+kb, Aimb1, Amb, Arow, Arow, nprow );
619  Csgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0,
620  Akq, XARld, size ), XARld, Asrc, mycol );
621  if( myrow != Asrc )
622  sset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld,
623  size ), &XARld );
624  }
625  if( Anp0 > 0 && Anq0 > 0 )
626  sgemv_( TRANS, &Anp0, &Anq0, negone,
627  Mptr( Aptr, Akp, Akq+ktmp, Ald, size ), &Ald,
628  Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
629  Mptr( XAR, 0, Akq+ktmp, XARld, size ), &XARld );
630  }
631  else
632  {
633  if( Anp0 > 0 )
634  sgemv_( TRANS, &Anp0, &Anq0, negone,
635  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
636  Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
637  Mptr( XAR, 0, Akq, XARld, size ), &XARld );
638  }
639  }
640  }
641 /*
642 * Combine the scattered resulting vector XAR
643 */
644  if( XARsum && ( Anq > 0 ) )
645  {
646  Csgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_],
647  mycol );
648  }
649 /*
650 * sub( X ) := XAR (if necessary)
651 */
652  if( XARapbX )
653  {
654  PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero,
655  ((char *) X), Xi, Xj, Xd, &Xroc );
656  }
657 /*
658 * Restore BLACS topologies
659 */
660  (void) PB_Ctop( &ctxt, BCAST, ROW, &btop );
661  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop );
662  }
663  else
664  {
665 /*
666 * Save current and enforce ring BLACS topologies
667 */
668  btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
669  ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
670  (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_DRING );
671  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DRING );
672 /*
673 * Remove next line when BLACS combine operations support ring topologies.
674 */
675  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT );
676 /*
677 * Reuse sub( X ) and/or create vector XAC in process row owning the last row
678 * of sub( A )
679 */
680  PB_CInOutV2( type, NOCONJG, ROW, *N, *N, *N-1, Ad0, 1,
681  ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd,
682  &XARfr, &XARsum, &XARapbX );
683 /*
684 * Create vector XAC in process columns spanned by sub( A )
685 */
686  PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr,
687  &XACsum );
688 /*
689 * Retrieve local quantities related to Ad0 -> sub( A )
690 */
691  Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
692  Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
693  Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
694  Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
695  Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
696  if( ( Anp > 0 ) && ( Anq > 0 ) )
697  Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
698 
699  XACld = XACd[LLD_]; XARld = XARd[LLD_];
700 
701  for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb )
702  {
703  ktmp = *N - k; kb = MIN( ktmp, nb );
704 /*
705 * Solve logical diagonal block, XAR contains the solution scattered in multiple
706 * process rows and XAC contains the solution replicated in the process columns.
707 */
708  Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
709  Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
710  PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
711  Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
712  Akq, XARld, size ), XARld );
713 /*
714 * Update: only the part of sub( X ) to be solved at the next step is locally
715 * updated and combined, the remaining part of the vector to be solved later
716 * is only locally updated.
717 */
718  if( Akq > 0 )
719  {
720  Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow );
721  if( XARsum )
722  {
723  kbprev = MIN( k, nb );
724  ktmp = PB_Cnumroc( kbprev, k-kbprev, Ainb1, Anb, mycol,
725  Acol, npcol );
726  Akq -= ktmp;
727 
728  if( ktmp > 0 )
729  {
730  if( Anp0 > 0 )
731  sgemv_( TRANS, &Anp0, &ktmp, negone,
732  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
733  Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
734  Mptr( XAR, 0, Akq, XARld, size ), &XARld );
735  Asrc = PB_Cindxg2p( k-1, Aimb1, Amb, Arow, Arow, nprow );
736  Csgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0,
737  Akq, XARld, size ), XARld, Asrc, mycol );
738  if( myrow != Asrc )
739  sset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld,
740  size ), &XARld );
741  }
742  if( Anp0 > 0 && Akq > 0 )
743  sgemv_( TRANS, &Anp0, &Akq, negone,
744  Mptr( Aptr, Akp, 0, Ald, size ), &Ald,
745  Mptr( XAC, Akp, 0, XACld, size ), &ione,
746  one, XAR, &XARld );
747  }
748  else
749  {
750  if( Anp0 > 0 )
751  sgemv_( TRANS, &Anp0, &Akq, negone,
752  Mptr( Aptr, Akp, 0, Ald, size ), &Ald,
753  Mptr( XAC, Akp, 0, XACld, size ), &ione,
754  one, XAR, &XARld );
755  }
756  }
757  }
758 /*
759 * Combine the scattered resulting vector XAR
760 */
761  if( XARsum && ( Anq > 0 ) )
762  {
763  Csgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_],
764  mycol );
765  }
766 /*
767 * sub( X ) := XAR (if necessary)
768 */
769  if( XARapbX )
770  {
771  PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero,
772  ((char *) X), Xi, Xj, Xd, &Xroc );
773  }
774 /*
775 * Restore BLACS topologies
776 */
777  (void) PB_Ctop( &ctxt, BCAST, ROW, &btop );
778  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop );
779  }
780  }
781  if( XACfr ) free( XAC );
782  if( XARfr ) free( XAR );
783 /*
784 * End of PSTRSV
785 */
786 }
M_
#define M_
Definition: PBtools.h:39
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
TOP_DEFAULT
#define TOP_DEFAULT
Definition: PBblacs.h:51
PB_Cpaxpby
void PB_Cpaxpby()
PB_Cwarn
void PB_Cwarn()
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
PBblacs.h
TOP_DRING
#define TOP_DRING
Definition: PBblacs.h:53
PBtools.h
sset_
F_VOID_FCT sset_()
PBblas.h
CCOTRAN
#define CCOTRAN
Definition: PBblas.h:22
NOCONJG
#define NOCONJG
Definition: PBblas.h:45
PBTYP_T::type
char type
Definition: pblas.h:327
PBpblas.h
DLEN_
#define DLEN_
Definition: PBtools.h:48
CUNIT
#define CUNIT
Definition: PBblas.h:32
sgemv_
F_VOID_FCT sgemv_()
LLD_
#define LLD_
Definition: PBtools.h:47
PB_Cdescribe
void PB_Cdescribe()
PB_Cptrsv
void PB_Cptrsv()
F_CHAR_T
char * F_CHAR_T
Definition: pblas.h:118
CROW
#define CROW
Definition: PBblacs.h:21
PB_Cchkvec
void PB_Cchkvec()
IMB_
#define IMB_
Definition: PBtools.h:41
TOP_IRING
#define TOP_IRING
Definition: PBblacs.h:52
pilaenv_
int pilaenv_()
INIT
#define INIT
Definition: PBblas.h:61
PB_Cabort
void PB_Cabort()
CLOWER
#define CLOWER
Definition: PBblas.h:25
PB_Cindxg2p
int PB_Cindxg2p()
F2C_CHAR
#define F2C_CHAR(a)
Definition: pblas.h:120
PBTYP_T::negone
char * negone
Definition: pblas.h:331
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
PB_Ctop
char * PB_Ctop()
RSRC_
#define RSRC_
Definition: PBtools.h:45
CNOTRAN
#define CNOTRAN
Definition: PBblas.h:18
PBTYP_T::one
char * one
Definition: pblas.h:331
PB_CargFtoC
void PB_CargFtoC()
BCAST
#define BCAST
Definition: PBblacs.h:48
COMBINE
#define COMBINE
Definition: PBblacs.h:49
PBTYP_T::size
int size
Definition: pblas.h:329
PB_Cchkmat
void PB_Cchkmat()
PB_Cnumroc
int PB_Cnumroc()
PB_Cstypeset
PBTYP_T * PB_Cstypeset()
Definition: PB_Cstypeset.c:19
MIN
#define MIN(a_, b_)
Definition: PBtools.h:76
CCOLUMN
#define CCOLUMN
Definition: PBblacs.h:20
INB_
#define INB_
Definition: PBtools.h:42
PB_COutV
void PB_COutV()
C2F_CHAR
#define C2F_CHAR(a)
Definition: pblas.h:121
PB_CInOutV2
void PB_CInOutV2()
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mupcase
#define Mupcase(C)
Definition: PBtools.h:83
pblas.h
CUPPER
#define CUPPER
Definition: PBblas.h:26
pstrsv_
void pstrsv_(F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int *N, float *A, int *IA, int *JA, int *DESCA, float *X, int *IX, int *JX, int *DESCX, int *INCX)
Definition: pstrsv_.c:25
CTRAN
#define CTRAN
Definition: PBblas.h:20
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CTXT_
#define CTXT_
Definition: PBtools.h:38
PBTYP_T::zero
char * zero
Definition: pblas.h:331
CNOUNIT
#define CNOUNIT
Definition: PBblas.h:33
Csgsum2d
void Csgsum2d()
PB_Clcm
int PB_Clcm()