SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
Loading...
Searching...
No Matches
pstrsv_.c
Go to the documentation of this file.
1/* ---------------------------------------------------------------------
2*
3* -- PBLAS routine (version 2.0) --
4* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5* and University of California, Berkeley.
6* April 1, 1998
7*
8* ---------------------------------------------------------------------
9*/
10/*
11* Include files
12*/
13#include "pblas.h"
14#include "PBpblas.h"
15#include "PBtools.h"
16#include "PBblacs.h"
17#include "PBblas.h"
18
19#ifdef __STDC__
20void pstrsv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, Int * N,
21 float * A, Int * IA, Int * JA, Int * DESCA,
22 float * X, Int * IX, Int * JX, Int * DESCX,
23 Int * INCX )
24#else
25void pstrsv_( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX,
26 DESCX, INCX )
27/*
28* .. Scalar Arguments ..
29*/
30 F_CHAR_T DIAG, TRANS, UPLO;
31 Int * IA, * INCX, * IX, * JA, * JX, * N;
32/*
33* .. Array Arguments ..
34*/
35 Int * DESCA, * DESCX;
36 float * A, * X;
37#endif
38{
39/*
40* Purpose
41* =======
42*
43* PSTRSV solves one of the systems of equations
44*
45* sub( A )*sub( X ) = b, or sub( A )'*sub( X ) = b,
46*
47* where
48*
49* sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and,
50*
51* sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
52* X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X.
53*
54* b and sub( X ) are n element subvectors and sub( A ) is an n by n
55* unit, or non-unit, upper or lower triangular submatrix.
56*
57* No test for singularity or near-singularity is included in this
58* routine. Such tests must be performed before calling this routine.
59*
60* Notes
61* =====
62*
63* A description vector is associated with each 2D block-cyclicly dis-
64* tributed matrix. This vector stores the information required to
65* establish the mapping between a matrix entry and its corresponding
66* process and memory location.
67*
68* In the following comments, the character _ should be read as
69* "of the distributed matrix". Let A be a generic term for any 2D
70* block cyclicly distributed matrix. Its description vector is DESC_A:
71*
72* NOTATION STORED IN EXPLANATION
73* ---------------- --------------- ------------------------------------
74* DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
75* CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
76* the NPROW x NPCOL BLACS process grid
77* A is distributed over. The context
78* itself is global, but the handle
79* (the integer value) may vary.
80* M_A (global) DESCA[ M_ ] The number of rows in the distribu-
81* ted matrix A, M_A >= 0.
82* N_A (global) DESCA[ N_ ] The number of columns in the distri-
83* buted matrix A, N_A >= 0.
84* IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
85* block of the matrix A, IMB_A > 0.
86* INB_A (global) DESCA[ INB_ ] The number of columns of the upper
87* left block of the matrix A,
88* INB_A > 0.
89* MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
90* bute the last M_A-IMB_A rows of A,
91* MB_A > 0.
92* NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
93* bute the last N_A-INB_A columns of
94* A, NB_A > 0.
95* RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
96* row of the matrix A is distributed,
97* NPROW > RSRC_A >= 0.
98* CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
99* first column of A is distributed.
100* NPCOL > CSRC_A >= 0.
101* LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
102* array storing the local blocks of
103* the distributed matrix A,
104* IF( Lc( 1, N_A ) > 0 )
105* LLD_A >= MAX( 1, Lr( 1, M_A ) )
106* ELSE
107* LLD_A >= 1.
108*
109* Let K be the number of rows of a matrix A starting at the global in-
110* dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
111* that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
112* receive if these K rows were distributed over NPROW processes. If K
113* is the number of columns of a matrix A starting at the global index
114* JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
115* lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
116* these K columns were distributed over NPCOL processes.
117*
118* The values of Lr() and Lc() may be determined via a call to the func-
119* tion PB_Cnumroc:
120* Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
121* Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
122*
123* Arguments
124* =========
125*
126* UPLO (global input) CHARACTER*1
127* On entry, UPLO specifies whether the submatrix sub( A ) is
128* an upper or lower triangular submatrix as follows:
129*
130* UPLO = 'U' or 'u' sub( A ) is an upper triangular
131* submatrix,
132*
133* UPLO = 'L' or 'l' sub( A ) is a lower triangular
134* submatrix.
135*
136* TRANS (global input) CHARACTER*1
137* On entry, TRANS specifies the operation to be performed as
138* follows:
139*
140* TRANS = 'N' or 'n' sub( A ) * sub( X ) = b.
141*
142* TRANS = 'T' or 't' sub( A )' * sub( X ) = b.
143*
144* TRANS = 'C' or 'c' sub( A )' * sub( X ) = b.
145*
146* DIAG (global input) CHARACTER*1
147* On entry, DIAG specifies whether or not sub( A ) is unit
148* triangular as follows:
149*
150* DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian-
151* gular,
152*
153* DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri-
154* angular.
155*
156* N (global input) INTEGER
157* On entry, N specifies the order of the submatrix sub( A ).
158* N must be at least zero.
159*
160* A (local input) REAL array
161* On entry, A is an array of dimension (LLD_A, Ka), where Ka is
162* at least Lc( 1, JA+N-1 ). Before entry, this array contains
163* the local entries of the matrix A.
164* Before entry with UPLO = 'U' or 'u', this array contains the
165* local entries corresponding to the entries of the upper tri-
166* angular submatrix sub( A ), and the local entries correspon-
167* ding to the entries of the strictly lower triangular part of
168* the submatrix sub( A ) are not referenced.
169* Before entry with UPLO = 'L' or 'l', this array contains the
170* local entries corresponding to the entries of the lower tri-
171* angular submatrix sub( A ), and the local entries correspon-
172* ding to the entries of the strictly upper triangular part of
173* the submatrix sub( A ) are not referenced.
174* Note that when DIAG = 'U' or 'u', the local entries corres-
175* ponding to the diagonal elements of the submatrix sub( A )
176* are not referenced either, but are assumed to be unity.
177*
178* IA (global input) INTEGER
179* On entry, IA specifies A's global row index, which points to
180* the beginning of the submatrix sub( A ).
181*
182* JA (global input) INTEGER
183* On entry, JA specifies A's global column index, which points
184* to the beginning of the submatrix sub( A ).
185*
186* DESCA (global and local input) INTEGER array
187* On entry, DESCA is an integer array of dimension DLEN_. This
188* is the array descriptor for the matrix A.
189*
190* X (local input/local output) REAL array
191* On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
192* is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and
193* MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least
194* Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise.
195* Before entry, this array contains the local entries of the
196* matrix X. On entry, sub( X ) is the n element right-hand side
197* b. On exit, sub( X ) is overwritten with the solution subvec-
198* tor.
199*
200* IX (global input) INTEGER
201* On entry, IX specifies X's global row index, which points to
202* the beginning of the submatrix sub( X ).
203*
204* JX (global input) INTEGER
205* On entry, JX specifies X's global column index, which points
206* to the beginning of the submatrix sub( X ).
207*
208* DESCX (global and local input) INTEGER array
209* On entry, DESCX is an integer array of dimension DLEN_. This
210* is the array descriptor for the matrix X.
211*
212* INCX (global input) INTEGER
213* On entry, INCX specifies the global increment for the
214* elements of X. Only two values of INCX are supported in
215* this version, namely 1 and M_X. INCX must not be zero.
216*
217* -- Written on April 1, 1998 by
218* Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
219*
220* ---------------------------------------------------------------------
221*/
222/*
223* .. Local Scalars ..
224*/
225 char DiagA, TranOp, UploA, Xroc, btop, ctop, * negone, * one,
226 * zero;
227 Int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Anb,
228 Anp, Anp0, Anq, Anq0, Arow, Asrc, XACapbX, XACfr, XACld,
229 XACsum, XARapbX, XARfr, XARld, XARsum, Xi, Xj, ctxt, info,
230 ione=1, k, kb, kbnext, kbprev, ktmp, mycol, myrow, nb, notran,
231 nounit, npcol, nprow, size, upper;
232 PBTYP_T * type;
233/*
234* .. Local Arrays ..
235*/
236 Int Ad[DLEN_], Ad0[DLEN_], XACd[DLEN_], XARd[DLEN_], Xd[DLEN_];
237 char * Aptr = NULL, * XAC = NULL, * XAR = NULL;
238/* ..
239* .. Executable Statements ..
240*
241*/
242 upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER );
243 notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN );
244 nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT );
245 PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad );
246 PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd );
247#ifndef NO_ARGCHK
248/*
249* Test the input parameters
250*/
251 Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
252 if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) )
253 {
254 if( ( !upper ) && ( UploA != CLOWER ) )
255 {
256 PB_Cwarn( ctxt, __LINE__, "PSTRSV", "Illegal UPLO = %c\n", UploA );
257 info = -1;
258 }
259 else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) )
260 {
261 PB_Cwarn( ctxt, __LINE__, "PSTRSV", "Illegal TRANS = %c\n", TranOp );
262 info = -2;
263 }
264 else if( ( !nounit ) && ( DiagA != CUNIT ) )
265 {
266 PB_Cwarn( ctxt, __LINE__, "PSTRSV", "Illegal DIAG = %c\n", DiagA );
267 info = -3;
268 }
269 PB_Cchkmat( ctxt, "PSTRSV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 8, &info );
270 PB_Cchkvec( ctxt, "PSTRSV", "X", *N, 4, Xi, Xj, Xd, *INCX, 12, &info );
271 }
272 if( info ) { PB_Cabort( ctxt, "PSTRSV", info ); return; }
273#endif
274/*
275* Quick return if possible
276*/
277 if( *N == 0 ) return;
278/*
279* Retrieve process grid information
280*/
281#ifdef NO_ARGCHK
282 Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
283#endif
284/*
285* Get type structure
286*/
287 type = PB_Cstypeset();
288 size = type->size; one = type->one; zero = type->zero; negone = type->negone;
289/*
290* Compute descriptor Ad0 for sub( A )
291*/
292 PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj,
293 &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 );
294/*
295* Computational partitioning size is computed as the product of the logical
296* value returned by pilaenv_ and 2 * lcm( nprow, npcol )
297*/
298 nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) *
299 PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) );
300
301 Xroc = ( *INCX == Xd[M_] ? CROW : CCOLUMN );
302
303 if( notran )
304 {
305 if( upper )
306 {
307/*
308* Save current and enforce ring BLACS topologies
309*/
310 btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
311 ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
312 (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_DRING );
313 (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DRING );
314/*
315* Remove next line when BLACS combine operations support ring topologies.
316*/
317 (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT );
318/*
319* Reuse sub( X ) and/or create vector XAC in process column owning the last
320* column of sub( A )
321*/
322 PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, *N-1, Ad0, 1,
323 ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd,
324 &XACfr, &XACsum, &XACapbX );
325/*
326* Create vector XAR in process rows spanned by sub( A )
327*/
328 PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr,
329 &XARsum );
330/*
331* Retrieve local quantities related to Ad0 -> sub( A )
332*/
333 Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
334 Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
335 Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
336 Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
337 Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
338 if( ( Anp > 0 ) && ( Anq > 0 ) )
339 Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
340
341 XACld = XACd[LLD_]; XARld = XARd[LLD_];
342
343 for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb )
344 {
345 ktmp = *N - k; kb = MIN( ktmp, nb );
346/*
347* Solve logical diagonal block, XAC contains the solution scattered in multiple
348* process columns and XAR contains the solution replicated in the process rows.
349*/
350 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
351 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
352 PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
353 Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
354 Akq, XARld, size ), XARld );
355/*
356* Update: only the part of sub( X ) to be solved at the next step is locally
357* updated and combined, the remaining part of the vector to be solved later is
358* only locally updated.
359*/
360 if( Akp > 0 )
361 {
362 Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
363 if( XACsum )
364 {
365 kbprev = MIN( k, nb );
366 ktmp = PB_Cnumroc( kbprev, k-kbprev, Aimb1, Amb, myrow,
367 Arow, nprow );
368 Akp -= ktmp;
369
370 if( ktmp > 0 )
371 {
372 if( Anq0 > 0 )
373 sgemv_( TRANS, &ktmp, &Anq0, negone,
374 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
375 Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
376 Mptr( XAC, Akp, 0, XACld, size ), &ione );
377 Asrc = PB_Cindxg2p( k-1, Ainb1, Anb, Acol, Acol, npcol );
378 Csgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp,
379 0, XACld, size ), XACld, myrow, Asrc );
380 if( mycol != Asrc )
381 sset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld,
382 size ), &ione );
383 }
384 if( Akp > 0 && Anq0 > 0 )
385 sgemv_( TRANS, &Akp, &Anq0, negone,
386 Mptr( Aptr, 0, Akq, Ald, size ), &Ald,
387 Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
388 XAC, &ione );
389 }
390 else
391 {
392 if( Anq0 > 0 )
393 sgemv_( TRANS, &Akp, &Anq0, negone,
394 Mptr( Aptr, 0, Akq, Ald, size ), &Ald,
395 Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
396 XAC, &ione );
397 }
398 }
399 }
400/*
401* Combine the scattered resulting vector XAC
402*/
403 if( XACsum && ( Anp > 0 ) )
404 {
405 Csgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow,
406 XACd[CSRC_] );
407 }
408/*
409* sub( X ) := XAC (if necessary)
410*/
411 if( XACapbX )
412 {
413 PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd, COLUMN,
414 zero, ((char *) X), Xi, Xj, Xd, &Xroc );
415 }
416/*
417* Restore BLACS topologies
418*/
419 (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop );
420 (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop );
421 }
422 else
423 {
424/*
425* Save current and enforce ring BLACS topologies
426*/
427 btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
428 ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
429 (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING );
430 (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING );
431/*
432* Remove next line when BLACS combine operations support ring topologies.
433*/
434 (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT );
435/*
436* Reuse sub( X ) and/or create vector XAC in process column owning the first
437* column of sub( A )
438*/
439 PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, 0, Ad0, 1,
440 ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd,
441 &XACfr, &XACsum, &XACapbX );
442/*
443* Create vector XAR in process rows spanned by sub( A )
444*/
445 PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr,
446 &XARsum );
447/*
448* Retrieve local quantities related to Ad0 -> sub( A )
449*/
450 Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
451 Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
452 Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
453 Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
454 Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
455 if( ( Anp > 0 ) && ( Anq > 0 ) )
456 Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
457
458 XACld = XACd[LLD_]; XARld = XARd[LLD_];
459
460 for( k = 0; k < *N; k += nb )
461 {
462 ktmp = *N - k; kb = MIN( ktmp, nb );
463/*
464* Solve logical diagonal block, XAC contains the solution scattered in multiple
465* process columns and XAR contains the solution replicated in the process rows.
466*/
467 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
468 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
469 PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
470 Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
471 Akq, XARld, size ), XARld );
472/*
473* Update: only the part of sub( X ) to be solved at the next step is locally
474* updated and combined, the remaining part of the vector to be solved later is
475* only locally updated.
476*/
477 Akp = PB_Cnumroc( k+kb, 0, Aimb1, Amb, myrow, Arow, nprow );
478 if( ( Anp0 = Anp - Akp ) > 0 )
479 {
480 Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
481 if( XACsum )
482 {
483 kbnext = ktmp - kb; kbnext = MIN( kbnext, nb );
484 ktmp = PB_Cnumroc( kbnext, k+kb, Aimb1, Amb, myrow, Arow,
485 nprow );
486 Anp0 -= ktmp;
487
488 if( ktmp > 0 )
489 {
490 if( Anq0 > 0 )
491 sgemv_( TRANS, &ktmp, &Anq0, negone,
492 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
493 Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
494 Mptr( XAC, Akp, 0, XACld, size ), &ione );
495 Asrc = PB_Cindxg2p( k+kb, Ainb1, Anb, Acol, Acol, npcol );
496 Csgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp,
497 0, XACld, size ), XACld, myrow, Asrc );
498 if( mycol != Asrc )
499 sset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld,
500 size ), &ione );
501 }
502 if( Anp0 > 0 && Anq0 > 0 )
503 sgemv_( TRANS, &Anp0, &Anq0, negone,
504 Mptr( Aptr, Akp+ktmp, Akq, Ald, size ), &Ald,
505 Mptr( XAR, 0, Akq, XARld, size ), &XARld,
506 one,
507 Mptr( XAC, Akp+ktmp, 0, XACld, size ), &ione );
508 }
509 else
510 {
511 if( Anq0 > 0 )
512 sgemv_( TRANS, &Anp0, &Anq0, negone,
513 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
514 Mptr( XAR, 0, Akq, XARld, size ), &XARld,
515 one,
516 Mptr( XAC, Akp, 0, XACld, size ), &ione );
517 }
518 }
519 }
520/*
521* Combine the scattered resulting vector XAC
522*/
523 if( XACsum && ( Anp > 0 ) )
524 {
525 Csgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow,
526 XACd[CSRC_] );
527 }
528/*
529* sub( X ) := XAC (if necessary)
530*/
531 if( XACapbX )
532 {
533 PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd,
534 COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc );
535 }
536/*
537* Restore BLACS topologies
538*/
539 (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop );
540 (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop );
541 }
542 }
543 else
544 {
545 if( upper )
546 {
547/*
548* Save current and enforce ring BLACS topologies
549*/
550 btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
551 ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
552 (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING );
553 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING );
554/*
555* Remove next line when BLACS combine operations support ring topologies.
556*/
557 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT );
558/*
559* Reuse sub( X ) and/or create vector XAR in process row owning the first row
560* of sub( A )
561*/
562 PB_CInOutV2( type, NOCONJG, ROW, *N, *N, 0, Ad0, 1,
563 ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd,
564 &XARfr, &XARsum, &XARapbX );
565/*
566* Create vector XAC in process columns spanned by sub( A )
567*/
568 PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr,
569 &XACsum );
570/*
571* Retrieve local quantities related to Ad0 -> sub( A )
572*/
573 Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
574 Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
575 Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
576 Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
577 Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
578 if( ( Anp > 0 ) && ( Anq > 0 ) )
579 Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
580
581 XACld = XACd[LLD_]; XARld = XARd[LLD_];
582
583 for( k = 0; k < *N; k += nb )
584 {
585 ktmp = *N - k; kb = MIN( ktmp, nb );
586/*
587* Solve logical diagonal block, XAR contains the solution scattered in multiple
588* process rows and XAC contains the solution replicated in the process columns.
589*/
590 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
591 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
592 PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
593 Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
594 Akq, XARld, size ), XARld );
595/*
596* Update: only the part of sub( X ) to be solved at the next step is locally
597* updated and combined, the remaining part of the vector to be solved later is
598* only locally updated.
599*/
600 Akq = PB_Cnumroc( k+kb, 0, Ainb1, Anb, mycol, Acol, npcol );
601 if( ( Anq0 = Anq - Akq ) > 0 )
602 {
603 Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow );
604 if( XARsum )
605 {
606 kbnext = ktmp - kb; kbnext = MIN( kbnext, nb );
607 ktmp = PB_Cnumroc( kbnext, k+kb, Ainb1, Anb, mycol, Acol,
608 npcol );
609 Anq0 -= ktmp;
610
611 if( ktmp > 0 )
612 {
613 if( Anp0 > 0 )
614 sgemv_( TRANS, &Anp0, &ktmp, negone,
615 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
616 Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
617 Mptr( XAR, 0, Akq, XARld, size ), &XARld );
618 Asrc = PB_Cindxg2p( k+kb, Aimb1, Amb, Arow, Arow, nprow );
619 Csgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0,
620 Akq, XARld, size ), XARld, Asrc, mycol );
621 if( myrow != Asrc )
622 sset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld,
623 size ), &XARld );
624 }
625 if( Anp0 > 0 && Anq0 > 0 )
626 sgemv_( TRANS, &Anp0, &Anq0, negone,
627 Mptr( Aptr, Akp, Akq+ktmp, Ald, size ), &Ald,
628 Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
629 Mptr( XAR, 0, Akq+ktmp, XARld, size ), &XARld );
630 }
631 else
632 {
633 if( Anp0 > 0 )
634 sgemv_( TRANS, &Anp0, &Anq0, negone,
635 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
636 Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
637 Mptr( XAR, 0, Akq, XARld, size ), &XARld );
638 }
639 }
640 }
641/*
642* Combine the scattered resulting vector XAR
643*/
644 if( XARsum && ( Anq > 0 ) )
645 {
646 Csgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_],
647 mycol );
648 }
649/*
650* sub( X ) := XAR (if necessary)
651*/
652 if( XARapbX )
653 {
654 PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero,
655 ((char *) X), Xi, Xj, Xd, &Xroc );
656 }
657/*
658* Restore BLACS topologies
659*/
660 (void) PB_Ctop( &ctxt, BCAST, ROW, &btop );
661 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop );
662 }
663 else
664 {
665/*
666* Save current and enforce ring BLACS topologies
667*/
668 btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
669 ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
670 (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_DRING );
671 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DRING );
672/*
673* Remove next line when BLACS combine operations support ring topologies.
674*/
675 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT );
676/*
677* Reuse sub( X ) and/or create vector XAC in process row owning the last row
678* of sub( A )
679*/
680 PB_CInOutV2( type, NOCONJG, ROW, *N, *N, *N-1, Ad0, 1,
681 ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd,
682 &XARfr, &XARsum, &XARapbX );
683/*
684* Create vector XAC in process columns spanned by sub( A )
685*/
686 PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr,
687 &XACsum );
688/*
689* Retrieve local quantities related to Ad0 -> sub( A )
690*/
691 Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
692 Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
693 Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
694 Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
695 Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
696 if( ( Anp > 0 ) && ( Anq > 0 ) )
697 Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
698
699 XACld = XACd[LLD_]; XARld = XARd[LLD_];
700
701 for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb )
702 {
703 ktmp = *N - k; kb = MIN( ktmp, nb );
704/*
705* Solve logical diagonal block, XAR contains the solution scattered in multiple
706* process rows and XAC contains the solution replicated in the process columns.
707*/
708 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
709 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
710 PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
711 Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
712 Akq, XARld, size ), XARld );
713/*
714* Update: only the part of sub( X ) to be solved at the next step is locally
715* updated and combined, the remaining part of the vector to be solved later
716* is only locally updated.
717*/
718 if( Akq > 0 )
719 {
720 Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow );
721 if( XARsum )
722 {
723 kbprev = MIN( k, nb );
724 ktmp = PB_Cnumroc( kbprev, k-kbprev, Ainb1, Anb, mycol,
725 Acol, npcol );
726 Akq -= ktmp;
727
728 if( ktmp > 0 )
729 {
730 if( Anp0 > 0 )
731 sgemv_( TRANS, &Anp0, &ktmp, negone,
732 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
733 Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
734 Mptr( XAR, 0, Akq, XARld, size ), &XARld );
735 Asrc = PB_Cindxg2p( k-1, Aimb1, Amb, Arow, Arow, nprow );
736 Csgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0,
737 Akq, XARld, size ), XARld, Asrc, mycol );
738 if( myrow != Asrc )
739 sset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld,
740 size ), &XARld );
741 }
742 if( Anp0 > 0 && Akq > 0 )
743 sgemv_( TRANS, &Anp0, &Akq, negone,
744 Mptr( Aptr, Akp, 0, Ald, size ), &Ald,
745 Mptr( XAC, Akp, 0, XACld, size ), &ione,
746 one, XAR, &XARld );
747 }
748 else
749 {
750 if( Anp0 > 0 )
751 sgemv_( TRANS, &Anp0, &Akq, negone,
752 Mptr( Aptr, Akp, 0, Ald, size ), &Ald,
753 Mptr( XAC, Akp, 0, XACld, size ), &ione,
754 one, XAR, &XARld );
755 }
756 }
757 }
758/*
759* Combine the scattered resulting vector XAR
760*/
761 if( XARsum && ( Anq > 0 ) )
762 {
763 Csgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_],
764 mycol );
765 }
766/*
767* sub( X ) := XAR (if necessary)
768*/
769 if( XARapbX )
770 {
771 PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero,
772 ((char *) X), Xi, Xj, Xd, &Xroc );
773 }
774/*
775* Restore BLACS topologies
776*/
777 (void) PB_Ctop( &ctxt, BCAST, ROW, &btop );
778 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop );
779 }
780 }
781 if( XACfr ) free( XAC );
782 if( XARfr ) free( XAR );
783/*
784* End of PSTRSV
785*/
786}
#define Int
Definition Bconfig.h:22
#define F2C_CHAR(a)
Definition pblas.h:124
#define C2F_CHAR(a)
Definition pblas.h:125
char * F_CHAR_T
Definition pblas.h:122
#define CCOLUMN
Definition PBblacs.h:20
#define TOP_GET
Definition PBblacs.h:50
#define COLUMN
Definition PBblacs.h:45
#define COMBINE
Definition PBblacs.h:49
#define TOP_IRING
Definition PBblacs.h:52
#define CROW
Definition PBblacs.h:21
#define TOP_DRING
Definition PBblacs.h:53
#define TOP_DEFAULT
Definition PBblacs.h:51
#define ROW
Definition PBblacs.h:46
void Cblacs_gridinfo()
#define BCAST
Definition PBblacs.h:48
void Csgsum2d()
#define CNOUNIT
Definition PBblas.h:33
#define NOCONJG
Definition PBblas.h:45
#define CUPPER
Definition PBblas.h:26
#define CNOTRAN
Definition PBblas.h:18
#define sgemv_
Definition PBblas.h:139
#define CTRAN
Definition PBblas.h:20
#define CCOTRAN
Definition PBblas.h:22
#define INIT
Definition PBblas.h:61
#define CLOWER
Definition PBblas.h:25
#define CUNIT
Definition PBblas.h:32
#define pstrsv_
Definition PBpblas.h:124
#define pilaenv_
Definition PBpblas.h:44
#define CTXT_
Definition PBtools.h:38
#define MB_
Definition PBtools.h:43
void PB_Cabort()
void PB_Cchkvec()
void PB_Cchkmat()
#define MIN(a_, b_)
Definition PBtools.h:76
#define Mptr(a_, i_, j_, lda_, siz_)
Definition PBtools.h:132
void PB_Cwarn()
#define LLD_
Definition PBtools.h:47
Int PB_Cnumroc()
char * PB_Ctop()
PBTYP_T * PB_Cstypeset()
void PB_CInOutV2()
#define RSRC_
Definition PBtools.h:45
void PB_COutV()
#define M_
Definition PBtools.h:39
#define INB_
Definition PBtools.h:42
void PB_CargFtoC()
void PB_Cptrsv()
#define CSRC_
Definition PBtools.h:46
Int PB_Clcm()
#define IMB_
Definition PBtools.h:41
Int PB_Cindxg2p()
#define Mupcase(C)
Definition PBtools.h:83
#define DLEN_
Definition PBtools.h:48
#define NB_
Definition PBtools.h:44
void PB_Cpaxpby()
void PB_Cdescribe()
#define sset_
Definition PBtools.h:663
char type
Definition pblas.h:331
Int size
Definition pblas.h:333
char * zero
Definition pblas.h:335
char * negone
Definition pblas.h:337
char * one
Definition pblas.h:336