SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
Loading...
Searching...
No Matches
pctrsv_.c
Go to the documentation of this file.
1/* ---------------------------------------------------------------------
2*
3* -- PBLAS routine (version 2.0) --
4* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5* and University of California, Berkeley.
6* April 1, 1998
7*
8* ---------------------------------------------------------------------
9*/
10/*
11* Include files
12*/
13#include "pblas.h"
14#include "PBpblas.h"
15#include "PBtools.h"
16#include "PBblacs.h"
17#include "PBblas.h"
18
19#ifdef __STDC__
20void pctrsv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, Int * N,
21 float * A, Int * IA, Int * JA, Int * DESCA,
22 float * X, Int * IX, Int * JX, Int * DESCX,
23 Int * INCX )
24#else
25void pctrsv_( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX,
26 DESCX, INCX )
27/*
28* .. Scalar Arguments ..
29*/
30 F_CHAR_T DIAG, TRANS, UPLO;
31 Int * IA, * INCX, * IX, * JA, * JX, * N;
32/*
33* .. Array Arguments ..
34*/
35 Int * DESCA, * DESCX;
36 float * A, * X;
37#endif
38{
39/*
40* Purpose
41* =======
42*
43* PCTRSV solves one of the systems of equations
44*
45* sub( A )*sub( X ) = b, or sub( A )'*sub( X ) = b, or
46*
47* conjg( sub( A )' )*sub( X ) = b,
48*
49* where
50*
51* sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and,
52*
53* sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
54* X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X.
55*
56* b and sub( X ) are n element subvectors and sub( A ) is an n by n
57* unit, or non-unit, upper or lower triangular submatrix.
58*
59* No test for singularity or near-singularity is included in this
60* routine. Such tests must be performed before calling this routine.
61*
62* Notes
63* =====
64*
65* A description vector is associated with each 2D block-cyclicly dis-
66* tributed matrix. This vector stores the information required to
67* establish the mapping between a matrix entry and its corresponding
68* process and memory location.
69*
70* In the following comments, the character _ should be read as
71* "of the distributed matrix". Let A be a generic term for any 2D
72* block cyclicly distributed matrix. Its description vector is DESC_A:
73*
74* NOTATION STORED IN EXPLANATION
75* ---------------- --------------- ------------------------------------
76* DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
77* CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
78* the NPROW x NPCOL BLACS process grid
79* A is distributed over. The context
80* itself is global, but the handle
81* (the integer value) may vary.
82* M_A (global) DESCA[ M_ ] The number of rows in the distribu-
83* ted matrix A, M_A >= 0.
84* N_A (global) DESCA[ N_ ] The number of columns in the distri-
85* buted matrix A, N_A >= 0.
86* IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
87* block of the matrix A, IMB_A > 0.
88* INB_A (global) DESCA[ INB_ ] The number of columns of the upper
89* left block of the matrix A,
90* INB_A > 0.
91* MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
92* bute the last M_A-IMB_A rows of A,
93* MB_A > 0.
94* NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
95* bute the last N_A-INB_A columns of
96* A, NB_A > 0.
97* RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
98* row of the matrix A is distributed,
99* NPROW > RSRC_A >= 0.
100* CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
101* first column of A is distributed.
102* NPCOL > CSRC_A >= 0.
103* LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
104* array storing the local blocks of
105* the distributed matrix A,
106* IF( Lc( 1, N_A ) > 0 )
107* LLD_A >= MAX( 1, Lr( 1, M_A ) )
108* ELSE
109* LLD_A >= 1.
110*
111* Let K be the number of rows of a matrix A starting at the global in-
112* dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
113* that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
114* receive if these K rows were distributed over NPROW processes. If K
115* is the number of columns of a matrix A starting at the global index
116* JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
117* lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
118* these K columns were distributed over NPCOL processes.
119*
120* The values of Lr() and Lc() may be determined via a call to the func-
121* tion PB_Cnumroc:
122* Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
123* Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
124*
125* Arguments
126* =========
127*
128* UPLO (global input) CHARACTER*1
129* On entry, UPLO specifies whether the submatrix sub( A ) is
130* an upper or lower triangular submatrix as follows:
131*
132* UPLO = 'U' or 'u' sub( A ) is an upper triangular
133* submatrix,
134*
135* UPLO = 'L' or 'l' sub( A ) is a lower triangular
136* submatrix.
137*
138* TRANS (global input) CHARACTER*1
139* On entry, TRANS specifies the operation to be performed as
140* follows:
141*
142* TRANS = 'N' or 'n' sub( A ) * sub( X ) = b.
143*
144* TRANS = 'T' or 't' sub( A )' * sub( X ) = b.
145*
146* TRANS = 'C' or 'c' conjg( sub( A )' ) * sub( X ) = b.
147*
148* DIAG (global input) CHARACTER*1
149* On entry, DIAG specifies whether or not sub( A ) is unit
150* triangular as follows:
151*
152* DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian-
153* gular,
154*
155* DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri-
156* angular.
157*
158* N (global input) INTEGER
159* On entry, N specifies the order of the submatrix sub( A ).
160* N must be at least zero.
161*
162* A (local input) COMPLEX array
163* On entry, A is an array of dimension (LLD_A, Ka), where Ka is
164* at least Lc( 1, JA+N-1 ). Before entry, this array contains
165* the local entries of the matrix A.
166* Before entry with UPLO = 'U' or 'u', this array contains the
167* local entries corresponding to the entries of the upper tri-
168* angular submatrix sub( A ), and the local entries correspon-
169* ding to the entries of the strictly lower triangular part of
170* the submatrix sub( A ) are not referenced.
171* Before entry with UPLO = 'L' or 'l', this array contains the
172* local entries corresponding to the entries of the lower tri-
173* angular submatrix sub( A ), and the local entries correspon-
174* ding to the entries of the strictly upper triangular part of
175* the submatrix sub( A ) are not referenced.
176* Note that when DIAG = 'U' or 'u', the local entries corres-
177* ponding to the diagonal elements of the submatrix sub( A )
178* are not referenced either, but are assumed to be unity.
179*
180* IA (global input) INTEGER
181* On entry, IA specifies A's global row index, which points to
182* the beginning of the submatrix sub( A ).
183*
184* JA (global input) INTEGER
185* On entry, JA specifies A's global column index, which points
186* to the beginning of the submatrix sub( A ).
187*
188* DESCA (global and local input) INTEGER array
189* On entry, DESCA is an integer array of dimension DLEN_. This
190* is the array descriptor for the matrix A.
191*
192* X (local input/local output) COMPLEX array
193* On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
194* is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and
195* MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least
196* Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise.
197* Before entry, this array contains the local entries of the
198* matrix X. On entry, sub( X ) is the n element right-hand side
199* b. On exit, sub( X ) is overwritten with the solution subvec-
200* tor.
201*
202* IX (global input) INTEGER
203* On entry, IX specifies X's global row index, which points to
204* the beginning of the submatrix sub( X ).
205*
206* JX (global input) INTEGER
207* On entry, JX specifies X's global column index, which points
208* to the beginning of the submatrix sub( X ).
209*
210* DESCX (global and local input) INTEGER array
211* On entry, DESCX is an integer array of dimension DLEN_. This
212* is the array descriptor for the matrix X.
213*
214* INCX (global input) INTEGER
215* On entry, INCX specifies the global increment for the
216* elements of X. Only two values of INCX are supported in
217* this version, namely 1 and M_X. INCX must not be zero.
218*
219* -- Written on April 1, 1998 by
220* Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
221*
222* ---------------------------------------------------------------------
223*/
224/*
225* .. Local Scalars ..
226*/
227 char DiagA, TranOp, UploA, Xroc, btop, ctop, * negone, * one,
228 * zero;
229 Int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Anb,
230 Anp, Anp0, Anq, Anq0, Arow, Asrc, XACapbX, XACfr, XACld,
231 XACsum, XARapbX, XARfr, XARld, XARsum, Xi, Xj, ctxt, info,
232 ione=1, k, kb, kbnext, kbprev, ktmp, mycol, myrow, nb, notran,
233 nounit, npcol, nprow, size, upper;
234 PBTYP_T * type;
235/*
236* .. Local Arrays ..
237*/
238 Int Ad[DLEN_], Ad0[DLEN_], XACd[DLEN_], XARd[DLEN_], Xd[DLEN_];
239 char * Aptr = NULL, * XAC = NULL, * XAR = NULL;
240/* ..
241* .. Executable Statements ..
242*
243*/
244 upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER );
245 notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN );
246 nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT );
247 PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad );
248 PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd );
249#ifndef NO_ARGCHK
250/*
251* Test the input parameters
252*/
253 Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
254 if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) )
255 {
256 if( ( !upper ) && ( UploA != CLOWER ) )
257 {
258 PB_Cwarn( ctxt, __LINE__, "PCTRSV", "Illegal UPLO = %c\n", UploA );
259 info = -1;
260 }
261 else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) )
262 {
263 PB_Cwarn( ctxt, __LINE__, "PCTRSV", "Illegal TRANS = %c\n", TranOp );
264 info = -2;
265 }
266 else if( ( !nounit ) && ( DiagA != CUNIT ) )
267 {
268 PB_Cwarn( ctxt, __LINE__, "PCTRSV", "Illegal DIAG = %c\n", DiagA );
269 info = -3;
270 }
271 PB_Cchkmat( ctxt, "PCTRSV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 8, &info );
272 PB_Cchkvec( ctxt, "PCTRSV", "X", *N, 4, Xi, Xj, Xd, *INCX, 12, &info );
273 }
274 if( info ) { PB_Cabort( ctxt, "PCTRSV", info ); return; }
275#endif
276/*
277* Quick return if possible
278*/
279 if( *N == 0 ) return;
280/*
281* Retrieve process grid information
282*/
283#ifdef NO_ARGCHK
284 Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
285#endif
286/*
287* Get type structure
288*/
289 type = PB_Cctypeset();
290 size = type->size; one = type->one; zero = type->zero; negone = type->negone;
291/*
292* Compute descriptor Ad0 for sub( A )
293*/
294 PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj,
295 &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 );
296/*
297* Computational partitioning size is computed as the product of the logical
298* value returned by pilaenv_ and 2 * lcm( nprow, npcol )
299*/
300 nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) *
301 PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) );
302
303 Xroc = ( *INCX == Xd[M_] ? CROW : CCOLUMN );
304
305 if( notran )
306 {
307 if( upper )
308 {
309/*
310* Save current and enforce ring BLACS topologies
311*/
312 btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
313 ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
314 (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_DRING );
315 (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DRING );
316/*
317* Remove next line when BLACS combine operations support ring topologies.
318*/
319 (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT );
320/*
321* Reuse sub( X ) and/or create vector XAC in process column owning the last
322* column of sub( A )
323*/
324 PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, *N-1, Ad0, 1,
325 ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd,
326 &XACfr, &XACsum, &XACapbX );
327/*
328* Create vector XAR in process rows spanned by sub( A )
329*/
330 PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr,
331 &XARsum );
332/*
333* Retrieve local quantities related to Ad0 -> sub( A )
334*/
335 Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
336 Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
337 Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
338 Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
339 Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
340 if( ( Anp > 0 ) && ( Anq > 0 ) )
341 Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
342
343 XACld = XACd[LLD_]; XARld = XARd[LLD_];
344
345 for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb )
346 {
347 ktmp = *N - k; kb = MIN( ktmp, nb );
348/*
349* Solve logical diagonal block, XAC contains the solution scattered in multiple
350* process columns and XAR contains the solution replicated in the process rows.
351*/
352 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
353 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
354 PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
355 Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
356 Akq, XARld, size ), XARld );
357/*
358* Update: only the part of sub( X ) to be solved at the next step is locally
359* updated and combined, the remaining part of the vector to be solved later is
360* only locally updated.
361*/
362 if( Akp > 0 )
363 {
364 Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
365 if( XACsum )
366 {
367 kbprev = MIN( k, nb );
368 ktmp = PB_Cnumroc( kbprev, k-kbprev, Aimb1, Amb, myrow,
369 Arow, nprow );
370 Akp -= ktmp;
371
372 if( ktmp > 0 )
373 {
374 if( Anq0 > 0 )
375 cgemv_( TRANS, &ktmp, &Anq0, negone,
376 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
377 Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
378 Mptr( XAC, Akp, 0, XACld, size ), &ione );
379 Asrc = PB_Cindxg2p( k-1, Ainb1, Anb, Acol, Acol, npcol );
380 Ccgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp,
381 0, XACld, size ), XACld, myrow, Asrc );
382 if( mycol != Asrc )
383 cset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld,
384 size ), &ione );
385 }
386 if( Akp > 0 && Anq0 > 0 )
387 cgemv_( TRANS, &Akp, &Anq0, negone,
388 Mptr( Aptr, 0, Akq, Ald, size ), &Ald,
389 Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
390 XAC, &ione );
391 }
392 else
393 {
394 if( Anq0 > 0 )
395 cgemv_( TRANS, &Akp, &Anq0, negone,
396 Mptr( Aptr, 0, Akq, Ald, size ), &Ald,
397 Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
398 XAC, &ione );
399 }
400 }
401 }
402/*
403* Combine the scattered resulting vector XAC
404*/
405 if( XACsum && ( Anp > 0 ) )
406 {
407 Ccgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow,
408 XACd[CSRC_] );
409 }
410/*
411* sub( X ) := XAC (if necessary)
412*/
413 if( XACapbX )
414 {
415 PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd, COLUMN,
416 zero, ((char *) X), Xi, Xj, Xd, &Xroc );
417 }
418/*
419* Restore BLACS topologies
420*/
421 (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop );
422 (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop );
423 }
424 else
425 {
426/*
427* Save current and enforce ring BLACS topologies
428*/
429 btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
430 ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
431 (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING );
432 (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING );
433/*
434* Remove next line when BLACS combine operations support ring topologies.
435*/
436 (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT );
437/*
438* Reuse sub( X ) and/or create vector XAC in process column owning the first
439* column of sub( A )
440*/
441 PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, 0, Ad0, 1,
442 ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd,
443 &XACfr, &XACsum, &XACapbX );
444/*
445* Create vector XAR in process rows spanned by sub( A )
446*/
447 PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr,
448 &XARsum );
449/*
450* Retrieve local quantities related to Ad0 -> sub( A )
451*/
452 Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
453 Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
454 Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
455 Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
456 Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
457 if( ( Anp > 0 ) && ( Anq > 0 ) )
458 Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
459
460 XACld = XACd[LLD_]; XARld = XARd[LLD_];
461
462 for( k = 0; k < *N; k += nb )
463 {
464 ktmp = *N - k; kb = MIN( ktmp, nb );
465/*
466* Solve logical diagonal block, XAC contains the solution scattered in multiple
467* process columns and XAR contains the solution replicated in the process rows.
468*/
469 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
470 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
471 PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
472 Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
473 Akq, XARld, size ), XARld );
474/*
475* Update: only the part of sub( X ) to be solved at the next step is locally
476* updated and combined, the remaining part of the vector to be solved later is
477* only locally updated.
478*/
479 Akp = PB_Cnumroc( k+kb, 0, Aimb1, Amb, myrow, Arow, nprow );
480 if( ( Anp0 = Anp - Akp ) > 0 )
481 {
482 Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
483 if( XACsum )
484 {
485 kbnext = ktmp - kb; kbnext = MIN( kbnext, nb );
486 ktmp = PB_Cnumroc( kbnext, k+kb, Aimb1, Amb, myrow, Arow,
487 nprow );
488 Anp0 -= ktmp;
489
490 if( ktmp > 0 )
491 {
492 if( Anq0 > 0 )
493 cgemv_( TRANS, &ktmp, &Anq0, negone,
494 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
495 Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
496 Mptr( XAC, Akp, 0, XACld, size ), &ione );
497 Asrc = PB_Cindxg2p( k+kb, Ainb1, Anb, Acol, Acol, npcol );
498 Ccgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp,
499 0, XACld, size ), XACld, myrow, Asrc );
500 if( mycol != Asrc )
501 cset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld,
502 size ), &ione );
503 }
504 if( Anp0 > 0 && Anq0 > 0 )
505 cgemv_( TRANS, &Anp0, &Anq0, negone,
506 Mptr( Aptr, Akp+ktmp, Akq, Ald, size ), &Ald,
507 Mptr( XAR, 0, Akq, XARld, size ), &XARld,
508 one,
509 Mptr( XAC, Akp+ktmp, 0, XACld, size ), &ione );
510 }
511 else
512 {
513 if( Anq0 > 0 )
514 cgemv_( TRANS, &Anp0, &Anq0, negone,
515 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
516 Mptr( XAR, 0, Akq, XARld, size ), &XARld,
517 one,
518 Mptr( XAC, Akp, 0, XACld, size ), &ione );
519 }
520 }
521 }
522/*
523* Combine the scattered resulting vector XAC
524*/
525 if( XACsum && ( Anp > 0 ) )
526 {
527 Ccgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow,
528 XACd[CSRC_] );
529 }
530/*
531* sub( X ) := XAC (if necessary)
532*/
533 if( XACapbX )
534 {
535 PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd,
536 COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc );
537 }
538/*
539* Restore BLACS topologies
540*/
541 (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop );
542 (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop );
543 }
544 }
545 else
546 {
547 if( upper )
548 {
549/*
550* Save current and enforce ring BLACS topologies
551*/
552 btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
553 ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
554 (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING );
555 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING );
556/*
557* Remove next line when BLACS combine operations support ring topologies.
558*/
559 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT );
560/*
561* Reuse sub( X ) and/or create vector XAR in process row owning the first row
562* of sub( A )
563*/
564 PB_CInOutV2( type, NOCONJG, ROW, *N, *N, 0, Ad0, 1,
565 ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd,
566 &XARfr, &XARsum, &XARapbX );
567/*
568* Create vector XAC in process columns spanned by sub( A )
569*/
570 PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr,
571 &XACsum );
572/*
573* Retrieve local quantities related to Ad0 -> sub( A )
574*/
575 Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
576 Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
577 Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
578 Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
579 Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
580 if( ( Anp > 0 ) && ( Anq > 0 ) )
581 Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
582
583 XACld = XACd[LLD_]; XARld = XARd[LLD_];
584
585 for( k = 0; k < *N; k += nb )
586 {
587 ktmp = *N - k; kb = MIN( ktmp, nb );
588/*
589* Solve logical diagonal block, XAR contains the solution scattered in multiple
590* process rows and XAC contains the solution replicated in the process columns.
591*/
592 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
593 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
594 PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
595 Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
596 Akq, XARld, size ), XARld );
597/*
598* Update: only the part of sub( X ) to be solved at the next step is locally
599* updated and combined, the remaining part of the vector to be solved later is
600* only locally updated.
601*/
602 Akq = PB_Cnumroc( k+kb, 0, Ainb1, Anb, mycol, Acol, npcol );
603 if( ( Anq0 = Anq - Akq ) > 0 )
604 {
605 Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow );
606 if( XARsum )
607 {
608 kbnext = ktmp - kb; kbnext = MIN( kbnext, nb );
609 ktmp = PB_Cnumroc( kbnext, k+kb, Ainb1, Anb, mycol, Acol,
610 npcol );
611 Anq0 -= ktmp;
612
613 if( ktmp > 0 )
614 {
615 if( Anp0 > 0 )
616 cgemv_( TRANS, &Anp0, &ktmp, negone,
617 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
618 Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
619 Mptr( XAR, 0, Akq, XARld, size ), &XARld );
620 Asrc = PB_Cindxg2p( k+kb, Aimb1, Amb, Arow, Arow, nprow );
621 Ccgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0,
622 Akq, XARld, size ), XARld, Asrc, mycol );
623 if( myrow != Asrc )
624 cset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld,
625 size ), &XARld );
626 }
627 if( Anp0 > 0 && Anq0 > 0 )
628 cgemv_( TRANS, &Anp0, &Anq0, negone,
629 Mptr( Aptr, Akp, Akq+ktmp, Ald, size ), &Ald,
630 Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
631 Mptr( XAR, 0, Akq+ktmp, XARld, size ), &XARld );
632 }
633 else
634 {
635 if( Anp0 > 0 )
636 cgemv_( TRANS, &Anp0, &Anq0, negone,
637 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
638 Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
639 Mptr( XAR, 0, Akq, XARld, size ), &XARld );
640 }
641 }
642 }
643/*
644* Combine the scattered resulting vector XAR
645*/
646 if( XARsum && ( Anq > 0 ) )
647 {
648 Ccgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_],
649 mycol );
650 }
651/*
652* sub( X ) := XAR (if necessary)
653*/
654 if( XARapbX )
655 {
656 PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero,
657 ((char *) X), Xi, Xj, Xd, &Xroc );
658 }
659/*
660* Restore BLACS topologies
661*/
662 (void) PB_Ctop( &ctxt, BCAST, ROW, &btop );
663 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop );
664 }
665 else
666 {
667/*
668* Save current and enforce ring BLACS topologies
669*/
670 btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
671 ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
672 (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_DRING );
673 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DRING );
674/*
675* Remove next line when BLACS combine operations support ring topologies.
676*/
677 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT );
678/*
679* Reuse sub( X ) and/or create vector XAC in process row owning the last row
680* of sub( A )
681*/
682 PB_CInOutV2( type, NOCONJG, ROW, *N, *N, *N-1, Ad0, 1,
683 ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd,
684 &XARfr, &XARsum, &XARapbX );
685/*
686* Create vector XAC in process columns spanned by sub( A )
687*/
688 PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr,
689 &XACsum );
690/*
691* Retrieve local quantities related to Ad0 -> sub( A )
692*/
693 Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
694 Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
695 Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
696 Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
697 Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
698 if( ( Anp > 0 ) && ( Anq > 0 ) )
699 Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
700
701 XACld = XACd[LLD_]; XARld = XARd[LLD_];
702
703 for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb )
704 {
705 ktmp = *N - k; kb = MIN( ktmp, nb );
706/*
707* Solve logical diagonal block, XAR contains the solution scattered in multiple
708* process rows and XAC contains the solution replicated in the process columns.
709*/
710 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
711 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
712 PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
713 Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
714 Akq, XARld, size ), XARld );
715/*
716* Update: only the part of sub( X ) to be solved at the next step is locally
717* updated and combined, the remaining part of the vector to be solved later
718* is only locally updated.
719*/
720 if( Akq > 0 )
721 {
722 Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow );
723 if( XARsum )
724 {
725 kbprev = MIN( k, nb );
726 ktmp = PB_Cnumroc( kbprev, k-kbprev, Ainb1, Anb, mycol,
727 Acol, npcol );
728 Akq -= ktmp;
729
730 if( ktmp > 0 )
731 {
732 if( Anp0 > 0 )
733 cgemv_( TRANS, &Anp0, &ktmp, negone,
734 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
735 Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
736 Mptr( XAR, 0, Akq, XARld, size ), &XARld );
737 Asrc = PB_Cindxg2p( k-1, Aimb1, Amb, Arow, Arow, nprow );
738 Ccgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0,
739 Akq, XARld, size ), XARld, Asrc, mycol );
740 if( myrow != Asrc )
741 cset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld,
742 size ), &XARld );
743 }
744 if( Anp0 > 0 && Akq > 0 )
745 cgemv_( TRANS, &Anp0, &Akq, negone,
746 Mptr( Aptr, Akp, 0, Ald, size ), &Ald,
747 Mptr( XAC, Akp, 0, XACld, size ), &ione,
748 one, XAR, &XARld );
749 }
750 else
751 {
752 if( Anp0 > 0 )
753 cgemv_( TRANS, &Anp0, &Akq, negone,
754 Mptr( Aptr, Akp, 0, Ald, size ), &Ald,
755 Mptr( XAC, Akp, 0, XACld, size ), &ione,
756 one, XAR, &XARld );
757 }
758 }
759 }
760/*
761* Combine the scattered resulting vector XAR
762*/
763 if( XARsum && ( Anq > 0 ) )
764 {
765 Ccgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_],
766 mycol );
767 }
768/*
769* sub( X ) := XAR (if necessary)
770*/
771 if( XARapbX )
772 {
773 PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero,
774 ((char *) X), Xi, Xj, Xd, &Xroc );
775 }
776/*
777* Restore BLACS topologies
778*/
779 (void) PB_Ctop( &ctxt, BCAST, ROW, &btop );
780 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop );
781 }
782 }
783 if( XACfr ) free( XAC );
784 if( XARfr ) free( XAR );
785/*
786* End of PCTRSV
787*/
788}
#define Int
Definition Bconfig.h:22
#define F2C_CHAR(a)
Definition pblas.h:124
#define C2F_CHAR(a)
Definition pblas.h:125
char * F_CHAR_T
Definition pblas.h:122
#define CCOLUMN
Definition PBblacs.h:20
#define TOP_GET
Definition PBblacs.h:50
#define COLUMN
Definition PBblacs.h:45
#define COMBINE
Definition PBblacs.h:49
#define TOP_IRING
Definition PBblacs.h:52
#define CROW
Definition PBblacs.h:21
#define TOP_DRING
Definition PBblacs.h:53
#define TOP_DEFAULT
Definition PBblacs.h:51
#define ROW
Definition PBblacs.h:46
void Cblacs_gridinfo()
#define BCAST
Definition PBblacs.h:48
void Ccgsum2d()
#define cgemv_
Definition PBblas.h:141
#define CNOUNIT
Definition PBblas.h:33
#define NOCONJG
Definition PBblas.h:45
#define CUPPER
Definition PBblas.h:26
#define CNOTRAN
Definition PBblas.h:18
#define CTRAN
Definition PBblas.h:20
#define CCOTRAN
Definition PBblas.h:22
#define INIT
Definition PBblas.h:61
#define CLOWER
Definition PBblas.h:25
#define CUNIT
Definition PBblas.h:32
#define pilaenv_
Definition PBpblas.h:44
#define pctrsv_
Definition PBpblas.h:126
#define CTXT_
Definition PBtools.h:38
#define MB_
Definition PBtools.h:43
void PB_Cabort()
void PB_Cchkvec()
void PB_Cchkmat()
#define MIN(a_, b_)
Definition PBtools.h:76
#define Mptr(a_, i_, j_, lda_, siz_)
Definition PBtools.h:132
void PB_Cwarn()
#define LLD_
Definition PBtools.h:47
Int PB_Cnumroc()
char * PB_Ctop()
void PB_CInOutV2()
#define RSRC_
Definition PBtools.h:45
void PB_COutV()
#define cset_
Definition PBtools.h:665
#define M_
Definition PBtools.h:39
#define INB_
Definition PBtools.h:42
void PB_CargFtoC()
void PB_Cptrsv()
#define CSRC_
Definition PBtools.h:46
PBTYP_T * PB_Cctypeset()
Int PB_Clcm()
#define IMB_
Definition PBtools.h:41
Int PB_Cindxg2p()
#define Mupcase(C)
Definition PBtools.h:83
#define DLEN_
Definition PBtools.h:48
#define NB_
Definition PBtools.h:44
void PB_Cpaxpby()
void PB_Cdescribe()
char type
Definition pblas.h:331
Int size
Definition pblas.h:333
char * zero
Definition pblas.h:335
char * negone
Definition pblas.h:337
char * one
Definition pblas.h:336