pspttrsv.f
1 SUBROUTINE pspttrsv( UPLO, N, NRHS, D, E, JA, DESCA, B, IB, DESCB,
2 $ AF, LAF, WORK, LWORK, INFO )
3*
4* -- ScaLAPACK routine (version 1.7) --
5* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
6* and University of California, Berkeley.
7* April 3, 2000
8*
9* .. Scalar Arguments ..
10 CHARACTER UPLO
11 INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS
12* ..
13* .. Array Arguments ..
14 INTEGER DESCA( * ), DESCB( * )
15 REAL AF( * ), B( * ), D( * ), E( * ), WORK( * )
16* ..
17*
18*
19* Purpose
20* =======
21*
22* PSPTTRSV solves a tridiagonal triangular system of linear equations
23*
24* A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
25* or
26* A(1:N, JA:JA+N-1)^T * X = B(IB:IB+N-1, 1:NRHS)
27*
28* where A(1:N, JA:JA+N-1) is a tridiagonal
29* triangular matrix factor produced by the
30* Cholesky factorization code PSPTTRF
31* and is stored in A(1:N,JA:JA+N-1) and AF.
32* The matrix stored in A(1:N, JA:JA+N-1) is either
33* upper or lower triangular according to UPLO,
34* and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^T
35* is dictated by the user via the parameter UPLO.
36*
37* Routine PSPTTRF MUST be called first.
38*
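38*
* A minimal illustrative call sequence (a sketch only: the BLACS grid,
* the descriptors DESCA and DESCB, the vectors D and E, the right hand
* sides B and the workspace sizes LAF and LWORK are assumed to have
* been set up by the caller as described under Arguments below; INFO
* should be checked after the factorization):
*
*    CALL PSPTTRF( N, D, E, JA, DESCA, AF, LAF, WORK, LWORK, INFO )
*    CALL PSPTTRSV( 'L', N, NRHS, D, E, JA, DESCA, B, IB, DESCB,
*   $               AF, LAF, WORK, LWORK, INFO )
*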
39* =====================================================================
40*
41* Arguments
42* =========
43*
44* UPLO (global input) CHARACTER
45* = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored;
46* = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored.
47*
48* N (global input) INTEGER
49* The number of rows and columns to be operated on, i.e. the
50* order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
51*
52* NRHS (global input) INTEGER
53* The number of right hand sides, i.e., the number of columns
54* of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
55* NRHS >= 0.
56*
57* D (local input/local output) REAL pointer to local
58* part of global vector storing the main diagonal of the
59* matrix.
60* On exit, this array contains the factors of the
61* matrix.
62* Must be of size >= DESCA( NB_ ).
63*
64* E (local input/local output) REAL pointer to local
65* part of global vector storing the upper diagonal of the
66* matrix. Globally, E(N) is not referenced, and E must be
67* aligned with D.
68* On exit, this array contains the factors of the
69* matrix.
70* Must be of size >= DESCA( NB_ ).
71*
72* JA (global input) INTEGER
73* The index in the global array A that points to the start of
74* the matrix to be operated on (which may be either all of A
75* or a submatrix of A).
76*
77* DESCA (global and local input) INTEGER array of dimension DLEN.
78* if 1D type (DTYPE_A=501 or 502), DLEN >= 7;
79* if 2D type (DTYPE_A=1), DLEN >= 9.
80* The array descriptor for the distributed matrix A.
81* Contains information of mapping of A to memory. Please
82* see NOTES below for full description and options.
83*
84* B (local input/local output) REAL pointer into
85* local memory to an array of local lead dimension lld_b>=NB.
86* On entry, this array contains the
87* local pieces of the right hand sides
88* B(IB:IB+N-1, 1:NRHS).
89* On exit, this array contains the local piece of the solution
90* distributed matrix X.
91*
92* IB (global input) INTEGER
93* The row index in the global array B that points to the first
94* row of the matrix to be operated on (which may be either
95* all of B or a submatrix of B).
96*
97* DESCB (global and local input) INTEGER array of dimension DLEN.
98* if 1D type (DTYPE_B=502), DLEN >=7;
99* if 2D type (DTYPE_B=1), DLEN >= 9.
100* The array descriptor for the distributed matrix B.
101* Contains information of mapping of B to memory. Please
102* see NOTES below for full description and options.
103*
104* AF (local input) REAL array, dimension LAF.
105* Auxiliary Fillin Space.
106* Fillin is created during the factorization routine
107* PSPTTRF and this is stored in AF. If a linear system
108* is to be solved using PSPTTRS after the factorization
109* routine, AF *must not be altered* after the factorization.
110*
111* LAF (local input) INTEGER
112* Size of user-input Auxiliary Fillin space AF. Must be >=
113* (NB+2)
114* If LAF is not large enough, an error code will be returned
115* and the minimum acceptable size will be returned in AF( 1 )
116*
117* WORK (local workspace/local output)
118* REAL temporary workspace. This space may
119* be overwritten in between calls to routines. WORK must be
120* of the size given in LWORK.
121* On exit, WORK( 1 ) contains the minimal LWORK.
122*
123* LWORK (local input or global input) INTEGER
124* Size of user-input workspace WORK.
125* If LWORK is too small, the minimal acceptable size will be
126* returned in WORK(1) and an error code is returned. LWORK>=
127* (10+2*min(100,NRHS))*NPCOL+4*NRHS
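*
* As an illustration of the bound above (illustrative numbers only):
* with NPCOL = 2 and NRHS = 1,
* LWORK >= (10+2*min(100,1))*2 + 4*1 = 28.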
128*
129* INFO (local output) INTEGER
130* = 0: successful exit
131* < 0: If the i-th argument is an array and the j-th entry had
132* an illegal value, then INFO = -(i*100+j); if the i-th
133* argument is a scalar and had an illegal value, then
134* INFO = -i.
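* For example, INFO = -(7*100+4) = -704 would indicate an
* illegal value in entry 4 of the 7-th argument, DESCA.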
135*
136* =====================================================================
137*
138*
139* Restrictions
140* ============
141*
142* The following are restrictions on the input parameters. Some of these
143* are temporary and will be removed in future releases, while others
144* may reflect fundamental technical limitations.
145*
146* Non-cyclic restriction: VERY IMPORTANT!
147* P*NB>= mod(JA-1,NB)+N.
148* The mapping for matrices must be blocked, reflecting the nature
149* of the divide and conquer algorithm as a task-parallel algorithm.
150* This formula in words is: no processor may have more than one
151* chunk of the matrix.
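*
*   For example (illustrative numbers): with P = 4 processes,
*   NB = 64 and JA = 1, the restriction reads 4*64 >= 0 + N,
*   i.e. N <= 256, so no process owns more than one chunk.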
152*
153* Blocksize cannot be too small:
154* If the matrix spans more than one processor, the following
155* restriction on NB, the size of each block on each processor,
156* must hold:
157* NB >= 2
158* The bulk of parallel computation is done on the matrix of size
159* O(NB) on each processor. If this is too small, divide and conquer
160* is a poor choice of algorithm.
161*
162* Submatrix reference:
163* JA = IB
164* Alignment restriction that prevents unnecessary communication.
165*
166*
167* =====================================================================
168*
169*
170* Notes
171* =====
172*
173* If the factorization routine and the solve routine are to be called
174* separately (to solve various sets of righthand sides using the same
175* coefficient matrix), the auxiliary space AF *must not be altered*
176* between calls to the factorization routine and the solve routine.
177*
178* The best algorithm for solving banded and tridiagonal linear systems
179* depends on a variety of parameters, especially the bandwidth.
180* Currently, only algorithms designed for the case N/P >> bw are
181* implemented. These go by many names, including Divide and Conquer,
182* Partitioning, domain decomposition-type, etc.
183* For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C
184* algorithms are the appropriate choice.
185*
186* Algorithm description: Divide and Conquer
187*
188* The Divide and Conquer algorithm assumes the matrix is narrowly
189* banded compared with the number of equations. In this situation,
190* it is best to distribute the input matrix A one-dimensionally,
191* with columns atomic and rows divided amongst the processes.
192* The basic algorithm divides the tridiagonal matrix up into
193* P pieces with one stored on each processor,
194* and then proceeds in 2 phases for the factorization or 3 for the
195* solution of a linear system.
196* 1) Local Phase:
197* The individual pieces are factored independently and in
198* parallel. These factors are applied to the matrix creating
199* fillin, which is stored in a non-inspectable way in auxiliary
200* space AF. Mathematically, this is equivalent to reordering
201* the matrix A as P A P^T and then factoring the principal
202* leading submatrix of size equal to the sum of the sizes of
203* the matrices factored on each processor. The factors of
204* these submatrices overwrite the corresponding parts of A
205* in memory.
206* 2) Reduced System Phase:
207* A small system of order (P-1) is formed, representing
208* interaction of the larger blocks, and is stored (as are its
209* factors) in the space AF. A parallel Block Cyclic Reduction
210* algorithm is used. For a linear system, a parallel front solve
211* followed by an analogous backsolve, both using the structure
212* of the factored matrix, are performed.
213* 3) Backsubstitution Phase:
214* For a linear system, a local backsubstitution is performed on
215* each processor in parallel.
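*
* In this solve routine the phases above map onto the executable code
* below as follows: the UPLO = 'L' branch performs the local
* frontsolve (Phase 1) and the forward part of the reduced-system
* solve (Phase 2), while the UPLO = 'U' branch performs the backward
* part of the reduced-system solve and the local backsubstitution
* (Phase 3).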
216*
217*
218* Descriptors
219* ===========
220*
221* Descriptors now have *types* and differ from ScaLAPACK 1.0.
222*
223* Note: tridiagonal codes can use either the old two dimensional
224* or new one-dimensional descriptors, though the processor grid in
225* both cases *must be one-dimensional*. We describe both types below.
226*
227* Each global data object is described by an associated description
228* vector. This vector stores the information required to establish
229* the mapping between an object element and its corresponding process
230* and memory location.
231*
232* Let A be a generic term for any 2D block cyclically distributed array.
233* Such a global array has an associated description vector DESCA.
234* In the following comments, the character _ should be read as
235* "of the global array".
236*
237* NOTATION STORED IN EXPLANATION
238* --------------- -------------- --------------------------------------
239* DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case,
240* DTYPE_A = 1.
241* CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
242* the BLACS process grid A is distribu-
243* ted over. The context itself is glo-
244* bal, but the handle (the integer
245* value) may vary.
246* M_A (global) DESCA( M_ ) The number of rows in the global
247* array A.
248* N_A (global) DESCA( N_ ) The number of columns in the global
249* array A.
250* MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
251* the rows of the array.
252* NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
253* the columns of the array.
254* RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
255* row of the array A is distributed.
256* CSRC_A (global) DESCA( CSRC_ ) The process column over which the
257* first column of the array A is
258* distributed.
259* LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
260* array. LLD_A >= MAX(1,LOCr(M_A)).
261*
262* Let K be the number of rows or columns of a distributed matrix,
263* and assume that its process grid has dimension p x q.
264* LOCr( K ) denotes the number of elements of K that a process
265* would receive if K were distributed over the p processes of its
266* process column.
267* Similarly, LOCc( K ) denotes the number of elements of K that a
268* process would receive if K were distributed over the q processes of
269* its process row.
270* The values of LOCr() and LOCc() may be determined via a call to the
271* ScaLAPACK tool function, NUMROC:
272* LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
273* LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
274* An upper bound for these quantities may be computed by:
275* LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
276* LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
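*
* For example, this routine itself uses NUMROC in the executable part
* below to find the number of columns it stores locally (PART_SIZE is
* the block size NB, and the source process is 0 in the internally
* re-formed grid):
*
*    MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL )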
277*
278*
279* One-dimensional descriptors:
280*
281* One-dimensional descriptors are a new addition to ScaLAPACK since
282* version 1.0. They simplify and shorten the descriptor for 1D
283* arrays.
284*
285* Since ScaLAPACK supports two-dimensional arrays as the fundamental
286* object, we allow 1D arrays to be distributed either over the
287* first dimension of the array (as if the grid were P-by-1) or the
288* 2nd dimension (as if the grid were 1-by-P). This choice is
289* indicated by the descriptor type (501 or 502)
290* as described below.
291* However, for tridiagonal matrices, since the objects being
292* distributed are the individual vectors storing the diagonals, we
293* have adopted the convention that both the P-by-1 descriptor and
294* the 1-by-P descriptor are allowed and are equivalent for
295* tridiagonal matrices. Thus, for tridiagonal matrices,
296* DTYPE_A = 501 or 502 can be used interchangeably
297* without any other change.
298* We require that the distributed vectors storing the diagonals of a
299* tridiagonal matrix be aligned with each other. Because of this, a
300* single descriptor, DESCA, serves to describe the distribution of
301* all diagonals simultaneously.
302*
303* IMPORTANT NOTE: the actual BLACS grid represented by the
304* CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
305* irrespective of which one-dimensional descriptor type
306* (501 or 502) is input.
307* This routine will interpret the grid properly either way.
308* ScaLAPACK routines *do not support intercontext operations* so that
309* the grid passed to a single ScaLAPACK routine *must be the same*
310* for all array descriptors passed to that routine.
311*
312* NOTE: In all cases where 1D descriptors are used, 2D descriptors
313* may also be used, since a one-dimensional array is a special case
314* of a two-dimensional array with one dimension of size unity.
315* The two-dimensional array used in this case *must* be of the
316* proper orientation:
317* If the appropriate one-dimensional descriptor is DTYPE_A=501
318* (1 by P type), then the two dimensional descriptor must
319* have a CTXT value that refers to a 1 by P BLACS grid;
320* If the appropriate one-dimensional descriptor is DTYPE_A=502
321* (P by 1 type), then the two dimensional descriptor must
322* have a CTXT value that refers to a P by 1 BLACS grid.
323*
324*
325* Summary of allowed descriptors, types, and BLACS grids:
326* DTYPE 501 502 1 1
327* BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
328* -----------------------------------------------------
329* A OK OK OK NO
330* B NO OK NO OK
331*
332* Note that a consequence of this chart is that it is not possible
333* for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
334* to opposite requirements for the orientation of the BLACS grid,
335* and as noted before, the *same* BLACS context must be used in
336* all descriptors in a single ScaLAPACK subroutine call.
337*
338* Let A be a generic term for any 1D block cyclically distributed array.
339* Such a global array has an associated description vector DESCA.
340* In the following comments, the character _ should be read as
341* "of the global array".
342*
343* NOTATION STORED IN EXPLANATION
344* --------------- ---------- ------------------------------------------
345* DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
346* TYPE_A = 501: 1-by-P grid.
347* TYPE_A = 502: P-by-1 grid.
348* CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
349* the BLACS process grid A is distribu-
350* ted over. The context itself is glo-
351* bal, but the handle (the integer
352* value) may vary.
353* N_A (global) DESCA( 3 ) The size of the array dimension being
354* distributed.
355* NB_A (global) DESCA( 4 ) The blocking factor used to distribute
356* the distributed dimension of the array.
357* SRC_A (global) DESCA( 5 ) The process row or column over which the
358* first row or column of the array
359* is distributed.
360* Ignored DESCA( 6 ) Ignored for tridiagonal matrices.
361* Reserved DESCA( 7 ) Reserved for future use.
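*
* For example, a type-502 (P-by-1) one-dimensional descriptor for the
* distributed vectors D and E of this routine could be filled in as
* follows (a sketch; ICTXT, N and NB are assumed to be defined by the
* caller, and the first entries are owned by process 0; 501 and 502
* are interchangeable for tridiagonal matrices, as noted above):
*
*    DESCA( 1 ) = 502
*    DESCA( 2 ) = ICTXT
*    DESCA( 3 ) = N
*    DESCA( 4 ) = NB
*    DESCA( 5 ) = 0
*    DESCA( 6 ) = 0
*    DESCA( 7 ) = 0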
362*
363*
364*
365* =====================================================================
366*
367* Code Developer: Andrew J. Cleary, University of Tennessee.
368* Current address: Lawrence Livermore National Labs.
369*
370* =====================================================================
371*
372* .. Parameters ..
373 REAL ONE
374 parameter( one = 1.0e+0 )
375 REAL ZERO
376 parameter( zero = 0.0e+0 )
377 INTEGER INT_ONE
378 parameter( int_one = 1 )
379 INTEGER DESCMULT, BIGNUM
380 parameter( descmult = 100, bignum = descmult*descmult )
381 INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
382 $ lld_, mb_, m_, nb_, n_, rsrc_
383 parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
384 $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
385 $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
386* ..
387* .. Local Scalars ..
388 INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE,
389 $ idum1, idum3, ja_new, level_dist, llda, lldb,
390 $ mycol, myrow, my_num_cols, nb, np, npcol,
391 $ nprow, np_save, odd_size, part_offset,
392 $ part_size, return_code, store_m_b, store_n_a,
393 $ temp, work_size_min
394* ..
395* .. Local Arrays ..
396 INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ),
397 $ param_check( 15, 3 )
398* ..
399* .. External Subroutines ..
400 EXTERNAL blacs_gridexit, blacs_gridinfo, desc_convert,
401 $ globchk, pxerbla, reshape, saxpy, sgemm,
402 $ sgerv2d, sgesd2d, smatadd, spttrsv, strtrs
403* ..
404* .. External Functions ..
405 LOGICAL LSAME
406 INTEGER NUMROC
407 EXTERNAL lsame, numroc
408* ..
409* .. Intrinsic Functions ..
410 INTRINSIC ichar, mod
411* ..
412* .. Executable Statements ..
413*
414* Test the input parameters
415*
416 info = 0
417*
418* Convert descriptor into standard form for easy access to
419* parameters, check that grid is of right shape.
420*
421 desca_1xp( 1 ) = 501
422 descb_px1( 1 ) = 502
423*
424 temp = desca( dtype_ )
425 IF( temp.EQ.502 ) THEN
426* Temporarily set the descriptor type to 1xP type
427 desca( dtype_ ) = 501
428 END IF
429*
430 CALL desc_convert( desca, desca_1xp, return_code )
431*
432 desca( dtype_ ) = temp
433*
434 IF( return_code.NE.0 ) THEN
435 info = -( 7*100+2 )
436 END IF
437*
438 CALL desc_convert( descb, descb_px1, return_code )
439*
440 IF( return_code.NE.0 ) THEN
441 info = -( 10*100+2 )
442 END IF
443*
444* Consistency checks for DESCA and DESCB.
445*
446* Context must be the same
447 IF( desca_1xp( 2 ).NE.descb_px1( 2 ) ) THEN
448 info = -( 10*100+2 )
449 END IF
450*
451* These are alignment restrictions that may or may not be removed
452* in future releases. -Andy Cleary, April 14, 1996.
453*
454* Block sizes must be the same
455 IF( desca_1xp( 4 ).NE.descb_px1( 4 ) ) THEN
456 info = -( 10*100+4 )
457 END IF
458*
459* Source processor must be the same
460*
461 IF( desca_1xp( 5 ).NE.descb_px1( 5 ) ) THEN
462 info = -( 10*100+5 )
463 END IF
464*
465* Get values out of descriptor for use in code.
466*
467 ictxt = desca_1xp( 2 )
468 csrc = desca_1xp( 5 )
469 nb = desca_1xp( 4 )
470 llda = desca_1xp( 6 )
471 store_n_a = desca_1xp( 3 )
472 lldb = descb_px1( 6 )
473 store_m_b = descb_px1( 3 )
474*
475* Get grid parameters
476*
477*
478 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
479 np = nprow*npcol
480*
481*
482*
483 IF( lsame( uplo, 'U' ) ) THEN
484 idum1 = ichar( 'U' )
485 ELSE IF( lsame( uplo, 'L' ) ) THEN
486 idum1 = ichar( 'L' )
487 ELSE
488 info = -1
489 END IF
490*
491 IF( lwork.LT.-1 ) THEN
492 info = -14
493 ELSE IF( lwork.EQ.-1 ) THEN
494 idum3 = -1
495 ELSE
496 idum3 = 1
497 END IF
498*
499 IF( n.LT.0 ) THEN
500 info = -2
501 END IF
502*
503 IF( n+ja-1.GT.store_n_a ) THEN
504 info = -( 7*100+6 )
505 END IF
506*
507 IF( n+ib-1.GT.store_m_b ) THEN
508 info = -( 10*100+3 )
509 END IF
510*
511 IF( lldb.LT.nb ) THEN
512 info = -( 10*100+6 )
513 END IF
514*
515 IF( nrhs.LT.0 ) THEN
516 info = -3
517 END IF
518*
519* Current alignment restriction
520*
521 IF( ja.NE.ib ) THEN
522 info = -6
523 END IF
524*
525* Argument checking that is specific to Divide & Conquer routine
526*
527 IF( nprow.NE.1 ) THEN
528 info = -( 7*100+2 )
529 END IF
530*
531 IF( n.GT.np*nb-mod( ja-1, nb ) ) THEN
532 info = -( 2 )
533 CALL pxerbla( ictxt,
534 $ 'PSPTTRSV, D&C alg.: only 1 block per proc',
535 $ -info )
536 RETURN
537 END IF
538*
539 IF( ( ja+n-1.GT.nb ) .AND. ( nb.LT.2*int_one ) ) THEN
540 info = -( 7*100+4 )
541 CALL pxerbla( ictxt, 'PSPTTRSV, D&C alg.: NB too small',
542 $ -info )
543 RETURN
544 END IF
545*
546*
547 work_size_min = int_one*nrhs
548*
549 work( 1 ) = work_size_min
550*
551 IF( lwork.LT.work_size_min ) THEN
552 IF( lwork.NE.-1 ) THEN
553 info = -14
554 CALL pxerbla( ictxt, 'PSPTTRSV: worksize error', -info )
555 END IF
556 RETURN
557 END IF
558*
559* Pack params and positions into arrays for global consistency check
560*
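* (The position codes stored in column 2 below follow the
* INFO = -(i*100+j) convention: e.g. 1005 refers to entry 5 of
* argument 10, DESCB, and 9 refers to the scalar argument IB.)
*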
561 param_check( 15, 1 ) = descb( 5 )
562 param_check( 14, 1 ) = descb( 4 )
563 param_check( 13, 1 ) = descb( 3 )
564 param_check( 12, 1 ) = descb( 2 )
565 param_check( 11, 1 ) = descb( 1 )
566 param_check( 10, 1 ) = ib
567 param_check( 9, 1 ) = desca( 5 )
568 param_check( 8, 1 ) = desca( 4 )
569 param_check( 7, 1 ) = desca( 3 )
570 param_check( 6, 1 ) = desca( 1 )
571 param_check( 5, 1 ) = ja
572 param_check( 4, 1 ) = nrhs
573 param_check( 3, 1 ) = n
574 param_check( 2, 1 ) = idum3
575 param_check( 1, 1 ) = idum1
576*
577 param_check( 15, 2 ) = 1005
578 param_check( 14, 2 ) = 1004
579 param_check( 13, 2 ) = 1003
580 param_check( 12, 2 ) = 1002
581 param_check( 11, 2 ) = 1001
582 param_check( 10, 2 ) = 9
583 param_check( 9, 2 ) = 705
584 param_check( 8, 2 ) = 704
585 param_check( 7, 2 ) = 703
586 param_check( 6, 2 ) = 701
587 param_check( 5, 2 ) = 6
588 param_check( 4, 2 ) = 3
589 param_check( 3, 2 ) = 2
590 param_check( 2, 2 ) = 14
591 param_check( 1, 2 ) = 1
592*
593* Want to find errors with MIN( ), so if no error, set it to a big
594* number. If there already is an error, multiply by the
595* descriptor multiplier.
596*
597 IF( info.GE.0 ) THEN
598 info = bignum
599 ELSE IF( info.LT.-descmult ) THEN
600 info = -info
601 ELSE
602 info = -info*descmult
603 END IF
604*
605* Check consistency across processors
606*
607 CALL globchk( ictxt, 15, param_check, 15, param_check( 1, 3 ),
608 $ info )
609*
610* Prepare output: set info = 0 if no error, and divide by DESCMULT
611* if error is not in a descriptor entry.
612*
613 IF( info.EQ.bignum ) THEN
614 info = 0
615 ELSE IF( mod( info, descmult ).EQ.0 ) THEN
616 info = -info / descmult
617 ELSE
618 info = -info
619 END IF
620*
621 IF( info.LT.0 ) THEN
622 CALL pxerbla( ictxt, 'PSPTTRSV', -info )
623 RETURN
624 END IF
625*
626* Quick return if possible
627*
628 IF( n.EQ.0 )
629 $ RETURN
630*
631 IF( nrhs.EQ.0 )
632 $ RETURN
633*
634*
635* Adjust addressing into matrix space to properly get into
636* the beginning part of the relevant data
637*
638 part_offset = nb*( ( ja-1 ) / ( npcol*nb ) )
639*
640 IF( ( mycol-csrc ).LT.( ja-part_offset-1 ) / nb ) THEN
641 part_offset = part_offset + nb
642 END IF
643*
644 IF( mycol.LT.csrc ) THEN
645 part_offset = part_offset - nb
646 END IF
647*
648* Form a new BLACS grid (the "standard form" grid) with only procs
649* holding part of the matrix, of size 1xNP where NP is adjusted,
650* starting at csrc=0, with JA modified to reflect dropped procs.
651*
652* First processor to hold part of the matrix:
653*
654 first_proc = mod( ( ja-1 ) / nb+csrc, npcol )
655*
656* Calculate the new JA while dropping off unused processors.
657*
658 ja_new = mod( ja-1, nb ) + 1
659*
660* Save and compute new value of NP
661*
662 np_save = np
663 np = ( ja_new+n-2 ) / nb + 1
664*
665* Call utility routine that forms "standard-form" grid
666*
667 CALL reshape( ictxt, int_one, ictxt_new, int_one, first_proc,
668 $ int_one, np )
669*
670* Use new context from standard grid as context.
671*
672 ictxt_save = ictxt
673 ictxt = ictxt_new
674 desca_1xp( 2 ) = ictxt_new
675 descb_px1( 2 ) = ictxt_new
676*
677* Get information about new grid.
678*
679 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
680*
681* Drop out processors that do not have part of the matrix.
682*
683 IF( myrow.LT.0 ) THEN
684 GO TO 100
685 END IF
686*
687* ********************************
688* Values reused throughout routine
689*
690* User-input value of partition size
691*
692 part_size = nb
693*
694* Number of columns in each processor
695*
696 my_num_cols = numroc( n, part_size, mycol, 0, npcol )
697*
698* Offset in columns to beginning of main partition in each proc
699*
700 IF( mycol.EQ.0 ) THEN
701 part_offset = part_offset + mod( ja_new-1, part_size )
702 my_num_cols = my_num_cols - mod( ja_new-1, part_size )
703 END IF
704*
705* Size of main (or odd) partition in each processor
706*
707 odd_size = my_num_cols
708 IF( mycol.LT.np-1 ) THEN
709 odd_size = odd_size - int_one
710 END IF
711*
712*
713*
714* Begin main code
715*
716 IF( lsame( uplo, 'L' ) ) THEN
717*
718*
719* Frontsolve
720*
721*
722******************************************
723* Local computation phase
724******************************************
725*
726* Use main partition in each processor to solve locally
727*
728 CALL spttrsv( 'N', odd_size, nrhs, d( part_offset+1 ),
729 $ e( part_offset+1 ), b( part_offset+1 ), lldb,
730 $ info )
731*
732*
733 IF( mycol.LT.np-1 ) THEN
734* Use factorization of odd-even connection block to modify
735* locally stored portion of right hand side(s)
736*
737 CALL saxpy( nrhs, -e( part_offset+odd_size ),
738 $ b( part_offset+odd_size ), lldb,
739 $ b( part_offset+odd_size+1 ), lldb )
740*
741 END IF
742*
743*
744 IF( mycol.NE.0 ) THEN
745* Use the "spike" fillin to calculate contribution to previous
746* processor's righthand-side.
747*
748 CALL sgemm( 'T', 'N', 1, nrhs, odd_size, -one, af( 1 ),
749 $ odd_size, b( part_offset+1 ), lldb, zero,
750 $ work( 1+int_one-1 ), int_one )
751 END IF
752*
753*
754************************************************
755* Formation and solution of reduced system
756************************************************
757*
758*
759* Send modifications to prior processor's right hand sides
760*
761 IF( mycol.GT.0 ) THEN
762*
763 CALL sgesd2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
764 $ mycol-1 )
765*
766 END IF
767*
768* Receive modifications to processor's right hand sides
769*
770 IF( mycol.LT.npcol-1 ) THEN
771*
772 CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
773 $ mycol+1 )
774*
775* Combine contribution to locally stored right hand sides
776*
777 CALL smatadd( int_one, nrhs, one, work( 1 ), int_one, one,
778 $ b( part_offset+odd_size+1 ), lldb )
779*
780 END IF
781*
782*
783* The last processor does not participate in the solution of the
784* reduced system, having sent its contribution already.
785 IF( mycol.EQ.npcol-1 ) THEN
786 GO TO 30
787 END IF
788*
789*
790* *************************************
791* Modification Loop
792*
793* The distance for sending and receiving for each level starts
794* at 1 for the first level.
795 level_dist = 1
796*
797* Do until this proc is needed to modify other procs' equations
798*
799 10 CONTINUE
800 IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
801 $ GO TO 20
802*
803* Receive and add contribution to righthand sides from left
804*
805 IF( mycol-level_dist.GE.0 ) THEN
806*
807 CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
808 $ mycol-level_dist )
809*
810 CALL smatadd( int_one, nrhs, one, work( 1 ), int_one, one,
811 $ b( part_offset+odd_size+1 ), lldb )
812*
813 END IF
814*
815* Receive and add contribution to righthand sides from right
816*
817 IF( mycol+level_dist.LT.npcol-1 ) THEN
818*
819 CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
820 $ mycol+level_dist )
821*
822 CALL smatadd( int_one, nrhs, one, work( 1 ), int_one, one,
823 $ b( part_offset+odd_size+1 ), lldb )
824*
825 END IF
826*
827 level_dist = level_dist*2
828*
829 GO TO 10
830 20 CONTINUE
831* [End of GOTO Loop]
832*
833*
834*
835* *********************************
836* Calculate and use this proc's blocks to modify other procs
837*
838* Solve with diagonal block
839*
840 CALL strtrs( 'L', 'N', 'U', int_one, nrhs, af( odd_size+2 ),
841 $ int_one, b( part_offset+odd_size+1 ), lldb, info )
842*
843 IF( info.NE.0 ) THEN
844 GO TO 90
845 END IF
846*
847*
848*
849* *********
850 IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
851*
852* Calculate contribution from this block to next diagonal block
853*
854 CALL sgemm( 'T', 'N', int_one, nrhs, int_one, -one,
855 $ af( ( odd_size )*1+1 ), int_one,
856 $ b( part_offset+odd_size+1 ), lldb, zero,
857 $ work( 1 ), int_one )
858*
859* Send contribution to diagonal block's owning processor.
860*
861 CALL sgesd2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
862 $ mycol+level_dist )
863*
864 END IF
865* End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
866*
867* ************
868 IF( ( mycol / level_dist.GT.0 ) .AND.
869 $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) ) THEN
870*
871*
872* Use offdiagonal block to calculate modification to diag block
873* of processor to the left
874*
875 CALL sgemm( 'N', 'N', int_one, nrhs, int_one, -one,
876 $ af( odd_size*1+2+1 ), int_one,
877 $ b( part_offset+odd_size+1 ), lldb, zero,
878 $ work( 1 ), int_one )
879*
880* Send contribution to diagonal block's owning processor.
881*
882 CALL sgesd2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
883 $ mycol-level_dist )
884*
885 END IF
886* End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
887*
888 30 CONTINUE
889*
890 ELSE
891*
892******************** BACKSOLVE *************************************
893*
894********************************************************************
895* .. Begin reduced system phase of algorithm ..
896********************************************************************
897*
898*
899*
900* The last processor does not participate in the solution of the
901* reduced system and just waits to receive its solution.
902 IF( mycol.EQ.npcol-1 ) THEN
903 GO TO 80
904 END IF
905*
906* Determine number of steps in tree loop
907*
908 level_dist = 1
909 40 CONTINUE
910 IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
911 $ GO TO 50
912*
913 level_dist = level_dist*2
914*
915 GO TO 40
916 50 CONTINUE
917*
918*
919 IF( ( mycol / level_dist.GT.0 ) .AND.
920 $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) ) THEN
921*
922* Receive solution from processor to left
923*
924 CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
925 $ mycol-level_dist )
926*
927*
928* Use offdiagonal block to calculate modification to RHS stored
929* on this processor
930*
931 CALL sgemm( 'T', 'N', int_one, nrhs, int_one, -one,
932 $ af( odd_size*1+2+1 ), int_one, work( 1 ),
933 $ int_one, one, b( part_offset+odd_size+1 ),
934 $ lldb )
935 END IF
936* End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
937*
938*
939 IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
940*
941* Receive solution from processor to right
942*
943 CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
944 $ mycol+level_dist )
945*
946* Calculate contribution from this block to next diagonal block
947*
948 CALL sgemm( 'N', 'N', int_one, nrhs, int_one, -one,
949 $ af( ( odd_size )*1+1 ), int_one, work( 1 ),
950 $ int_one, one, b( part_offset+odd_size+1 ),
951 $ lldb )
952*
953 END IF
954* End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
955*
956*
957* Solve with diagonal block
958*
959 CALL strtrs( 'L', 'T', 'U', int_one, nrhs, af( odd_size+2 ),
960 $ int_one, b( part_offset+odd_size+1 ), lldb, info )
961*
962 IF( info.NE.0 ) THEN
963 GO TO 90
964 END IF
965*
966*
967*
968***Modification Loop *******
969*
970 60 CONTINUE
971 IF( level_dist.EQ.1 )
972 $ GO TO 70
973*
974 level_dist = level_dist / 2
975*
976* Send solution to the right
977*
978 IF( mycol+level_dist.LT.npcol-1 ) THEN
979*
980 CALL sgesd2d( ictxt, int_one, nrhs,
981 $ b( part_offset+odd_size+1 ), lldb, 0,
982 $ mycol+level_dist )
983*
984 END IF
985*
986* Send solution to left
987*
988 IF( mycol-level_dist.GE.0 ) THEN
989*
990 CALL sgesd2d( ictxt, int_one, nrhs,
991 $ b( part_offset+odd_size+1 ), lldb, 0,
992 $ mycol-level_dist )
993*
994 END IF
995*
996 GO TO 60
997 70 CONTINUE
998* [End of GOTO Loop]
999*
1000 80 CONTINUE
1001* [Processor npcol - 1 jumped to here to await next stage]
1002*
1003*******************************
1004* Reduced system has been solved, communicate solutions to nearest
1005* neighbors in preparation for local computation phase.
1006*
1007*
1008* Send elements of solution to next proc
1009*
1010 IF( mycol.LT.npcol-1 ) THEN
1011*
1012 CALL sgesd2d( ictxt, int_one, nrhs,
1013 $ b( part_offset+odd_size+1 ), lldb, 0,
1014 $ mycol+1 )
1015*
1016 END IF
1017*
1018* Receive modifications to processor's right hand sides
1019*
1020 IF( mycol.GT.0 ) THEN
1021*
1022 CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
1023 $ mycol-1 )
1024*
1025 END IF
1026*
1027*
1028*
1029**********************************************
1030* Local computation phase
1031**********************************************
1032*
1033 IF( mycol.NE.0 ) THEN
1034* Use the "spike" fillin to calculate contribution from previous
1035* processor's solution.
1036*
1037 CALL sgemm( 'N', 'N', odd_size, nrhs, 1, -one, af( 1 ),
1038 $ odd_size, work( 1+int_one-1 ), int_one, one,
1039 $ b( part_offset+1 ), lldb )
1040*
1041 END IF
1042*
1043*
1044 IF( mycol.LT.np-1 ) THEN
1045* Use factorization of odd-even connection block to modify
1046* locally stored portion of right hand side(s)
1047*
1048 CALL saxpy( nrhs, -( e( part_offset+odd_size ) ),
1049 $ b( part_offset+odd_size+1 ), lldb,
1050 $ b( part_offset+odd_size ), lldb )
1051*
1052 END IF
1053*
1054* Use main partition in each processor to solve locally
1055*
1056 CALL spttrsv( 'T', odd_size, nrhs, d( part_offset+1 ),
1057 $ e( part_offset+1 ), b( part_offset+1 ), lldb,
1058 $ info )
1059*
1060*
1061 END IF
1062* End of "IF( LSAME( UPLO, 'L' ) )"...
1063 90 CONTINUE
1064*
1065*
1066* Free BLACS space used to hold standard-form grid.
1067*
1068 IF( ictxt_save.NE.ictxt_new ) THEN
1069 CALL blacs_gridexit( ictxt_new )
1070 END IF
1071*
1072 100 CONTINUE
1073*
1074* Restore saved input parameters
1075*
1076 ictxt = ictxt_save
1077 np = np_save
1078*
1079* Output minimum worksize
1080*
1081 work( 1 ) = work_size_min
1082*
1083*
1084 RETURN
1085*
1086* End of PSPTTRSV
1087*
1088 END