psdtsv.f
1 SUBROUTINE psdtsv( N, NRHS, DL, D, DU, JA, DESCA, B, IB, DESCB,
2 $ WORK, LWORK, INFO )
3*
4*
5*
6* -- ScaLAPACK routine (version 1.7) --
7* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
8* and University of California, Berkeley.
9* November 15, 1997
10*
11* .. Scalar Arguments ..
12 INTEGER IB, INFO, JA, LWORK, N, NRHS
13* ..
14* .. Array Arguments ..
15 INTEGER DESCA( * ), DESCB( * )
16 REAL B( * ), D( * ), DL( * ), DU( * ), WORK( * )
17* ..
18*
19*
20* Purpose
21* =======
22*
23* PSDTSV solves a system of linear equations
24*
25* A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
26*
27* where A(1:N, JA:JA+N-1) is an N-by-N real
28* tridiagonal diagonally dominant-like distributed
29* matrix.
30*
31* Gaussian elimination without pivoting
32* is used to factor a reordering
33* of the matrix into L U.
34*
35* See PSDTTRF and PSDTTRS for details.
36*
37* =====================================================================
38*
39* Arguments
40* =========
41*
42*
43* N (global input) INTEGER
44* The number of rows and columns to be operated on, i.e. the
45* order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
46*
47* NRHS (global input) INTEGER
48* The number of right hand sides, i.e., the number of columns
49* of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
50* NRHS >= 0.
51*
52* DL (local input/local output) REAL pointer to local
53* part of global vector storing the lower diagonal of the
54* matrix. Globally, DL(1) is not referenced, and DL must be
55* aligned with D.
56* Must be of size >= DESCA( NB_ ).
57* On exit, this array contains information on the
58* factors of the matrix.
59*
60* D (local input/local output) REAL pointer to local
61* part of global vector storing the main diagonal of the
62* matrix.
63* On exit, this array contains information on the
64* factors of the matrix.
65* Must be of size >= DESCA( NB_ ).
66*
67* DU (local input/local output) REAL pointer to local
68* part of global vector storing the upper diagonal of the
69* matrix. Globally, DU(n) is not referenced, and DU must be
70* aligned with D.
71* On exit, this array contains information on the
72* factors of the matrix.
73* Must be of size >= DESCA( NB_ ).
74*
75* JA (global input) INTEGER
76* The index in the global array A that points to the start of
77* the matrix to be operated on (which may be either all of A
78* or a submatrix of A).
79*
80* DESCA (global and local input) INTEGER array of dimension DLEN.
81* if 1D type (DTYPE_A=501 or 502), DLEN >= 7;
82* if 2D type (DTYPE_A=1), DLEN >= 9.
83* The array descriptor for the distributed matrix A.
84* Contains information of mapping of A to memory. Please
85* see NOTES below for full description and options.
86*
87* B (local input/local output) REAL pointer into
88* local memory to an array of local lead dimension lld_b>=NB.
89* On entry, this array contains the
90* local pieces of the right hand sides
91* B(IB:IB+N-1, 1:NRHS).
92* On exit, this array contains the local piece of the
93* distributed solution matrix X.
94*
95* IB (global input) INTEGER
96* The row index in the global array B that points to the first
97* row of the matrix to be operated on (which may be either
98* all of B or a submatrix of B).
99*
100* DESCB (global and local input) INTEGER array of dimension DLEN.
101* if 1D type (DTYPE_B=502), DLEN >=7;
102* if 2D type (DTYPE_B=1), DLEN >= 9.
103* The array descriptor for the distributed matrix B.
104* Contains information of mapping of B to memory. Please
105* see NOTES below for full description and options.
106*
107* WORK (local workspace/local output)
108* REAL temporary workspace. This space may
109* be overwritten in between calls to routines. WORK must be
110* the size given in LWORK.
111* On exit, WORK( 1 ) contains the minimal LWORK.
112*
113* LWORK (local input or global input) INTEGER
114* Size of user-input workspace WORK.
115* If LWORK is too small, the minimal acceptable size is
116* returned in WORK(1) and an error code is returned. LWORK >=
117* (12*NPCOL+3*NB)
118* +max(10*NPCOL+4*NRHS, 8*NPCOL)
* A worked example of this formula is given at the end of this
* Arguments section.
119*
120* INFO (local output) INTEGER
121* = 0: successful exit
122* < 0: If the i-th argument is an array and the j-th entry had
123* an illegal value, then INFO = -(i*100+j); if the i-th
124* argument is a scalar and had an illegal value, then
125* INFO = -i.
126* > 0: If INFO = K<=NPROCS, the submatrix stored on processor
127* INFO and factored locally was not
128* diagonally dominant-like, and
129* the factorization was not completed.
130* If INFO = K>NPROCS, the submatrix stored on processor
131* INFO-NPROCS representing interactions with other
132* processors was not
133* stably factorable without interchanges,
134* and the factorization was not completed.
135*
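*  As an illustration of the LWORK formula above (a hypothetical
*  sketch only; NPCOL = 4, NB = 64 and NRHS = 1 are assumed values,
*  not defaults), the minimal workspace on each process would be
*
*        LWMIN = ( 12*NPCOL + 3*NB ) +
*       $        MAX( 10*NPCOL + 4*NRHS, 8*NPCOL )
*              = ( 48 + 192 ) + MAX( 44, 32 ) = 284
*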
136* =====================================================================
137*
138*
139* Restrictions
140* ============
141*
142* The following are restrictions on the input parameters. Some of these
143* are temporary and will be removed in future releases, while others
144* may reflect fundamental technical limitations.
145*
146* Non-cyclic restriction: VERY IMPORTANT!
147* P*NB>= mod(JA-1,NB)+N.
148* The mapping for matrices must be blocked, reflecting the nature
149* of the divide and conquer algorithm as a task-parallel algorithm.
150* In words: no processor may own more than one chunk of the
151* matrix. A check of this condition is sketched at the end of
* this Restrictions section.
152*
153* Blocksize cannot be too small:
154* If the matrix spans more than one processor, the following
155* restriction on NB, the size of each block on each processor,
156* must hold:
157* NB >= 2
158* The bulk of parallel computation is done on the matrix of size
159* O(NB) on each processor. If this is too small, divide and conquer
160* is a poor choice of algorithm.
161*
162* Submatrix reference:
163* JA = IB
164* Alignment restriction that prevents unnecessary communication.
165*
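*  A minimal sketch of how the non-cyclic restriction could be checked
*  by the caller before the call (illustrative only; P is taken as
*  NPROW*NPCOL because the process grid must be one-dimensional):
*
*     CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
*     IF( NPROW*NPCOL*NB .LT. MOD( JA-1, NB ) + N ) THEN
*        (the distribution violates P*NB >= mod(JA-1,NB)+N; some
*         process would own more than one chunk of the matrix)
*     END IF
*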
166*
167* =====================================================================
168*
169*
170* Notes
171* =====
172*
173* If the factorization routine and the solve routine are to be called
174* separately (to solve various sets of right-hand sides using the same
175* coefficient matrix), the auxiliary space AF *must not be altered*
176* between calls to the factorization routine and the solve routine.
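*
*  A minimal sketch of that factor-once / solve-many pattern
*  (illustrative only; NSETS and the loading of B are hypothetical,
*  and AF/LAF, WORK/LWORK are assumed to meet the requirements
*  documented in PSDTTRF and PSDTTRS):
*
*     CALL PSDTTRF( N, DL, D, DU, JA, DESCA, AF, LAF,
*    $              WORK, LWORK, INFO )
*     DO 10 K = 1, NSETS
*        (load the K-th group of right hand sides into B)
*        CALL PSDTTRS( 'N', N, NRHS, DL, D, DU, JA, DESCA, B, IB,
*    $                 DESCB, AF, LAF, WORK, LWORK, INFO )
*  10 CONTINUE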
177*
178* The best algorithm for solving banded and tridiagonal linear systems
179* depends on a variety of parameters, especially the bandwidth.
180* Currently, only algorithms designed for the case N/P >> bw are
181* implemented. These go by many names, including Divide and Conquer,
182* Partitioning, domain decomposition-type, etc.
183* For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C
184* algorithms are the appropriate choice.
185*
186* Algorithm description: Divide and Conquer
187*
188* The Divide and Conquer algorithm assumes the matrix is narrowly
189* banded compared with the number of equations. In this situation,
190* it is best to distribute the input matrix A one-dimensionally,
191* with columns atomic and rows divided amongst the processes.
192* The basic algorithm divides the tridiagonal matrix up into
193* P pieces with one stored on each processor,
194* and then proceeds in 2 phases for the factorization or 3 for the
195* solution of a linear system.
196* 1) Local Phase:
197* The individual pieces are factored independently and in
198* parallel. These factors are applied to the matrix creating
199* fill-in, which is stored in a non-inspectable way in auxiliary
200* space AF. Mathematically, this is equivalent to reordering
201* the matrix A as P A P^T and then factoring the principal
202* leading submatrix of size equal to the sum of the sizes of
203* the matrices factored on each processor. The factors of
204* these submatrices overwrite the corresponding parts of A
205* in memory.
206* 2) Reduced System Phase:
207* A small system of order (P-1) is formed representing the
208* interaction of the larger blocks, and is stored (as are its
209* factors) in the space AF. A parallel Block Cyclic Reduction
210* algorithm is used. For a linear system, a parallel front solve
211* followed by an analogous backsolve, both using the structure
212* of the factored matrix, are performed.
213* 3) Backsubstitution Phase:
214* For a linear system, a local backsubstitution is performed on
215* each processor in parallel.
216*
217*
218* Descriptors
219* ===========
220*
221* Descriptors now have *types* and differ from ScaLAPACK 1.0.
222*
223* Note: tridiagonal codes can use either the old two-dimensional
224* or the new one-dimensional descriptors, though the processor grid in
225* both cases *must be one-dimensional*. We describe both types below.
226*
227* Each global data object is described by an associated description
228* vector. This vector stores the information required to establish
229* the mapping between an object element and its corresponding process
230* and memory location.
231*
232* Let A be a generic term for any 2D block cyclically distributed array.
233* Such a global array has an associated description vector DESCA.
234* In the following comments, the character _ should be read as
235* "of the global array".
236*
237*   NOTATION        STORED IN      EXPLANATION
238*   --------------- -------------- --------------------------------------
239*   DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,
240*                                  DTYPE_A = 1.
241*   CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
242*                                  the BLACS process grid A is distribu-
243*                                  ted over. The context itself is glo-
244*                                  bal, but the handle (the integer
245*                                  value) may vary.
246*   M_A    (global) DESCA( M_ )    The number of rows in the global
247*                                  array A.
248*   N_A    (global) DESCA( N_ )    The number of columns in the global
249*                                  array A.
250*   MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
251*                                  the rows of the array.
252*   NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
253*                                  the columns of the array.
254*   RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
255*                                  row of the array A is distributed.
256*   CSRC_A (global) DESCA( CSRC_ ) The process column over which the
257*                                  first column of the array A is
258*                                  distributed.
259*   LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
260*                                  array.  LLD_A >= MAX(1,LOCr(M_A)).
261*
262* Let K be the number of rows or columns of a distributed matrix,
263* and assume that its process grid has dimension p x q.
264* LOCr( K ) denotes the number of elements of K that a process
265* would receive if K were distributed over the p processes of its
266* process column.
267* Similarly, LOCc( K ) denotes the number of elements of K that a
268* process would receive if K were distributed over the q processes of
269* its process row.
270* The values of LOCr() and LOCc() may be determined via a call to the
271* ScaLAPACK tool function, NUMROC:
272* LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
273* LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
274* An upper bound for these quantities may be computed by:
275* LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
276* LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
277*
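*  As an illustrative (hypothetical) example, for M = 1000, MB_A = 64,
*  RSRC_A = 0 and NPROW = 4,
*     LOCr( 1000 ) = NUMROC( 1000, 64, 0, 0, 4 ) = 256  on process row 0,
*     LOCr( 1000 ) = NUMROC( 1000, 64, 3, 0, 4 ) = 232  on process row 3,
*  consistent with the upper bound ceil(ceil(1000/64)/4)*64 = 256.
*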
278*
279* One-dimensional descriptors:
280*
281* One-dimensional descriptors are a new addition to ScaLAPACK since
282* version 1.0. They simplify and shorten the descriptor for 1D
283* arrays.
284*
285* Since ScaLAPACK supports two-dimensional arrays as the fundamental
286* object, we allow 1D arrays to be distributed either over the
287* first dimension of the array (as if the grid were P-by-1) or the
288* 2nd dimension (as if the grid were 1-by-P). This choice is
289* indicated by the descriptor type (501 or 502)
290* as described below.
291* However, for tridiagonal matrices, since the objects being
292* distributed are the individual vectors storing the diagonals, we
293* have adopted the convention that both the P-by-1 descriptor and
294* the 1-by-P descriptor are allowed and are equivalent for
295* tridiagonal matrices. Thus, for tridiagonal matrices,
296* DTYPE_A = 501 or 502 can be used interchangeably
297* without any other change.
298* We require that the distributed vectors storing the diagonals of a
299* tridiagonal matrix be aligned with each other. Because of this, a
300* single descriptor, DESCA, serves to describe the distribution of
301* all diagonals simultaneously.
302*
303* IMPORTANT NOTE: the actual BLACS grid represented by the
304* CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
305* irrespective of which one-dimensional descriptor type
306* (501 or 502) is input.
307* This routine will interpret the grid properly either way.
308* ScaLAPACK routines *do not support intercontext operations* so that
309* the grid passed to a single ScaLAPACK routine *must be the same*
310* for all array descriptors passed to that routine.
311*
312* NOTE: In all cases where 1D descriptors are used, 2D descriptors
313* may also be used, since a one-dimensional array is a special case
314* of a two-dimensional array with one dimension of size unity.
315* The two-dimensional array used in this case *must* be of the
316* proper orientation:
317* If the appropriate one-dimensional descriptor is DTYPE_A=501
318* (1 by P type), then the two dimensional descriptor must
319* have a CTXT value that refers to a 1 by P BLACS grid;
320* If the appropriate one-dimensional descriptor is DTYPE_A=502
321* (P by 1 type), then the two dimensional descriptor must
322* have a CTXT value that refers to a P by 1 BLACS grid.
323*
324*
325* Summary of allowed descriptors, types, and BLACS grids:
326*     DTYPE          501         502         1         1
327*     BLACS grid     1xP or Px1  1xP or Px1  1xP       Px1
328*     -----------------------------------------------------
329*     A              OK          OK          OK        NO
330*     B              NO          OK          NO        OK
331*
332* Note that a consequence of this chart is that it is not possible
333* for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
334* to opposite requirements for the orientation of the BLACS grid,
335* and as noted before, the *same* BLACS context must be used in
336* all descriptors in a single ScaLAPACK subroutine call.
337*
338* Let A be a generic term for any 1D block cyclically distributed array.
339* Such a global array has an associated description vector DESCA.
340* In the following comments, the character _ should be read as
341* "of the global array".
342*
343*   NOTATION        STORED IN  EXPLANATION
344*   --------------- ---------- ------------------------------------------
345*   DTYPE_A(global) DESCA( 1 ) The descriptor type.  For 1D grids,
346*                              TYPE_A = 501: 1-by-P grid.
347*                              TYPE_A = 502: P-by-1 grid.
348*   CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
349*                              the BLACS process grid A is distribu-
350*                              ted over. The context itself is glo-
351*                              bal, but the handle (the integer
352*                              value) may vary.
353*   N_A    (global) DESCA( 3 ) The size of the array dimension being
354*                              distributed.
355*   NB_A   (global) DESCA( 4 ) The blocking factor used to distribute
356*                              the distributed dimension of the array.
357*   SRC_A  (global) DESCA( 5 ) The process row or column over which the
358*                              first row or column of the array
359*                              is distributed.
360*   Ignored         DESCA( 6 ) Ignored for tridiagonal matrices.
361*   Reserved        DESCA( 7 ) Reserved for future use.
362*
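*  A minimal sketch (with illustrative values only) of how a caller
*  might fill a type-501 descriptor for the diagonal vectors; ICTXT,
*  N and NB are assumed to have been set up already:
*
*     INTEGER            DESCA( 7 )
*     DESCA( 1 ) = 501
*     DESCA( 2 ) = ICTXT
*     DESCA( 3 ) = N
*     DESCA( 4 ) = NB
*     DESCA( 5 ) = 0
*     DESCA( 6 ) = 0
*     DESCA( 7 ) = 0
*
*  As noted above, a 9-element 2D descriptor of the proper grid
*  orientation may be used instead.
*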
363*
364*
365* =====================================================================
366*
367* Code Developer: Andrew J. Cleary, University of Tennessee.
368* Current address: Lawrence Livermore National Labs.
369* This version released: August, 2001.
370*
371* =====================================================================
372*
373* ..
374* .. Parameters ..
375 REAL ONE, ZERO
376 parameter( one = 1.0e+0 )
377 parameter( zero = 0.0e+0 )
378 INTEGER INT_ONE
379 parameter( int_one = 1 )
380 INTEGER DESCMULT, BIGNUM
381 parameter(descmult = 100, bignum = descmult * descmult)
382 INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
383 $ lld_, mb_, m_, nb_, n_, rsrc_
384 parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
385 $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
386 $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
387* ..
388* .. Local Scalars ..
389 INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW,
390 $ ws_factor
391* ..
392* .. External Subroutines ..
393 EXTERNAL psdttrf, psdttrs, pxerbla
394* ..
395* .. Executable Statements ..
396*
397* Note: to avoid duplication, most error checking is not performed
398* in this routine and is left to routines
399* PSDTTRF and PSDTTRS.
400*
401* Begin main code
402*
403 info = 0
404*
405* Get block size to calculate workspace requirements
406*
407 IF( desca( dtype_ ) .EQ. block_cyclic_2d ) THEN
408 nb = desca( nb_ )
409 ictxt = desca( ctxt_ )
410 ELSEIF( desca( dtype_ ) .EQ. 501 ) THEN
411 nb = desca( 4 )
412 ictxt = desca( 2 )
413 ELSEIF( desca( dtype_ ) .EQ. 502 ) THEN
414 nb = desca( 4 )
415 ictxt = desca( 2 )
416 ELSE
417 info = -( 7*100 + dtype_ )
418 CALL pxerbla( ictxt,
419 $ 'PSDTSV',
420 $ -info )
421 RETURN
422 ENDIF
423*
424 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
425*
426*
427* Size needed for AF in factorization
428*
429 ws_factor = (12*npcol+3*nb)
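*
*     (The leading WS_FACTOR elements of WORK are used as the
*     auxiliary space AF; the remainder of WORK is passed to the
*     computational routines below as scratch workspace.)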
430*
431* Factor the matrix
432*
433 CALL psdttrf( n, dl, d, du, ja, desca, work,
434 $ min( lwork, ws_factor ), work( 1+ws_factor ),
435 $ lwork-ws_factor, info )
436*
437* Check info for error conditions
438*
439 IF( info.NE.0 ) THEN
440 IF( info .LT. 0 ) THEN
441 CALL pxerbla( ictxt, 'PSDTSV', -info )
442 ENDIF
443 RETURN
444 END IF
445*
446* Solve the system using the factorization
447*
448 CALL psdttrs( 'N', n, nrhs, dl, d, du, ja, desca, b, ib, descb,
449 $ work, min( lwork, ws_factor ), work( 1+ws_factor),
450 $ lwork-ws_factor, info )
451*
452* Check info for error conditions
453*
454 IF( info.NE.0 ) THEN
455 CALL pxerbla( ictxt, 'PSDTSV', -info )
456 RETURN
457 END IF
458*
459 RETURN
460*
461* End of PSDTSV
462*
463 END