pzptsv.f
      SUBROUTINE pzptsv( UPLO, N, NRHS, D, E, JA, DESCA, B, IB, DESCB,
     $                   WORK, LWORK, INFO )
*
*
*
*  -- ScaLAPACK routine (version 1.7) --
*     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
*     and University of California, Berkeley.
*     November 15, 1997
*
*     .. Scalar Arguments ..
      CHARACTER          UPLO
      INTEGER            IB, INFO, JA, LWORK, N, NRHS
*     ..
*     .. Array Arguments ..
      INTEGER            DESCA( * ), DESCB( * )
      COMPLEX*16         B( * ), E( * ), WORK( * )
      DOUBLE PRECISION   D( * )
*     ..
*
*
*  Purpose
*  =======
*
*  PZPTSV solves a system of linear equations
*
*            A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
*
*  where A(1:N, JA:JA+N-1) is an N-by-N complex
*  tridiagonal Hermitian positive definite distributed
*  matrix.
*
*  Cholesky factorization is used to factor a reordering of
*  the matrix into L L'.
*
*  See PZPTTRF and PZPTTRS for details.
*
*  =====================================================================
*
*  Arguments
*  =========
*
*  UPLO    (global input) CHARACTER
*          = 'U':  Upper triangle of A(1:N, JA:JA+N-1) is stored;
*          = 'L':  Lower triangle of A(1:N, JA:JA+N-1) is stored.
*
*  N       (global input) INTEGER
*          The number of rows and columns to be operated on, i.e. the
*          order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
*
*  NRHS    (global input) INTEGER
*          The number of right hand sides, i.e., the number of columns
*          of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
*          NRHS >= 0.
*
*  D       (local input/local output) DOUBLE PRECISION pointer to local
*          part of global vector storing the main diagonal of the
*          matrix.
*          On exit, this array contains information on the
*          factors of the matrix.
*          Must be of size >= DESCA( NB_ ).
*
*  E       (local input/local output) COMPLEX*16 pointer to local
*          part of global vector storing the upper diagonal of the
*          matrix. Globally, E(N) is not referenced, and E must be
*          aligned with D.
*          On exit, this array contains information on the
*          factors of the matrix.
*          Must be of size >= DESCA( NB_ ).
*
*  JA      (global input) INTEGER
*          The index in the global array A that points to the start of
*          the matrix to be operated on (which may be either all of A
*          or a submatrix of A).
*
*  DESCA   (global and local input) INTEGER array of dimension DLEN.
*          if 1D type (DTYPE_A=501 or 502), DLEN >= 7;
*          if 2D type (DTYPE_A=1), DLEN >= 9.
*          The array descriptor for the distributed matrix A.
*          Contains information on the mapping of A to memory. Please
*          see NOTES below for full description and options.
*
*  B       (local input/local output) COMPLEX*16 pointer into
*          local memory to an array of local leading dimension
*          LLD_B >= NB.
*          On entry, this array contains the
*          local pieces of the right hand sides
*          B(IB:IB+N-1, 1:NRHS).
*          On exit, this contains the local pieces of the solution
*          distributed matrix X.
*
*  IB      (global input) INTEGER
*          The row index in the global array B that points to the first
*          row of the matrix to be operated on (which may be either
*          all of B or a submatrix of B).
*
*  DESCB   (global and local input) INTEGER array of dimension DLEN.
*          if 1D type (DTYPE_B=502), DLEN >= 7;
*          if 2D type (DTYPE_B=1), DLEN >= 9.
*          The array descriptor for the distributed matrix B.
*          Contains information on the mapping of B to memory. Please
*          see NOTES below for full description and options.
*
*  WORK    (local workspace/local output)
*          COMPLEX*16 temporary workspace. This space may
*          be overwritten in between calls to routines. WORK must be
*          the size given in LWORK.
*          On exit, WORK( 1 ) contains the minimal LWORK.
*
*  LWORK   (local input or global input) INTEGER
*          Size of user-input workspace WORK.
*          If LWORK is too small, the minimal acceptable size will be
*          returned in WORK(1) and an error code is returned.
*          LWORK >= (12*NPCOL + 3*NB)
*                   + max( (10+2*min(100,NRHS))*NPCOL + 4*NRHS,
*                          8*NPCOL ).
*          (A worked sketch of this bound appears after the description
*          of INFO below.)
*
*  INFO    (local output) INTEGER
*          = 0:  successful exit
*          < 0:  If the i-th argument is an array and the j-th entry
*                had an illegal value, then INFO = -(i*100+j); if the
*                i-th argument is a scalar and had an illegal value,
*                then INFO = -i.
*          > 0:  If INFO = K <= NPROCS, the submatrix stored on
*                processor INFO and factored locally was not
*                positive definite, and
*                the factorization was not completed.
*                If INFO = K > NPROCS, the submatrix stored on
*                processor INFO-NPROCS representing interactions with
*                other processors was not
*                positive definite,
*                and the factorization was not completed.
*
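*          A worked sketch of the LWORK bound above (the values
*          NPCOL = 4, NB = 250 and NRHS = 1 are example assumptions,
*          not requirements of this routine):
*
*             LWMIN = ( 12*NPCOL + 3*NB ) +
*                     MAX( ( 10 + 2*MIN( 100, NRHS ) )*NPCOL + 4*NRHS,
*                          8*NPCOL )
*
*          which for these values gives
*          LWMIN = ( 48 + 750 ) + MAX( 52, 32 ) = 850.
*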
*  =====================================================================
*
*
*  Restrictions
*  ============
*
*  The following are restrictions on the input parameters. Some of these
*  are temporary and will be removed in future releases, while others
*  may reflect fundamental technical limitations.
*
*  Non-cyclic restriction: VERY IMPORTANT!
*    P*NB >= mod(JA-1,NB)+N.
*    The mapping for matrices must be blocked, reflecting the nature
*    of the divide and conquer algorithm as a task-parallel algorithm.
*    This formula in words is: no processor may have more than one
*    chunk of the matrix. (An illustrative check of this restriction
*    appears at the end of this section.)
*
*  Blocksize cannot be too small:
*    If the matrix spans more than one processor, the following
*    restriction on NB, the size of each block on each processor,
*    must hold:
*      NB >= 2
*    The bulk of parallel computation is done on the matrix of size
*    O(NB) on each processor. If this is too small, divide and conquer
*    is a poor choice of algorithm.
*
*  Submatrix reference:
*    JA = IB
*    Alignment restriction that prevents unnecessary communication.
*
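*  Illustrative check of the non-cyclic restriction before calling
*  this routine (P, the number of processes in the one-dimensional
*  grid, and the LOGICAL variable OK are names introduced only for
*  this sketch):
*
*     LOGICAL            OK
*     OK = ( P*NB .GE. MOD( JA-1, NB ) + N )
*
*  If OK is .FALSE., increase NB (or use fewer processes) so that
*  each process owns at most one contiguous block of the matrix.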
*
*  =====================================================================
*
*
*  Notes
*  =====
*
*  If the factorization routine and the solve routine are to be called
*  separately (to solve various sets of right-hand sides using the same
*  coefficient matrix), the auxiliary space AF *must not be altered*
*  between calls to the factorization routine and the solve routine.
*
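*  A sketch of this split usage (the names AF, LAF, B1 and B2 are
*  illustrative, not part of this routine; the argument lists of
*  PZPTTRF and PZPTTRS are as documented in those routines):
*
*     CALL PZPTTRF( N, D, E, JA, DESCA, AF, LAF, WORK, LWORK, INFO )
*     CALL PZPTTRS( 'L', N, NRHS, D, E, JA, DESCA, B1, IB, DESCB,
*    $              AF, LAF, WORK, LWORK, INFO )
*     CALL PZPTTRS( 'L', N, NRHS, D, E, JA, DESCA, B2, IB, DESCB,
*    $              AF, LAF, WORK, LWORK, INFO )
*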
*  The best algorithm for solving banded and tridiagonal linear systems
*  depends on a variety of parameters, especially the bandwidth.
*  Currently, only algorithms designed for the case N/P >> bw are
*  implemented. These go by many names, including Divide and Conquer,
*  Partitioning, domain decomposition-type, etc.
*  For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C
*  algorithms are the appropriate choice.
*
*  Algorithm description: Divide and Conquer
*
*  The Divide and Conquer algorithm assumes the matrix is narrowly
*  banded compared with the number of equations. In this situation,
*  it is best to distribute the input matrix A one-dimensionally,
*  with columns atomic and rows divided amongst the processes.
*  The basic algorithm divides the tridiagonal matrix up into
*  P pieces with one stored on each processor,
*  and then proceeds in 2 phases for the factorization or 3 for the
*  solution of a linear system.
*  1) Local Phase:
*     The individual pieces are factored independently and in
*     parallel. These factors are applied to the matrix creating
*     fill-in, which is stored in a non-inspectable way in auxiliary
*     space AF. Mathematically, this is equivalent to reordering
*     the matrix A as P A P^T and then factoring the principal
*     leading submatrix of size equal to the sum of the sizes of
*     the matrices factored on each processor. The factors of
*     these submatrices overwrite the corresponding parts of A
*     in memory.
*  2) Reduced System Phase:
*     A small (P-1) system is formed representing
*     interaction of the larger blocks, and is stored (as are its
*     factors) in the space AF. A parallel Block Cyclic Reduction
*     algorithm is used. For a linear system, a parallel front solve
*     followed by an analogous backsolve, both using the structure
*     of the factored matrix, are performed.
*  3) Backsubstitution Phase:
*     For a linear system, a local backsubstitution is performed on
*     each processor in parallel.
*
*
*  Descriptors
*  ===========
*
*  Descriptors now have *types* and differ from ScaLAPACK 1.0.
*
*  Note: tridiagonal codes can use either the old two dimensional
*  or new one-dimensional descriptors, though the processor grid in
*  both cases *must be one-dimensional*. We describe both types below.
*
*  Each global data object is described by an associated description
*  vector. This vector stores the information required to establish
*  the mapping between an object element and its corresponding process
*  and memory location.
*
*  Let A be a generic term for any 2D block cyclically distributed array.
*  Such a global array has an associated description vector DESCA.
*  In the following comments, the character _ should be read as
*  "of the global array".
*
*  NOTATION        STORED IN       EXPLANATION
*  --------------- --------------- --------------------------------------
*  DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type.  In this case,
*                                  DTYPE_A = 1.
*  CTXT_A (global) DESCA( CTXT_ )  The BLACS context handle, indicating
*                                  the BLACS process grid A is distribu-
*                                  ted over. The context itself is glo-
*                                  bal, but the handle (the integer
*                                  value) may vary.
*  M_A    (global) DESCA( M_ )     The number of rows in the global
*                                  array A.
*  N_A    (global) DESCA( N_ )     The number of columns in the global
*                                  array A.
*  MB_A   (global) DESCA( MB_ )    The blocking factor used to distribute
*                                  the rows of the array.
*  NB_A   (global) DESCA( NB_ )    The blocking factor used to distribute
*                                  the columns of the array.
*  RSRC_A (global) DESCA( RSRC_ )  The process row over which the first
*                                  row of the array A is distributed.
*  CSRC_A (global) DESCA( CSRC_ )  The process column over which the
*                                  first column of the array A is
*                                  distributed.
*  LLD_A  (local)  DESCA( LLD_ )   The leading dimension of the local
*                                  array.  LLD_A >= MAX(1,LOCr(M_A)).
*
*  Let K be the number of rows or columns of a distributed matrix,
*  and assume that its process grid has dimension p x q.
*  LOCr( K ) denotes the number of elements of K that a process
*  would receive if K were distributed over the p processes of its
*  process column.
*  Similarly, LOCc( K ) denotes the number of elements of K that a
*  process would receive if K were distributed over the q processes of
*  its process row.
*  The values of LOCr() and LOCc() may be determined via a call to the
*  ScaLAPACK tool function, NUMROC:
*          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
*          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
*  An upper bound for these quantities may be computed by:
*          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
*          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
*
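*  For example (a sketch only; LOCR_MA is an illustrative name, not
*  part of this routine), the minimum legal local leading dimension
*  LLD_A of a 2D-distributed array A can be obtained on each process
*  as:
*
*     INTEGER            NUMROC, LOCR_MA
*     EXTERNAL           NUMROC
*     LOCR_MA = NUMROC( M_A, MB_A, MYROW, RSRC_A, NPROW )
*     LLD_A   = MAX( 1, LOCR_MA )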
*
*  One-dimensional descriptors:
*
*  One-dimensional descriptors are a new addition to ScaLAPACK since
*  version 1.0. They simplify and shorten the descriptor for 1D
*  arrays.
*
*  Since ScaLAPACK supports two-dimensional arrays as the fundamental
*  object, we allow 1D arrays to be distributed either over the
*  first dimension of the array (as if the grid were P-by-1) or the
*  2nd dimension (as if the grid were 1-by-P). This choice is
*  indicated by the descriptor type (501 or 502)
*  as described below.
*  However, for tridiagonal matrices, since the objects being
*  distributed are the individual vectors storing the diagonals, we
*  have adopted the convention that both the P-by-1 descriptor and
*  the 1-by-P descriptor are allowed and are equivalent for
*  tridiagonal matrices. Thus, for tridiagonal matrices,
*  DTYPE_A = 501 or 502 can be used interchangeably
*  without any other change.
*  We require that the distributed vectors storing the diagonals of a
*  tridiagonal matrix be aligned with each other. Because of this, a
*  single descriptor, DESCA, serves to describe the distribution
*  of all diagonals simultaneously.
*
*  IMPORTANT NOTE: the actual BLACS grid represented by the
*  CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
*  irrespective of which one-dimensional descriptor type
*  (501 or 502) is input.
*  This routine will interpret the grid properly either way.
*  ScaLAPACK routines *do not support intercontext operations* so that
*  the grid passed to a single ScaLAPACK routine *must be the same*
*  for all array descriptors passed to that routine.
*
*  NOTE: In all cases where 1D descriptors are used, 2D descriptors
*  may also be used, since a one-dimensional array is a special case
*  of a two-dimensional array with one dimension of size unity.
*  The two-dimensional array used in this case *must* be of the
*  proper orientation:
*    If the appropriate one-dimensional descriptor is DTYPE_A=501
*    (1 by P type), then the two dimensional descriptor must
*    have a CTXT value that refers to a 1 by P BLACS grid;
*    If the appropriate one-dimensional descriptor is DTYPE_A=502
*    (P by 1 type), then the two dimensional descriptor must
*    have a CTXT value that refers to a P by 1 BLACS grid.
*
*
*  Summary of allowed descriptors, types, and BLACS grids:
*  DTYPE        501         502         1          1
*  BLACS grid   1xP or Px1  1xP or Px1  1xP        Px1
*  -----------------------------------------------------
*  A            OK          OK          OK         NO
*  B            NO          OK          NO         OK
*
*  Note that a consequence of this chart is that it is not possible
*  for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
*  to opposite requirements for the orientation of the BLACS grid,
*  and as noted before, the *same* BLACS context must be used in
*  all descriptors in a single ScaLAPACK subroutine call.
*
*  Let A be a generic term for any 1D block cyclically distributed array.
*  Such a global array has an associated description vector DESCA.
*  In the following comments, the character _ should be read as
*  "of the global array".
*
*  NOTATION        STORED IN  EXPLANATION
*  --------------- ---------- ------------------------------------------
*  DTYPE_A(global) DESCA( 1 ) The descriptor type.  For 1D grids,
*                               TYPE_A = 501: 1-by-P grid.
*                               TYPE_A = 502: P-by-1 grid.
*  CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
*                               the BLACS process grid A is distribu-
*                               ted over. The context itself is glo-
*                               bal, but the handle (the integer
*                               value) may vary.
*  N_A    (global) DESCA( 3 ) The size of the array dimension being
*                               distributed.
*  NB_A   (global) DESCA( 4 ) The blocking factor used to distribute
*                               the distributed dimension of the array.
*  SRC_A  (global) DESCA( 5 ) The process row or column over which the
*                               first row or column of the array
*                               is distributed.
*  Ignored         DESCA( 6 ) Ignored for tridiagonal matrices.
*  Reserved        DESCA( 7 ) Reserved for future use.
*
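*  Example: an illustrative sketch of calling PZPTSV with the
*  one-dimensional descriptors described above. The grid shape, the
*  problem sizes (N, NB, NRHS), the data used to fill D, E and B, and
*  the workspace size are assumptions made only for this sketch, not
*  requirements of the routine. The sketch assumes it is run on 4
*  processes, so that P*NB >= N holds.
*
*     PROGRAM EXPZPTSV
*     INTEGER            N, NB, NRHS, JA, IB, LWORK
*     PARAMETER          ( N = 1000, NB = 250, NRHS = 1, JA = 1,
*    $                     IB = 1, LWORK = 2000 )
*     INTEGER            DESCA( 7 ), DESCB( 7 )
*     DOUBLE PRECISION   D( NB )
*     COMPLEX*16         E( NB ), B( NB, NRHS ), WORK( LWORK )
*     INTEGER            I, IAM, ICTXT, INFO, MYCOL, MYROW, NPCOL,
*    $                   NPROCS, NPROW
*
*     Set up a one-dimensional (1 x NPROCS) BLACS process grid.
*     CALL BLACS_PINFO( IAM, NPROCS )
*     CALL BLACS_GET( -1, 0, ICTXT )
*     CALL BLACS_GRIDINIT( ICTXT, 'R', 1, NPROCS )
*     CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
*
*     Fill the local pieces of the diagonals and the right hand side
*     (diagonally dominant data, so the matrix is positive definite).
*     DO 10 I = 1, NB
*        D( I ) = 4.0D+0
*        E( I ) = DCMPLX( -1.0D+0, 0.1D+0 )
*        B( I, 1 ) = DCMPLX( 1.0D+0, 0.0D+0 )
*  10 CONTINUE
*
*     Type-501 descriptor for the tridiagonal matrix A.
*     DESCA( 1 ) = 501
*     DESCA( 2 ) = ICTXT
*     DESCA( 3 ) = N
*     DESCA( 4 ) = NB
*     DESCA( 5 ) = 0
*     DESCA( 6 ) = 0
*     DESCA( 7 ) = 0
*
*     Type-502 descriptor for the right hand sides B; entry 6 holds
*     the local leading dimension of B.
*     DESCB( 1 ) = 502
*     DESCB( 2 ) = ICTXT
*     DESCB( 3 ) = N
*     DESCB( 4 ) = NB
*     DESCB( 5 ) = 0
*     DESCB( 6 ) = NB
*     DESCB( 7 ) = 0
*
*     CALL PZPTSV( 'L', N, NRHS, D, E, JA, DESCA, B, IB, DESCB,
*    $             WORK, LWORK, INFO )
*
*     CALL BLACS_GRIDEXIT( ICTXT )
*     CALL BLACS_EXIT( 0 )
*     END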
*
*
*  =====================================================================
*
*  Code Developer: Andrew J. Cleary, University of Tennessee.
*    Current address: Lawrence Livermore National Labs.
*    This version released: August, 2001.
*
*  =====================================================================
*
*     ..
*     .. Parameters ..
      DOUBLE PRECISION   ONE, ZERO
      parameter( one = 1.0d+0 )
      parameter( zero = 0.0d+0 )
      COMPLEX*16         CONE, CZERO
      parameter( cone = ( 1.0d+0, 0.0d+0 ) )
      parameter( czero = ( 0.0d+0, 0.0d+0 ) )
      INTEGER            INT_ONE
      parameter( int_one = 1 )
      INTEGER            DESCMULT, BIGNUM
      parameter( descmult = 100, bignum = descmult * descmult )
      INTEGER            BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
     $                   lld_, mb_, m_, nb_, n_, rsrc_
      parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
     $           ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
     $           rsrc_ = 7, csrc_ = 8, lld_ = 9 )
*     ..
*     .. Local Scalars ..
      INTEGER            ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW,
     $                   ws_factor
*     ..
*     .. External Subroutines ..
      EXTERNAL           pxerbla, pzpttrf, pzpttrs
*     ..
*     .. Executable Statements ..
*
*     Note: to avoid duplication, most error checking is not performed
*           in this routine and is left to routines
*           PZPTTRF and PZPTTRS.
*
*     Begin main code
*
      info = 0
*
*     Get block size to calculate workspace requirements
*
      IF( desca( dtype_ ) .EQ. block_cyclic_2d ) THEN
         nb = desca( nb_ )
         ictxt = desca( ctxt_ )
      ELSEIF( desca( dtype_ ) .EQ. 501 ) THEN
         nb = desca( 4 )
         ictxt = desca( 2 )
      ELSEIF( desca( dtype_ ) .EQ. 502 ) THEN
         nb = desca( 4 )
         ictxt = desca( 2 )
      ELSE
         info = -( 5*100 + dtype_ )
*
*        Pick up the context (stored in DESCA( 2 ) for all supported
*        descriptor types) so that PXERBLA is not called with an
*        undefined ICTXT.
*
         ictxt = desca( 2 )
         CALL pxerbla( ictxt,
     $      'PZPTSV',
     $      -info )
         RETURN
      ENDIF
*
      CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
*
*
*     Size needed for AF in factorization
*
      ws_factor = (12*npcol + 3*nb)
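*
*     The leading WS_FACTOR elements of WORK hold the auxiliary
*     fill-in space AF for PZPTTRF and PZPTTRS; the remaining
*     LWORK-WS_FACTOR elements are passed on as scratch workspace.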
*
*     Factor the matrix
*
      CALL pzpttrf( n, d, e, ja, desca, work, min( lwork, ws_factor ),
     $              work( 1+ws_factor ), lwork-ws_factor, info )
*
*     Check info for error conditions
*
      IF( info.NE.0 ) THEN
         IF( info .LT. 0 ) THEN
            CALL pxerbla( ictxt, 'PZPTSV', -info )
         ENDIF
         RETURN
      END IF
*
*     Solve the system using the factorization
*
      CALL pzpttrs( uplo, n, nrhs, d, e, ja, desca, b, ib, descb, work,
     $              min( lwork, ws_factor ), work( 1+ws_factor ),
     $              lwork-ws_factor, info )
*
*     Check info for error conditions
*
      IF( info.NE.0 ) THEN
         CALL pxerbla( ictxt, 'PZPTSV', -info )
         RETURN
      END IF
*
      RETURN
*
*     End of PZPTSV
*
      END