pcgbtrs.f
1 SUBROUTINE pcgbtrs( TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, IPIV,
2 $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
3*
4* -- ScaLAPACK routine (version 2.0.2) --
5* Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6* May 1 2012
7*
8* .. Scalar Arguments ..
9 CHARACTER TRANS
10 INTEGER BWU, BWL, IB, INFO, JA, LAF, LWORK, N, NRHS
11* ..
12* .. Array Arguments ..
13 INTEGER DESCA( * ), DESCB( * ), IPIV(*)
14 COMPLEX A( * ), AF( * ), B( * ), WORK( * )
15* ..
16*
17*
18* Purpose
19* =======
20*
21* PCGBTRS solves a system of linear equations
22*
23* A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
24* or
25* A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS)
26*
27* where A(1:N, JA:JA+N-1) is the matrix used to produce the factors
28* stored in A(1:N,JA:JA+N-1) and AF by PCGBTRF.
29* A(1:N, JA:JA+N-1) is an N-by-N complex
30* banded distributed matrix with lower bandwidth BWL
31* and upper bandwidth BWU.
32*
33* Routine PCGBTRF MUST be called first.
34*
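*  A typical use is to factor once with PCGBTRF and then solve with
*  PCGBTRS.  The following is a minimal sketch only: it assumes the
*  distributed matrix A, the right hand sides B, the descriptors
*  DESCA and DESCB, and the workspace sizes LAF and LWORK have
*  already been set up as described under Arguments and Notes below.
*
*     CALL PCGBTRF( N, BWL, BWU, A, JA, DESCA, IPIV, AF, LAF,
*    $              WORK, LWORK, INFO )
*     IF( INFO.EQ.0 ) THEN
*        CALL PCGBTRS( 'N', N, BWL, BWU, NRHS, A, JA, DESCA, IPIV,
*    $                 B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
*     END IF
*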
35* =====================================================================
36*
37* Arguments
38* =========
39*
40*
41* TRANS (global input) CHARACTER
42* = 'N': Solve with A(1:N, JA:JA+N-1);
43* = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) );
44*
45* N (global input) INTEGER
46* The number of rows and columns to be operated on, i.e. the
47* order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
48*
49* BWL (global input) INTEGER
50* Number of subdiagonals. 0 <= BWL <= N-1
51*
52* BWU (global input) INTEGER
53* Number of superdiagonals. 0 <= BWU <= N-1
54*
55* NRHS (global input) INTEGER
56* The number of right hand sides, i.e., the number of columns
57* of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
58* NRHS >= 0.
59*
60* A (local input/local output) COMPLEX pointer into
61* local memory to an array with first dimension
62* LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA).
63* On entry, this array contains the local pieces of the
64* factors L and U of the N-by-N unsymmetric banded
65* distributed matrix A(1:N, JA:JA+N-1), as computed by PCGBTRF.
66* This local portion is stored in the packed banded format
67* used in LAPACK. Please see the Notes below and the
68* ScaLAPACK manual for more detail on the format of
69* distributed matrices.
70*
71* JA (global input) INTEGER
72* The index in the global array A that points to the start of
73* the matrix to be operated on (which may be either all of A
74* or a submatrix of A).
75*
76* DESCA (global and local input) INTEGER array of dimension DLEN.
77* if 1D type (DTYPE_A=501), DLEN >= 7;
78* if 2D type (DTYPE_A=1), DLEN >= 9 .
79* The array descriptor for the distributed matrix A.
80* Contains information on the mapping of A to memory. Please
81* see NOTES below for full description and options.
82*
83* IPIV (local input) INTEGER array, dimension >= DESCA( NB ).
84* Pivot indices for local factorizations.
85* Users *should not* alter the contents between
86* factorization and solve.
87*
88* B (local input/local output) COMPLEX pointer into
89* local memory to an array of local leading dimension LLD_B >= NB.
90* On entry, this array contains the local
91* pieces of the right hand sides
92* B(IB:IB+N-1, 1:NRHS).
93* On exit, this contains the local piece of the solution
94* distributed matrix X.
95*
96* IB (global input) INTEGER
97* The row index in the global array B that points to the first
98* row of the matrix to be operated on (which may be either
99* all of B or a submatrix of B).
100*
101* DESCB (global and local input) INTEGER array of dimension DLEN.
102* if 1D type (DTYPE_B=502), DLEN >=7;
103* if 2D type (DTYPE_B=1), DLEN >= 9.
104* The array descriptor for the distributed matrix B.
105* Contains information on the mapping of B to memory. Please
106* see NOTES below for full description and options.
107*
108* AF (local output) COMPLEX array, dimension LAF.
109* Auxiliary Fillin Space.
110* Fillin is created during the factorization routine
111* PCGBTRF and this is stored in AF. If a linear system
112* is to be solved using PCGBTRS after the factorization
113* routine, AF *must not be altered* after the factorization.
114*
115* LAF (local input) INTEGER
116* Size of user-input Auxiliary Fillin space AF. Must be >=
117* (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu)
118* If LAF is not large enough, an error code will be returned
119* and the minimum acceptable size will be returned in AF( 1 )
120*
121* WORK (local workspace/local output)
122* COMPLEX temporary workspace. This space may
123* be overwritten in between calls to routines. WORK must be
124* the size given in LWORK.
125* On exit, WORK( 1 ) contains the minimal LWORK.
126*
127* LWORK (local input or global input) INTEGER
128* Size of user-input workspace WORK.
129* If LWORK is too small, the minimal acceptable size will be
130* returned in WORK(1) and an error code is returned. LWORK>=
131* NRHS*(NB+2*bwl+4*bwu)
132*
133* INFO (global output) INTEGER
134* = 0: successful exit
135* < 0: If the i-th argument is an array and the j-th entry had
136* an illegal value, then INFO = -(i*100+j); if the i-th
137* argument is a scalar and had an illegal value, then
138* INFO = -i.
139*
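*  As a purely illustrative numerical check of the LAF and LWORK lower
*  bounds given above, with NB = 64, BWL = 2, BWU = 3 and NRHS = 1:
*
*     LAF   >= (64+3)*(2+3) + 6*(2+3)*(2+2*3) = 335 + 240 = 575
*     LWORK >= 1*(64+2*2+4*3)                 = 80
*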
140* =====================================================================
141*
142*
143* Restrictions
144* ============
145*
146* The following are restrictions on the input parameters. Some of these
147* are temporary and will be removed in future releases, while others
148* may reflect fundamental technical limitations.
149*
150* Non-cyclic restriction: VERY IMPORTANT!
151* P*NB>= mod(JA-1,NB)+N.
152* The mapping for matrices must be blocked, reflecting the nature
153* of the divide and conquer algorithm as a task-parallel algorithm.
154* This formula in words is: no processor may have more than one
155* chunk of the matrix.
156*
157* Blocksize cannot be too small:
158* If the matrix spans more than one processor, the following
159* restriction on NB, the size of each block on each processor,
160* must hold:
161* NB >= (BWL+BWU)+1
162* The bulk of parallel computation is done on the matrix of size
163* O(NB) on each processor. If this is too small, divide and conquer
164* is a poor choice of algorithm.
165*
166* Submatrix reference:
167* JA = IB
168* Alignment restriction that prevents unnecessary communication.
169*
170*
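*  A sketch of checking these restrictions before the call (purely
*  illustrative: OK is a hypothetical LOGICAL, P is the number of
*  processes in the one-dimensional grid, and NB is the block size
*  stored in the descriptor; the NB test is only required when the
*  matrix spans more than one process):
*
*     OK = ( P*NB .GE. MOD( JA-1, NB )+N ) .AND.
*    $     ( NB .GE. BWL+BWU+1 )           .AND.
*    $     ( JA .EQ. IB )
*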
171* =====================================================================
172*
173*
174* Notes
175* =====
176*
177* If the factorization routine and the solve routine are to be called
178* separately (to solve various sets of righthand sides using the same
179* coefficient matrix), the auxiliary space AF *must not be altered*
180* between calls to the factorization routine and the solve routine.
181*
182* The best algorithm for solving banded and tridiagonal linear systems
183* depends on a variety of parameters, especially the bandwidth.
184* Currently, only algorithms designed for the case N/P >> bw are
185* implemented. These go by many names, including Divide and Conquer,
186* Partitioning, domain decomposition-type, etc.
187*
188* Algorithm description: Divide and Conquer
189*
190* The Divide and Conquer algorithm assumes the matrix is narrowly
191* banded compared with the number of equations. In this situation,
192* it is best to distribute the input matrix A one-dimensionally,
193* with columns atomic and rows divided amongst the processes.
194* The basic algorithm divides the banded matrix up into
195* P pieces with one stored on each processor,
196* and then proceeds in 2 phases for the factorization or 3 for the
197* solution of a linear system.
198* 1) Local Phase:
199* The individual pieces are factored independently and in
200* parallel. These factors are applied to the matrix creating
201* fillin, which is stored in a non-inspectable way in auxiliary
202* space AF. Mathematically, this is equivalent to reordering
203* the matrix A as P A P^T and then factoring the principal
204* leading submatrix of size equal to the sum of the sizes of
205* the matrices factored on each processor. The factors of
206* these submatrices overwrite the corresponding parts of A
207* in memory.
208* 2) Reduced System Phase:
209* A small (max(bwl,bwu)* (P-1)) system is formed representing
210* interaction of the larger blocks, and is stored (as are its
211* factors) in the space AF. A parallel Block Cyclic Reduction
212* algorithm is used. For a linear system, a parallel front solve
213* followed by an analogous backsolve, both using the structure
214* of the factored matrix, are performed.
215* 3) Backsubstitution Phase:
216* For a linear system, a local backsubstitution is performed on
217* each processor in parallel.
218*
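*  As a purely illustrative example of the sizes involved: with P = 4
*  processes and bandwidths BWL = BWU = 4, the reduced system of
*  phase 2 has order max(BWL,BWU)*(P-1) = 4*3 = 12, independent of N,
*  and the block cyclic reduction proceeds in about log2(P) = 2 levels.
*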
219*
220* Descriptors
221* ===========
222*
223* Descriptors now have *types* and differ from ScaLAPACK 1.0.
224*
225* Note: banded codes can use either the old two dimensional
226* or new one-dimensional descriptors, though the processor grid in
227* both cases *must be one-dimensional*. We describe both types below.
228*
229* Each global data object is described by an associated description
230* vector. This vector stores the information required to establish
231* the mapping between an object element and its corresponding process
232* and memory location.
233*
234* Let A be a generic term for any 2D block cyclically distributed array.
235* Such a global array has an associated description vector DESCA.
236* In the following comments, the character _ should be read as
237* "of the global array".
238*
239* NOTATION STORED IN EXPLANATION
240* --------------- -------------- --------------------------------------
241* DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case,
242* DTYPE_A = 1.
243* CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
244* the BLACS process grid A is distribu-
245* ted over. The context itself is glo-
246* bal, but the handle (the integer
247* value) may vary.
248* M_A (global) DESCA( M_ ) The number of rows in the global
249* array A.
250* N_A (global) DESCA( N_ ) The number of columns in the global
251* array A.
252* MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
253* the rows of the array.
254* NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
255* the columns of the array.
256* RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
257* row of the array A is distributed.
258* CSRC_A (global) DESCA( CSRC_ ) The process column over which the
259* first column of the array A is
260* distributed.
261* LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
262* array. LLD_A >= MAX(1,LOCr(M_A)).
263*
264* Let K be the number of rows or columns of a distributed matrix,
265* and assume that its process grid has dimension p x q.
266* LOCr( K ) denotes the number of elements of K that a process
267* would receive if K were distributed over the p processes of its
268* process column.
269* Similarly, LOCc( K ) denotes the number of elements of K that a
270* process would receive if K were distributed over the q processes of
271* its process row.
272* The values of LOCr() and LOCc() may be determined via a call to the
273* ScaLAPACK tool function, NUMROC:
274* LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
275* LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
276* An upper bound for these quantities may be computed by:
277* LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
278* LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
279*
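*  For illustration only, the local row count and its upper bound could
*  be obtained as in this sketch (LOCRM and LOCRM_MAX are hypothetical
*  integers; NUMROC and ICEIL are ScaLAPACK tool functions, ICEIL
*  returning the ceiling of an integer quotient):
*
*     CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
*     LOCRM     = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW )
*     LOCRM_MAX = ICEIL( ICEIL( M, MB_A ), NPROW ) * MB_A
*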
280*
281* One-dimensional descriptors:
282*
283* One-dimensional descriptors are a new addition to ScaLAPACK since
284* version 1.0. They simplify and shorten the descriptor for 1D
285* arrays.
286*
287* Since ScaLAPACK supports two-dimensional arrays as the fundamental
288* object, we allow 1D arrays to be distributed either over the
289* first dimension of the array (as if the grid were P-by-1) or the
290* 2nd dimension (as if the grid were 1-by-P). This choice is
291* indicated by the descriptor type (501 or 502)
292* as described below.
293*
294* IMPORTANT NOTE: the actual BLACS grid represented by the
295* CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
296* irrespective of which one-dimensional descriptor type
297* (501 or 502) is input.
298* This routine will interpret the grid properly either way.
299* ScaLAPACK routines *do not support intercontext operations* so that
300* the grid passed to a single ScaLAPACK routine *must be the same*
301* for all array descriptors passed to that routine.
302*
303* NOTE: In all cases where 1D descriptors are used, 2D descriptors
304* may also be used, since a one-dimensional array is a special case
305* of a two-dimensional array with one dimension of size unity.
306* The two-dimensional array used in this case *must* be of the
307* proper orientation:
308* If the appropriate one-dimensional descriptor is DTYPE_A=501
309* (1 by P type), then the two dimensional descriptor must
310* have a CTXT value that refers to a 1 by P BLACS grid;
311* If the appropriate one-dimensional descriptor is DTYPE_A=502
312* (P by 1 type), then the two dimensional descriptor must
313* have a CTXT value that refers to a P by 1 BLACS grid.
314*
315*
316* Summary of allowed descriptors, types, and BLACS grids:
317* DTYPE 501 502 1 1
318* BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
319* -----------------------------------------------------
320* A OK NO OK NO
321* B NO OK NO OK
322*
323* Note that a consequence of this chart is that it is not possible
324* for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
325* to opposite requirements for the orientation of the BLACS grid,
326* and as noted before, the *same* BLACS context must be used in
327* all descriptors in a single ScaLAPACK subroutine call.
328*
329* Let A be a generic term for any 1D block cyclically distributed array.
330* Such a global array has an associated description vector DESCA.
331* In the following comments, the character _ should be read as
332* "of the global array".
333*
334* NOTATION STORED IN EXPLANATION
335* --------------- ---------- ------------------------------------------
336* DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
337* TYPE_A = 501: 1-by-P grid.
338* TYPE_A = 502: P-by-1 grid.
339* CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
340* the BLACS process grid A is distribu-
341* ted over. The context itself is glo-
342* bal, but the handle (the integer
343* value) may vary.
344* N_A (global) DESCA( 3 ) The size of the array dimension being
345* distributed.
346* NB_A (global) DESCA( 4 ) The blocking factor used to distribute
347* the distributed dimension of the array.
348* SRC_A (global) DESCA( 5 ) The process row or column over which the
349* first row or column of the array
350* is distributed.
351* LLD_A (local) DESCA( 6 ) The leading dimension of the local array
352* storing the local blocks of the distri-
353* buted array A. Minimum value of LLD_A
354* depends on TYPE_A.
355* TYPE_A = 501: LLD_A >=
356* MAX( size of undistributed dimension, 1 ).
357* TYPE_A = 502: LLD_A >= MAX( NB_A, 1 ).
358* Reserved DESCA( 7 ) Reserved for future use.
359*
360*
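*  A hand-built pair of one-dimensional descriptors for this routine
*  might look like the following sketch (illustrative only: ICTXT is
*  the BLACS context, CSRC the common source process, and LLDA, LLDB
*  the local leading dimensions actually allocated, with
*  LLDA >= 2*BWL+2*BWU+1 and LLDB >= NB as required above):
*
*     DESCA( 1 ) = 501
*     DESCA( 2 ) = ICTXT
*     DESCA( 3 ) = N
*     DESCA( 4 ) = NB
*     DESCA( 5 ) = CSRC
*     DESCA( 6 ) = LLDA
*     DESCA( 7 ) = 0
*
*     DESCB( 1 ) = 502
*     DESCB( 2 ) = ICTXT
*     DESCB( 3 ) = N
*     DESCB( 4 ) = NB
*     DESCB( 5 ) = CSRC
*     DESCB( 6 ) = LLDB
*     DESCB( 7 ) = 0
*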
361*
362* =====================================================================
363*
364* Implemented for ScaLAPACK by:
365* Andrew J. Cleary, Livermore National Lab and University of Tenn.,
366* and Markus Hegland, Australian National University. Feb., 1997.
367* Based on code written by: Peter Arbenz, ETH Zurich, 1996.
368*
369* =====================================================================
370*
371* .. Parameters ..
372 REAL ONE, ZERO
373 parameter( one = 1.0e+0 )
374 parameter( zero = 0.0e+0 )
375 COMPLEX CONE, CZERO
376 parameter( cone = ( 1.0e+0, 0.0e+0 ) )
377 parameter( czero = ( 0.0e+0, 0.0e+0 ) )
378 INTEGER INT_ONE
379 parameter( int_one = 1 )
380 INTEGER DESCMULT, BIGNUM
381 parameter( descmult = 100, bignum = descmult*descmult )
382 INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
383 $ lld_, mb_, m_, nb_, n_, rsrc_
384 parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
385 $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
386 $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
387* ..
388* .. Local Scalars ..
389 INTEGER APTR, BBPTR, BM, BMN, BN, BNN, BW, CSRC,
390 $ first_proc, ictxt, ictxt_new, ictxt_save,
391 $ idum2, idum3, j, ja_new, l, lbwl, lbwu, ldbb,
392 $ ldw, llda, lldb, lm, lmj, ln, lptr, mycol,
393 $ myrow, nb, neicol, np, npact, npcol, nprow,
394 $ npstr, np_save, odd_size, part_offset,
395 $ recovery_val, return_code, store_m_b,
396 $ store_n_a, work_size_min, wptr
397* ..
398* .. Local Arrays ..
399 INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ),
400 $ param_check( 17, 3 )
401* ..
402* .. External Subroutines ..
403 EXTERNAL blacs_gridinfo, desc_convert, globchk, pxerbla,
404 $ reshape
405* ..
406* .. External Functions ..
407 LOGICAL LSAME
408 INTEGER NUMROC
409 EXTERNAL lsame
410 EXTERNAL numroc
411* ..
412* .. Intrinsic Functions ..
413 INTRINSIC ichar, mod
414* ..
415* .. Executable Statements ..
416*
417*
418* Test the input parameters
419*
420 info = 0
421*
422* Convert descriptor into standard form for easy access to
423* parameters, check that grid is of right shape.
424*
425 desca_1xp( 1 ) = 501
426 descb_px1( 1 ) = 502
427*
428 CALL desc_convert( desca, desca_1xp, return_code )
429*
430 IF( return_code .NE. 0) THEN
431 info = -( 8*100 + 2 )
432 ENDIF
433*
434 CALL desc_convert( descb, descb_px1, return_code )
435*
436 IF( return_code .NE. 0) THEN
437 info = -( 11*100 + 2 )
438 ENDIF
439*
440* Consistency checks for DESCA and DESCB.
441*
442* Context must be the same
443 IF( desca_1xp( 2 ) .NE. descb_px1( 2 ) ) THEN
444 info = -( 11*100 + 2 )
445 ENDIF
446*
447* These are alignment restrictions that may or may not be removed
448* in future releases. -Andy Cleary, April 14, 1996.
449*
450* Block sizes must be the same
451 IF( desca_1xp( 4 ) .NE. descb_px1( 4 ) ) THEN
452 info = -( 11*100 + 4 )
453 ENDIF
454*
455* Source processor must be the same
456*
457 IF( desca_1xp( 5 ) .NE. descb_px1( 5 ) ) THEN
458 info = -( 11*100 + 5 )
459 ENDIF
460*
461* Get values out of descriptor for use in code.
462*
463 ictxt = desca_1xp( 2 )
464 csrc = desca_1xp( 5 )
465 nb = desca_1xp( 4 )
466 llda = desca_1xp( 6 )
467 store_n_a = desca_1xp( 3 )
468 lldb = descb_px1( 6 )
469 store_m_b = descb_px1( 3 )
470*
471* Get grid parameters
472*
473*
474 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
475 np = nprow * npcol
476*
477*
478*
479 IF( lsame( trans, 'N' ) ) THEN
480 idum2 = ichar( 'N' )
481 ELSE IF ( lsame( trans, 'C' ) ) THEN
482 idum2 = ichar( 'C' )
483 ELSE
484 info = -1
485 END IF
486*
487 IF( lwork .LT. -1) THEN
488 info = -16
489 ELSE IF ( lwork .EQ. -1 ) THEN
490 idum3 = -1
491 ELSE
492 idum3 = 1
493 ENDIF
494*
495 IF( n .LT. 0 ) THEN
496 info = -2
497 ENDIF
498*
499 IF( n+ja-1 .GT. store_n_a ) THEN
500 info = -( 8*100 + 6 )
501 ENDIF
502*
503 IF(( bwl .GT. n-1 ) .OR.
504 $ ( bwl .LT. 0 ) ) THEN
505 info = -3
506 ENDIF
507*
508 IF(( bwu .GT. n-1 ) .OR.
509 $ ( bwu .LT. 0 ) ) THEN
510 info = -4
511 ENDIF
512*
513 IF( llda .LT. (2*bwl+2*bwu+1) ) THEN
514 info = -( 8*100 + 6 )
515 ENDIF
516*
517 IF( nb .LE. 0 ) THEN
518 info = -( 8*100 + 4 )
519 ENDIF
520*
521 bw = bwu+bwl
522*
523 IF( n+ib-1 .GT. store_m_b ) THEN
524 info = -( 11*100 + 3 )
525 ENDIF
526*
527 IF( lldb .LT. nb ) THEN
528 info = -( 11*100 + 6 )
529 ENDIF
530*
531 IF( nrhs .LT. 0 ) THEN
532 info = -5
533 ENDIF
534*
535* Current alignment restriction
536*
537 IF( ja .NE. ib) THEN
538 info = -7
539 ENDIF
540*
541* Argument checking that is specific to Divide & Conquer routine
542*
543 IF( nprow .NE. 1 ) THEN
544 info = -( 8*100+2 )
545 ENDIF
546*
547 IF( n .GT. np*nb-mod( ja-1, nb )) THEN
548 info = -( 2 )
549 CALL pxerbla( ictxt,
550 $ 'PCGBTRS, D&C alg.: only 1 block per proc',
551 $ -info )
552 RETURN
553 ENDIF
554*
555 IF((ja+n-1.GT.nb) .AND. ( nb.LT.(bwl+bwu+1) )) THEN
556 info = -( 8*100+4 )
557 CALL pxerbla( ictxt,
558 $ 'PCGBTRS, D&C alg.: NB too small',
559 $ -info )
560 RETURN
561 ENDIF
562*
563*
564* Check worksize
565*
566 work_size_min = nrhs*(nb+2*bwl+4*bwu)
567*
568 work( 1 ) = work_size_min
569*
570 IF( lwork .LT. work_size_min ) THEN
571 IF( lwork .NE. -1 ) THEN
572 info = -16
573 CALL pxerbla( ictxt,
574 $ 'PCGBTRS: worksize error ',
575 $ -info )
576 ENDIF
577 RETURN
578 ENDIF
579*
580* Pack params and positions into arrays for global consistency check
581*
582 param_check( 17, 1 ) = descb(5)
583 param_check( 16, 1 ) = descb(4)
584 param_check( 15, 1 ) = descb(3)
585 param_check( 14, 1 ) = descb(2)
586 param_check( 13, 1 ) = descb(1)
587 param_check( 12, 1 ) = ib
588 param_check( 11, 1 ) = desca(5)
589 param_check( 10, 1 ) = desca(4)
590 param_check( 9, 1 ) = desca(3)
591 param_check( 8, 1 ) = desca(1)
592 param_check( 7, 1 ) = ja
593 param_check( 6, 1 ) = nrhs
594 param_check( 5, 1 ) = bwu
595 param_check( 4, 1 ) = bwl
596 param_check( 3, 1 ) = n
597 param_check( 2, 1 ) = idum3
598 param_check( 1, 1 ) = idum2
599*
600 param_check( 17, 2 ) = 1105
601 param_check( 16, 2 ) = 1104
602 param_check( 15, 2 ) = 1103
603 param_check( 14, 2 ) = 1102
604 param_check( 13, 2 ) = 1101
605 param_check( 12, 2 ) = 10
606 param_check( 11, 2 ) = 805
607 param_check( 10, 2 ) = 804
608 param_check( 9, 2 ) = 803
609 param_check( 8, 2 ) = 801
610 param_check( 7, 2 ) = 7
611 param_check( 6, 2 ) = 5
612 param_check( 5, 2 ) = 4
613 param_check( 4, 2 ) = 3
614 param_check( 3, 2 ) = 2
615 param_check( 2, 2 ) = 16
616 param_check( 1, 2 ) = 1
617*
618* Want to find errors with MIN( ), so if no error, set it to a big
619* number. If there already is an error, multiply by the
620* descriptor multiplier.
621*
622 IF( info.GE.0 ) THEN
623 info = bignum
624 ELSE IF( info.LT.-descmult ) THEN
625 info = -info
626 ELSE
627 info = -info * descmult
628 END IF
629*
630* Check consistency across processors
631*
632 CALL globchk( ictxt, 17, param_check, 17,
633 $ param_check( 1, 3 ), info )
634*
635* Prepare output: set info = 0 if no error, and divide by DESCMULT
636* if error is not in a descriptor entry.
637*
638 IF( info.EQ.bignum ) THEN
639 info = 0
640 ELSE IF( mod( info, descmult ) .EQ. 0 ) THEN
641 info = -info / descmult
642 ELSE
643 info = -info
644 END IF
645*
646 IF( info.LT.0 ) THEN
647 CALL pxerbla( ictxt, 'PCGBTRS', -info )
648 RETURN
649 END IF
650*
651* Quick return if possible
652*
653 IF( n.EQ.0 )
654 $ RETURN
655*
656 IF( nrhs.EQ.0 )
657 $ RETURN
658*
659*
660* Adjust addressing into matrix space to properly get into
661* the beginning part of the relevant data
662*
663 part_offset = nb*( (ja-1)/(npcol*nb) )
664*
665 IF ( (mycol-csrc) .LT. (ja-part_offset-1)/nb ) THEN
666 part_offset = part_offset + nb
667 ENDIF
668*
669 IF ( mycol .LT. csrc ) THEN
670 part_offset = part_offset - nb
671 ENDIF
672*
673* Form a new BLACS grid (the "standard form" grid) with only procs
674* holding part of the matrix, of size 1xNP where NP is adjusted,
675* starting at csrc=0, with JA modified to reflect dropped procs.
676*
677* First processor to hold part of the matrix:
678*
679 first_proc = mod( ( ja-1 )/nb+csrc, npcol )
680*
681* Calculate the new value of JA while dropping off unused processors.
682*
683 ja_new = mod( ja-1, nb ) + 1
684*
685* Save and compute new value of NP
686*
687 np_save = np
688 np = ( ja_new+n-2 )/nb + 1
689*
690* Call utility routine that forms "standard-form" grid
691*
692 CALL reshape( ictxt, int_one, ictxt_new, int_one,
693 $ first_proc, int_one, np )
694*
695* Use new context from standard grid as context.
696*
697 ictxt_save = ictxt
698 ictxt = ictxt_new
699 desca_1xp( 2 ) = ictxt_new
700 descb_px1( 2 ) = ictxt_new
701*
702* Get information about new grid.
703*
704 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
705*
706* Drop out processors that do not have part of the matrix.
707*
708 IF( myrow .LT. 0 ) THEN
709 GOTO 1234
710 ENDIF
711*
712*
713*
714* Begin main code
715*
716* Move data into workspace - communicate/copy (overlap)
717*
718 IF (mycol .LT. npcol-1) THEN
719 CALL cgesd2d( ictxt, bwu, nrhs, b(nb-bwu+1), lldb,
720 $ 0, mycol + 1)
721 ENDIF
722*
723 IF (mycol .LT. npcol-1) THEN
724 lm = nb-bwu
725 ELSE
726 lm = nb
727 ENDIF
728*
729 IF (mycol .GT. 0) THEN
730 wptr = bwu+1
731 ELSE
732 wptr = 1
733 ENDIF
734*
735 ldw = nb+bwu + 2*bw+bwu
736*
737 CALL clamov( 'G', lm, nrhs, b(1), lldb, work( wptr ), ldw )
738*
739* Zero out rest of work
740*
741 DO 1501 j=1, nrhs
742 DO 1502 l=wptr+lm, ldw
743 work( (j-1)*ldw+l ) = czero
744 1502 CONTINUE
745 1501 CONTINUE
746*
747 IF (mycol .GT. 0) THEN
748 CALL cgerv2d( ictxt, bwu, nrhs, work(1), ldw,
749 $ 0, mycol-1)
750 ENDIF
751*
752********************************************************************
753* PHASE 1: Local computation phase -- Solve L*X = B
754********************************************************************
755*
756* Size of main (or odd) partition in each processor
757*
758 odd_size = numroc( n, nb, mycol, 0, npcol )
759*
760 IF (mycol .NE. 0) THEN
761 lbwl = bw
762 lbwu = 0
763 aptr = 1
764 ELSE
765 lbwl = bwl
766 lbwu = bwu
767 aptr = 1+bwu
768 ENDIF
769*
770 IF (mycol .NE. npcol-1) THEN
771 lm = nb - lbwu
772 ln = nb - bw
773 ELSE IF (mycol .NE. 0) THEN
774 lm = odd_size + bwu
775 ln = max(odd_size-bw,0)
776 ELSE
777 lm = n
778 ln = max( n-bw, 0 )
779 ENDIF
780*
781 DO 21 j = 1, ln
782*
783 lmj = min(lbwl,lm-j)
784 l = ipiv( j )
785*
786 IF( l.NE.j ) THEN
787 CALL cswap(nrhs, work(l), ldw, work(j), ldw)
788 ENDIF
789*
790 lptr = bw+1 + (j-1)*llda + aptr
791*
792 CALL cgeru(lmj,nrhs,-cone, a(lptr),1, work(j),ldw,
793 $ work(j+1),ldw)
794*
795 21 CONTINUE
796*
797********************************************************************
798* PHASE 2: Global computation phase -- Solve L*X = B
799********************************************************************
800*
801* Define the initial dimensions of the diagonal blocks
802* The offdiagonal blocks (for MYCOL > 0) are of size BM by BW
803*
804 IF (mycol .NE. npcol-1) THEN
805 bm = bw - lbwu
806 bn = bw
807 ELSE
808 bm = min(bw,odd_size) + bwu
809 bn = min(bw,odd_size)
810 ENDIF
811*
812* Pointer to first element of block bidiagonal matrix in AF
813* Leading dimension of block bidiagonal system
814*
815 bbptr = (nb+bwu)*bw + 1
816 ldbb = 2*bw + bwu
817*
818 IF (npcol.EQ.1) THEN
819*
820* In this case the loop over the levels will not be
821* performed.
822 CALL cgetrs( 'N', n-ln, nrhs, af(bbptr+bw*ldbb), ldbb,
823 $ ipiv(ln+1), work( ln+1 ), ldw, info)
824*
825 ENDIF
826*
827* Loop over levels ...
828*
829* The two integers NPACT (number of active processors) and NPSTR
830* (stride between active processors) are used to control the
831* loop.
832*
833 npact = npcol
834 npstr = 1
835*
836* Begin loop over levels
837 200 IF (npact .LE. 1) GOTO 300
838*
839* Test if processor is active
840 IF (mod(mycol,npstr) .EQ. 0) THEN
841*
842* Send/Receive blocks
843*
844 IF (mod(mycol,2*npstr) .EQ. 0) THEN
845*
846 neicol = mycol + npstr
847*
848 IF (neicol/npstr .LE. npact-1) THEN
849*
850 IF (neicol/npstr .LT. npact-1) THEN
851 bmn = bw
852 ELSE
853 bmn = min(bw,numroc(n, nb, neicol, 0, npcol))+bwu
854 ENDIF
855*
856 CALL cgesd2d( ictxt, bm, nrhs,
857 $ work(ln+1), ldw, 0, neicol )
858*
859 IF( npact .NE. 2 )THEN
860*
861* Receive answers back from partner processor
862*
863 CALL cgerv2d(ictxt, bm+bmn-bw, nrhs,
864 $ work( ln+1 ), ldw, 0, neicol )
865*
866 bm = bm+bmn-bw
867*
868 ENDIF
869*
870 ENDIF
871*
872 ELSE
873*
874 neicol = mycol - npstr
875*
876 IF (neicol .EQ. 0) THEN
877 bmn = bw - bwu
878 ELSE
879 bmn = bw
880 ENDIF
881*
882 CALL clamov( 'G', bm, nrhs, work(ln+1), ldw,
883 $ work(nb+bwu+bmn+1), ldw )
884*
885 CALL cgerv2d( ictxt, bmn, nrhs, work( nb+bwu+1 ),
886 $ ldw, 0, neicol )
887*
888* and do the permutations and eliminations
889*
890 IF (npact .NE. 2) THEN
891*
892* Solve locally for BW variables
893*
894 CALL claswp( nrhs, work(nb+bwu+1), ldw, 1, bw,
895 $ ipiv(ln+1), 1)
896*
897 CALL ctrsm('L','L','N','U', bw, nrhs, cone,
898 $ af(bbptr+bw*ldbb), ldbb, work(nb+bwu+1), ldw)
899*
900* Use soln just calculated to update RHS
901*
902 CALL cgemm( 'N', 'N', bm+bmn-bw, nrhs, bw,
903 $ -cone, af(bbptr+bw*ldbb+bw), ldbb,
904 $ work(nb+bwu+1), ldw,
905 $ cone, work(nb+bwu+1+bw), ldw )
906*
907* Give answers back to partner processor
908*
909 CALL cgesd2d( ictxt, bm+bmn-bw, nrhs,
910 $ work(nb+bwu+1+bw), ldw, 0, neicol )
911*
912 ELSE
913*
914* Finish up calculations for final level
915*
916 CALL claswp( nrhs, work(nb+bwu+1), ldw, 1, bm+bmn,
917 $ ipiv(ln+1), 1)
918*
919 CALL ctrsm('L','L','N','U', bm+bmn, nrhs, cone,
920 $ af(bbptr+bw*ldbb), ldbb, work(nb+bwu+1), ldw)
921 ENDIF
922*
923 ENDIF
924*
925 npact = (npact + 1)/2
926 npstr = npstr * 2
927 GOTO 200
928*
929 ENDIF
930*
931 300 CONTINUE
932*
933*
934**************************************
935* BACKSOLVE
936********************************************************************
937* PHASE 2: Global computation phase -- Solve U*Y = X
938********************************************************************
939*
940 IF (npcol.EQ.1) THEN
941*
942* In this case the loop over the levels will not be
943* performed.
944* In fact, the backsolve portion was done in the call to
945* CGETRS in the frontsolve.
946*
947 ENDIF
948*
949* Compute variable needed to reverse loop structure in
950* reduced system.
951*
952 recovery_val = npact*npstr - npcol
953*
954* Loop over levels
955* Terminal values of NPACT and NPSTR from frontsolve are used
956*
957 2200 IF( npact .GE. npcol ) GOTO 2300
958*
959 npstr = npstr/2
960*
961 npact = npact*2
962*
963* Have to adjust npact for non-power-of-2
964*
965 npact = npact-mod( (recovery_val/npstr), 2 )
966*
967* Find size of submatrix in this proc at this level
968*
969 IF( mycol/npstr .LT. npact-1 ) THEN
970 bn = bw
971 ELSE
972 bn = min(bw, numroc(n, nb, npcol-1, 0, npcol) )
973 ENDIF
974*
975* If this processor is even in this level...
976*
977 IF( mod( mycol, 2*npstr ) .EQ. 0 ) THEN
978*
979 neicol = mycol+npstr
980*
981 IF( neicol/npstr .LE. npact-1 ) THEN
982*
983 IF( neicol/npstr .LT. npact-1 ) THEN
984 bmn = bw
985 bnn = bw
986 ELSE
987 bmn = min(bw,numroc(n, nb, neicol, 0, npcol))+bwu
988 bnn = min(bw, numroc(n, nb, neicol, 0, npcol) )
989 ENDIF
990*
991 IF( npact .GT. 2 ) THEN
992*
993 CALL cgesd2d( ictxt, 2*bw, nrhs, work( ln+1 ),
994 $ ldw, 0, neicol )
995*
996 CALL cgerv2d( ictxt, bw, nrhs, work( ln+1 ),
997 $ ldw, 0, neicol )
998*
999 ELSE
1000*
1001 CALL cgerv2d( ictxt, bw, nrhs, work( ln+1 ),
1002 $ ldw, 0, neicol )
1003*
1004 ENDIF
1005*
1006 ENDIF
1007*
1008 ELSE
1009* This processor is odd on this level
1010*
1011 neicol = mycol - npstr
1012*
1013 IF (neicol .EQ. 0) THEN
1014 bmn = bw - bwu
1015 ELSE
1016 bmn = bw
1017 ENDIF
1018*
1019 IF( neicol .LT. npcol-1 ) THEN
1020 bnn = bw
1021 ELSE
1022 bnn = min(bw, numroc(n, nb, neicol, 0, npcol) )
1023 ENDIF
1024*
1025 IF( npact .GT. 2 ) THEN
1026*
1027* Move RHS to make room for received solutions
1028*
1029 CALL clamov( 'G', bw, nrhs, work(nb+bwu+1),
1030 $ ldw, work(nb+bwu+bw+1), ldw )
1031*
1032 CALL cgerv2d( ictxt, 2*bw, nrhs, work( ln+1 ),
1033 $ ldw, 0, neicol )
1034*
1035 CALL cgemm( 'N', 'N', bw, nrhs, bn,
1036 $ -cone, af(bbptr), ldbb,
1037 $ work(ln+1), ldw,
1038 $ cone, work(nb+bwu+bw+1), ldw )
1039*
1040*
1041 IF( mycol .GT. npstr ) THEN
1042*
1043 CALL cgemm( 'N', 'N', bw, nrhs, bw,
1044 $ -cone, af(bbptr+2*bw*ldbb), ldbb,
1045 $ work(ln+bw+1), ldw,
1046 $ cone, work(nb+bwu+bw+1), ldw )
1047*
1048 ENDIF
1049*
1050 CALL ctrsm('L','U','N','N', bw, nrhs, cone,
1051 $ af(bbptr+bw*ldbb), ldbb, work(nb+bwu+bw+1), ldw)
1052*
1053* Send new solution to neighbor
1054*
1055 CALL cgesd2d( ictxt, bw, nrhs,
1056 $ work( nb+bwu+bw+1 ), ldw, 0, neicol )
1057*
1058* Copy new solution into expected place
1059*
1060 CALL clamov( 'G', bw, nrhs, work(nb+bwu+1+bw),
1061 $ ldw, work(ln+bw+1), ldw )
1062*
1063 ELSE
1064*
1065* Solve with local diagonal block
1066*
1067 CALL ctrsm( 'L','U','N','N', bn+bnn, nrhs, cone,
1068 $ af(bbptr+bw*ldbb), ldbb, work(nb+bwu+1), ldw)
1069*
1070* Send new solution to neighbor
1071*
1072 CALL cgesd2d( ictxt, bw, nrhs,
1073 $ work(nb+bwu+1), ldw, 0, neicol )
1074*
1075* Shift solutions into expected positions
1076*
1077 CALL clamov( 'G', bnn+bn-bw, nrhs, work(nb+bwu+1+bw),
1078 $ ldw, work(ln+1), ldw )
1079*
1080*
1081 IF( (nb+bwu+1) .NE. (ln+1+bw) ) THEN
1082*
1083* Copy one row at a time since spaces may overlap
1084*
1085 DO 1064 j=1, bw
1086 CALL ccopy( nrhs, work(nb+bwu+j), ldw,
1087 $ work(ln+bw+j), ldw )
1088 1064 CONTINUE
1089*
1090 ENDIF
1091*
1092 ENDIF
1093*
1094 ENDIF
1095*
1096 GOTO 2200
1097*
1098 2300 CONTINUE
1099* End of loop over levels
1100*
1101********************************************************************
1102* PHASE 1: (Almost) Local computation phase -- Solve U*Y = X
1103********************************************************************
1104*
1105* Reset BM to value it had before reduced system frontsolve...
1106*
1107 IF (mycol .NE. npcol-1) THEN
1108 bm = bw - lbwu
1109 ELSE
1110 bm = min(bw,odd_size) + bwu
1111 ENDIF
1112*
1113* First metastep is to account for the fillin blocks AF
1114*
1115 IF( mycol .LT. npcol-1 ) THEN
1116*
1117 CALL cgesd2d( ictxt, bw, nrhs, work( nb-bw+1 ),
1118 $ ldw, 0, mycol+1 )
1119*
1120 ENDIF
1121*
1122 IF( mycol .GT. 0 ) THEN
1123*
1124 CALL cgerv2d( ictxt, bw, nrhs, work( nb+bwu+1 ),
1125 $ ldw, 0, mycol-1 )
1126*
1127* Modify local right hand sides with received rhs's
1128*
1129 CALL cgemm( 'N', 'N', lm-bm, nrhs, bw, -cone,
1130 $ af( 1 ), lm, work( nb+bwu+1 ), ldw, cone,
1131 $ work( 1 ), ldw )
1132*
1133 ENDIF
1134*
1135 DO 2021 j = ln, 1, -1
1136*
1137 lmj = min( bw, odd_size-1 )
1138*
1139 lptr = bw-1+j*llda+aptr
1140*
1141* In the following, the TRANS=T option is used to reverse
1142* the order of multiplication, not as a true transpose
1143*
1144 CALL cgemv( 'T', lmj, nrhs, -cone, work( j+1), ldw,
1145 $ a( lptr ), llda-1, cone, work( j ), ldw )
1146*
1147* Divide by diagonal element
1148*
1149 CALL cscal( nrhs, cone/a( lptr-llda+1 ),
1150 $ work( j ), ldw )
1151 2021 CONTINUE
1152*
1153*
1154*
1155 CALL clamov( 'G', odd_size, nrhs, work( 1 ), ldw,
1156 $ b( 1 ), lldb )
1157*
1158* Free BLACS space used to hold standard-form grid.
1159*
1160 ictxt = ictxt_save
1161 IF( ictxt .NE. ictxt_new ) THEN
1162 CALL blacs_gridexit( ictxt_new )
1163 ENDIF
1164*
1165 1234 CONTINUE
1166*
1167* Restore saved input parameters
1168*
1169 np = np_save
1170*
1171* Output worksize
1172*
1173 work( 1 ) = work_size_min
1174*
1175 RETURN
1176*
1177* End of PCGBTRS
1178*
1179 END