pcdbtrsv.f
1 SUBROUTINE pcdbtrsv( UPLO, TRANS, N, BWL, BWU, NRHS, A, JA, DESCA,
2 $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
3*
4* -- ScaLAPACK routine (version 2.0.2) --
5* Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6* May 1 2012
7*
8* .. Scalar Arguments ..
9 CHARACTER TRANS, UPLO
10 INTEGER BWL, BWU, IB, INFO, JA, LAF, LWORK, N, NRHS
11* ..
12* .. Array Arguments ..
13 INTEGER DESCA( * ), DESCB( * )
14 COMPLEX A( * ), AF( * ), B( * ), WORK( * )
15* ..
16*
17*
18* Purpose
19* =======
20*
21* PCDBTRSV solves a banded triangular system of linear equations
22*
23* A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
24* or
25* A(1:N, JA:JA+N-1)^H * X = B(IB:IB+N-1, 1:NRHS)
26*
27* where A(1:N, JA:JA+N-1) is a banded
28* triangular matrix factor produced by the
29* Gaussian elimination code PCDBTRF
30* and is stored in A(1:N,JA:JA+N-1) and AF.
31* The matrix stored in A(1:N, JA:JA+N-1) is either
32* upper or lower triangular according to UPLO,
33* and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^H
34* is dictated by the user via the parameter TRANS.
35*
36* Routine PCDBTRF MUST be called first.
37*
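* As an illustration only (not part of this routine), a minimal
* calling sketch, assuming the BLACS grid, the descriptors DESCA and
* DESCB, and the workspaces AF and WORK have been set up as described
* under Arguments below, is:
*
*    CALL PCDBTRF( N, BWL, BWU, A, JA, DESCA, AF, LAF,
*   $              WORK, LWORK, INFO )
*    IF( INFO.EQ.0 ) THEN
*       CALL PCDBTRSV( 'L', 'N', N, BWL, BWU, NRHS, A, JA, DESCA,
*   $                  B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
*       CALL PCDBTRSV( 'U', 'N', N, BWL, BWU, NRHS, A, JA, DESCA,
*   $                  B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
*    END IF
*
* Most users will instead call the driver PCDBTRS, which performs
* this pair of triangular solves.
*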
38* =====================================================================
39*
40* Arguments
41* =========
42*
43* UPLO (global input) CHARACTER
44* = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored;
45* = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored.
46*
47* TRANS (global input) CHARACTER
48* = 'N': Solve with A(1:N, JA:JA+N-1);
49* = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) );
50*
51* N (global input) INTEGER
52* The number of rows and columns to be operated on, i.e. the
53* order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
54*
55* BWL (global input) INTEGER
56* Number of subdiagonals. 0 <= BWL <= N-1
57*
58* BWU (global input) INTEGER
59* Number of superdiagonals. 0 <= BWU <= N-1
60*
61* NRHS (global input) INTEGER
62* The number of right hand sides, i.e., the number of columns
63* of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
64* NRHS >= 0.
65*
66* A (local input/local output) COMPLEX pointer into
67* local memory to an array with first dimension
68* LLD_A >=(bwl+bwu+1) (stored in DESCA).
69* On entry, this array contains the local pieces of the
70* N-by-N unsymmetric banded distributed triangular factor
71* (L or U) of A(1:N, JA:JA+N-1) produced by PCDBTRF.
72* This local portion is stored in the packed banded format
73* used in LAPACK. Please see the Notes below and the
74* ScaLAPACK manual for more detail on the format of
75* distributed matrices.
76*
77* JA (global input) INTEGER
78* The index in the global array A that points to the start of
79* the matrix to be operated on (which may be either all of A
80* or a submatrix of A).
81*
82* DESCA (global and local input) INTEGER array of dimension DLEN.
83* if 1D type (DTYPE_A=501), DLEN >= 7;
84* if 2D type (DTYPE_A=1), DLEN >= 9 .
85* The array descriptor for the distributed matrix A.
86* Contains information of mapping of A to memory. Please
87* see NOTES below for full description and options.
88*
89* B (local input/local output) COMPLEX pointer into
90* local memory to an array of local lead dimension lld_b>=NB.
91* On entry, this array contains the
92* local pieces of the right hand sides
93* B(IB:IB+N-1, 1:NRHS).
94* On exit, this contains the local piece of the distributed
95* solution matrix X.
96*
97* IB (global input) INTEGER
98* The row index in the global array B that points to the first
99* row of the matrix to be operated on (which may be either
100* all of B or a submatrix of B).
101*
102* DESCB (global and local input) INTEGER array of dimension DLEN.
103* if 1D type (DTYPE_B=502), DLEN >=7;
104* if 2D type (DTYPE_B=1), DLEN >= 9.
105* The array descriptor for the distributed matrix B.
106* Contains information of mapping of B to memory. Please
107* see NOTES below for full description and options.
108*
109* AF (local input) COMPLEX array, dimension LAF.
110* Auxiliary Fillin Space.
111* Fillin is created during the factorization routine
112* PCDBTRF and this is stored in AF. If a linear system
113* is to be solved using PCDBTRS after the factorization
114* routine, AF *must not be altered* after the factorization.
115*
116* LAF (local input) INTEGER
117* Size of user-input Auxiliary Fillin space AF. Must be >=
118* NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu)
119* If LAF is not large enough, an error code will be returned
120* and the minimum acceptable size will be returned in AF( 1 )
121*
122* WORK (local workspace/local output)
123* COMPLEX temporary workspace. This space may
124* be overwritten in between calls to routines. WORK must be
125* the size given in LWORK.
126* On exit, WORK( 1 ) contains the minimal LWORK.
127*
128* LWORK (local input or global input) INTEGER
129* Size of user-input workspace WORK.
130* If LWORK is too small, the minimal acceptable size will be
131* returned in WORK(1) and an error code is returned. LWORK>=
132* (max(bwl,bwu)*NRHS)
133*
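*          As an illustrative sketch only, the minimum sizes implied
*          above can be computed in the calling program as
*
*             LAF_MIN   = NB*( BWL+BWU ) + 6*MAX( BWL, BWU )**2
*             LWORK_MIN = MAX( BWL, BWU )*NRHS
*
*          where NB is the blocking factor stored in the descriptor
*          and LAF_MIN, LWORK_MIN are illustrative local variables.
*          Calling this routine with LWORK = -1 performs a workspace
*          query: the minimal LWORK is returned in WORK( 1 ) and no
*          solve is performed.
*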
134* INFO (global output) INTEGER
135* = 0: successful exit
136* < 0: If the i-th argument is an array and the j-th entry had
137* an illegal value, then INFO = -(i*100+j); if the i-th
138* argument is a scalar and had an illegal value, then
139* INFO = -i.
140*
141* =====================================================================
142*
143*
144* Restrictions
145* ============
146*
147* The following are restrictions on the input parameters. Some of these
148* are temporary and will be removed in future releases, while others
149* may reflect fundamental technical limitations.
150*
151* Non-cyclic restriction: VERY IMPORTANT!
152* P*NB>= mod(JA-1,NB)+N.
153* The mapping for matrices must be blocked, reflecting the nature
154* of the divide and conquer algorithm as a task-parallel algorithm.
155* This formula in words is: no processor may have more than one
156* chunk of the matrix.
157*
158* Blocksize cannot be too small:
159* If the matrix spans more than one processor, the following
160* restriction on NB, the size of each block on each processor,
161* must hold:
162* NB >= 2*MAX(BWL,BWU)
163* The bulk of parallel computation is done on the matrix of size
164* O(NB) on each processor. If this is too small, divide and conquer
165* is a poor choice of algorithm.
166*
167* Submatrix reference:
168* JA = IB
169* Alignment restriction that prevents unnecessary communication.
170*
171*
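* As a sketch only, a calling program can verify these restrictions
* before the factorization with a test such as (P the number of
* processes in the one-dimensional grid, NB the blocking factor in
* DESCA, OK an illustrative LOGICAL variable):
*
*    OK = ( P*NB .GE. MOD( JA-1, NB )+N ) .AND.
*   $     ( JA .EQ. IB ) .AND.
*   $     ( ( JA+N-1 .LE. NB ) .OR. ( NB .GE. 2*MAX( BWL, BWU ) ) )
*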
172* =====================================================================
173*
174*
175* Notes
176* =====
177*
178* If the factorization routine and the solve routine are to be called
179* separately (to solve various sets of righthand sides using the same
180* coefficient matrix), the auxiliary space AF *must not be altered*
181* between calls to the factorization routine and the solve routine.
182*
183* The best algorithm for solving banded and tridiagonal linear systems
184* depends on a variety of parameters, especially the bandwidth.
185* Currently, only algorithms designed for the case N/P >> bw are
186* implemented. These go by many names, including Divide and Conquer,
187* Partitioning, domain decomposition-type, etc.
188*
189* Algorithm description: Divide and Conquer
190*
191* The Divide and Conquer algorithm assumes the matrix is narrowly
192* banded compared with the number of equations. In this situation,
193* it is best to distribute the input matrix A one-dimensionally,
194* with columns atomic and rows divided amongst the processes.
195* The basic algorithm divides the banded matrix up into
196* P pieces with one stored on each processor,
197* and then proceeds in 2 phases for the factorization or 3 for the
198* solution of a linear system.
199* 1) Local Phase:
200* The individual pieces are factored independently and in
201* parallel. These factors are applied to the matrix creating
202* fillin, which is stored in a non-inspectable way in auxiliary
203* space AF. Mathematically, this is equivalent to reordering
204* the matrix A as P A P^T and then factoring the principal
205* leading submatrix of size equal to the sum of the sizes of
206* the matrices factored on each processor. The factors of
207* these submatrices overwrite the corresponding parts of A
208* in memory.
209* 2) Reduced System Phase:
210* A small (max(bwl,bwu)* (P-1)) system is formed representing
211* interaction of the larger blocks, and is stored (as are its
212* factors) in the space AF. A parallel Block Cyclic Reduction
213* algorithm is used. For a linear system, a parallel front solve
214* followed by an analogous backsolve, both using the structure
215* of the factored matrix, are performed.
216* 3) Backsubstitution Phase:
217* For a linear system, a local backsubstitution is performed on
218* each processor in parallel.
219*
220*
221* Descriptors
222* ===========
223*
224* Descriptors now have *types* and differ from ScaLAPACK 1.0.
225*
226* Note: banded codes can use either the old two dimensional
227* or new one-dimensional descriptors, though the processor grid in
228* both cases *must be one-dimensional*. We describe both types below.
229*
230* Each global data object is described by an associated description
231* vector. This vector stores the information required to establish
232* the mapping between an object element and its corresponding process
233* and memory location.
234*
235* Let A be a generic term for any 2D block cyclically distributed array.
236* Such a global array has an associated description vector DESCA.
237* In the following comments, the character _ should be read as
238* "of the global array".
239*
240* NOTATION STORED IN EXPLANATION
241* --------------- -------------- --------------------------------------
242* DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
243* DTYPE_A = 1.
244* CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
245* the BLACS process grid A is distribu-
246* ted over. The context itself is glo-
247* bal, but the handle (the integer
248* value) may vary.
249* M_A (global) DESCA( M_ ) The number of rows in the global
250* array A.
251* N_A (global) DESCA( N_ ) The number of columns in the global
252* array A.
253* MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
254* the rows of the array.
255* NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
256* the columns of the array.
257* RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
258* row of the array A is distributed.
259* CSRC_A (global) DESCA( CSRC_ ) The process column over which the
260* first column of the array A is
261* distributed.
262* LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
263* array. LLD_A >= MAX(1,LOCr(M_A)).
264*
265* Let K be the number of rows or columns of a distributed matrix,
266* and assume that its process grid has dimension p x q.
267* LOCr( K ) denotes the number of elements of K that a process
268* would receive if K were distributed over the p processes of its
269* process column.
270* Similarly, LOCc( K ) denotes the number of elements of K that a
271* process would receive if K were distributed over the q processes of
272* its process row.
273* The values of LOCr() and LOCc() may be determined via a call to the
274* ScaLAPACK tool function, NUMROC:
275* LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
276* LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
277* An upper bound for these quantities may be computed by:
278* LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
279* LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
280*
281*
282* One-dimensional descriptors:
283*
284* One-dimensional descriptors are a new addition to ScaLAPACK since
285* version 1.0. They simplify and shorten the descriptor for 1D
286* arrays.
287*
288* Since ScaLAPACK supports two-dimensional arrays as the fundamental
289* object, we allow 1D arrays to be distributed either over the
290* first dimension of the array (as if the grid were P-by-1) or the
291* 2nd dimension (as if the grid were 1-by-P). This choice is
292* indicated by the descriptor type (501 or 502)
293* as described below.
294*
295* IMPORTANT NOTE: the actual BLACS grid represented by the
296* CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
297* irrespective of which one-dimensional descriptor type
298* (501 or 502) is input.
299* This routine will interpret the grid properly either way.
300* ScaLAPACK routines *do not support intercontext operations* so that
301* the grid passed to a single ScaLAPACK routine *must be the same*
302* for all array descriptors passed to that routine.
303*
304* NOTE: In all cases where 1D descriptors are used, 2D descriptors
305* may also be used, since a one-dimensional array is a special case
306* of a two-dimensional array with one dimension of size unity.
307* The two-dimensional array used in this case *must* be of the
308* proper orientation:
309* If the appropriate one-dimensional descriptor is DTYPEA=501
310* (1 by P type), then the two dimensional descriptor must
311* have a CTXT value that refers to a 1 by P BLACS grid;
312* If the appropriate one-dimensional descriptor is DTYPEA=502
313* (P by 1 type), then the two dimensional descriptor must
314* have a CTXT value that refers to a P by 1 BLACS grid.
315*
316*
317* Summary of allowed descriptors, types, and BLACS grids:
318* DTYPE 501 502 1 1
319* BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
320* -----------------------------------------------------
321* A OK NO OK NO
322* B NO OK NO OK
323*
324* Note that a consequence of this chart is that it is not possible
325* for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
326* to opposite requirements for the orientation of the BLACS grid,
327* and as noted before, the *same* BLACS context must be used in
328* all descriptors in a single ScaLAPACK subroutine call.
329*
330* Let A be a generic term for any 1D block cyclically distributed array.
331* Such a global array has an associated description vector DESCA.
332* In the following comments, the character _ should be read as
333* "of the global array".
334*
335* NOTATION STORED IN EXPLANATION
336* --------------- ---------- ------------------------------------------
337* DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
338* TYPE_A = 501: 1-by-P grid.
339* TYPE_A = 502: P-by-1 grid.
340* CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
341* the BLACS process grid A is distribu-
342* ted over. The context itself is glo-
343* bal, but the handle (the integer
344* value) may vary.
345* N_A (global) DESCA( 3 ) The size of the array dimension being
346* distributed.
347* NB_A (global) DESCA( 4 ) The blocking factor used to distribute
348* the distributed dimension of the array.
349* SRC_A (global) DESCA( 5 ) The process row or column over which the
350* first row or column of the array
351* is distributed.
352* LLD_A (local) DESCA( 6 ) The leading dimension of the local array
353* storing the local blocks of the distri-
354* buted array A. Minimum value of LLD_A
355* depends on TYPE_A.
356* TYPE_A = 501: LLD_A >=
357* MAX( size of undistributed dimension, 1 ).
358* TYPE_A = 502: LLD_A >= MAX( NB_A, 1 ).
359* Reserved DESCA( 7 ) Reserved for future use.
360*
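* As an illustration only, a type-501 descriptor for the banded
* matrix A and a type-502 descriptor for the right hand sides B
* could be filled as follows (assuming JA = IB = 1, with ICTXT a
* BLACS context handle for a one-dimensional grid, NB the chosen
* blocking factor, and LLD_B >= NB the local leading dimension of B):
*
*    DESCA( 1 ) = 501
*    DESCA( 2 ) = ICTXT
*    DESCA( 3 ) = N
*    DESCA( 4 ) = NB
*    DESCA( 5 ) = 0
*    DESCA( 6 ) = BWL+BWU+1
*    DESCA( 7 ) = 0
*
*    DESCB( 1 ) = 502
*    DESCB( 2 ) = ICTXT
*    DESCB( 3 ) = N
*    DESCB( 4 ) = NB
*    DESCB( 5 ) = 0
*    DESCB( 6 ) = LLD_B
*    DESCB( 7 ) = 0
*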
361*
362*
363* =====================================================================
364*
365* Code Developer: Andrew J. Cleary, University of Tennessee.
366* Current address: Lawrence Livermore National Labs.
367* This version released: August, 2001.
368*
369* =====================================================================
370*
371* ..
372* .. Parameters ..
373 REAL ONE, ZERO
374 parameter( one = 1.0e+0 )
375 parameter( zero = 0.0e+0 )
376 COMPLEX CONE, CZERO
377 parameter( cone = ( 1.0e+0, 0.0e+0 ) )
378 parameter( czero = ( 0.0e+0, 0.0e+0 ) )
379 INTEGER INT_ONE
380 parameter( int_one = 1 )
381 INTEGER DESCMULT, BIGNUM
382 parameter(descmult = 100, bignum = descmult * descmult)
383 INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
384 $ lld_, mb_, m_, nb_, n_, rsrc_
385 parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
386 $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
387 $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
388* ..
389* .. Local Scalars ..
390 INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE,
391 $ idum1, idum2, idum3, ja_new, level_dist, llda,
392 $ lldb, max_bw, mbw2, mycol, myrow, my_num_cols,
393 $ nb, np, npcol, nprow, np_save, odd_size, ofst,
394 $ part_offset, part_size, return_code, store_m_b,
395 $ store_n_a, work_size_min, work_u
396* ..
397* .. Local Arrays ..
398 INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ),
399 $ param_check( 18, 3 )
400* ..
401* .. External Subroutines ..
402 EXTERNAL blacs_get, blacs_gridexit, blacs_gridinfo,
403 $ cgemm, cgerv2d, cgesd2d, clamov, cmatadd,
404 $ ctbtrs, ctrmm, ctrtrs, desc_convert, globchk,
405 $ pxerbla, reshape
406* ..
407* .. External Functions ..
408 LOGICAL LSAME
409 INTEGER NUMROC
410 EXTERNAL lsame, numroc
411* ..
412* .. Intrinsic Functions ..
413 INTRINSIC ichar, min, mod
414* ..
415* .. Executable Statements ..
416*
417* Test the input parameters
418*
419 info = 0
420*
421* Convert descriptor into standard form for easy access to
422* parameters, check that grid is of right shape.
423*
424 desca_1xp( 1 ) = 501
425 descb_px1( 1 ) = 502
426*
427 CALL desc_convert( desca, desca_1xp, return_code )
428*
429 IF( return_code .NE. 0) THEN
430 info = -( 9*100 + 2 )
431 ENDIF
432*
433 CALL desc_convert( descb, descb_px1, return_code )
434*
435 IF( return_code .NE. 0) THEN
436 info = -( 12*100 + 2 )
437 ENDIF
438*
439* Consistency checks for DESCA and DESCB.
440*
441* Context must be the same
442 IF( desca_1xp( 2 ) .NE. descb_px1( 2 ) ) THEN
443 info = -( 12*100 + 2 )
444 ENDIF
445*
446* These are alignment restrictions that may or may not be removed
447* in future releases. -Andy Cleary, April 14, 1996.
448*
449* Block sizes must be the same
450 IF( desca_1xp( 4 ) .NE. descb_px1( 4 ) ) THEN
451 info = -( 12*100 + 4 )
452 ENDIF
453*
454* Source processor must be the same
455*
456 IF( desca_1xp( 5 ) .NE. descb_px1( 5 ) ) THEN
457 info = -( 12*100 + 5 )
458 ENDIF
459*
460* Get values out of descriptor for use in code.
461*
462 ictxt = desca_1xp( 2 )
463 csrc = desca_1xp( 5 )
464 nb = desca_1xp( 4 )
465 llda = desca_1xp( 6 )
466 store_n_a = desca_1xp( 3 )
467 lldb = descb_px1( 6 )
468 store_m_b = descb_px1( 3 )
469*
470* Get grid parameters
471*
472*
473* Size of separator blocks is maximum of bandwidths
474*
475 max_bw = max(bwl,bwu)
476 mbw2 = max_bw * max_bw
477*
478 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
479 np = nprow * npcol
480*
481*
482*
483 IF( lsame( uplo, 'U' ) ) THEN
484 idum1 = ichar( 'U' )
485 ELSE IF ( lsame( uplo, 'L' ) ) THEN
486 idum1 = ichar( 'L' )
487 ELSE
488 info = -1
489 END IF
490*
491 IF( lsame( trans, 'N' ) ) THEN
492 idum2 = ichar( 'N' )
493 ELSE IF ( lsame( trans, 'C' ) ) THEN
494 idum2 = ichar( 'C' )
495 ELSE
496 info = -2
497 END IF
498*
499 IF( lwork .LT. -1) THEN
500 info = -16
501 ELSE IF ( lwork .EQ. -1 ) THEN
502 idum3 = -1
503 ELSE
504 idum3 = 1
505 ENDIF
506*
507 IF( n .LT. 0 ) THEN
508 info = -3
509 ENDIF
510*
511 IF( n+ja-1 .GT. store_n_a ) THEN
512 info = -( 9*100 + 6 )
513 ENDIF
514*
515 IF(( bwl .GT. n-1 ) .OR.
516 $ ( bwl .LT. 0 ) ) THEN
517 info = -4
518 ENDIF
519*
520 IF(( bwu .GT. n-1 ) .OR.
521 $ ( bwu .LT. 0 ) ) THEN
522 info = -5
523 ENDIF
524*
525 IF( llda .LT. (bwl+bwu+1) ) THEN
526 info = -( 9*100 + 6 )
527 ENDIF
528*
529 IF( nb .LE. 0 ) THEN
530 info = -( 9*100 + 4 )
531 ENDIF
532*
533 IF( n+ib-1 .GT. store_m_b ) THEN
534 info = -( 12*100 + 3 )
535 ENDIF
536*
537 IF( lldb .LT. nb ) THEN
538 info = -( 12*100 + 6 )
539 ENDIF
540*
541 IF( nrhs .LT. 0 ) THEN
542 info = -6
543 ENDIF
544*
545* Current alignment restriction
546*
547 IF( ja .NE. ib) THEN
548 info = -8
549 ENDIF
550*
551* Argument checking that is specific to Divide & Conquer routine
552*
553 IF( nprow .NE. 1 ) THEN
554 info = -( 9*100+2 )
555 ENDIF
556*
557 IF( n .GT. np*nb-mod( ja-1, nb )) THEN
558 info = -( 3 )
559 CALL pxerbla( ictxt,
560 $ 'PCDBTRSV, D&C alg.: only 1 block per proc',
561 $ -info )
562 RETURN
563 ENDIF
564*
565 IF((ja+n-1.GT.nb) .AND. ( nb.LT.2*max(bwl,bwu) )) THEN
566 info = -( 9*100+4 )
567 CALL pxerbla( ictxt,
568 $ 'PCDBTRSV, D&C alg.: NB too small',
569 $ -info )
570 RETURN
571 ENDIF
572*
573*
574 work_size_min =
575 $ max(bwl,bwu)*nrhs
576*
577 work( 1 ) = work_size_min
578*
579 IF( lwork .LT. work_size_min ) THEN
580 IF( lwork .NE. -1 ) THEN
581 info = -16
582 CALL pxerbla( ictxt,
583 $ 'PCDBTRSV: worksize error',
584 $ -info )
585 ENDIF
586 RETURN
587 ENDIF
588*
589* Pack params and positions into arrays for global consistency check
590*
591 param_check( 18, 1 ) = descb(5)
592 param_check( 17, 1 ) = descb(4)
593 param_check( 16, 1 ) = descb(3)
594 param_check( 15, 1 ) = descb(2)
595 param_check( 14, 1 ) = descb(1)
596 param_check( 13, 1 ) = ib
597 param_check( 12, 1 ) = desca(5)
598 param_check( 11, 1 ) = desca(4)
599 param_check( 10, 1 ) = desca(3)
600 param_check( 9, 1 ) = desca(1)
601 param_check( 8, 1 ) = ja
602 param_check( 7, 1 ) = nrhs
603 param_check( 6, 1 ) = bwu
604 param_check( 5, 1 ) = bwl
605 param_check( 4, 1 ) = n
606 param_check( 3, 1 ) = idum3
607 param_check( 2, 1 ) = idum2
608 param_check( 1, 1 ) = idum1
609*
610 param_check( 18, 2 ) = 1205
611 param_check( 17, 2 ) = 1204
612 param_check( 16, 2 ) = 1203
613 param_check( 15, 2 ) = 1202
614 param_check( 14, 2 ) = 1201
615 param_check( 13, 2 ) = 11
616 param_check( 12, 2 ) = 905
617 param_check( 11, 2 ) = 904
618 param_check( 10, 2 ) = 903
619 param_check( 9, 2 ) = 901
620 param_check( 8, 2 ) = 8
621 param_check( 7, 2 ) = 6
622 param_check( 6, 2 ) = 5
623 param_check( 5, 2 ) = 4
624 param_check( 4, 2 ) = 3
625 param_check( 3, 2 ) = 16
626 param_check( 2, 2 ) = 2
627 param_check( 1, 2 ) = 1
628*
629* Want to find errors with MIN( ), so if no error, set it to a big
630* number. If there already is an error, multiply by the
631* descriptor multiplier.
632*
633 IF( info.GE.0 ) THEN
634 info = bignum
635 ELSE IF( info.LT.-descmult ) THEN
636 info = -info
637 ELSE
638 info = -info * descmult
639 END IF
640*
641* Check consistency across processors
642*
643 CALL globchk( ictxt, 18, param_check, 18,
644 $ param_check( 1, 3 ), info )
645*
646* Prepare output: set info = 0 if no error, and divide by DESCMULT
647* if error is not in a descriptor entry.
648*
649 IF( info.EQ.bignum ) THEN
650 info = 0
651 ELSE IF( mod( info, descmult ) .EQ. 0 ) THEN
652 info = -info / descmult
653 ELSE
654 info = -info
655 END IF
656*
657 IF( info.LT.0 ) THEN
658 CALL pxerbla( ictxt, 'PCDBTRSV', -info )
659 RETURN
660 END IF
661*
662* Quick return if possible
663*
664 IF( n.EQ.0 )
665 $ RETURN
666*
667 IF( nrhs.EQ.0 )
668 $ RETURN
669*
670*
671* Adjust addressing into matrix space to properly get into
672* the beginning part of the relevant data
673*
674 part_offset = nb*( (ja-1)/(npcol*nb) )
675*
676 IF ( (mycol-csrc) .LT. (ja-part_offset-1)/nb ) THEN
677 part_offset = part_offset + nb
678 ENDIF
679*
680 IF ( mycol .LT. csrc ) THEN
681 part_offset = part_offset - nb
682 ENDIF
683*
684* Form a new BLACS grid (the "standard form" grid) with only procs
685* holding part of the matrix, of size 1xNP where NP is adjusted,
686* starting at csrc=0, with JA modified to reflect dropped procs.
687*
688* First processor to hold part of the matrix:
689*
690 first_proc = mod( ( ja-1 )/nb+csrc, npcol )
691*
692* Calculate new JA while dropping off unused processors.
693*
694 ja_new = mod( ja-1, nb ) + 1
695*
696* Save and compute new value of NP
697*
698 np_save = np
699 np = ( ja_new+n-2 )/nb + 1
700*
701* Call utility routine that forms "standard-form" grid
702*
703 CALL reshape( ictxt, int_one, ictxt_new, int_one,
704 $ first_proc, int_one, np )
705*
706* Use new context from standard grid as context.
707*
708 ictxt_save = ictxt
709 ictxt = ictxt_new
710 desca_1xp( 2 ) = ictxt_new
711 descb_px1( 2 ) = ictxt_new
712*
713* Get information about new grid.
714*
715 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
716*
717* Drop out processors that do not have part of the matrix.
718*
719 IF( myrow .LT. 0 ) THEN
720 GOTO 1234
721 ENDIF
722*
723* ********************************
724* Values reused throughout routine
725*
726* User-input value of partition size
727*
728 part_size = nb
729*
730* Number of columns in each processor
731*
732 my_num_cols = numroc( n, part_size, mycol, 0, npcol )
733*
734* Offset in columns to beginning of main partition in each proc
735*
736 IF ( mycol .EQ. 0 ) THEN
737 part_offset = part_offset+mod( ja_new-1, part_size )
738 my_num_cols = my_num_cols - mod(ja_new-1, part_size )
739 ENDIF
740*
741* Offset in elements
742*
743 ofst = part_offset*llda
744*
745* Size of main (or odd) partition in each processor
746*
747 odd_size = my_num_cols
748 IF ( mycol .LT. np-1 ) THEN
749 odd_size = odd_size - max_bw
750 ENDIF
751*
752* Offset to workspace for Upper triangular factor
753*
754 work_u = bwu*odd_size + 3*mbw2
755*
756*
757*
758* Begin main code
759*
760 IF ( lsame( uplo, 'L' ) ) THEN
761*
762 IF ( lsame( trans, 'N' ) ) THEN
763*
764* Frontsolve
765*
766*
767******************************************
768* Local computation phase
769******************************************
770*
771* Use main partition in each processor to solve locally
772*
773 CALL ctbtrs( uplo, 'N', 'U', odd_size,
774 $ bwl, nrhs,
775 $ a( ofst+1+bwu ), llda,
776 $ b( part_offset+1 ), lldb, info )
777*
778*
779 IF ( mycol .LT. np-1 ) THEN
780* Use factorization of odd-even connection block to modify
781* locally stored portion of right hand side(s)
782*
783*
784* First copy and multiply it into temporary storage,
785* then use it on RHS
786*
787 CALL clamov( 'N', bwl, nrhs,
788 $ b( part_offset+odd_size-bwl+1), lldb,
789 $ work( 1 ), max_bw )
790*
791 CALL ctrmm( 'L', 'U', 'N', 'N', bwl, nrhs, -cone,
792 $ a(( ofst+(bwl+bwu+1)+(odd_size-bwl)*llda )),
793 $ llda-1, work( 1 ), max_bw )
794*
795 CALL cmatadd( bwl, nrhs, cone, work( 1 ), max_bw,
796 $ cone, b( part_offset+odd_size+1 ), lldb )
797*
798 ENDIF
799*
800* Clear garbage out of workspace block
801*
802 DO 10 idum1=1, work_size_min
803 work( idum1 )=0.0
804 10 CONTINUE
805*
806*
807 IF ( mycol .NE. 0 ) THEN
808* Use the "spike" fillin to calculate contribution to previous
809* processor's righthand-side.
810*
811 CALL cgemm( 'C', 'N', bwu, nrhs, odd_size, -cone, af( 1 ),
812 $ odd_size, b( part_offset+1 ), lldb, czero,
813 $ work( 1+max_bw-bwu ), max_bw )
814 ENDIF
815*
816*
817************************************************
818* Formation and solution of reduced system
819************************************************
820*
821*
822* Send modifications to prior processor's right hand sides
823*
824 IF( mycol .GT. 0) THEN
825*
826 CALL cgesd2d( ictxt, max_bw, nrhs,
827 $ work( 1 ), max_bw,
828 $ 0, mycol - 1 )
829*
830 ENDIF
831*
832* Receive modifications to processor's right hand sides
833*
834 IF( mycol .LT. npcol-1) THEN
835*
836 CALL cgerv2d( ictxt, max_bw, nrhs,
837 $ work( 1 ), max_bw,
838 $ 0, mycol + 1 )
839*
840* Combine contribution to locally stored right hand sides
841*
842 CALL cmatadd( max_bw, nrhs, cone,
843 $ work( 1 ), max_bw, cone,
844 $ b( part_offset+odd_size + 1 ), lldb )
845*
846 ENDIF
847*
848*
849* The last processor does not participate in the solution of the
850* reduced system, having sent its contribution already.
851 IF( mycol .EQ. npcol-1 ) THEN
852 GOTO 14
853 ENDIF
854*
855*
856* *************************************
857* Modification Loop
858*
859* The distance for sending and receiving for each level starts
860* at 1 for the first level.
861 level_dist = 1
862*
863* Do until this proc is needed to modify other procs' equations
864*
865 12 CONTINUE
866 IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 11
867*
868* Receive and add contribution to righthand sides from left
869*
870 IF( mycol-level_dist .GE. 0 ) THEN
871*
872 CALL cgerv2d( ictxt, max_bw, nrhs,
873 $ work( 1 ),
874 $ max_bw, 0, mycol-level_dist )
875*
876 CALL cmatadd( max_bw, nrhs, cone,
877 $ work( 1 ), max_bw, cone,
878 $ b( part_offset+odd_size + 1 ), lldb )
879*
880 ENDIF
881*
882* Receive and add contribution to righthand sides from right
883*
884 IF( mycol+level_dist .LT. npcol-1 ) THEN
885*
886 CALL cgerv2d( ictxt, max_bw, nrhs,
887 $ work( 1 ),
888 $ max_bw, 0, mycol+level_dist )
889*
890 CALL cmatadd( max_bw, nrhs, cone,
891 $ work( 1 ), max_bw, cone,
892 $ b( part_offset+odd_size + 1 ), lldb )
893*
894 ENDIF
895*
896 level_dist = level_dist*2
897*
898 GOTO 12
899 11 CONTINUE
900* [End of GOTO Loop]
901*
902*
903*
904* *********************************
905* Calculate and use this proc's blocks to modify other procs
906*
907* Solve with diagonal block
908*
909 CALL ctbtrs( 'L', 'N', 'U', max_bw, min( bwl, max_bw-1 ), nrhs,
910 $ af( odd_size*bwu+mbw2+1 ), max_bw+1,
911 $ b( part_offset+odd_size+1 ), lldb, info )
912*
913 IF( info.NE.0 ) THEN
914 GO TO 1000
915 ENDIF
916*
917*
918*
919* *********
920 IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
921*
922* Calculate contribution from this block to next diagonal block
923*
924 CALL cgemm( 'C', 'N', max_bw, nrhs, max_bw, -cone,
925 $ af( (odd_size)*bwu+1 ),
926 $ max_bw,
927 $ b( part_offset+odd_size+1 ),
928 $ lldb, czero,
929 $ work( 1 ),
930 $ max_bw )
931*
932* Send contribution to diagonal block's owning processor.
933*
934 CALL cgesd2d( ictxt, max_bw, nrhs,
935 $ work( 1 ),
936 $ max_bw, 0, mycol+level_dist )
937*
938 ENDIF
939* End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
940*
941* ************
942 IF( (mycol/level_dist .GT. 0 ).AND.
943 $ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
944*
945*
946* Use offdiagonal block to calculate modification to diag block
947* of processor to the left
948*
949 CALL cgemm( 'N', 'N', max_bw, nrhs, max_bw, -cone,
950 $ af( odd_size*bwu+2*mbw2+1 ),
951 $ max_bw,
952 $ b( part_offset+odd_size+1 ),
953 $ lldb, czero,
954 $ work( 1 ),
955 $ max_bw )
956*
957* Send contribution to diagonal block's owning processor.
958*
959 CALL cgesd2d( ictxt, max_bw, nrhs,
960 $ work( 1 ),
961 $ max_bw, 0, mycol-level_dist )
962*
963 ENDIF
964* End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
965*
966 14 CONTINUE
967*
968 ELSE
969*
970******************** BACKSOLVE *************************************
971*
972********************************************************************
973* .. Begin reduced system phase of algorithm ..
974********************************************************************
975*
976*
977*
978* The last processor does not participate in the solution of the
979* reduced system and just waits to receive its solution.
980 IF( mycol .EQ. npcol-1 ) THEN
981 GOTO 24
982 ENDIF
983*
984* Determine number of steps in tree loop
985*
986 level_dist = 1
987 27 CONTINUE
988 IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 26
989*
990 level_dist = level_dist*2
991*
992 GOTO 27
993 26 CONTINUE
994*
995*
996 IF( (mycol/level_dist .GT. 0 ).AND.
997 $ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
998*
999* Receive solution from processor to left
1000*
1001 CALL cgerv2d( ictxt, max_bw, nrhs,
1002 $ work( 1 ),
1003 $ max_bw, 0, mycol-level_dist )
1004*
1005*
1006* Use offdiagonal block to calculate modification to RHS stored
1007* on this processor
1008*
1009 CALL cgemm( 'C', 'N', max_bw, nrhs, max_bw, -cone,
1010 $ af( odd_size*bwu+2*mbw2+1 ),
1011 $ max_bw,
1012 $ work( 1 ),
1013 $ max_bw, cone,
1014 $ b( part_offset+odd_size+1 ),
1015 $ lldb )
1016 ENDIF
1017* End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1018*
1019*
1020 IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
1021*
1022* Receive solution from processor to right
1023*
1024 CALL cgerv2d( ictxt, max_bw, nrhs,
1025 $ work( 1 ),
1026 $ max_bw, 0, mycol+level_dist )
1027*
1028* Calculate contribution from this block to next diagonal block
1029*
1030 CALL cgemm( 'N', 'N', max_bw, nrhs, max_bw, -cone,
1031 $ af( (odd_size)*bwu+1 ),
1032 $ max_bw,
1033 $ work( 1 ),
1034 $ max_bw, cone,
1035 $ b( part_offset+odd_size+1 ),
1036 $ lldb )
1037*
1038 ENDIF
1039* End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1040*
1041*
1042* Solve with diagonal block
1043*
1044 CALL ctbtrs( 'L', 'C', 'U', max_bw, min( bwl, max_bw-1 ), nrhs,
1045 $ af( odd_size*bwu+mbw2+1 ), max_bw+1,
1046 $ b( part_offset+odd_size+1 ), lldb, info )
1047*
1048 IF( info.NE.0 ) THEN
1049 GO TO 1000
1050 ENDIF
1051*
1052*
1053*
1054***Modification Loop *******
1055*
1056 22 CONTINUE
1057 IF( level_dist .EQ. 1 ) GOTO 21
1058*
1059 level_dist = level_dist/2
1060*
1061* Send solution to the right
1062*
1063 IF( mycol+level_dist .LT. npcol-1 ) THEN
1064*
1065 CALL cgesd2d( ictxt, max_bw, nrhs,
1066 $ b( part_offset+odd_size+1 ),
1067 $ lldb, 0, mycol+level_dist )
1068*
1069 ENDIF
1070*
1071* Send solution to left
1072*
1073 IF( mycol-level_dist .GE. 0 ) THEN
1074*
1075 CALL cgesd2d( ictxt, max_bw, nrhs,
1076 $ b( part_offset+odd_size+1 ),
1077 $ lldb, 0, mycol-level_dist )
1078*
1079 ENDIF
1080*
1081 GOTO 22
1082 21 CONTINUE
1083* [End of GOTO Loop]
1084*
1085 24 CONTINUE
1086* [Processor npcol - 1 jumped to here to await next stage]
1087*
1088*******************************
1089* Reduced system has been solved, communicate solutions to nearest
1090* neighbors in preparation for local computation phase.
1091*
1092*
1093* Send elements of solution to next proc
1094*
1095 IF( mycol .LT. npcol-1) THEN
1096*
1097 CALL cgesd2d( ictxt, max_bw, nrhs,
1098 $ b( part_offset+odd_size+1 ), lldb,
1099 $ 0, mycol +1 )
1100*
1101 ENDIF
1102*
1103* Receive modifications to processor's right hand sides
1104*
1105 IF( mycol .GT. 0) THEN
1106*
1107 CALL cgerv2d( ictxt, max_bw, nrhs,
1108 $ work( 1 ), max_bw,
1109 $ 0, mycol - 1 )
1110*
1111 ENDIF
1112*
1113*
1114*
1115**********************************************
1116* Local computation phase
1117**********************************************
1118*
1119 IF ( mycol .NE. 0 ) THEN
1120* Use the "spike" fillin to calculate contribution from previous
1121* processor's solution.
1122*
1123 CALL cgemm( 'N', 'N', odd_size, nrhs, bwu, -cone, af( 1 ),
1124 $ odd_size, work( 1+max_bw-bwu ), max_bw, cone,
1125 $ b( part_offset+1 ), lldb )
1126*
1127 ENDIF
1128*
1129*
1130 IF ( mycol .LT. np-1 ) THEN
1131* Use factorization of odd-even connection block to modify
1132* locally stored portion of right hand side(s)
1133*
1134*
1135* First copy and multiply it into temporary storage,
1136* then use it on RHS
1137*
1138 CALL clamov( 'N', bwl, nrhs, b( part_offset+odd_size+1), lldb,
1139 $ work( 1+max_bw-bwl ), max_bw )
1140*
1141 CALL ctrmm( 'L', 'U', 'C', 'N', bwl, nrhs, -cone,
1142 $ a(( ofst+(bwl+bwu+1)+(odd_size-bwl)*llda )),
1143 $ llda-1, work( 1+max_bw-bwl ), max_bw )
1144*
1145 CALL cmatadd( bwl, nrhs, cone, work( 1+max_bw-bwl ), max_bw,
1146 $ cone, b( part_offset+odd_size-bwl+1 ), lldb )
1147*
1148 ENDIF
1149*
1150* Use main partition in each processor to solve locally
1151*
1152 CALL ctbtrs( uplo, 'C', 'U', odd_size,
1153 $ bwl, nrhs,
1154 $ a( ofst+1+bwu ),
1155 $ llda, b( part_offset+1 ),
1156 $ lldb, info )
1157*
1158 ENDIF
1159* End of "IF( LSAME( TRANS, 'N' ) )"...
1160*
1161*
1162 ELSE
1163***************************************************************
1164* CASE UPLO = 'U' *
1165***************************************************************
1166 IF ( lsame( trans, 'C' ) ) THEN
1167*
1168* Frontsolve
1169*
1170*
1171******************************************
1172* Local computation phase
1173******************************************
1174*
1175* Use main partition in each processor to solve locally
1176*
1177 CALL ctbtrs( uplo, 'C', 'N', odd_size,
1178 $ bwu, nrhs,
1179 $ a( ofst+1 ), llda,
1180 $ b( part_offset+1 ), lldb, info )
1181*
1182*
1183 IF ( mycol .LT. np-1 ) THEN
1184* Use factorization of odd-even connection block to modify
1185* locally stored portion of right hand side(s)
1186*
1187*
1188* First copy and multiply it into temporary storage,
1189* then use it on RHS
1190*
1191 CALL clamov( 'N', bwu, nrhs,
1192 $ b( part_offset+odd_size-bwu+1), lldb,
1193 $ work( 1 ), max_bw )
1194*
1195 CALL ctrmm( 'L', 'L', 'C', 'N', bwu, nrhs, -cone,
1196 $ a(( ofst+1+odd_size*llda )), llda-1, work( 1 ),
1197 $ max_bw )
1198*
1199 CALL cmatadd( bwu, nrhs, cone, work( 1 ), max_bw,
1200 $ cone, b( part_offset+odd_size+1 ), lldb )
1201*
1202 ENDIF
1203*
1204* Clear garbage out of workspace block
1205*
1206 DO 20 idum1=1, work_size_min
1207 work( idum1 )=0.0
1208 20 CONTINUE
1209*
1210*
1211 IF ( mycol .NE. 0 ) THEN
1212* Use the "spike" fillin to calculate contribution to previous
1213* processor's righthand-side.
1214*
1215 CALL cgemm( 'C', 'N', bwl, nrhs, odd_size, -cone,
1216 $ af( work_u+1 ), odd_size, b( part_offset+1 ),
1217 $ lldb, czero, work( 1+max_bw-bwl ), max_bw )
1218 ENDIF
1219*
1220*
1221************************************************
1222* Formation and solution of reduced system
1223************************************************
1224*
1225*
1226* Send modifications to prior processor's right hand sides
1227*
1228 IF( mycol .GT. 0) THEN
1229*
1230 CALL cgesd2d( ictxt, max_bw, nrhs,
1231 $ work( 1 ), max_bw,
1232 $ 0, mycol - 1 )
1233*
1234 ENDIF
1235*
1236* Receive modifications to processor's right hand sides
1237*
1238 IF( mycol .LT. npcol-1) THEN
1239*
1240 CALL cgerv2d( ictxt, max_bw, nrhs,
1241 $ work( 1 ), max_bw,
1242 $ 0, mycol + 1 )
1243*
1244* Combine contribution to locally stored right hand sides
1245*
1246 CALL cmatadd( max_bw, nrhs, cone,
1247 $ work( 1 ), max_bw, cone,
1248 $ b( part_offset+odd_size + 1 ), lldb )
1249*
1250 ENDIF
1251*
1252*
1253* The last processor does not participate in the solution of the
1254* reduced system, having sent its contribution already.
1255 IF( mycol .EQ. npcol-1 ) THEN
1256 GOTO 44
1257 ENDIF
1258*
1259*
1260* *************************************
1261* Modification Loop
1262*
1263* The distance for sending and receiving for each level starts
1264* at 1 for the first level.
1265 level_dist = 1
1266*
1267* Do until this proc is needed to modify other procs' equations
1268*
1269 42 CONTINUE
1270 IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 41
1271*
1272* Receive and add contribution to righthand sides from left
1273*
1274 IF( mycol-level_dist .GE. 0 ) THEN
1275*
1276 CALL cgerv2d( ictxt, max_bw, nrhs,
1277 $ work( 1 ),
1278 $ max_bw, 0, mycol-level_dist )
1279*
1280 CALL cmatadd( max_bw, nrhs, cone,
1281 $ work( 1 ), max_bw, cone,
1282 $ b( part_offset+odd_size + 1 ), lldb )
1283*
1284 ENDIF
1285*
1286* Receive and add contribution to righthand sides from right
1287*
1288 IF( mycol+level_dist .LT. npcol-1 ) THEN
1289*
1290 CALL cgerv2d( ictxt, max_bw, nrhs,
1291 $ work( 1 ),
1292 $ max_bw, 0, mycol+level_dist )
1293*
1294 CALL cmatadd( max_bw, nrhs, cone,
1295 $ work( 1 ), max_bw, cone,
1296 $ b( part_offset+odd_size + 1 ), lldb )
1297*
1298 ENDIF
1299*
1300 level_dist = level_dist*2
1301*
1302 GOTO 42
1303 41 CONTINUE
1304* [End of GOTO Loop]
1305*
1306*
1307*
1308* *********************************
1309* Calculate and use this proc's blocks to modify other procs
1310*
1311* Solve with diagonal block
1312*
1313 CALL ctbtrs( 'U', 'C', 'N', max_bw, min( bwu, max_bw-1 ), nrhs,
1314 $ af( odd_size*bwu+mbw2+1-min( bwu, max_bw-1 ) ),
1315 $ max_bw+1, b( part_offset+odd_size+1 ), lldb,
1316 $ info )
1317*
1318 IF( info.NE.0 ) THEN
1319 GO TO 1000
1320 ENDIF
1321*
1322*
1323*
1324* *********
1325 IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
1326*
1327* Calculate contribution from this block to next diagonal block
1328*
1329 CALL cgemm( 'C', 'N', max_bw, nrhs, max_bw, -cone,
1330 $ af( work_u+(odd_size)*bwl+1 ),
1331 $ max_bw,
1332 $ b( part_offset+odd_size+1 ),
1333 $ lldb, czero,
1334 $ work( 1 ),
1335 $ max_bw )
1336*
1337* Send contribution to diagonal block's owning processor.
1338*
1339 CALL cgesd2d( ictxt, max_bw, nrhs,
1340 $ work( 1 ),
1341 $ max_bw, 0, mycol+level_dist )
1342*
1343 ENDIF
1344* End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1345*
1346* ************
1347 IF( (mycol/level_dist .GT. 0 ).AND.
1348 $ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
1349*
1350*
1351* Use offdiagonal block to calculate modification to diag block
1352* of processor to the left
1353*
1354 CALL cgemm( 'N', 'N', max_bw, nrhs, max_bw, -cone,
1355 $ af( work_u+odd_size*bwl+2*mbw2+1 ),
1356 $ max_bw,
1357 $ b( part_offset+odd_size+1 ),
1358 $ lldb, czero,
1359 $ work( 1 ),
1360 $ max_bw )
1361*
1362* Send contribution to diagonal block's owning processor.
1363*
1364 CALL cgesd2d( ictxt, max_bw, nrhs,
1365 $ work( 1 ),
1366 $ max_bw, 0, mycol-level_dist )
1367*
1368 ENDIF
1369* End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1370*
1371 44 CONTINUE
1372*
1373 ELSE
1374*
1375******************** BACKSOLVE *************************************
1376*
1377********************************************************************
1378* .. Begin reduced system phase of algorithm ..
1379********************************************************************
1380*
1381*
1382*
1383* The last processor does not participate in the solution of the
1384* reduced system and just waits to receive its solution.
1385 IF( mycol .EQ. npcol-1 ) THEN
1386 GOTO 54
1387 ENDIF
1388*
1389* Determine number of steps in tree loop
1390*
1391 level_dist = 1
1392 57 CONTINUE
1393 IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 56
1394*
1395 level_dist = level_dist*2
1396*
1397 GOTO 57
1398 56 CONTINUE
1399*
1400*
1401 IF( (mycol/level_dist .GT. 0 ).AND.
1402 $ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
1403*
1404* Receive solution from processor to left
1405*
1406 CALL cgerv2d( ictxt, max_bw, nrhs,
1407 $ work( 1 ),
1408 $ max_bw, 0, mycol-level_dist )
1409*
1410*
1411* Use offdiagonal block to calculate modification to RHS stored
1412* on this processor
1413*
1414 CALL cgemm( 'C', 'N', max_bw, nrhs, max_bw, -cone,
1415 $ af( work_u+odd_size*bwl+2*mbw2+1 ),
1416 $ max_bw,
1417 $ work( 1 ),
1418 $ max_bw, cone,
1419 $ b( part_offset+odd_size+1 ),
1420 $ lldb )
1421 ENDIF
1422* End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1423*
1424*
1425 IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
1426*
1427* Receive solution from processor to right
1428*
1429 CALL cgerv2d( ictxt, max_bw, nrhs,
1430 $ work( 1 ),
1431 $ max_bw, 0, mycol+level_dist )
1432*
1433* Calculate contribution from this block to next diagonal block
1434*
1435 CALL cgemm( 'N', 'N', max_bw, nrhs, max_bw, -cone,
1436 $ af( work_u+(odd_size)*bwl+1 ),
1437 $ max_bw,
1438 $ work( 1 ),
1439 $ max_bw, cone,
1440 $ b( part_offset+odd_size+1 ),
1441 $ lldb )
1442*
1443 ENDIF
1444* End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1445*
1446*
1447* Solve with diagonal block
1448*
1449 CALL ctbtrs( 'U', 'N', 'N', max_bw, min( bwu, max_bw-1 ), nrhs,
1450 $ af( odd_size*bwu+mbw2+1-min( bwu, max_bw-1 ) ),
1451 $ max_bw+1, b( part_offset+odd_size+1 ), lldb,
1452 $ info )
1453*
1454 IF( info.NE.0 ) THEN
1455 GO TO 1000
1456 ENDIF
1457*
1458*
1459*
1460***Modification Loop *******
1461*
1462 52 CONTINUE
1463 IF( level_dist .EQ. 1 ) GOTO 51
1464*
1465 level_dist = level_dist/2
1466*
1467* Send solution to the right
1468*
1469 IF( mycol+level_dist .LT. npcol-1 ) THEN
1470*
1471 CALL cgesd2d( ictxt, max_bw, nrhs,
1472 $ b( part_offset+odd_size+1 ),
1473 $ lldb, 0, mycol+level_dist )
1474*
1475 ENDIF
1476*
1477* Send solution to left
1478*
1479 IF( mycol-level_dist .GE. 0 ) THEN
1480*
1481 CALL cgesd2d( ictxt, max_bw, nrhs,
1482 $ b( part_offset+odd_size+1 ),
1483 $ lldb, 0, mycol-level_dist )
1484*
1485 ENDIF
1486*
1487 GOTO 52
1488 51 CONTINUE
1489* [End of GOTO Loop]
1490*
1491 54 CONTINUE
1492* [Processor npcol - 1 jumped to here to await next stage]
1493*
1494*******************************
1495* Reduced system has been solved, communicate solutions to nearest
1496* neighbors in preparation for local computation phase.
1497*
1498*
1499* Send elements of solution to next proc
1500*
1501 IF( mycol .LT. npcol-1) THEN
1502*
1503 CALL cgesd2d( ictxt, max_bw, nrhs,
1504 $ b( part_offset+odd_size+1 ), lldb,
1505 $ 0, mycol +1 )
1506*
1507 ENDIF
1508*
1509* Receive modifications to processor's right hand sides
1510*
1511 IF( mycol .GT. 0) THEN
1512*
1513 CALL cgerv2d( ictxt, max_bw, nrhs,
1514 $ work( 1 ), max_bw,
1515 $ 0, mycol - 1 )
1516*
1517 ENDIF
1518*
1519*
1520*
1521**********************************************
1522* Local computation phase
1523**********************************************
1524*
1525 IF ( mycol .NE. 0 ) THEN
1526* Use the "spike" fillin to calculate contribution from previous
1527* processor's solution.
1528*
1529 CALL cgemm( 'N', 'N', odd_size, nrhs, bwl, -cone,
1530 $ af( work_u+1 ), odd_size, work( 1+max_bw-bwl ),
1531 $ max_bw, cone, b( part_offset+1 ), lldb )
1532*
1533 ENDIF
1534*
1535*
1536 IF ( mycol .LT. np-1 ) THEN
1537* Use factorization of odd-even connection block to modify
1538* locally stored portion of right hand side(s)
1539*
1540*
1541* First copy and multiply it into temporary storage,
1542* then use it on RHS
1543*
1544 CALL clamov( 'N', bwu, nrhs, b( part_offset+odd_size+1), lldb,
1545 $ work( 1+max_bw-bwu ), max_bw+bwl )
1546*
1547 CALL ctrmm( 'L', 'L', 'N', 'N', bwu, nrhs, -cone,
1548 $ a(( ofst+1+odd_size*llda )), llda-1,
1549 $ work( 1+max_bw-bwu ), max_bw+bwl )
1550*
1551 CALL cmatadd( bwu, nrhs, cone, work( 1+max_bw-bwu ),
1552 $ max_bw+bwl, cone,
1553 $ b( part_offset+odd_size-bwu+1 ), lldb )
1554*
1555 ENDIF
1556*
1557* Use main partition in each processor to solve locally
1558*
1559 CALL ctbtrs( uplo, 'N', 'N', odd_size,
1560 $ bwu, nrhs,
1561 $ a( ofst+1 ),
1562 $ llda, b( part_offset+1 ),
1563 $ lldb, info )
1564*
1565 ENDIF
1566* End of "IF( LSAME( TRANS, 'N' ) )"...
1567*
1568*
1569 ENDIF
1570* End of "IF( LSAME( UPLO, 'L' ) )"...
1571 1000 CONTINUE
1572*
1573*
1574* Free BLACS space used to hold standard-form grid.
1575*
1576 IF( ictxt_save .NE. ictxt_new ) THEN
1577 CALL blacs_gridexit( ictxt_new )
1578 ENDIF
1579*
1580 1234 CONTINUE
1581*
1582* Restore saved input parameters
1583*
1584 ictxt = ictxt_save
1585 np = np_save
1586*
1587* Output minimum worksize
1588*
1589 work( 1 ) = work_size_min
1590*
1591*
1592 RETURN
1593*
1594* End of PCDBTRSV
1595*
1596 END