SCALAPACK 2.2.2
pdpbtrsv.f
1 SUBROUTINE pdpbtrsv( UPLO, TRANS, N, BW, NRHS, A, JA, DESCA, B,
2 $ IB, DESCB, AF, LAF, WORK, LWORK, INFO )
3*
4* -- ScaLAPACK routine (version 2.0.2) --
5* Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6* May 1 2012
7*
8* .. Scalar Arguments ..
9 CHARACTER TRANS, UPLO
10 INTEGER BW, IB, INFO, JA, LAF, LWORK, N, NRHS
11* ..
12* .. Array Arguments ..
13 INTEGER DESCA( * ), DESCB( * )
14 DOUBLE PRECISION A( * ), AF( * ), B( * ), WORK( * )
15* ..
16*
17*
18* Purpose
19* =======
20*
21* PDPBTRSV solves a banded triangular system of linear equations
22*
23* A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
24* or
25* A(1:N, JA:JA+N-1)^T * X = B(IB:IB+N-1, 1:NRHS)
26*
27* where A(1:N, JA:JA+N-1) is a banded
28* triangular matrix factor produced by the
29* Cholesky factorization code PDPBTRF
30* and is stored in A(1:N,JA:JA+N-1) and AF.
31* The matrix stored in A(1:N, JA:JA+N-1) is either
32* upper or lower triangular according to UPLO,
33* and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^T
34* is dictated by the user by the parameter TRANS.
35*
36* Routine PDPBTRF MUST be called first.
37*
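* As an illustrative sketch only (not an authoritative usage
* prescription), assuming the process grid and the descriptors
* DESCA/DESCB have been set up as described in the NOTES below,
* a typical call sequence for UPLO = 'L' is:
*
*    CALL PDPBTRF( UPLO, N, BW, A, JA, DESCA, AF, LAF,
*   $              WORK, LWORK, INFO )
*    IF( INFO.EQ.0 )
*   $   CALL PDPBTRSV( UPLO, 'N', N, BW, NRHS, A, JA, DESCA,
*   $                  B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
*
* (See the PDPBTRF header for its exact argument list. Most
* applications call the driver PDPBTRS instead, which performs
* the triangular solves with the factored matrix on the user's
* behalf.)
*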
38* =====================================================================
39*
40* Arguments
41* =========
42*
43* UPLO (global input) CHARACTER
44* = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored;
45* = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored.
46*
47* TRANS (global input) CHARACTER
48* = 'N': Solve with A(1:N, JA:JA+N-1);
49* = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T;
50*
51* N (global input) INTEGER
52* The number of rows and columns to be operated on, i.e. the
53* order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
54*
55* BW (global input) INTEGER
56* Number of subdiagonals in L or U. 0 <= BW <= N-1
57*
58* NRHS (global input) INTEGER
59* The number of right hand sides, i.e., the number of columns
60* of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
61* NRHS >= 0.
62*
63* A (local input/local output) DOUBLE PRECISION pointer into
64* local memory to an array with first dimension
65* LLD_A >=(bw+1) (stored in DESCA).
66* On entry, this array contains the local pieces of the
67* N-by-N symmetric banded distributed Cholesky factor L or
68* L^T of A(1:N, JA:JA+N-1).
69* This local portion is stored in the packed banded format
70* used in LAPACK. Please see the Notes below and the
71* ScaLAPACK manual for more detail on the format of
72* distributed matrices.
73*
74* JA (global input) INTEGER
75* The index in the global array A that points to the start of
76* the matrix to be operated on (which may be either all of A
77* or a submatrix of A).
78*
79* DESCA (global and local input) INTEGER array of dimension DLEN.
80* if 1D type (DTYPE_A=501), DLEN >= 7;
81* if 2D type (DTYPE_A=1), DLEN >= 9.
82* The array descriptor for the distributed matrix A.
83* Contains information of mapping of A to memory. Please
84* see NOTES below for full description and options.
85*
86* B (local input/local output) DOUBLE PRECISION pointer into
87* local memory to an array of local leading dimension LLD_B >= NB.
88* On entry, this array contains the local pieces of
89* the right hand sides
90* B(IB:IB+N-1, 1:NRHS).
91* On exit, this contains the local piece of the
92* distributed solution matrix X.
93*
94* IB (global input) INTEGER
95* The row index in the global array B that points to the first
96* row of the matrix to be operated on (which may be either
97* all of B or a submatrix of B).
98*
99* DESCB (global and local input) INTEGER array of dimension DLEN.
100* if 1D type (DTYPE_B=502), DLEN >=7;
101* if 2D type (DTYPE_B=1), DLEN >= 9.
102* The array descriptor for the distributed matrix B.
103* Contains information of mapping of B to memory. Please
104* see NOTES below for full description and options.
105*
106* AF (local output) DOUBLE PRECISION array, dimension LAF.
107* Auxiliary Fillin Space.
108* Fillin is created during the factorization routine
109* PDPBTRF and this is stored in AF. If a linear system
110* is to be solved using PDPBTRS after the factorization
111* routine, AF *must not be altered* after the factorization.
112*
113* LAF (local input) INTEGER
114* Size of user-input Auxiliary Fillin space AF. Must be >=
115* (NB+2*bw)*bw
116* If LAF is not large enough, an error code will be returned
117* and the minimum acceptable size will be returned in AF( 1 )
118*
119* WORK (local workspace/local output)
120* DOUBLE PRECISION temporary workspace. This space may
121* be overwritten in between calls to routines. WORK must be
122* the size given in LWORK.
123* On exit, WORK( 1 ) contains the minimal LWORK.
124*
125* LWORK (local input or global input) INTEGER
126* Size of user-input workspace WORK.
127* If LWORK is too small, the minimal acceptable size will be
128* returned in WORK(1) and an error code is returned. LWORK>=
129* (bw*NRHS)
130*
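* As a purely illustrative example of the two workspace bounds
* above (the numbers are hypothetical): with NB = 64, BW = 4 and
* NRHS = 2, each process needs
*
*    LAF   >= (NB + 2*BW)*BW = (64 + 8)*4 = 288
*    LWORK >=  BW*NRHS       =  4*2       = 8
*
* double precision elements.
*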
131* INFO (global output) INTEGER
132* = 0: successful exit
133* < 0: If the i-th argument is an array and the j-th entry
134* had an illegal value, then INFO = -(i*100+j); if the
135* i-th argument is a scalar and had an illegal value,
136* then INFO = -i.
137*
138* =====================================================================
139*
140*
141* Restrictions
142* ============
143*
144* The following are restrictions on the input parameters. Some of these
145* are temporary and will be removed in future releases, while others
146* may reflect fundamental technical limitations.
147*
148* Non-cyclic restriction: VERY IMPORTANT!
149* P*NB>= mod(JA-1,NB)+N.
150* The mapping for matrices must be blocked, reflecting the nature
151* of the divide and conquer algorithm as a task-parallel algorithm.
152* This formula in words is: no processor may have more than one
153* chunk of the matrix.
154*
155* Blocksize cannot be too small:
156* If the matrix spans more than one processor, the following
157* restriction on NB, the size of each block on each processor,
158* must hold:
159* NB >= 2*BW
160* The bulk of parallel computation is done on the matrix of size
161* O(NB) on each processor. If this is too small, divide and conquer
162* is a poor choice of algorithm.
163*
164* Submatrix reference:
165* JA = IB
166* Alignment restriction that prevents unnecessary communication.
167*
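* As a numerical illustration of these restrictions (the values
* below are hypothetical): with P = 4 processes, NB = 100 and
* JA = 1, the non-cyclic restriction P*NB >= mod(JA-1,NB)+N
* requires N <= 400, so that no process owns more than one block
* of columns, and the blocksize restriction NB >= 2*BW then
* additionally requires BW <= 50.
*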
168*
169* =====================================================================
170*
171*
172* Notes
173* =====
174*
175* If the factorization routine and the solve routine are to be called
176* separately (to solve various sets of righthand sides using the same
177* coefficient matrix), the auxiliary space AF *must not be altered*
178* between calls to the factorization routine and the solve routine.
179*
180* The best algorithm for solving banded and tridiagonal linear systems
181* depends on a variety of parameters, especially the bandwidth.
182* Currently, only algorithms designed for the case N/P >> bw are
183* implemented. These go by many names, including Divide and Conquer,
184* Partitioning, domain decomposition-type, etc.
185*
186* Algorithm description: Divide and Conquer
187*
188* The Divide and Conquer algorithm assumes the matrix is narrowly
189* banded compared with the number of equations. In this situation,
190* it is best to distribute the input matrix A one-dimensionally,
191* with columns atomic and rows divided amongst the processes.
192* The basic algorithm divides the banded matrix up into
193* P pieces with one stored on each processor,
194* and then proceeds in 2 phases for the factorization or 3 for the
195* solution of a linear system.
196* 1) Local Phase:
197* The individual pieces are factored independently and in
198* parallel. These factors are applied to the matrix creating
199* fillin, which is stored in a non-inspectable way in auxiliary
200* space AF. Mathematically, this is equivalent to reordering
201* the matrix A as P A P^T and then factoring the principal
202* leading submatrix of size equal to the sum of the sizes of
203* the matrices factored on each processor. The factors of
204* these submatrices overwrite the corresponding parts of A
205* in memory.
206* 2) Reduced System Phase:
207* A small (BW*(P-1)) system is formed representing
208* interaction of the larger blocks, and is stored (as are its
209* factors) in the space AF. A parallel Block Cyclic Reduction
210* algorithm is used. For a linear system, a parallel front solve
211* followed by an analogous backsolve, both using the structure
212* of the factored matrix, are performed.
213* 3) Backsubstitution Phase:
214* For a linear system, a local backsubstitution is performed on
215* each processor in parallel.
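*
* As a rough illustration of the reduced system size (hypothetical
* numbers): with P = 8 processes and BW = 4, the reduced system of
* phase 2 has order BW*(P-1) = 28, independent of N, which is why
* the method is effective when N/P >> BW.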
216*
217*
218* Descriptors
219* ===========
220*
221* Descriptors now have *types* and differ from ScaLAPACK 1.0.
222*
223* Note: banded codes can use either the old two dimensional
224* or new one-dimensional descriptors, though the processor grid in
225* both cases *must be one-dimensional*. We describe both types below.
226*
227* Each global data object is described by an associated description
228* vector. This vector stores the information required to establish
229* the mapping between an object element and its corresponding process
230* and memory location.
231*
232* Let A be a generic term for any 2D block cyclically distributed array.
233* Such a global array has an associated description vector DESCA.
234* In the following comments, the character _ should be read as
235* "of the global array".
236*
237* NOTATION STORED IN EXPLANATION
238* --------------- -------------- --------------------------------------
239* DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
240* DTYPE_A = 1.
241* CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
242* the BLACS process grid A is distribu-
243* ted over. The context itself is glo-
244* bal, but the handle (the integer
245* value) may vary.
246* M_A (global) DESCA( M_ ) The number of rows in the global
247* array A.
248* N_A (global) DESCA( N_ ) The number of columns in the global
249* array A.
250* MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
251* the rows of the array.
252* NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
253* the columns of the array.
254* RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
255* row of the array A is distributed.
256* CSRC_A (global) DESCA( CSRC_ ) The process column over which the
257* first column of the array A is
258* distributed.
259* LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
260* array. LLD_A >= MAX(1,LOCr(M_A)).
261*
262* Let K be the number of rows or columns of a distributed matrix,
263* and assume that its process grid has dimension p x q.
264* LOCr( K ) denotes the number of elements of K that a process
265* would receive if K were distributed over the p processes of its
266* process column.
267* Similarly, LOCc( K ) denotes the number of elements of K that a
268* process would receive if K were distributed over the q processes of
269* its process row.
270* The values of LOCr() and LOCc() may be determined via a call to the
271* ScaLAPACK tool function, NUMROC:
272* LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
273* LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
274* An upper bound for these quantities may be computed by:
275* LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
276* LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
277*
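* For instance, the number of matrix columns stored locally is
* obtained by this routine with exactly such a call (PART_SIZE and
* the source process 0 are the values used in the code below):
*
*    MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL )
*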
278*
279* One-dimensional descriptors:
280*
281* One-dimensional descriptors are a new addition to ScaLAPACK since
282* version 1.0. They simplify and shorten the descriptor for 1D
283* arrays.
284*
285* Since ScaLAPACK supports two-dimensional arrays as the fundamental
286* object, we allow 1D arrays to be distributed either over the
287* first dimension of the array (as if the grid were P-by-1) or the
288* 2nd dimension (as if the grid were 1-by-P). This choice is
289* indicated by the descriptor type (501 or 502)
290* as described below.
291*
292* IMPORTANT NOTE: the actual BLACS grid represented by the
293* CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
294* irrespective of which one-dimensional descriptor type
295* (501 or 502) is input.
296* This routine will interpret the grid properly either way.
297* ScaLAPACK routines *do not support intercontext operations* so that
298* the grid passed to a single ScaLAPACK routine *must be the same*
299* for all array descriptors passed to that routine.
300*
301* NOTE: In all cases where 1D descriptors are used, 2D descriptors
302* may also be used, since a one-dimensional array is a special case
303* of a two-dimensional array with one dimension of size unity.
304* The two-dimensional array used in this case *must* be of the
305* proper orientation:
306* If the appropriate one-dimensional descriptor is DTYPE_A=501
307* (1 by P type), then the two dimensional descriptor must
308* have a CTXT value that refers to a 1 by P BLACS grid;
309* If the appropriate one-dimensional descriptor is DTYPE_A=502
310* (P by 1 type), then the two dimensional descriptor must
311* have a CTXT value that refers to a P by 1 BLACS grid.
312*
313*
314* Summary of allowed descriptors, types, and BLACS grids:
315* DTYPE        501         502         1    1
316* BLACS grid   1xP or Px1  1xP or Px1  1xP  Px1
317* -----------------------------------------------------
318* A            OK          NO          OK   NO
319* B            NO          OK          NO   OK
320*
321* Note that a consequence of this chart is that it is not possible
322* for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
323* to opposite requirements for the orientation of the BLACS grid,
324* and as noted before, the *same* BLACS context must be used in
325* all descriptors in a single ScaLAPACK subroutine call.
326*
327* Let A be a generic term for any 1D block cyclically distributed array.
328* Such a global array has an associated description vector DESCA.
329* In the following comments, the character _ should be read as
330* "of the global array".
331*
332* NOTATION STORED IN EXPLANATION
333* --------------- ---------- ------------------------------------------
334* DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
335* TYPE_A = 501: 1-by-P grid.
336* TYPE_A = 502: P-by-1 grid.
337* CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
338* the BLACS process grid A is distribu-
339* ted over. The context itself is glo-
340* bal, but the handle (the integer
341* value) may vary.
342* N_A (global) DESCA( 3 ) The size of the array dimension being
343* distributed.
344* NB_A (global) DESCA( 4 ) The blocking factor used to distribute
345* the distributed dimension of the array.
346* SRC_A (global) DESCA( 5 ) The process row or column over which the
347* first row or column of the array
348* is distributed.
349* LLD_A (local) DESCA( 6 ) The leading dimension of the local array
350* storing the local blocks of the distri-
351* buted array A. Minimum value of LLD_A
352* depends on TYPE_A.
353* TYPE_A = 501: LLD_A >=
354* MAX( size of undistributed dimension, 1 ).
355* TYPE_A = 502: LLD_A >= MAX( NB_A, 1 ).
356* Reserved DESCA( 7 ) Reserved for future use.
357*
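* As an illustrative sketch (the assignments below are one example,
* not a prescription), a type-501 descriptor for the banded matrix
* A, distributed over a 1-by-P grid with context ICTXT and JA = 1,
* could be filled in as:
*
*    DESCA( 1 ) = 501
*    DESCA( 2 ) = ICTXT
*    DESCA( 3 ) = N
*    DESCA( 4 ) = NB
*    DESCA( 5 ) = 0
*    DESCA( 6 ) = BW + 1
*    DESCA( 7 ) = 0
*
* The right hand side matrix B uses the analogous type-502 (P-by-1)
* descriptor with DESCB( 6 ) = LLD_B >= NB.
*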
358*
359*
360* =====================================================================
361*
362* Code Developer: Andrew J. Cleary, University of Tennessee.
363* Current address: Lawrence Livermore National Labs.
364*
365* =====================================================================
366*
367* .. Parameters ..
368 DOUBLE PRECISION ONE
369 parameter( one = 1.0d+0 )
370 DOUBLE PRECISION ZERO
371 parameter( zero = 0.0d+0 )
372 INTEGER INT_ONE
373 parameter( int_one = 1 )
374 INTEGER DESCMULT, BIGNUM
375 parameter( descmult = 100, bignum = descmult*descmult )
376 INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
377 $ lld_, mb_, m_, nb_, n_, rsrc_
378 parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
379 $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
380 $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
381* ..
382* .. Local Scalars ..
383 INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE,
384 $ idum1, idum2, idum3, ja_new, level_dist, llda,
385 $ lldb, mbw2, mycol, myrow, my_num_cols, nb, np,
386 $ npcol, nprow, np_save, odd_size, ofst,
387 $ part_offset, part_size, return_code, store_m_b,
388 $ store_n_a, work_size_min
389* ..
390* .. Local Arrays ..
391 INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ),
392 $ param_check( 17, 3 )
393* ..
394* .. External Subroutines ..
395 EXTERNAL blacs_gridexit, blacs_gridinfo, desc_convert,
396 $ dgemm, dgerv2d, dgesd2d, dlamov, dmatadd,
397 $ dtbtrs, dtrmm, dtrtrs, globchk, pxerbla,
398 $ reshape
399* ..
400* .. External Functions ..
401 LOGICAL LSAME
402 INTEGER NUMROC
403 EXTERNAL lsame, numroc
404* ..
405* .. Intrinsic Functions ..
406 INTRINSIC ichar, mod
407* ..
408* .. Executable Statements ..
409*
410* Test the input parameters
411*
412 info = 0
413*
414* Convert descriptor into standard form for easy access to
415* parameters, check that grid is of right shape.
416*
417 desca_1xp( 1 ) = 501
418 descb_px1( 1 ) = 502
419*
420 CALL desc_convert( desca, desca_1xp, return_code )
421*
422 IF( return_code.NE.0 ) THEN
423 info = -( 8*100+2 )
424 END IF
425*
426 CALL desc_convert( descb, descb_px1, return_code )
427*
428 IF( return_code.NE.0 ) THEN
429 info = -( 11*100+2 )
430 END IF
431*
432* Consistency checks for DESCA and DESCB.
433*
434* Context must be the same
435 IF( desca_1xp( 2 ).NE.descb_px1( 2 ) ) THEN
436 info = -( 11*100+2 )
437 END IF
438*
439* These are alignment restrictions that may or may not be removed
440* in future releases. -Andy Cleary, April 14, 1996.
441*
442* Block sizes must be the same
443 IF( desca_1xp( 4 ).NE.descb_px1( 4 ) ) THEN
444 info = -( 11*100+4 )
445 END IF
446*
447* Source processor must be the same
448*
449 IF( desca_1xp( 5 ).NE.descb_px1( 5 ) ) THEN
450 info = -( 11*100+5 )
451 END IF
452*
453* Get values out of descriptor for use in code.
454*
455 ictxt = desca_1xp( 2 )
456 csrc = desca_1xp( 5 )
457 nb = desca_1xp( 4 )
458 llda = desca_1xp( 6 )
459 store_n_a = desca_1xp( 3 )
460 lldb = descb_px1( 6 )
461 store_m_b = descb_px1( 3 )
462*
463* Get grid parameters
464*
465*
466* Pre-calculate bw^2
467*
468 mbw2 = bw*bw
469*
470 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
471 np = nprow*npcol
472*
473*
474*
475 IF( lsame( uplo, 'U' ) ) THEN
476 idum1 = ichar( 'U' )
477 ELSE IF( lsame( uplo, 'L' ) ) THEN
478 idum1 = ichar( 'L' )
479 ELSE
480 info = -1
481 END IF
482*
483 IF( lsame( trans, 'N' ) ) THEN
484 idum2 = ichar( 'N' )
485 ELSE IF( lsame( trans, 'T' ) ) THEN
486 idum2 = ichar( 'T' )
487 ELSE IF( lsame( trans, 'C' ) ) THEN
488 idum2 = ichar( 'T' )
489 ELSE
490 info = -2
491 END IF
492*
493 IF( lwork.LT.-1 ) THEN
494 info = -14
495 ELSE IF( lwork.EQ.-1 ) THEN
496 idum3 = -1
497 ELSE
498 idum3 = 1
499 END IF
500*
501 IF( n.LT.0 ) THEN
502 info = -3
503 END IF
504*
505 IF( n+ja-1.GT.store_n_a ) THEN
506 info = -( 8*100+6 )
507 END IF
508*
509 IF( ( bw.GT.n-1 ) .OR. ( bw.LT.0 ) ) THEN
510 info = -4
511 END IF
512*
513 IF( llda.LT.( bw+1 ) ) THEN
514 info = -( 8*100+6 )
515 END IF
516*
517 IF( nb.LE.0 ) THEN
518 info = -( 8*100+4 )
519 END IF
520*
521 IF( n+ib-1.GT.store_m_b ) THEN
522 info = -( 11*100+3 )
523 END IF
524*
525 IF( lldb.LT.nb ) THEN
526 info = -( 11*100+6 )
527 END IF
528*
529 IF( nrhs.LT.0 ) THEN
530 info = -5
531 END IF
532*
533* Current alignment restriction
534*
535 IF( ja.NE.ib ) THEN
536 info = -7
537 END IF
538*
539* Argument checking that is specific to Divide & Conquer routine
540*
541 IF( nprow.NE.1 ) THEN
542 info = -( 8*100+2 )
543 END IF
544*
545 IF( n.GT.np*nb-mod( ja-1, nb ) ) THEN
546 info = -( 3 )
547 CALL pxerbla( ictxt,
548 $ 'PDPBTRSV, D&C alg.: only 1 block per proc',
549 $ -info )
550 RETURN
551 END IF
552*
553 IF( ( ja+n-1.GT.nb ) .AND. ( nb.LT.2*bw ) ) THEN
554 info = -( 8*100+4 )
555 CALL pxerbla( ictxt, 'PDPBTRSV, D&C alg.: NB too small',
556 $ -info )
557 RETURN
558 END IF
559*
560*
561 work_size_min = bw*nrhs
562*
563 work( 1 ) = work_size_min
564*
565 IF( lwork.LT.work_size_min ) THEN
566 IF( lwork.NE.-1 ) THEN
567 info = -14
568 CALL pxerbla( ictxt, 'PDPBTRSV: worksize error', -info )
569 END IF
570 RETURN
571 END IF
572*
573* Pack params and positions into arrays for global consistency check
574*
575 param_check( 17, 1 ) = descb( 5 )
576 param_check( 16, 1 ) = descb( 4 )
577 param_check( 15, 1 ) = descb( 3 )
578 param_check( 14, 1 ) = descb( 2 )
579 param_check( 13, 1 ) = descb( 1 )
580 param_check( 12, 1 ) = ib
581 param_check( 11, 1 ) = desca( 5 )
582 param_check( 10, 1 ) = desca( 4 )
583 param_check( 9, 1 ) = desca( 3 )
584 param_check( 8, 1 ) = desca( 1 )
585 param_check( 7, 1 ) = ja
586 param_check( 6, 1 ) = nrhs
587 param_check( 5, 1 ) = bw
588 param_check( 4, 1 ) = n
589 param_check( 3, 1 ) = idum3
590 param_check( 2, 1 ) = idum2
591 param_check( 1, 1 ) = idum1
592*
593 param_check( 17, 2 ) = 1105
594 param_check( 16, 2 ) = 1104
595 param_check( 15, 2 ) = 1103
596 param_check( 14, 2 ) = 1102
597 param_check( 13, 2 ) = 1101
598 param_check( 12, 2 ) = 10
599 param_check( 11, 2 ) = 805
600 param_check( 10, 2 ) = 804
601 param_check( 9, 2 ) = 803
602 param_check( 8, 2 ) = 801
603 param_check( 7, 2 ) = 7
604 param_check( 6, 2 ) = 5
605 param_check( 5, 2 ) = 4
606 param_check( 4, 2 ) = 3
607 param_check( 3, 2 ) = 14
608 param_check( 2, 2 ) = 2
609 param_check( 1, 2 ) = 1
610*
611* Want to find errors with MIN( ), so if no error, set it to a big
612* number. If there already is an error, multiply by the
613* descriptor multiplier.
614*
615 IF( info.GE.0 ) THEN
616 info = bignum
617 ELSE IF( info.LT.-descmult ) THEN
618 info = -info
619 ELSE
620 info = -info*descmult
621 END IF
622*
623* Check consistency across processors
624*
625 CALL globchk( ictxt, 17, param_check, 17, param_check( 1, 3 ),
626 $ info )
627*
628* Prepare output: set info = 0 if no error, and divide by DESCMULT
629* if error is not in a descriptor entry.
630*
631 IF( info.EQ.bignum ) THEN
632 info = 0
633 ELSE IF( mod( info, descmult ).EQ.0 ) THEN
634 info = -info / descmult
635 ELSE
636 info = -info
637 END IF
638*
639 IF( info.LT.0 ) THEN
640 CALL pxerbla( ictxt, 'PDPBTRSV', -info )
641 RETURN
642 END IF
643*
644* Quick return if possible
645*
646 IF( n.EQ.0 )
647 $ RETURN
648*
649 IF( nrhs.EQ.0 )
650 $ RETURN
651*
652*
653* Adjust addressing into matrix space to properly get into
654* the beginning part of the relevant data
655*
656 part_offset = nb*( ( ja-1 ) / ( npcol*nb ) )
657*
658 IF( ( mycol-csrc ).LT.( ja-part_offset-1 ) / nb ) THEN
659 part_offset = part_offset + nb
660 END IF
661*
662 IF( mycol.LT.csrc ) THEN
663 part_offset = part_offset - nb
664 END IF
665*
666* Form a new BLACS grid (the "standard form" grid) with only procs
667* holding part of the matrix, of size 1xNP where NP is adjusted,
668* starting at csrc=0, with JA modified to reflect dropped procs.
669*
670* First processor to hold part of the matrix:
671*
672 first_proc = mod( ( ja-1 ) / nb+csrc, npcol )
673*
674* Calculate the new JA while dropping off unused processors.
675*
676 ja_new = mod( ja-1, nb ) + 1
677*
678* Save and compute new value of NP
679*
680 np_save = np
681 np = ( ja_new+n-2 ) / nb + 1
682*
683* Call utility routine that forms "standard-form" grid
684*
685 CALL reshape( ictxt, int_one, ictxt_new, int_one, first_proc,
686 $ int_one, np )
687*
688* Use new context from standard grid as context.
689*
690 ictxt_save = ictxt
691 ictxt = ictxt_new
692 desca_1xp( 2 ) = ictxt_new
693 descb_px1( 2 ) = ictxt_new
694*
695* Get information about new grid.
696*
697 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
698*
699* Drop out processors that do not have part of the matrix.
700*
701 IF( myrow.LT.0 ) THEN
702 GO TO 180
703 END IF
704*
705* ********************************
706* Values reused throughout routine
707*
708* User-input value of partition size
709*
710 part_size = nb
711*
712* Number of columns in each processor
713*
714 my_num_cols = numroc( n, part_size, mycol, 0, npcol )
715*
716* Offset in columns to beginning of main partition in each proc
717*
718 IF( mycol.EQ.0 ) THEN
719 part_offset = part_offset + mod( ja_new-1, part_size )
720 my_num_cols = my_num_cols - mod( ja_new-1, part_size )
721 END IF
722*
723* Offset in elements
724*
725 ofst = part_offset*llda
726*
727* Size of main (or odd) partition in each processor
728*
729 odd_size = my_num_cols
730 IF( mycol.LT.np-1 ) THEN
731 odd_size = odd_size - bw
732 END IF
733*
734*
735*
736* Begin main code
737*
738 IF( lsame( uplo, 'L' ) ) THEN
739*
740 IF( lsame( trans, 'N' ) ) THEN
741*
742* Frontsolve
743*
744*
745******************************************
746* Local computation phase
747******************************************
748*
749* Use main partition in each processor to solve locally
750*
751 CALL dtbtrs( uplo, 'N', 'N', odd_size, bw, nrhs,
752 $ a( ofst+1 ), llda, b( part_offset+1 ), lldb,
753 $ info )
754*
755*
756 IF( mycol.LT.np-1 ) THEN
757* Use factorization of odd-even connection block to modify
758* locally stored portion of right hand side(s)
759*
760*
761* First copy and multiply it into temporary storage,
762* then use it on RHS
763*
764 CALL dlamov( 'N', bw, nrhs,
765 $ b( part_offset+odd_size-bw+1 ), lldb,
766 $ work( 1 ), bw )
767*
768 CALL dtrmm( 'L', 'U', 'N', 'N', bw, nrhs, -one,
769 $ a( ( ofst+( bw+1 )+( odd_size-bw )*llda ) ),
770 $ llda-1, work( 1 ), bw )
771*
772 CALL dmatadd( bw, nrhs, one, work( 1 ), bw, one,
773 $ b( part_offset+odd_size+1 ), lldb )
774*
775 END IF
776*
777*
778 IF( mycol.NE.0 ) THEN
779* Use the "spike" fillin to calculate contribution to previous
780* processor's righthand-side.
781*
782 CALL dgemm( 'T', 'N', bw, nrhs, odd_size, -one, af( 1 ),
783 $ odd_size, b( part_offset+1 ), lldb, zero,
784 $ work( 1+bw-bw ), bw )
785 END IF
786*
787*
788************************************************
789* Formation and solution of reduced system
790************************************************
791*
792*
793* Send modifications to prior processor's right hand sides
794*
795 IF( mycol.GT.0 ) THEN
796*
797 CALL dgesd2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
798 $ mycol-1 )
799*
800 END IF
801*
802* Receive modifications to processor's right hand sides
803*
804 IF( mycol.LT.npcol-1 ) THEN
805*
806 CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
807 $ mycol+1 )
808*
809* Combine contribution to locally stored right hand sides
810*
811 CALL dmatadd( bw, nrhs, one, work( 1 ), bw, one,
812 $ b( part_offset+odd_size+1 ), lldb )
813*
814 END IF
815*
816*
817* The last processor does not participate in the solution of the
818* reduced system, having sent its contribution already.
819 IF( mycol.EQ.npcol-1 ) THEN
820 GO TO 30
821 END IF
822*
823*
824* *************************************
825* Modification Loop
826*
827* The distance for sending and receiving for each level starts
828* at 1 for the first level.
829 level_dist = 1
830*
831* Do until this proc is needed to modify other procs' equations
832*
833 10 CONTINUE
834 IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
835 $ GO TO 20
836*
837* Receive and add contribution to righthand sides from left
838*
839 IF( mycol-level_dist.GE.0 ) THEN
840*
841 CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
842 $ mycol-level_dist )
843*
844 CALL dmatadd( bw, nrhs, one, work( 1 ), bw, one,
845 $ b( part_offset+odd_size+1 ), lldb )
846*
847 END IF
848*
849* Receive and add contribution to righthand sides from right
850*
851 IF( mycol+level_dist.LT.npcol-1 ) THEN
852*
853 CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
854 $ mycol+level_dist )
855*
856 CALL dmatadd( bw, nrhs, one, work( 1 ), bw, one,
857 $ b( part_offset+odd_size+1 ), lldb )
858*
859 END IF
860*
861 level_dist = level_dist*2
862*
863 GO TO 10
864 20 CONTINUE
865* [End of GOTO Loop]
866*
867*
868*
869* *********************************
870* Calculate and use this proc's blocks to modify other procs
871*
872* Solve with diagonal block
873*
874 CALL dtrtrs( 'L', 'N', 'N', bw, nrhs,
875 $ af( odd_size*bw+mbw2+1 ), bw,
876 $ b( part_offset+odd_size+1 ), lldb, info )
877*
878 IF( info.NE.0 ) THEN
879 GO TO 170
880 END IF
881*
882*
883*
884* *********
885 IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
886*
887* Calculate contribution from this block to next diagonal block
888*
889 CALL dgemm( 'T', 'N', bw, nrhs, bw, -one,
890 $ af( ( odd_size )*bw+1 ), bw,
891 $ b( part_offset+odd_size+1 ), lldb, zero,
892 $ work( 1 ), bw )
893*
894* Send contribution to diagonal block's owning processor.
895*
896 CALL dgesd2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
897 $ mycol+level_dist )
898*
899 END IF
900* End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
901*
902* ************
903 IF( ( mycol / level_dist.GT.0 ) .AND.
904 $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
905 $ THEN
906*
907*
908* Use offdiagonal block to calculate modification to diag block
909* of processor to the left
910*
911 CALL dgemm( 'N', 'N', bw, nrhs, bw, -one,
912 $ af( odd_size*bw+2*mbw2+1 ), bw,
913 $ b( part_offset+odd_size+1 ), lldb, zero,
914 $ work( 1 ), bw )
915*
916* Send contribution to diagonal block's owning processor.
917*
918 CALL dgesd2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
919 $ mycol-level_dist )
920*
921 END IF
922* End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
923*
924 30 CONTINUE
925*
926 ELSE
927*
928******************** BACKSOLVE *************************************
929*
930********************************************************************
931* .. Begin reduced system phase of algorithm ..
932********************************************************************
933*
934*
935*
936* The last processor does not participate in the solution of the
937* reduced system and just waits to receive its solution.
938 IF( mycol.EQ.npcol-1 ) THEN
939 GO TO 80
940 END IF
941*
942* Determine number of steps in tree loop
943*
944 level_dist = 1
945 40 CONTINUE
946 IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
947 $ GO TO 50
948*
949 level_dist = level_dist*2
950*
951 GO TO 40
952 50 CONTINUE
953*
954*
955 IF( ( mycol / level_dist.GT.0 ) .AND.
956 $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
957 $ THEN
958*
959* Receive solution from processor to left
960*
961 CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
962 $ mycol-level_dist )
963*
964*
965* Use offdiagonal block to calculate modification to RHS stored
966* on this processor
967*
968 CALL dgemm( 'T', 'N', bw, nrhs, bw, -one,
969 $ af( odd_size*bw+2*mbw2+1 ), bw, work( 1 ),
970 $ bw, one, b( part_offset+odd_size+1 ), lldb )
971 END IF
972* End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
973*
974*
975 IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
976*
977* Receive solution from processor to right
978*
979 CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
980 $ mycol+level_dist )
981*
982* Calculate contribution from this block to next diagonal block
983*
984 CALL dgemm( 'N', 'N', bw, nrhs, bw, -one,
985 $ af( ( odd_size )*bw+1 ), bw, work( 1 ), bw,
986 $ one, b( part_offset+odd_size+1 ), lldb )
987*
988 END IF
989* End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
990*
991*
992* Solve with diagonal block
993*
994 CALL dtrtrs( 'L', 'T', 'N', bw, nrhs,
995 $ af( odd_size*bw+mbw2+1 ), bw,
996 $ b( part_offset+odd_size+1 ), lldb, info )
997*
998 IF( info.NE.0 ) THEN
999 GO TO 170
1000 END IF
1001*
1002*
1003*
1004***Modification Loop *******
1005*
1006 60 CONTINUE
1007 IF( level_dist.EQ.1 )
1008 $ GO TO 70
1009*
1010 level_dist = level_dist / 2
1011*
1012* Send solution to the right
1013*
1014 IF( mycol+level_dist.LT.npcol-1 ) THEN
1015*
1016 CALL dgesd2d( ictxt, bw, nrhs,
1017 $ b( part_offset+odd_size+1 ), lldb, 0,
1018 $ mycol+level_dist )
1019*
1020 END IF
1021*
1022* Send solution to left
1023*
1024 IF( mycol-level_dist.GE.0 ) THEN
1025*
1026 CALL dgesd2d( ictxt, bw, nrhs,
1027 $ b( part_offset+odd_size+1 ), lldb, 0,
1028 $ mycol-level_dist )
1029*
1030 END IF
1031*
1032 GO TO 60
1033 70 CONTINUE
1034* [End of GOTO Loop]
1035*
1036 80 CONTINUE
1037* [Processor npcol - 1 jumped to here to await next stage]
1038*
1039*******************************
1040* Reduced system has been solved, communicate solutions to nearest
1041* neighbors in preparation for local computation phase.
1042*
1043*
1044* Send elements of solution to next proc
1045*
1046 IF( mycol.LT.npcol-1 ) THEN
1047*
1048 CALL dgesd2d( ictxt, bw, nrhs,
1049 $ b( part_offset+odd_size+1 ), lldb, 0,
1050 $ mycol+1 )
1051*
1052 END IF
1053*
1054* Receive modifications to processor's right hand sides
1055*
1056 IF( mycol.GT.0 ) THEN
1057*
1058 CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1059 $ mycol-1 )
1060*
1061 END IF
1062*
1063*
1064*
1065**********************************************
1066* Local computation phase
1067**********************************************
1068*
1069 IF( mycol.NE.0 ) THEN
1070* Use the "spike" fillin to calculate contribution from previous
1071* processor's solution.
1072*
1073 CALL dgemm( 'N', 'N', odd_size, nrhs, bw, -one, af( 1 ),
1074 $ odd_size, work( 1+bw-bw ), bw, one,
1075 $ b( part_offset+1 ), lldb )
1076*
1077 END IF
1078*
1079*
1080 IF( mycol.LT.np-1 ) THEN
1081* Use factorization of odd-even connection block to modify
1082* locally stored portion of right hand side(s)
1083*
1084*
1085* First copy and multiply it into temporary storage,
1086* then use it on RHS
1087*
1088 CALL dlamov( 'N', bw, nrhs, b( part_offset+odd_size+1 ),
1089 $ lldb, work( 1+bw-bw ), bw )
1090*
1091 CALL dtrmm( 'L', 'U', 'T', 'N', bw, nrhs, -one,
1092 $ a( ( ofst+( bw+1 )+( odd_size-bw )*llda ) ),
1093 $ llda-1, work( 1+bw-bw ), bw )
1094*
1095 CALL dmatadd( bw, nrhs, one, work( 1+bw-bw ), bw, one,
1096 $ b( part_offset+odd_size-bw+1 ), lldb )
1097*
1098 END IF
1099*
1100* Use main partition in each processor to solve locally
1101*
1102 CALL dtbtrs( uplo, 'T', 'N', odd_size, bw, nrhs,
1103 $ a( ofst+1 ), llda, b( part_offset+1 ), lldb,
1104 $ info )
1105*
1106 END IF
1107* End of "IF( LSAME( TRANS, 'N' ) )"...
1108*
1109*
1110 ELSE
1111***************************************************************
1112* CASE UPLO = 'U' *
1113***************************************************************
1114 IF( lsame( trans, 'T' ) ) THEN
1115*
1116* Frontsolve
1117*
1118*
1119******************************************
1120* Local computation phase
1121******************************************
1122*
1123* Use main partition in each processor to solve locally
1124*
1125 CALL dtbtrs( uplo, 'T', 'N', odd_size, bw, nrhs,
1126 $ a( ofst+1 ), llda, b( part_offset+1 ), lldb,
1127 $ info )
1128*
1129*
1130 IF( mycol.LT.np-1 ) THEN
1131* Use factorization of odd-even connection block to modify
1132* locally stored portion of right hand side(s)
1133*
1134*
1135* First copy and multiply it into temporary storage,
1136* then use it on RHS
1137*
1138 CALL dlamov( 'N', bw, nrhs,
1139 $ b( part_offset+odd_size-bw+1 ), lldb,
1140 $ work( 1 ), bw )
1141*
1142 CALL dtrmm( 'L', 'L', 'T', 'N', bw, nrhs, -one,
1143 $ a( ( ofst+1+odd_size*llda ) ), llda-1,
1144 $ work( 1 ), bw )
1145*
1146 CALL dmatadd( bw, nrhs, one, work( 1 ), bw, one,
1147 $ b( part_offset+odd_size+1 ), lldb )
1148*
1149 END IF
1150*
1151*
1152 IF( mycol.NE.0 ) THEN
1153* Use the "spike" fillin to calculate contribution to previous
1154* processor's righthand-side.
1155*
1156 CALL dgemm( 'T', 'N', bw, nrhs, odd_size, -one, af( 1 ),
1157 $ odd_size, b( part_offset+1 ), lldb, zero,
1158 $ work( 1+bw-bw ), bw )
1159 END IF
1160*
1161*
1162************************************************
1163* Formation and solution of reduced system
1164************************************************
1165*
1166*
1167* Send modifications to prior processor's right hand sides
1168*
1169 IF( mycol.GT.0 ) THEN
1170*
1171 CALL dgesd2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1172 $ mycol-1 )
1173*
1174 END IF
1175*
1176* Receive modifications to processor's right hand sides
1177*
1178 IF( mycol.LT.npcol-1 ) THEN
1179*
1180 CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1181 $ mycol+1 )
1182*
1183* Combine contribution to locally stored right hand sides
1184*
1185 CALL dmatadd( bw, nrhs, one, work( 1 ), bw, one,
1186 $ b( part_offset+odd_size+1 ), lldb )
1187*
1188 END IF
1189*
1190*
1191* The last processor does not participate in the solution of the
1192* reduced system, having sent its contribution already.
1193 IF( mycol.EQ.npcol-1 ) THEN
1194 GO TO 110
1195 END IF
1196*
1197*
1198* *************************************
1199* Modification Loop
1200*
1201* The distance for sending and receiving for each level starts
1202* at 1 for the first level.
1203 level_dist = 1
1204*
1205* Do until this proc is needed to modify other procs' equations
1206*
1207 90 CONTINUE
1208 IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
1209 $ GO TO 100
1210*
1211* Receive and add contribution to righthand sides from left
1212*
1213 IF( mycol-level_dist.GE.0 ) THEN
1214*
1215 CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1216 $ mycol-level_dist )
1217*
1218 CALL dmatadd( bw, nrhs, one, work( 1 ), bw, one,
1219 $ b( part_offset+odd_size+1 ), lldb )
1220*
1221 END IF
1222*
1223* Receive and add contribution to righthand sides from right
1224*
1225 IF( mycol+level_dist.LT.npcol-1 ) THEN
1226*
1227 CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1228 $ mycol+level_dist )
1229*
1230 CALL dmatadd( bw, nrhs, one, work( 1 ), bw, one,
1231 $ b( part_offset+odd_size+1 ), lldb )
1232*
1233 END IF
1234*
1235 level_dist = level_dist*2
1236*
1237 GO TO 90
1238 100 CONTINUE
1239* [End of GOTO Loop]
1240*
1241*
1242*
1243* *********************************
1244* Calculate and use this proc's blocks to modify other procs
1245*
1246* Solve with diagonal block
1247*
1248 CALL dtrtrs( 'L', 'N', 'N', bw, nrhs,
1249 $ af( odd_size*bw+mbw2+1 ), bw,
1250 $ b( part_offset+odd_size+1 ), lldb, info )
1251*
1252 IF( info.NE.0 ) THEN
1253 GO TO 170
1254 END IF
1255*
1256*
1257*
1258* *********
1259 IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1260*
1261* Calculate contribution from this block to next diagonal block
1262*
1263 CALL dgemm( 'T', 'N', bw, nrhs, bw, -one,
1264 $ af( ( odd_size )*bw+1 ), bw,
1265 $ b( part_offset+odd_size+1 ), lldb, zero,
1266 $ work( 1 ), bw )
1267*
1268* Send contribution to diagonal block's owning processor.
1269*
1270 CALL dgesd2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1271 $ mycol+level_dist )
1272*
1273 END IF
1274* End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1275*
1276* ************
1277 IF( ( mycol / level_dist.GT.0 ) .AND.
1278 $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
1279 $ THEN
1280*
1281*
1282* Use offdiagonal block to calculate modification to diag block
1283* of processor to the left
1284*
1285 CALL dgemm( 'N', 'N', bw, nrhs, bw, -one,
1286 $ af( odd_size*bw+2*mbw2+1 ), bw,
1287 $ b( part_offset+odd_size+1 ), lldb, zero,
1288 $ work( 1 ), bw )
1289*
1290* Send contribution to diagonal block's owning processor.
1291*
1292 CALL dgesd2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1293 $ mycol-level_dist )
1294*
1295 END IF
1296* End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1297*
1298 110 CONTINUE
1299*
1300 ELSE
1301*
1302******************** BACKSOLVE *************************************
1303*
1304********************************************************************
1305* .. Begin reduced system phase of algorithm ..
1306********************************************************************
1307*
1308*
1309*
1310* The last processor does not participate in the solution of the
1311* reduced system and just waits to receive its solution.
1312 IF( mycol.EQ.npcol-1 ) THEN
1313 GO TO 160
1314 END IF
1315*
1316* Determine number of steps in tree loop
1317*
1318 level_dist = 1
1319 120 CONTINUE
1320 IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
1321 $ GO TO 130
1322*
1323 level_dist = level_dist*2
1324*
1325 GO TO 120
1326 130 CONTINUE
1327*
1328*
1329 IF( ( mycol / level_dist.GT.0 ) .AND.
1330 $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
1331 $ THEN
1332*
1333* Receive solution from processor to left
1334*
1335 CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1336 $ mycol-level_dist )
1337*
1338*
1339* Use offdiagonal block to calculate modification to RHS stored
1340* on this processor
1341*
1342 CALL dgemm( 'T', 'N', bw, nrhs, bw, -one,
1343 $ af( odd_size*bw+2*mbw2+1 ), bw, work( 1 ),
1344 $ bw, one, b( part_offset+odd_size+1 ), lldb )
1345 END IF
1346* End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1347*
1348*
1349 IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1350*
1351* Receive solution from processor to right
1352*
1353 CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1354 $ mycol+level_dist )
1355*
1356* Calculate contribution from this block to next diagonal block
1357*
1358 CALL dgemm( 'N', 'N', bw, nrhs, bw, -one,
1359 $ af( ( odd_size )*bw+1 ), bw, work( 1 ), bw,
1360 $ one, b( part_offset+odd_size+1 ), lldb )
1361*
1362 END IF
1363* End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1364*
1365*
1366* Solve with diagonal block
1367*
1368 CALL dtrtrs( 'L', 'T', 'N', bw, nrhs,
1369 $ af( odd_size*bw+mbw2+1 ), bw,
1370 $ b( part_offset+odd_size+1 ), lldb, info )
1371*
1372 IF( info.NE.0 ) THEN
1373 GO TO 170
1374 END IF
1375*
1376*
1377*
1378***Modification Loop *******
1379*
1380 140 CONTINUE
1381 IF( level_dist.EQ.1 )
1382 $ GO TO 150
1383*
1384 level_dist = level_dist / 2
1385*
1386* Send solution to the right
1387*
1388 IF( mycol+level_dist.LT.npcol-1 ) THEN
1389*
1390 CALL dgesd2d( ictxt, bw, nrhs,
1391 $ b( part_offset+odd_size+1 ), lldb, 0,
1392 $ mycol+level_dist )
1393*
1394 END IF
1395*
1396* Send solution to left
1397*
1398 IF( mycol-level_dist.GE.0 ) THEN
1399*
1400 CALL dgesd2d( ictxt, bw, nrhs,
1401 $ b( part_offset+odd_size+1 ), lldb, 0,
1402 $ mycol-level_dist )
1403*
1404 END IF
1405*
1406 GO TO 140
1407 150 CONTINUE
1408* [End of GOTO Loop]
1409*
1410 160 CONTINUE
1411* [Processor npcol - 1 jumped to here to await next stage]
1412*
1413*******************************
1414* Reduced system has been solved, communicate solutions to nearest
1415* neighbors in preparation for local computation phase.
1416*
1417*
1418* Send elements of solution to next proc
1419*
1420 IF( mycol.LT.npcol-1 ) THEN
1421*
1422 CALL dgesd2d( ictxt, bw, nrhs,
1423 $ b( part_offset+odd_size+1 ), lldb, 0,
1424 $ mycol+1 )
1425*
1426 END IF
1427*
1428* Receive modifications to processor's right hand sides
1429*
1430 IF( mycol.GT.0 ) THEN
1431*
1432 CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1433 $ mycol-1 )
1434*
1435 END IF
1436*
1437*
1438*
1439**********************************************
1440* Local computation phase
1441**********************************************
1442*
1443 IF( mycol.NE.0 ) THEN
1444* Use the "spike" fillin to calculate contribution from previous
1445* processor's solution.
1446*
1447 CALL dgemm( 'N', 'N', odd_size, nrhs, bw, -one, af( 1 ),
1448 $ odd_size, work( 1+bw-bw ), bw, one,
1449 $ b( part_offset+1 ), lldb )
1450*
1451 END IF
1452*
1453*
1454 IF( mycol.LT.np-1 ) THEN
1455* Use factorization of odd-even connection block to modify
1456* locally stored portion of right hand side(s)
1457*
1458*
1459* First copy and multiply it into temporary storage,
1460* then use it on RHS
1461*
1462 CALL dlamov( 'N', bw, nrhs, b( part_offset+odd_size+1 ),
1463 $ lldb, work( 1+bw-bw ), bw )
1464*
1465 CALL dtrmm( 'L', 'L', 'N', 'N', bw, nrhs, -one,
1466 $ a( ( ofst+1+odd_size*llda ) ), llda-1,
1467 $ work( 1+bw-bw ), bw )
1468*
1469 CALL dmatadd( bw, nrhs, one, work( 1+bw-bw ), bw, one,
1470 $ b( part_offset+odd_size-bw+1 ), lldb )
1471*
1472 END IF
1473*
1474* Use main partition in each processor to solve locally
1475*
1476 CALL dtbtrs( uplo, 'N', 'N', odd_size, bw, nrhs,
1477 $ a( ofst+1 ), llda, b( part_offset+1 ), lldb,
1478 $ info )
1479*
1480 END IF
1481* End of "IF( LSAME( TRANS, 'N' ) )"...
1482*
1483*
1484 END IF
1485* End of "IF( LSAME( UPLO, 'L' ) )"...
1486 170 CONTINUE
1487*
1488*
1489* Free BLACS space used to hold standard-form grid.
1490*
1491 IF( ictxt_save.NE.ictxt_new ) THEN
1492 CALL blacs_gridexit( ictxt_new )
1493 END IF
1494*
1495 180 CONTINUE
1496*
1497* Restore saved input parameters
1498*
1499 ictxt = ictxt_save
1500 np = np_save
1501*
1502* Output minimum worksize
1503*
1504 work( 1 ) = work_size_min
1505*
1506*
1507 RETURN
1508*
1509* End of PDPBTRSV
1510*
1511 END