SCALAPACK 2.2.2
psdbtrsv.f
1 SUBROUTINE psdbtrsv( UPLO, TRANS, N, BWL, BWU, NRHS, A, JA, DESCA,
2 $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
3*
4* -- ScaLAPACK routine (version 2.0.2) --
5* Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6* May 1 2012
7*
8* .. Scalar Arguments ..
9 CHARACTER TRANS, UPLO
10 INTEGER BWL, BWU, IB, INFO, JA, LAF, LWORK, N, NRHS
11* ..
12* .. Array Arguments ..
13 INTEGER DESCA( * ), DESCB( * )
14 REAL A( * ), AF( * ), B( * ), WORK( * )
15* ..
16*
17*
18* Purpose
19* =======
20*
21* PSDBTRSV solves a banded triangular system of linear equations
22*
23* A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
24* or
25* A(1:N, JA:JA+N-1)^T * X = B(IB:IB+N-1, 1:NRHS)
26*
27* where A(1:N, JA:JA+N-1) is a banded
28* triangular matrix factor produced by the
29* Gaussian elimination code PSDBTRF
30* and is stored in A(1:N,JA:JA+N-1) and AF.
31* The matrix stored in A(1:N, JA:JA+N-1) is either
32* upper or lower triangular according to UPLO,
33* and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^T
34* is dictated by the user by the parameter TRANS.
35*
36* Routine PSDBTRF MUST be called first.
37*
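*  A typical calling sequence (an illustrative sketch only; see the
*  PSDBTRF and PSDBTRS headers for the authoritative interfaces)
*  factors the matrix once and then reuses the factors and the fill-in
*  stored in AF for each solve:
*
*     CALL PSDBTRF( N, BWL, BWU, A, JA, DESCA, AF, LAF, WORK, LWORK,
*    $              INFO )
*     CALL PSDBTRS( 'N', N, BWL, BWU, NRHS, A, JA, DESCA, B, IB,
*    $              DESCB, AF, LAF, WORK, LWORK, INFO )
*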
38* =====================================================================
39*
40* Arguments
41* =========
42*
43* UPLO (global input) CHARACTER
44* = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored;
45* = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored.
46*
47* TRANS (global input) CHARACTER
48* = 'N': Solve with A(1:N, JA:JA+N-1);
49* = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T;
50*
51* N (global input) INTEGER
52* The number of rows and columns to be operated on, i.e. the
53* order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
54*
55* BWL (global input) INTEGER
56* Number of subdiagonals. 0 <= BWL <= N-1
57*
58* BWU (global input) INTEGER
59* Number of superdiagonals. 0 <= BWU <= N-1
60*
61* NRHS (global input) INTEGER
62* The number of right hand sides, i.e., the number of columns
63* of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
64* NRHS >= 0.
65*
66* A (local input/local output) REAL pointer into
67* local memory to an array with first dimension
68* LLD_A >=(bwl+bwu+1) (stored in DESCA).
69* On entry, this array contains the local pieces of the
70* N-by-N unsymmetric banded triangular factor (L or U,
71* according to UPLO) of A(1:N, JA:JA+N-1) produced by PSDBTRF.
72* This local portion is stored in the packed banded format
73* used in LAPACK. Please see the Notes below and the
74* ScaLAPACK manual for more detail on the format of
75* distributed matrices.
76*
77* JA (global input) INTEGER
78* The index in the global array A that points to the start of
79* the matrix to be operated on (which may be either all of A
80* or a submatrix of A).
81*
82* DESCA (global and local input) INTEGER array of dimension DLEN.
83* if 1D type (DTYPE_A=501), DLEN >= 7;
84* if 2D type (DTYPE_A=1), DLEN >= 9.
85* The array descriptor for the distributed matrix A.
86* Contains information on the mapping of A to memory. Please
87* see NOTES below for full description and options.
88*
89* B (local input/local output) REAL pointer into
90* local memory to an array of local lead dimension lld_b>=NB.
91* On entry, this array contains the
92* local pieces of the right hand sides
93* B(IB:IB+N-1, 1:NRHS).
94* On exit, this contains the local piece of the solution
95* distributed matrix X.
96*
97* IB (global input) INTEGER
98* The row index in the global array B that points to the first
99* row of the matrix to be operated on (which may be either
100* all of B or a submatrix of B).
101*
102* DESCB (global and local input) INTEGER array of dimension DLEN.
103* if 1D type (DTYPE_B=502), DLEN >= 7;
104* if 2D type (DTYPE_B=1), DLEN >= 9.
105* The array descriptor for the distributed matrix B.
106* Contains information on the mapping of B to memory. Please
107* see NOTES below for full description and options.
108*
109* AF (local output) REAL array, dimension LAF.
110* Auxiliary Fillin Space.
111* Fillin is created during the factorization routine
112* PSDBTRF and this is stored in AF. If a linear system
113* is to be solved using PSDBTRS after the factorization
114* routine, AF *must not be altered* after the factorization.
115*
116* LAF (local input) INTEGER
117* Size of user-input Auxiliary Fillin space AF. Must be >=
118* NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu)
119* If LAF is not large enough, an error code will be returned
120* and the minimum acceptable size will be returned in AF( 1 )
121*
122* WORK (local workspace/local output)
123* REAL temporary workspace. This space may
124* be overwritten in between calls to routines. WORK must be
125* the size given in LWORK.
126* On exit, WORK( 1 ) contains the minimal LWORK.
127*
128* LWORK (local input or global input) INTEGER
129* Size of user-input workspace WORK.
130* If LWORK is too small, the minimal acceptable size will be
131* returned in WORK(1) and an error code is returned. LWORK>=
132* (max(bwl,bwu)*NRHS)
133*
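* As a worked example of the LAF and LWORK minima above (illustrative
* numbers only): with NB = 100, BWL = 3, BWU = 2 and NRHS = 1,
* LAF >= 100*(3+2) + 6*3*3 = 554 and LWORK >= max(3,2)*1 = 3.
*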
134* INFO (global output) INTEGER
135* = 0: successful exit
136* < 0: If the i-th argument is an array and the j-th entry had
137* an illegal value, then INFO = -(i*100+j); if the i-th
138* argument is a scalar and had an illegal value, then
139* INFO = -i.
140*
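* For example, a returned value of INFO = -902 decodes as i = 9,
* j = 2: the 9th argument, DESCA, had an illegal 2nd entry (its BLACS
* context), while INFO = -4 would mean the 4th argument, the scalar
* BWL, was illegal.
*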
141* =====================================================================
142*
143*
144* Restrictions
145* ============
146*
147* The following are restrictions on the input parameters. Some of these
148* are temporary and will be removed in future releases, while others
149* may reflect fundamental technical limitations.
150*
151* Non-cyclic restriction: VERY IMPORTANT!
152* P*NB>= mod(JA-1,NB)+N.
153* The mapping for matrices must be blocked, reflecting the nature
154* of the divide and conquer algorithm as a task-parallel algorithm.
155* This formula in words is: no processor may have more than one
156* chunk of the matrix.
157*
158* Blocksize cannot be too small:
159* If the matrix spans more than one processor, the following
160* restriction on NB, the size of each block on each processor,
161* must hold:
162* NB >= 2*MAX(BWL,BWU)
163* The bulk of parallel computation is done on the matrix of size
164* O(NB) on each processor. If this is too small, divide and conquer
165* is a poor choice of algorithm.
166*
167* Submatrix reference:
168* JA = IB
169* Alignment restriction that prevents unnecessary communication.
170*
171*
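*  A minimal pre-call check of these restrictions might look like the
*  following sketch (illustrative only, not part of this routine; P,
*  NB, BWL, BWU, JA, IB and N as defined above, and the blocksize test
*  only matters when the matrix spans more than one process):
*
*     IF( P*NB .LT. MOD( JA-1, NB )+N .OR.
*    $    NB .LT. 2*MAX( BWL, BWU )   .OR.
*    $    JA .NE. IB ) THEN
*        handle the error, e.g. choose a larger NB or realign B
*     END IF
*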
172* =====================================================================
173*
174*
175* Notes
176* =====
177*
178* If the factorization routine and the solve routine are to be called
179* separately (to solve various sets of righthand sides using the same
180* coefficient matrix), the auxiliary space AF *must not be altered*
181* between calls to the factorization routine and the solve routine.
182*
183* The best algorithm for solving banded and tridiagonal linear systems
184* depends on a variety of parameters, especially the bandwidth.
185* Currently, only algorithms designed for the case N/P >> bw are
186* implemented. These go by many names, including Divide and Conquer,
187* Partitioning, domain decomposition-type, etc.
188*
189* Algorithm description: Divide and Conquer
190*
191* The Divide and Conquer algorithm assumes the matrix is narrowly
192* banded compared with the number of equations. In this situation,
193* it is best to distribute the input matrix A one-dimensionally,
194* with columns atomic and rows divided amongst the processes.
195* The basic algorithm divides the banded matrix up into
196* P pieces with one stored on each processor,
197* and then proceeds in 2 phases for the factorization or 3 for the
198* solution of a linear system.
199* 1) Local Phase:
200* The individual pieces are factored independently and in
201* parallel. These factors are applied to the matrix creating
202* fillin, which is stored in a non-inspectable way in auxiliary
203* space AF. Mathematically, this is equivalent to reordering
204* the matrix A as P A P^T and then factoring the principal
205* leading submatrix of size equal to the sum of the sizes of
206* the matrices factored on each processor. The factors of
207* these submatrices overwrite the corresponding parts of A
208* in memory.
209* 2) Reduced System Phase:
210* A small (max(bwl,bwu)* (P-1)) system is formed representing
211* interaction of the larger blocks, and is stored (as are its
212* factors) in the space AF. A parallel Block Cyclic Reduction
213* algorithm is used. For a linear system, a parallel front solve
214* followed by an analogous backsolve, both using the structure
215* of the factored matrix, are performed.
216* 3) Backsubstitution Phase:
217* For a linear system, a local backsubstitution is performed on
218* each processor in parallel.
219*
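*  For example (illustrative numbers only), with P = 4 processes and
*  BWL = BWU = 2, the reduced system of phase 2 has
*  max(bwl,bwu)*(P-1) = 2*3 = 6 equations per right hand side.
*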
220*
221* Descriptors
222* ===========
223*
224* Descriptors now have *types* and differ from ScaLAPACK 1.0.
225*
226* Note: banded codes can use either the old two dimensional
227* or new one-dimensional descriptors, though the processor grid in
228* both cases *must be one-dimensional*. We describe both types below.
229*
230* Each global data object is described by an associated description
231* vector. This vector stores the information required to establish
232* the mapping between an object element and its corresponding process
233* and memory location.
234*
235* Let A be a generic term for any 2D block cyclically distributed array.
236* Such a global array has an associated description vector DESCA.
237* In the following comments, the character _ should be read as
238* "of the global array".
239*
240* NOTATION STORED IN EXPLANATION
241* --------------- -------------- --------------------------------------
242* DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
243* DTYPE_A = 1.
244* CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
245* the BLACS process grid A is distribu-
246* ted over. The context itself is glo-
247* bal, but the handle (the integer
248* value) may vary.
249* M_A (global) DESCA( M_ ) The number of rows in the global
250* array A.
251* N_A (global) DESCA( N_ ) The number of columns in the global
252* array A.
253* MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
254* the rows of the array.
255* NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
256* the columns of the array.
257* RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
258* row of the array A is distributed.
259* CSRC_A (global) DESCA( CSRC_ ) The process column over which the
260* first column of the array A is
261* distributed.
262* LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
263* array. LLD_A >= MAX(1,LOCr(M_A)).
264*
265* Let K be the number of rows or columns of a distributed matrix,
266* and assume that its process grid has dimension p x q.
267* LOCr( K ) denotes the number of elements of K that a process
268* would receive if K were distributed over the p processes of its
269* process column.
270* Similarly, LOCc( K ) denotes the number of elements of K that a
271* process would receive if K were distributed over the q processes of
272* its process row.
273* The values of LOCr() and LOCc() may be determined via a call to the
274* ScaLAPACK tool function, NUMROC:
275* LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
276* LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
277* An upper bound for these quantities may be computed by:
278* LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
279* LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
280*
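* For instance, the number of columns of A held by the calling process
* can be obtained as in the following sketch (illustrative only; LOCQ
* is a placeholder name, and the grid coordinates come from
* BLACS_GRIDINFO):
*
*    CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL )
*    LOCQ = NUMROC( N, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL )
*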
281*
282* One-dimensional descriptors:
283*
284* One-dimensional descriptors are a new addition to ScaLAPACK since
285* version 1.0. They simplify and shorten the descriptor for 1D
286* arrays.
287*
288* Since ScaLAPACK supports two-dimensional arrays as the fundamental
289* object, we allow 1D arrays to be distributed either over the
290* first dimension of the array (as if the grid were P-by-1) or the
291* 2nd dimension (as if the grid were 1-by-P). This choice is
292* indicated by the descriptor type (501 or 502)
293* as described below.
294*
295* IMPORTANT NOTE: the actual BLACS grid represented by the
296* CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
297* irrespective of which one-dimensional descriptor type
298* (501 or 502) is input.
299* This routine will interpret the grid properly either way.
300* ScaLAPACK routines *do not support intercontext operations* so that
301* the grid passed to a single ScaLAPACK routine *must be the same*
302* for all array descriptors passed to that routine.
303*
304* NOTE: In all cases where 1D descriptors are used, 2D descriptors
305* may also be used, since a one-dimensional array is a special case
306* of a two-dimensional array with one dimension of size unity.
307* The two-dimensional array used in this case *must* be of the
308* proper orientation:
309* If the appropriate one-dimensional descriptor is DTYPE_A=501
310* (1 by P type), then the two dimensional descriptor must
311* have a CTXT value that refers to a 1 by P BLACS grid;
312* If the appropriate one-dimensional descriptor is DTYPE_A=502
313* (P by 1 type), then the two dimensional descriptor must
314* have a CTXT value that refers to a P by 1 BLACS grid.
315*
316*
317* Summary of allowed descriptors, types, and BLACS grids:
318* DTYPE 501 502 1 1
319* BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
320* -----------------------------------------------------
321* A OK NO OK NO
322* B NO OK NO OK
323*
324* Note that a consequence of this chart is that it is not possible
325* for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
326* to opposite requirements for the orientation of the BLACS grid,
327* and as noted before, the *same* BLACS context must be used in
328* all descriptors in a single ScaLAPACK subroutine call.
329*
330* Let A be a generic term for any 1D block cyclically distributed array.
331* Such a global array has an associated description vector DESCA.
332* In the following comments, the character _ should be read as
333* "of the global array".
334*
335* NOTATION STORED IN EXPLANATION
336* --------------- ---------- ------------------------------------------
337* DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
338* TYPE_A = 501: 1-by-P grid.
339* TYPE_A = 502: P-by-1 grid.
340* CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
341* the BLACS process grid A is distribu-
342* ted over. The context itself is glo-
343* bal, but the handle (the integer
344* value) may vary.
345* N_A (global) DESCA( 3 ) The size of the array dimension being
346* distributed.
347* NB_A (global) DESCA( 4 ) The blocking factor used to distribute
348* the distributed dimension of the array.
349* SRC_A (global) DESCA( 5 ) The process row or column over which the
350* first row or column of the array
351* is distributed.
352* LLD_A (local) DESCA( 6 ) The leading dimension of the local array
353* storing the local blocks of the distri-
354* buted array A. Minimum value of LLD_A
355* depends on TYPE_A.
356* TYPE_A = 501: LLD_A >=
357* MAX( size of undistributed dimension, 1 ).
358* TYPE_A = 502: LLD_A >= MAX( NB_A, 1 ).
359* Reserved DESCA( 7 ) Reserved for future use.
360*
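*  As an illustration only (the values are placeholders, not
*  requirements), a hand-filled pair of 1D descriptors consistent with
*  this routine could be:
*
*     DESCA( 1 ) = 501          type 501: banded matrix A, 1-by-P
*     DESCA( 2 ) = ICTXT        BLACS context handle
*     DESCA( 3 ) = N            size of the distributed dimension
*     DESCA( 4 ) = NB           blocking factor
*     DESCA( 5 ) = 0            process column owning the first block
*     DESCA( 6 ) = BWL+BWU+1    local leading dimension, >= BWL+BWU+1
*     DESCA( 7 ) = 0            reserved
*
*     DESCB( 1 ) = 502          type 502: right hand sides B, P-by-1
*     DESCB( 2 ) = ICTXT        same BLACS context as DESCA
*     DESCB( 3 ) = N            size of the distributed dimension
*     DESCB( 4 ) = NB           same blocking factor as DESCA
*     DESCB( 5 ) = 0            same source process as DESCA
*     DESCB( 6 ) = NB           local leading dimension, >= NB
*     DESCB( 7 ) = 0            reserved
*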
361*
362*
363* =====================================================================
364*
365* Code Developer: Andrew J. Cleary, University of Tennessee.
366* Current address: Lawrence Livermore National Labs.
367* Last modified by: Peter Arbenz, Institute of Scientific Computing,
368* ETH, Zurich.
369*
370* =====================================================================
371*
372* .. Parameters ..
373 REAL ONE
374 parameter( one = 1.0e+0 )
375 REAL ZERO
376 parameter( zero = 0.0e+0 )
377 INTEGER INT_ONE
378 parameter( int_one = 1 )
379 INTEGER DESCMULT, BIGNUM
380 parameter( descmult = 100, bignum = descmult*descmult )
381 INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
382 $ lld_, mb_, m_, nb_, n_, rsrc_
383 parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
384 $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
385 $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
386* ..
387* .. Local Scalars ..
388 INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE,
389 $ idum1, idum2, idum3, ja_new, level_dist, llda,
390 $ lldb, max_bw, mbw2, mycol, myrow, my_num_cols,
391 $ nb, np, npcol, nprow, np_save, odd_size, ofst,
392 $ part_offset, part_size, return_code, store_m_b,
393 $ store_n_a, work_size_min, work_u
394* ..
395* .. Local Arrays ..
396 INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ),
397 $ param_check( 18, 3 )
398* ..
399* .. External Subroutines ..
400 EXTERNAL blacs_gridexit, blacs_gridinfo, desc_convert,
401 $ sgemm, sgerv2d, sgesd2d, slamov, smatadd,
402 $ stbtrs, strmm, globchk, pxerbla, reshape
403* ..
404* .. External Functions ..
405 LOGICAL LSAME
406 INTEGER NUMROC
407 EXTERNAL lsame, numroc
408* ..
409* .. Intrinsic Functions ..
410 INTRINSIC ichar, max, min, mod
411* ..
412* .. Executable Statements ..
413*
414* Test the input parameters
415*
416 info = 0
417*
418* Convert descriptor into standard form for easy access to
419* parameters, check that grid is of right shape.
420*
421 desca_1xp( 1 ) = 501
422 descb_px1( 1 ) = 502
423*
424 CALL desc_convert( desca, desca_1xp, return_code )
425*
426 IF( return_code.NE.0 ) THEN
427 info = -( 9*100+2 )
428 END IF
429*
430 CALL desc_convert( descb, descb_px1, return_code )
431*
432 IF( return_code.NE.0 ) THEN
433 info = -( 12*100+2 )
434 END IF
435*
436* Consistency checks for DESCA and DESCB.
437*
438* Context must be the same
439 IF( desca_1xp( 2 ).NE.descb_px1( 2 ) ) THEN
440 info = -( 12*100+2 )
441 END IF
442*
443* These are alignment restrictions that may or may not be removed
444* in future releases. -Andy Cleary, April 14, 1996.
445*
446* Block sizes must be the same
447 IF( desca_1xp( 4 ).NE.descb_px1( 4 ) ) THEN
448 info = -( 12*100+4 )
449 END IF
450*
451* Source processor must be the same
452*
453 IF( desca_1xp( 5 ).NE.descb_px1( 5 ) ) THEN
454 info = -( 12*100+5 )
455 END IF
456*
457* Get values out of descriptor for use in code.
458*
459 ictxt = desca_1xp( 2 )
460 csrc = desca_1xp( 5 )
461 nb = desca_1xp( 4 )
462 llda = desca_1xp( 6 )
463 store_n_a = desca_1xp( 3 )
464 lldb = descb_px1( 6 )
465 store_m_b = descb_px1( 3 )
466*
467* Get grid parameters
468*
469*
470* Size of separator blocks is maximum of bandwidths
471*
472 max_bw = max( bwl, bwu )
473 mbw2 = max_bw*max_bw
474*
475 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
476 np = nprow*npcol
477*
478*
479*
480 IF( lsame( uplo, 'U' ) ) THEN
481 idum1 = ichar( 'U' )
482 ELSE IF( lsame( uplo, 'L' ) ) THEN
483 idum1 = ichar( 'L' )
484 ELSE
485 info = -1
486 END IF
487*
488 IF( lsame( trans, 'N' ) ) THEN
489 idum2 = ichar( 'N' )
490 ELSE IF( lsame( trans, 'T' ) ) THEN
491 idum2 = ichar( 'T' )
492 ELSE IF( lsame( trans, 'C' ) ) THEN
493 idum2 = ichar( 'T' )
494 ELSE
495 info = -2
496 END IF
497*
498 IF( lwork.LT.-1 ) THEN
499 info = -16
500 ELSE IF( lwork.EQ.-1 ) THEN
501 idum3 = -1
502 ELSE
503 idum3 = 1
504 END IF
505*
506 IF( n.LT.0 ) THEN
507 info = -3
508 END IF
509*
510 IF( n+ja-1.GT.store_n_a ) THEN
511 info = -( 9*100+6 )
512 END IF
513*
514 IF( ( bwl.GT.n-1 ) .OR. ( bwl.LT.0 ) ) THEN
515 info = -4
516 END IF
517*
518 IF( ( bwu.GT.n-1 ) .OR. ( bwu.LT.0 ) ) THEN
519 info = -5
520 END IF
521*
522 IF( llda.LT.( bwl+bwu+1 ) ) THEN
523 info = -( 9*100+6 )
524 END IF
525*
526 IF( nb.LE.0 ) THEN
527 info = -( 9*100+4 )
528 END IF
529*
530 IF( n+ib-1.GT.store_m_b ) THEN
531 info = -( 12*100+3 )
532 END IF
533*
534 IF( lldb.LT.nb ) THEN
535 info = -( 12*100+6 )
536 END IF
537*
538 IF( nrhs.LT.0 ) THEN
539 info = -6
540 END IF
541*
542* Current alignment restriction
543*
544 IF( ja.NE.ib ) THEN
545 info = -8
546 END IF
547*
548* Argument checking that is specific to Divide & Conquer routine
549*
550 IF( nprow.NE.1 ) THEN
551 info = -( 9*100+2 )
552 END IF
553*
554 IF( n.GT.np*nb-mod( ja-1, nb ) ) THEN
555 info = -( 3 )
556 CALL pxerbla( ictxt,
557 $ 'PSDBTRSV, D&C alg.: only 1 block per proc',
558 $ -info )
559 RETURN
560 END IF
561*
562 IF( ( ja+n-1.GT.nb ) .AND. ( nb.LT.2*max( bwl, bwu ) ) ) THEN
563 info = -( 9*100+4 )
564 CALL pxerbla( ictxt, 'PSDBTRSV, D&C alg.: NB too small',
565 $ -info )
566 RETURN
567 END IF
568*
569*
570 work_size_min = max( bwl, bwu )*nrhs
571*
572 work( 1 ) = work_size_min
573*
574 IF( lwork.LT.work_size_min ) THEN
575 IF( lwork.NE.-1 ) THEN
576 info = -16
577 CALL pxerbla( ictxt, 'PSDBTRSV: worksize error', -info )
578 END IF
579 RETURN
580 END IF
581*
582* Pack params and positions into arrays for global consistency check
583*
584 param_check( 18, 1 ) = descb( 5 )
585 param_check( 17, 1 ) = descb( 4 )
586 param_check( 16, 1 ) = descb( 3 )
587 param_check( 15, 1 ) = descb( 2 )
588 param_check( 14, 1 ) = descb( 1 )
589 param_check( 13, 1 ) = ib
590 param_check( 12, 1 ) = desca( 5 )
591 param_check( 11, 1 ) = desca( 4 )
592 param_check( 10, 1 ) = desca( 3 )
593 param_check( 9, 1 ) = desca( 1 )
594 param_check( 8, 1 ) = ja
595 param_check( 7, 1 ) = nrhs
596 param_check( 6, 1 ) = bwu
597 param_check( 5, 1 ) = bwl
598 param_check( 4, 1 ) = n
599 param_check( 3, 1 ) = idum3
600 param_check( 2, 1 ) = idum2
601 param_check( 1, 1 ) = idum1
602*
603 param_check( 18, 2 ) = 1205
604 param_check( 17, 2 ) = 1204
605 param_check( 16, 2 ) = 1203
606 param_check( 15, 2 ) = 1202
607 param_check( 14, 2 ) = 1201
608 param_check( 13, 2 ) = 11
609 param_check( 12, 2 ) = 905
610 param_check( 11, 2 ) = 904
611 param_check( 10, 2 ) = 903
612 param_check( 9, 2 ) = 901
613 param_check( 8, 2 ) = 8
614 param_check( 7, 2 ) = 6
615 param_check( 6, 2 ) = 5
616 param_check( 5, 2 ) = 4
617 param_check( 4, 2 ) = 3
618 param_check( 3, 2 ) = 16
619 param_check( 2, 2 ) = 2
620 param_check( 1, 2 ) = 1
621*
622* Want to find errors with MIN( ), so if no error, set it to a big
623* number. If there already is an error, multiply by the
624* descriptor multiplier.
625*
626 IF( info.GE.0 ) THEN
627 info = bignum
628 ELSE IF( info.LT.-descmult ) THEN
629 info = -info
630 ELSE
631 info = -info*descmult
632 END IF
633*
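* (For example, a scalar error INFO = -4 is packed as 400 here, and a
* descriptor-entry error INFO = -( 12*100+4 ) as 1204, so the smallest
* packed code can be picked consistently across the grid by the check
* below and then unpacked again.)
*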
634* Check consistency across processors
635*
636 CALL globchk( ictxt, 18, param_check, 18, param_check( 1, 3 ),
637 $ info )
638*
639* Prepare output: set info = 0 if no error, and divide by DESCMULT
640* if error is not in a descriptor entry.
641*
642 IF( info.EQ.bignum ) THEN
643 info = 0
644 ELSE IF( mod( info, descmult ).EQ.0 ) THEN
645 info = -info / descmult
646 ELSE
647 info = -info
648 END IF
649*
650 IF( info.LT.0 ) THEN
651 CALL pxerbla( ictxt, 'PSDBTRSV', -info )
652 RETURN
653 END IF
654*
655* Quick return if possible
656*
657 IF( n.EQ.0 )
658 $ RETURN
659*
660 IF( nrhs.EQ.0 )
661 $ RETURN
662*
663*
664* Adjust addressing into matrix space to properly get into
665* the beginning part of the relevant data
666*
667 part_offset = nb*( ( ja-1 ) / ( npcol*nb ) )
668*
669 IF( ( mycol-csrc ).LT.( ja-part_offset-1 ) / nb ) THEN
670 part_offset = part_offset + nb
671 END IF
672*
673 IF( mycol.LT.csrc ) THEN
674 part_offset = part_offset - nb
675 END IF
676*
677* Form a new BLACS grid (the "standard form" grid) with only procs
678* holding part of the matrix, of size 1xNP where NP is adjusted,
679* starting at csrc=0, with JA modified to reflect dropped procs.
680*
681* First processor to hold part of the matrix:
682*
683 first_proc = mod( ( ja-1 ) / nb+csrc, npcol )
684*
685* Calculate new JA while dropping off unused processors.
686*
687 ja_new = mod( ja-1, nb ) + 1
688*
689* Save and compute new value of NP
690*
691 np_save = np
692 np = ( ja_new+n-2 ) / nb + 1
693*
694* Call utility routine that forms "standard-form" grid
695*
696 CALL reshape( ictxt, int_one, ictxt_new, int_one, first_proc,
697 $ int_one, np )
698*
699* Use new context from standard grid as context.
700*
701 ictxt_save = ictxt
702 ictxt = ictxt_new
703 desca_1xp( 2 ) = ictxt_new
704 descb_px1( 2 ) = ictxt_new
705*
706* Get information about new grid.
707*
708 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
709*
710* Drop out processors that do not have part of the matrix.
711*
712 IF( myrow.LT.0 ) THEN
713 GO TO 200
714 END IF
715*
716* ********************************
717* Values reused throughout routine
718*
719* User-input value of partition size
720*
721 part_size = nb
722*
723* Number of columns in each processor
724*
725 my_num_cols = numroc( n, part_size, mycol, 0, npcol )
726*
727* Offset in columns to beginning of main partition in each proc
728*
729 IF( mycol.EQ.0 ) THEN
730 part_offset = part_offset + mod( ja_new-1, part_size )
731 my_num_cols = my_num_cols - mod( ja_new-1, part_size )
732 END IF
733*
734* Offset in elements
735*
736 ofst = part_offset*llda
737*
738* Size of main (or odd) partition in each processor
739*
740 odd_size = my_num_cols
741 IF( mycol.LT.np-1 ) THEN
742 odd_size = odd_size - max_bw
743 END IF
744*
745* Offset to workspace for Upper triangular factor
746*
747 work_u = bwu*odd_size + 3*mbw2
748*
749*
750*
751* Begin main code
752*
753 IF( lsame( uplo, 'L' ) ) THEN
754*
755 IF( lsame( trans, 'N' ) ) THEN
756*
757* Frontsolve
758*
759*
760******************************************
761* Local computation phase
762******************************************
763*
764* Use main partition in each processor to solve locally
765*
766 CALL stbtrs( uplo, 'N', 'U', odd_size, bwl, nrhs,
767 $ a( ofst+1+bwu ), llda, b( part_offset+1 ),
768 $ lldb, info )
769*
770*
771 IF( mycol.LT.np-1 ) THEN
772* Use factorization of odd-even connection block to modify
773* locally stored portion of right hand side(s)
774*
775*
776* First copy and multiply it into temporary storage,
777* then use it on RHS
778*
779 CALL slamov( 'N', bwl, nrhs,
780 $ b( part_offset+odd_size-bwl+1 ), lldb,
781 $ work( 1 ), max_bw )
782*
783 CALL strmm( 'L', 'U', 'N', 'N', bwl, nrhs, -one,
784 $ a( ( ofst+( bwl+bwu+1 )+( odd_size-bwl )*
785 $ llda ) ), llda-1, work( 1 ), max_bw )
786*
787 CALL smatadd( bwl, nrhs, one, work( 1 ), max_bw, one,
788 $ b( part_offset+odd_size+1 ), lldb )
789*
790 END IF
791*
792* Clear garbage out of workspace block
793*
794 DO 10 idum1 = 1, work_size_min
795 work( idum1 ) = 0.0
796 10 CONTINUE
797*
798*
799 IF( mycol.NE.0 ) THEN
800*
801* Use the "spike" fillin to calculate contribution to previous
802* processor's righthand-side.
803*
804 CALL sgemm( 'N', 'N', bwu, nrhs, odd_size, -one, af( 1 ),
805 $ bwu, b( part_offset+1 ), lldb, zero,
806 $ work( 1+max_bw-bwu ), max_bw )
807*
808 END IF
809*
810*
811************************************************
812* Formation and solution of reduced system
813************************************************
814*
815*
816* Send modifications to prior processor's right hand sides
817*
818 IF( mycol.GT.0 ) THEN
819*
820 CALL sgesd2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
821 $ mycol-1 )
822*
823 END IF
824*
825* Receive modifications to processor's right hand sides
826*
827 IF( mycol.LT.npcol-1 ) THEN
828*
829 CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
830 $ mycol+1 )
831*
832* Combine contribution to locally stored right hand sides
833*
834 CALL smatadd( max_bw, nrhs, one, work( 1 ), max_bw, one,
835 $ b( part_offset+odd_size+1 ), lldb )
836*
837 END IF
838*
839*
840* The last processor does not participate in the solution of the
841* reduced system, having sent its contribution already.
842 IF( mycol.EQ.npcol-1 ) THEN
843 GO TO 40
844 END IF
845*
846*
847* *************************************
848* Modification Loop
849*
850* The distance for sending and receiving for each level starts
851* at 1 for the first level.
852 level_dist = 1
853*
854* Do until this proc is needed to modify other procs' equations
855*
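* (level_dist doubles on each pass, 1, 2, 4, ..., giving a binary
* reduction tree of height about log2(P); a process stays in this
* receive-and-accumulate loop while (mycol+1)/level_dist is even and
* drops out to perform its own elimination step once it is odd)
*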
856 20 CONTINUE
857 IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
858 $ GO TO 30
859*
860* Receive and add contribution to righthand sides from left
861*
862 IF( mycol-level_dist.GE.0 ) THEN
863*
864 CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
865 $ mycol-level_dist )
866*
867 CALL smatadd( max_bw, nrhs, one, work( 1 ), max_bw, one,
868 $ b( part_offset+odd_size+1 ), lldb )
869*
870 END IF
871*
872* Receive and add contribution to righthand sides from right
873*
874 IF( mycol+level_dist.LT.npcol-1 ) THEN
875*
876 CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
877 $ mycol+level_dist )
878*
879 CALL smatadd( max_bw, nrhs, one, work( 1 ), max_bw, one,
880 $ b( part_offset+odd_size+1 ), lldb )
881*
882 END IF
883*
884 level_dist = level_dist*2
885*
886 GO TO 20
887 30 CONTINUE
888* [End of GOTO Loop]
889*
890*
891*
892* *********************************
893* Calculate and use this proc's blocks to modify other procs
894*
895* Solve with diagonal block
896*
897 CALL stbtrs( 'L', 'N', 'U', max_bw, min( bwl, max_bw-1 ),
898 $ nrhs, af( odd_size*bwu+mbw2+1 ), max_bw+1,
899 $ b( part_offset+odd_size+1 ), lldb, info )
900*
901 IF( info.NE.0 ) THEN
902 GO TO 190
903 END IF
904*
905*
906*
907* *********
908 IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
909*
910* Calculate contribution from this block to next diagonal block
911*
912 CALL sgemm( 'T', 'N', max_bw, nrhs, max_bw, -one,
913 $ af( ( odd_size )*bwu+1 ), max_bw,
914 $ b( part_offset+odd_size+1 ), lldb, zero,
915 $ work( 1 ), max_bw )
916*
917* Send contribution to diagonal block's owning processor.
918*
919 CALL sgesd2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
920 $ mycol+level_dist )
921*
922 END IF
923* End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
924*
925* ************
926 IF( ( mycol / level_dist.GT.0 ) .AND.
927 $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
928 $ THEN
929*
930*
931* Use offdiagonal block to calculate modification to diag block
932* of processor to the left
933*
934 CALL sgemm( 'N', 'N', max_bw, nrhs, max_bw, -one,
935 $ af( odd_size*bwu+2*mbw2+1 ), max_bw,
936 $ b( part_offset+odd_size+1 ), lldb, zero,
937 $ work( 1 ), max_bw )
938*
939* Send contribution to diagonal block's owning processor.
940*
941 CALL sgesd2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
942 $ mycol-level_dist )
943*
944 END IF
945* End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
946*
947 40 CONTINUE
948*
949 ELSE
950*
951******************** BACKSOLVE *************************************
952*
953********************************************************************
954* .. Begin reduced system phase of algorithm ..
955********************************************************************
956*
957*
958*
959* The last processor does not participate in the solution of the
960* reduced system and just waits to receive its solution.
961 IF( mycol.EQ.npcol-1 ) THEN
962 GO TO 90
963 END IF
964*
965* Determine number of steps in tree loop
966*
967 level_dist = 1
968 50 CONTINUE
969 IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
970 $ GO TO 60
971*
972 level_dist = level_dist*2
973*
974 GO TO 50
975 60 CONTINUE
976*
977*
978 IF( ( mycol / level_dist.GT.0 ) .AND.
979 $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
980 $ THEN
981*
982* Receive solution from processor to left
983*
984 CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
985 $ mycol-level_dist )
986*
987*
988* Use offdiagonal block to calculate modification to RHS stored
989* on this processor
990*
991 CALL sgemm( 'T', 'N', max_bw, nrhs, max_bw, -one,
992 $ af( odd_size*bwu+2*mbw2+1 ), max_bw,
993 $ work( 1 ), max_bw, one,
994 $ b( part_offset+odd_size+1 ), lldb )
995 END IF
996* End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
997*
998*
999 IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1000*
1001* Receive solution from processor to right
1002*
1003 CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1004 $ mycol+level_dist )
1005*
1006* Calculate contribution from this block to next diagonal block
1007*
1008 CALL sgemm( 'N', 'N', max_bw, nrhs, max_bw, -one,
1009 $ af( ( odd_size )*bwu+1 ), max_bw, work( 1 ),
1010 $ max_bw, one, b( part_offset+odd_size+1 ),
1011 $ lldb )
1012*
1013 END IF
1014* End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1015*
1016*
1017* Solve with diagonal block
1018*
1019 CALL stbtrs( 'L', 'T', 'U', max_bw, min( bwl, max_bw-1 ),
1020 $ nrhs, af( odd_size*bwu+mbw2+1 ), max_bw+1,
1021 $ b( part_offset+odd_size+1 ), lldb, info )
1022*
1023 IF( info.NE.0 ) THEN
1024 GO TO 190
1025 END IF
1026*
1027*
1028*
1029***Modification Loop *******
1030*
1031 70 CONTINUE
1032 IF( level_dist.EQ.1 )
1033 $ GO TO 80
1034*
1035 level_dist = level_dist / 2
1036*
1037* Send solution to the right
1038*
1039 IF( mycol+level_dist.LT.npcol-1 ) THEN
1040*
1041 CALL sgesd2d( ictxt, max_bw, nrhs,
1042 $ b( part_offset+odd_size+1 ), lldb, 0,
1043 $ mycol+level_dist )
1044*
1045 END IF
1046*
1047* Send solution to left
1048*
1049 IF( mycol-level_dist.GE.0 ) THEN
1050*
1051 CALL sgesd2d( ictxt, max_bw, nrhs,
1052 $ b( part_offset+odd_size+1 ), lldb, 0,
1053 $ mycol-level_dist )
1054*
1055 END IF
1056*
1057 GO TO 70
1058 80 CONTINUE
1059* [End of GOTO Loop]
1060*
1061 90 CONTINUE
1062* [Processor npcol - 1 jumped to here to await next stage]
1063*
1064*******************************
1065* Reduced system has been solved, communicate solutions to nearest
1066* neighbors in preparation for local computation phase.
1067*
1068*
1069* Send elements of solution to next proc
1070*
1071 IF( mycol.LT.npcol-1 ) THEN
1072*
1073 CALL sgesd2d( ictxt, max_bw, nrhs,
1074 $ b( part_offset+odd_size+1 ), lldb, 0,
1075 $ mycol+1 )
1076*
1077 END IF
1078*
1079* Receive modifications to processor's right hand sides
1080*
1081 IF( mycol.GT.0 ) THEN
1082*
1083 CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1084 $ mycol-1 )
1085*
1086 END IF
1087*
1088*
1089*
1090**********************************************
1091* Local computation phase
1092**********************************************
1093*
1094 IF( mycol.NE.0 ) THEN
1095* Use the "spike" fillin to calculate contribution from previous
1096* processor's solution.
1097*
1098 CALL sgemm( 'T', 'N', odd_size, nrhs, bwu, -one, af( 1 ),
1099 $ bwu, work( 1+max_bw-bwu ), max_bw, one,
1100 $ b( part_offset+1 ), lldb )
1101*
1102 END IF
1103*
1104*
1105 IF( mycol.LT.np-1 ) THEN
1106* Use factorization of odd-even connection block to modify
1107* locally stored portion of right hand side(s)
1108*
1109*
1110* First copy and multiply it into temporary storage,
1111* then use it on RHS
1112*
1113 CALL slamov( 'N', bwl, nrhs, b( part_offset+odd_size+1 ),
1114 $ lldb, work( 1+max_bw-bwl ), max_bw )
1115*
1116 CALL strmm( 'L', 'U', 'T', 'N', bwl, nrhs, -one,
1117 $ a( ( ofst+( bwl+bwu+1 )+( odd_size-bwl )*
1118 $ llda ) ), llda-1, work( 1+max_bw-bwl ),
1119 $ max_bw )
1120*
1121 CALL smatadd( bwl, nrhs, one, work( 1+max_bw-bwl ),
1122 $ max_bw, one, b( part_offset+odd_size-bwl+
1123 $ 1 ), lldb )
1124*
1125 END IF
1126*
1127* Use main partition in each processor to solve locally
1128*
1129 CALL stbtrs( uplo, 'T', 'U', odd_size, bwl, nrhs,
1130 $ a( ofst+1+bwu ), llda, b( part_offset+1 ),
1131 $ lldb, info )
1132*
1133 END IF
1134* End of "IF( LSAME( TRANS, 'N' ) )"...
1135*
1136*
1137 ELSE
1138***************************************************************
1139* CASE UPLO = 'U' *
1140***************************************************************
1141 IF( lsame( trans, 'T' ) ) THEN
1142*
1143* Frontsolve
1144*
1145*
1146******************************************
1147* Local computation phase
1148******************************************
1149*
1150* Use main partition in each processor to solve locally
1151*
1152 CALL stbtrs( uplo, 'T', 'N', odd_size, bwu, nrhs,
1153 $ a( ofst+1 ), llda, b( part_offset+1 ), lldb,
1154 $ info )
1155*
1156*
1157 IF( mycol.LT.np-1 ) THEN
1158* Use factorization of odd-even connection block to modify
1159* locally stored portion of right hand side(s)
1160*
1161*
1162* First copy and multiply it into temporary storage,
1163* then use it on RHS
1164*
1165 CALL slamov( 'N', bwu, nrhs,
1166 $ b( part_offset+odd_size-bwu+1 ), lldb,
1167 $ work( 1 ), max_bw )
1168*
1169 CALL strmm( 'L', 'L', 'T', 'N', bwu, nrhs, -one,
1170 $ a( ( ofst+1+odd_size*llda ) ), llda-1,
1171 $ work( 1 ), max_bw )
1172*
1173 CALL smatadd( bwu, nrhs, one, work( 1 ), max_bw, one,
1174 $ b( part_offset+odd_size+1 ), lldb )
1175*
1176 END IF
1177*
1178* Clear garbage out of workspace block
1179*
1180 DO 100 idum1 = 1, work_size_min
1181 work( idum1 ) = 0.0
1182 100 CONTINUE
1183*
1184*
1185 IF( mycol.NE.0 ) THEN
1186* Use the "spike" fillin to calculate contribution to previous
1187* processor's righthand-side.
1188*
1189 CALL sgemm( 'N', 'N', bwl, nrhs, odd_size, -one,
1190 $ af( work_u+1 ), bwl, b( part_offset+1 ),
1191 $ lldb, zero, work( 1+max_bw-bwl ), max_bw )
1192 END IF
1193*
1194*
1195************************************************
1196* Formation and solution of reduced system
1197************************************************
1198*
1199*
1200* Send modifications to prior processor's right hand sides
1201*
1202 IF( mycol.GT.0 ) THEN
1203*
1204 CALL sgesd2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1205 $ mycol-1 )
1206*
1207 END IF
1208*
1209* Receive modifications to processor's right hand sides
1210*
1211 IF( mycol.LT.npcol-1 ) THEN
1212*
1213 CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1214 $ mycol+1 )
1215*
1216* Combine contribution to locally stored right hand sides
1217*
1218 CALL smatadd( max_bw, nrhs, one, work( 1 ), max_bw, one,
1219 $ b( part_offset+odd_size+1 ), lldb )
1220*
1221 END IF
1222*
1223*
1224* The last processor does not participate in the solution of the
1225* reduced system, having sent its contribution already.
1226 IF( mycol.EQ.npcol-1 ) THEN
1227 GO TO 130
1228 END IF
1229*
1230*
1231* *************************************
1232* Modification Loop
1233*
1234* The distance for sending and receiving for each level starts
1235* at 1 for the first level.
1236 level_dist = 1
1237*
1238* Do until this proc is needed to modify other procs' equations
1239*
1240 110 CONTINUE
1241 IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
1242 $ GO TO 120
1243*
1244* Receive and add contribution to righthand sides from left
1245*
1246 IF( mycol-level_dist.GE.0 ) THEN
1247*
1248 CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1249 $ mycol-level_dist )
1250*
1251 CALL smatadd( max_bw, nrhs, one, work( 1 ), max_bw, one,
1252 $ b( part_offset+odd_size+1 ), lldb )
1253*
1254 END IF
1255*
1256* Receive and add contribution to righthand sides from right
1257*
1258 IF( mycol+level_dist.LT.npcol-1 ) THEN
1259*
1260 CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1261 $ mycol+level_dist )
1262*
1263 CALL smatadd( max_bw, nrhs, one, work( 1 ), max_bw, one,
1264 $ b( part_offset+odd_size+1 ), lldb )
1265*
1266 END IF
1267*
1268 level_dist = level_dist*2
1269*
1270 GO TO 110
1271 120 CONTINUE
1272* [End of GOTO Loop]
1273*
1274*
1275*
1276* *********************************
1277* Calculate and use this proc's blocks to modify other procs
1278*
1279* Solve with diagonal block
1280*
1281 CALL stbtrs( 'U', 'T', 'N', max_bw, min( bwu, max_bw-1 ),
1282 $ nrhs, af( odd_size*bwu+mbw2+1-min( bwu,
1283 $ max_bw-1 ) ), max_bw+1,
1284 $ b( part_offset+odd_size+1 ), lldb, info )
1285*
1286 IF( info.NE.0 ) THEN
1287 GO TO 190
1288 END IF
1289*
1290*
1291*
1292* *********
1293 IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1294*
1295* Calculate contribution from this block to next diagonal block
1296*
1297 CALL sgemm( 'T', 'N', max_bw, nrhs, max_bw, -one,
1298 $ af( work_u+( odd_size )*bwl+1 ), max_bw,
1299 $ b( part_offset+odd_size+1 ), lldb, zero,
1300 $ work( 1 ), max_bw )
1301*
1302* Send contribution to diagonal block's owning processor.
1303*
1304 CALL sgesd2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1305 $ mycol+level_dist )
1306*
1307 END IF
1308* End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1309*
1310* ************
1311 IF( ( mycol / level_dist.GT.0 ) .AND.
1312 $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
1313 $ THEN
1314*
1315*
1316* Use offdiagonal block to calculate modification to diag block
1317* of processor to the left
1318*
1319 CALL sgemm( 'N', 'N', max_bw, nrhs, max_bw, -one,
1320 $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw,
1321 $ b( part_offset+odd_size+1 ), lldb, zero,
1322 $ work( 1 ), max_bw )
1323*
1324* Send contribution to diagonal block's owning processor.
1325*
1326 CALL sgesd2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1327 $ mycol-level_dist )
1328*
1329 END IF
1330* End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1331*
1332 130 CONTINUE
1333*
1334 ELSE
1335*
1336******************** BACKSOLVE *************************************
1337*
1338********************************************************************
1339* .. Begin reduced system phase of algorithm ..
1340********************************************************************
1341*
1342*
1343*
1344* The last processor does not participate in the solution of the
1345* reduced system and just waits to receive its solution.
1346 IF( mycol.EQ.npcol-1 ) THEN
1347 GO TO 180
1348 END IF
1349*
1350* Determine number of steps in tree loop
1351*
1352 level_dist = 1
1353 140 CONTINUE
1354 IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
1355 $ GO TO 150
1356*
1357 level_dist = level_dist*2
1358*
1359 GO TO 140
1360 150 CONTINUE
1361*
1362*
1363 IF( ( mycol / level_dist.GT.0 ) .AND.
1364 $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
1365 $ THEN
1366*
1367* Receive solution from processor to left
1368*
1369 CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1370 $ mycol-level_dist )
1371*
1372*
1373* Use offdiagonal block to calculate modification to RHS stored
1374* on this processor
1375*
1376 CALL sgemm( 'T', 'N', max_bw, nrhs, max_bw, -one,
1377 $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw,
1378 $ work( 1 ), max_bw, one,
1379 $ b( part_offset+odd_size+1 ), lldb )
1380 END IF
1381* End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1382*
1383*
1384 IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1385*
1386* Receive solution from processor to right
1387*
1388 CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1389 $ mycol+level_dist )
1390*
1391* Calculate contribution from this block to next diagonal block
1392*
1393 CALL sgemm( 'N', 'N', max_bw, nrhs, max_bw, -one,
1394 $ af( work_u+( odd_size )*bwl+1 ), max_bw,
1395 $ work( 1 ), max_bw, one,
1396 $ b( part_offset+odd_size+1 ), lldb )
1397*
1398 END IF
1399* End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1400*
1401*
1402* Solve with diagonal block
1403*
1404 CALL stbtrs( 'U', 'N', 'N', max_bw, min( bwu, max_bw-1 ),
1405 $ nrhs, af( odd_size*bwu+mbw2+1-min( bwu,
1406 $ max_bw-1 ) ), max_bw+1,
1407 $ b( part_offset+odd_size+1 ), lldb, info )
1408*
1409 IF( info.NE.0 ) THEN
1410 GO TO 190
1411 END IF
1412*
1413*
1414*
1415***Modification Loop *******
1416*
1417 160 CONTINUE
1418 IF( level_dist.EQ.1 )
1419 $ GO TO 170
1420*
1421 level_dist = level_dist / 2
1422*
1423* Send solution to the right
1424*
1425 IF( mycol+level_dist.LT.npcol-1 ) THEN
1426*
1427 CALL sgesd2d( ictxt, max_bw, nrhs,
1428 $ b( part_offset+odd_size+1 ), lldb, 0,
1429 $ mycol+level_dist )
1430*
1431 END IF
1432*
1433* Send solution to left
1434*
1435 IF( mycol-level_dist.GE.0 ) THEN
1436*
1437 CALL sgesd2d( ictxt, max_bw, nrhs,
1438 $ b( part_offset+odd_size+1 ), lldb, 0,
1439 $ mycol-level_dist )
1440*
1441 END IF
1442*
1443 GO TO 160
1444 170 CONTINUE
1445* [End of GOTO Loop]
1446*
1447 180 CONTINUE
1448* [Processor npcol - 1 jumped to here to await next stage]
1449*
1450*******************************
1451* Reduced system has been solved, communicate solutions to nearest
1452* neighbors in preparation for local computation phase.
1453*
1454*
1455* Send elements of solution to next proc
1456*
1457 IF( mycol.LT.npcol-1 ) THEN
1458*
1459 CALL sgesd2d( ictxt, max_bw, nrhs,
1460 $ b( part_offset+odd_size+1 ), lldb, 0,
1461 $ mycol+1 )
1462*
1463 END IF
1464*
1465* Receive modifications to processor's right hand sides
1466*
1467 IF( mycol.GT.0 ) THEN
1468*
1469 CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1470 $ mycol-1 )
1471*
1472 END IF
1473*
1474*
1475*
1476**********************************************
1477* Local computation phase
1478**********************************************
1479*
1480 IF( mycol.NE.0 ) THEN
1481* Use the "spike" fillin to calculate contribution from previous
1482* processor's solution.
1483*
1484 CALL sgemm( 'T', 'N', odd_size, nrhs, bwl, -one,
1485 $ af( work_u+1 ), bwl, work( 1+max_bw-bwl ),
1486 $ max_bw, one, b( part_offset+1 ), lldb )
1487*
1488 END IF
1489*
1490*
1491 IF( mycol.LT.np-1 ) THEN
1492* Use factorization of odd-even connection block to modify
1493* locally stored portion of right hand side(s)
1494*
1495*
1496* First copy and multiply it into temporary storage,
1497* then use it on RHS
1498*
1499 CALL slamov( 'N', bwu, nrhs, b( part_offset+odd_size+1 ),
1500 $ lldb, work( 1+max_bw-bwu ), max_bw+bwl )
1501*
1502 CALL strmm( 'L', 'L', 'N', 'N', bwu, nrhs, -one,
1503 $ a( ( ofst+1+odd_size*llda ) ), llda-1,
1504 $ work( 1+max_bw-bwu ), max_bw+bwl )
1505*
1506 CALL smatadd( bwu, nrhs, one, work( 1+max_bw-bwu ),
1507 $ max_bw+bwl, one, b( part_offset+odd_size-
1508 $ bwu+1 ), lldb )
1509*
1510 END IF
1511*
1512* Use main partition in each processor to solve locally
1513*
1514 CALL stbtrs( uplo, 'N', 'N', odd_size, bwu, nrhs,
1515 $ a( ofst+1 ), llda, b( part_offset+1 ), lldb,
1516 $ info )
1517*
1518 END IF
1519* End of "IF( LSAME( TRANS, 'N' ) )"...
1520*
1521*
1522 END IF
1523* End of "IF( LSAME( UPLO, 'L' ) )"...
1524 190 CONTINUE
1525*
1526*
1527* Free BLACS space used to hold standard-form grid.
1528*
1529 IF( ictxt_save.NE.ictxt_new ) THEN
1530 CALL blacs_gridexit( ictxt_new )
1531 END IF
1532*
1533 200 CONTINUE
1534*
1535* Restore saved input parameters
1536*
1537 ictxt = ictxt_save
1538 np = np_save
1539*
1540* Output minimum worksize
1541*
1542 work( 1 ) = work_size_min
1543*
1544*
1545 RETURN
1546*
1547* End of PSDBTRSV
1548*
1549 END