ScaLAPACK 2.1
ScaLAPACK: Scalable Linear Algebra PACKage
pdpbtrsv.f
1  SUBROUTINE pdpbtrsv( UPLO, TRANS, N, BW, NRHS, A, JA, DESCA, B,
2  $ IB, DESCB, AF, LAF, WORK, LWORK, INFO )
3 *
4 * -- ScaLAPACK routine (version 2.0.2) --
5 * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6 * May 1 2012
7 *
8 * .. Scalar Arguments ..
9  CHARACTER TRANS, UPLO
10  INTEGER BW, IB, INFO, JA, LAF, LWORK, N, NRHS
11 * ..
12 * .. Array Arguments ..
13  INTEGER DESCA( * ), DESCB( * )
14  DOUBLE PRECISION A( * ), AF( * ), B( * ), WORK( * )
15 * ..
16 *
17 *
18 * Purpose
19 * =======
20 *
21 * PDPBTRSV solves a banded triangular system of linear equations
22 *
23 * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
24 * or
25 * A(1:N, JA:JA+N-1)^T * X = B(IB:IB+N-1, 1:NRHS)
26 *
27 * where A(1:N, JA:JA+N-1) is a banded
28 * triangular matrix factor produced by the
29 * Cholesky factorization code PDPBTRF
30 * and is stored in A(1:N,JA:JA+N-1) and AF.
31 * The matrix stored in A(1:N, JA:JA+N-1) is either
32 * upper or lower triangular according to UPLO,
33 * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^T
34 * is dictated by the user via the parameter TRANS.
35 *
36 * Routine PDPBTRF MUST be called first.
37 *
38 * =====================================================================
39 *
40 * Arguments
41 * =========
42 *
43 * UPLO (global input) CHARACTER
44 * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored;
45 * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored.
46 *
47 * TRANS (global input) CHARACTER
48 * = 'N': Solve with A(1:N, JA:JA+N-1);
49 * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T;
50 *
51 * N (global input) INTEGER
52 * The number of rows and columns to be operated on, i.e. the
53 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
54 *
55 * BW (global input) INTEGER
56 * Number of subdiagonals in L or U. 0 <= BW <= N-1
57 *
58 * NRHS (global input) INTEGER
59 * The number of right hand sides, i.e., the number of columns
60 * of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
61 * NRHS >= 0.
62 *
63 * A (local input/local output) DOUBLE PRECISION pointer into
64 * local memory to an array with first dimension
65 * LLD_A >=(bw+1) (stored in DESCA).
66 * On entry, this array contains the local pieces of the
67 * N-by-N symmetric banded distributed Cholesky factor L or
68 * L^T of A(1:N, JA:JA+N-1).
69 * This local portion is stored in the packed banded format
70 * used in LAPACK. Please see the Notes below and the
71 * ScaLAPACK manual for more detail on the format of
72 * distributed matrices.
73 *
74 * JA (global input) INTEGER
75 * The index in the global array A that points to the start of
76 * the matrix to be operated on (which may be either all of A
77 * or a submatrix of A).
78 *
79 * DESCA (global and local input) INTEGER array of dimension DLEN.
80 * if 1D type (DTYPE_A=501), DLEN >= 7;
81 * if 2D type (DTYPE_A=1), DLEN >= 9.
82 * The array descriptor for the distributed matrix A.
83 * Contains information on the mapping of A to memory. Please
84 * see NOTES below for full description and options.
85 *
86 * B (local input/local output) DOUBLE PRECISION pointer into
87 * local memory to an array of local leading dimension LLD_B >= NB.
88 * On entry, this array contains the
89 * local pieces of the right hand sides
90 * B(IB:IB+N-1, 1:NRHS).
91 * On exit, this contains the local piece of the solution
92 * distributed matrix X.
93 *
94 * IB (global input) INTEGER
95 * The row index in the global array B that points to the first
96 * row of the matrix to be operated on (which may be either
97 * all of B or a submatrix of B).
98 *
99 * DESCB (global and local input) INTEGER array of dimension DLEN.
100 * if 1D type (DTYPE_B=502), DLEN >= 7;
101 * if 2D type (DTYPE_B=1), DLEN >= 9.
102 * The array descriptor for the distributed matrix B.
103 * Contains information on the mapping of B to memory. Please
104 * see NOTES below for full description and options.
105 *
106 * AF (local input) DOUBLE PRECISION array, dimension LAF.
107 * Auxiliary Fillin Space.
108 * Fillin is created during the factorization routine
109 * PDPBTRF and this is stored in AF. If a linear system
110 * is to be solved using PDPBTRS after the factorization
111 * routine, AF *must not be altered* after the factorization.
112 *
113 * LAF (local input) INTEGER
114 * Size of user-input Auxiliary Fillin space AF. Must be >=
115 * (NB+2*bw)*bw
116 * If LAF is not large enough, an error code will be returned
117 * and the minimum acceptable size will be returned in AF( 1 )
118 *
119 * WORK (local workspace/local output)
120 * DOUBLE PRECISION temporary workspace. This space may
121 * be overwritten in between calls to routines. WORK must be
122 * the size given in LWORK.
123 * On exit, WORK( 1 ) contains the minimal LWORK.
124 *
125 * LWORK (local input or global input) INTEGER
126 * Size of user-input workspace WORK.
127 * If LWORK is too small, the minimal acceptable size will be
128 * returned in WORK(1) and an error code is returned. LWORK>=
129 * (bw*NRHS)
130 *
131 * INFO (global output) INTEGER
132 * = 0: successful exit
133 * < 0: If the i-th argument is an array and the j-th entry had
134 * an illegal value, then INFO = -(i*100+j), if the i-th
135 * argument is a scalar and had an illegal value, then
136 * INFO = -i.
137 *
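*  As a hedged illustration of the workspace requirements above,
*  the minimum sizes accepted by this routine are (LAF_MIN and
*  LWORK_MIN are illustrative local names, not arguments):
*
*     LAF_MIN   = ( NB + 2*BW )*BW
*     LWORK_MIN = BW*NRHS
*
*  where NB is the column block size stored in the descriptor DESCA.
*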
138 * =====================================================================
139 *
140 *
141 * Restrictions
142 * ============
143 *
144 * The following are restrictions on the input parameters. Some of these
145 * are temporary and will be removed in future releases, while others
146 * may reflect fundamental technical limitations.
147 *
148 * Non-cyclic restriction: VERY IMPORTANT!
149 * P*NB>= mod(JA-1,NB)+N.
150 * The mapping for matrices must be blocked, reflecting the nature
151 * of the divide and conquer algorithm as a task-parallel algorithm.
152 * This formula in words is: no processor may have more than one
153 * chunk of the matrix.
154 *
155 * Blocksize cannot be too small:
156 * If the matrix spans more than one processor, the following
157 * restriction on NB, the size of each block on each processor,
158 * must hold:
159 * NB >= 2*BW
160 * The bulk of parallel computation is done on the matrix of size
161 * O(NB) on each processor. If this is too small, divide and conquer
162 * is a poor choice of algorithm.
163 *
164 * Submatrix reference:
165 * JA = IB
166 * Alignment restriction that prevents unnecessary communication.
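*
*  As an illustration, the restrictions above can be checked before
*  the call with a sketch like the following, where P denotes the
*  number of processes holding part of the matrix and OK is an
*  illustrative LOGICAL introduced only for this example:
*
*     OK = ( P*NB .GE. MOD( JA-1, NB )+N ) .AND.
*    $     ( NB .GE. 2*BW ) .AND.
*    $     ( JA .EQ. IB )
*
*  The test NB >= 2*BW is only required when the matrix spans more
*  than one process.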
167 *
168 *
169 * =====================================================================
170 *
171 *
172 * Notes
173 * =====
174 *
175 * If the factorization routine and the solve routine are to be called
176 * separately (to solve various sets of right-hand sides using the same
177 * coefficient matrix), the auxiliary space AF *must not be altered*
178 * between calls to the factorization routine and the solve routine.
179 *
180 * The best algorithm for solving banded and tridiagonal linear systems
181 * depends on a variety of parameters, especially the bandwidth.
182 * Currently, only algorithms designed for the case N/P >> bw are
183 * implemented. These go by many names, including Divide and Conquer,
184 * Partitioning, domain decomposition-type, etc.
185 *
186 * Algorithm description: Divide and Conquer
187 *
188 * The Divide and Conquer algorithm assumes the matrix is narrowly
189 * banded compared with the number of equations. In this situation,
190 * it is best to distribute the input matrix A one-dimensionally,
191 * with columns atomic and rows divided amongst the processes.
192 * The basic algorithm divides the banded matrix up into
193 * P pieces with one stored on each processor,
194 * and then proceeds in 2 phases for the factorization or 3 for the
195 * solution of a linear system.
196 * 1) Local Phase:
197 * The individual pieces are factored independently and in
198 * parallel. These factors are applied to the matrix creating
199 * fillin, which is stored in a non-inspectable way in auxiliary
200 * space AF. Mathematically, this is equivalent to reordering
201 * the matrix A as P A P^T and then factoring the principal
202 * leading submatrix of size equal to the sum of the sizes of
203 * the matrices factored on each processor. The factors of
204 * these submatrices overwrite the corresponding parts of A
205 * in memory.
206 * 2) Reduced System Phase:
207 * A small ( BW*(P-1) ) system is formed representing
208 * interaction of the larger blocks, and is stored (as are its
209 * factors) in the space AF. A parallel Block Cyclic Reduction
210 * algorithm is used. For a linear system, a parallel front solve
211 * followed by an analogous backsolve, both using the structure
212 * of the factored matrix, are performed.
213 * 3) Backsubstitution Phase:
214 * For a linear system, a local backsubstitution is performed on
215 * each processor in parallel.
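*
*  In the Reduced System Phase the participating processes form a
*  binary tree. A sketch of the stride pattern, using the same
*  LEVEL_DIST quantity that appears as a local variable in the code
*  below (the loop body is described in words):
*
*     LEVEL_DIST = 1
*     DO WHILE( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).EQ.0 )
*        receive and accumulate the BW-by-NRHS contributions from
*        processes MYCOL-LEVEL_DIST and MYCOL+LEVEL_DIST, if present
*        LEVEL_DIST = 2*LEVEL_DIST
*     END DO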
216 *
217 *
218 * Descriptors
219 * ===========
220 *
221 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
222 *
223 * Note: banded codes can use either the old two dimensional
224 * or new one-dimensional descriptors, though the processor grid in
225 * both cases *must be one-dimensional*. We describe both types below.
226 *
227 * Each global data object is described by an associated description
228 * vector. This vector stores the information required to establish
229 * the mapping between an object element and its corresponding process
230 * and memory location.
231 *
232 * Let A be a generic term for any 2D block cyclically distributed array.
233 * Such a global array has an associated description vector DESCA.
234 * In the following comments, the character _ should be read as
235 * "of the global array".
236 *
237 * NOTATION STORED IN EXPLANATION
238 * --------------- -------------- --------------------------------------
239 * DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
240 * DTYPE_A = 1.
241 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
242 * the BLACS process grid A is distribu-
243 * ted over. The context itself is glo-
244 * bal, but the handle (the integer
245 * value) may vary.
246 * M_A (global) DESCA( M_ ) The number of rows in the global
247 * array A.
248 * N_A (global) DESCA( N_ ) The number of columns in the global
249 * array A.
250 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
251 * the rows of the array.
252 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
253 * the columns of the array.
254 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
255 * row of the array A is distributed.
256 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
257 * first column of the array A is
258 * distributed.
259 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
260 * array. LLD_A >= MAX(1,LOCr(M_A)).
261 *
262 * Let K be the number of rows or columns of a distributed matrix,
263 * and assume that its process grid has dimension p x q.
264 * LOCr( K ) denotes the number of elements of K that a process
265 * would receive if K were distributed over the p processes of its
266 * process column.
267 * Similarly, LOCc( K ) denotes the number of elements of K that a
268 * process would receive if K were distributed over the q processes of
269 * its process row.
270 * The values of LOCr() and LOCc() may be determined via a call to the
271 * ScaLAPACK tool function, NUMROC:
272 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
273 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
274 * An upper bound for these quantities may be computed by:
275 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
276 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
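*
*  A small worked example of the formulas above, with illustrative
*  values M = 9, MB_A = 2, RSRC_A = 0 and NPROW = 2:
*     LOCr( 9 ) = NUMROC( 9, 2, 0, 0, 2 ) = 5 on process row 0,
*     LOCr( 9 ) = NUMROC( 9, 2, 1, 0, 2 ) = 4 on process row 1,
*  both within the bound ceil( ceil(9/2)/2 )*2 = 6.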
277 *
278 *
279 * One-dimensional descriptors:
280 *
281 * One-dimensional descriptors are a new addition to ScaLAPACK since
282 * version 1.0. They simplify and shorten the descriptor for 1D
283 * arrays.
284 *
285 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
286 * object, we allow 1D arrays to be distributed either over the
287 * first dimension of the array (as if the grid were P-by-1) or the
288 * 2nd dimension (as if the grid were 1-by-P). This choice is
289 * indicated by the descriptor type (501 or 502)
290 * as described below.
291 *
292 * IMPORTANT NOTE: the actual BLACS grid represented by the
293 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
294 * irrespective of which one-dimensional descriptor type
295 * (501 or 502) is input.
296 * This routine will interpret the grid properly either way.
297 * ScaLAPACK routines *do not support intercontext operations* so that
298 * the grid passed to a single ScaLAPACK routine *must be the same*
299 * for all array descriptors passed to that routine.
300 *
301 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
302 * may also be used, since a one-dimensional array is a special case
303 * of a two-dimensional array with one dimension of size unity.
304 * The two-dimensional array used in this case *must* be of the
305 * proper orientation:
306 * If the appropriate one-dimensional descriptor is DTYPEA=501
307 * (1 by P type), then the two dimensional descriptor must
308 * have a CTXT value that refers to a 1 by P BLACS grid;
309 * If the appropriate one-dimensional descriptor is DTYPEA=502
310 * (P by 1 type), then the two dimensional descriptor must
311 * have a CTXT value that refers to a P by 1 BLACS grid.
312 *
313 *
314 * Summary of allowed descriptors, types, and BLACS grids:
315 *    DTYPE          501          502          1         1
316 *    BLACS grid     1xP or Px1   1xP or Px1   1xP       Px1
317 *    ---------------------------------------------------------
318 *    A              OK           NO           OK        NO
319 *    B              NO           OK           NO        OK
320 *
321 * Note that a consequence of this chart is that it is not possible
322 * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
323 * to opposite requirements for the orientation of the BLACS grid,
324 * and as noted before, the *same* BLACS context must be used in
325 * all descriptors in a single ScaLAPACK subroutine call.
326 *
327 * Let A be a generic term for any 1D block cyclically distributed array.
328 * Such a global array has an associated description vector DESCA.
329 * In the following comments, the character _ should be read as
330 * "of the global array".
331 *
332 * NOTATION STORED IN EXPLANATION
333 * --------------- ---------- ------------------------------------------
334 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
335 * TYPE_A = 501: 1-by-P grid.
336 * TYPE_A = 502: P-by-1 grid.
337 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
338 * the BLACS process grid A is distribu-
339 * ted over. The context itself is glo-
340 * bal, but the handle (the integer
341 * value) may vary.
342 * N_A (global) DESCA( 3 ) The size of the array dimension being
343 * distributed.
344 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
345 * the distributed dimension of the array.
346 * SRC_A (global) DESCA( 5 ) The process row or column over which the
347 * first row or column of the array
348 * is distributed.
349 * LLD_A (local) DESCA( 6 ) The leading dimension of the local array
350 * storing the local blocks of the distri-
351 * buted array A. Minimum value of LLD_A
352 * depends on TYPE_A.
353 * TYPE_A = 501: LLD_A >=
354 * MAX( size of undistributed dimension, 1 ).
355 * TYPE_A = 502: LLD_A >= MAX( NB_A, 1 ).
356 * Reserved DESCA( 7 ) Reserved for future use.
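*
*  A hedged sketch of filling the two 7-entry descriptors expected by
*  this routine, assuming JA = IB = 1 and source process 0; ICTXT,
*  LLDA_LOC and LLDB_LOC are illustrative names introduced only for
*  this example:
*
*     DESCA( 1 ) = 501
*     DESCA( 2 ) = ICTXT
*     DESCA( 3 ) = N
*     DESCA( 4 ) = NB
*     DESCA( 5 ) = 0
*     DESCA( 6 ) = LLDA_LOC      ( LLDA_LOC >= BW+1 )
*     DESCA( 7 ) = 0
*
*     DESCB( 1 ) = 502
*     DESCB( 2 ) = ICTXT
*     DESCB( 3 ) = N
*     DESCB( 4 ) = NB
*     DESCB( 5 ) = 0
*     DESCB( 6 ) = LLDB_LOC      ( LLDB_LOC >= NB )
*     DESCB( 7 ) = 0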
357 *
358 *
359 *
360 * =====================================================================
361 *
362 * Code Developer: Andrew J. Cleary, University of Tennessee.
363 * Current address: Lawrence Livermore National Labs.
364 *
365 * =====================================================================
366 *
367 * .. Parameters ..
368  DOUBLE PRECISION ONE
369  parameter( one = 1.0d+0 )
370  DOUBLE PRECISION ZERO
371  parameter( zero = 0.0d+0 )
372  INTEGER INT_ONE
373  parameter( int_one = 1 )
374  INTEGER DESCMULT, BIGNUM
375  parameter( descmult = 100, bignum = descmult*descmult )
376  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
377  $ lld_, mb_, m_, nb_, n_, rsrc_
378  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
379  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
380  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
381 * ..
382 * .. Local Scalars ..
383  INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE,
384  $ idum1, idum2, idum3, ja_new, level_dist, llda,
385  $ lldb, mbw2, mycol, myrow, my_num_cols, nb, np,
386  $ npcol, nprow, np_save, odd_size, ofst,
387  $ part_offset, part_size, return_code, store_m_b,
388  $ store_n_a, work_size_min
389 * ..
390 * .. Local Arrays ..
391  INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ),
392  $ param_check( 17, 3 )
393 * ..
394 * .. External Subroutines ..
395  EXTERNAL blacs_gridexit, blacs_gridinfo, desc_convert,
396  $ dgemm, dgerv2d, dgesd2d, dlamov, dmatadd,
397  $ dtbtrs, dtrmm, dtrtrs, globchk, pxerbla,
398  $ reshape
399 * ..
400 * .. External Functions ..
401  LOGICAL LSAME
402  INTEGER NUMROC
403  EXTERNAL lsame, numroc
404 * ..
405 * .. Intrinsic Functions ..
406  INTRINSIC ichar, mod
407 * ..
408 * .. Executable Statements ..
409 *
410 * Test the input parameters
411 *
412  info = 0
413 *
414 * Convert descriptor into standard form for easy access to
415 * parameters, check that grid is of right shape.
416 *
417  desca_1xp( 1 ) = 501
418  descb_px1( 1 ) = 502
419 *
420  CALL desc_convert( desca, desca_1xp, return_code )
421 *
422  IF( return_code.NE.0 ) THEN
423  info = -( 8*100+2 )
424  END IF
425 *
426  CALL desc_convert( descb, descb_px1, return_code )
427 *
428  IF( return_code.NE.0 ) THEN
429  info = -( 11*100+2 )
430  END IF
431 *
432 * Consistency checks for DESCA and DESCB.
433 *
434 * Context must be the same
435  IF( desca_1xp( 2 ).NE.descb_px1( 2 ) ) THEN
436  info = -( 11*100+2 )
437  END IF
438 *
439 * These are alignment restrictions that may or may not be removed
440 * in future releases. -Andy Cleary, April 14, 1996.
441 *
442 * Block sizes must be the same
443  IF( desca_1xp( 4 ).NE.descb_px1( 4 ) ) THEN
444  info = -( 11*100+4 )
445  END IF
446 *
447 * Source processor must be the same
448 *
449  IF( desca_1xp( 5 ).NE.descb_px1( 5 ) ) THEN
450  info = -( 11*100+5 )
451  END IF
452 *
453 * Get values out of descriptor for use in code.
454 *
455  ictxt = desca_1xp( 2 )
456  csrc = desca_1xp( 5 )
457  nb = desca_1xp( 4 )
458  llda = desca_1xp( 6 )
459  store_n_a = desca_1xp( 3 )
460  lldb = descb_px1( 6 )
461  store_m_b = descb_px1( 3 )
462 *
463 * Get grid parameters
464 *
465 *
466 * Pre-calculate bw^2
467 *
468  mbw2 = bw*bw
469 *
470  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
471  np = nprow*npcol
472 *
473 *
474 *
475  IF( lsame( uplo, 'U' ) ) THEN
476  idum1 = ichar( 'U' )
477  ELSE IF( lsame( uplo, 'L' ) ) THEN
478  idum1 = ichar( 'L' )
479  ELSE
480  info = -1
481  END IF
482 *
483  IF( lsame( trans, 'N' ) ) THEN
484  idum2 = ichar( 'N' )
485  ELSE IF( lsame( trans, 'T' ) ) THEN
486  idum2 = ichar( 'T' )
487  ELSE IF( lsame( trans, 'C' ) ) THEN
488  idum2 = ichar( 'T' )
489  ELSE
490  info = -2
491  END IF
492 *
493  IF( lwork.LT.-1 ) THEN
494  info = -14
495  ELSE IF( lwork.EQ.-1 ) THEN
496  idum3 = -1
497  ELSE
498  idum3 = 1
499  END IF
500 *
501  IF( n.LT.0 ) THEN
502  info = -3
503  END IF
504 *
505  IF( n+ja-1.GT.store_n_a ) THEN
506  info = -( 8*100+6 )
507  END IF
508 *
509  IF( ( bw.GT.n-1 ) .OR. ( bw.LT.0 ) ) THEN
510  info = -4
511  END IF
512 *
513  IF( llda.LT.( bw+1 ) ) THEN
514  info = -( 8*100+6 )
515  END IF
516 *
517  IF( nb.LE.0 ) THEN
518  info = -( 8*100+4 )
519  END IF
520 *
521  IF( n+ib-1.GT.store_m_b ) THEN
522  info = -( 11*100+3 )
523  END IF
524 *
525  IF( lldb.LT.nb ) THEN
526  info = -( 11*100+6 )
527  END IF
528 *
529  IF( nrhs.LT.0 ) THEN
530  info = -5
531  END IF
532 *
533 * Current alignment restriction
534 *
535  IF( ja.NE.ib ) THEN
536  info = -7
537  END IF
538 *
539 * Argument checking that is specific to Divide & Conquer routine
540 *
541  IF( nprow.NE.1 ) THEN
542  info = -( 8*100+2 )
543  END IF
544 *
545  IF( n.GT.np*nb-mod( ja-1, nb ) ) THEN
546  info = -( 3 )
547  CALL pxerbla( ictxt,
548  $ 'PDPBTRSV, D&C alg.: only 1 block per proc',
549  $ -info )
550  RETURN
551  END IF
552 *
553  IF( ( ja+n-1.GT.nb ) .AND. ( nb.LT.2*bw ) ) THEN
554  info = -( 8*100+4 )
555  CALL pxerbla( ictxt, 'PDPBTRSV, D&C alg.: NB too small',
556  $ -info )
557  RETURN
558  END IF
559 *
560 *
561  work_size_min = bw*nrhs
562 *
563  work( 1 ) = work_size_min
564 *
565  IF( lwork.LT.work_size_min ) THEN
566  IF( lwork.NE.-1 ) THEN
567  info = -14
568  CALL pxerbla( ictxt, 'PDPBTRSV: worksize error', -info )
569  END IF
570  RETURN
571  END IF
572 *
573 * Pack params and positions into arrays for global consistency check
574 *
575  param_check( 17, 1 ) = descb( 5 )
576  param_check( 16, 1 ) = descb( 4 )
577  param_check( 15, 1 ) = descb( 3 )
578  param_check( 14, 1 ) = descb( 2 )
579  param_check( 13, 1 ) = descb( 1 )
580  param_check( 12, 1 ) = ib
581  param_check( 11, 1 ) = desca( 5 )
582  param_check( 10, 1 ) = desca( 4 )
583  param_check( 9, 1 ) = desca( 3 )
584  param_check( 8, 1 ) = desca( 1 )
585  param_check( 7, 1 ) = ja
586  param_check( 6, 1 ) = nrhs
587  param_check( 5, 1 ) = bw
588  param_check( 4, 1 ) = n
589  param_check( 3, 1 ) = idum3
590  param_check( 2, 1 ) = idum2
591  param_check( 1, 1 ) = idum1
592 *
593  param_check( 17, 2 ) = 1105
594  param_check( 16, 2 ) = 1104
595  param_check( 15, 2 ) = 1103
596  param_check( 14, 2 ) = 1102
597  param_check( 13, 2 ) = 1101
598  param_check( 12, 2 ) = 10
599  param_check( 11, 2 ) = 805
600  param_check( 10, 2 ) = 804
601  param_check( 9, 2 ) = 803
602  param_check( 8, 2 ) = 801
603  param_check( 7, 2 ) = 7
604  param_check( 6, 2 ) = 5
605  param_check( 5, 2 ) = 4
606  param_check( 4, 2 ) = 3
607  param_check( 3, 2 ) = 14
608  param_check( 2, 2 ) = 2
609  param_check( 1, 2 ) = 1
610 *
611 * Want to find errors with MIN( ), so if no error, set it to a big
612 * number. If there already is an error, multiply by the
613 * descriptor multiplier.
614 *
615  IF( info.GE.0 ) THEN
616  info = bignum
617  ELSE IF( info.LT.-descmult ) THEN
618  info = -info
619  ELSE
620  info = -info*descmult
621  END IF
622 *
623 * Check consistency across processors
624 *
625  CALL globchk( ictxt, 17, param_check, 17, param_check( 1, 3 ),
626  $ info )
627 *
628 * Prepare output: set info = 0 if no error, and divide by DESCMULT
629 * if error is not in a descriptor entry.
630 *
631  IF( info.EQ.bignum ) THEN
632  info = 0
633  ELSE IF( mod( info, descmult ).EQ.0 ) THEN
634  info = -info / descmult
635  ELSE
636  info = -info
637  END IF
638 *
639  IF( info.LT.0 ) THEN
640  CALL pxerbla( ictxt, 'PDPBTRSV', -info )
641  RETURN
642  END IF
643 *
644 * Quick return if possible
645 *
646  IF( n.EQ.0 )
647  $ RETURN
648 *
649  IF( nrhs.EQ.0 )
650  $ RETURN
651 *
652 *
653 * Adjust addressing into matrix space to properly get into
654 * the beginning part of the relevant data
655 *
656  part_offset = nb*( ( ja-1 ) / ( npcol*nb ) )
657 *
658  IF( ( mycol-csrc ).LT.( ja-part_offset-1 ) / nb ) THEN
659  part_offset = part_offset + nb
660  END IF
661 *
662  IF( mycol.LT.csrc ) THEN
663  part_offset = part_offset - nb
664  END IF
665 *
666 * Form a new BLACS grid (the "standard form" grid) with only procs
667 * holding part of the matrix, of size 1xNP where NP is adjusted,
668 * starting at csrc=0, with JA modified to reflect dropped procs.
669 *
670 * First processor to hold part of the matrix:
671 *
672  first_proc = mod( ( ja-1 ) / nb+csrc, npcol )
673 *
674 * Calculate new JA while dropping off unused processors.
675 *
676  ja_new = mod( ja-1, nb ) + 1
677 *
678 * Save and compute new value of NP
679 *
680  np_save = np
681  np = ( ja_new+n-2 ) / nb + 1
682 *
683 * Call utility routine that forms "standard-form" grid
684 *
685  CALL reshape( ictxt, int_one, ictxt_new, int_one, first_proc,
686  $ int_one, np )
687 *
688 * Use new context from standard grid as context.
689 *
690  ictxt_save = ictxt
691  ictxt = ictxt_new
692  desca_1xp( 2 ) = ictxt_new
693  descb_px1( 2 ) = ictxt_new
694 *
695 * Get information about new grid.
696 *
697  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
698 *
699 * Drop out processors that do not have part of the matrix.
700 *
701  IF( myrow.LT.0 ) THEN
702  GO TO 180
703  END IF
704 *
705 * ********************************
706 * Values reused throughout routine
707 *
708 * User-input value of partition size
709 *
710  part_size = nb
711 *
712 * Number of columns in each processor
713 *
714  my_num_cols = numroc( n, part_size, mycol, 0, npcol )
715 *
716 * Offset in columns to beginning of main partition in each proc
717 *
718  IF( mycol.EQ.0 ) THEN
719  part_offset = part_offset + mod( ja_new-1, part_size )
720  my_num_cols = my_num_cols - mod( ja_new-1, part_size )
721  END IF
722 *
723 * Offset in elements
724 *
725  ofst = part_offset*llda
726 *
727 * Size of main (or odd) partition in each processor
728 *
729  odd_size = my_num_cols
730  IF( mycol.LT.np-1 ) THEN
731  odd_size = odd_size - bw
732  END IF
733 *
734 *
735 *
736 * Begin main code
737 *
738  IF( lsame( uplo, 'L' ) ) THEN
739 *
740  IF( lsame( trans, 'N' ) ) THEN
741 *
742 * Frontsolve
743 *
744 *
745 ******************************************
746 * Local computation phase
747 ******************************************
748 *
749 * Use main partition in each processor to solve locally
750 *
751  CALL dtbtrs( uplo, 'N', 'N', odd_size, bw, nrhs,
752  $ a( ofst+1 ), llda, b( part_offset+1 ), lldb,
753  $ info )
754 *
755 *
756  IF( mycol.LT.np-1 ) THEN
757 * Use factorization of odd-even connection block to modify
758 * locally stored portion of right hand side(s)
759 *
760 *
761 * First copy and multiply it into temporary storage,
762 * then use it on RHS
763 *
764  CALL dlamov( 'N', bw, nrhs,
765  $ b( part_offset+odd_size-bw+1 ), lldb,
766  $ work( 1 ), bw )
767 *
768  CALL dtrmm( 'L', 'U', 'N', 'N', bw, nrhs, -one,
769  $ a( ( ofst+( bw+1 )+( odd_size-bw )*llda ) ),
770  $ llda-1, work( 1 ), bw )
771 *
772  CALL dmatadd( bw, nrhs, one, work( 1 ), bw, one,
773  $ b( part_offset+odd_size+1 ), lldb )
774 *
775  END IF
776 *
777 *
778  IF( mycol.NE.0 ) THEN
779 * Use the "spike" fillin to calculate contribution to previous
780 * processor's right-hand side.
781 *
782  CALL dgemm( 'T', 'N', bw, nrhs, odd_size, -one, af( 1 ),
783  $ odd_size, b( part_offset+1 ), lldb, zero,
784  $ work( 1+bw-bw ), bw )
785  END IF
786 *
787 *
788 ************************************************
789 * Formation and solution of reduced system
790 ************************************************
791 *
792 *
793 * Send modifications to prior processor's right hand sides
794 *
795  IF( mycol.GT.0 ) THEN
796 *
797  CALL dgesd2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
798  $ mycol-1 )
799 *
800  END IF
801 *
802 * Receive modifications to processor's right hand sides
803 *
804  IF( mycol.LT.npcol-1 ) THEN
805 *
806  CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
807  $ mycol+1 )
808 *
809 * Combine contribution to locally stored right hand sides
810 *
811  CALL dmatadd( bw, nrhs, one, work( 1 ), bw, one,
812  $ b( part_offset+odd_size+1 ), lldb )
813 *
814  END IF
815 *
816 *
817 * The last processor does not participate in the solution of the
818 * reduced system, having sent its contribution already.
819  IF( mycol.EQ.npcol-1 ) THEN
820  GO TO 30
821  END IF
822 *
823 *
824 * *************************************
825 * Modification Loop
826 *
827 * The distance for sending and receiving for each level starts
828 * at 1 for the first level.
829  level_dist = 1
830 *
831 * Do until this proc is needed to modify other procs' equations
832 *
833  10 CONTINUE
834  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
835  $ GO TO 20
836 *
837 * Receive and add contribution to right-hand sides from left
838 *
839  IF( mycol-level_dist.GE.0 ) THEN
840 *
841  CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
842  $ mycol-level_dist )
843 *
844  CALL dmatadd( bw, nrhs, one, work( 1 ), bw, one,
845  $ b( part_offset+odd_size+1 ), lldb )
846 *
847  END IF
848 *
849 * Receive and add contribution to right-hand sides from right
850 *
851  IF( mycol+level_dist.LT.npcol-1 ) THEN
852 *
853  CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
854  $ mycol+level_dist )
855 *
856  CALL dmatadd( bw, nrhs, one, work( 1 ), bw, one,
857  $ b( part_offset+odd_size+1 ), lldb )
858 *
859  END IF
860 *
861  level_dist = level_dist*2
862 *
863  GO TO 10
864  20 CONTINUE
865 * [End of GOTO Loop]
866 *
867 *
868 *
869 * *********************************
870 * Calculate and use this proc's blocks to modify other procs
871 *
872 * Solve with diagonal block
873 *
874  CALL dtrtrs( 'L', 'N', 'N', bw, nrhs,
875  $ af( odd_size*bw+mbw2+1 ), bw,
876  $ b( part_offset+odd_size+1 ), lldb, info )
877 *
878  IF( info.NE.0 ) THEN
879  GO TO 170
880  END IF
881 *
882 *
883 *
884 * *********
885  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
886 *
887 * Calculate contribution from this block to next diagonal block
888 *
889  CALL dgemm( 'T', 'N', bw, nrhs, bw, -one,
890  $ af( ( odd_size )*bw+1 ), bw,
891  $ b( part_offset+odd_size+1 ), lldb, zero,
892  $ work( 1 ), bw )
893 *
894 * Send contribution to diagonal block's owning processor.
895 *
896  CALL dgesd2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
897  $ mycol+level_dist )
898 *
899  END IF
900 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
901 *
902 * ************
903  IF( ( mycol / level_dist.GT.0 ) .AND.
904  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
905  $ THEN
906 *
907 *
908 * Use offdiagonal block to calculate modification to diag block
909 * of processor to the left
910 *
911  CALL dgemm( 'N', 'N', bw, nrhs, bw, -one,
912  $ af( odd_size*bw+2*mbw2+1 ), bw,
913  $ b( part_offset+odd_size+1 ), lldb, zero,
914  $ work( 1 ), bw )
915 *
916 * Send contribution to diagonal block's owning processor.
917 *
918  CALL dgesd2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
919  $ mycol-level_dist )
920 *
921  END IF
922 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
923 *
924  30 CONTINUE
925 *
926  ELSE
927 *
928 ******************** BACKSOLVE *************************************
929 *
930 ********************************************************************
931 * .. Begin reduced system phase of algorithm ..
932 ********************************************************************
933 *
934 *
935 *
936 * The last processor does not participate in the solution of the
937 * reduced system and just waits to receive its solution.
938  IF( mycol.EQ.npcol-1 ) THEN
939  GO TO 80
940  END IF
941 *
942 * Determine number of steps in tree loop
943 *
944  level_dist = 1
945  40 CONTINUE
946  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
947  $ GO TO 50
948 *
949  level_dist = level_dist*2
950 *
951  GO TO 40
952  50 CONTINUE
953 *
954 *
955  IF( ( mycol / level_dist.GT.0 ) .AND.
956  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
957  $ THEN
958 *
959 * Receive solution from processor to left
960 *
961  CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
962  $ mycol-level_dist )
963 *
964 *
965 * Use offdiagonal block to calculate modification to RHS stored
966 * on this processor
967 *
968  CALL dgemm( 'T', 'N', bw, nrhs, bw, -one,
969  $ af( odd_size*bw+2*mbw2+1 ), bw, work( 1 ),
970  $ bw, one, b( part_offset+odd_size+1 ), lldb )
971  END IF
972 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
973 *
974 *
975  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
976 *
977 * Receive solution from processor to right
978 *
979  CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
980  $ mycol+level_dist )
981 *
982 * Calculate contribution from this block to next diagonal block
983 *
984  CALL dgemm( 'N', 'N', bw, nrhs, bw, -one,
985  $ af( ( odd_size )*bw+1 ), bw, work( 1 ), bw,
986  $ one, b( part_offset+odd_size+1 ), lldb )
987 *
988  END IF
989 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
990 *
991 *
992 * Solve with diagonal block
993 *
994  CALL dtrtrs( 'L', 'T', 'N', bw, nrhs,
995  $ af( odd_size*bw+mbw2+1 ), bw,
996  $ b( part_offset+odd_size+1 ), lldb, info )
997 *
998  IF( info.NE.0 ) THEN
999  GO TO 170
1000  END IF
1001 *
1002 *
1003 *
1004 ***Modification Loop *******
1005 *
1006  60 CONTINUE
1007  IF( level_dist.EQ.1 )
1008  $ GO TO 70
1009 *
1010  level_dist = level_dist / 2
1011 *
1012 * Send solution to the right
1013 *
1014  IF( mycol+level_dist.LT.npcol-1 ) THEN
1015 *
1016  CALL dgesd2d( ictxt, bw, nrhs,
1017  $ b( part_offset+odd_size+1 ), lldb, 0,
1018  $ mycol+level_dist )
1019 *
1020  END IF
1021 *
1022 * Send solution to left
1023 *
1024  IF( mycol-level_dist.GE.0 ) THEN
1025 *
1026  CALL dgesd2d( ictxt, bw, nrhs,
1027  $ b( part_offset+odd_size+1 ), lldb, 0,
1028  $ mycol-level_dist )
1029 *
1030  END IF
1031 *
1032  GO TO 60
1033  70 CONTINUE
1034 * [End of GOTO Loop]
1035 *
1036  80 CONTINUE
1037 * [Processor npcol - 1 jumped to here to await next stage]
1038 *
1039 *******************************
1040 * Reduced system has been solved, communicate solutions to nearest
1041 * neighbors in preparation for local computation phase.
1042 *
1043 *
1044 * Send elements of solution to next proc
1045 *
1046  IF( mycol.LT.npcol-1 ) THEN
1047 *
1048  CALL dgesd2d( ictxt, bw, nrhs,
1049  $ b( part_offset+odd_size+1 ), lldb, 0,
1050  $ mycol+1 )
1051 *
1052  END IF
1053 *
1054 * Receive modifications to processor's right hand sides
1055 *
1056  IF( mycol.GT.0 ) THEN
1057 *
1058  CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1059  $ mycol-1 )
1060 *
1061  END IF
1062 *
1063 *
1064 *
1065 **********************************************
1066 * Local computation phase
1067 **********************************************
1068 *
1069  IF( mycol.NE.0 ) THEN
1070 * Use the "spike" fillin to calculate contribution from previous
1071 * processor's solution.
1072 *
1073  CALL dgemm( 'N', 'N', odd_size, nrhs, bw, -one, af( 1 ),
1074  $ odd_size, work( 1+bw-bw ), bw, one,
1075  $ b( part_offset+1 ), lldb )
1076 *
1077  END IF
1078 *
1079 *
1080  IF( mycol.LT.np-1 ) THEN
1081 * Use factorization of odd-even connection block to modify
1082 * locally stored portion of right hand side(s)
1083 *
1084 *
1085 * First copy and multiply it into temporary storage,
1086 * then use it on RHS
1087 *
1088  CALL dlamov( 'N', bw, nrhs, b( part_offset+odd_size+1 ),
1089  $ lldb, work( 1+bw-bw ), bw )
1090 *
1091  CALL dtrmm( 'L', 'U', 'T', 'N', bw, nrhs, -one,
1092  $ a( ( ofst+( bw+1 )+( odd_size-bw )*llda ) ),
1093  $ llda-1, work( 1+bw-bw ), bw )
1094 *
1095  CALL dmatadd( bw, nrhs, one, work( 1+bw-bw ), bw, one,
1096  $ b( part_offset+odd_size-bw+1 ), lldb )
1097 *
1098  END IF
1099 *
1100 * Use main partition in each processor to solve locally
1101 *
1102  CALL dtbtrs( uplo, 'T', 'N', odd_size, bw, nrhs,
1103  $ a( ofst+1 ), llda, b( part_offset+1 ), lldb,
1104  $ info )
1105 *
1106  END IF
1107 * End of "IF( LSAME( TRANS, 'N' ) )"...
1108 *
1109 *
1110  ELSE
1111 ***************************************************************
1112 * CASE UPLO = 'U' *
1113 ***************************************************************
1114  IF( lsame( trans, 'T' ) ) THEN
1115 *
1116 * Frontsolve
1117 *
1118 *
1119 ******************************************
1120 * Local computation phase
1121 ******************************************
1122 *
1123 * Use main partition in each processor to solve locally
1124 *
1125  CALL dtbtrs( uplo, 'T', 'N', odd_size, bw, nrhs,
1126  $ a( ofst+1 ), llda, b( part_offset+1 ), lldb,
1127  $ info )
1128 *
1129 *
1130  IF( mycol.LT.np-1 ) THEN
1131 * Use factorization of odd-even connection block to modify
1132 * locally stored portion of right hand side(s)
1133 *
1134 *
1135 * First copy and multiply it into temporary storage,
1136 * then use it on RHS
1137 *
1138  CALL dlamov( 'N', bw, nrhs,
1139  $ b( part_offset+odd_size-bw+1 ), lldb,
1140  $ work( 1 ), bw )
1141 *
1142  CALL dtrmm( 'L', 'L', 'T', 'N', bw, nrhs, -one,
1143  $ a( ( ofst+1+odd_size*llda ) ), llda-1,
1144  $ work( 1 ), bw )
1145 *
1146  CALL dmatadd( bw, nrhs, one, work( 1 ), bw, one,
1147  $ b( part_offset+odd_size+1 ), lldb )
1148 *
1149  END IF
1150 *
1151 *
1152  IF( mycol.NE.0 ) THEN
1153 * Use the "spike" fillin to calculate contribution to previous
1154 * processor's right-hand side.
1155 *
1156  CALL dgemm( 'T', 'N', bw, nrhs, odd_size, -one, af( 1 ),
1157  $ odd_size, b( part_offset+1 ), lldb, zero,
1158  $ work( 1+bw-bw ), bw )
1159  END IF
1160 *
1161 *
1162 ************************************************
1163 * Formation and solution of reduced system
1164 ************************************************
1165 *
1166 *
1167 * Send modifications to prior processor's right hand sides
1168 *
1169  IF( mycol.GT.0 ) THEN
1170 *
1171  CALL dgesd2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1172  $ mycol-1 )
1173 *
1174  END IF
1175 *
1176 * Receive modifications to processor's right hand sides
1177 *
1178  IF( mycol.LT.npcol-1 ) THEN
1179 *
1180  CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1181  $ mycol+1 )
1182 *
1183 * Combine contribution to locally stored right hand sides
1184 *
1185  CALL dmatadd( bw, nrhs, one, work( 1 ), bw, one,
1186  $ b( part_offset+odd_size+1 ), lldb )
1187 *
1188  END IF
1189 *
1190 *
1191 * The last processor does not participate in the solution of the
1192 * reduced system, having sent its contribution already.
1193  IF( mycol.EQ.npcol-1 ) THEN
1194  GO TO 110
1195  END IF
1196 *
1197 *
1198 * *************************************
1199 * Modification Loop
1200 *
1201 * The distance for sending and receiving for each level starts
1202 * at 1 for the first level.
1203  level_dist = 1
1204 *
1205 * Do until this proc is needed to modify other procs' equations
1206 *
1207  90 CONTINUE
1208  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
1209  $ GO TO 100
1210 *
1211 * Receive and add contribution to right-hand sides from left
1212 *
1213  IF( mycol-level_dist.GE.0 ) THEN
1214 *
1215  CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1216  $ mycol-level_dist )
1217 *
1218  CALL dmatadd( bw, nrhs, one, work( 1 ), bw, one,
1219  $ b( part_offset+odd_size+1 ), lldb )
1220 *
1221  END IF
1222 *
1223 * Receive and add contribution to right-hand sides from right
1224 *
1225  IF( mycol+level_dist.LT.npcol-1 ) THEN
1226 *
1227  CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1228  $ mycol+level_dist )
1229 *
1230  CALL dmatadd( bw, nrhs, one, work( 1 ), bw, one,
1231  $ b( part_offset+odd_size+1 ), lldb )
1232 *
1233  END IF
1234 *
1235  level_dist = level_dist*2
1236 *
1237  GO TO 90
1238  100 CONTINUE
1239 * [End of GOTO Loop]
1240 *
1241 *
1242 *
1243 * *********************************
1244 * Calculate and use this proc's blocks to modify other procs
1245 *
1246 * Solve with diagonal block
1247 *
1248  CALL dtrtrs( 'L', 'N', 'N', bw, nrhs,
1249  $ af( odd_size*bw+mbw2+1 ), bw,
1250  $ b( part_offset+odd_size+1 ), lldb, info )
1251 *
1252  IF( info.NE.0 ) THEN
1253  GO TO 170
1254  END IF
1255 *
1256 *
1257 *
1258 * *********
1259  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1260 *
1261 * Calculate contribution from this block to next diagonal block
1262 *
1263  CALL dgemm( 'T', 'N', bw, nrhs, bw, -one,
1264  $ af( ( odd_size )*bw+1 ), bw,
1265  $ b( part_offset+odd_size+1 ), lldb, zero,
1266  $ work( 1 ), bw )
1267 *
1268 * Send contribution to diagonal block's owning processor.
1269 *
1270  CALL dgesd2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1271  $ mycol+level_dist )
1272 *
1273  END IF
1274 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1275 *
1276 * ************
1277  IF( ( mycol / level_dist.GT.0 ) .AND.
1278  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
1279  $ THEN
1280 *
1281 *
1282 * Use offdiagonal block to calculate modification to diag block
1283 * of processor to the left
1284 *
1285  CALL dgemm( 'N', 'N', bw, nrhs, bw, -one,
1286  $ af( odd_size*bw+2*mbw2+1 ), bw,
1287  $ b( part_offset+odd_size+1 ), lldb, zero,
1288  $ work( 1 ), bw )
1289 *
1290 * Send contribution to diagonal block's owning processor.
1291 *
1292  CALL dgesd2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1293  $ mycol-level_dist )
1294 *
1295  END IF
1296 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1297 *
1298  110 CONTINUE
1299 *
1300  ELSE
1301 *
1302 ******************** BACKSOLVE *************************************
1303 *
1304 ********************************************************************
1305 * .. Begin reduced system phase of algorithm ..
1306 ********************************************************************
1307 *
1308 *
1309 *
1310 * The last processor does not participate in the solution of the
1311 * reduced system and just waits to receive its solution.
1312  IF( mycol.EQ.npcol-1 ) THEN
1313  GO TO 160
1314  END IF
1315 *
1316 * Determine number of steps in tree loop
1317 *
1318  level_dist = 1
1319  120 CONTINUE
1320  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
1321  $ GO TO 130
1322 *
1323  level_dist = level_dist*2
1324 *
1325  GO TO 120
1326  130 CONTINUE
1327 *
1328 *
1329  IF( ( mycol / level_dist.GT.0 ) .AND.
1330  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
1331  $ THEN
1332 *
1333 * Receive solution from processor to left
1334 *
1335  CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1336  $ mycol-level_dist )
1337 *
1338 *
1339 * Use offdiagonal block to calculate modification to RHS stored
1340 * on this processor
1341 *
1342  CALL dgemm( 'T', 'N', bw, nrhs, bw, -one,
1343  $ af( odd_size*bw+2*mbw2+1 ), bw, work( 1 ),
1344  $ bw, one, b( part_offset+odd_size+1 ), lldb )
1345  END IF
1346 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1347 *
1348 *
1349  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1350 *
1351 * Receive solution from processor to right
1352 *
1353  CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1354  $ mycol+level_dist )
1355 *
1356 * Calculate contribution from this block to next diagonal block
1357 *
1358  CALL dgemm( 'N', 'N', bw, nrhs, bw, -one,
1359  $ af( ( odd_size )*bw+1 ), bw, work( 1 ), bw,
1360  $ one, b( part_offset+odd_size+1 ), lldb )
1361 *
1362  END IF
1363 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1364 *
1365 *
1366 * Solve with diagonal block
1367 *
1368  CALL dtrtrs( 'L', 'T', 'N', bw, nrhs,
1369  $ af( odd_size*bw+mbw2+1 ), bw,
1370  $ b( part_offset+odd_size+1 ), lldb, info )
1371 *
1372  IF( info.NE.0 ) THEN
1373  GO TO 170
1374  END IF
1375 *
1376 *
1377 *
1378 ***Modification Loop *******
1379 *
1380  140 CONTINUE
1381  IF( level_dist.EQ.1 )
1382  $ GO TO 150
1383 *
1384  level_dist = level_dist / 2
1385 *
1386 * Send solution to the right
1387 *
1388  IF( mycol+level_dist.LT.npcol-1 ) THEN
1389 *
1390  CALL dgesd2d( ictxt, bw, nrhs,
1391  $ b( part_offset+odd_size+1 ), lldb, 0,
1392  $ mycol+level_dist )
1393 *
1394  END IF
1395 *
1396 * Send solution to left
1397 *
1398  IF( mycol-level_dist.GE.0 ) THEN
1399 *
1400  CALL dgesd2d( ictxt, bw, nrhs,
1401  $ b( part_offset+odd_size+1 ), lldb, 0,
1402  $ mycol-level_dist )
1403 *
1404  END IF
1405 *
1406  GO TO 140
1407  150 CONTINUE
1408 * [End of GOTO Loop]
1409 *
1410  160 CONTINUE
1411 * [Processor npcol - 1 jumped to here to await next stage]
1412 *
1413 *******************************
1414 * Reduced system has been solved, communicate solutions to nearest
1415 * neighbors in preparation for local computation phase.
1416 *
1417 *
1418 * Send elements of solution to next proc
1419 *
1420  IF( mycol.LT.npcol-1 ) THEN
1421 *
1422  CALL dgesd2d( ictxt, bw, nrhs,
1423  $ b( part_offset+odd_size+1 ), lldb, 0,
1424  $ mycol+1 )
1425 *
1426  END IF
1427 *
1428 * Receive modifications to processor's right hand sides
1429 *
1430  IF( mycol.GT.0 ) THEN
1431 *
1432  CALL dgerv2d( ictxt, bw, nrhs, work( 1 ), bw, 0,
1433  $ mycol-1 )
1434 *
1435  END IF
1436 *
1437 *
1438 *
1439 **********************************************
1440 * Local computation phase
1441 **********************************************
1442 *
1443  IF( mycol.NE.0 ) THEN
1444 * Use the "spike" fillin to calculate contribution from previous
1445 * processor's solution.
1446 *
1447  CALL dgemm( 'N', 'N', odd_size, nrhs, bw, -one, af( 1 ),
1448  $ odd_size, work( 1+bw-bw ), bw, one,
1449  $ b( part_offset+1 ), lldb )
1450 *
1451  END IF
1452 *
1453 *
1454  IF( mycol.LT.np-1 ) THEN
1455 * Use factorization of odd-even connection block to modify
1456 * locally stored portion of right hand side(s)
1457 *
1458 *
1459 * First copy and multiply it into temporary storage,
1460 * then use it on RHS
1461 *
1462  CALL dlamov( 'N', bw, nrhs, b( part_offset+odd_size+1 ),
1463  $ lldb, work( 1+bw-bw ), bw )
1464 *
1465  CALL dtrmm( 'L', 'L', 'N', 'N', bw, nrhs, -one,
1466  $ a( ( ofst+1+odd_size*llda ) ), llda-1,
1467  $ work( 1+bw-bw ), bw )
1468 *
1469  CALL dmatadd( bw, nrhs, one, work( 1+bw-bw ), bw, one,
1470  $ b( part_offset+odd_size-bw+1 ), lldb )
1471 *
1472  END IF
1473 *
1474 * Use main partition in each processor to solve locally
1475 *
1476  CALL dtbtrs( uplo, 'N', 'N', odd_size, bw, nrhs,
1477  $ a( ofst+1 ), llda, b( part_offset+1 ), lldb,
1478  $ info )
1479 *
1480  END IF
1481 * End of "IF( LSAME( TRANS, 'N' ) )"...
1482 *
1483 *
1484  END IF
1485 * End of "IF( LSAME( UPLO, 'L' ) )"...
1486  170 CONTINUE
1487 *
1488 *
1489 * Free BLACS space used to hold standard-form grid.
1490 *
1491  IF( ictxt_save.NE.ictxt_new ) THEN
1492  CALL blacs_gridexit( ictxt_new )
1493  END IF
1494 *
1495  180 CONTINUE
1496 *
1497 * Restore saved input parameters
1498 *
1499  ictxt = ictxt_save
1500  np = np_save
1501 *
1502 * Output minimum worksize
1503 *
1504  work( 1 ) = work_size_min
1505 *
1506 *
1507  RETURN
1508 *
1509 * End of PDPBTRSV
1510 *
1511  END