ScaLAPACK 2.1
ScaLAPACK: Scalable Linear Algebra PACKage
pcgbtrs.f
1  SUBROUTINE pcgbtrs( TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, IPIV,
2  $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
3 *
4 * -- ScaLAPACK routine (version 2.0.2) --
5 * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6 * May 1 2012
7 *
8 * .. Scalar Arguments ..
9  CHARACTER TRANS
10  INTEGER BWU, BWL, IB, INFO, JA, LAF, LWORK, N, NRHS
11 * ..
12 * .. Array Arguments ..
13  INTEGER DESCA( * ), DESCB( * ), IPIV(*)
14  COMPLEX A( * ), AF( * ), B( * ), WORK( * )
15 * ..
16 *
17 *
18 * Purpose
19 * =======
20 *
21 * PCGBTRS solves a system of linear equations
22 *
23 * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
24 * or
25 * A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS)
26 *
27 * where A(1:N, JA:JA+N-1) is the matrix whose factors, as
28 * computed by PCGBTRF, are stored in A(1:N, JA:JA+N-1) and AF.
29 * A(1:N, JA:JA+N-1) is an N-by-N complex
30 * banded distributed matrix with lower bandwidth BWL
31 * and upper bandwidth BWU.
32 *
33 * Routine PCGBTRF MUST be called first.
34 *
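*  Example (a minimal sketch, not part of this routine): the usual
*  factor-then-solve sequence.  It assumes the distributed matrix A,
*  right hand sides B, descriptors DESCA/DESCB, pivot array IPIV and
*  the workspaces AF (size LAF) and WORK (size LWORK) have already
*  been set up by the caller as described under Arguments below.
*
*     CALL PCGBTRF( N, BWL, BWU, A, JA, DESCA, IPIV, AF, LAF,
*    $              WORK, LWORK, INFO )
*     IF( INFO.EQ.0 ) THEN
*        CALL PCGBTRS( 'N', N, BWL, BWU, NRHS, A, JA, DESCA, IPIV,
*    $                 B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
*     END IF
*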
35 * =====================================================================
36 *
37 * Arguments
38 * =========
39 *
40 *
41 * TRANS (global input) CHARACTER
42 * = 'N': Solve with A(1:N, JA:JA+N-1);
43 * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) );
44 *
45 * N (global input) INTEGER
46 * The number of rows and columns to be operated on, i.e. the
47 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
48 *
49 * BWL (global input) INTEGER
50 * Number of subdiagonals. 0 <= BWL <= N-1
51 *
52 * BWU (global input) INTEGER
53 * Number of superdiagonals. 0 <= BWU <= N-1
54 *
55 * NRHS (global input) INTEGER
56 * The number of right hand sides, i.e., the number of columns
57 * of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
58 * NRHS >= 0.
59 *
60 * A (local input/local output) COMPLEX pointer into
61 * local memory to an array with first dimension
62 * LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA).
63 * On entry, this array contains the local pieces of the
64 * factors L and U of the N-by-N general banded distributed
65 * matrix A(1:N, JA:JA+N-1), as computed by PCGBTRF.
66 * This local portion is stored in the packed banded format
67 * used in LAPACK. Please see the Notes below and the
68 * ScaLAPACK manual for more detail on the format of
69 * distributed matrices.
70 *
71 * JA (global input) INTEGER
72 * The index in the global array A that points to the start of
73 * the matrix to be operated on (which may be either all of A
74 * or a submatrix of A).
75 *
76 * DESCA (global and local input) INTEGER array of dimension DLEN.
77 * if 1D type (DTYPE_A=501), DLEN >= 7;
78 * if 2D type (DTYPE_A=1), DLEN >= 9 .
79 * The array descriptor for the distributed matrix A.
80 * Contains information of mapping of A to memory. Please
81 * see NOTES below for full description and options.
82 *
83 * IPIV (local output) INTEGER array, dimension >= DESCA( NB ).
84 * Pivot indices for local factorizations.
85 * Users *should not* alter the contents between
86 * factorization and solve.
87 *
88 * B (local input/local output) COMPLEX pointer into
89 * local memory to an array of local lead dimension lld_b>=NB.
90 * On entry, this array contains the
91 * local pieces of the right hand sides
92 * B(IB:IB+N-1, 1:NRHS).
93 * On exit, this contains the local piece of the
94 * solution distributed matrix X.
95 *
96 * IB (global input) INTEGER
97 * The row index in the global array B that points to the first
98 * row of the matrix to be operated on (which may be either
99 * all of B or a submatrix of B).
100 *
101 * DESCB (global and local input) INTEGER array of dimension DLEN.
102 * if 1D type (DTYPE_B=502), DLEN >=7;
103 * if 2D type (DTYPE_B=1), DLEN >= 9.
104 * The array descriptor for the distributed matrix B.
105 * Contains information of mapping of B to memory. Please
106 * see NOTES below for full description and options.
107 *
108 * AF (local output) COMPLEX array, dimension LAF.
109 * Auxiliary Fillin Space.
110 * Fillin is created during the factorization routine
111 * PCGBTRF and this is stored in AF. If a linear system
112 * is to be solved using PCGBTRS after the factorization
113 * routine, AF *must not be altered* after the factorization.
114 *
115 * LAF (local input) INTEGER
116 * Size of user-input Auxiliary Fillin space AF. Must be >=
117 * (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu)
118 * If LAF is not large enough, an error code will be returned
119 * and the minimum acceptable size will be returned in AF( 1 )
120 *
121 * WORK (local workspace/local output)
122 * COMPLEX temporary workspace. This space may
123 * be overwritten in between calls to routines. WORK must be
124 * the size given in LWORK.
125 * On exit, WORK( 1 ) contains the minimal LWORK.
126 *
127 * LWORK (local input or global input) INTEGER
128 * Size of user-input workspace WORK.
129 * If LWORK is too small, the minimal acceptable size will be
130 * returned in WORK(1) and an error code is returned. LWORK>=
131 * NRHS*(NB+2*bwl+4*bwu)
132 *
133 * INFO (global output) INTEGER
134 * = 0: successful exit
135 * < 0: If the i-th argument is an array and the j-th entry had
136 * an illegal value, then INFO = -(i*100+j); if the i-th
137 * argument is a scalar and had an illegal value, then
138 * INFO = -i.
139 *
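*  Workspace sizing (a sketch based on the formulas given above): NB
*  is the blocking factor stored in the descriptor (DESCA( 4 ) for
*  the 1D type-501 descriptor).  A workspace query is also possible
*  by calling the routine with LWORK = -1, in which case the minimal
*  LWORK is returned in WORK( 1 ).
*
*     NB    = DESCA( 4 )
*     LAF   = (NB+BWU)*(BWL+BWU) + 6*(BWL+BWU)*(BWL+2*BWU)
*     LWORK = NRHS*(NB+2*BWL+4*BWU)
*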
140 * =====================================================================
141 *
142 *
143 * Restrictions
144 * ============
145 *
146 * The following are restrictions on the input parameters. Some of these
147 * are temporary and will be removed in future releases, while others
148 * may reflect fundamental technical limitations.
149 *
150 * Non-cyclic restriction: VERY IMPORTANT!
151 * P*NB>= mod(JA-1,NB)+N.
152 * The mapping for matrices must be blocked, reflecting the nature
153 * of the divide and conquer algorithm as a task-parallel algorithm.
154 * This formula in words is: no processor may have more than one
155 * chunk of the matrix.
156 *
157 * Blocksize cannot be too small:
158 * If the matrix spans more than one processor, the following
159 * restriction on NB, the size of each block on each processor,
160 * must hold:
161 * NB >= (BWL+BWU)+1
162 * The bulk of parallel computation is done on the matrix of size
163 * O(NB) on each processor. If this is too small, divide and conquer
164 * is a poor choice of algorithm.
165 *
166 * Submatrix reference:
167 * JA = IB
168 * Alignment restriction that prevents unnecessary communication.
169 *
170 *
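*  Sanity check (a sketch, not part of this routine): the above
*  restrictions can be verified on the calling side before the
*  factor/solve calls.  Here P is the number of processes in the
*  one-dimensional grid and NB = DESCA( 4 ); the blocksize bound is
*  only required when the matrix spans more than one process.
*
*     OK = ( P*NB .GE. MOD( JA-1, NB ) + N ) .AND.
*    $     ( NB .GE. BWL+BWU+1 ) .AND.
*    $     ( JA .EQ. IB )
*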
171 * =====================================================================
172 *
173 *
174 * Notes
175 * =====
176 *
177 * If the factorization routine and the solve routine are to be called
178 * separately (to solve various sets of righthand sides using the same
179 * coefficient matrix), the auxiliary space AF *must not be altered*
180 * between calls to the factorization routine and the solve routine.
181 *
182 * The best algorithm for solving banded and tridiagonal linear systems
183 * depends on a variety of parameters, especially the bandwidth.
184 * Currently, only algorithms designed for the case N/P >> bw are
185 * implemented. These go by many names, including Divide and Conquer,
186 * Partitioning, domain decomposition-type, etc.
187 *
188 * Algorithm description: Divide and Conquer
189 *
190 * The Divide and Conquer algorithm assumes the matrix is narrowly
191 * banded compared with the number of equations. In this situation,
192 * it is best to distribute the input matrix A one-dimensionally,
193 * with columns atomic and rows divided amongst the processes.
194 * The basic algorithm divides the banded matrix up into
195 * P pieces with one stored on each processor,
196 * and then proceeds in 2 phases for the factorization or 3 for the
197 * solution of a linear system.
198 * 1) Local Phase:
199 * The individual pieces are factored independently and in
200 * parallel. These factors are applied to the matrix creating
201 * fillin, which is stored in a non-inspectable way in auxiliary
202 * space AF. Mathematically, this is equivalent to reordering
203 * the matrix A as P A P^T and then factoring the principal
204 * leading submatrix of size equal to the sum of the sizes of
205 * the matrices factored on each processor. The factors of
206 * these submatrices overwrite the corresponding parts of A
207 * in memory.
208 * 2) Reduced System Phase:
209 * A small (max(bwl,bwu)* (P-1)) system is formed representing
210 * interaction of the larger blocks, and is stored (as are its
211 * factors) in the space AF. A parallel Block Cyclic Reduction
212 * algorithm is used. For a linear system, a parallel front solve
213 * followed by an analogous backsolve, both using the structure
214 * of the factored matrix, are performed.
215 * 3) Backsubstitution Phase:
216 * For a linear system, a local backsubstitution is performed on
217 * each processor in parallel.
218 *
219 *
220 * Descriptors
221 * ===========
222 *
223 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
224 *
225 * Note: banded codes can use either the old two dimensional
226 * or new one-dimensional descriptors, though the processor grid in
227 * both cases *must be one-dimensional*. We describe both types below.
228 *
229 * Each global data object is described by an associated description
230 * vector. This vector stores the information required to establish
231 * the mapping between an object element and its corresponding process
232 * and memory location.
233 *
234 * Let A be a generic term for any 2D block cyclically distributed array.
235 * Such a global array has an associated description vector DESCA.
236 * In the following comments, the character _ should be read as
237 * "of the global array".
238 *
239 * NOTATION STORED IN EXPLANATION
240 * --------------- -------------- --------------------------------------
241 * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case,
242 * DTYPE_A = 1.
243 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
244 * the BLACS process grid A is distribu-
245 * ted over. The context itself is glo-
246 * bal, but the handle (the integer
247 * value) may vary.
248 * M_A (global) DESCA( M_ ) The number of rows in the global
249 * array A.
250 * N_A (global) DESCA( N_ ) The number of columns in the global
251 * array A.
252 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
253 * the rows of the array.
254 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
255 * the columns of the array.
256 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
257 * row of the array A is distributed.
258 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
259 * first column of the array A is
260 * distributed.
261 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
262 * array. LLD_A >= MAX(1,LOCr(M_A)).
263 *
264 * Let K be the number of rows or columns of a distributed matrix,
265 * and assume that its process grid has dimension p x q.
266 * LOCr( K ) denotes the number of elements of K that a process
267 * would receive if K were distributed over the p processes of its
268 * process column.
269 * Similarly, LOCc( K ) denotes the number of elements of K that a
270 * process would receive if K were distributed over the q processes of
271 * its process row.
272 * The values of LOCr() and LOCc() may be determined via a call to the
273 * ScaLAPACK tool function, NUMROC:
274 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
275 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
276 * An upper bound for these quantities may be computed by:
277 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
278 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
279 *
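*  Example (a sketch): obtaining LOCr and LOCc with the ScaLAPACK
*  tool function NUMROC, using the grid coordinates returned by
*  BLACS_GRIDINFO for the context stored in the descriptor.
*
*     CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL )
*     LOCR = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW )
*     LOCC = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL )
*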
280 *
281 * One-dimensional descriptors:
282 *
283 * One-dimensional descriptors are a new addition to ScaLAPACK since
284 * version 1.0. They simplify and shorten the descriptor for 1D
285 * arrays.
286 *
287 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
288 * object, we allow 1D arrays to be distributed either over the
289 * first dimension of the array (as if the grid were P-by-1) or the
290 * 2nd dimension (as if the grid were 1-by-P). This choice is
291 * indicated by the descriptor type (501 or 502)
292 * as described below.
293 *
294 * IMPORTANT NOTE: the actual BLACS grid represented by the
295 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
296 * irrespective of which one-dimensional descriptor type
297 * (501 or 502) is input.
298 * This routine will interpret the grid properly either way.
299 * ScaLAPACK routines *do not support intercontext operations* so that
300 * the grid passed to a single ScaLAPACK routine *must be the same*
301 * for all array descriptors passed to that routine.
302 *
303 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
304 * may also be used, since a one-dimensional array is a special case
305 * of a two-dimensional array with one dimension of size unity.
306 * The two-dimensional array used in this case *must* be of the
307 * proper orientation:
308 * If the appropriate one-dimensional descriptor is DTYPE_A=501
309 * (1 by P type), then the two dimensional descriptor must
310 * have a CTXT value that refers to a 1 by P BLACS grid;
311 * If the appropriate one-dimensional descriptor is DTYPE_A=502
312 * (P by 1 type), then the two dimensional descriptor must
313 * have a CTXT value that refers to a P by 1 BLACS grid.
314 *
315 *
316 * Summary of allowed descriptors, types, and BLACS grids:
317 * DTYPE 501 502 1 1
318 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
319 * -----------------------------------------------------
320 * A OK NO OK NO
321 * B NO OK NO OK
322 *
323 * Note that a consequence of this chart is that it is not possible
324 * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
325 * to opposite requirements for the orientation of the BLACS grid,
326 * and as noted before, the *same* BLACS context must be used in
327 * all descriptors in a single ScaLAPACK subroutine call.
328 *
329 * Let A be a generic term for any 1D block cyclically distributed array.
330 * Such a global array has an associated description vector DESCA.
331 * In the following comments, the character _ should be read as
332 * "of the global array".
333 *
334 * NOTATION STORED IN EXPLANATION
335 * --------------- ---------- ------------------------------------------
336 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
337 * TYPE_A = 501: 1-by-P grid.
338 * TYPE_A = 502: P-by-1 grid.
339 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
340 * the BLACS process grid A is distribu-
341 * ted over. The context itself is glo-
342 * bal, but the handle (the integer
343 * value) may vary.
344 * N_A (global) DESCA( 3 ) The size of the array dimension being
345 * distributed.
346 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
347 * the distributed dimension of the array.
348 * SRC_A (global) DESCA( 5 ) The process row or column over which the
349 * first row or column of the array
350 * is distributed.
351 * LLD_A (local) DESCA( 6 ) The leading dimension of the local array
352 * storing the local blocks of the distri-
353 * buted array A. Minimum value of LLD_A
354 * depends on TYPE_A.
355 * TYPE_A = 501: LLD_A >=
356 * max( size of undistributed dimension, 1 ).
357 * TYPE_A = 502: LLD_A >= max( NB_A, 1 ).
358 * Reserved DESCA( 7 ) Reserved for future use.
359 *
360 *
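*  Example (a sketch): filling the two 7-element one-dimensional
*  descriptors for this routine by hand, following the table above
*  and assuming JA = IB = 1 and a matrix starting on process 0.
*  ICTXT, LLDA and LLDB are assumed to be set by the caller, with
*  LLDA >= 2*BWL+2*BWU+1 and LLDB >= NB as required under Arguments.
*
*     DESCA( 1 ) = 501
*     DESCA( 2 ) = ICTXT
*     DESCA( 3 ) = N
*     DESCA( 4 ) = NB
*     DESCA( 5 ) = 0
*     DESCA( 6 ) = LLDA
*     DESCA( 7 ) = 0
*
*     DESCB( 1 ) = 502
*     DESCB( 2 ) = ICTXT
*     DESCB( 3 ) = N
*     DESCB( 4 ) = NB
*     DESCB( 5 ) = 0
*     DESCB( 6 ) = LLDB
*     DESCB( 7 ) = 0
*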
361 *
362 * =====================================================================
363 *
364 * Implemented for ScaLAPACK by:
365 * Andrew J. Cleary, Livermore National Lab and University of Tenn.,
366 * and Markus Hegland, Australian National University. Feb., 1997.
367 * Based on code written by : Peter Arbenz, ETH Zurich, 1996.
368 *
369 * =====================================================================
370 *
371 * .. Parameters ..
372  REAL ONE, ZERO
373  parameter( one = 1.0e+0 )
374  parameter( zero = 0.0e+0 )
375  COMPLEX CONE, CZERO
376  parameter( cone = ( 1.0e+0, 0.0e+0 ) )
377  parameter( czero = ( 0.0e+0, 0.0e+0 ) )
378  INTEGER INT_ONE
379  parameter( int_one = 1 )
380  INTEGER DESCMULT, BIGNUM
381  parameter( descmult = 100, bignum = descmult*descmult )
382  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
383  $ lld_, mb_, m_, nb_, n_, rsrc_
384  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
385  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
386  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
387 * ..
388 * .. Local Scalars ..
389  INTEGER APTR, BBPTR, BM, BMN, BN, BNN, BW, CSRC,
390  $ first_proc, ictxt, ictxt_new, ictxt_save,
391  $ idum2, idum3, j, ja_new, l, lbwl, lbwu, ldbb,
392  $ ldw, llda, lldb, lm, lmj, ln, lptr, mycol,
393  $ myrow, nb, neicol, np, npact, npcol, nprow,
394  $ npstr, np_save, odd_size, part_offset,
395  $ recovery_val, return_code, store_m_b,
396  $ store_n_a, work_size_min, wptr
397 * ..
398 * .. Local Arrays ..
399  INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ),
400  $ param_check( 17, 3 )
401 * ..
402 * .. External Subroutines ..
403  EXTERNAL blacs_gridinfo, desc_convert, globchk, pxerbla,
404  $ reshape
405 * ..
406 * .. External Functions ..
407  LOGICAL LSAME
408  INTEGER NUMROC
409  EXTERNAL lsame
410  EXTERNAL numroc
411 * ..
412 * .. Intrinsic Functions ..
413  INTRINSIC ichar, mod
414 * ..
415 * .. Executable Statements ..
416 *
417 *
418 * Test the input parameters
419 *
420  info = 0
421 *
422 * Convert descriptor into standard form for easy access to
423 * parameters, check that grid is of right shape.
424 *
425  desca_1xp( 1 ) = 501
426  descb_px1( 1 ) = 502
427 *
428  CALL desc_convert( desca, desca_1xp, return_code )
429 *
430  IF( return_code .NE. 0) THEN
431  info = -( 8*100 + 2 )
432  ENDIF
433 *
434  CALL desc_convert( descb, descb_px1, return_code )
435 *
436  IF( return_code .NE. 0) THEN
437  info = -( 11*100 + 2 )
438  ENDIF
439 *
440 * Consistency checks for DESCA and DESCB.
441 *
442 * Context must be the same
443  IF( desca_1xp( 2 ) .NE. descb_px1( 2 ) ) THEN
444  info = -( 11*100 + 2 )
445  ENDIF
446 *
447 * These are alignment restrictions that may or may not be removed
448 * in future releases. -Andy Cleary, April 14, 1996.
449 *
450 * Block sizes must be the same
451  IF( desca_1xp( 4 ) .NE. descb_px1( 4 ) ) THEN
452  info = -( 11*100 + 4 )
453  ENDIF
454 *
455 * Source processor must be the same
456 *
457  IF( desca_1xp( 5 ) .NE. descb_px1( 5 ) ) THEN
458  info = -( 11*100 + 5 )
459  ENDIF
460 *
461 * Get values out of descriptor for use in code.
462 *
463  ictxt = desca_1xp( 2 )
464  csrc = desca_1xp( 5 )
465  nb = desca_1xp( 4 )
466  llda = desca_1xp( 6 )
467  store_n_a = desca_1xp( 3 )
468  lldb = descb_px1( 6 )
469  store_m_b = descb_px1( 3 )
470 *
471 * Get grid parameters
472 *
473 *
474  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
475  np = nprow * npcol
476 *
477 *
478 *
479  IF( lsame( trans, 'N' ) ) THEN
480  idum2 = ichar( 'N' )
481  ELSE IF ( lsame( trans, 'C' ) ) THEN
482  idum2 = ichar( 'C' )
483  ELSE
484  info = -1
485  END IF
486 *
487  IF( lwork .LT. -1) THEN
488  info = -16
489  ELSE IF ( lwork .EQ. -1 ) THEN
490  idum3 = -1
491  ELSE
492  idum3 = 1
493  ENDIF
494 *
495  IF( n .LT. 0 ) THEN
496  info = -2
497  ENDIF
498 *
499  IF( n+ja-1 .GT. store_n_a ) THEN
500  info = -( 8*100 + 6 )
501  ENDIF
502 *
503  IF(( bwl .GT. n-1 ) .OR.
504  $ ( bwl .LT. 0 ) ) THEN
505  info = -3
506  ENDIF
507 *
508  IF(( bwu .GT. n-1 ) .OR.
509  $ ( bwu .LT. 0 ) ) THEN
510  info = -4
511  ENDIF
512 *
513  IF( llda .LT. (2*bwl+2*bwu+1) ) THEN
514  info = -( 8*100 + 6 )
515  ENDIF
516 *
517  IF( nb .LE. 0 ) THEN
518  info = -( 8*100 + 4 )
519  ENDIF
520 *
521  bw = bwu+bwl
522 *
523  IF( n+ib-1 .GT. store_m_b ) THEN
524  info = -( 11*100 + 3 )
525  ENDIF
526 *
527  IF( lldb .LT. nb ) THEN
528  info = -( 11*100 + 6 )
529  ENDIF
530 *
531  IF( nrhs .LT. 0 ) THEN
532  info = -5
533  ENDIF
534 *
535 * Current alignment restriction
536 *
537  IF( ja .NE. ib) THEN
538  info = -7
539  ENDIF
540 *
541 * Argument checking that is specific to Divide & Conquer routine
542 *
543  IF( nprow .NE. 1 ) THEN
544  info = -( 8*100+2 )
545  ENDIF
546 *
547  IF( n .GT. np*nb-mod( ja-1, nb )) THEN
548  info = -( 2 )
549  CALL pxerbla( ictxt,
550  $ 'PCGBTRS, D&C alg.: only 1 block per proc',
551  $ -info )
552  RETURN
553  ENDIF
554 *
555  IF((ja+n-1.GT.nb) .AND. ( nb.LT.(bwl+bwu+1) )) THEN
556  info = -( 8*100+4 )
557  CALL pxerbla( ictxt,
558  $ 'PCGBTRS, D&C alg.: NB too small',
559  $ -info )
560  RETURN
561  ENDIF
562 *
563 *
564 * Check worksize
565 *
566  work_size_min = nrhs*(nb+2*bwl+4*bwu)
567 *
568  work( 1 ) = work_size_min
569 *
570  IF( lwork .LT. work_size_min ) THEN
571  IF( lwork .NE. -1 ) THEN
572  info = -16
573  CALL pxerbla( ictxt,
574  $ 'PCGBTRS: worksize error ',
575  $ -info )
576  ENDIF
577  RETURN
578  ENDIF
579 *
580 * Pack params and positions into arrays for global consistency check
581 *
582  param_check( 17, 1 ) = descb(5)
583  param_check( 16, 1 ) = descb(4)
584  param_check( 15, 1 ) = descb(3)
585  param_check( 14, 1 ) = descb(2)
586  param_check( 13, 1 ) = descb(1)
587  param_check( 12, 1 ) = ib
588  param_check( 11, 1 ) = desca(5)
589  param_check( 10, 1 ) = desca(4)
590  param_check( 9, 1 ) = desca(3)
591  param_check( 8, 1 ) = desca(1)
592  param_check( 7, 1 ) = ja
593  param_check( 6, 1 ) = nrhs
594  param_check( 5, 1 ) = bwu
595  param_check( 4, 1 ) = bwl
596  param_check( 3, 1 ) = n
597  param_check( 2, 1 ) = idum3
598  param_check( 1, 1 ) = idum2
599 *
600  param_check( 17, 2 ) = 1105
601  param_check( 16, 2 ) = 1104
602  param_check( 15, 2 ) = 1103
603  param_check( 14, 2 ) = 1102
604  param_check( 13, 2 ) = 1101
605  param_check( 12, 2 ) = 10
606  param_check( 11, 2 ) = 805
607  param_check( 10, 2 ) = 804
608  param_check( 9, 2 ) = 803
609  param_check( 8, 2 ) = 801
610  param_check( 7, 2 ) = 7
611  param_check( 6, 2 ) = 5
612  param_check( 5, 2 ) = 4
613  param_check( 4, 2 ) = 3
614  param_check( 3, 2 ) = 2
615  param_check( 2, 2 ) = 16
616  param_check( 1, 2 ) = 1
617 *
618 * Want to find errors with MIN( ), so if no error, set it to a big
619 * number. If there already is an error, multiply by the
620 * descriptor multiplier.
621 *
622  IF( info.GE.0 ) THEN
623  info = bignum
624  ELSE IF( info.LT.-descmult ) THEN
625  info = -info
626  ELSE
627  info = -info * descmult
628  END IF
629 *
630 * Check consistency across processors
631 *
632  CALL globchk( ictxt, 17, param_check, 17,
633  $ param_check( 1, 3 ), info )
634 *
635 * Prepare output: set info = 0 if no error, and divide by DESCMULT
636 * if error is not in a descriptor entry.
637 *
638  IF( info.EQ.bignum ) THEN
639  info = 0
640  ELSE IF( mod( info, descmult ) .EQ. 0 ) THEN
641  info = -info / descmult
642  ELSE
643  info = -info
644  END IF
645 *
646  IF( info.LT.0 ) THEN
647  CALL pxerbla( ictxt, 'PCGBTRS', -info )
648  RETURN
649  END IF
650 *
651 * Quick return if possible
652 *
653  IF( n.EQ.0 )
654  $ RETURN
655 *
656  IF( nrhs.EQ.0 )
657  $ RETURN
658 *
659 *
660 * Adjust addressing into matrix space to properly get into
661 * the beginning part of the relevant data
662 *
663  part_offset = nb*( (ja-1)/(npcol*nb) )
664 *
665  IF ( (mycol-csrc) .LT. (ja-part_offset-1)/nb ) THEN
666  part_offset = part_offset + nb
667  ENDIF
668 *
669  IF ( mycol .LT. csrc ) THEN
670  part_offset = part_offset - nb
671  ENDIF
672 *
673 * Form a new BLACS grid (the "standard form" grid) with only procs
674 * holding part of the matrix, of size 1xNP where NP is adjusted,
675 * starting at csrc=0, with JA modified to reflect dropped procs.
676 *
677 * First processor to hold part of the matrix:
678 *
679  first_proc = mod( ( ja-1 )/nb+csrc, npcol )
680 *
681 * Calculate the new JA while dropping off unused processors.
682 *
683  ja_new = mod( ja-1, nb ) + 1
684 *
685 * Save and compute new value of NP
686 *
687  np_save = np
688  np = ( ja_new+n-2 )/nb + 1
689 *
690 * Call utility routine that forms "standard-form" grid
691 *
692  CALL reshape( ictxt, int_one, ictxt_new, int_one,
693  $ first_proc, int_one, np )
694 *
695 * Use new context from standard grid as context.
696 *
697  ictxt_save = ictxt
698  ictxt = ictxt_new
699  desca_1xp( 2 ) = ictxt_new
700  descb_px1( 2 ) = ictxt_new
701 *
702 * Get information about new grid.
703 *
704  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
705 *
706 * Drop out processors that do not have part of the matrix.
707 *
708  IF( myrow .LT. 0 ) THEN
709  GOTO 1234
710  ENDIF
711 *
712 *
713 *
714 * Begin main code
715 *
716 * Move data into workspace - communicate/copy (overlap)
717 *
718  IF (mycol .LT. npcol-1) THEN
719  CALL cgesd2d( ictxt, bwu, nrhs, b(nb-bwu+1), lldb,
720  $ 0, mycol + 1)
721  ENDIF
722 *
723  IF (mycol .LT. npcol-1) THEN
724  lm = nb-bwu
725  ELSE
726  lm = nb
727  ENDIF
728 *
729  IF (mycol .GT. 0) THEN
730  wptr = bwu+1
731  ELSE
732  wptr = 1
733  ENDIF
734 *
735  ldw = nb+bwu + 2*bw+bwu
736 *
737  CALL clamov( 'G', lm, nrhs, b(1), lldb, work( wptr ), ldw )
738 *
739 * Zero out rest of work
740 *
741  DO 1501 j=1, nrhs
742  DO 1502 l=wptr+lm, ldw
743  work( (j-1)*ldw+l ) = czero
744  1502 CONTINUE
745  1501 CONTINUE
746 *
747  IF (mycol .GT. 0) THEN
748  CALL cgerv2d( ictxt, bwu, nrhs, work(1), ldw,
749  $ 0, mycol-1)
750  ENDIF
751 *
752 ********************************************************************
753 * PHASE 1: Local computation phase -- Solve L*X = B
754 ********************************************************************
755 *
756 * Size of main (or odd) partition in each processor
757 *
758  odd_size = numroc( n, nb, mycol, 0, npcol )
759 *
760  IF (mycol .NE. 0) THEN
761  lbwl = bw
762  lbwu = 0
763  aptr = 1
764  ELSE
765  lbwl = bwl
766  lbwu = bwu
767  aptr = 1+bwu
768  ENDIF
769 *
770  IF (mycol .NE. npcol-1) THEN
771  lm = nb - lbwu
772  ln = nb - bw
773  ELSE IF (mycol .NE. 0) THEN
774  lm = odd_size + bwu
775  ln = max(odd_size-bw,0)
776  ELSE
777  lm = n
778  ln = max( n-bw, 0 )
779  ENDIF
780 *
781  DO 21 j = 1, ln
782 *
783  lmj = min(lbwl,lm-j)
784  l = ipiv( j )
785 *
786  IF( l.NE.j ) THEN
787  CALL cswap(nrhs, work(l), ldw, work(j), ldw)
788  ENDIF
789 *
790  lptr = bw+1 + (j-1)*llda + aptr
791 *
792  CALL cgeru(lmj,nrhs,-cone, a(lptr),1, work(j),ldw,
793  $ work(j+1),ldw)
794 *
795  21 CONTINUE
796 *
797 ********************************************************************
798 * PHASE 2: Global computation phase -- Solve L*X = B
799 ********************************************************************
800 *
801 * Define the initial dimensions of the diagonal blocks
802 * The offdiagonal blocks (for MYCOL > 0) are of size BM by BW
803 *
804  IF (mycol .NE. npcol-1) THEN
805  bm = bw - lbwu
806  bn = bw
807  ELSE
808  bm = min(bw,odd_size) + bwu
809  bn = min(bw,odd_size)
810  ENDIF
811 *
812 * Pointer to first element of block bidiagonal matrix in AF
813 * Leading dimension of block bidiagonal system
814 *
815  bbptr = (nb+bwu)*bw + 1
816  ldbb = 2*bw + bwu
817 *
818  IF (npcol.EQ.1) THEN
819 *
820 * In this case the loop over the levels will not be
821 * performed.
822  CALL cgetrs( 'N', n-ln, nrhs, af(bbptr+bw*ldbb), ldbb,
823  $ ipiv(ln+1), work( ln+1 ), ldw, info)
824 *
825  ENDIF
826 *
827 * Loop over levels ...
828 *
829 * The two integers NPACT (number of active processors) and NPSTR
830 * (stride between active processors) are used to control the
831 * loop.
832 *
833  npact = npcol
834  npstr = 1
835 *
836 * Begin loop over levels
837  200 IF (npact .LE. 1) GOTO 300
838 *
839 * Test if processor is active
840  IF (mod(mycol,npstr) .EQ. 0) THEN
841 *
842 * Send/Receive blocks
843 *
844  IF (mod(mycol,2*npstr) .EQ. 0) THEN
845 *
846  neicol = mycol + npstr
847 *
848  IF (neicol/npstr .LE. npact-1) THEN
849 *
850  IF (neicol/npstr .LT. npact-1) THEN
851  bmn = bw
852  ELSE
853  bmn = min(bw,numroc(n, nb, neicol, 0, npcol))+bwu
854  ENDIF
855 *
856  CALL cgesd2d( ictxt, bm, nrhs,
857  $ work(ln+1), ldw, 0, neicol )
858 *
859  IF( npact .NE. 2 )THEN
860 *
861 * Receive answers back from partner processor
862 *
863  CALL cgerv2d(ictxt, bm+bmn-bw, nrhs,
864  $ work( ln+1 ), ldw, 0, neicol )
865 *
866  bm = bm+bmn-bw
867 *
868  ENDIF
869 *
870  ENDIF
871 *
872  ELSE
873 *
874  neicol = mycol - npstr
875 *
876  IF (neicol .EQ. 0) THEN
877  bmn = bw - bwu
878  ELSE
879  bmn = bw
880  ENDIF
881 *
882  CALL clamov( 'G', bm, nrhs, work(ln+1), ldw,
883  $ work(nb+bwu+bmn+1), ldw )
884 *
885  CALL cgerv2d( ictxt, bmn, nrhs, work( nb+bwu+1 ),
886  $ ldw, 0, neicol )
887 *
888 * and do the permutations and eliminations
889 *
890  IF (npact .NE. 2) THEN
891 *
892 * Solve locally for BW variables
893 *
894  CALL claswp( nrhs, work(nb+bwu+1), ldw, 1, bw,
895  $ ipiv(ln+1), 1)
896 *
897  CALL ctrsm('L','L','N','U', bw, nrhs, cone,
898  $ af(bbptr+bw*ldbb), ldbb, work(nb+bwu+1), ldw)
899 *
900 * Use soln just calculated to update RHS
901 *
902  CALL cgemm( 'N', 'N', bm+bmn-bw, nrhs, bw,
903  $ -cone, af(bbptr+bw*ldbb+bw), ldbb,
904  $ work(nb+bwu+1), ldw,
905  $ cone, work(nb+bwu+1+bw), ldw )
906 *
907 * Give answers back to partner processor
908 *
909  CALL cgesd2d( ictxt, bm+bmn-bw, nrhs,
910  $ work(nb+bwu+1+bw), ldw, 0, neicol )
911 *
912  ELSE
913 *
914 * Finish up calculations for final level
915 *
916  CALL claswp( nrhs, work(nb+bwu+1), ldw, 1, bm+bmn,
917  $ ipiv(ln+1), 1)
918 *
919  CALL ctrsm('L','L','N','U', bm+bmn, nrhs, cone,
920  $ af(bbptr+bw*ldbb), ldbb, work(nb+bwu+1), ldw)
921  ENDIF
922 *
923  ENDIF
924 *
925  npact = (npact + 1)/2
926  npstr = npstr * 2
927  GOTO 200
928 *
929  ENDIF
930 *
931  300 CONTINUE
932 *
933 *
934 **************************************
935 * BACKSOLVE
936 ********************************************************************
937 * PHASE 2: Global computation phase -- Solve U*Y = X
938 ********************************************************************
939 *
940  IF (npcol.EQ.1) THEN
941 *
942 * In this case the loop over the levels will not be
943 * performed.
944 * In fact, the backsolve portion was done in the call to
945 * CGETRS in the frontsolve.
946 *
947  ENDIF
948 *
949 * Compute variable needed to reverse loop structure in
950 * reduced system.
951 *
952  recovery_val = npact*npstr - npcol
953 *
954 * Loop over levels
955 * Terminal values of NPACT and NPSTR from frontsolve are used
956 *
957  2200 IF( npact .GE. npcol ) GOTO 2300
958 *
959  npstr = npstr/2
960 *
961  npact = npact*2
962 *
963 * Have to adjust npact for non-power-of-2
964 *
965  npact = npact-mod( (recovery_val/npstr), 2 )
966 *
967 * Find size of submatrix in this proc at this level
968 *
969  IF( mycol/npstr .LT. npact-1 ) THEN
970  bn = bw
971  ELSE
972  bn = min(bw, numroc(n, nb, npcol-1, 0, npcol) )
973  ENDIF
974 *
975 * If this processor is even in this level...
976 *
977  IF( mod( mycol, 2*npstr ) .EQ. 0 ) THEN
978 *
979  neicol = mycol+npstr
980 *
981  IF( neicol/npstr .LE. npact-1 ) THEN
982 *
983  IF( neicol/npstr .LT. npact-1 ) THEN
984  bmn = bw
985  bnn = bw
986  ELSE
987  bmn = min(bw,numroc(n, nb, neicol, 0, npcol))+bwu
988  bnn = min(bw, numroc(n, nb, neicol, 0, npcol) )
989  ENDIF
990 *
991  IF( npact .GT. 2 ) THEN
992 *
993  CALL cgesd2d( ictxt, 2*bw, nrhs, work( ln+1 ),
994  $ ldw, 0, neicol )
995 *
996  CALL cgerv2d( ictxt, bw, nrhs, work( ln+1 ),
997  $ ldw, 0, neicol )
998 *
999  ELSE
1000 *
1001  CALL cgerv2d( ictxt, bw, nrhs, work( ln+1 ),
1002  $ ldw, 0, neicol )
1003 *
1004  ENDIF
1005 *
1006  ENDIF
1007 *
1008  ELSE
1009 * This processor is odd on this level
1010 *
1011  neicol = mycol - npstr
1012 *
1013  IF (neicol .EQ. 0) THEN
1014  bmn = bw - bwu
1015  ELSE
1016  bmn = bw
1017  ENDIF
1018 *
1019  IF( neicol .LT. npcol-1 ) THEN
1020  bnn = bw
1021  ELSE
1022  bnn = min(bw, numroc(n, nb, neicol, 0, npcol) )
1023  ENDIF
1024 *
1025  IF( npact .GT. 2 ) THEN
1026 *
1027 * Move RHS to make room for received solutions
1028 *
1029  CALL clamov( 'G', bw, nrhs, work(nb+bwu+1),
1030  $ ldw, work(nb+bwu+bw+1), ldw )
1031 *
1032  CALL cgerv2d( ictxt, 2*bw, nrhs, work( ln+1 ),
1033  $ ldw, 0, neicol )
1034 *
1035  CALL cgemm( 'N', 'N', bw, nrhs, bn,
1036  $ -cone, af(bbptr), ldbb,
1037  $ work(ln+1), ldw,
1038  $ cone, work(nb+bwu+bw+1), ldw )
1039 *
1040 *
1041  IF( mycol .GT. npstr ) THEN
1042 *
1043  CALL cgemm( 'N', 'N', bw, nrhs, bw,
1044  $ -cone, af(bbptr+2*bw*ldbb), ldbb,
1045  $ work(ln+bw+1), ldw,
1046  $ cone, work(nb+bwu+bw+1), ldw )
1047 *
1048  ENDIF
1049 *
1050  CALL ctrsm('L','U','N','N', bw, nrhs, cone,
1051  $ af(bbptr+bw*ldbb), ldbb, work(nb+bwu+bw+1), ldw)
1052 *
1053 * Send new solution to neighbor
1054 *
1055  CALL cgesd2d( ictxt, bw, nrhs,
1056  $ work( nb+bwu+bw+1 ), ldw, 0, neicol )
1057 *
1058 * Copy new solution into expected place
1059 *
1060  CALL clamov( 'G', bw, nrhs, work(nb+bwu+1+bw),
1061  $ ldw, work(ln+bw+1), ldw )
1062 *
1063  ELSE
1064 *
1065 * Solve with local diagonal block
1066 *
1067  CALL ctrsm( 'L','U','N','N', bn+bnn, nrhs, cone,
1068  $ af(bbptr+bw*ldbb), ldbb, work(nb+bwu+1), ldw)
1069 *
1070 * Send new solution to neighbor
1071 *
1072  CALL cgesd2d( ictxt, bw, nrhs,
1073  $ work(nb+bwu+1), ldw, 0, neicol )
1074 *
1075 * Shift solutions into expected positions
1076 *
1077  CALL clamov( 'G', bnn+bn-bw, nrhs, work(nb+bwu+1+bw),
1078  $ ldw, work(ln+1), ldw )
1079 *
1080 *
1081  IF( (nb+bwu+1) .NE. (ln+1+bw) ) THEN
1082 *
1083 * Copy one row at a time since spaces may overlap
1084 *
1085  DO 1064 j=1, bw
1086  CALL ccopy( nrhs, work(nb+bwu+j), ldw,
1087  $ work(ln+bw+j), ldw )
1088  1064 CONTINUE
1089 *
1090  ENDIF
1091 *
1092  ENDIF
1093 *
1094  ENDIF
1095 *
1096  GOTO 2200
1097 *
1098  2300 CONTINUE
1099 * End of loop over levels
1100 *
1101 ********************************************************************
1102 * PHASE 1: (Almost) Local computation phase -- Solve U*Y = X
1103 ********************************************************************
1104 *
1105 * Reset BM to value it had before reduced system frontsolve...
1106 *
1107  IF (mycol .NE. npcol-1) THEN
1108  bm = bw - lbwu
1109  ELSE
1110  bm = min(bw,odd_size) + bwu
1111  ENDIF
1112 *
1113 * First metastep is to account for the fillin blocks AF
1114 *
1115  IF( mycol .LT. npcol-1 ) THEN
1116 *
1117  CALL cgesd2d( ictxt, bw, nrhs, work( nb-bw+1 ),
1118  $ ldw, 0, mycol+1 )
1119 *
1120  ENDIF
1121 *
1122  IF( mycol .GT. 0 ) THEN
1123 *
1124  CALL cgerv2d( ictxt, bw, nrhs, work( nb+bwu+1 ),
1125  $ ldw, 0, mycol-1 )
1126 *
1127 * Modify local right hand sides with received rhs's
1128 *
1129  CALL cgemm( 'N', 'N', lm-bm, nrhs, bw, -cone,
1130  $ af( 1 ), lm, work( nb+bwu+1 ), ldw, cone,
1131  $ work( 1 ), ldw )
1132 *
1133  ENDIF
1134 *
1135  DO 2021 j = ln, 1, -1
1136 *
1137  lmj = min( bw, odd_size-1 )
1138 *
1139  lptr = bw-1+j*llda+aptr
1140 *
1141 * In the following, the TRANS=T option is used to reverse
1142 * the order of multiplication, not as a true transpose
1143 *
1144  CALL cgemv( 'T', lmj, nrhs, -cone, work( j+1), ldw,
1145  $ a( lptr ), llda-1, cone, work( j ), ldw )
1146 *
1147 * Divide by diagonal element
1148 *
1149  CALL cscal( nrhs, cone/a( lptr-llda+1 ),
1150  $ work( j ), ldw )
1151  2021 CONTINUE
1152 *
1153 *
1154 *
1155  CALL clamov( 'G', odd_size, nrhs, work( 1 ), ldw,
1156  $ b( 1 ), lldb )
1157 *
1158 * Free BLACS space used to hold standard-form grid.
1159 *
1160  ictxt = ictxt_save
1161  IF( ictxt .NE. ictxt_new ) THEN
1162  CALL blacs_gridexit( ictxt_new )
1163  ENDIF
1164 *
1165  1234 CONTINUE
1166 *
1167 * Restore saved input parameters
1168 *
1169  np = np_save
1170 *
1171 * Output worksize
1172 *
1173  work( 1 ) = work_size_min
1174 *
1175  RETURN
1176 *
1177 * End of PCGBTRS
1178 *
1179  END