ScaLAPACK 2.1
ScaLAPACK: Scalable Linear Algebra PACKage
psdbtrf.f
1  SUBROUTINE psdbtrf( N, BWL, BWU, A, JA, DESCA, AF, LAF, WORK,
2  $ LWORK, INFO )
3 *
4 * -- ScaLAPACK routine (version 2.0.2) --
5 * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6 * May 1 2012
7 *
8 * .. Scalar Arguments ..
9  INTEGER BWL, BWU, INFO, JA, LAF, LWORK, N
10 * ..
11 * .. Array Arguments ..
12  INTEGER DESCA( * )
13  REAL A( * ), AF( * ), WORK( * )
14 * ..
15 *
16 * Purpose
17 * =======
18 *
19 * PSDBTRF computes an LU factorization
20 * of an N-by-N real banded
21 * diagonally dominant-like distributed matrix
22 * with bandwidths BWL, BWU: A(1:N, JA:JA+N-1).
23 * Reordering is used to increase parallelism in the factorization.
24 * This reordering results in factors that are DIFFERENT from those
25 * produced by equivalent sequential codes. These factors cannot
26 * be used directly by users; however, they can be used in
27 * subsequent calls to PSDBTRS to solve linear systems.
28 *
29 * The factorization has the form
30 *
31 * P A(1:N, JA:JA+N-1) P^T = L U
32 *
33 * where U is a banded upper triangular matrix and L is banded
34 * lower triangular, and P is a permutation matrix.
35 *
36 * =====================================================================
37 *
38 * Arguments
39 * =========
40 *
41 *
42 * N (global input) INTEGER
43 * The number of rows and columns to be operated on, i.e. the
44 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
45 *
46 * BWL (global input) INTEGER
47 * Number of subdiagonals. 0 <= BWL <= N-1
48 *
49 * BWU (global input) INTEGER
50 * Number of superdiagonals. 0 <= BWU <= N-1
51 *
52 * A (local input/local output) REAL pointer into
53 * local memory to an array with first dimension
54 * LLD_A >=(bwl+bwu+1) (stored in DESCA).
55 * On entry, this array contains the local pieces of the
56 * N-by-N unsymmetric banded distributed matrix
57 * A(1:N, JA:JA+N-1) to be factored.
58 * This local portion is stored in the packed banded format
59 * used in LAPACK. Please see the Notes below and the
60 * ScaLAPACK manual for more detail on the format of
61 * distributed matrices.
62 * On exit, this array contains the details of the
63 * factorization.
64 * Note that permutations are performed on the matrix, so that
65 * the factors returned are different from those returned
66 * by LAPACK.
67 *
68 * JA (global input) INTEGER
69 * The index in the global array A that points to the start of
70 * the matrix to be operated on (which may be either all of A
71 * or a submatrix of A).
72 *
73 * DESCA (global and local input) INTEGER array of dimension DLEN.
74 * if 1D type (DTYPE_A=501), DLEN >= 7;
75 * if 2D type (DTYPE_A=1), DLEN >= 9.
76 * The array descriptor for the distributed matrix A.
77 * Contains information of mapping of A to memory. Please
78 * see NOTES below for full description and options.
79 *
80 * AF (local output) REAL array, dimension LAF.
81 * Auxiliary Fillin Space.
82 * Fillin is created during the factorization routine
83 * PSDBTRF and this is stored in AF. If a linear system
84 * is to be solved using PSDBTRS after the factorization
85 * routine, AF *must not be altered* after the factorization.
86 *
87 * LAF (local input) INTEGER
88 * Size of user-input Auxiliary Fillin space AF. Must be >=
89 * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu)
90 * If LAF is not large enough, an error code will be returned
91 * and the minimum acceptable size will be returned in AF( 1 )
92 *
93 * WORK (local workspace/local output)
94 * REAL temporary workspace. This space may
95 * be overwritten in between calls to routines. WORK must be
96 * the size given in LWORK.
97 * On exit, WORK( 1 ) contains the minimal LWORK.
98 *
99 * LWORK (local input or global input) INTEGER
100 * Size of user-input workspace WORK.
101 * If LWORK is too small, the minimal acceptable size will be
102 * returned in WORK(1) and an error code is returned. LWORK>=
103 * max(bwl,bwu)*max(bwl,bwu)
104 *
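*          As a rough sizing illustration (values assumed, not taken
*          from this routine): with NB = 64, BWL = 2 and BWU = 3,
*             LAF   >= NB*(BWL+BWU) + 6*MAX(BWL,BWU)**2
*                    = 64*5 + 6*9 = 374
*             LWORK >= MAX(BWL,BWU)**2 = 9
*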
105 * INFO (global output) INTEGER
106 * = 0: successful exit
107 * < 0: If the i-th argument is an array and the j-th entry had
108 * an illegal value, then INFO = -(i*100+j), if the i-th
109 * argument is a scalar and had an illegal value, then
110 * INFO = -i.
111 * > 0: If INFO = K<=NPROCS, the submatrix stored on processor
112 * INFO and factored locally was not
113 * diagonally dominant-like, and
114 * the factorization was not completed.
115 * If INFO = K>NPROCS, the submatrix stored on processor
116 * INFO-NPROCS representing interactions with other
117 * processors was not
118 * stably factorable without interchanges,
119 * and the factorization was not completed.
120 *
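*          Decoding illustration (values assumed): an exit with
*          INFO = -604 = -(6*100+4) indicates that entry 4 (NB) of
*          argument 6 (DESCA) had an illegal value, while INFO = -2
*          indicates an illegal value for the scalar argument BWL.
*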
121 * =====================================================================
122 *
123 * Restrictions
124 * ============
125 *
126 * The following are restrictions on the input parameters. Some of these
127 * are temporary and will be removed in future releases, while others
128 * may reflect fundamental technical limitations.
129 *
130 * Non-cyclic restriction: VERY IMPORTANT!
131 * P*NB>= mod(JA-1,NB)+N.
132 * The mapping for matrices must be blocked, reflecting the nature
133 * of the divide and conquer algorithm as a task-parallel algorithm.
134 * This formula in words is: no processor may have more than one
135 * chunk of the matrix.
136 *
137 * Blocksize cannot be too small:
138 * If the matrix spans more than one processor, the following
139 * restriction on NB, the size of each block on each processor,
140 * must hold:
141 * NB >= 2*MAX(BWL,BWU)
142 * The bulk of parallel computation is done on the matrix of size
143 * O(NB) on each processor. If this is too small, divide and conquer
144 * is a poor choice of algorithm.
145 *
146 * Submatrix reference:
147 * JA = IB
148 * Alignment restriction that prevents unnecessary communication.
149 *
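*   A hedged numeric illustration of these restrictions (all values
*   assumed): with N = 1000, JA = 1, BWL = BWU = 5 and P = 4
*   processes, the non-cyclic restriction P*NB >= mod(JA-1,NB)+N
*   requires NB >= 250, and the blocksize restriction requires
*   NB >= 2*MAX(BWL,BWU) = 10, so any NB >= 250 is admissible.
*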
150 *
151 * =====================================================================
152 *
153 * Notes
154 * =====
155 *
156 * If the factorization routine and the solve routine are to be called
157 * separately (to solve various sets of righthand sides using the same
158 * coefficient matrix), the auxiliary space AF *must not be altered*
159 * between calls to the factorization routine and the solve routine.
160 *
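* A minimal call-sequence sketch of this pattern (the PSDBTRS
* argument list shown here is assumed; consult the PSDBTRS header
* for the authoritative interface):
*
*    CALL PSDBTRF( N, BWL, BWU, A, JA, DESCA, AF, LAF,
*   $              WORK, LWORK, INFO )
*    IF( INFO.EQ.0 ) THEN
*       CALL PSDBTRS( 'N', N, BWL, BWU, NRHS, A, JA, DESCA,
*   $                 B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
*    END IF
*
* AF must be passed unchanged between the two calls.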
161 * The best algorithm for solving banded and tridiagonal linear systems
162 * depends on a variety of parameters, especially the bandwidth.
163 * Currently, only algorithms designed for the case N/P >> bw are
164 * implemented. These go by many names, including Divide and Conquer,
165 * Partitioning, domain decomposition-type, etc.
166 *
167 * Algorithm description: Divide and Conquer
168 *
169 * The Divide and Conquer algorithm assumes the matrix is narrowly
170 * banded compared with the number of equations. In this situation,
171 * it is best to distribute the input matrix A one-dimensionally,
172 * with columns atomic and rows divided amongst the processes.
173 * The basic algorithm divides the banded matrix up into
174 * P pieces with one stored on each processor,
175 * and then proceeds in 2 phases for the factorization or 3 for the
176 * solution of a linear system.
177 * 1) Local Phase:
178 * The individual pieces are factored independently and in
179 * parallel. These factors are applied to the matrix creating
180 * fillin, which is stored in a non-inspectable way in auxiliary
181 * space AF. Mathematically, this is equivalent to reordering
182 * the matrix A as P A P^T and then factoring the principal
183 * leading submatrix of size equal to the sum of the sizes of
184 * the matrices factored on each processor. The factors of
185 * these submatrices overwrite the corresponding parts of A
186 * in memory.
187 * 2) Reduced System Phase:
188 * A small (max(bwl,bwu)* (P-1)) system is formed representing
189 * interaction of the larger blocks, and is stored (as are its
190 * factors) in the space AF. A parallel Block Cyclic Reduction
191 * algorithm is used. For a linear system, a parallel front solve
192 * followed by an analogous backsolve, both using the structure
193 * of the factored matrix, are performed.
194 * 3) Backsubstitution Phase:
195 * For a linear system, a local backsubstitution is performed on
196 * each processor in parallel.
197 *
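* For scale (values assumed): with P = 4 processes and
* MAX(BWL,BWU) = 3, the reduced system of phase 2 has order
* MAX(BWL,BWU)*(P-1) = 9, independent of N.
*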
198 * Descriptors
199 * ===========
200 *
201 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
202 *
203 * Note: banded codes can use either the old two dimensional
204 * or new one-dimensional descriptors, though the processor grid in
205 * both cases *must be one-dimensional*. We describe both types below.
206 *
207 * Each global data object is described by an associated description
208 * vector. This vector stores the information required to establish
209 * the mapping between an object element and its corresponding process
210 * and memory location.
211 *
212 * Let A be a generic term for any 2D block cyclically distributed array.
213 * Such a global array has an associated description vector DESCA.
214 * In the following comments, the character _ should be read as
215 * "of the global array".
216 *
217 * NOTATION STORED IN EXPLANATION
218 * --------------- -------------- --------------------------------------
219 * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case,
220 * DTYPE_A = 1.
221 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
222 * the BLACS process grid A is distribu-
223 * ted over. The context itself is glo-
224 * bal, but the handle (the integer
225 * value) may vary.
226 * M_A (global) DESCA( M_ ) The number of rows in the global
227 * array A.
228 * N_A (global) DESCA( N_ ) The number of columns in the global
229 * array A.
230 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
231 * the rows of the array.
232 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
233 * the columns of the array.
234 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
235 * row of the array A is distributed.
236 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
237 * first column of the array A is
238 * distributed.
239 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
240 * array. LLD_A >= MAX(1,LOCr(M_A)).
241 *
242 * Let K be the number of rows or columns of a distributed matrix,
243 * and assume that its process grid has dimension p x q.
244 * LOCr( K ) denotes the number of elements of K that a process
245 * would receive if K were distributed over the p processes of its
246 * process column.
247 * Similarly, LOCc( K ) denotes the number of elements of K that a
248 * process would receive if K were distributed over the q processes of
249 * its process row.
250 * The values of LOCr() and LOCc() may be determined via a call to the
251 * ScaLAPACK tool function, NUMROC:
252 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
253 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
254 * An upper bound for these quantities may be computed by:
255 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
256 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
257 *
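*  For example (all values assumed): with M = 1000, MB_A = 64,
*  RSRC_A = 0 and NPROW = 4, a call such as
*     LOCR = NUMROC( 1000, 64, MYROW, 0, 4 )
*  returns at most ceil( ceil(1000/64)/NPROW )*MB_A = 4*64 = 256
*  rows on any process row.
*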
258 *
259 * One-dimensional descriptors:
260 *
261 * One-dimensional descriptors are a new addition to ScaLAPACK since
262 * version 1.0. They simplify and shorten the descriptor for 1D
263 * arrays.
264 *
265 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
266 * object, we allow 1D arrays to be distributed either over the
267 * first dimension of the array (as if the grid were P-by-1) or the
268 * 2nd dimension (as if the grid were 1-by-P). This choice is
269 * indicated by the descriptor type (501 or 502)
270 * as described below.
271 *
272 * IMPORTANT NOTE: the actual BLACS grid represented by the
273 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
274 * irrespective of which one-dimensional descriptor type
275 * (501 or 502) is input.
276 * This routine will interpret the grid properly either way.
277 * ScaLAPACK routines *do not support intercontext operations* so that
278 * the grid passed to a single ScaLAPACK routine *must be the same*
279 * for all array descriptors passed to that routine.
280 *
281 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
282 * may also be used, since a one-dimensional array is a special case
283 * of a two-dimensional array with one dimension of size unity.
284 * The two-dimensional array used in this case *must* be of the
285 * proper orientation:
286 * If the appropriate one-dimensional descriptor is DTYPE_A=501
287 * (1 by P type), then the two dimensional descriptor must
288 * have a CTXT value that refers to a 1 by P BLACS grid;
289 * If the appropriate one-dimensional descriptor is DTYPE_A=502
290 * (P by 1 type), then the two dimensional descriptor must
291 * have a CTXT value that refers to a P by 1 BLACS grid.
292 *
293 *
294 * Summary of allowed descriptors, types, and BLACS grids:
295 * DTYPE           501         502         1      1
296 * BLACS grid      1xP or Px1  1xP or Px1  1xP    Px1
297 * ---------------------------------------------------
298 * A               OK          NO          OK     NO
299 * B               NO          OK          NO     OK
300 *
301 * Let A be a generic term for any 1D block cyclically distributed array.
302 * Such a global array has an associated description vector DESCA.
303 * In the following comments, the character _ should be read as
304 * "of the global array".
305 *
306 * NOTATION STORED IN EXPLANATION
307 * --------------- ---------- ------------------------------------------
308 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
309 * TYPE_A = 501: 1-by-P grid.
310 * TYPE_A = 502: P-by-1 grid.
311 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
312 * the BLACS process grid A is distribu-
313 * ted over. The context itself is glo-
314 * bal, but the handle (the integer
315 * value) may vary.
316 * N_A (global) DESCA( 3 ) The size of the array dimension being
317 * distributed.
318 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
319 * the distributed dimension of the array.
320 * SRC_A (global) DESCA( 5 ) The process row or column over which the
321 * first row or column of the array
322 * is distributed.
323 * LLD_A (local) DESCA( 6 ) The leading dimension of the local array
324 * storing the local blocks of the distri-
325 * buted array A. Minimum value of LLD_A
326 * depends on TYPE_A.
327 * TYPE_A = 501: LLD_A >= size of
328 * undistributed dimension, at least 1.
329 * TYPE_A = 502: LLD_A >= NB_A, at least 1.
330 * Reserved DESCA( 7 ) Reserved for future use.
331 *
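*  A minimal sketch of a type-501 descriptor for the banded matrix A
*  of this routine (ICTXT, N and NB assumed set by the caller;
*  SRC_A assumed 0; LLD_A follows the requirement stated above for
*  the A argument):
*
*     DESCA( 1 ) = 501
*     DESCA( 2 ) = ICTXT
*     DESCA( 3 ) = N
*     DESCA( 4 ) = NB
*     DESCA( 5 ) = 0
*     DESCA( 6 ) = BWL + BWU + 1
*     DESCA( 7 ) = 0
*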
332 * =====================================================================
333 *
334 * Code Developer: Andrew J. Cleary, University of Tennessee.
335 * Current address: Lawrence Livermore National Labs.
336 * Last modified by: Peter Arbenz, Institute of Scientific Computing,
337 * ETH, Zurich.
338 *
339 * =====================================================================
340 *
341 * .. Parameters ..
342  REAL ONE
343  parameter( one = 1.0e+0 )
344  REAL ZERO
345  parameter( zero = 0.0e+0 )
346  INTEGER INT_ONE
347  parameter( int_one = 1 )
348  INTEGER DESCMULT, BIGNUM
349  parameter( descmult = 100, bignum = descmult*descmult )
350  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
351  $ lld_, mb_, m_, nb_, n_, rsrc_
352  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
353  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
354  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
355 * ..
356 * .. Local Scalars ..
357  INTEGER COMM_PROC, CSRC, FIRST_PROC, I, I1, I2, ICTXT,
358  $ ictxt_new, ictxt_save, idum3, ja_new, laf_min,
359  $ level_dist, llda, max_bw, mbw2, mycol, myrow,
360  $ my_num_cols, nb, next_tri_size_m,
361  $ next_tri_size_n, np, npcol, nprow, np_save,
362  $ odd_size, ofst, part_offset, part_size,
363  $ prev_tri_size_m, prev_tri_size_n, return_code,
364  $ store_n_a, up_prev_tri_size_m,
365  $ up_prev_tri_size_n, work_size_min, work_u
366 * ..
367 * .. Local Arrays ..
368  INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 )
369 * ..
370 * .. External Subroutines ..
371  EXTERNAL blacs_gridexit, blacs_gridinfo, saxpy, sdbtrf,
372  $ desc_convert, sgemm, sgemv, sgerv2d, sgesd2d,
373  $ slamov, slatcpy, stbtrs, strmm, strrv2d,
374  $ strsd2d, globchk, igamx2d, igebr2d, igebs2d,
375  $ pxerbla, reshape
376 * ..
377 * .. External Functions ..
378  INTEGER NUMROC
379  EXTERNAL numroc
380 * ..
381 * .. Intrinsic Functions ..
382  INTRINSIC max, min, mod
383 * ..
384 * .. Executable Statements ..
385 *
386 * Test the input parameters
387 *
388  info = 0
389 *
390 * Convert descriptor into standard form for easy access to
391 * parameters, check that grid is of right shape.
392 *
393  desca_1xp( 1 ) = 501
394 *
395  CALL desc_convert( desca, desca_1xp, return_code )
396 *
397  IF( return_code.NE.0 ) THEN
398  info = -( 6*100+2 )
399  END IF
400 *
401 * Get values out of descriptor for use in code.
402 *
403  ictxt = desca_1xp( 2 )
404  csrc = desca_1xp( 5 )
405  nb = desca_1xp( 4 )
406  llda = desca_1xp( 6 )
407  store_n_a = desca_1xp( 3 )
408 *
409 * Get grid parameters
410 *
411 *
412 * Size of separator blocks is maximum of bandwidths
413 *
414  max_bw = max( bwl, bwu )
415  mbw2 = max_bw*max_bw
416 *
417  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
418  np = nprow*npcol
419 *
420 *
421 *
422  IF( lwork.LT.-1 ) THEN
423  info = -10
424  ELSE IF( lwork.EQ.-1 ) THEN
425  idum3 = -1
426  ELSE
427  idum3 = 1
428  END IF
429 *
430  IF( n.LT.0 ) THEN
431  info = -1
432  END IF
433 *
434  IF( n+ja-1.GT.store_n_a ) THEN
435  info = -( 6*100+6 )
436  END IF
437 *
438  IF( ( bwl.GT.n-1 ) .OR. ( bwl.LT.0 ) ) THEN
439  info = -2
440  END IF
441 *
442  IF( ( bwu.GT.n-1 ) .OR. ( bwu.LT.0 ) ) THEN
443  info = -3
444  END IF
445 *
446  IF( llda.LT.( bwl+bwu+1 ) ) THEN
447  info = -( 6*100+6 )
448  END IF
449 *
450  IF( nb.LE.0 ) THEN
451  info = -( 6*100+4 )
452  END IF
453 *
454 * Argument checking that is specific to Divide & Conquer routine
455 *
456  IF( nprow.NE.1 ) THEN
457  info = -( 6*100+2 )
458  END IF
459 *
460  IF( n.GT.np*nb-mod( ja-1, nb ) ) THEN
461  info = -( 1 )
462  CALL pxerbla( ictxt, 'PSDBTRF, D&C alg.: only 1 block per proc'
463  $ , -info )
464  RETURN
465  END IF
466 *
467  IF( ( ja+n-1.GT.nb ) .AND. ( nb.LT.2*max( bwl, bwu ) ) ) THEN
468  info = -( 6*100+4 )
469  CALL pxerbla( ictxt, 'PSDBTRF, D&C alg.: NB too small', -info )
470  RETURN
471  END IF
472 *
473 *
474 * Check auxiliary storage size
475 *
476  laf_min = nb*( bwl+bwu ) + 6*max( bwl, bwu )*max( bwl, bwu )
477 *
478  IF( laf.LT.laf_min ) THEN
479  info = -8
480 * put minimum value of laf into AF( 1 )
481  af( 1 ) = laf_min
482  CALL pxerbla( ictxt, 'PSDBTRF: auxiliary storage error ',
483  $ -info )
484  RETURN
485  END IF
486 *
487 * Check worksize
488 *
489  work_size_min = max( bwl, bwu )*max( bwl, bwu )
490 *
491  work( 1 ) = work_size_min
492 *
493  IF( lwork.LT.work_size_min ) THEN
494  IF( lwork.NE.-1 ) THEN
495  info = -10
496  CALL pxerbla( ictxt, 'PSDBTRF: worksize error ', -info )
497  END IF
498  RETURN
499  END IF
500 *
501 * Pack params and positions into arrays for global consistency check
502 *
503  param_check( 9, 1 ) = desca( 5 )
504  param_check( 8, 1 ) = desca( 4 )
505  param_check( 7, 1 ) = desca( 3 )
506  param_check( 6, 1 ) = desca( 1 )
507  param_check( 5, 1 ) = ja
508  param_check( 4, 1 ) = bwu
509  param_check( 3, 1 ) = bwl
510  param_check( 2, 1 ) = n
511  param_check( 1, 1 ) = idum3
512 *
513  param_check( 9, 2 ) = 605
514  param_check( 8, 2 ) = 604
515  param_check( 7, 2 ) = 603
516  param_check( 6, 2 ) = 601
517  param_check( 5, 2 ) = 5
518  param_check( 4, 2 ) = 3
519  param_check( 3, 2 ) = 2
520  param_check( 2, 2 ) = 1
521  param_check( 1, 2 ) = 10
522 *
523 * Want to find errors with MIN( ), so if no error, set it to a big
524 * number. If there already is an error, multiply by the
525 * descriptor multiplier.
526 *
527  IF( info.GE.0 ) THEN
528  info = bignum
529  ELSE IF( info.LT.-descmult ) THEN
530  info = -info
531  ELSE
532  info = -info*descmult
533  END IF
534 *
535 * Check consistency across processors
536 *
537  CALL globchk( ictxt, 9, param_check, 9, param_check( 1, 3 ),
538  $ info )
539 *
540 * Prepare output: set info = 0 if no error, and divide by DESCMULT
541 * if error is not in a descriptor entry.
542 *
543  IF( info.EQ.bignum ) THEN
544  info = 0
545  ELSE IF( mod( info, descmult ).EQ.0 ) THEN
546  info = -info / descmult
547  ELSE
548  info = -info
549  END IF
550 *
551  IF( info.LT.0 ) THEN
552  CALL pxerbla( ictxt, 'PSDBTRF', -info )
553  RETURN
554  END IF
555 *
556 * Quick return if possible
557 *
558  IF( n.EQ.0 )
559  $ RETURN
560 *
561 *
562 * Adjust addressing into matrix space to properly get into
563 * the beginning part of the relevant data
564 *
565  part_offset = nb*( ( ja-1 ) / ( npcol*nb ) )
566 *
567  IF( ( mycol-csrc ).LT.( ja-part_offset-1 ) / nb ) THEN
568  part_offset = part_offset + nb
569  END IF
570 *
571  IF( mycol.LT.csrc ) THEN
572  part_offset = part_offset - nb
573  END IF
574 *
575 * Form a new BLACS grid (the "standard form" grid) with only procs
576 * holding part of the matrix, of size 1xNP where NP is adjusted,
577 * starting at csrc=0, with JA modified to reflect dropped procs.
578 *
579 * First processor to hold part of the matrix:
580 *
581  first_proc = mod( ( ja-1 ) / nb+csrc, npcol )
582 *
583 * Calculate the new value of JA while dropping off unused processors.
584 *
585  ja_new = mod( ja-1, nb ) + 1
586 *
587 * Save and compute new value of NP
588 *
589  np_save = np
590  np = ( ja_new+n-2 ) / nb + 1
591 *
592 * Call utility routine that forms "standard-form" grid
593 *
594  CALL reshape( ictxt, int_one, ictxt_new, int_one, first_proc,
595  $ int_one, np )
596 *
597 * Use new context from standard grid as context.
598 *
599  ictxt_save = ictxt
600  ictxt = ictxt_new
601  desca_1xp( 2 ) = ictxt_new
602 *
603 * Get information about new grid.
604 *
605  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
606 *
607 * Drop out processors that do not have part of the matrix.
608 *
609  IF( myrow.LT.0 ) THEN
610  GO TO 140
611  END IF
612 *
613 * ********************************
614 * Values reused throughout routine
615 *
616 * User-input value of partition size
617 *
618  part_size = nb
619 *
620 * Number of columns in each processor
621 *
622  my_num_cols = numroc( n, part_size, mycol, 0, npcol )
623 *
624 * Offset in columns to beginning of main partition in each proc
625 *
626  IF( mycol.EQ.0 ) THEN
627  part_offset = part_offset + mod( ja_new-1, part_size )
628  my_num_cols = my_num_cols - mod( ja_new-1, part_size )
629  END IF
630 *
631 * Offset in elements
632 *
633  ofst = part_offset*llda
634 *
635 * Size of main (or odd) partition in each processor
636 *
637  odd_size = my_num_cols
638  IF( mycol.LT.np-1 ) THEN
639  odd_size = odd_size - max_bw
640  END IF
641 *
642 * Offset to workspace for Upper triangular factor
643 *
644  work_u = bwu*odd_size + 3*mbw2
645 *
646 *
647 * Zero out space for fillin
648 *
649  DO 10 i = 1, laf_min
650  af( i ) = zero
651  10 CONTINUE
652 *
653 * Zero out space for work
654 *
655  DO 20 i = 1, work_size_min
656  work( i ) = zero
657  20 CONTINUE
658 *
659 * Begin main code
660 *
661 *
662 ********************************************************************
663 * PHASE 1: Local computation phase.
664 ********************************************************************
665 *
666 *
667 * Sizes of the extra triangles communicated between processors
668 *
669  IF( mycol.GT.0 ) THEN
670  prev_tri_size_m = min( bwl, numroc( n, part_size, mycol, 0,
671  $ npcol ) )
672  prev_tri_size_n = min( bwl, numroc( n, part_size, mycol-1, 0,
673  $ npcol ) )
674  END IF
675 *
676  IF( mycol.GT.0 ) THEN
677  up_prev_tri_size_m = min( bwu,
678  $ numroc( n, part_size, mycol, 0, npcol ) )
679  up_prev_tri_size_n = min( bwu,
680  $ numroc( n, part_size, mycol-1, 0,
681  $ npcol ) )
682  END IF
683 *
684  IF( mycol.LT.npcol-1 ) THEN
685  next_tri_size_m = min( bwl, numroc( n, part_size, mycol+1, 0,
686  $ npcol ) )
687  next_tri_size_n = min( bwl, numroc( n, part_size, mycol, 0,
688  $ npcol ) )
689  END IF
690 *
691  IF( mycol.LT.np-1 ) THEN
692 * Transfer last triangle D_i of local matrix to next processor
693 * which needs it to calculate fillin due to factorization of
694 * its main (odd) block A_i.
695 * Overlap the send with the factorization of A_i.
696 *
697  CALL strsd2d( ictxt, 'U', 'N', next_tri_size_m,
698  $ next_tri_size_n, a( ofst+( my_num_cols-bwl )*
699  $ llda+( bwl+bwu+1 ) ), llda-1, 0, mycol+1 )
700 *
701  END IF
702 *
703 *
704 * Factor main partition A_i = L_i {U_i} in each processor
705 *
706  CALL sdbtrf( odd_size, odd_size, bwl, bwu, a( ofst+1 ), llda,
707  $ info )
708 *
709  IF( info.NE.0 ) THEN
710  info = mycol + 1
711  GO TO 30
712  END IF
713 *
714 *
715  IF( mycol.LT.np-1 ) THEN
716 *
717 * Apply factorization to lower connection block BL_i
718 * transpose the connection block in preparation.
719 * Apply factorization to upper connection block BU_i
720 * Move the connection block in preparation.
721 *
722  CALL slatcpy( 'U', bwl, bwl, a( ( ofst+( bwl+bwu+1 )+
723  $ ( odd_size-bwl )*llda ) ), llda-1,
724  $ af( odd_size*bwu+2*mbw2+1+max_bw-bwl ), max_bw )
725  CALL slamov( 'L', bwu, bwu, a( ( ofst+1+odd_size*llda ) ),
726  $ llda-1, af( work_u+odd_size*bwl+2*mbw2+1+max_bw-
727  $ bwu ), max_bw )
728 *
729 * Perform the triangular system solve {L_i}{{BU'}_i} = {B_i}
730 *
731  CALL stbtrs( 'L', 'N', 'U', bwu, bwl, bwu,
732  $ a( ofst+bwu+1+( odd_size-bwu )*llda ), llda,
733  $ af( work_u+odd_size*bwl+2*mbw2+1+max_bw-bwu ),
734  $ max_bw, info )
735 *
736 * Perform the triangular solve {U_i}^T{BL'}_i^T = {BL_i}^T
737 *
738  CALL stbtrs( 'U', 'T', 'N', bwl, bwu, bwl,
739  $ a( ofst+1+( odd_size-bwl )*llda ), llda,
740  $ af( odd_size*bwu+2*mbw2+1+max_bw-bwl ), max_bw,
741  $ info )
742 *
743 * transpose resulting block to its location
744 * in main storage.
745 *
746  CALL slatcpy( 'L', bwl, bwl, af( odd_size*bwu+2*mbw2+1+max_bw-
747  $ bwl ), max_bw, a( ( ofst+( bwl+bwu+1 )+
748  $ ( odd_size-bwl )*llda ) ), llda-1 )
749 *
750 * Move the resulting block back to its location in main storage.
751 *
752  CALL slamov( 'L', bwu, bwu, af( work_u+odd_size*bwl+2*mbw2+1+
753  $ max_bw-bwu ), max_bw, a( ( ofst+1+odd_size*
754  $ llda ) ), llda-1 )
755 *
756 *
757 * Compute contribution to diagonal block(s) of reduced system.
758 * {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i}
759 *
760 * The following method uses more flops than necessary but
761 * does not necessitate the writing of a new BLAS routine.
762 *
763 *
764  CALL sgemm( 'T', 'N', max_bw, max_bw, max_bw, -one,
765  $ af( odd_size*bwu+2*mbw2+1 ), max_bw,
766  $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw, one,
767  $ a( ofst+odd_size*llda+1+bwu ), llda-1 )
768 *
769  END IF
770 * End of "if ( MYCOL .lt. NP-1 )..." loop
771 *
772 *
773  30 CONTINUE
774 * If the processor could not locally factor, it jumps here.
775 *
776  IF( mycol.NE.0 ) THEN
777 * Discard temporary matrix stored beginning in
778 * AF( (odd_size+2*bwl, bwu)*bwl, bwu+1 ) and use for
779 * off_diagonal block of reduced system.
780 *
781 * Receive previously transmitted matrix section, which forms
782 * the right-hand-side for the triangular solve that calculates
783 * the "spike" fillin.
784 *
785 *
786  CALL strrv2d( ictxt, 'U', 'N', prev_tri_size_m,
787  $ prev_tri_size_n, af( work_u+1 ), bwl, 0,
788  $ mycol-1 )
789 *
790  IF( info.EQ.0 ) THEN
791 *
792 * Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ .
793 *
794 * Transpose transmitted triangular matrix $DL_i$
795 *
796  DO 50 i1 = 1, bwl
797  DO 40 i2 = i1 + 1, bwl
798  af( work_u+i2+( i1-1 )*bwl ) = af( work_u+i1+( i2-1 )*
799  $ bwl )
800  af( work_u+i1+( i2-1 )*bwl ) = zero
801  40 CONTINUE
802  50 CONTINUE
803 *
804  DO 60 i1 = 2, odd_size
805  i2 = min( i1-1, bwl )
806  CALL sgemv( 'N', bwl, i2, -one,
807  $ af( work_u+1+( i1-1-i2 )*bwl ), bwl,
808  $ a( ofst+bwu+1+i2+( i1-1-i2 )*llda ), llda-1,
809  $ one, af( work_u+1+( i1-1 )*bwl ), 1 )
810  60 CONTINUE
811 *
812 *
813 * Calculate the "spike" fillin, ${U_i}^T {{GL}_i}^T = {DU_i}^T$
814 *
815 *
816 * Copy D block into AF storage for solve.
817 *
818  CALL slamov( 'L', up_prev_tri_size_n, up_prev_tri_size_m,
819  $ a( ofst+1 ), llda-1, af( 1 ), bwu )
820 *
821  DO 80 i1 = 1, odd_size
822  i2 = min( bwu, i1-1 )
823  CALL sgemv( 'N', bwu, i2, -one, af( ( i1-1-i2 )*bwu+1 ),
824  $ bwu, a( ofst+bwu+1-i2+( i1-1 )*llda ), 1,
825  $ one, af( ( i1-1 )*bwu+1 ), 1 )
826 *
827  DO 70 i = 1, bwu
828  af( ( i1-1 )*bwu+i ) = af( ( i1-1 )*bwu+i ) /
829  $ a( ( i1-1 )*llda+bwu+1 )
830  70 CONTINUE
831  80 CONTINUE
832 *
833 * Calculate the update block for previous proc, E_i = GL_i{GU_i}
834 *
835 *
836 * Zero out space in case result is smaller than storage block
837 *
838  DO 90 i = 1, mbw2
839  af( odd_size*bwu+2*mbw2+i ) = zero
840  90 CONTINUE
841 *
842  CALL sgemm( 'N', 'T', bwu, bwl, odd_size, -one, af( 1 ),
843  $ bwu, af( work_u+1 ), bwl, zero,
844  $ af( 1+max( 0, bwl-bwu )+odd_size*bwu+( 2*max_bw+
845  $ max( 0, bwu-bwl ) )*max_bw ), max_bw )
846 *
847 *
848 * Initiate send of E_i to previous processor to overlap
849 * with next computation.
850 *
851  CALL sgesd2d( ictxt, max_bw, max_bw,
852  $ af( odd_size*bwu+2*mbw2+1 ), max_bw, 0,
853  $ mycol-1 )
854 *
855 *
856  IF( mycol.LT.np-1 ) THEN
857 *
858 * Calculate off-diagonal block(s) of reduced system.
859 * Note: for ease of use in solution of reduced system, store
860 * L's off-diagonal block in transpose form.
861 *
862 * Copy matrix HU_i (the last bwl rows of GU_i) to AFL storage
863 * as per requirements of BLAS routine STRMM.
864 * Since we have GU_i stored,
865 * transpose HU_i to HU_i^T.
866 *
867  CALL slamov( 'N', bwl, bwl,
868  $ af( work_u+( odd_size-bwl )*bwl+1 ), bwl,
869  $ af( ( odd_size )*bwu+1+( max_bw-bwl ) ),
870  $ max_bw )
871 *
872  CALL strmm( 'R', 'U', 'T', 'N', bwl, bwl, -one,
873  $ a( ( ofst+( bwl+bwu+1 )+( odd_size-bwl )*
874  $ llda ) ), llda-1, af( ( odd_size )*bwu+1+
875  $ ( max_bw-bwl ) ), max_bw )
876 *
877 *
878 * Copy matrix HL_i (the last bwu rows of GL_i^T) to AFU store
879 * as per requirements of BLAS routine STRMM.
880 * Since we have GL_i^T stored,
881 * transpose HL_i^T to HL_i.
882 *
883  CALL slamov( 'N', bwu, bwu, af( ( odd_size-bwu )*bwu+1 ),
884  $ bwu, af( work_u+( odd_size )*bwl+1+max_bw-
885  $ bwu ), max_bw )
886 *
887  CALL strmm( 'R', 'L', 'N', 'N', bwu, bwu, -one,
888  $ a( ( ofst+1+odd_size*llda ) ), llda-1,
889  $ af( work_u+( odd_size )*bwl+1+max_bw-bwu ),
890  $ max_bw )
891 *
892  END IF
893 *
894  END IF
895 * End of "if ( MYCOL .ne. 0 )..."
896 *
897  END IF
898 * End of "if (info.eq.0) then"
899 *
900 *
901 * Check to make sure no processors have found errors
902 *
903  CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info, -1, 0,
904  $ 0 )
905 *
906  IF( mycol.EQ.0 ) THEN
907  CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
908  ELSE
909  CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
910  END IF
911 *
912  IF( info.NE.0 ) THEN
913  GO TO 130
914  END IF
915 * No errors found, continue
916 *
917 *
918 ********************************************************************
919 * PHASE 2: Formation and factorization of Reduced System.
920 ********************************************************************
921 *
922 * Gather up local sections of reduced system
923 *
924 *
925 * The last processor does not participate in the factorization of
926 * the reduced system, having sent its E_i already.
927  IF( mycol.EQ.npcol-1 ) THEN
928  GO TO 120
929  END IF
930 *
931 * Initiate send of off-diag block(s) to overlap with next part.
932 * Off-diagonal block needed on neighboring processor to start
933 * algorithm.
934 *
935  IF( ( mod( mycol+1, 2 ).EQ.0 ) .AND. ( mycol.GT.0 ) ) THEN
936 *
937  CALL sgesd2d( ictxt, max_bw, max_bw, af( odd_size*bwu+1 ),
938  $ max_bw, 0, mycol-1 )
939 *
940  CALL sgesd2d( ictxt, max_bw, max_bw,
941  $ af( work_u+odd_size*bwl+1 ), max_bw, 0, mycol-1 )
942 *
943  END IF
944 *
945 * Copy last diagonal block into AF storage for subsequent
946 * operations.
947 *
948  CALL slamov( 'N', max_bw, max_bw, a( ofst+odd_size*llda+bwu+1 ),
949  $ llda-1, af( odd_size*bwu+mbw2+1 ), max_bw )
950 *
951 * Receive contribution to diagonal block that is stored on this proc.
952 *
953  IF( mycol.LT.npcol-1 ) THEN
954 *
955  CALL sgerv2d( ictxt, max_bw, max_bw,
956  $ af( odd_size*bwu+2*mbw2+1 ), max_bw, 0, mycol+1 )
957 *
958 * Add contribution to diagonal block
959 *
960  CALL saxpy( mbw2, one, af( odd_size*bwu+2*mbw2+1 ), 1,
961  $ af( odd_size*bwu+mbw2+1 ), 1 )
962 *
963  END IF
964 *
965 *
966 * *************************************
967 * Modification Loop
968 *
969 * The distance for sending and receiving for each level starts
970 * at 1 for the first level.
971  level_dist = 1
972 *
973 * Do until this proc is needed to modify other procs' equations
974 *
975  100 CONTINUE
976  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
977  $ GO TO 110
978 *
979 * Receive and add contribution to diagonal block from the left
980 *
981  IF( mycol-level_dist.GE.0 ) THEN
982  CALL sgerv2d( ictxt, max_bw, max_bw, work( 1 ), max_bw, 0,
983  $ mycol-level_dist )
984 *
985  CALL saxpy( mbw2, one, work( 1 ), 1, af( odd_size*bwu+mbw2+1 ),
986  $ 1 )
987 *
988  END IF
989 *
990 * Receive and add contribution to diagonal block from the right
991 *
992  IF( mycol+level_dist.LT.npcol-1 ) THEN
993  CALL sgerv2d( ictxt, max_bw, max_bw, work( 1 ), max_bw, 0,
994  $ mycol+level_dist )
995 *
996  CALL saxpy( mbw2, one, work( 1 ), 1, af( odd_size*bwu+mbw2+1 ),
997  $ 1 )
998 *
999  END IF
1000 *
1001  level_dist = level_dist*2
1002 *
1003  GO TO 100
1004  110 CONTINUE
1005 * [End of GOTO Loop]
1006 *
1007 *
1008 * *********************************
1009 * Calculate and use this proc's blocks to modify other procs'...
1010 *
1011 * Factor diagonal block
1012 *
1013  CALL sdbtrf( max_bw, max_bw, min( max_bw-1, bwl ),
1014  $ min( max_bw-1, bwu ), af( odd_size*bwu+mbw2+1-
1015  $ ( min( max_bw-1, bwu ) ) ), max_bw+1, info )
1016 *
1017  IF( info.NE.0 ) THEN
1018  info = npcol + mycol
1019  END IF
1020 *
1021 * ****************************************************************
1022 * Receive offdiagonal block from processor to right.
1023 * If this is the first group of processors, the receive comes
1024 * from a different processor than otherwise.
1025 *
1026  IF( level_dist.EQ.1 ) THEN
1027  comm_proc = mycol + 1
1028 *
1029 * Move block into the place where it will be expected to be
1030 * for later calculations.
1031 *
1032  CALL slamov( 'N', max_bw, max_bw, af( odd_size*bwu+1 ), max_bw,
1033  $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw )
1034 *
1035  CALL slamov( 'N', max_bw, max_bw, af( work_u+odd_size*bwl+1 ),
1036  $ max_bw, af( odd_size*bwu+2*mbw2+1 ), max_bw )
1037 *
1038  ELSE
1039  comm_proc = mycol + level_dist / 2
1040  END IF
1041 *
1042  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1043 *
1044  CALL sgerv2d( ictxt, max_bw, max_bw, af( odd_size*bwu+1 ),
1045  $ max_bw, 0, comm_proc )
1046 *
1047  CALL sgerv2d( ictxt, max_bw, max_bw,
1048  $ af( work_u+odd_size*bwl+1 ), max_bw, 0,
1049  $ comm_proc )
1050 *
1051  IF( info.EQ.0 ) THEN
1052 *
1053 *
1054 * Modify upper off_diagonal block with diagonal block
1055 *
1056 *
1057  CALL stbtrs( 'L', 'N', 'U', bwu, min( bwl, bwu-1 ), bwu,
1058  $ af( odd_size*bwu+mbw2+1+( max_bw+1 )*( max_bw-
1059  $ bwu ) ), max_bw+1, af( work_u+odd_size*bwl+1+
1060  $ max_bw-bwu ), max_bw, info )
1061 *
1062 * Modify lower off_diagonal block with diagonal block
1063 *
1064 *
1065  CALL stbtrs( 'U', 'T', 'N', bwl, min( bwu, bwl-1 ), bwl,
1066  $ af( odd_size*bwu+mbw2+1-min( bwu,
1067  $ bwl-1 )+( max_bw+1 )*( max_bw-bwl ) ),
1068  $ max_bw+1, af( odd_size*bwu+1+max_bw-bwl ),
1069  $ max_bw, info )
1070 *
1071  END IF
1072 * End of "if ( info.eq.0 ) then"
1073 *
1074 * Calculate contribution from this block to next diagonal block
1075 *
1076  CALL sgemm( 'T', 'N', max_bw, max_bw, max_bw, -one,
1077  $ af( ( odd_size )*bwu+1 ), max_bw,
1078  $ af( work_u+( odd_size )*bwl+1 ), max_bw, zero,
1079  $ work( 1 ), max_bw )
1080 *
1081 * Send contribution to diagonal block's owning processor.
1082 *
1083  CALL sgesd2d( ictxt, max_bw, max_bw, work( 1 ), max_bw, 0,
1084  $ mycol+level_dist )
1085 *
1086  END IF
1087 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1088 *
1089 *
1090 * ****************************************************************
1091 * Receive off_diagonal block from left and use to finish with this
1092 * processor.
1093 *
1094  IF( ( mycol / level_dist.GT.0 ) .AND.
1095  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) ) THEN
1096 *
1097  IF( level_dist.GT.1 ) THEN
1098 *
1099 * Receive offdiagonal block(s) from proc level_dist/2 to the
1100 * left
1101 *
1102  CALL sgerv2d( ictxt, max_bw, max_bw,
1103  $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw, 0,
1104  $ mycol-level_dist / 2 )
1105 *
1106 * Receive offdiagonal block(s) from proc level_dist/2 to the
1107 * left
1108 *
1109  CALL sgerv2d( ictxt, max_bw, max_bw,
1110  $ af( odd_size*bwu+2*mbw2+1 ), max_bw, 0,
1111  $ mycol-level_dist / 2 )
1112 *
1113  END IF
1114 *
1115 *
1116  IF( info.EQ.0 ) THEN
1117 *
1118 * Use diagonal block(s) to modify this offdiagonal block
1119 *
1120 *
1121 * Since STBTRS has no "left-right" option, we must transpose
1122 *
1123  CALL slatcpy( 'N', max_bw, max_bw,
1124  $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw,
1125  $ work( 1 ), max_bw )
1126 *
1127  CALL stbtrs( 'L', 'N', 'U', max_bw, min( bwl, max_bw-1 ),
1128  $ bwl, af( odd_size*bwu+mbw2+1 ), max_bw+1,
1129  $ work( 1+max_bw*( max_bw-bwl ) ), max_bw, info )
1130 *
1131 * Transpose back
1132 *
1133  CALL slatcpy( 'N', max_bw, max_bw, work( 1 ), max_bw,
1134  $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw )
1135 *
1136 *
1137 *
1138 * Since STBTRS has no "left-right" option, we must transpose
1139 *
1140  CALL slatcpy( 'N', max_bw, max_bw,
1141  $ af( odd_size*bwu+2*mbw2+1 ), max_bw,
1142  $ work( 1 ), max_bw )
1143 *
1144  CALL stbtrs( 'U', 'T', 'N', max_bw, min( bwu, max_bw-1 ),
1145  $ bwu, af( odd_size*bwu+mbw2+1-min( bwu,
1146  $ max_bw-1 ) ), max_bw+1,
1147  $ work( 1+max_bw*( max_bw-bwu ) ), max_bw, info )
1148 *
1149 * Transpose back
1150 *
1151  CALL slatcpy( 'N', max_bw, max_bw, work( 1 ), max_bw,
1152  $ af( odd_size*bwu+2*mbw2+1 ), max_bw )
1153 *
1154 *
1155  END IF
1156 * End of "if( info.eq.0 ) then"
1157 *
1158 * Use offdiag block(s) to calculate modification to diag block
1159 * of processor to the left
1160 *
1161  CALL sgemm( 'N', 'T', max_bw, max_bw, max_bw, -one,
1162  $ af( ( odd_size )*bwu+2*mbw2+1 ), max_bw,
1163  $ af( work_u+( odd_size )*bwl+2*mbw2+1 ), max_bw,
1164  $ zero, work( 1 ), max_bw )
1165 *
1166 * Send contribution to diagonal block's owning processor.
1167 *
1168  CALL sgesd2d( ictxt, max_bw, max_bw, work( 1 ), max_bw, 0,
1169  $ mycol-level_dist )
1170 *
1171 * *******************************************************
1172 *
1173  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1174 *
1175 * Decide which processor offdiagonal block(s) goes to
1176 *
1177  IF( ( mod( mycol / ( 2*level_dist ), 2 ) ).EQ.0 ) THEN
1178  comm_proc = mycol + level_dist
1179  ELSE
1180  comm_proc = mycol - level_dist
1181  END IF
1182 *
1183 * Use offdiagonal blocks to calculate offdiag
1184 * block to send to neighboring processor. Depending
1185 * on circumstances, may need to transpose the matrix.
1186 *
1187  CALL sgemm( 'N', 'N', max_bw, max_bw, max_bw, -one,
1188  $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw,
1189  $ af( odd_size*bwu+1 ), max_bw, zero, work( 1 ),
1190  $ max_bw )
1191 *
1192 * Send contribution to offdiagonal block's owning processor.
1193 *
1194  CALL sgesd2d( ictxt, max_bw, max_bw, work( 1 ), max_bw, 0,
1195  $ comm_proc )
1196 *
1197  CALL sgemm( 'N', 'N', max_bw, max_bw, max_bw, -one,
1198  $ af( odd_size*bwu+2*mbw2+1 ), max_bw,
1199  $ af( work_u+odd_size*bwl+1 ), max_bw, zero,
1200  $ work( 1 ), max_bw )
1201 *
1202 * Send contribution to offdiagonal block's owning processor.
1203 *
1204  CALL sgesd2d( ictxt, max_bw, max_bw, work( 1 ), max_bw, 0,
1205  $ comm_proc )
1206 *
1207  END IF
1208 *
1209  END IF
1210 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1211 *
1212  120 CONTINUE
1213 *
1214 *
1215  130 CONTINUE
1216 *
1217 *
1218 * Free BLACS space used to hold standard-form grid.
1219 *
1220  IF( ictxt_save.NE.ictxt_new ) THEN
1221  CALL blacs_gridexit( ictxt_new )
1222  END IF
1223 *
1224  140 CONTINUE
1225 *
1226 * Restore saved input parameters
1227 *
1228  ictxt = ictxt_save
1229  np = np_save
1230 *
1231 * Output minimum worksize
1232 *
1233  work( 1 ) = work_size_min
1234 *
1235 * Make INFO consistent across processors
1236 *
1237  CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info, -1, 0,
1238  $ 0 )
1239 *
1240  IF( mycol.EQ.0 ) THEN
1241  CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
1242  ELSE
1243  CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
1244  END IF
1245 *
1246 *
1247  RETURN
1248 *
1249 * End of PSDBTRF
1250 *
1251  END