ScaLAPACK 2.1
ScaLAPACK: Scalable Linear Algebra PACKage
pcdbtrf.f
1  SUBROUTINE pcdbtrf( N, BWL, BWU, A, JA, DESCA, AF, LAF, WORK,
2  $ LWORK, INFO )
3 *
4 * -- ScaLAPACK routine (version 2.0.2) --
5 * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6 * May 1 2012
7 *
8 * .. Scalar Arguments ..
9  INTEGER BWL, BWU, INFO, JA, LAF, LWORK, N
10 * ..
11 * .. Array Arguments ..
12  INTEGER DESCA( * )
13  COMPLEX A( * ), AF( * ), WORK( * )
14 * ..
15 *
16 *
17 * Purpose
18 * =======
19 *
20 * PCDBTRF computes an LU factorization
21 * of an N-by-N complex banded
22 * diagonally dominant-like distributed matrix
23 * with bandwidths BWL and BWU: A(1:N, JA:JA+N-1).
24 * Reordering is used to increase parallelism in the factorization.
25 * This reordering results in factors that are DIFFERENT from those
26 * produced by equivalent sequential codes. These factors cannot
27 * be used directly by users; however, they can be used in
28 * subsequent calls to PCDBTRS to solve linear systems.
29 *
30 * The factorization has the form
31 *
32 * P A(1:N, JA:JA+N-1) P^T = L U
33 *
34 * where U is a banded upper triangular matrix and L is banded
35 * lower triangular, and P is a permutation matrix.
36 *
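* As an illustration only (not part of this routine's source), a
* typical calling sequence factors once with PCDBTRF and then solves
* with PCDBTRS; the PCDBTRS argument list (NRHS, B, IB, DESCB below)
* is sketched here and should be checked against its own
* documentation:
*
*    CALL PCDBTRF( N, BWL, BWU, A, JA, DESCA, AF, LAF, WORK, LWORK,
*   $              INFO )
*    IF( INFO.EQ.0 ) THEN
*       CALL PCDBTRS( 'N', N, BWL, BWU, NRHS, A, JA, DESCA, B, IB,
*   $                 DESCB, AF, LAF, WORK, LWORK, INFO )
*    END IF
*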
37 * =====================================================================
38 *
39 * Arguments
40 * =========
41 *
42 *
43 * N (global input) INTEGER
44 * The number of rows and columns to be operated on, i.e. the
45 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
46 *
47 * BWL (global input) INTEGER
48 * Number of subdiagonals. 0 <= BWL <= N-1
49 *
50 * BWU (global input) INTEGER
51 * Number of superdiagonals. 0 <= BWU <= N-1
52 *
53 * A (local input/local output) COMPLEX pointer into
54 * local memory to an array with first dimension
55 * LLD_A >=(bwl+bwu+1) (stored in DESCA).
56 * On entry, this array contains the local pieces of the
57 * N-by-N unsymmetric banded distributed matrix
58 * A(1:N, JA:JA+N-1) to be factored.
59 * This local portion is stored in the packed banded format
60 * used in LAPACK. Please see the Notes below and the
61 * ScaLAPACK manual for more detail on the format of
62 * distributed matrices.
63 * On exit, this array contains information containing details
64 * of the factorization.
65 * Note that permutations are performed on the matrix, so that
66 * the factors returned are different from those returned
67 * by LAPACK.
68 *
69 * JA (global input) INTEGER
70 * The index in the global array A that points to the start of
71 * the matrix to be operated on (which may be either all of A
72 * or a submatrix of A).
73 *
74 * DESCA (global and local input) INTEGER array of dimension DLEN.
75 * if 1D type (DTYPE_A=501), DLEN >= 7;
76 * if 2D type (DTYPE_A=1), DLEN >= 9 .
77 * The array descriptor for the distributed matrix A.
78 * Contains information of mapping of A to memory. Please
79 * see NOTES below for full description and options.
80 *
81 * AF (local output) COMPLEX array, dimension LAF.
82 * Auxiliary Fillin Space.
83 * Fillin is created during the factorization routine
84 * PCDBTRF and this is stored in AF. If a linear system
85 * is to be solved using PCDBTRS after the factorization
86 * routine, AF *must not be altered* after the factorization.
87 *
88 * LAF (local input) INTEGER
89 * Size of user-input Auxiliary Fillin space AF. Must be >=
90 * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu)
91 * If LAF is not large enough, an error code will be returned
92 * and the minimum acceptable size will be returned in AF( 1 )
93 *
94 * WORK (local workspace/local output)
95 * COMPLEX temporary workspace. This space may
96 * be overwritten in between calls to routines. WORK must be
97 * the size given in LWORK.
98 * On exit, WORK( 1 ) contains the minimal LWORK.
99 *
100 * LWORK (local input or global input) INTEGER
101 * Size of user-input workspace WORK.
102 * If LWORK is too small, the minimal acceptable size will be
103 * returned in WORK(1) and an error code is returned. LWORK>=
104 * max(bwl,bwu)*max(bwl,bwu)
105 *
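* Both sizes can be computed directly from the formulas above, as in
* this sketch (NB is the blocking factor NB_A stored in DESCA):
*
*    LAF   = NB*( BWL+BWU ) + 6*MAX( BWL, BWU )*MAX( BWL, BWU )
*    LWORK = MAX( BWL, BWU )*MAX( BWL, BWU )
*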
106 * INFO (global output) INTEGER
107 * = 0: successful exit
108 * < 0: If the i-th argument is an array and the j-th entry had
109 * an illegal value, then INFO = -(i*100+j); if the i-th
110 * argument is a scalar and had an illegal value, then
111 * INFO = -i.
112 * > 0: If INFO = K<=NPROCS, the submatrix stored on processor
113 * INFO and factored locally was not
114 * diagonally dominant-like, and
115 * the factorization was not completed.
116 * If INFO = K>NPROCS, the submatrix stored on processor
117 * INFO-NPROCS representing interactions with other
118 * processors was not
119 * stably factorable without interchanges,
120 * and the factorization was not completed.
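*
* As a sketch, a caller can decode a negative INFO as follows (IARG
* and IENTRY are illustrative local variables, not routine arguments):
*
*    IF( INFO.LT.-100 ) THEN
*       IARG   = ( -INFO ) / 100
*       IENTRY = MOD( -INFO, 100 )
*    ELSE IF( INFO.LT.0 ) THEN
*       IARG   = -INFO
*    END IF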
121 *
122 * =====================================================================
123 *
124 *
125 * Restrictions
126 * ============
127 *
128 * The following are restrictions on the input parameters. Some of these
129 * are temporary and will be removed in future releases, while others
130 * may reflect fundamental technical limitations.
131 *
132 * Non-cyclic restriction: VERY IMPORTANT!
133 * P*NB>= mod(JA-1,NB)+N.
134 * The mapping for matrices must be blocked, reflecting the nature
135 * of the divide and conquer algorithm as a task-parallel algorithm.
136 * This formula in words is: no processor may have more than one
137 * chunk of the matrix.
138 *
139 * Blocksize cannot be too small:
140 * If the matrix spans more than one processor, the following
141 * restriction on NB, the size of each block on each processor,
142 * must hold:
143 * NB >= 2*MAX(BWL,BWU)
144 * The bulk of parallel computation is done on the matrix of size
145 * O(NB) on each processor. If this is too small, divide and conquer
146 * is a poor choice of algorithm.
147 *
148 * Submatrix reference:
149 * JA = IB
150 * Alignment restriction that prevents unnecessary communication.
151 *
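* These restrictions can be checked before the call, as in this sketch
* (NPCOL is the number of process columns holding the matrix; IB is
* the row index used in the subsequent call to PCDBTRS):
*
*    OK = ( NPCOL*NB .GE. MOD( JA-1, NB ) + N )
*    IF( JA+N-1 .GT. NB )
*   $   OK = OK .AND. ( NB .GE. 2*MAX( BWL, BWU ) )
*    OK = OK .AND. ( JA .EQ. IB )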
152 *
153 * =====================================================================
154 *
155 *
156 * Notes
157 * =====
158 *
159 * If the factorization routine and the solve routine are to be called
160 * separately (to solve various sets of righthand sides using the same
161 * coefficient matrix), the auxiliary space AF *must not be altered*
162 * between calls to the factorization routine and the solve routine.
163 *
164 * The best algorithm for solving banded and tridiagonal linear systems
165 * depends on a variety of parameters, especially the bandwidth.
166 * Currently, only algorithms designed for the case N/P >> bw are
167 * implemented. These go by many names, including Divide and Conquer,
168 * Partitioning, domain decomposition-type, etc.
169 *
170 * Algorithm description: Divide and Conquer
171 *
172 * The Divide and Conquer algorithm assumes the matrix is narrowly
173 * banded compared with the number of equations. In this situation,
174 * it is best to distribute the input matrix A one-dimensionally,
175 * with columns atomic and rows divided amongst the processes.
176 * The basic algorithm divides the banded matrix up into
177 * P pieces with one stored on each processor,
178 * and then proceeds in 2 phases for the factorization or 3 for the
179 * solution of a linear system.
180 * 1) Local Phase:
181 * The individual pieces are factored independently and in
182 * parallel. These factors are applied to the matrix creating
183 * fillin, which is stored in a non-inspectable way in auxiliary
184 * space AF. Mathematically, this is equivalent to reordering
185 * the matrix A as P A P^T and then factoring the principal
186 * leading submatrix of size equal to the sum of the sizes of
187 * the matrices factored on each processor. The factors of
188 * these submatrices overwrite the corresponding parts of A
189 * in memory.
190 * 2) Reduced System Phase:
191 * A small (max(bwl,bwu)* (P-1)) system is formed representing
192 * interaction of the larger blocks, and is stored (as are its
193 * factors) in the space AF. A parallel Block Cyclic Reduction
194 * algorithm is used. For a linear system, a parallel front solve
195 * followed by an analogous backsolve, both using the structure
196 * of the factored matrix, are performed.
197 * 3) Backsubstitution Phase:
198 * For a linear system, a local backsubstitution is performed on
199 * each processor in parallel.
200 *
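* The per-process partition sizes follow from this block layout; a
* sketch (assuming every process of the 1-by-NPCOL grid holds part of
* the matrix, with MAX_BW = MAX( BWL, BWU )):
*
*    MY_NUM_COLS = NUMROC( N, NB, MYCOL, 0, NPCOL )
*    ODD_SIZE    = MY_NUM_COLS
*    IF( MYCOL.LT.NPCOL-1 )
*   $   ODD_SIZE = ODD_SIZE - MAX_BW
*
* The trailing MAX_BW columns on each process but the last form the
* separators that make up the MAX_BW*(NPCOL-1) reduced system.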
201 *
202 * Descriptors
203 * ===========
204 *
205 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
206 *
207 * Note: banded codes can use either the old two dimensional
208 * or new one-dimensional descriptors, though the processor grid in
209 * both cases *must be one-dimensional*. We describe both types below.
210 *
211 * Each global data object is described by an associated description
212 * vector. This vector stores the information required to establish
213 * the mapping between an object element and its corresponding process
214 * and memory location.
215 *
216 * Let A be a generic term for any 2D block cyclically distributed array.
217 * Such a global array has an associated description vector DESCA.
218 * In the following comments, the character _ should be read as
219 * "of the global array".
220 *
221 * NOTATION STORED IN EXPLANATION
222 * --------------- -------------- --------------------------------------
223 * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case,
224 * DTYPE_A = 1.
225 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
226 * the BLACS process grid A is distribu-
227 * ted over. The context itself is glo-
228 * bal, but the handle (the integer
229 * value) may vary.
230 * M_A (global) DESCA( M_ ) The number of rows in the global
231 * array A.
232 * N_A (global) DESCA( N_ ) The number of columns in the global
233 * array A.
234 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
235 * the rows of the array.
236 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
237 * the columns of the array.
238 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
239 * row of the array A is distributed.
240 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
241 * first column of the array A is
242 * distributed.
243 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
244 * array. LLD_A >= MAX(1,LOCr(M_A)).
245 *
246 * Let K be the number of rows or columns of a distributed matrix,
247 * and assume that its process grid has dimension p x q.
248 * LOCr( K ) denotes the number of elements of K that a process
249 * would receive if K were distributed over the p processes of its
250 * process column.
251 * Similarly, LOCc( K ) denotes the number of elements of K that a
252 * process would receive if K were distributed over the q processes of
253 * its process row.
254 * The values of LOCr() and LOCc() may be determined via a call to the
255 * ScaLAPACK tool function, NUMROC:
256 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
257 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
258 * An upper bound for these quantities may be computed by:
259 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
260 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
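*
* For example, the local array backing a 2D block-cyclic matrix can be
* sized with NUMROC (a sketch; LOCROWS, LOCCOLS and LLDA are
* illustrative variables):
*
*    LOCROWS = NUMROC( M_A, MB_A, MYROW, RSRC_A, NPROW )
*    LOCCOLS = NUMROC( N_A, NB_A, MYCOL, CSRC_A, NPCOL )
*    LLDA    = MAX( 1, LOCROWS )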
261 *
262 *
263 * One-dimensional descriptors:
264 *
265 * One-dimensional descriptors are a new addition to ScaLAPACK since
266 * version 1.0. They simplify and shorten the descriptor for 1D
267 * arrays.
268 *
269 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
270 * object, we allow 1D arrays to be distributed either over the
271 * first dimension of the array (as if the grid were P-by-1) or the
272 * 2nd dimension (as if the grid were 1-by-P). This choice is
273 * indicated by the descriptor type (501 or 502)
274 * as described below.
275 *
276 * IMPORTANT NOTE: the actual BLACS grid represented by the
277 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
278 * irrespective of which one-dimensional descriptor type
279 * (501 or 502) is input.
280 * This routine will interpret the grid properly either way.
281 * ScaLAPACK routines *do not support intercontext operations* so that
282 * the grid passed to a single ScaLAPACK routine *must be the same*
283 * for all array descriptors passed to that routine.
284 *
285 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
286 * may also be used, since a one-dimensional array is a special case
287 * of a two-dimensional array with one dimension of size unity.
288 * The two-dimensional array used in this case *must* be of the
289 * proper orientation:
290 * If the appropriate one-dimensional descriptor is DTYPEA=501
291 * (1 by P type), then the two dimensional descriptor must
292 * have a CTXT value that refers to a 1 by P BLACS grid;
293 * If the appropriate one-dimensional descriptor is DTYPEA=502
294 * (P by 1 type), then the two dimensional descriptor must
295 * have a CTXT value that refers to a P by 1 BLACS grid.
296 *
297 *
298 * Summary of allowed descriptors, types, and BLACS grids:
299 * DTYPE        501         502          1         1
300 * BLACS grid   1xP or Px1  1xP or Px1   1xP       Px1
301 * -----------------------------------------------------
302 * A            OK          NO           OK        NO
303 * B            NO          OK           NO        OK
304 *
305 * Let A be a generic term for any 1D block cyclically distributed array.
306 * Such a global array has an associated description vector DESCA.
307 * In the following comments, the character _ should be read as
308 * "of the global array".
309 *
310 * NOTATION STORED IN EXPLANATION
311 * --------------- ---------- ------------------------------------------
312 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
313 * TYPE_A = 501: 1-by-P grid.
314 * TYPE_A = 502: P-by-1 grid.
315 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
316 * the BLACS process grid A is distribu-
317 * ted over. The context itself is glo-
318 * bal, but the handle (the integer
319 * value) may vary.
320 * N_A (global) DESCA( 3 ) The size of the array dimension being
321 * distributed.
322 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
323 * the distributed dimension of the array.
324 * SRC_A (global) DESCA( 5 ) The process row or column over which the
325 * first row or column of the array
326 * is distributed.
327 * LLD_A (local) DESCA( 6 ) The leading dimension of the local array
328 * storing the local blocks of the distri-
329 * buted array A. Minimum value of LLD_A
330 * depends on TYPE_A.
331 * TYPE_A = 501: LLD_A >=
332 * size of undistributed dimension, 1.
333 * TYPE_A = 502: LLD_A >=NB_A, 1.
334 * Reserved DESCA( 7 ) Reserved for future use.
335 *
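* For this routine, a type-501 descriptor can be assembled as in the
* following sketch (ICTXT is a BLACS context for a one-dimensional
* process grid; taking 0 as the source process and 0 for the reserved
* entry are choices of the example):
*
*    DESCA( 1 ) = 501
*    DESCA( 2 ) = ICTXT
*    DESCA( 3 ) = N
*    DESCA( 4 ) = NB
*    DESCA( 5 ) = 0
*    DESCA( 6 ) = BWL + BWU + 1
*    DESCA( 7 ) = 0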
336 *
337 *
338 * =====================================================================
339 *
340 * Code Developer: Andrew J. Cleary, University of Tennessee.
341 * Current address: Lawrence Livermore National Labs.
342 * This version released: August, 2001.
343 *
344 * =====================================================================
345 *
346 * ..
347 * .. Parameters ..
348  REAL ONE, ZERO
349  parameter( one = 1.0e+0 )
350  parameter( zero = 0.0e+0 )
351  COMPLEX CONE, CZERO
352  parameter( cone = ( 1.0e+0, 0.0e+0 ) )
353  parameter( czero = ( 0.0e+0, 0.0e+0 ) )
354  INTEGER INT_ONE
355  parameter( int_one = 1 )
356  INTEGER DESCMULT, BIGNUM
357  parameter(descmult = 100, bignum = descmult * descmult)
358  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
359  $ lld_, mb_, m_, nb_, n_, rsrc_
360  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
361  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
362  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
363 * ..
364 * .. Local Scalars ..
365  INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT,
366  $ ictxt_new, ictxt_save, idum3, ja_new, laf_min,
367  $ level_dist, llda, max_bw, mbw2, mycol, myrow,
368  $ my_num_cols, nb, next_tri_size_m,
369  $ next_tri_size_n, np, npcol, nprow, np_save,
370  $ odd_size, ofst, part_offset, part_size,
371  $ prev_tri_size_m, prev_tri_size_n, return_code,
372  $ store_n_a, up_prev_tri_size_m,
373  $ up_prev_tri_size_n, work_size_min, work_u
374 * ..
375 * .. Local Arrays ..
376  INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 )
377 * ..
378 * .. External Subroutines ..
379  EXTERNAL blacs_get, blacs_gridexit, blacs_gridinfo,
380  $ caxpy, cgemm, cgerv2d, cgesd2d, clamov,
381  $ clatcpy, cpbtrf, cpotrf, csyrk, ctbtrs, ctrmm,
382  $ ctrrv2d, ctrsd2d, ctrsm, ctrtrs, desc_convert,
383  $ globchk, pxerbla, reshape
384 * ..
385 * .. External Functions ..
386  LOGICAL LSAME
387  INTEGER NUMROC
388  EXTERNAL lsame, numroc
389 * ..
390 * .. Intrinsic Functions ..
391  INTRINSIC ichar, min, mod
392 * ..
393 * .. Executable Statements ..
394 *
395 * Test the input parameters
396 *
397  info = 0
398 *
399 * Convert descriptor into standard form for easy access to
400 * parameters, check that grid is of right shape.
401 *
402  desca_1xp( 1 ) = 501
403 *
404  CALL desc_convert( desca, desca_1xp, return_code )
405 *
406  IF( return_code .NE. 0) THEN
407  info = -( 6*100 + 2 )
408  ENDIF
409 *
410 * Get values out of descriptor for use in code.
411 *
412  ictxt = desca_1xp( 2 )
413  csrc = desca_1xp( 5 )
414  nb = desca_1xp( 4 )
415  llda = desca_1xp( 6 )
416  store_n_a = desca_1xp( 3 )
417 *
418 * Get grid parameters
419 *
420 *
421 * Size of separator blocks is maximum of bandwidths
422 *
423  max_bw = max(bwl,bwu)
424  mbw2 = max_bw * max_bw
425 *
426  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
427  np = nprow * npcol
428 *
429 *
430 *
431  IF( lwork .LT. -1) THEN
432  info = -10
433  ELSE IF ( lwork .EQ. -1 ) THEN
434  idum3 = -1
435  ELSE
436  idum3 = 1
437  ENDIF
438 *
439  IF( n .LT. 0 ) THEN
440  info = -1
441  ENDIF
442 *
443  IF( n+ja-1 .GT. store_n_a ) THEN
444  info = -( 6*100 + 6 )
445  ENDIF
446 *
447  IF(( bwl .GT. n-1 ) .OR.
448  $ ( bwl .LT. 0 ) ) THEN
449  info = -2
450  ENDIF
451 *
452  IF(( bwu .GT. n-1 ) .OR.
453  $ ( bwu .LT. 0 ) ) THEN
454  info = -3
455  ENDIF
456 *
457  IF( llda .LT. (bwl+bwu+1) ) THEN
458  info = -( 6*100 + 6 )
459  ENDIF
460 *
461  IF( nb .LE. 0 ) THEN
462  info = -( 6*100 + 4 )
463  ENDIF
464 *
465 * Argument checking that is specific to Divide & Conquer routine
466 *
467  IF( nprow .NE. 1 ) THEN
468  info = -( 6*100+2 )
469  ENDIF
470 *
471  IF( n .GT. np*nb-mod( ja-1, nb )) THEN
472  info = -( 1 )
473  CALL pxerbla( ictxt,
474  $ 'PCDBTRF, D&C alg.: only 1 block per proc',
475  $ -info )
476  RETURN
477  ENDIF
478 *
479  IF((ja+n-1.GT.nb) .AND. ( nb.LT.2*max(bwl,bwu) )) THEN
480  info = -( 6*100+4 )
481  CALL pxerbla( ictxt,
482  $ 'PCDBTRF, D&C alg.: NB too small',
483  $ -info )
484  RETURN
485  ENDIF
486 *
487 *
488 * Check auxiliary storage size
489 *
490  laf_min = nb*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu)
491 *
492  IF( laf .LT. laf_min ) THEN
493  info = -8
494 * put minimum value of laf into AF( 1 )
495  af( 1 ) = laf_min
496  CALL pxerbla( ictxt,
497  $ 'PCDBTRF: auxiliary storage error ',
498  $ -info )
499  RETURN
500  ENDIF
501 *
502 * Check worksize
503 *
504  work_size_min = max(bwl,bwu)*max(bwl,bwu)
505 *
506  work( 1 ) = work_size_min
507 *
508  IF( lwork .LT. work_size_min ) THEN
509  IF( lwork .NE. -1 ) THEN
510  info = -10
511  CALL pxerbla( ictxt,
512  $ 'PCDBTRF: worksize error ',
513  $ -info )
514  ENDIF
515  RETURN
516  ENDIF
517 *
518 * Pack params and positions into arrays for global consistency check
519 *
520  param_check( 9, 1 ) = desca(5)
521  param_check( 8, 1 ) = desca(4)
522  param_check( 7, 1 ) = desca(3)
523  param_check( 6, 1 ) = desca(1)
524  param_check( 5, 1 ) = ja
525  param_check( 4, 1 ) = bwu
526  param_check( 3, 1 ) = bwl
527  param_check( 2, 1 ) = n
528  param_check( 1, 1 ) = idum3
529 *
530  param_check( 9, 2 ) = 605
531  param_check( 8, 2 ) = 604
532  param_check( 7, 2 ) = 603
533  param_check( 6, 2 ) = 601
534  param_check( 5, 2 ) = 5
535  param_check( 4, 2 ) = 3
536  param_check( 3, 2 ) = 2
537  param_check( 2, 2 ) = 1
538  param_check( 1, 2 ) = 10
539 *
540 * Want to find errors with MIN( ), so if no error, set it to a big
541 * number. If there already is an error, multiply by the
542 * descriptor multiplier.
543 *
544  IF( info.GE.0 ) THEN
545  info = bignum
546  ELSE IF( info.LT.-descmult ) THEN
547  info = -info
548  ELSE
549  info = -info * descmult
550  END IF
551 *
552 * Check consistency across processors
553 *
554  CALL globchk( ictxt, 9, param_check, 9,
555  $ param_check( 1, 3 ), info )
556 *
557 * Prepare output: set info = 0 if no error, and divide by DESCMULT
558 * if error is not in a descriptor entry.
559 *
560  IF( info.EQ.bignum ) THEN
561  info = 0
562  ELSE IF( mod( info, descmult ) .EQ. 0 ) THEN
563  info = -info / descmult
564  ELSE
565  info = -info
566  END IF
567 *
568  IF( info.LT.0 ) THEN
569  CALL pxerbla( ictxt, 'PCDBTRF', -info )
570  RETURN
571  END IF
572 *
573 * Quick return if possible
574 *
575  IF( n.EQ.0 )
576  $ RETURN
577 *
578 *
579 * Adjust addressing into matrix space to properly get into
580 * the beginning part of the relevant data
581 *
582  part_offset = nb*( (ja-1)/(npcol*nb) )
583 *
584  IF ( (mycol-csrc) .LT. (ja-part_offset-1)/nb ) THEN
585  part_offset = part_offset + nb
586  ENDIF
587 *
588  IF ( mycol .LT. csrc ) THEN
589  part_offset = part_offset - nb
590  ENDIF
591 *
592 * Form a new BLACS grid (the "standard form" grid) with only procs
593 * holding part of the matrix, of size 1xNP where NP is adjusted,
594 * starting at csrc=0, with JA modified to reflect dropped procs.
595 *
596 * First processor to hold part of the matrix:
597 *
598  first_proc = mod( ( ja-1 )/nb+csrc, npcol )
599 *
600 * Calculate the new JA while dropping off unused processors.
601 *
602  ja_new = mod( ja-1, nb ) + 1
603 *
604 * Save and compute new value of NP
605 *
606  np_save = np
607  np = ( ja_new+n-2 )/nb + 1
608 *
609 * Call utility routine that forms "standard-form" grid
610 *
611  CALL reshape( ictxt, int_one, ictxt_new, int_one,
612  $ first_proc, int_one, np )
613 *
614 * Use new context from standard grid as context.
615 *
616  ictxt_save = ictxt
617  ictxt = ictxt_new
618  desca_1xp( 2 ) = ictxt_new
619 *
620 * Get information about new grid.
621 *
622  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
623 *
624 * Drop out processors that do not have part of the matrix.
625 *
626  IF( myrow .LT. 0 ) THEN
627  GOTO 1234
628  ENDIF
629 *
630 * ********************************
631 * Values reused throughout routine
632 *
633 * User-input value of partition size
634 *
635  part_size = nb
636 *
637 * Number of columns in each processor
638 *
639  my_num_cols = numroc( n, part_size, mycol, 0, npcol )
640 *
641 * Offset in columns to beginning of main partition in each proc
642 *
643  IF ( mycol .EQ. 0 ) THEN
644  part_offset = part_offset+mod( ja_new-1, part_size )
645  my_num_cols = my_num_cols - mod(ja_new-1, part_size )
646  ENDIF
647 *
648 * Offset in elements
649 *
650  ofst = part_offset*llda
651 *
652 * Size of main (or odd) partition in each processor
653 *
654  odd_size = my_num_cols
655  IF ( mycol .LT. np-1 ) THEN
656  odd_size = odd_size - max_bw
657  ENDIF
658 *
659 * Offset to workspace for Upper triangular factor
660 *
661  work_u = bwu*odd_size + 3*mbw2
662 *
663 *
664 * Zero out space for fillin
665 *
666  DO 10 i=1, laf_min
667  af( i ) = czero
668  10 CONTINUE
669 *
670 * Zero out space for work
671 *
672  DO 20 i=1, work_size_min
673  work( i ) = czero
674  20 CONTINUE
675 *
676 * Begin main code
677 *
678 *
679 ********************************************************************
680 * PHASE 1: Local computation phase.
681 ********************************************************************
682 *
683 *
684 * Sizes of the extra triangles communicated between processors
685 *
686  IF( mycol .GT. 0 ) THEN
687  prev_tri_size_m= min( bwl,
688  $ numroc( n, part_size, mycol, 0, npcol ) )
689  prev_tri_size_n=min( bwl,
690  $ numroc( n, part_size, mycol-1, 0, npcol ) )
691  ENDIF
692 *
693  IF( mycol .GT. 0 ) THEN
694  up_prev_tri_size_m= min( bwu,
695  $ numroc( n, part_size, mycol, 0, npcol ) )
696  up_prev_tri_size_n=min( bwu,
697  $ numroc( n, part_size, mycol-1, 0, npcol ) )
698  ENDIF
699 *
700  IF( mycol .LT. npcol-1 ) THEN
701  next_tri_size_m=min( bwl,
702  $ numroc( n, part_size, mycol+1, 0, npcol ) )
703  next_tri_size_n= min( bwl,
704  $ numroc( n, part_size, mycol, 0, npcol ) )
705  ENDIF
706 *
707  IF ( mycol .LT. np-1 ) THEN
708 * Transfer last triangle D_i of local matrix to next processor
709 * which needs it to calculate fillin due to factorization of
710 * its main (odd) block A_i.
711 * Overlap the send with the factorization of A_i.
712 *
713  CALL ctrsd2d( ictxt, 'U', 'N', next_tri_size_m,
714  $ next_tri_size_n,
715  $ a( ofst+(my_num_cols-bwl)*llda+(bwl+bwu+1) ),
716  $ llda-1, 0, mycol+1 )
717 *
718  ENDIF
719 *
720 *
721 * Factor main partition A_i = L_i {U_i} in each processor
722 *
723  CALL cdbtrf( odd_size, odd_size, bwl, bwu, a( ofst + 1),
724  $ llda, info )
725 *
726  IF( info.NE.0 ) THEN
727  info = mycol+1
728  GOTO 1500
729  ENDIF
730 *
731 *
732  IF ( mycol .LT. np-1 ) THEN
733 *
734 * Apply factorization to lower connection block BL_i
735 * conjugate transpose the connection block in preparation.
736 * Apply factorization to upper connection block BU_i
737 * Move the connection block in preparation.
738 *
739  CALL clatcpy( 'U', bwl, bwl,
740  $ a(( ofst+(bwl+bwu+1)+(odd_size-bwl)*llda )),
741  $ llda-1, af( odd_size*bwu+2*mbw2+1+max_bw-bwl ),
742  $ max_bw )
743  CALL clamov( 'L', bwu, bwu, a( ( ofst+1+odd_size*llda ) ),
744  $ llda-1,
745  $ af( work_u+odd_size*bwl+2*mbw2+1+max_bw-bwu ),
746  $ max_bw )
747 *
748 * Perform the triangular system solve {L_i}{{BU'}_i} = {B_i}
749 *
750  CALL ctbtrs( 'L', 'N', 'U', bwu, bwl, bwu,
751  $ a( ofst+bwu+1+(odd_size-bwu )*llda ), llda,
752  $ af( work_u+odd_size*bwl+2*mbw2+1+max_bw-bwu ),
753  $ max_bw, info )
754 *
755 * Perform the triangular solve {U_i}^C{BL'}_i^C = {BL_i}^C
756 *
757  CALL ctbtrs( 'U', 'C', 'N', bwl, bwu, bwl,
758  $ a( ofst+1+(odd_size-bwl)*llda ), llda,
759  $ af( odd_size*bwu+2*mbw2+1+max_bw-bwl ), max_bw,
760  $ info )
761 *
762 * conjugate transpose resulting block to its location
763 * in main storage.
764 *
765  CALL clatcpy( 'L', bwl, bwl,
766  $ af( odd_size*bwu+2*mbw2+1+max_bw-bwl ), max_bw,
767  $ a(( ofst+(bwl+bwu+1)+(odd_size-bwl)*llda )),
768  $ llda-1 )
769 *
770 * Move the resulting block back to its location in main storage.
771 *
772  CALL clamov( 'L', bwu, bwu,
773  $ af( work_u+odd_size*bwl+2*mbw2+1+max_bw-bwu ),
774  $ max_bw, a(( ofst+1+odd_size*llda )), llda-1 )
775 *
776 *
777 * Compute contribution to diagonal block(s) of reduced system.
778 * {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i}
779 *
780 * The following method uses more flops than necessary but
781 * does not necessitate the writing of a new BLAS routine.
782 *
783 *
784  CALL cgemm( 'C', 'N', max_bw, max_bw, max_bw, -cone ,
785  $ af( odd_size*bwu+2*mbw2+1), max_bw,
786  $ af( work_u+odd_size*bwl+2*mbw2+1), max_bw, cone,
787  $ a( ofst+odd_size*llda+1+bwu ), llda-1 )
788 *
789  ENDIF
790 * End of "if ( MYCOL .lt. NP-1 )..." loop
791 *
792 *
793  1500 CONTINUE
794 * If the processor could not locally factor, it jumps here.
795 *
796  IF ( mycol .NE. 0 ) THEN
797 * Discard temporary matrix stored beginning in
798 * AF( (odd_size+2*bwl, bwu)*bwl, bwu+1 ) and use for
799 * off_diagonal block of reduced system.
800 *
801 * Receive previously transmitted matrix section, which forms
802 * the right-hand-side for the triangular solve that calculates
803 * the "spike" fillin.
804 *
805 *
806  CALL ctrrv2d( ictxt, 'U', 'N', prev_tri_size_m,
807  $ prev_tri_size_n, af( work_u+1 ), odd_size, 0,
808  $ mycol-1 )
809 *
810  IF (info.EQ.0) THEN
811 *
812 * Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ .
813 *
814  CALL ctbtrs( 'L', 'N', 'U', odd_size, bwl, bwl,
815  $ a( ofst + bwu+1 ), llda, af( work_u+1 ),
816  $ odd_size, info )
817 *
818 *
819 * Calculate the "spike" fillin, ${U_i}^C {{GL}_i}^C = {DU_i}^C$
820 *
821 *
822 * Copy D block into AF storage for solve.
823 *
824  CALL clatcpy( 'L', up_prev_tri_size_n, up_prev_tri_size_m,
825  $ a( ofst+1 ), llda-1, af( 1 ), odd_size )
826 *
827  CALL ctbtrs( 'U', 'C', 'N', odd_size, bwu, bwu,
828  $ a( ofst + 1 ), llda,
829  $ af( 1 ), odd_size, info )
830 *
831 *
832 * Calculate the update block for previous proc, E_i = GL_i{GU_i}
833 *
834 *
835 * Zero out space in case result is smaller than storage block
836 *
837  DO 30 i=1, mbw2
838  af( odd_size*bwu+2*mbw2+i ) = czero
839  30 CONTINUE
840 *
841  CALL cgemm( 'C', 'N', bwu, bwl, odd_size,
842  $ -cone, af( 1 ), odd_size,
843  $ af( work_u+1 ), odd_size, czero,
844  $ af( 1+max(0,bwl-bwu)+odd_size*bwu+
845  $ (2*max_bw+max(0,bwu-bwl))*max_bw),
846  $ max_bw )
847 *
848 *
849 * Initiate send of E_i to previous processor to overlap
850 * with next computation.
851 *
852  CALL cgesd2d( ictxt, max_bw, max_bw,
853  $ af( odd_size*bwu+2*mbw2+1 ), max_bw, 0,
854  $ mycol-1 )
855 *
856 *
857  IF ( mycol .LT. np-1 ) THEN
858 *
859 * Calculate off-diagonal block(s) of reduced system.
860 * Note: for ease of use in solution of reduced system, store
861 * L's off-diagonal block in conjugate transpose form.
862 *
863 * Copy matrix HU_i (the last bwl rows of GU_i) to AFL storage
864 * as per requirements of BLAS routine CTRMM.
865 * Since we have GU_i stored,
866 * conjugate transpose HU_i to HU_i^C.
867 *
868  CALL clatcpy( 'N', bwl, bwl,
869  $ af( work_u+odd_size-bwl+1 ), odd_size,
870  $ af( (odd_size)*bwu+1+(max_bw-bwl) ),
871  $ max_bw )
872 *
873  CALL ctrmm( 'R', 'U', 'C', 'N', bwl, bwl, -cone,
874  $ a( ( ofst+(bwl+bwu+1)+(odd_size-bwl)*llda ) ),
875  $ llda-1, af( (odd_size)*bwu+1+(max_bw-bwl) ),
876  $ max_bw )
877 *
878 *
879 * Copy matrix HL_i (the last bwu rows of GL_i^C) to AFU store
880 * as per requirements of BLAS routine CTRMM.
881 * Since we have GL_i^C stored,
882 * conjugate transpose HL_i^C to HL_i.
883 *
884  CALL clatcpy( 'N', bwu, bwu,
885  $ af( odd_size-bwu+1 ), odd_size,
886  $ af( work_u+(odd_size)*bwl+1+max_bw-bwu ),
887  $ max_bw )
888 *
889  CALL ctrmm( 'R', 'L', 'N', 'N', bwu, bwu, -cone,
890  $ a( ( ofst+1+odd_size*llda ) ), llda-1,
891  $ af( work_u+(odd_size)*bwl+1+max_bw-bwu ),
892  $ max_bw )
893 *
894  ENDIF
895 *
896  ENDIF
897 * End of "if ( MYCOL .ne. 0 )..."
898 *
899  ENDIF
900 * End of "if (info.eq.0) then"
901 *
902 *
903 * Check to make sure no processors have found errors
904 *
905  CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info,
906  $ -1, 0, 0 )
907 *
908  IF( mycol.EQ.0 ) THEN
909  CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
910  ELSE
911  CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
912  ENDIF
913 *
914  IF ( info.NE.0 ) THEN
915  GOTO 1000
916  ENDIF
917 * No errors found, continue
918 *
919 *
920 ********************************************************************
921 * PHASE 2: Formation and factorization of Reduced System.
922 ********************************************************************
923 *
924 * Gather up local sections of reduced system
925 *
926 *
927 * The last processor does not participate in the factorization of
928 * the reduced system, having sent its E_i already.
929  IF( mycol .EQ. npcol-1 ) THEN
930  GOTO 14
931  ENDIF
932 *
933 * Initiate send of off-diag block(s) to overlap with next part.
934 * Off-diagonal block needed on neighboring processor to start
935 * algorithm.
936 *
937  IF( (mod( mycol+1, 2 ) .EQ. 0) .AND. ( mycol .GT. 0 ) ) THEN
938 *
939  CALL cgesd2d( ictxt, max_bw, max_bw,
940  $ af( odd_size*bwu+1 ),
941  $ max_bw, 0, mycol-1 )
942 *
943  CALL cgesd2d( ictxt, max_bw, max_bw,
944  $ af( work_u+odd_size*bwl+1 ),
945  $ max_bw, 0, mycol-1 )
946 *
947  ENDIF
948 *
949 * Copy last diagonal block into AF storage for subsequent
950 * operations.
951 *
952  CALL clamov( 'N', max_bw, max_bw,
953  $ a( ofst+odd_size*llda+bwu+1 ),
954  $ llda-1, af( odd_size*bwu+mbw2+1 ),
955  $ max_bw )
956 *
957 * Receive contribution to the diagonal block that is stored on this proc.
958 *
959  IF( mycol.LT. npcol-1 ) THEN
960 *
961  CALL cgerv2d( ictxt, max_bw, max_bw,
962  $ af( odd_size*bwu+2*mbw2+1 ),
963  $ max_bw, 0,
964  $ mycol+1 )
965 *
966 * Add contribution to diagonal block
967 *
968  CALL caxpy( mbw2, cone,
969  $ af( odd_size*bwu+2*mbw2+1 ),
970  $ 1, af( odd_size*bwu+mbw2+1 ), 1 )
971 *
972  ENDIF
973 *
974 *
975 * *************************************
976 * Modification Loop
977 *
978 * The distance for sending and receiving for each level starts
979 * at 1 for the first level.
980  level_dist = 1
981 *
982 * Do until this proc is needed to modify other procs' equations
983 *
984  12 CONTINUE
985  IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 11
986 *
987 * Receive and add contribution to diagonal block from the left
988 *
989  IF( mycol-level_dist .GE. 0 ) THEN
990  CALL cgerv2d( ictxt, max_bw, max_bw, work( 1 ),
991  $ max_bw, 0, mycol-level_dist )
992 *
993  CALL caxpy( mbw2, cone, work( 1 ), 1,
994  $ af( odd_size*bwu+mbw2+1 ), 1 )
995 *
996  ENDIF
997 *
998 * Receive and add contribution to diagonal block from the right
999 *
1000  IF( mycol+level_dist .LT. npcol-1 ) THEN
1001  CALL cgerv2d( ictxt, max_bw, max_bw, work( 1 ),
1002  $ max_bw, 0, mycol+level_dist )
1003 *
1004  CALL caxpy( mbw2, cone, work( 1 ), 1,
1005  $ af( odd_size*bwu+mbw2+1 ), 1 )
1006 *
1007  ENDIF
1008 *
1009  level_dist = level_dist*2
1010 *
1011  GOTO 12
1012  11 CONTINUE
1013 * [End of GOTO Loop]
1014 *
1015 *
1016 * *********************************
1017 * Calculate and use this proc's blocks to modify other procs'...
1018 *
1019 * Factor diagonal block
1020 *
1021  CALL cdbtrf( max_bw, max_bw, min( max_bw-1, bwl ),
1022  $ min( max_bw-1, bwu ), af( odd_size*bwu+mbw2+1
1023  $ -( min( max_bw-1, bwu ))), max_bw+1, info )
1024 *
1025  IF( info.NE.0 ) THEN
1026  info = npcol + mycol
1027  ENDIF
1028 *
1029 * ****************************************************************
1030 * Receive offdiagonal block from processor to right.
1031 * If this is the first group of processors, the receive comes
1032 * from a different processor than otherwise.
1033 *
1034  IF( level_dist .EQ. 1 )THEN
1035  comm_proc = mycol + 1
1036 *
1037 * Move the block into the place where it will be expected to be
1038 * for later calculations.
1039 *
1040  CALL clamov( 'N', max_bw, max_bw, af( odd_size*bwu+1 ),
1041  $ max_bw, af( work_u+odd_size*bwl+2*mbw2+1 ),
1042  $ max_bw )
1043 *
1044  CALL clamov( 'N', max_bw, max_bw, af( work_u+odd_size*bwl+1 ),
1045  $ max_bw, af( odd_size*bwu+2*mbw2+1 ), max_bw )
1046 *
1047  ELSE
1048  comm_proc = mycol + level_dist/2
1049  ENDIF
1050 *
1051  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
1052 *
1053  CALL cgerv2d( ictxt, max_bw, max_bw,
1054  $ af( odd_size*bwu+1 ),
1055  $ max_bw, 0, comm_proc )
1056 *
1057  CALL cgerv2d( ictxt, max_bw, max_bw,
1058  $ af( work_u+odd_size*bwl+1 ),
1059  $ max_bw, 0, comm_proc )
1060 *
1061  IF( info .EQ. 0 ) THEN
1062 *
1063 *
1064 * Modify upper off_diagonal block with diagonal block
1065 *
1066 *
1067  CALL ctbtrs(
1068  $ 'L', 'N', 'U', bwu, min( bwl, bwu-1 ), bwu,
1069  $ af( odd_size*bwu+
1070  $ mbw2+1+(max_bw+1)*(max_bw-bwu)), max_bw+1,
1071  $ af( work_u+odd_size*bwl+1+max_bw-bwu), max_bw, info )
1072 *
1073 * Modify lower off_diagonal block with diagonal block
1074 *
1075 *
1076  CALL ctbtrs(
1077  $ 'U', 'C', 'N', bwl, min( bwu, bwl-1 ), bwl,
1078  $ af( odd_size*bwu+
1079  $ mbw2+1-min( bwu, bwl-1 )+(max_bw+1)*(max_bw-bwl)), max_bw+1,
1080  $ af( odd_size*bwu+1+max_bw-bwl), max_bw, info )
1081 *
1082  ENDIF
1083 * End of "if ( info.eq.0 ) then"
1084 *
1085 * Calculate contribution from this block to next diagonal block
1086 *
1087  CALL cgemm( 'C', 'N', max_bw, max_bw, max_bw, -cone,
1088  $ af( (odd_size)*bwu+1 ), max_bw,
1089  $ af( work_u+(odd_size)*bwl+1 ), max_bw, czero,
1090  $ work( 1 ), max_bw )
1091 *
1092 * Send contribution to diagonal block's owning processor.
1093 *
1094  CALL cgesd2d( ictxt, max_bw, max_bw, work( 1 ), max_bw,
1095  $ 0, mycol+level_dist )
1096 *
1097  ENDIF
1098 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1099 *
1100 *
1101 * ****************************************************************
1102 * Receive off_diagonal block from left and use to finish with this
1103 * processor.
1104 *
1105  IF( (mycol/level_dist .GT. 0 ).AND.
1106  $ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
1107 *
1108  IF( level_dist .GT. 1)THEN
1109 *
1110 * Receive offdiagonal block(s) from proc level_dist/2 to the
1111 * left
1112 *
1113  CALL cgerv2d( ictxt, max_bw, max_bw,
1114  $ af( work_u+odd_size*bwl+2*mbw2+1 ),
1115  $ max_bw, 0, mycol-level_dist/2 )
1116 *
1117 * Receive offdiagonal block(s) from proc level_dist/2 to the
1118 * left
1119 *
1120  CALL cgerv2d( ictxt, max_bw, max_bw,
1121  $ af( odd_size*bwu+2*mbw2+1 ),
1122  $ max_bw, 0, mycol-level_dist/2 )
1123 *
1124  ENDIF
1125 *
1126 *
1127  IF( info.EQ.0 ) THEN
1128 *
1129 * Use diagonal block(s) to modify this offdiagonal block
1130 *
1131 *
1132 * Since CTBTRS has no "left-right" option, we must transpose
1133 *
1134  CALL clatcpy( 'N', max_bw, max_bw, af( work_u+odd_size*bwl+
1135  $ 2*mbw2+1), max_bw, work( 1 ), max_bw )
1136 *
1137  CALL ctbtrs(
1138  $ 'L', 'N', 'U', max_bw, min( bwl, max_bw-1 ), bwl,
1139  $ af( odd_size*bwu+mbw2+1), max_bw+1,
1140  $ work( 1+max_bw*(max_bw-bwl) ), max_bw, info )
1141 *
1142 * Transpose back
1143 *
1144  CALL clatcpy(
1145  $ 'N', max_bw, max_bw, work( 1 ), max_bw,
1146  $ af( work_u+odd_size*bwl+
1147  $ 2*mbw2+1), max_bw )
1148 *
1149 *
1150 *
1151 * Since CTBTRS has no "left-right" option, we must transpose
1152 *
1153  CALL clatcpy( 'N', max_bw, max_bw, af( odd_size*bwu+
1154  $ 2*mbw2+1), max_bw, work( 1 ), max_bw )
1155 *
1156  CALL ctbtrs(
1157  $ 'U', 'C', 'N', max_bw, min( bwu, max_bw-1 ), bwu,
1158  $ af( odd_size*bwu+mbw2+1-min( bwu, max_bw-1 )), max_bw+1,
1159  $ work( 1+max_bw*(max_bw-bwu) ), max_bw, info )
1160 *
1161 * Transpose back
1162 *
1163  CALL clatcpy(
1164  $ 'N', max_bw, max_bw, work( 1 ), max_bw,
1165  $ af( odd_size*bwu+
1166  $ 2*mbw2+1), max_bw )
1167 *
1168 *
1169  ENDIF
1170 * End of "if( info.eq.0 ) then"
1171 *
1172 * Use offdiag block(s) to calculate modification to diag block
1173 * of processor to the left
1174 *
1175  CALL cgemm( 'N', 'C', max_bw, max_bw, max_bw, -cone,
1176  $ af( (odd_size)*bwu+2*mbw2+1 ), max_bw,
1177  $ af( work_u+(odd_size)*bwl+2*mbw2+1 ), max_bw,
1178  $ czero, work( 1 ), max_bw )
1179 *
1180 * Send contribution to diagonal block's owning processor.
1181 *
1182  CALL cgesd2d( ictxt, max_bw, max_bw, work( 1 ), max_bw,
1183  $ 0, mycol-level_dist )
1184 *
1185 * *******************************************************
1186 *
1187  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 ) THEN
1188 *
1189 * Decide which processor offdiagonal block(s) goes to
1190 *
1191  IF( ( mod( mycol/( 2*level_dist ),2 )) .EQ.0 ) THEN
1192  comm_proc = mycol + level_dist
1193  ELSE
1194  comm_proc = mycol - level_dist
1195  ENDIF
1196 *
1197 * Use offdiagonal blocks to calculate offdiag
1198 * block to send to neighboring processor. Depending
1199 * on circumstances, may need to transpose the matrix.
1200 *
1201  CALL cgemm( 'N', 'N', max_bw, max_bw, max_bw, -cone,
1202  $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw,
1203  $ af( odd_size*bwu+1 ), max_bw, czero,
1204  $ work( 1 ), max_bw )
1205 *
1206 * Send contribution to offdiagonal block's owning processor.
1207 *
1208  CALL cgesd2d( ictxt, max_bw, max_bw, work( 1 ), max_bw,
1209  $ 0, comm_proc )
1210 *
1211  CALL cgemm( 'N', 'N', max_bw, max_bw, max_bw, -cone,
1212  $ af( odd_size*bwu+2*mbw2+1 ), max_bw,
1213  $ af( work_u+odd_size*bwl+1 ), max_bw, czero,
1214  $ work( 1 ), max_bw )
1215 *
1216 * Send contribution to offdiagonal block's owning processor.
1217 *
1218  CALL cgesd2d( ictxt, max_bw, max_bw, work( 1 ), max_bw,
1219  $ 0, comm_proc )
1220 *
1221  ENDIF
1222 *
1223  ENDIF
1224 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1225 *
1226  14 CONTINUE
1227 *
1228 *
1229  1000 CONTINUE
1230 *
1231 *
1232 * Free BLACS space used to hold standard-form grid.
1233 *
1234  IF( ictxt_save .NE. ictxt_new ) THEN
1235  CALL blacs_gridexit( ictxt_new )
1236  ENDIF
1237 *
1238  1234 CONTINUE
1239 *
1240 * Restore saved input parameters
1241 *
1242  ictxt = ictxt_save
1243  np = np_save
1244 *
1245 * Output minimum worksize
1246 *
1247  work( 1 ) = work_size_min
1248 *
1249 * Make INFO consistent across processors
1250 *
1251  CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info,
1252  $ -1, 0, 0 )
1253 *
1254  IF( mycol.EQ.0 ) THEN
1255  CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
1256  ELSE
1257  CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
1258  ENDIF
1259 *
1260 *
1261  RETURN
1262 *
1263 * End of PCDBTRF
1264 *
1265  END