ScaLAPACK 2.1
ScaLAPACK: Scalable Linear Algebra PACKage
pcpttrf.f
1  SUBROUTINE pcpttrf( N, D, E, JA, DESCA, AF, LAF, WORK, LWORK,
2  $ INFO )
3 *
4 *
5 *
6 * -- ScaLAPACK routine (version 1.7) --
7 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
8 * and University of California, Berkeley.
9 * May 25, 2001
10 *
11 * .. Scalar Arguments ..
12  INTEGER INFO, JA, LAF, LWORK, N
13 * ..
14 * .. Array Arguments ..
15  INTEGER DESCA( * )
16  COMPLEX AF( * ), E( * ), WORK( * )
17  REAL D( * )
18 * ..
19 *
20 *
21 * Purpose
22 * =======
23 *
24 * PCPTTRF computes a Cholesky factorization
25 * of an N-by-N complex tridiagonal
26 * Hermitian positive definite distributed matrix
27 * A(1:N, JA:JA+N-1).
28 * Reordering is used to increase parallelism in the factorization.
29 * This reordering results in factors that are DIFFERENT from those
30 * produced by equivalent sequential codes. These factors cannot
31 * be used directly by users; however, they can be used in
32 * subsequent calls to PCPTTRS to solve linear systems.
33 *
34 * The factorization has the form
35 *
36 * P A(1:N, JA:JA+N-1) P^T = U' D U or
37 *
38 * P A(1:N, JA:JA+N-1) P^T = L D L',
39 *
40 * where U is a tridiagonal upper triangular (upper bidiagonal) matrix,
41 * L is tridiagonal lower triangular, and P is a permutation matrix.
42 *
43 * =====================================================================
44 *
45 * Arguments
46 * =========
47 *
48 *
49 * N (global input) INTEGER
50 * The number of rows and columns to be operated on, i.e. the
51 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
52 *
53 * D (local input/local output) REAL pointer to local
54 * part of global vector storing the main diagonal of the
55 * matrix.
56 * On exit, this array contains information on the
57 * factors of the matrix.
58 * Must be of size >= DESCA( NB_ ).
59 *
60 * E (local input/local output) COMPLEX pointer to local
61 * part of global vector storing the upper diagonal of the
62 * matrix. Globally, E(N) is not referenced, and E must be
63 * aligned with D.
64 * On exit, this array contains information on the
65 * factors of the matrix.
66 * Must be of size >= DESCA( NB_ ).
67 *
68 * JA (global input) INTEGER
69 * The index in the global array A that points to the start of
70 * the matrix to be operated on (which may be either all of A
71 * or a submatrix of A).
72 *
73 * DESCA (global and local input) INTEGER array of dimension DLEN.
74 * if 1D type (DTYPE_A=501 or 502), DLEN >= 7;
75 * if 2D type (DTYPE_A=1), DLEN >= 9.
76 * The array descriptor for the distributed matrix A.
77 * Contains information of mapping of A to memory. Please
78 * see NOTES below for full description and options.
79 *
80 * AF (local output) COMPLEX array, dimension LAF.
81 * Auxiliary Fillin Space.
82 * Fillin is created during the factorization routine
83 * PCPTTRF and this is stored in AF. If a linear system
84 * is to be solved using PCPTTRS after the factorization
85 * routine, AF *must not be altered* after the factorization.
86 *
87 * LAF (local input) INTEGER
88 * Size of user-input Auxiliary Fillin space AF. Must be >=
89 * (12*NPCOL + 3*NB)
90 * If LAF is not large enough, an error code will be returned
91 * and the minimum acceptable size will be returned in AF( 1 )
92 *
93 * WORK (local workspace/local output)
94 * COMPLEX temporary workspace. This space may
95 * be overwritten in between calls to routines. WORK must be
96 * the size given in LWORK.
97 * On exit, WORK( 1 ) contains the minimal LWORK.
98 *
99 * LWORK (local input or global input) INTEGER
100 * Size of user-input workspace WORK.
101 * If LWORK is too small, the minimal acceptable size will be
102 * returned in WORK(1) and an error code is returned. LWORK>=
103 * 8*NPCOL
104 *
105 * INFO (local output) INTEGER
106 * = 0: successful exit
107 * < 0: If the i-th argument is an array and the j-th entry had
108 * an illegal value, then INFO = -(i*100+j); if the i-th
109 * argument is a scalar and had an illegal value, then
110 * INFO = -i.
111 * > 0: If INFO = K<=NPROCS, the submatrix stored on processor
112 * K and factored locally was not
113 * positive definite, and
114 * the factorization was not completed.
115 * If INFO = K>NPROCS, the submatrix stored on processor
116 * K-NPROCS representing interactions with other
117 * processors was not
118 * positive definite,
119 * and the factorization was not completed.
120 *
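*  A minimal call sequence, sketched here for illustration only (the
*  grid, descriptor and matrix setup are assumed to be done elsewhere;
*  UPLO, B, IB, DESCB, NRHS and the solve workspace WORK2/LWORK2
*  belong to PCPTTRS and are assumptions, not part of this routine):
*
*     LAF   = 12*NPCOL + 3*NB
*     LWORK = 8*NPCOL
*     CALL PCPTTRF( N, D, E, JA, DESCA, AF, LAF, WORK, LWORK, INFO )
*     IF( INFO.EQ.0 ) THEN
*        CALL PCPTTRS( UPLO, N, NRHS, D, E, JA, DESCA, B, IB, DESCB,
*    $                 AF, LAF, WORK2, LWORK2, INFO )
*     END IF
*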
121 * =====================================================================
122 *
123 *
124 * Restrictions
125 * ============
126 *
127 * The following are restrictions on the input parameters. Some of these
128 * are temporary and will be removed in future releases, while others
129 * may reflect fundamental technical limitations.
130 *
131 * Non-cyclic restriction: VERY IMPORTANT!
132 * P*NB>= mod(JA-1,NB)+N.
133 * The mapping for matrices must be blocked, reflecting the nature
134 * of the divide and conquer algorithm as a task-parallel algorithm.
135 * This formula in words is: no processor may have more than one
136 * chunk of the matrix.
137 *
138 * Blocksize cannot be too small:
139 * If the matrix spans more than one processor, the following
140 * restriction on NB, the size of each block on each processor,
141 * must hold:
142 * NB >= 2
143 * The bulk of parallel computation is done on the matrix of size
144 * O(NB) on each processor. If this is too small, divide and conquer
145 * is a poor choice of algorithm.
146 *
147 * Submatrix reference:
148 * JA = IB
149 * Alignment restriction that prevents unnecessary communication.
150 *
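*  For example, the first two restrictions above can be verified
*  before the call with a check of the following form (a sketch; the
*  error codes mirror the checks this routine performs internally):
*
*     IF( N .GT. NPCOL*NB - MOD( JA-1, NB ) ) THEN
*        INFO = -1
*     ELSE IF( ( JA+N-1 .GT. NB ) .AND. ( NB .LT. 2 ) ) THEN
*        INFO = -( 5*100+4 )
*     END IF
*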
151 *
152 * =====================================================================
153 *
154 *
155 * Notes
156 * =====
157 *
158 * If the factorization routine and the solve routine are to be called
159 * separately (to solve various sets of right-hand sides using the same
160 * coefficient matrix), the auxiliary space AF *must not be altered*
161 * between calls to the factorization routine and the solve routine.
162 *
163 * The best algorithm for solving banded and tridiagonal linear systems
164 * depends on a variety of parameters, especially the bandwidth.
165 * Currently, only algorithms designed for the case N/P >> bw are
166 * implemented. These go by many names, including Divide and Conquer,
167 * Partitioning, domain decomposition-type, etc.
168 * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C
169 * algorithms are the appropriate choice.
170 *
171 * Algorithm description: Divide and Conquer
172 *
173 * The Divide and Conquer algorithm assumes the matrix is narrowly
174 * banded compared with the number of equations. In this situation,
175 * it is best to distribute the input matrix A one-dimensionally,
176 * with columns atomic and rows divided amongst the processes.
177 * The basic algorithm divides the tridiagonal matrix up into
178 * P pieces with one stored on each processor,
179 * and then proceeds in 2 phases for the factorization or 3 for the
180 * solution of a linear system.
181 * 1) Local Phase:
182 * The individual pieces are factored independently and in
183 * parallel. These factors are applied to the matrix creating
184 * fillin, which is stored in a non-inspectable way in auxiliary
185 * space AF. Mathematically, this is equivalent to reordering
186 * the matrix A as P A P^T and then factoring the principal
187 * leading submatrix of size equal to the sum of the sizes of
188 * the matrices factored on each processor. The factors of
189 * these submatrices overwrite the corresponding parts of A
190 * in memory.
191 * 2) Reduced System Phase:
192 * A small (P-1)-by-(P-1) system is formed representing
193 * interaction of the larger blocks, and is stored (as are its
194 * factors) in the space AF. A parallel Block Cyclic Reduction
195 * algorithm is used. For a linear system, a parallel front solve
196 * followed by an analogous backsolve, both using the structure
197 * of the factored matrix, are performed.
198 * 3) Backsubstitution Phase:
199 * For a linear system, a local backsubstitution is performed on
200 * each processor in parallel.
201 *
202 *
203 * Descriptors
204 * ===========
205 *
206 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
207 *
208 * Note: tridiagonal codes can use either the old two dimensional
209 * or new one-dimensional descriptors, though the processor grid in
210 * both cases *must be one-dimensional*. We describe both types below.
211 *
212 * Each global data object is described by an associated description
213 * vector. This vector stores the information required to establish
214 * the mapping between an object element and its corresponding process
215 * and memory location.
216 *
217 * Let A be a generic term for any 2D block cyclically distributed array.
218 * Such a global array has an associated description vector DESCA.
219 * In the following comments, the character _ should be read as
220 * "of the global array".
221 *
222 * NOTATION STORED IN EXPLANATION
223 * --------------- -------------- --------------------------------------
224 * DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
225 * DTYPE_A = 1.
226 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
227 * the BLACS process grid A is distribu-
228 * ted over. The context itself is glo-
229 * bal, but the handle (the integer
230 * value) may vary.
231 * M_A (global) DESCA( M_ ) The number of rows in the global
232 * array A.
233 * N_A (global) DESCA( N_ ) The number of columns in the global
234 * array A.
235 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
236 * the rows of the array.
237 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
238 * the columns of the array.
239 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
240 * row of the array A is distributed.
241 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
242 * first column of the array A is
243 * distributed.
244 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
245 * array. LLD_A >= MAX(1,LOCr(M_A)).
246 *
247 * Let K be the number of rows or columns of a distributed matrix,
248 * and assume that its process grid has dimension p x q.
249 * LOCr( K ) denotes the number of elements of K that a process
250 * would receive if K were distributed over the p processes of its
251 * process column.
252 * Similarly, LOCc( K ) denotes the number of elements of K that a
253 * process would receive if K were distributed over the q processes of
254 * its process row.
255 * The values of LOCr() and LOCc() may be determined via a call to the
256 * ScaLAPACK tool function, NUMROC:
257 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
258 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
259 * An upper bound for these quantities may be computed by:
260 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
261 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
262 *
263 *
264 * One-dimensional descriptors:
265 *
266 * One-dimensional descriptors are a new addition to ScaLAPACK since
267 * version 1.0. They simplify and shorten the descriptor for 1D
268 * arrays.
269 *
270 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
271 * object, we allow 1D arrays to be distributed either over the
272 * first dimension of the array (as if the grid were P-by-1) or the
273 * 2nd dimension (as if the grid were 1-by-P). This choice is
274 * indicated by the descriptor type (501 or 502)
275 * as described below.
276 * However, for tridiagonal matrices, since the objects being
277 * distributed are the individual vectors storing the diagonals, we
278 * have adopted the convention that both the P-by-1 descriptor and
279 * the 1-by-P descriptor are allowed and are equivalent for
280 * tridiagonal matrices. Thus, for tridiagonal matrices,
281 * DTYPE_A = 501 or 502 can be used interchangeably
282 * without any other change.
283 * We require that the distributed vectors storing the diagonals of a
284 * tridiagonal matrix be aligned with each other. Because of this, a
285 * single descriptor, DESCA, serves to describe the distribution of
286 * all diagonals simultaneously.
287 *
288 * IMPORTANT NOTE: the actual BLACS grid represented by the
289 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
290 * irrespective of which one-dimensional descriptor type
291 * (501 or 502) is input.
292 * This routine will interpret the grid properly either way.
293 * ScaLAPACK routines *do not support intercontext operations* so that
294 * the grid passed to a single ScaLAPACK routine *must be the same*
295 * for all array descriptors passed to that routine.
296 *
297 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
298 * may also be used, since a one-dimensional array is a special case
299 * of a two-dimensional array with one dimension of size unity.
300 * The two-dimensional array used in this case *must* be of the
301 * proper orientation:
302 * If the appropriate one-dimensional descriptor is DTYPE_A=501
303 * (1 by P type), then the two dimensional descriptor must
304 * have a CTXT value that refers to a 1 by P BLACS grid;
305 * If the appropriate one-dimensional descriptor is DTYPE_A=502
306 * (P by 1 type), then the two dimensional descriptor must
307 * have a CTXT value that refers to a P by 1 BLACS grid.
308 *
309 *
310 * Summary of allowed descriptors, types, and BLACS grids:
311 * DTYPE 501 502 1 1
312 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
313 * -----------------------------------------------------
314 * A OK OK OK NO
315 * B NO OK NO OK
316 *
317 * Let A be a generic term for any 1D block cyclically distributed array.
318 * Such a global array has an associated description vector DESCA.
319 * In the following comments, the character _ should be read as
320 * "of the global array".
321 *
322 * NOTATION STORED IN EXPLANATION
323 * --------------- ---------- ------------------------------------------
324 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
325 * TYPE_A = 501: 1-by-P grid.
326 * TYPE_A = 502: P-by-1 grid.
327 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
328 * the BLACS process grid A is distribu-
329 * ted over. The context itself is glo-
330 * bal, but the handle (the integer
331 * value) may vary.
332 * N_A (global) DESCA( 3 ) The size of the array dimension being
333 * distributed.
334 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
335 * the distributed dimension of the array.
336 * SRC_A (global) DESCA( 5 ) The process row or column over which the
337 * first row or column of the array
338 * is distributed.
339 * Ignored DESCA( 6 ) Ignored for tridiagonal matrices.
340 * Reserved DESCA( 7 ) Reserved for future use.
341 *
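*  For example, a type-501 descriptor for the diagonals of a matrix of
*  global size N, with blocking factor NB, distributed over the BLACS
*  grid ICTXT starting on process column 0, could be filled in as
*  follows (an illustrative sketch; N, NB and ICTXT are assumed to be
*  defined already):
*
*     DESCA( 1 ) = 501
*     DESCA( 2 ) = ICTXT
*     DESCA( 3 ) = N
*     DESCA( 4 ) = NB
*     DESCA( 5 ) = 0
*     DESCA( 6 ) = 0
*     DESCA( 7 ) = 0
*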
342 *
343 *
344 * =====================================================================
345 *
346 * Code Developer: Andrew J. Cleary, University of Tennessee.
347 * Current address: Lawrence Livermore National Labs.
348 * This version released: August, 2001.
349 *
350 * =====================================================================
351 *
352 * ..
353 * .. Parameters ..
354  REAL ONE, ZERO
355  parameter( one = 1.0e+0 )
356  parameter( zero = 0.0e+0 )
357  COMPLEX CONE, CZERO
358  parameter( cone = ( 1.0e+0, 0.0e+0 ) )
359  parameter( czero = ( 0.0e+0, 0.0e+0 ) )
360  INTEGER INT_ONE
361  parameter( int_one = 1 )
362  INTEGER DESCMULT, BIGNUM
363  parameter(descmult = 100, bignum = descmult * descmult)
364  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
365  $ lld_, mb_, m_, nb_, n_, rsrc_
366  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
367  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
368  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
369 * ..
370 * .. Local Scalars ..
371  INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT,
372  $ ictxt_new, ictxt_save, idum3, int_temp, ja_new,
373  $ laf_min, level_dist, llda, mycol, myrow,
374  $ my_num_cols, nb, np, npcol, nprow, np_save,
375  $ odd_size, part_offset, part_size, return_code,
376  $ store_n_a, temp, work_size_min
377 * ..
378 * .. Local Arrays ..
379  INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 7, 3 )
380 * ..
381 * .. External Subroutines ..
382  EXTERNAL blacs_get, blacs_gridexit, blacs_gridinfo,
383  $ caxpy, cgemm, cgerv2d, cgesd2d, clamov,
384  $ clatcpy, cpbtrf, cpotrf, csyrk, ctbtrs, ctrmm,
385  $ ctrrv2d, ctrsd2d, ctrsm, ctrtrs, desc_convert,
386  $ cpttrf, cpttrsv, globchk, pxerbla, reshape
387 * ..
388 * .. External Functions ..
389  LOGICAL LSAME
390  INTEGER NUMROC
391  EXTERNAL lsame, numroc
392 * ..
393 * .. Intrinsic Functions ..
394  INTRINSIC ichar, min, mod
395 * ..
396 * .. Executable Statements ..
397 *
398 * Test the input parameters
399 *
400  info = 0
401 *
402 * Convert descriptor into standard form for easy access to
403 * parameters, check that grid is of right shape.
404 *
405  desca_1xp( 1 ) = 501
406 *
407  temp = desca( dtype_ )
408  IF( temp .EQ. 502 ) THEN
409 * Temporarily set the descriptor type to 1xP type
410  desca( dtype_ ) = 501
411  ENDIF
412 *
413  CALL desc_convert( desca, desca_1xp, return_code )
414 *
415  desca( dtype_ ) = temp
416 *
417  IF( return_code .NE. 0) THEN
418  info = -( 5*100 + 2 )
419  ENDIF
420 *
421 * Get values out of descriptor for use in code.
422 *
423  ictxt = desca_1xp( 2 )
424  csrc = desca_1xp( 5 )
425  nb = desca_1xp( 4 )
426  llda = desca_1xp( 6 )
427  store_n_a = desca_1xp( 3 )
428 *
429 * Get grid parameters
430 *
431 *
432  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
433  np = nprow * npcol
434 *
435 *
436 *
437  IF( lwork .LT. -1) THEN
438  info = -9
439  ELSE IF ( lwork .EQ. -1 ) THEN
440  idum3 = -1
441  ELSE
442  idum3 = 1
443  ENDIF
444 *
445  IF( n .LT. 0 ) THEN
446  info = -1
447  ENDIF
448 *
449  IF( n+ja-1 .GT. store_n_a ) THEN
450  info = -( 5*100 + 6 )
451  ENDIF
452 *
453 * Argument checking that is specific to Divide & Conquer routine
454 *
455  IF( nprow .NE. 1 ) THEN
456  info = -( 5*100+2 )
457  ENDIF
458 *
459  IF( n .GT. np*nb-mod( ja-1, nb )) THEN
460  info = -( 1 )
461  CALL pxerbla( ictxt,
462  $ 'PCPTTRF, D&C alg.: only 1 block per proc',
463  $ -info )
464  RETURN
465  ENDIF
466 *
467  IF((ja+n-1.GT.nb) .AND. ( nb.LT.2*int_one )) THEN
468  info = -( 5*100+4 )
469  CALL pxerbla( ictxt,
470  $ 'PCPTTRF, D&C alg.: NB too small',
471  $ -info )
472  RETURN
473  ENDIF
474 *
475 *
476 * Check auxiliary storage size
477 *
478  laf_min = (12*npcol + 3*nb)
479 *
480  IF( laf .LT. laf_min ) THEN
481  info = -7
482 * put minimum value of laf into AF( 1 )
483  af( 1 ) = laf_min
484  CALL pxerbla( ictxt,
485  $ 'PCPTTRF: auxiliary storage error ',
486  $ -info )
487  RETURN
488  ENDIF
489 *
490 * Check worksize
491 *
492  work_size_min = 8*npcol
493 *
494  work( 1 ) = work_size_min
495 *
496  IF( lwork .LT. work_size_min ) THEN
497  IF( lwork .NE. -1 ) THEN
498  info = -9
499  CALL pxerbla( ictxt,
500  $ 'PCPTTRF: worksize error ',
501  $ -info )
502  ENDIF
503  RETURN
504  ENDIF
505 *
506 * Pack params and positions into arrays for global consistency check
507 *
508  param_check( 7, 1 ) = desca(5)
509  param_check( 6, 1 ) = desca(4)
510  param_check( 5, 1 ) = desca(3)
511  param_check( 4, 1 ) = desca(1)
512  param_check( 3, 1 ) = ja
513  param_check( 2, 1 ) = n
514  param_check( 1, 1 ) = idum3
515 *
516  param_check( 7, 2 ) = 505
517  param_check( 6, 2 ) = 504
518  param_check( 5, 2 ) = 503
519  param_check( 4, 2 ) = 501
520  param_check( 3, 2 ) = 4
521  param_check( 2, 2 ) = 1
522  param_check( 1, 2 ) = 9
523 *
524 * Want to find errors with MIN( ), so if no error, set it to a big
525 * number. If there already is an error, multiply by the
526 * descriptor multiplier.
527 *
528  IF( info.GE.0 ) THEN
529  info = bignum
530  ELSE IF( info.LT.-descmult ) THEN
531  info = -info
532  ELSE
533  info = -info * descmult
534  END IF
535 *
536 * Check consistency across processors
537 *
538  CALL globchk( ictxt, 7, param_check, 7,
539  $ param_check( 1, 3 ), info )
540 *
541 * Prepare output: set info = 0 if no error, and divide by DESCMULT
542 * if error is not in a descriptor entry.
543 *
544  IF( info.EQ.bignum ) THEN
545  info = 0
546  ELSE IF( mod( info, descmult ) .EQ. 0 ) THEN
547  info = -info / descmult
548  ELSE
549  info = -info
550  END IF
551 *
552  IF( info.LT.0 ) THEN
553  CALL pxerbla( ictxt, 'PCPTTRF', -info )
554  RETURN
555  END IF
556 *
557 * Quick return if possible
558 *
559  IF( n.EQ.0 )
560  $ RETURN
561 *
562 *
563 * Adjust addressing into matrix space to properly get into
564 * the beginning part of the relevant data
565 *
566  part_offset = nb*( (ja-1)/(npcol*nb) )
567 *
568  IF ( (mycol-csrc) .LT. (ja-part_offset-1)/nb ) THEN
569  part_offset = part_offset + nb
570  ENDIF
571 *
572  IF ( mycol .LT. csrc ) THEN
573  part_offset = part_offset - nb
574  ENDIF
575 *
576 * Form a new BLACS grid (the "standard form" grid) with only procs
577 * holding part of the matrix, of size 1xNP where NP is adjusted,
578 * starting at csrc=0, with JA modified to reflect dropped procs.
579 *
580 * First processor to hold part of the matrix:
581 *
582  first_proc = mod( ( ja-1 )/nb+csrc, npcol )
583 *
584 * Calculate new JA while dropping off unused processors.
585 *
586  ja_new = mod( ja-1, nb ) + 1
587 *
588 * Save and compute new value of NP
589 *
590  np_save = np
591  np = ( ja_new+n-2 )/nb + 1
592 *
593 * Call utility routine that forms "standard-form" grid
594 *
595  CALL reshape( ictxt, int_one, ictxt_new, int_one,
596  $ first_proc, int_one, np )
597 *
598 * Use new context from standard grid as context.
599 *
600  ictxt_save = ictxt
601  ictxt = ictxt_new
602  desca_1xp( 2 ) = ictxt_new
603 *
604 * Get information about new grid.
605 *
606  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
607 *
608 * Drop out processors that do not have part of the matrix.
609 *
610  IF( myrow .LT. 0 ) THEN
611  GOTO 1234
612  ENDIF
613 *
614 * ********************************
615 * Values reused throughout routine
616 *
617 * User-input value of partition size
618 *
619  part_size = nb
620 *
621 * Number of columns in each processor
622 *
623  my_num_cols = numroc( n, part_size, mycol, 0, npcol )
624 *
625 * Offset in columns to beginning of main partition in each proc
626 *
627  IF ( mycol .EQ. 0 ) THEN
628  part_offset = part_offset+mod( ja_new-1, part_size )
629  my_num_cols = my_num_cols - mod(ja_new-1, part_size )
630  ENDIF
631 *
632 * Size of main (or odd) partition in each processor
633 *
634  odd_size = my_num_cols
635  IF ( mycol .LT. np-1 ) THEN
636  odd_size = odd_size - int_one
637  ENDIF
638 *
639 *
640 * Zero out space for fillin
641 *
642  DO 10 i=1, laf_min
643  af( i ) = czero
644  10 CONTINUE
645 *
646 * Begin main code
647 *
648 *
649 ********************************************************************
650 * PHASE 1: Local computation phase.
651 ********************************************************************
652 *
653 *
654  IF ( mycol .LT. np-1 ) THEN
655 * Transfer last triangle D_i of local matrix to next processor
656 * which needs it to calculate fillin due to factorization of
657 * its main (odd) block A_i.
658 * Overlap the send with the factorization of A_i.
659 *
660  CALL ctrsd2d( ictxt, 'U', 'N', 1, 1,
661  $ e( part_offset+odd_size+1 ), llda-1, 0,
662  $ mycol+1 )
663 *
664  ENDIF
665 *
666 *
667 * Factor main partition A_i = L_i {L_i}^C in each processor
668 * Or A_i = {U_i}^C {U_i} if E is the upper superdiagonal
669 *
670  CALL cpttrf( odd_size, d( part_offset+1 ), e( part_offset+1 ),
671  $ info )
672 *
673  IF( info.NE.0 ) THEN
674  info = mycol+1
675  GOTO 1500
676  ENDIF
677 *
678 *
679  IF ( mycol .LT. np-1 ) THEN
680 * Apply factorization to odd-even connection block B_i
681 *
682 *
683 * Perform the triangular system solve {L_i}{{B'}_i}^C = {B_i}^C
684 * by dividing B_i by diagonal element
685 *
686  e( part_offset+odd_size ) = e( part_offset+odd_size )/
687  $ d( part_offset+odd_size )
688 *
689 *
690 *
691 * Compute contribution to diagonal block(s) of reduced system.
692 * {C'}_i = {C_i}-{{B'}_i}{{B'}_i}^C
693 *
694  d( part_offset+odd_size+1 ) = d( part_offset+odd_size+1 )-
695  $ d( part_offset+odd_size )*real( e( part_offset+odd_size )*
696  $ conjg( e( part_offset+odd_size ) ) )
697 *
698  ENDIF
699 * End of "if ( MYCOL .lt. NP-1 )..." loop
700 *
701 *
702  1500 CONTINUE
703 * If the processor could not locally factor, it jumps here.
704 *
705  IF ( mycol .NE. 0 ) THEN
706 *
707 * Receive previously transmitted matrix section, which forms
708 * the right-hand-side for the triangular solve that calculates
709 * the "spike" fillin.
710 *
711 *
712  CALL ctrrv2d( ictxt, 'U', 'N', 1, 1, af( 1 ), odd_size, 0,
713  $ mycol-1 )
714 *
715  IF (info.EQ.0) THEN
716 *
717 * Calculate the "spike" fillin, ${L_i} {{G}_i}^C = {D_i}$ .
718 *
719  CALL cpttrsv( 'L', 'N', odd_size, int_one, d( part_offset+1 ),
720  $ e( part_offset+1 ), af( 1 ), odd_size, info )
721 *
722 * Divide by D
723 *
724  DO 20 i=1, odd_size
725  af( i ) = af( i )/d( part_offset+i )
726  20 CONTINUE
727 *
728 *
729 * Calculate the update block for previous proc, E_i = G_i{G_i}^C
730 *
731 *
732 * Since there is no element-by-element vector multiplication in
733 * the BLAS, this loop must be hardwired in without a BLAS call
734 *
735  int_temp = odd_size*int_one+2+1
736  af( int_temp ) = 0
737 *
738  DO 30 i=1, odd_size
739  af( int_temp ) = af( int_temp )-d( part_offset+i )*
740  $ ( af( i )*conjg( af( i ) ) )
741  30 CONTINUE
742 *
743 *
744 * Initiate send of E_i to previous processor to overlap
745 * with next computation.
746 *
747  CALL cgesd2d( ictxt, int_one, int_one, af( odd_size+3 ),
748  $ int_one, 0, mycol-1 )
749 *
750 *
751  IF ( mycol .LT. np-1 ) THEN
752 *
753 * Calculate off-diagonal block(s) of reduced system.
754 * Note: for ease of use in solution of reduced system, store
755 * L's off-diagonal block in conjugate transpose form.
756 * {F_i}^C = {H_i}{{B'}_i}^C
757 *
758  af( odd_size+1 ) =
759  $ - d( part_offset+odd_size )
760  $ * conjg( e( part_offset+odd_size )
761  $ * af( odd_size ) )
762 *
763 *
764  ENDIF
765 *
766  ENDIF
767 * End of "if ( MYCOL .ne. 0 )..."
768 *
769  ENDIF
770 * End of "if (info.eq.0) then"
771 *
772 *
773 * Check to make sure no processors have found errors
774 *
775  CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info,
776  $ -1, 0, 0 )
777 *
778  IF( mycol.EQ.0 ) THEN
779  CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
780  ELSE
781  CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
782  ENDIF
783 *
784  IF ( info.NE.0 ) THEN
785  GOTO 1000
786  ENDIF
787 * No errors found, continue
788 *
789 *
790 ********************************************************************
791 * PHASE 2: Formation and factorization of Reduced System.
792 ********************************************************************
793 *
794 * Gather up local sections of reduced system
795 *
796 *
797 * The last processor does not participate in the factorization of
798 * the reduced system, having sent its E_i already.
799  IF( mycol .EQ. npcol-1 ) THEN
800  GOTO 14
801  ENDIF
802 *
803 * Initiate send of off-diag block(s) to overlap with next part.
804 * Off-diagonal block needed on neighboring processor to start
805 * algorithm.
806 *
807  IF( (mod( mycol+1, 2 ) .EQ. 0) .AND. ( mycol .GT. 0 ) ) THEN
808 *
809  CALL cgesd2d( ictxt, int_one, int_one,
810  $ af( odd_size+1 ),
811  $ int_one, 0, mycol-1 )
812 *
813  ENDIF
814 *
815 * Copy last diagonal block into AF storage for subsequent
816 * operations.
817 *
818  af( odd_size+2 ) =
819  $ cmplx( d( part_offset+odd_size+1 ) )
820 *
821 * Receive cont. to diagonal block that is stored on this proc.
822 *
823  IF( mycol.LT. npcol-1 ) THEN
824 *
825  CALL cgerv2d( ictxt, int_one, int_one,
826  $ af( odd_size+2+1 ),
827  $ int_one, 0,
828  $ mycol+1 )
829 *
830 * Add contribution to diagonal block
831 *
832  af( odd_size+2 ) = af( odd_size+2 )+af( odd_size+3 )
833 *
834  ENDIF
835 *
836 *
837 * *************************************
838 * Modification Loop
839 *
840 * The distance for sending and receiving for each level starts
841 * at 1 for the first level.
842  level_dist = 1
843 *
844 * Do until this proc is needed to modify other procs' equations
845 *
846  12 CONTINUE
847  IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 11
848 *
849 * Receive and add contribution to diagonal block from the left
850 *
851  IF( mycol-level_dist .GE. 0 ) THEN
852  CALL cgerv2d( ictxt, int_one, int_one, work( 1 ),
853  $ int_one, 0, mycol-level_dist )
854 *
855  af( odd_size+2 ) = af( odd_size+2 )+work( 1 )
856 *
857  ENDIF
858 *
859 * Receive and add contribution to diagonal block from the right
860 *
861  IF( mycol+level_dist .LT. npcol-1 ) THEN
862  CALL cgerv2d( ictxt, int_one, int_one, work( 1 ),
863  $ int_one, 0, mycol+level_dist )
864 *
865  af( odd_size+2 ) = af( odd_size+2 )+work( 1 )
866 *
867  ENDIF
868 *
869  level_dist = level_dist*2
870 *
871  GOTO 12
872  11 CONTINUE
873 * [End of GOTO Loop]
874 *
875 *
876 * *********************************
877 * Calculate and use this proc's blocks to modify other procs'...
878  IF( af( odd_size+2 ) .EQ. czero ) THEN
879  info = npcol + mycol
880  ENDIF
881 *
882 * ****************************************************************
883 * Receive offdiagonal block from processor to right.
884 * If this is the first group of processors, the receive comes
885 * from a different processor than otherwise.
886 *
887  IF( level_dist .EQ. 1 )THEN
888  comm_proc = mycol + 1
889 *
890 * Move block into place that it will be expected to be for
891 * calcs.
892 *
893  af( odd_size+3 ) = af( odd_size+1 )
894 *
895  ELSE
896  comm_proc = mycol + level_dist/2
897  ENDIF
898 *
899  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
900 *
901  CALL cgerv2d( ictxt, int_one, int_one,
902  $ af( odd_size+1 ),
903  $ int_one, 0, comm_proc )
904 *
905  IF( info .EQ. 0 ) THEN
906 *
907 *
908 * Modify upper off_diagonal block with diagonal block
909 *
910 *
911  af( odd_size+1 ) = af( odd_size+1 )/af( odd_size+2 )
912 *
913  ENDIF
914 * End of "if ( info.eq.0 ) then"
915 *
916 * Calculate contribution from this block to next diagonal block
917 *
918  work( 1 ) = -one*af( odd_size+1 )*af( odd_size+2 )
919  $ *conjg( af( odd_size+1 ) )
920 *
921 * Send contribution to diagonal block's owning processor.
922 *
923  CALL cgesd2d( ictxt, int_one, int_one, work( 1 ), int_one,
924  $ 0, mycol+level_dist )
925 *
926  ENDIF
927 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
928 *
929 *
930 * ****************************************************************
931 * Receive off_diagonal block from left and use to finish with this
932 * processor.
933 *
934  IF( (mycol/level_dist .GT. 0 ).AND.
935  $ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
936 *
937  IF( level_dist .GT. 1)THEN
938 *
939 * Receive offdiagonal block(s) from proc level_dist/2 to the
940 * left
941 *
942  CALL cgerv2d( ictxt, int_one, int_one,
943  $ af( odd_size+2+1 ),
944  $ int_one, 0, mycol-level_dist/2 )
945 *
946  ENDIF
947 *
948 *
949  IF( info.EQ.0 ) THEN
950 *
951 * Use diagonal block(s) to modify this offdiagonal block
952 *
953  af( odd_size+3 ) = ( af( odd_size+3 ) )
954  $ /af( odd_size+2 )
955 *
956  ENDIF
957 * End of "if( info.eq.0 ) then"
958 *
959 * Use offdiag block(s) to calculate modification to diag block
960 * of processor to the left
961 *
962  work( 1 ) = -one*af( odd_size+3 )*af( odd_size+2 )
963  $ *conjg( af( odd_size+3 ) )
964 *
965 * Send contribution to diagonal block's owning processor.
966 *
967  CALL cgesd2d( ictxt, int_one, int_one, work( 1 ), int_one,
968  $ 0, mycol-level_dist )
969 *
970 * *******************************************************
971 *
972  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 ) THEN
973 *
974 * Decide which processor offdiagonal block(s) goes to
975 *
976  IF( ( mod( mycol/( 2*level_dist ),2 )) .EQ.0 ) THEN
977  comm_proc = mycol + level_dist
978  ELSE
979  comm_proc = mycol - level_dist
980  ENDIF
981 *
982 * Use offdiagonal blocks to calculate offdiag
983 * block to send to neighboring processor. Depending
984 * on circumstances, may need to transpose the matrix.
985 *
986  work( 1 ) = -one*af( odd_size+3 )
987  $ * af( odd_size+2 )
988  $ * af( odd_size+1 )
989 *
990 * Send contribution to offdiagonal block's owning processor.
991 *
992  CALL cgesd2d( ictxt, int_one, int_one, work( 1 ), int_one,
993  $ 0, comm_proc )
994 *
995  ENDIF
996 *
997  ENDIF
998 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
999 *
1000  14 CONTINUE
1001 *
1002 *
1003  1000 CONTINUE
1004 *
1005 *
1006 * Free BLACS space used to hold standard-form grid.
1007 *
1008  IF( ictxt_save .NE. ictxt_new ) THEN
1009  CALL blacs_gridexit( ictxt_new )
1010  ENDIF
1011 *
1012  1234 CONTINUE
1013 *
1014 * Restore saved input parameters
1015 *
1016  ictxt = ictxt_save
1017  np = np_save
1018 *
1019 * Output minimum worksize
1020 *
1021  work( 1 ) = work_size_min
1022 *
1023 * Make INFO consistent across processors
1024 *
1025  CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info,
1026  $ -1, 0, 0 )
1027 *
1028  IF( mycol.EQ.0 ) THEN
1029  CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
1030  ELSE
1031  CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
1032  ENDIF
1033 *
1034 *
1035  RETURN
1036 *
1037 * End of PCPTTRF
1038 *
1039  END