ScaLAPACK 2.1
ScaLAPACK: Scalable Linear Algebra PACKage
pddttrf.f
1  SUBROUTINE pddttrf( N, DL, D, DU, JA, DESCA, AF, LAF, WORK, LWORK,
2  $ INFO )
3 *
4 * -- ScaLAPACK routine (version 1.7) --
5 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
6 * and University of California, Berkeley.
7 * April 3, 2000
8 *
9 * .. Scalar Arguments ..
10  INTEGER INFO, JA, LAF, LWORK, N
11 * ..
12 * .. Array Arguments ..
13  INTEGER DESCA( * )
14  DOUBLE PRECISION AF( * ), D( * ), DL( * ), DU( * ), WORK( * )
15 * ..
16 *
17 *
18 * Purpose
19 * =======
20 *
21 * PDDTTRF computes an LU factorization
22 * of an N-by-N real tridiagonal
23 * diagonally dominant-like distributed matrix
24 * A(1:N, JA:JA+N-1).
25 * Reordering is used to increase parallelism in the factorization.
26 * This reordering results in factors that are DIFFERENT from those
27 * produced by equivalent sequential codes. These factors cannot
28 * be used directly by users; however, they can be used in
29 * subsequent calls to PDDTTRS to solve linear systems.
30 *
31 * The factorization has the form
32 *
33 * P A(1:N, JA:JA+N-1) P^T = L U
34 *
35 * where U is a tridiagonal upper triangular matrix and L is tridiagonal
36 * lower triangular, and P is a permutation matrix.
37 *
38 * =====================================================================
39 *
40 * Arguments
41 * =========
42 *
43 *
44 * N (global input) INTEGER
45 * The number of rows and columns to be operated on, i.e. the
46 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
47 *
48 * DL (local input/local output) DOUBLE PRECISION pointer to local
49 * part of global vector storing the lower diagonal of the
50 * matrix. Globally, DL(1) is not referenced, and DL must be
51 * aligned with D.
52 * Must be of size >= DESCA( NB_ ).
53 * On exit, this array contains information describing the
54 * factors of the matrix.
55 *
56 * D (local input/local output) DOUBLE PRECISION pointer to local
57 * part of global vector storing the main diagonal of the
58 * matrix.
59 * On exit, this array contains information describing the
60 * factors of the matrix.
61 * Must be of size >= DESCA( NB_ ).
62 *
63 * DU (local input/local output) DOUBLE PRECISION pointer to local
64 * part of global vector storing the upper diagonal of the
65 * matrix. Globally, DU(n) is not referenced, and DU must be
66 * aligned with D.
67 * On exit, this array contains information describing the
68 * factors of the matrix.
69 * Must be of size >= DESCA( NB_ ).
70 *
71 * JA (global input) INTEGER
72 * The index in the global array A that points to the start of
73 * the matrix to be operated on (which may be either all of A
74 * or a submatrix of A).
75 *
76 * DESCA (global and local input) INTEGER array of dimension DLEN.
77 * if 1D type (DTYPE_A=501 or 502), DLEN >= 7;
78 * if 2D type (DTYPE_A=1), DLEN >= 9.
79 * The array descriptor for the distributed matrix A.
80 * Contains information of mapping of A to memory. Please
81 * see NOTES below for full description and options.
82 *
83 * AF (local output) DOUBLE PRECISION array, dimension LAF.
84 * Auxiliary Fillin Space.
85 * Fillin is created during the factorization routine
86 * PDDTTRF and this is stored in AF. If a linear system
87 * is to be solved using PDDTTRS after the factorization
88 * routine, AF *must not be altered* after the factorization.
89 *
90 * LAF (local input) INTEGER
91 * Size of user-input Auxiliary Fillin space AF. This routine
92 * requires LAF >= 12*NPCOL + 3*NB (see the sizing note below).
93 * If LAF is not large enough, an error code will be returned
94 * and the minimum acceptable size will be returned in AF( 1 )
95 *
96 * WORK (local workspace/local output)
97 * DOUBLE PRECISION temporary workspace. This space may
98 * be overwritten in between calls to routines. WORK must be
99 * the size given in LWORK.
100 * On exit, WORK( 1 ) contains the minimal LWORK.
101 *
102 * LWORK (local input or global input) INTEGER
103 * Size of user-input workspace WORK.
104 * If LWORK is too small, the minimal acceptable size will be
105 * returned in WORK(1) and an error code is returned. LWORK>=
106 * 8*NPCOL
107 *
108 * INFO (local output) INTEGER
109 * = 0: successful exit
110 * < 0: If the i-th argument is an array and the j-th entry had
111 * an illegal value, then INFO = -(i*100+j); if the i-th
112 * argument is a scalar and had an illegal value, then
113 * INFO = -i.
114 * > 0: If INFO = K<=NPROCS, the submatrix stored on processor
115 * INFO and factored locally was not
116 * diagonally dominant-like, and
117 * the factorization was not completed.
118 * If INFO = K>NPROCS, the submatrix stored on processor
119 * INFO-NPROCS representing interactions with other
120 * processors was not
121 * stably factorable without interchanges,
122 * and the factorization was not completed.
123 *
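*  Workspace sizing sketch (added commentary, not part of the original
*  argument descriptions). Consistent with the LAF_MIN and
*  WORK_SIZE_MIN checks performed in the code below, one safe sizing
*  on a 1 x NPCOL process grid with NB = DESCA( 4 ) is
*
*     LAF   = 12*NPCOL + 3*NB
*     LWORK = 8*NPCOL
*
*  A workspace query is also possible: calling PDDTTRF with LWORK = -1
*  returns the minimal LWORK in WORK( 1 ) and returns without
*  factoring; if LAF is too small, the minimal LAF is returned in
*  AF( 1 ) together with an error code.
*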
124 * =====================================================================
125 *
126 *
127 * Restrictions
128 * ============
129 *
130 * The following are restrictions on the input parameters. Some of these
131 * are temporary and will be removed in future releases, while others
132 * may reflect fundamental technical limitations.
133 *
134 * Non-cyclic restriction: VERY IMPORTANT!
135 * P*NB>= mod(JA-1,NB)+N.
136 * The mapping for matrices must be blocked, reflecting the nature
137 * of the divide and conquer algorithm as a task-parallel algorithm.
138 * This formula in words is: no processor may have more than one
139 * chunk of the matrix.
140 *
141 * Blocksize cannot be too small:
142 * If the matrix spans more than one processor, the following
143 * restriction on NB, the size of each block on each processor,
144 * must hold:
145 * NB >= 2
146 * The bulk of parallel computation is done on the matrix of size
147 * O(NB) on each processor. If this is too small, divide and conquer
148 * is a poor choice of algorithm.
149 *
150 * Submatrix reference:
151 * JA = IB
152 * Alignment restriction that prevents unnecessary communication.
153 *
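*  Worked example of the restrictions (illustrative values, not from
*  the original text): with P = 4 processes, NB = 300, N = 1000 and
*  JA = IB = 1, mod(JA-1,NB) = 0 and P*NB = 1200 >= 0 + 1000, so no
*  process owns more than one chunk; NB = 300 >= 2; and JA = IB holds.
*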
154 *
155 * =====================================================================
156 *
157 *
158 * Notes
159 * =====
160 *
161 * If the factorization routine and the solve routine are to be called
162 * separately (to solve various sets of right-hand sides using the same
163 * coefficient matrix), the auxiliary space AF *must not be altered*
164 * between calls to the factorization routine and the solve routine.
165 *
166 * The best algorithm for solving banded and tridiagonal linear systems
167 * depends on a variety of parameters, especially the bandwidth.
168 * Currently, only algorithms designed for the case N/P >> bw are
169 * implemented. These go by many names, including Divide and Conquer,
170 * Partitioning, domain decomposition-type, etc.
171 * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C
172 * algorithms are the appropriate choice.
173 *
174 * Algorithm description: Divide and Conquer
175 *
176 * The Divide and Conquer algorithm assumes the matrix is narrowly
177 * banded compared with the number of equations. In this situation,
178 * it is best to distribute the input matrix A one-dimensionally,
179 * with columns atomic and rows divided amongst the processes.
180 * The basic algorithm divides the tridiagonal matrix up into
181 * P pieces with one stored on each processor,
182 * and then proceeds in 2 phases for the factorization or 3 for the
183 * solution of a linear system.
184 * 1) Local Phase:
185 * The individual pieces are factored independently and in
186 * parallel. These factors are applied to the matrix creating
187 * fillin, which is stored in a non-inspectable way in auxiliary
188 * space AF. Mathematically, this is equivalent to reordering
189 * the matrix A as P A P^T and then factoring the principal
190 * leading submatrix of size equal to the sum of the sizes of
191 * the matrices factored on each processor. The factors of
192 * these submatrices overwrite the corresponding parts of A
193 * in memory.
194 * 2) Reduced System Phase:
195 * A small system of order (P-1) is formed representing
196 * interaction of the larger blocks, and is stored (as are its
197 * factors) in the space AF. A parallel Block Cyclic Reduction
198 * algorithm is used. For a linear system, a parallel front solve
199 * followed by an analogous backsolve, both using the structure
200 * of the factored matrix, are performed.
201 * 3) Backsubstitution Phase:
202 * For a linear system, a local backsubstitution is performed on
203 * each processor in parallel.
204 *
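*  As a concrete illustration (added, with assumed sizes): with P = 4
*  processes, N = 1000 and NB = 300, phase 1 factors four independent
*  blocks of at most 300 equations; phase 2 forms and factors a
*  reduced system of order P-1 = 3 coupling the blocks; for a solve,
*  phase 3 adds a local backsubstitution on each process.
*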
205 *
206 * Descriptors
207 * ===========
208 *
209 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
210 *
211 * Note: tridiagonal codes can use either the old two dimensional
212 * or new one-dimensional descriptors, though the processor grid in
213 * both cases *must be one-dimensional*. We describe both types below.
214 *
215 * Each global data object is described by an associated description
216 * vector. This vector stores the information required to establish
217 * the mapping between an object element and its corresponding process
218 * and memory location.
219 *
220 * Let A be a generic term for any 2D block cyclically distributed array.
221 * Such a global array has an associated description vector DESCA.
222 * In the following comments, the character _ should be read as
223 * "of the global array".
224 *
225 * NOTATION STORED IN EXPLANATION
226 * --------------- -------------- --------------------------------------
227 * DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
228 * DTYPE_A = 1.
229 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
230 * the BLACS process grid A is distribu-
231 * ted over. The context itself is glo-
232 * bal, but the handle (the integer
233 * value) may vary.
234 * M_A (global) DESCA( M_ ) The number of rows in the global
235 * array A.
236 * N_A (global) DESCA( N_ ) The number of columns in the global
237 * array A.
238 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
239 * the rows of the array.
240 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
241 * the columns of the array.
242 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
243 * row of the array A is distributed.
244 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
245 * first column of the array A is
246 * distributed.
247 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
248 * array. LLD_A >= MAX(1,LOCr(M_A)).
249 *
250 * Let K be the number of rows or columns of a distributed matrix,
251 * and assume that its process grid has dimension p x q.
252 * LOCr( K ) denotes the number of elements of K that a process
253 * would receive if K were distributed over the p processes of its
254 * process column.
255 * Similarly, LOCc( K ) denotes the number of elements of K that a
256 * process would receive if K were distributed over the q processes of
257 * its process row.
258 * The values of LOCr() and LOCc() may be determined via a call to the
259 * ScaLAPACK tool function, NUMROC:
260 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
261 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
262 * An upper bound for these quantities may be computed by:
263 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
264 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
265 *
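*  Worked example (added, with assumed values): for N = 1000,
*  NB_A = 300 and NPCOL = 4, NUMROC( 1000, 300, MYCOL, 0, 4 ) returns
*  300 on process columns 0-2 and 100 on column 3, while the upper
*  bound is ceil( ceil(1000/300)/4 )*300 = 300.
*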
266 *
267 * One-dimensional descriptors:
268 *
269 * One-dimensional descriptors are a new addition to ScaLAPACK since
270 * version 1.0. They simplify and shorten the descriptor for 1D
271 * arrays.
272 *
273 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
274 * object, we allow 1D arrays to be distributed either over the
275 * first dimension of the array (as if the grid were P-by-1) or the
276 * 2nd dimension (as if the grid were 1-by-P). This choice is
277 * indicated by the descriptor type (501 or 502)
278 * as described below.
279 * However, for tridiagonal matrices, since the objects being
280 * distributed are the individual vectors storing the diagonals, we
281 * have adopted the convention that both the P-by-1 descriptor and
282 * the 1-by-P descriptor are allowed and are equivalent for
283 * tridiagonal matrices. Thus, for tridiagonal matrices,
284 * DTYPE_A = 501 or 502 can be used interchangeably
285 * without any other change.
286 * We require that the distributed vectors storing the diagonals of a
287 * tridiagonal matrix be aligned with each other. Because of this, a
288 * single descriptor, DESCA, serves to describe the distribution
289 * of all diagonals simultaneously.
290 *
291 * IMPORTANT NOTE: the actual BLACS grid represented by the
292 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
293 * irrespective of which one-dimensional descriptor type
294 * (501 or 502) is input.
295 * This routine will interpret the grid properly either way.
296 * ScaLAPACK routines *do not support intercontext operations* so that
297 * the grid passed to a single ScaLAPACK routine *must be the same*
298 * for all array descriptors passed to that routine.
299 *
300 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
301 * may also be used, since a one-dimensional array is a special case
302 * of a two-dimensional array with one dimension of size unity.
303 * The two-dimensional array used in this case *must* be of the
304 * proper orientation:
305 * If the appropriate one-dimensional descriptor is DTYPE_A=501
306 * (1 by P type), then the two dimensional descriptor must
307 * have a CTXT value that refers to a 1 by P BLACS grid;
308 * If the appropriate one-dimensional descriptor is DTYPE_A=502
309 * (P by 1 type), then the two dimensional descriptor must
310 * have a CTXT value that refers to a P by 1 BLACS grid.
311 *
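*  A minimal sketch of the 2D alternative (added commentary; DESCINIT
*  is the standard ScaLAPACK descriptor tool, and the 1-by-N shape is
*  an assumption for illustration): on a 1 by P grid, the diagonals
*  could be described as a 1-by-N global array via
*
*     CALL DESCINIT( DESCA, 1, N, 1, NB, 0, 0, ICTXT, 1, INFO )
*
*  which, for this routine, plays the same role as the type-501
*  descriptor described below.
*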
312 *
313 * Summary of allowed descriptors, types, and BLACS grids:
314 * DTYPE 501 502 1 1
315 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
316 * -----------------------------------------------------
317 * A OK OK OK NO
318 * B NO OK NO OK
319 *
320 * Let A be a generic term for any 1D block cyclically distributed array.
321 * Such a global array has an associated description vector DESCA.
322 * In the following comments, the character _ should be read as
323 * "of the global array".
324 *
325 * NOTATION STORED IN EXPLANATION
326 * --------------- ---------- ------------------------------------------
327 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
328 * TYPE_A = 501: 1-by-P grid.
329 * TYPE_A = 502: P-by-1 grid.
330 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
331 * the BLACS process grid A is distribu-
332 * ted over. The context itself is glo-
333 * bal, but the handle (the integer
334 * value) may vary.
335 * N_A (global) DESCA( 3 ) The size of the array dimension being
336 * distributed.
337 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
338 * the distributed dimension of the array.
339 * SRC_A (global) DESCA( 5 ) The process row or column over which the
340 * first row or column of the array
341 * is distributed.
342 * Ignored DESCA( 6 ) Ignored for tridiagonal matrices.
343 * Reserved DESCA( 7 ) Reserved for future use.
344 *
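*  Descriptor set-up sketch (added; the values are illustrative and
*  follow the table above):
*
*     DESCA( 1 ) = 501        descriptor type (502 is equivalent here)
*     DESCA( 2 ) = ICTXT      BLACS context handle of the 1D grid
*     DESCA( 3 ) = N          size of the distributed dimension
*     DESCA( 4 ) = NB         blocking factor
*     DESCA( 5 ) = 0          process owning the first entry
*     DESCA( 6 ) = 0          ignored for tridiagonal matrices
*     DESCA( 7 ) = 0          reserved
*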
345 *
346 *
347 * =====================================================================
348 *
349 * Code Developer: Andrew J. Cleary, University of Tennessee.
350 * Current address: Lawrence Livermore National Labs.
351 *
352 * =====================================================================
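*
*  Illustrative driver sketch (added to this listing; it is not part
*  of the original ScaLAPACK documentation). It shows one plausible
*  call sequence on a 1 x P grid with P = 4 assumed; the constant
*  diagonals (D = 4, DL = DU = -1) make the matrix diagonally
*  dominant. A subsequent PDDTTRS call would reuse DESCA and the
*  unaltered AF. Adjust LAF and LWORK (see the LAF_MIN and
*  WORK_SIZE_MIN checks below) if a different number of processes is
*  used.
*
*     PROGRAM PDDTTRF_SKETCH
*     INTEGER            P, NB, N, LAF, LWORK
*     PARAMETER          ( P = 4, NB = 300, N = 1000,
*    $                     LAF = 12*P+3*NB, LWORK = 8*P )
*     INTEGER            IAM, NPROCS, ICTXT, NPROW, NPCOL, MYROW,
*    $                   MYCOL, INFO, I, DESCA( 7 )
*     DOUBLE PRECISION   DL( NB ), D( NB ), DU( NB ), AF( LAF ),
*    $                   WORK( LWORK )
*     EXTERNAL           BLACS_PINFO, BLACS_GET, BLACS_GRIDINIT,
*    $                   BLACS_GRIDINFO, BLACS_GRIDEXIT, BLACS_EXIT,
*    $                   PDDTTRF
*     Set up a 1 x NPROCS process grid.
*     CALL BLACS_PINFO( IAM, NPROCS )
*     CALL BLACS_GET( -1, 0, ICTXT )
*     CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS )
*     CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
*     Fill the type-501 descriptor (see the Descriptors notes above).
*     DESCA( 1 ) = 501
*     DESCA( 2 ) = ICTXT
*     DESCA( 3 ) = N
*     DESCA( 4 ) = NB
*     DESCA( 5 ) = 0
*     DESCA( 6 ) = 0
*     DESCA( 7 ) = 0
*     Fill the local pieces of the three diagonals.
*     DO 10 I = 1, NB
*        DL( I ) = -1.0D+0
*        D( I ) = 4.0D+0
*        DU( I ) = -1.0D+0
*  10 CONTINUE
*     Factor; AF must be kept unaltered for a later PDDTTRS call.
*     CALL PDDTTRF( N, DL, D, DU, 1, DESCA, AF, LAF, WORK, LWORK,
*    $              INFO )
*     CALL BLACS_GRIDEXIT( ICTXT )
*     CALL BLACS_EXIT( 0 )
*     END
*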
353 *
354 * .. Parameters ..
355  DOUBLE PRECISION ONE
356  parameter( one = 1.0d+0 )
357  DOUBLE PRECISION ZERO
358  parameter( zero = 0.0d+0 )
359  INTEGER INT_ONE
360  parameter( int_one = 1 )
361  INTEGER DESCMULT, BIGNUM
362  parameter( descmult = 100, bignum = descmult*descmult )
363  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
364  $ lld_, mb_, m_, nb_, n_, rsrc_
365  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
366  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
367  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
368 * ..
369 * .. Local Scalars ..
370  INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT,
371  $ ictxt_new, ictxt_save, idum3, ja_new, laf_min,
372  $ level_dist, llda, mycol, myrow, my_num_cols,
373  $ nb, np, npcol, nprow, np_save, odd_size,
374  $ part_offset, part_size, return_code, store_n_a,
375  $ temp, work_size_min, work_u
376 * ..
377 * .. Local Arrays ..
378  INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 7, 3 )
379 * ..
380 * .. External Subroutines ..
381  EXTERNAL blacs_gridexit, blacs_gridinfo, ddttrf,
382  $ ddttrsv, desc_convert, dgerv2d, dgesd2d,
383  $ dtrrv2d, dtrsd2d, globchk, igamx2d, igebr2d,
384  $ igebs2d, pxerbla, reshape
385 * ..
386 * .. External Functions ..
387  INTEGER NUMROC
388  DOUBLE PRECISION DDOT
389  EXTERNAL numroc, ddot
390 * ..
391 * .. Intrinsic Functions ..
392  INTRINSIC dble, mod
393 * ..
394 * .. Executable Statements ..
395 *
396 * Test the input parameters
397 *
398  info = 0
399 *
400 * Convert descriptor into standard form for easy access to
401 * parameters, check that grid is of right shape.
402 *
403  desca_1xp( 1 ) = 501
404 *
405  temp = desca( dtype_ )
406  IF( temp.EQ.502 ) THEN
407 * Temporarily set the descriptor type to 1xP type
408  desca( dtype_ ) = 501
409  END IF
410 *
411  CALL desc_convert( desca, desca_1xp, return_code )
412 *
413  desca( dtype_ ) = temp
414 *
415  IF( return_code.NE.0 ) THEN
416  info = -( 6*100+2 )
417  END IF
418 *
419 * Get values out of descriptor for use in code.
420 *
421  ictxt = desca_1xp( 2 )
422  csrc = desca_1xp( 5 )
423  nb = desca_1xp( 4 )
424  llda = desca_1xp( 6 )
425  store_n_a = desca_1xp( 3 )
426 *
427 * Get grid parameters
428 *
429 *
430  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
431  np = nprow*npcol
432 *
433 *
434 *
435  IF( lwork.LT.-1 ) THEN
436  info = -10
437  ELSE IF( lwork.EQ.-1 ) THEN
438  idum3 = -1
439  ELSE
440  idum3 = 1
441  END IF
442 *
443  IF( n.LT.0 ) THEN
444  info = -1
445  END IF
446 *
447  IF( n+ja-1.GT.store_n_a ) THEN
448  info = -( 6*100+6 )
449  END IF
450 *
451 * Argument checking that is specific to Divide & Conquer routine
452 *
453  IF( nprow.NE.1 ) THEN
454  info = -( 6*100+2 )
455  END IF
456 *
457  IF( n.GT.np*nb-mod( ja-1, nb ) ) THEN
458  info = -( 1 )
459  CALL pxerbla( ictxt, 'PDDTTRF, D&C alg.: only 1 block per proc'
460  $ , -info )
461  RETURN
462  END IF
463 *
464  IF( ( ja+n-1.GT.nb ) .AND. ( nb.LT.2*int_one ) ) THEN
465  info = -( 6*100+4 )
466  CALL pxerbla( ictxt, 'PDDTTRF, D&C alg.: NB too small', -info )
467  RETURN
468  END IF
469 *
470 *
471 * Check auxiliary storage size
472 *
473  laf_min = ( 12*npcol+3*nb )
474 *
475  IF( laf.LT.laf_min ) THEN
476  info = -8
477 * put minimum value of laf into AF( 1 )
478  af( 1 ) = laf_min
479  CALL pxerbla( ictxt, 'PDDTTRF: auxiliary storage error ',
480  $ -info )
481  RETURN
482  END IF
483 *
484 * Check worksize
485 *
486  work_size_min = 8*npcol
487 *
488  work( 1 ) = work_size_min
489 *
490  IF( lwork.LT.work_size_min ) THEN
491  IF( lwork.NE.-1 ) THEN
492  info = -10
493  CALL pxerbla( ictxt, 'PDDTTRF: worksize error ', -info )
494  END IF
495  RETURN
496  END IF
497 *
498 * Pack params and positions into arrays for global consistency check
499 *
500  param_check( 7, 1 ) = desca( 5 )
501  param_check( 6, 1 ) = desca( 4 )
502  param_check( 5, 1 ) = desca( 3 )
503  param_check( 4, 1 ) = desca( 1 )
504  param_check( 3, 1 ) = ja
505  param_check( 2, 1 ) = n
506  param_check( 1, 1 ) = idum3
507 *
508  param_check( 7, 2 ) = 605
509  param_check( 6, 2 ) = 604
510  param_check( 5, 2 ) = 603
511  param_check( 4, 2 ) = 601
512  param_check( 3, 2 ) = 5
513  param_check( 2, 2 ) = 1
514  param_check( 1, 2 ) = 10
515 *
516 * Want to find errors with MIN( ), so if no error, set it to a big
517 * number. If there already is an error, multiply by the
518 * descriptor multiplier.
519 *
520  IF( info.GE.0 ) THEN
521  info = bignum
522  ELSE IF( info.LT.-descmult ) THEN
523  info = -info
524  ELSE
525  info = -info*descmult
526  END IF
527 *
528 * Check consistency across processors
529 *
530  CALL globchk( ictxt, 7, param_check, 7, param_check( 1, 3 ),
531  $ info )
532 *
533 * Prepare output: set info = 0 if no error, and divide by DESCMULT
534 * if error is not in a descriptor entry.
535 *
536  IF( info.EQ.bignum ) THEN
537  info = 0
538  ELSE IF( mod( info, descmult ).EQ.0 ) THEN
539  info = -info / descmult
540  ELSE
541  info = -info
542  END IF
543 *
544  IF( info.LT.0 ) THEN
545  CALL pxerbla( ictxt, 'PDDTTRF', -info )
546  RETURN
547  END IF
548 *
549 * Quick return if possible
550 *
551  IF( n.EQ.0 )
552  $ RETURN
553 *
554 *
555 * Adjust addressing into matrix space to properly get into
556 * the beginning part of the relevant data
557 *
558  part_offset = nb*( ( ja-1 ) / ( npcol*nb ) )
559 *
560  IF( ( mycol-csrc ).LT.( ja-part_offset-1 ) / nb ) THEN
561  part_offset = part_offset + nb
562  END IF
563 *
564  IF( mycol.LT.csrc ) THEN
565  part_offset = part_offset - nb
566  END IF
567 *
568 * Form a new BLACS grid (the "standard form" grid) with only procs
569 * holding part of the matrix, of size 1xNP where NP is adjusted,
570 * starting at csrc=0, with JA modified to reflect dropped procs.
571 *
572 * First processor to hold part of the matrix:
573 *
574  first_proc = mod( ( ja-1 ) / nb+csrc, npcol )
575 *
576 * Calculate the new JA while dropping off unused processors.
577 *
578  ja_new = mod( ja-1, nb ) + 1
579 *
580 * Save and compute new value of NP
581 *
582  np_save = np
583  np = ( ja_new+n-2 ) / nb + 1
584 *
585 * Call utility routine that forms "standard-form" grid
586 *
587  CALL reshape( ictxt, int_one, ictxt_new, int_one, first_proc,
588  $ int_one, np )
589 *
590 * Use new context from standard grid as context.
591 *
592  ictxt_save = ictxt
593  ictxt = ictxt_new
594  desca_1xp( 2 ) = ictxt_new
595 *
596 * Get information about new grid.
597 *
598  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
599 *
600 * Drop out processors that do not have part of the matrix.
601 *
602  IF( myrow.LT.0 ) THEN
603  GO TO 70
604  END IF
605 *
606 * ********************************
607 * Values reused throughout routine
608 *
609 * User-input value of partition size
610 *
611  part_size = nb
612 *
613 * Number of columns in each processor
614 *
615  my_num_cols = numroc( n, part_size, mycol, 0, npcol )
616 *
617 * Offset in columns to beginning of main partition in each proc
618 *
619  IF( mycol.EQ.0 ) THEN
620  part_offset = part_offset + mod( ja_new-1, part_size )
621  my_num_cols = my_num_cols - mod( ja_new-1, part_size )
622  END IF
623 *
624 * Size of main (or odd) partition in each processor
625 *
626  odd_size = my_num_cols
627  IF( mycol.LT.np-1 ) THEN
628  odd_size = odd_size - int_one
629  END IF
630 *
631 * Offset to workspace for Upper triangular factor
632 *
633  work_u = int_one*odd_size + 3
634 *
635 *
636 * Zero out space for fillin
637 *
638  DO 10 i = 1, laf_min
639  af( i ) = zero
640  10 CONTINUE
641 *
642 * Begin main code
643 *
644 *
645 ********************************************************************
646 * PHASE 1: Local computation phase.
647 ********************************************************************
648 *
649 *
650  IF( mycol.LT.np-1 ) THEN
651 * Transfer last triangle D_i of local matrix to next processor
652 * which needs it to calculate fillin due to factorization of
653 * its main (odd) block A_i.
654 * Overlap the send with the factorization of A_i.
655 *
656  CALL dtrsd2d( ictxt, 'U', 'N', 1, 1,
657  $ du( part_offset+odd_size+1 ), llda-1, 0,
658  $ mycol+1 )
659 *
660  END IF
661 *
662 *
663 * Factor main partition A_i = L_i {U_i} in each processor
664 *
665  CALL ddttrf( odd_size, dl( part_offset+2 ), d( part_offset+1 ),
666  $ du( part_offset+1 ), info )
667 *
668  IF( info.NE.0 ) THEN
669  info = mycol + 1
670  GO TO 20
671  END IF
672 *
673 *
674  IF( mycol.LT.np-1 ) THEN
675 *
676 * Apply factorization to lower connection block BL_i
677 * Apply factorization to upper connection block BU_i
678 *
679 *
680 * Perform the triangular solve {U_i}^T{BL'}_i^T = {BL_i}^T
681 *
682 *
683  dl( part_offset+odd_size+1 ) = ( dl( part_offset+odd_size+1 ) )
684  $ / ( d( part_offset+odd_size ) )
685 *
686 *
687 * Compute contribution to diagonal block(s) of reduced system.
688 * {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i}
689 *
690 *
691  d( part_offset+odd_size+1 ) = d( part_offset+odd_size+1 ) -
692  $ dl( part_offset+odd_size+1 )*
693  $ du( part_offset+odd_size )
694 *
695  END IF
696 * End of "if ( MYCOL .lt. NP-1 )..." block
697 *
698 *
699  20 CONTINUE
700 * If the processor could not locally factor, it jumps here.
701 *
702  IF( mycol.NE.0 ) THEN
703 *
704 * Move entry that causes spike to auxiliary storage
705 *
706  af( work_u+1 ) = ( dl( part_offset+1 ) )
707 *
708  IF( info.EQ.0 ) THEN
709 *
710 * Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ .
711 *
712  CALL ddttrsv( 'L', 'N', odd_size, int_one,
713  $ dl( part_offset+2 ), d( part_offset+1 ),
714  $ du( part_offset+1 ), af( work_u+1 ), odd_size,
715  $ info )
716 *
717 *
718 * Calculate the "spike" fillin, ${U_i}^T {{GL}_i}^T = {DU_i}^T$
719 *
720  CALL dtrrv2d( ictxt, 'U', 'N', 1, 1, af( 1 ), odd_size, 0,
721  $ mycol-1 )
722 *
723  CALL ddttrsv( 'U', 'T', odd_size, int_one,
724  $ dl( part_offset+2 ), d( part_offset+1 ),
725  $ du( part_offset+1 ), af( 1 ), odd_size, info )
726 *
727 *
728 * Calculate the update block for previous proc, E_i = GL_i{GU_i}
729 *
730  af( odd_size+3 ) = -one*ddot( odd_size, af( 1 ), 1,
731  $ af( work_u+1 ), 1 )
732 *
733 *
734 * Initiate send of E_i to previous processor to overlap
735 * with next computation.
736 *
737  CALL dgesd2d( ictxt, int_one, int_one, af( odd_size+3 ),
738  $ int_one, 0, mycol-1 )
739 *
740 *
741  IF( mycol.LT.np-1 ) THEN
742 *
743 * Calculate off-diagonal block(s) of reduced system.
744 * Note: for ease of use in solution of reduced system, store
745 * L's off-diagonal block in transpose form.
746 *
747  af( odd_size+1 ) = -one*( dl( part_offset+odd_size+1 )*
748  $ af( work_u+odd_size ) )
749 *
750 *
751  af( work_u+( odd_size )+1 ) = -one*
752  $ du( part_offset+odd_size )*( af( odd_size ) )
753 *
754  END IF
755 *
756  END IF
757 * End of "if ( MYCOL .ne. 0 )..."
758 *
759  END IF
760 * End of "if (info.eq.0) then"
761 *
762 *
763 * Check to make sure no processors have found errors
764 *
765  CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info, -1, 0,
766  $ 0 )
767 *
768  IF( mycol.EQ.0 ) THEN
769  CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
770  ELSE
771  CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
772  END IF
773 *
774  IF( info.NE.0 ) THEN
775  GO TO 60
776  END IF
777 * No errors found, continue
778 *
779 *
780 ********************************************************************
781 * PHASE 2: Formation and factorization of Reduced System.
782 ********************************************************************
783 *
784 * Gather up local sections of reduced system
785 *
786 *
787 * The last processor does not participate in the factorization of
788 * the reduced system, having sent its E_i already.
789  IF( mycol.EQ.npcol-1 ) THEN
790  GO TO 50
791  END IF
792 *
793 * Initiate send of off-diag block(s) to overlap with next part.
794 * Off-diagonal block needed on neighboring processor to start
795 * algorithm.
796 *
797  IF( ( mod( mycol+1, 2 ).EQ.0 ) .AND. ( mycol.GT.0 ) ) THEN
798 *
799  CALL dgesd2d( ictxt, int_one, int_one, af( odd_size+1 ),
800  $ int_one, 0, mycol-1 )
801 *
802  CALL dgesd2d( ictxt, int_one, int_one, af( work_u+odd_size+1 ),
803  $ int_one, 0, mycol-1 )
804 *
805  END IF
806 *
807 * Copy last diagonal block into AF storage for subsequent
808 * operations.
809 *
810  af( odd_size+2 ) = dble( d( part_offset+odd_size+1 ) )
811 *
812 * Receive contribution to the diagonal block stored on this proc.
813 *
814  IF( mycol.LT.npcol-1 ) THEN
815 *
816  CALL dgerv2d( ictxt, int_one, int_one, af( odd_size+2+1 ),
817  $ int_one, 0, mycol+1 )
818 *
819 * Add contribution to diagonal block
820 *
821  af( odd_size+2 ) = af( odd_size+2 ) + af( odd_size+3 )
822 *
823  END IF
824 *
825 *
826 * *************************************
827 * Modification Loop
828 *
829 * The distance for sending and receiving for each level starts
830 * at 1 for the first level.
831  level_dist = 1
832 *
833 * Do until this proc is needed to modify other procs' equations
834 *
835  30 CONTINUE
836  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
837  $ GO TO 40
838 *
839 * Receive and add contribution to diagonal block from the left
840 *
841  IF( mycol-level_dist.GE.0 ) THEN
842  CALL dgerv2d( ictxt, int_one, int_one, work( 1 ), int_one, 0,
843  $ mycol-level_dist )
844 *
845  af( odd_size+2 ) = af( odd_size+2 ) + work( 1 )
846 *
847  END IF
848 *
849 * Receive and add contribution to diagonal block from the right
850 *
851  IF( mycol+level_dist.LT.npcol-1 ) THEN
852  CALL dgerv2d( ictxt, int_one, int_one, work( 1 ), int_one, 0,
853  $ mycol+level_dist )
854 *
855  af( odd_size+2 ) = af( odd_size+2 ) + work( 1 )
856 *
857  END IF
858 *
859  level_dist = level_dist*2
860 *
861  GO TO 30
862  40 CONTINUE
863 * [End of GOTO Loop]
864 *
865 *
866 * *********************************
867 * Calculate and use this proc's blocks to modify other procs'...
868  IF( af( odd_size+2 ).EQ.zero ) THEN
869  info = npcol + mycol
870  END IF
871 *
872 * ****************************************************************
873 * Receive offdiagonal block from processor to right.
874 * If this is the first group of processors, the receive comes
875 * from a different processor than otherwise.
876 *
877  IF( level_dist.EQ.1 ) THEN
878  comm_proc = mycol + 1
879 *
880 * Move block into the place where it will be expected to be
881 * for later calculations.
882 *
883  af( work_u+odd_size+3 ) = af( odd_size+1 )
884 *
885  af( odd_size+3 ) = af( work_u+odd_size+1 )
886 *
887  ELSE
888  comm_proc = mycol + level_dist / 2
889  END IF
890 *
891  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
892 *
893  CALL dgerv2d( ictxt, int_one, int_one, af( odd_size+1 ),
894  $ int_one, 0, comm_proc )
895 *
896  CALL dgerv2d( ictxt, int_one, int_one, af( work_u+odd_size+1 ),
897  $ int_one, 0, comm_proc )
898 *
899  IF( info.EQ.0 ) THEN
900 *
901 *
902 * Modify lower off_diagonal block with diagonal block
903 *
904 *
905  af( odd_size+1 ) = af( odd_size+1 ) / ( af( odd_size+2 ) )
906 *
907  END IF
908 * End of "if ( info.eq.0 ) then"
909 *
910 * Calculate contribution from this block to next diagonal block
911 *
912  work( 1 ) = -one*( af( odd_size+1 ) )*
913  $ af( work_u+( odd_size )+1 )
914 *
915 * Send contribution to diagonal block's owning processor.
916 *
917  CALL dgesd2d( ictxt, int_one, int_one, work( 1 ), int_one, 0,
918  $ mycol+level_dist )
919 *
920  END IF
921 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
922 *
923 *
924 * ****************************************************************
925 * Receive off_diagonal block from left and use to finish with this
926 * processor.
927 *
928  IF( ( mycol / level_dist.GT.0 ) .AND.
929  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) ) THEN
930 *
931  IF( level_dist.GT.1 ) THEN
932 *
933 * Receive offdiagonal block(s) from proc level_dist/2 to the
934 * left
935 *
936  CALL dgerv2d( ictxt, int_one, int_one,
937  $ af( work_u+odd_size+2+1 ), int_one, 0,
938  $ mycol-level_dist / 2 )
939 *
940 * Receive offdiagonal block(s) from proc level_dist/2 to the
941 * left
942 *
943  CALL dgerv2d( ictxt, int_one, int_one, af( odd_size+2+1 ),
944  $ int_one, 0, mycol-level_dist / 2 )
945 *
946  END IF
947 *
948 *
949  IF( info.EQ.0 ) THEN
950 *
951 * Use diagonal block(s) to modify this offdiagonal block
952 *
953  af( odd_size+3 ) = af( odd_size+3 ) / ( af( odd_size+2 ) )
954 *
955 *
956  END IF
957 * End of "if( info.eq.0 ) then"
958 *
959 * Use offdiag block(s) to calculate modification to diag block
960 * of processor to the left
961 *
962  work( 1 ) = -one*af( odd_size+3 )*( af( work_u+odd_size+3 ) )
963 *
964 * Send contribution to diagonal block's owning processor.
965 *
966  CALL dgesd2d( ictxt, int_one, int_one, work( 1 ), int_one, 0,
967  $ mycol-level_dist )
968 *
969 * *******************************************************
970 *
971  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
972 *
973 * Decide which processor offdiagonal block(s) goes to
974 *
975  IF( ( mod( mycol / ( 2*level_dist ), 2 ) ).EQ.0 ) THEN
976  comm_proc = mycol + level_dist
977  ELSE
978  comm_proc = mycol - level_dist
979  END IF
980 *
981 * Use offdiagonal blocks to calculate offdiag
982 * block to send to neighboring processor. Depending
983 * on circumstances, may need to transpose the matrix.
984 *
985  work( 1 ) = -one*af( work_u+odd_size+3 )*af( odd_size+1 )
986 *
987 * Send contribution to offdiagonal block's owning processor.
988 *
989  CALL dgesd2d( ictxt, int_one, int_one, work( 1 ), int_one,
990  $ 0, comm_proc )
991 *
992  work( 1 ) = -one*af( odd_size+3 )*af( work_u+odd_size+1 )
993 *
994 * Send contribution to offdiagonal block's owning processor.
995 *
996  CALL dgesd2d( ictxt, int_one, int_one, work( 1 ), int_one,
997  $ 0, comm_proc )
998 *
999  END IF
1000 *
1001  END IF
1002 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1003 *
1004  50 CONTINUE
1005 *
1006 *
1007  60 CONTINUE
1008 *
1009 *
1010 * Free BLACS space used to hold standard-form grid.
1011 *
1012  IF( ictxt_save.NE.ictxt_new ) THEN
1013  CALL blacs_gridexit( ictxt_new )
1014  END IF
1015 *
1016  70 CONTINUE
1017 *
1018 * Restore saved input parameters
1019 *
1020  ictxt = ictxt_save
1021  np = np_save
1022 *
1023 * Output minimum worksize
1024 *
1025  work( 1 ) = work_size_min
1026 *
1027 * Make INFO consistent across processors
1028 *
1029  CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info, -1, 0,
1030  $ 0 )
1031 *
1032  IF( mycol.EQ.0 ) THEN
1033  CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
1034  ELSE
1035  CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
1036  END IF
1037 *
1038 *
1039  RETURN
1040 *
1041 * End of PDDTTRF
1042 *
1043  END