ScaLAPACK 2.1: Scalable Linear Algebra PACKage
pcpttrsv.f
1  SUBROUTINE pcpttrsv( UPLO, TRANS, N, NRHS, D, E, JA, DESCA, B, IB,
2  $ DESCB, AF, LAF, WORK, LWORK, INFO )
3 *
4 *
5 *
6 * -- ScaLAPACK routine (version 1.7) --
7 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
8 * and University of California, Berkeley.
9 * November 15, 1997
10 *
11 * .. Scalar Arguments ..
12  CHARACTER TRANS, UPLO
13  INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS
14 * ..
15 * .. Array Arguments ..
16  INTEGER DESCA( * ), DESCB( * )
17  COMPLEX AF( * ), B( * ), E( * ), WORK( * )
18  REAL D( * )
19 * ..
20 *
21 *
22 * Purpose
23 * =======
24 *
25 * PCPTTRSV solves a tridiagonal triangular system of linear equations
26 *
27 * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
28 * or
29 * A(1:N, JA:JA+N-1)^H * X = B(IB:IB+N-1, 1:NRHS)
30 *
31 * where A(1:N, JA:JA+N-1) is a tridiagonal
32 * triangular matrix factor produced by the
33 * Cholesky factorization code PCPTTRF
34 * and is stored in A(1:N,JA:JA+N-1) and AF.
35 * The matrix stored in A(1:N, JA:JA+N-1) is either
36 * upper or lower triangular according to UPLO,
37 * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^H
38 * is selected by the user via the parameter TRANS.
39 *
40 * Routine PCPTTRF MUST be called first.
41 *
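*
*  [Editorial example; not part of the original ScaLAPACK source.]
*  A minimal sketch of the intended calling sequence. The PCPTTRF
*  argument list shown here is an assumption; consult the PCPTTRF
*  header for its exact interface.
*
*     Factor the Hermitian positive definite tridiagonal matrix:
*     CALL PCPTTRF( N, D, E, JA, DESCA, AF, LAF, WORK, LWORK, INFO )
*
*     Solve with the resulting lower triangular factor (UPLO = 'L'):
*     IF( INFO.EQ.0 )
*    $   CALL PCPTTRSV( 'L', 'N', N, NRHS, D, E, JA, DESCA, B, IB,
*    $                  DESCB, AF, LAF, WORK, LWORK, INFO )
*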
42 * =====================================================================
43 *
44 * Arguments
45 * =========
46 *
47 * UPLO (global input) CHARACTER
48 * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored;
49 * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored.
50 *
51 * TRANS (global input) CHARACTER
52 * = 'N': Solve with A(1:N, JA:JA+N-1);
53 * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) );
54 *
55 * N (global input) INTEGER
56 * The number of rows and columns to be operated on, i.e. the
57 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
58 *
59 * NRHS (global input) INTEGER
60 * The number of right hand sides, i.e., the number of columns
61 * of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
62 * NRHS >= 0.
63 *
64 * D (local input/local output) REAL pointer to local
65 * part of global vector storing the main diagonal of the
66 * matrix.
67 * On exit, this array contains information on the
68 * factors of the matrix.
69 * Must be of size >= DESCA( NB_ ).
70 *
71 * E (local input/local output) COMPLEX pointer to local
72 * part of global vector storing the upper diagonal of the
73 * matrix. Globally, E(n) is not referenced, and E must be
74 * aligned with D.
75 * On exit, this array contains information on the
76 * factors of the matrix.
77 * Must be of size >= DESCA( NB_ ).
78 *
79 * JA (global input) INTEGER
80 * The index in the global array A that points to the start of
81 * the matrix to be operated on (which may be either all of A
82 * or a submatrix of A).
83 *
84 * DESCA (global and local input) INTEGER array of dimension DLEN.
85 * if 1D type (DTYPE_A=501 or 502), DLEN >= 7;
86 * if 2D type (DTYPE_A=1), DLEN >= 9.
87 * The array descriptor for the distributed matrix A.
88 * Contains information of mapping of A to memory. Please
89 * see NOTES below for full description and options.
90 *
91 * B (local input/local output) COMPLEX pointer into
92 * local memory to an array of local leading dimension LLD_B >= NB.
93 * On entry, this array contains the
94 * local pieces of the right hand sides
95 * B(IB:IB+N-1, 1:NRHS).
96 * On exit, this contains the local piece of the
97 * distributed solution matrix X.
98 *
99 * IB (global input) INTEGER
100 * The row index in the global array B that points to the first
101 * row of the matrix to be operated on (which may be either
102 * all of B or a submatrix of B).
103 *
104 * DESCB (global and local input) INTEGER array of dimension DLEN.
105 * if 1D type (DTYPE_B=502), DLEN >=7;
106 * if 2D type (DTYPE_B=1), DLEN >= 9.
107 * The array descriptor for the distributed matrix B.
108 * Contains information of mapping of B to memory. Please
109 * see NOTES below for full description and options.
110 *
111 * AF (local input) COMPLEX array, dimension LAF.
112 * Auxiliary Fillin Space.
113 * Fillin is created during the factorization routine
114 * PCPTTRF and this is stored in AF. If a linear system
115 * is to be solved using PCPTTRS after the factorization
116 * routine, AF *must not be altered* after the factorization.
117 *
118 * LAF (local input) INTEGER
119 * Size of user-input Auxiliary Fillin space AF. Must be >=
120 * (NB+2).
121 * If LAF is not large enough, an error code will be returned
122 * and the minimum acceptable size will be returned in AF( 1 ).
123 *
124 * WORK (local workspace/local output)
125 * COMPLEX temporary workspace. This space may
126 * be overwritten in between calls to routines. WORK must be
127 * the size given in LWORK.
128 * On exit, WORK( 1 ) contains the minimal LWORK.
129 *
130 * LWORK (local input or global input) INTEGER
131 * Size of user-input workspace WORK.
132 * If LWORK is too small, the minimal acceptable size will be
133 * returned in WORK(1) and an error code is returned. LWORK>=
134 * (10+2*min(100,NRHS))*NPCOL+4*NRHS
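*          [Editorial note, not in the original source.] As a worked
*          example of this bound: with NPCOL = 4 and NRHS = 2,
*          LWORK >= (10+2*min(100,2))*4 + 4*2 = 14*4 + 8 = 64.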
135 *
136 * INFO (local output) INTEGER
137 * = 0: successful exit
138 * < 0: If the i-th argument is an array and the j-th entry had
139 * an illegal value, then INFO = -(i*100+j), if the i-th
140 * argument is a scalar and had an illegal value, then
141 * INFO = -i.
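*          [Editorial note, not in the original source.] For example,
*          an illegal 4th entry of the 8th argument (DESCA) yields
*          INFO = -(8*100+4) = -804, while an illegal scalar 3rd
*          argument (N) yields INFO = -3.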
142 *
143 * =====================================================================
144 *
145 *
146 * Restrictions
147 * ============
148 *
149 * The following are restrictions on the input parameters. Some of these
150 * are temporary and will be removed in future releases, while others
151 * may reflect fundamental technical limitations.
152 *
153 * Non-cyclic restriction: VERY IMPORTANT!
154 * P*NB>= mod(JA-1,NB)+N.
155 * The mapping for matrices must be blocked, reflecting the nature
156 * of the divide and conquer algorithm as a task-parallel algorithm.
157 * This formula in words is: no processor may have more than one
158 * chunk of the matrix.
159 *
160 * Blocksize cannot be too small:
161 * If the matrix spans more than one processor, the following
162 * restriction on NB, the size of each block on each processor,
163 * must hold:
164 * NB >= 2
165 * The bulk of parallel computation is done on the matrix of size
166 * O(NB) on each processor. If this is too small, divide and conquer
167 * is a poor choice of algorithm.
168 *
169 * Submatrix reference:
170 * JA = IB
171 * Alignment restriction that prevents unnecessary communication.
172 *
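*  [Editorial example, not part of the original source.] A quick check
*  of these restrictions for P = 4 processes, NB = 64, JA = IB = 1 and
*  N = 200: the non-cyclic condition holds since
*  P*NB = 256 >= mod(JA-1,NB)+N = 200, the blocksize condition holds
*  since NB = 64 >= 2, and the alignment condition holds since JA = IB.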
173 *
174 * =====================================================================
175 *
176 *
177 * Notes
178 * =====
179 *
180 * If the factorization routine and the solve routine are to be called
181 * separately (to solve various sets of righthand sides using the same
182 * coefficient matrix), the auxiliary space AF *must not be altered*
183 * between calls to the factorization routine and the solve routine.
184 *
185 * The best algorithm for solving banded and tridiagonal linear systems
186 * depends on a variety of parameters, especially the bandwidth.
187 * Currently, only algorithms designed for the case N/P >> bw are
188 * implemented. These go by many names, including Divide and Conquer,
189 * Partitioning, domain decomposition-type, etc.
190 * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C
191 * algorithms are the appropriate choice.
192 *
193 * Algorithm description: Divide and Conquer
194 *
195 * The Divide and Conquer algorithm assumes the matrix is narrowly
196 * banded compared with the number of equations. In this situation,
197 * it is best to distribute the input matrix A one-dimensionally,
198 * with columns atomic and rows divided amongst the processes.
199 * The basic algorithm divides the tridiagonal matrix up into
200 * P pieces with one stored on each processor,
201 * and then proceeds in 2 phases for the factorization or 3 for the
202 * solution of a linear system.
203 * 1) Local Phase:
204 * The individual pieces are factored independently and in
205 * parallel. These factors are applied to the matrix creating
206 * fillin, which is stored in a non-inspectable way in auxiliary
207 * space AF. Mathematically, this is equivalent to reordering
208 * the matrix A as P A P^T and then factoring the principal
209 * leading submatrix of size equal to the sum of the sizes of
210 * the matrices factored on each processor. The factors of
211 * these submatrices overwrite the corresponding parts of A
212 * in memory.
213 * 2) Reduced System Phase:
214 * A small (P-1) system is formed representing the
215 * interaction of the larger blocks, and is stored (as are its
216 * factors) in the space AF. A parallel Block Cyclic Reduction
217 * algorithm is used. For a linear system, a parallel front solve
218 * followed by an analogous backsolve, both using the structure
219 * of the factored matrix, are performed.
220 * 3) Backsubstitution Phase:
221 * For a linear system, a local backsubstitution is performed on
222 * each processor in parallel.
223 *
224 *
225 * Descriptors
226 * ===========
227 *
228 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
229 *
230 * Note: tridiagonal codes can use either the old two dimensional
231 * or new one-dimensional descriptors, though the processor grid in
232 * both cases *must be one-dimensional*. We describe both types below.
233 *
234 * Each global data object is described by an associated description
235 * vector. This vector stores the information required to establish
236 * the mapping between an object element and its corresponding process
237 * and memory location.
238 *
239 * Let A be a generic term for any 2D block cyclically distributed array.
240 * Such a global array has an associated description vector DESCA.
241 * In the following comments, the character _ should be read as
242 * "of the global array".
243 *
244 * NOTATION STORED IN EXPLANATION
245 * --------------- -------------- --------------------------------------
246 * DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
247 * DTYPE_A = 1.
248 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
249 * the BLACS process grid A is distribu-
250 * ted over. The context itself is glo-
251 * bal, but the handle (the integer
252 * value) may vary.
253 * M_A (global) DESCA( M_ ) The number of rows in the global
254 * array A.
255 * N_A (global) DESCA( N_ ) The number of columns in the global
256 * array A.
257 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
258 * the rows of the array.
259 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
260 * the columns of the array.
261 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
262 * row of the array A is distributed.
263 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
264 * first column of the array A is
265 * distributed.
266 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
267 * array. LLD_A >= MAX(1,LOCr(M_A)).
268 *
269 * Let K be the number of rows or columns of a distributed matrix,
270 * and assume that its process grid has dimension p x q.
271 * LOCr( K ) denotes the number of elements of K that a process
272 * would receive if K were distributed over the p processes of its
273 * process column.
274 * Similarly, LOCc( K ) denotes the number of elements of K that a
275 * process would receive if K were distributed over the q processes of
276 * its process row.
277 * The values of LOCr() and LOCc() may be determined via a call to the
278 * ScaLAPACK tool function, NUMROC:
279 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
280 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
281 * An upper bound for these quantities may be computed by:
282 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
283 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
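*  [Editorial example, not part of the original source.] As a sketch
*  with assumed values M = 1000, MB_A = 64, NPROW = 4, RSRC_A = 0:
*  process row 0 holds LOCr(1000) = NUMROC( 1000, 64, 0, 0, 4 ) = 256
*  rows, process row 3 holds 232, and the upper bound gives
*  ceil( ceil(1000/64)/4 )*64 = 4*64 = 256.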
284 *
285 *
286 * One-dimensional descriptors:
287 *
288 * One-dimensional descriptors are a new addition to ScaLAPACK since
289 * version 1.0. They simplify and shorten the descriptor for 1D
290 * arrays.
291 *
292 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
293 * object, we allow 1D arrays to be distributed either over the
294 * first dimension of the array (as if the grid were P-by-1) or the
295 * 2nd dimension (as if the grid were 1-by-P). This choice is
296 * indicated by the descriptor type (501 or 502)
297 * as described below.
298 * However, for tridiagonal matrices, since the objects being
299 * distributed are the individual vectors storing the diagonals, we
300 * have adopted the convention that both the P-by-1 descriptor and
301 * the 1-by-P descriptor are allowed and are equivalent for
302 * tridiagonal matrices. Thus, for tridiagonal matrices,
303 * DTYPE_A = 501 or 502 can be used interchangeably
304 * without any other change.
305 * We require that the distributed vectors storing the diagonals of a
306 * tridiagonal matrix be aligned with each other. Because of this, a
307 * single descriptor, DESCA, serves to describe the distribution of
308 * all diagonals simultaneously.
309 *
310 * IMPORTANT NOTE: the actual BLACS grid represented by the
311 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
312 * irrespective of which one-dimensional descriptor type
313 * (501 or 502) is input.
314 * This routine will interpret the grid properly either way.
315 * ScaLAPACK routines *do not support intercontext operations* so that
316 * the grid passed to a single ScaLAPACK routine *must be the same*
317 * for all array descriptors passed to that routine.
318 *
319 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
320 * may also be used, since a one-dimensional array is a special case
321 * of a two-dimensional array with one dimension of size unity.
322 * The two-dimensional array used in this case *must* be of the
323 * proper orientation:
324 * If the appropriate one-dimensional descriptor is DTYPE_A=501
325 * (1 by P type), then the two dimensional descriptor must
326 * have a CTXT value that refers to a 1 by P BLACS grid;
327 * If the appropriate one-dimensional descriptor is DTYPE_A=502
328 * (P by 1 type), then the two dimensional descriptor must
329 * have a CTXT value that refers to a P by 1 BLACS grid.
330 *
331 *
332 * Summary of allowed descriptors, types, and BLACS grids:
333 * DTYPE        501         502        1    1
334 * BLACS grid   1xP or Px1  1xP or Px1 1xP  Px1
335 * -----------------------------------------------------
336 * A            OK          OK         OK   NO
337 * B            NO          OK         NO   OK
338 *
339 * Note that a consequence of this chart is that it is not possible
340 * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
341 * to opposite requirements for the orientation of the BLACS grid,
342 * and as noted before, the *same* BLACS context must be used in
343 * all descriptors in a single ScaLAPACK subroutine call.
344 *
345 * Let A be a generic term for any 1D block cyclically distributed array.
346 * Such a global array has an associated description vector DESCA.
347 * In the following comments, the character _ should be read as
348 * "of the global array".
349 *
350 * NOTATION STORED IN EXPLANATION
351 * --------------- ---------- ------------------------------------------
352 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
353 * DTYPE_A = 501: 1-by-P grid.
354 * DTYPE_A = 502: P-by-1 grid.
355 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
356 * the BLACS process grid A is distribu-
357 * ted over. The context itself is glo-
358 * bal, but the handle (the integer
359 * value) may vary.
360 * N_A (global) DESCA( 3 ) The size of the array dimension being
361 * distributed.
362 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
363 * the distributed dimension of the array.
364 * SRC_A (global) DESCA( 5 ) The process row or column over which the
365 * first row or column of the array
366 * is distributed.
367 * Ignored DESCA( 6 ) Ignored for tridiagonal matrices.
368 * Reserved DESCA( 7 ) Reserved for future use.
369 *
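*
*  [Editorial sketch; not part of the original source.] One way to fill
*  a type-501 descriptor for the tridiagonal matrix by hand, assuming a
*  BLACS context ICTXT, block size NB, source process 0, and the simple
*  case JA = 1 so the whole stored dimension has size N:
*
*     DESCA( 1 ) = 501
*     DESCA( 2 ) = ICTXT
*     DESCA( 3 ) = N
*     DESCA( 4 ) = NB
*     DESCA( 5 ) = 0
*     DESCA( 6 ) = 0
*     DESCA( 7 ) = 0
*
*  DESCB would be filled analogously with DESCB( 1 ) = 502 and, as
*  checked by this routine, DESCB( 6 ) set to the local leading
*  dimension LLD_B of B (LLD_B >= NB).
*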
370 *
371 *
372 * =====================================================================
373 *
374 * Code Developer: Andrew J. Cleary, University of Tennessee.
375 * Current address: Lawrence Livermore National Labs.
376 * This version released: August, 2001.
377 *
378 * =====================================================================
379 *
380 * ..
381 * .. Parameters ..
382  REAL ONE, ZERO
383  parameter( one = 1.0e+0 )
384  parameter( zero = 0.0e+0 )
385  COMPLEX CONE, CZERO
386  parameter( cone = ( 1.0e+0, 0.0e+0 ) )
387  parameter( czero = ( 0.0e+0, 0.0e+0 ) )
388  INTEGER INT_ONE
389  parameter( int_one = 1 )
390  INTEGER DESCMULT, BIGNUM
391  parameter(descmult = 100, bignum = descmult * descmult)
392  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
393  $ lld_, mb_, m_, nb_, n_, rsrc_
394  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
395  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
396  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
397 * ..
398 * .. Local Scalars ..
399  INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE,
400  $ idum1, idum2, idum3, ja_new, level_dist, llda,
401  $ lldb, mycol, myrow, my_num_cols, nb, np, npcol,
402  $ nprow, np_save, odd_size, part_offset,
403  $ part_size, return_code, store_m_b, store_n_a,
404  $ temp, work_size_min
405 * ..
406 * .. Local Arrays ..
407  INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ),
408  $ param_check( 16, 3 )
409 * ..
410 * .. External Subroutines ..
411  EXTERNAL blacs_get, blacs_gridexit, blacs_gridinfo,
412  $ caxpy, cgemm, cgerv2d, cgesd2d, clamov, cmatadd,
413  $ cpttrsv, ctbtrs, ctrmm, ctrtrs, desc_convert,
414  $ globchk, pxerbla, reshape
415 * ..
416 * .. External Functions ..
417  LOGICAL LSAME
418  INTEGER NUMROC
419  EXTERNAL lsame, numroc
420 * ..
421 * .. Intrinsic Functions ..
422  INTRINSIC conjg, ichar, min, mod
423 * ..
424 * .. Executable Statements ..
425 *
426 * Test the input parameters
427 *
428  info = 0
429 *
430 * Convert descriptor into standard form for easy access to
431 * parameters, check that grid is of right shape.
432 *
433  desca_1xp( 1 ) = 501
434  descb_px1( 1 ) = 502
435 *
436  temp = desca( dtype_ )
437  IF( temp .EQ. 502 ) THEN
438 * Temporarily set the descriptor type to 1xP type
439  desca( dtype_ ) = 501
440  ENDIF
441 *
442  CALL desc_convert( desca, desca_1xp, return_code )
443 *
444  desca( dtype_ ) = temp
445 *
446  IF( return_code .NE. 0) THEN
447  info = -( 8*100 + 2 )
448  ENDIF
449 *
450  CALL desc_convert( descb, descb_px1, return_code )
451 *
452  IF( return_code .NE. 0) THEN
453  info = -( 11*100 + 2 )
454  ENDIF
455 *
456 * Consistency checks for DESCA and DESCB.
457 *
458 * Context must be the same
459  IF( desca_1xp( 2 ) .NE. descb_px1( 2 ) ) THEN
460  info = -( 11*100 + 2 )
461  ENDIF
462 *
463 * These are alignment restrictions that may or may not be removed
464 * in future releases. -Andy Cleary, April 14, 1996.
465 *
466 * Block sizes must be the same
467  IF( desca_1xp( 4 ) .NE. descb_px1( 4 ) ) THEN
468  info = -( 11*100 + 4 )
469  ENDIF
470 *
471 * Source processor must be the same
472 *
473  IF( desca_1xp( 5 ) .NE. descb_px1( 5 ) ) THEN
474  info = -( 11*100 + 5 )
475  ENDIF
476 *
477 * Get values out of descriptor for use in code.
478 *
479  ictxt = desca_1xp( 2 )
480  csrc = desca_1xp( 5 )
481  nb = desca_1xp( 4 )
482  llda = desca_1xp( 6 )
483  store_n_a = desca_1xp( 3 )
484  lldb = descb_px1( 6 )
485  store_m_b = descb_px1( 3 )
486 *
487 * Get grid parameters
488 *
489 *
490  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
491  np = nprow * npcol
492 *
493 *
494 *
495  IF( lsame( uplo, 'U' ) ) THEN
496  idum1 = ichar( 'U' )
497  ELSE IF ( lsame( uplo, 'L' ) ) THEN
498  idum1 = ichar( 'L' )
499  ELSE
500  info = -1
501  END IF
502 *
503  IF( lsame( trans, 'N' ) ) THEN
504  idum2 = ichar( 'N' )
505  ELSE IF ( lsame( trans, 'C' ) ) THEN
506  idum2 = ichar( 'C' )
507  ELSE
508  info = -2
509  END IF
510 *
511  IF( lwork .LT. -1) THEN
512  info = -15
513  ELSE IF ( lwork .EQ. -1 ) THEN
514  idum3 = -1
515  ELSE
516  idum3 = 1
517  ENDIF
518 *
519  IF( n .LT. 0 ) THEN
520  info = -3
521  ENDIF
522 *
523  IF( n+ja-1 .GT. store_n_a ) THEN
524  info = -( 8*100 + 6 )
525  ENDIF
526 *
527  IF( n+ib-1 .GT. store_m_b ) THEN
528  info = -( 11*100 + 3 )
529  ENDIF
530 *
531  IF( lldb .LT. nb ) THEN
532  info = -( 11*100 + 6 )
533  ENDIF
534 *
535  IF( nrhs .LT. 0 ) THEN
536  info = -4
537  ENDIF
538 *
539 * Current alignment restriction
540 *
541  IF( ja .NE. ib) THEN
542  info = -7
543  ENDIF
544 *
545 * Argument checking that is specific to Divide & Conquer routine
546 *
547  IF( nprow .NE. 1 ) THEN
548  info = -( 8*100+2 )
549  ENDIF
550 *
551  IF( n .GT. np*nb-mod( ja-1, nb )) THEN
552  info = -( 3 )
553  CALL pxerbla( ictxt,
554  $ 'PCPTTRSV, D&C alg.: only 1 block per proc',
555  $ -info )
556  RETURN
557  ENDIF
558 *
559  IF((ja+n-1.GT.nb) .AND. ( nb.LT.2*int_one )) THEN
560  info = -( 8*100+4 )
561  CALL pxerbla( ictxt,
562  $ 'PCPTTRSV, D&C alg.: NB too small',
563  $ -info )
564  RETURN
565  ENDIF
566 *
567 *
568  work_size_min =
569  $ int_one*nrhs
570 *
571  work( 1 ) = work_size_min
572 *
573  IF( lwork .LT. work_size_min ) THEN
574  IF( lwork .NE. -1 ) THEN
575  info = -15
576  CALL pxerbla( ictxt,
577  $ 'PCPTTRSV: worksize error',
578  $ -info )
579  ENDIF
580  RETURN
581  ENDIF
582 *
583 * Pack params and positions into arrays for global consistency check
584 *
585  param_check( 16, 1 ) = descb(5)
586  param_check( 15, 1 ) = descb(4)
587  param_check( 14, 1 ) = descb(3)
588  param_check( 13, 1 ) = descb(2)
589  param_check( 12, 1 ) = descb(1)
590  param_check( 11, 1 ) = ib
591  param_check( 10, 1 ) = desca(5)
592  param_check( 9, 1 ) = desca(4)
593  param_check( 8, 1 ) = desca(3)
594  param_check( 7, 1 ) = desca(1)
595  param_check( 6, 1 ) = ja
596  param_check( 5, 1 ) = nrhs
597  param_check( 4, 1 ) = n
598  param_check( 3, 1 ) = idum3
599  param_check( 2, 1 ) = idum2
600  param_check( 1, 1 ) = idum1
601 *
602  param_check( 16, 2 ) = 1105
603  param_check( 15, 2 ) = 1104
604  param_check( 14, 2 ) = 1103
605  param_check( 13, 2 ) = 1102
606  param_check( 12, 2 ) = 1101
607  param_check( 11, 2 ) = 10
608  param_check( 10, 2 ) = 805
609  param_check( 9, 2 ) = 804
610  param_check( 8, 2 ) = 803
611  param_check( 7, 2 ) = 801
612  param_check( 6, 2 ) = 7
613  param_check( 5, 2 ) = 4
614  param_check( 4, 2 ) = 3
615  param_check( 3, 2 ) = 14
616  param_check( 2, 2 ) = 2
617  param_check( 1, 2 ) = 1
618 *
619 * Want to find errors with MIN( ), so if no error, set it to a big
620 * number. If there already is an error, multiply by the
621 * descriptor multiplier.
622 *
623  IF( info.GE.0 ) THEN
624  info = bignum
625  ELSE IF( info.LT.-descmult ) THEN
626  info = -info
627  ELSE
628  info = -info * descmult
629  END IF
630 *
631 * Check consistency across processors
632 *
633  CALL globchk( ictxt, 16, param_check, 16,
634  $ param_check( 1, 3 ), info )
635 *
636 * Prepare output: set info = 0 if no error, and divide by DESCMULT
637 * if error is not in a descriptor entry.
638 *
639  IF( info.EQ.bignum ) THEN
640  info = 0
641  ELSE IF( mod( info, descmult ) .EQ. 0 ) THEN
642  info = -info / descmult
643  ELSE
644  info = -info
645  END IF
646 *
647  IF( info.LT.0 ) THEN
648  CALL pxerbla( ictxt, 'PCPTTRSV', -info )
649  RETURN
650  END IF
651 *
652 * Quick return if possible
653 *
654  IF( n.EQ.0 )
655  $ RETURN
656 *
657  IF( nrhs.EQ.0 )
658  $ RETURN
659 *
660 *
661 * Adjust addressing into matrix space to properly get into
662 * the beginning part of the relevant data
663 *
664  part_offset = nb*( (ja-1)/(npcol*nb) )
665 *
666  IF ( (mycol-csrc) .LT. (ja-part_offset-1)/nb ) THEN
667  part_offset = part_offset + nb
668  ENDIF
669 *
670  IF ( mycol .LT. csrc ) THEN
671  part_offset = part_offset - nb
672  ENDIF
673 *
674 * Form a new BLACS grid (the "standard form" grid) with only procs
675 * holding part of the matrix, of size 1xNP where NP is adjusted,
676 * starting at csrc=0, with JA modified to reflect dropped procs.
677 *
678 * First processor to hold part of the matrix:
679 *
680  first_proc = mod( ( ja-1 )/nb+csrc, npcol )
681 *
682 * Calculate the new JA while dropping off unused processors.
683 *
684  ja_new = mod( ja-1, nb ) + 1
685 *
686 * Save and compute new value of NP
687 *
688  np_save = np
689  np = ( ja_new+n-2 )/nb + 1
690 *
691 * Call utility routine that forms "standard-form" grid
692 *
693  CALL reshape( ictxt, int_one, ictxt_new, int_one,
694  $ first_proc, int_one, np )
695 *
696 * Use new context from standard grid as context.
697 *
698  ictxt_save = ictxt
699  ictxt = ictxt_new
700  desca_1xp( 2 ) = ictxt_new
701  descb_px1( 2 ) = ictxt_new
702 *
703 * Get information about new grid.
704 *
705  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
706 *
707 * Drop out processors that do not have part of the matrix.
708 *
709  IF( myrow .LT. 0 ) THEN
710  GOTO 1234
711  ENDIF
712 *
713 * ********************************
714 * Values reused throughout routine
715 *
716 * User-input value of partition size
717 *
718  part_size = nb
719 *
720 * Number of columns in each processor
721 *
722  my_num_cols = numroc( n, part_size, mycol, 0, npcol )
723 *
724 * Offset in columns to beginning of main partition in each proc
725 *
726  IF ( mycol .EQ. 0 ) THEN
727  part_offset = part_offset+mod( ja_new-1, part_size )
728  my_num_cols = my_num_cols - mod(ja_new-1, part_size )
729  ENDIF
730 *
731 * Size of main (or odd) partition in each processor
732 *
733  odd_size = my_num_cols
734  IF ( mycol .LT. np-1 ) THEN
735  odd_size = odd_size - int_one
736  ENDIF
737 *
738 *
739 *
740 * Begin main code
741 *
742  IF ( lsame( uplo, 'L' ) ) THEN
743 *
744  IF ( lsame( trans, 'N' ) ) THEN
745 *
746 * Frontsolve
747 *
748 *
749 ******************************************
750 * Local computation phase
751 ******************************************
752 *
753 * Use main partition in each processor to solve locally
754 *
755  CALL cpttrsv( uplo, 'N', odd_size, nrhs, d( part_offset+1 ),
756  $ e( part_offset+1 ), b( part_offset+1 ), lldb,
757  $ info )
758 *
759 *
760  IF ( mycol .LT. np-1 ) THEN
761 * Use factorization of odd-even connection block to modify
762 * locally stored portion of right hand side(s)
763 *
764  CALL caxpy( nrhs, -e( part_offset+odd_size ),
765  $ b( part_offset+odd_size ), lldb,
766  $ b( part_offset+odd_size+1 ), lldb )
767 *
768  ENDIF
769 *
770 *
771  IF ( mycol .NE. 0 ) THEN
772 * Use the "spike" fillin to calculate contribution to previous
773 * processor's righthand-side.
774 *
775  CALL cgemm( 'C', 'N', 1, nrhs, odd_size, -cone, af( 1 ),
776  $ odd_size, b( part_offset+1 ), lldb, czero,
777  $ work( 1+int_one-1 ), int_one )
778  ENDIF
779 *
780 *
781 ************************************************
782 * Formation and solution of reduced system
783 ************************************************
784 *
785 *
786 * Send modifications to prior processor's right hand sides
787 *
788  IF( mycol .GT. 0) THEN
789 *
790  CALL cgesd2d( ictxt, int_one, nrhs,
791  $ work( 1 ), int_one,
792  $ 0, mycol - 1 )
793 *
794  ENDIF
795 *
796 * Receive modifications to processor's right hand sides
797 *
798  IF( mycol .LT. npcol-1) THEN
799 *
800  CALL cgerv2d( ictxt, int_one, nrhs,
801  $ work( 1 ), int_one,
802  $ 0, mycol + 1 )
803 *
804 * Combine contribution to locally stored right hand sides
805 *
806  CALL cmatadd( int_one, nrhs, cone,
807  $ work( 1 ), int_one, cone,
808  $ b( part_offset+odd_size + 1 ), lldb )
809 *
810  ENDIF
811 *
812 *
813 * The last processor does not participate in the solution of the
814 * reduced system, having sent its contribution already.
815  IF( mycol .EQ. npcol-1 ) THEN
816  GOTO 14
817  ENDIF
818 *
819 *
820 * *************************************
821 * Modification Loop
822 *
823 * The distance for sending and receiving for each level starts
824 * at 1 for the first level.
825  level_dist = 1
826 *
827 * Do until this proc is needed to modify other procs' equations
828 *
829  12 CONTINUE
830  IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 11
831 *
832 * Receive and add contribution to righthand sides from left
833 *
834  IF( mycol-level_dist .GE. 0 ) THEN
835 *
836  CALL cgerv2d( ictxt, int_one, nrhs,
837  $ work( 1 ),
838  $ int_one, 0, mycol-level_dist )
839 *
840  CALL cmatadd( int_one, nrhs, cone,
841  $ work( 1 ), int_one, cone,
842  $ b( part_offset+odd_size + 1 ), lldb )
843 *
844  ENDIF
845 *
846 * Receive and add contribution to righthand sides from right
847 *
848  IF( mycol+level_dist .LT. npcol-1 ) THEN
849 *
850  CALL cgerv2d( ictxt, int_one, nrhs,
851  $ work( 1 ),
852  $ int_one, 0, mycol+level_dist )
853 *
854  CALL cmatadd( int_one, nrhs, cone,
855  $ work( 1 ), int_one, cone,
856  $ b( part_offset+odd_size + 1 ), lldb )
857 *
858  ENDIF
859 *
860  level_dist = level_dist*2
861 *
862  GOTO 12
863  11 CONTINUE
864 * [End of GOTO Loop]
865 *
866 *
867 *
868 * *********************************
869 * Calculate and use this proc's blocks to modify other procs
870 *
871 * Solve with diagonal block
872 *
873  CALL ctrtrs( 'L', 'N', 'U', int_one, nrhs, af( odd_size+2 ),
874  $ int_one, b( part_offset+odd_size+1 ), lldb, info )
875 *
876  IF( info.NE.0 ) THEN
877  GO TO 1000
878  ENDIF
879 *
880 *
881 *
882 * *********
883  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
884 *
885 * Calculate contribution from this block to next diagonal block
886 *
887  CALL cgemm( 'C', 'N', int_one, nrhs, int_one, -cone,
888  $ af( (odd_size)*1+1 ),
889  $ int_one,
890  $ b( part_offset+odd_size+1 ),
891  $ lldb, czero,
892  $ work( 1 ),
893  $ int_one )
894 *
895 * Send contribution to diagonal block's owning processor.
896 *
897  CALL cgesd2d( ictxt, int_one, nrhs,
898  $ work( 1 ),
899  $ int_one, 0, mycol+level_dist )
900 *
901  ENDIF
902 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
903 *
904 * ************
905  IF( (mycol/level_dist .GT. 0 ).AND.
906  $ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
907 *
908 *
909 * Use offdiagonal block to calculate modification to diag block
910 * of processor to the left
911 *
912  CALL cgemm( 'N', 'N', int_one, nrhs, int_one, -cone,
913  $ af( odd_size*1+2+1 ),
914  $ int_one,
915  $ b( part_offset+odd_size+1 ),
916  $ lldb, czero,
917  $ work( 1 ),
918  $ int_one )
919 *
920 * Send contribution to diagonal block's owning processor.
921 *
922  CALL cgesd2d( ictxt, int_one, nrhs,
923  $ work( 1 ),
924  $ int_one, 0, mycol-level_dist )
925 *
926  ENDIF
927 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
928 *
929  14 CONTINUE
930 *
931  ELSE
932 *
933 ******************** BACKSOLVE *************************************
934 *
935 ********************************************************************
936 * .. Begin reduced system phase of algorithm ..
937 ********************************************************************
938 *
939 *
940 *
941 * The last processor does not participate in the solution of the
942 * reduced system and just waits to receive its solution.
943  IF( mycol .EQ. npcol-1 ) THEN
944  GOTO 24
945  ENDIF
946 *
947 * Determine number of steps in tree loop
948 *
949  level_dist = 1
950  27 CONTINUE
951  IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 26
952 *
953  level_dist = level_dist*2
954 *
955  GOTO 27
956  26 CONTINUE
957 *
958 *
959  IF( (mycol/level_dist .GT. 0 ).AND.
960  $ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
961 *
962 * Receive solution from processor to left
963 *
964  CALL cgerv2d( ictxt, int_one, nrhs,
965  $ work( 1 ),
966  $ int_one, 0, mycol-level_dist )
967 *
968 *
969 * Use offdiagonal block to calculate modification to RHS stored
970 * on this processor
971 *
972  CALL cgemm( 'C', 'N', int_one, nrhs, int_one, -cone,
973  $ af( odd_size*1+2+1 ),
974  $ int_one,
975  $ work( 1 ),
976  $ int_one, cone,
977  $ b( part_offset+odd_size+1 ),
978  $ lldb )
979  ENDIF
980 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
981 *
982 *
983  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
984 *
985 * Receive solution from processor to right
986 *
987  CALL cgerv2d( ictxt, int_one, nrhs,
988  $ work( 1 ),
989  $ int_one, 0, mycol+level_dist )
990 *
991 * Calculate contribution from this block to next diagonal block
992 *
993  CALL cgemm( 'N', 'N', int_one, nrhs, int_one, -cone,
994  $ af( (odd_size)*1+1 ),
995  $ int_one,
996  $ work( 1 ),
997  $ int_one, cone,
998  $ b( part_offset+odd_size+1 ),
999  $ lldb )
1000 *
1001  ENDIF
1002 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1003 *
1004 *
1005 * Solve with diagonal block
1006 *
1007  CALL ctrtrs( 'L', 'C', 'U', int_one, nrhs, af( odd_size+2 ),
1008  $ int_one, b( part_offset+odd_size+1 ), lldb, info )
1009 *
1010  IF( info.NE.0 ) THEN
1011  GO TO 1000
1012  ENDIF
1013 *
1014 *
1015 *
1016 ***Modification Loop *******
1017 *
1018  22 CONTINUE
1019  IF( level_dist .EQ. 1 ) GOTO 21
1020 *
1021  level_dist = level_dist/2
1022 *
1023 * Send solution to the right
1024 *
1025  IF( mycol+level_dist .LT. npcol-1 ) THEN
1026 *
1027  CALL cgesd2d( ictxt, int_one, nrhs,
1028  $ b( part_offset+odd_size+1 ),
1029  $ lldb, 0, mycol+level_dist )
1030 *
1031  ENDIF
1032 *
1033 * Send solution to left
1034 *
1035  IF( mycol-level_dist .GE. 0 ) THEN
1036 *
1037  CALL cgesd2d( ictxt, int_one, nrhs,
1038  $ b( part_offset+odd_size+1 ),
1039  $ lldb, 0, mycol-level_dist )
1040 *
1041  ENDIF
1042 *
1043  GOTO 22
1044  21 CONTINUE
1045 * [End of GOTO Loop]
1046 *
1047  24 CONTINUE
1048 * [Processor npcol - 1 jumped to here to await next stage]
1049 *
1050 *******************************
1051 * Reduced system has been solved, communicate solutions to nearest
1052 * neighbors in preparation for local computation phase.
1053 *
1054 *
1055 * Send elements of solution to next proc
1056 *
1057  IF( mycol .LT. npcol-1) THEN
1058 *
1059  CALL cgesd2d( ictxt, int_one, nrhs,
1060  $ b( part_offset+odd_size+1 ), lldb,
1061  $ 0, mycol +1 )
1062 *
1063  ENDIF
1064 *
1065 * Receive modifications to processor's right hand sides
1066 *
1067  IF( mycol .GT. 0) THEN
1068 *
1069  CALL cgerv2d( ictxt, int_one, nrhs,
1070  $ work( 1 ), int_one,
1071  $ 0, mycol - 1 )
1072 *
1073  ENDIF
1074 *
1075 *
1076 *
1077 **********************************************
1078 * Local computation phase
1079 **********************************************
1080 *
1081  IF ( mycol .NE. 0 ) THEN
1082 * Use the "spike" fillin to calculate contribution from previous
1083 * processor's solution.
1084 *
1085  CALL cgemm( 'N', 'N', odd_size, nrhs, 1, -cone, af( 1 ),
1086  $ odd_size, work( 1+int_one-1 ), int_one, cone,
1087  $ b( part_offset+1 ), lldb )
1088 *
1089  ENDIF
1090 *
1091 *
1092  IF ( mycol .LT. np-1 ) THEN
1093 * Use factorization of odd-even connection block to modify
1094 * locally stored portion of right hand side(s)
1095 *
1096  CALL caxpy( nrhs, -conjg( e( part_offset+odd_size ) ),
1097  $ b( part_offset+odd_size+1 ), lldb,
1098  $ b( part_offset+odd_size ), lldb )
1099 *
1100  ENDIF
1101 *
1102 * Use main partition in each processor to solve locally
1103 *
1104  CALL cpttrsv( uplo, 'C', odd_size, nrhs, d( part_offset+1 ),
1105  $ e( part_offset+1 ), b( part_offset+1 ), lldb,
1106  $ info )
1107 *
1108  ENDIF
1109 * End of "IF( LSAME( TRANS, 'N' ) )"...
1110 *
1111 *
1112  ELSE
1113 ***************************************************************
1114 * CASE UPLO = 'U' *
1115 ***************************************************************
1116  IF ( lsame( trans, 'C' ) ) THEN
1117 *
1118 * Frontsolve
1119 *
1120 *
1121 ******************************************
1122 * Local computation phase
1123 ******************************************
1124 *
1125 * Use main partition in each processor to solve locally
1126 *
1127  CALL cpttrsv( uplo, 'C', odd_size, nrhs, d( part_offset+1 ),
1128  $ e( part_offset+1 ), b( part_offset+1 ), lldb,
1129  $ info )
1130 *
1131 *
1132  IF ( mycol .LT. np-1 ) THEN
1133 * Use factorization of odd-even connection block to modify
1134 * locally stored portion of right hand side(s)
1135 *
1136  CALL caxpy( nrhs, -conjg( e( part_offset+odd_size ) ),
1137  $ b( part_offset+odd_size ), lldb,
1138  $ b( part_offset+odd_size+1 ), lldb )
1139 *
1140  ENDIF
1141 *
1142 *
1143  IF ( mycol .NE. 0 ) THEN
1144 * Use the "spike" fillin to calculate contribution to previous
1145 * processor's righthand-side.
1146 *
1147  CALL cgemm( 'T', 'N', 1, nrhs, odd_size, -cone, af( 1 ),
1148  $ odd_size, b( part_offset+1 ), lldb, czero,
1149  $ work( 1+int_one-1 ), int_one )
1150  ENDIF
1151 *
1152 *
1153 ************************************************
1154 * Formation and solution of reduced system
1155 ************************************************
1156 *
1157 *
1158 * Send modifications to prior processor's right hand sides
1159 *
1160  IF( mycol .GT. 0) THEN
1161 *
1162  CALL cgesd2d( ictxt, int_one, nrhs,
1163  $ work( 1 ), int_one,
1164  $ 0, mycol - 1 )
1165 *
1166  ENDIF
1167 *
1168 * Receive modifications to processor's right hand sides
1169 *
1170  IF( mycol .LT. npcol-1) THEN
1171 *
1172  CALL cgerv2d( ictxt, int_one, nrhs,
1173  $ work( 1 ), int_one,
1174  $ 0, mycol + 1 )
1175 *
1176 * Combine contribution to locally stored right hand sides
1177 *
1178  CALL cmatadd( int_one, nrhs, cone,
1179  $ work( 1 ), int_one, cone,
1180  $ b( part_offset+odd_size + 1 ), lldb )
1181 *
1182  ENDIF
1183 *
1184 *
1185 * The last processor does not participate in the solution of the
1186 * reduced system, having sent its contribution already.
1187  IF( mycol .EQ. npcol-1 ) THEN
1188  GOTO 44
1189  ENDIF
1190 *
1191 *
1192 * *************************************
1193 * Modification Loop
1194 *
1195 * The distance for sending and receiving for each level starts
1196 * at 1 for the first level.
1197  level_dist = 1
1198 *
1199 * Do until this proc is needed to modify other procs' equations
1200 *
1201  42 CONTINUE
1202  IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 41
1203 *
1204 * Receive and add contribution to righthand sides from left
1205 *
1206  IF( mycol-level_dist .GE. 0 ) THEN
1207 *
1208  CALL cgerv2d( ictxt, int_one, nrhs,
1209  $ work( 1 ),
1210  $ int_one, 0, mycol-level_dist )
1211 *
1212  CALL cmatadd( int_one, nrhs, cone,
1213  $ work( 1 ), int_one, cone,
1214  $ b( part_offset+odd_size + 1 ), lldb )
1215 *
1216  ENDIF
1217 *
1218 * Receive and add contribution to righthand sides from right
1219 *
1220  IF( mycol+level_dist .LT. npcol-1 ) THEN
1221 *
1222  CALL cgerv2d( ictxt, int_one, nrhs,
1223  $ work( 1 ),
1224  $ int_one, 0, mycol+level_dist )
1225 *
1226  CALL cmatadd( int_one, nrhs, cone,
1227  $ work( 1 ), int_one, cone,
1228  $ b( part_offset+odd_size + 1 ), lldb )
1229 *
1230  ENDIF
1231 *
1232  level_dist = level_dist*2
1233 *
1234  GOTO 42
1235  41 CONTINUE
1236 * [End of GOTO Loop]
1237 *
1238 *
1239 *
1240 * *********************************
1241 * Calculate and use this proc's blocks to modify other procs
1242 *
1243 * Solve with diagonal block
1244 *
1245  CALL ctrtrs( 'L', 'N', 'U', int_one, nrhs, af( odd_size+2 ),
1246  $ int_one, b( part_offset+odd_size+1 ), lldb, info )
1247 *
1248  IF( info.NE.0 ) THEN
1249  GO TO 1000
1250  ENDIF
1251 *
1252 *
1253 *
1254 * *********
1255  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
1256 *
1257 * Calculate contribution from this block to next diagonal block
1258 *
1259  CALL cgemm( 'T', 'N', int_one, nrhs, int_one, -cone,
1260  $ af( (odd_size)*1+1 ),
1261  $ int_one,
1262  $ b( part_offset+odd_size+1 ),
1263  $ lldb, czero,
1264  $ work( 1 ),
1265  $ int_one )
1266 *
1267 * Send contribution to diagonal block's owning processor.
1268 *
1269  CALL cgesd2d( ictxt, int_one, nrhs,
1270  $ work( 1 ),
1271  $ int_one, 0, mycol+level_dist )
1272 *
1273  ENDIF
1274 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1275 *
1276 * ************
1277  IF( (mycol/level_dist .GT. 0 ).AND.
1278  $ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
1279 *
1280 *
1281 * Use offdiagonal block to calculate modification to diag block
1282 * of processor to the left
1283 *
1284  CALL cgemm( 'C', 'N', int_one, nrhs, int_one, -cone,
1285  $ af( odd_size*1+2+1 ),
1286  $ int_one,
1287  $ b( part_offset+odd_size+1 ),
1288  $ lldb, czero,
1289  $ work( 1 ),
1290  $ int_one )
1291 *
1292 * Send contribution to diagonal block's owning processor.
1293 *
1294  CALL cgesd2d( ictxt, int_one, nrhs,
1295  $ work( 1 ),
1296  $ int_one, 0, mycol-level_dist )
1297 *
1298  ENDIF
1299 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1300 *
1301  44 CONTINUE
1302 *
1303  ELSE
1304 *
1305 ******************** BACKSOLVE *************************************
1306 *
1307 ********************************************************************
1308 * .. Begin reduced system phase of algorithm ..
1309 ********************************************************************
1310 *
1311 *
1312 *
1313 * The last processor does not participate in the solution of the
1314 * reduced system and just waits to receive its solution.
1315  IF( mycol .EQ. npcol-1 ) THEN
1316  GOTO 54
1317  ENDIF
1318 *
1319 * Determine number of steps in tree loop
1320 *
1321  level_dist = 1
1322  57 CONTINUE
1323  IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 56
1324 *
1325  level_dist = level_dist*2
1326 *
1327  GOTO 57
1328  56 CONTINUE
1329 *
1330 *
1331  IF( (mycol/level_dist .GT. 0 ).AND.
1332  $ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
1333 *
1334 * Receive solution from processor to left
1335 *
1336  CALL cgerv2d( ictxt, int_one, nrhs,
1337  $ work( 1 ),
1338  $ int_one, 0, mycol-level_dist )
1339 *
1340 *
1341 * Use offdiagonal block to calculate modification to RHS stored
1342 * on this processor
1343 *
1344  CALL cgemm( 'T', 'N', int_one, nrhs, int_one, -cone,
1345  $ af( odd_size*1+2+1 ),
1346  $ int_one,
1347  $ work( 1 ),
1348  $ int_one, cone,
1349  $ b( part_offset+odd_size+1 ),
1350  $ lldb )
1351  ENDIF
1352 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1353 *
1354 *
1355  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
1356 *
1357 * Receive solution from processor to right
1358 *
1359  CALL cgerv2d( ictxt, int_one, nrhs,
1360  $ work( 1 ),
1361  $ int_one, 0, mycol+level_dist )
1362 *
1363 * Calculate contribution from this block to next diagonal block
1364 *
1365  CALL cgemm( 'C', 'N', int_one, nrhs, int_one, -cone,
1366  $ af( (odd_size)*1+1 ),
1367  $ int_one,
1368  $ work( 1 ),
1369  $ int_one, cone,
1370  $ b( part_offset+odd_size+1 ),
1371  $ lldb )
1372 *
1373  ENDIF
1374 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1375 *
1376 *
1377 * Solve with diagonal block
1378 *
1379  CALL ctrtrs( 'L', 'C', 'U', int_one, nrhs, af( odd_size+2 ),
1380  $ int_one, b( part_offset+odd_size+1 ), lldb, info )
1381 *
1382  IF( info.NE.0 ) THEN
1383  GO TO 1000
1384  ENDIF
1385 *
1386 *
1387 *
1388 ***Modification Loop *******
1389 *
1390  52 CONTINUE
1391  IF( level_dist .EQ. 1 ) GOTO 51
1392 *
1393  level_dist = level_dist/2
1394 *
1395 * Send solution to the right
1396 *
1397  IF( mycol+level_dist .LT. npcol-1 ) THEN
1398 *
1399  CALL cgesd2d( ictxt, int_one, nrhs,
1400  $ b( part_offset+odd_size+1 ),
1401  $ lldb, 0, mycol+level_dist )
1402 *
1403  ENDIF
1404 *
1405 * Send solution to left
1406 *
1407  IF( mycol-level_dist .GE. 0 ) THEN
1408 *
1409  CALL cgesd2d( ictxt, int_one, nrhs,
1410  $ b( part_offset+odd_size+1 ),
1411  $ lldb, 0, mycol-level_dist )
1412 *
1413  ENDIF
1414 *
1415  GOTO 52
1416  51 CONTINUE
1417 * [End of GOTO Loop]
1418 *
1419  54 CONTINUE
1420 * [Processor npcol - 1 jumped to here to await next stage]
1421 *
1422 *******************************
1423 * Reduced system has been solved, communicate solutions to nearest
1424 * neighbors in preparation for local computation phase.
1425 *
1426 *
1427 * Send elements of solution to next proc
1428 *
1429  IF( mycol .LT. npcol-1) THEN
1430 *
1431  CALL cgesd2d( ictxt, int_one, nrhs,
1432  $ b( part_offset+odd_size+1 ), lldb,
1433  $ 0, mycol +1 )
1434 *
1435  ENDIF
1436 *
1437 * Receive modifications to processor's right hand sides
1438 *
1439  IF( mycol .GT. 0) THEN
1440 *
1441  CALL cgerv2d( ictxt, int_one, nrhs,
1442  $ work( 1 ), int_one,
1443  $ 0, mycol - 1 )
1444 *
1445  ENDIF
1446 *
1447 *
1448 *
1449 **********************************************
1450 * Local computation phase
1451 **********************************************
1452 *
1453  IF ( mycol .NE. 0 ) THEN
1454 * Use the "spike" fillin to calculate contribution from previous
1455 * processor's solution.
1456 *
1457  CALL cgemm( 'C', 'N', odd_size, nrhs, 1, -cone, af( 1 ),
1458  $ int_one, work( 1 ), int_one, cone,
1459  $ b( part_offset+1 ), lldb )
1460 *
1461  ENDIF
1462 *
1463 *
1464  IF ( mycol .LT. np-1 ) THEN
1465 * Use factorization of odd-even connection block to modify
1466 * locally stored portion of right hand side(s)
1467 *
1468  CALL caxpy( nrhs, -e( part_offset+odd_size ),
1469  $ b( part_offset+odd_size+1 ), lldb,
1470  $ b( part_offset+odd_size ), lldb )
1471 *
1472  ENDIF
1473 *
1474 * Use main partition in each processor to solve locally
1475 *
1476  CALL cpttrsv( uplo, 'N', odd_size, nrhs, d( part_offset+1 ),
1477  $ e( part_offset+1 ), b( part_offset+1 ), lldb,
1478  $ info )
1479 *
1480  ENDIF
1481 * End of "IF( LSAME( TRANS, 'N' ) )"...
1482 *
1483 *
1484  ENDIF
1485 * End of "IF( LSAME( UPLO, 'L' ) )"...
1486  1000 CONTINUE
1487 *
1488 *
1489 * Free BLACS space used to hold standard-form grid.
1490 *
1491  IF( ictxt_save .NE. ictxt_new ) THEN
1492  CALL blacs_gridexit( ictxt_new )
1493  ENDIF
1494 *
1495  1234 CONTINUE
1496 *
1497 * Restore saved input parameters
1498 *
1499  ictxt = ictxt_save
1500  np = np_save
1501 *
1502 * Output minimum worksize
1503 *
1504  work( 1 ) = work_size_min
1505 *
1506 *
1507  RETURN
1508 *
1509 * End of PCPTTRSV
1510 *
1511  END
Referenced routines and definitions:
  globchk       subroutine globchk(ICTXT, N, X, LDX, IWORK, INFO)                  (pchkxmat.f:403)
  cmatadd       subroutine cmatadd(M, N, ALPHA, A, LDA, BETA, C, LDC)              (cmatadd.f:2)
  cpttrsv       subroutine cpttrsv(UPLO, TRANS, N, NRHS, D, E, B, LDB, INFO)       (cpttrsv.f:3)
  reshape       void reshape(int *context_in, int *major_in, int *context_out, int *major_out, int *first_proc, int *nprow_new, int *npcol_new)   (reshape.c:77)
  desc_convert  subroutine desc_convert(DESC_IN, DESC_OUT, INFO)                   (desc_convert.f:2)
  pcpttrsv      subroutine pcpttrsv(UPLO, TRANS, N, NRHS, D, E, JA, DESCA, B, IB, DESCB, AF, LAF, WORK, LWORK, INFO)   (pcpttrsv.f:3)
  pxerbla       subroutine pxerbla(ICTXT, SRNAME, INFO)                            (pxerbla.f:2)
  min           #define min(A, B)                                                  (pcgemr.c:181)