ScaLAPACK: Scalable Linear Algebra PACKage
pspttrsv.f
1  SUBROUTINE pspttrsv( UPLO, N, NRHS, D, E, JA, DESCA, B, IB, DESCB,
2  $ AF, LAF, WORK, LWORK, INFO )
3 *
4 * -- ScaLAPACK routine (version 1.7) --
5 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
6 * and University of California, Berkeley.
7 * April 3, 2000
8 *
9 * .. Scalar Arguments ..
10  CHARACTER UPLO
11  INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS
12 * ..
13 * .. Array Arguments ..
14  INTEGER DESCA( * ), DESCB( * )
15  REAL AF( * ), B( * ), D( * ), E( * ), WORK( * )
16 * ..
17 *
18 *
19 * Purpose
20 * =======
21 *
22 * PSPTTRSV solves a tridiagonal triangular system of linear equations
23 *
24 * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
25 * or
26 * A(1:N, JA:JA+N-1)^T * X = B(IB:IB+N-1, 1:NRHS)
27 *
28 * where A(1:N, JA:JA+N-1) is a tridiagonal
29 * triangular matrix factor produced by the
30 * Cholesky factorization code PSPTTRF
31 * and is stored in A(1:N,JA:JA+N-1) and AF.
32 * The matrix stored in A(1:N, JA:JA+N-1) is either
33 * upper or lower triangular according to UPLO,
34 * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^T
35 * is dictated by the user through the parameter UPLO.
36 *
37 * Routine PSPTTRF MUST be called first.
38 *
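* A minimal calling sketch, as an illustration only (assuming the
* PSPTTRF interface ( N, D, E, JA, DESCA, AF, LAF, WORK, LWORK,
* INFO ); descriptor setup, workspace sizing and error handling
* are omitted):
*
*    CALL PSPTTRF( N, D, E, JA, DESCA, AF, LAF, WORK, LWORK,
*   $              INFO )
*    IF( INFO.EQ.0 ) THEN
*       CALL PSPTTRSV( 'L', N, NRHS, D, E, JA, DESCA, B, IB,
*   $                  DESCB, AF, LAF, WORK, LWORK, INFO )
*    END IF
*
* This performs a single triangular sweep with the lower factor;
* a complete solution of the original symmetric positive definite
* system is normally obtained through the driver PSPTTRS.
*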
39 * =====================================================================
40 *
41 * Arguments
42 * =========
43 *
44 * UPLO (global input) CHARACTER
45 * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored;
46 * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored.
47 *
48 * N (global input) INTEGER
49 * The number of rows and columns to be operated on, i.e. the
50 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
51 *
52 * NRHS (global input) INTEGER
53 * The number of right hand sides, i.e., the number of columns
54 * of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
55 * NRHS >= 0.
56 *
57 * D (local input/local output) REAL pointer to local
58 * part of global vector storing the main diagonal of the
59 * matrix.
60 * On exit, this array contains information on the
61 * factors of the matrix.
62 * Must be of size >= DESCA( NB_ ).
63 *
64 * E (local input/local output) REAL pointer to local
65 * part of global vector storing the upper diagonal of the
66 * matrix. Globally, E(n) is not referenced, and E must be
67 * aligned with D.
68 * On exit, this array contains information on the
69 * factors of the matrix.
70 * Must be of size >= DESCA( NB_ ).
71 *
72 * JA (global input) INTEGER
73 * The index in the global array A that points to the start of
74 * the matrix to be operated on (which may be either all of A
75 * or a submatrix of A).
76 *
77 * DESCA (global and local input) INTEGER array of dimension DLEN.
78 * if 1D type (DTYPE_A=501 or 502), DLEN >= 7;
79 * if 2D type (DTYPE_A=1), DLEN >= 9.
80 * The array descriptor for the distributed matrix A.
81 * Contains information of mapping of A to memory. Please
82 * see NOTES below for full description and options.
83 *
84 * B (local input/local output) REAL pointer into
85 * local memory to an array of local lead dimension lld_b>=NB.
86 * On entry, this array contains the
87 * local pieces of the right hand sides
88 * B(IB:IB+N-1, 1:NRHS).
89 * On exit, this contains the local piece of the distributed
90 * solution matrix X.
91 *
92 * IB (global input) INTEGER
93 * The row index in the global array B that points to the first
94 * row of the matrix to be operated on (which may be either
95 * all of B or a submatrix of B).
96 *
97 * DESCB (global and local input) INTEGER array of dimension DLEN.
98 * if 1D type (DTYPE_B=502), DLEN >=7;
99 * if 2D type (DTYPE_B=1), DLEN >= 9.
100 * The array descriptor for the distributed matrix B.
101 * Contains information of mapping of B to memory. Please
102 * see NOTES below for full description and options.
103 *
104 * AF (local output) REAL array, dimension LAF.
105 * Auxiliary Fillin Space.
106 * Fillin is created during the factorization routine
107 * PSPTTRF and this is stored in AF. If a linear system
108 * is to be solved using PSPTTRS after the factorization
109 * routine, AF *must not be altered* after the factorization.
110 *
111 * LAF (local input) INTEGER
112 * Size of user-input Auxiliary Fillin space AF. Must be >=
113 * (NB+2)
114 * If LAF is not large enough, an error code will be returned
115 * and the minimum acceptable size will be returned in AF( 1 )
116 *
117 * WORK (local workspace/local output)
118 * REAL temporary workspace. This space may
119 * be overwritten in between calls to routines. WORK must be
120 * the size given in LWORK.
121 * On exit, WORK( 1 ) contains the minimal LWORK.
122 *
123 * LWORK (local input or global input) INTEGER
124 * Size of user-input workspace WORK.
125 * If LWORK is too small, the minimal acceptable size will be
126 * returned in WORK(1) and an error code is returned. LWORK>=
127 * (10+2*min(100,NRHS))*NPCOL+4*NRHS
128 *
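* As a worked instance of this bound (an illustration only): with
* NPCOL = 4 and NRHS = 1,
* LWORK >= (10+2*min(100,1))*4 + 4*1 = 52.
*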
129 * INFO (local output) INTEGER
130 * = 0: successful exit
131 * < 0: If the i-th argument is an array and the j-th entry had
132 * an illegal value, then INFO = -(i*100+j), if the i-th
133 * argument is a scalar and had an illegal value, then
134 * INFO = -i.
135 *
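* A sketch of decoding such a code at the call site (IERR, I and J
* are illustrative local variables, not part of the interface):
*
*    IERR = ABS( INFO )
*    IF( IERR.GE.100 ) THEN
*       I = IERR / 100
*       J = MOD( IERR, 100 )
*    ELSE
*       I = IERR
*    END IF
*
* For example, INFO = -704 indicates that entry 4 of argument 7
* (DESCA) had an illegal value.
*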
136 * =====================================================================
137 *
138 *
139 * Restrictions
140 * ============
141 *
142 * The following are restrictions on the input parameters. Some of these
143 * are temporary and will be removed in future releases, while others
144 * may reflect fundamental technical limitations.
145 *
146 * Non-cyclic restriction: VERY IMPORTANT!
147 * P*NB>= mod(JA-1,NB)+N.
148 * The mapping for matrices must be blocked, reflecting the nature
149 * of the divide and conquer algorithm as a task-parallel algorithm.
150 * This formula in words is: no processor may have more than one
151 * chunk of the matrix.
152 *
153 * Blocksize cannot be too small:
154 * If the matrix spans more than one processor, the following
155 * restriction on NB, the size of each block on each processor,
156 * must hold:
157 * NB >= 2
158 * The bulk of parallel computation is done on the matrix of size
159 * O(NB) on each processor. If this is too small, divide and conquer
160 * is a poor choice of algorithm.
161 *
162 * Submatrix reference:
163 * JA = IB
164 * Alignment restriction that prevents unnecessary communication.
165 *
166 *
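* A hedged sketch of checking these restrictions before the call,
* assuming a 1 x NPCOL process grid so that P = NPCOL, with
* NB = DESCA( 4 ) and OK an illustrative LOGICAL (the NB test
* only matters when the matrix spans more than one process):
*
*    OK = ( JA.EQ.IB ) .AND. ( NB.GE.2 )
*    OK = OK .AND. ( NPCOL*NB.GE.MOD( JA-1, NB )+N )
*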
167 * =====================================================================
168 *
169 *
170 * Notes
171 * =====
172 *
173 * If the factorization routine and the solve routine are to be called
174 * separately (to solve various sets of righthand sides using the same
175 * coefficient matrix), the auxiliary space AF *must not be altered*
176 * between calls to the factorization routine and the solve routine.
177 *
178 * The best algorithm for solving banded and tridiagonal linear systems
179 * depends on a variety of parameters, especially the bandwidth.
180 * Currently, only algorithms designed for the case N/P >> bw are
181 * implemented. These go by many names, including Divide and Conquer,
182 * Partitioning, domain decomposition-type, etc.
183 * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C
184 * algorithms are the appropriate choice.
185 *
186 * Algorithm description: Divide and Conquer
187 *
188 * The Divide and Conquer algorithm assumes the matrix is narrowly
189 * banded compared with the number of equations. In this situation,
190 * it is best to distribute the input matrix A one-dimensionally,
191 * with columns atomic and rows divided amongst the processes.
192 * The basic algorithm divides the tridiagonal matrix up into
193 * P pieces with one stored on each processor,
194 * and then proceeds in 2 phases for the factorization or 3 for the
195 * solution of a linear system.
196 * 1) Local Phase:
197 * The individual pieces are factored independently and in
198 * parallel. These factors are applied to the matrix creating
199 * fillin, which is stored in a non-inspectable way in auxiliary
200 * space AF. Mathematically, this is equivalent to reordering
201 * the matrix A as P A P^T and then factoring the principal
202 * leading submatrix of size equal to the sum of the sizes of
203 * the matrices factored on each processor. The factors of
204 * these submatrices overwrite the corresponding parts of A
205 * in memory.
206 * 2) Reduced System Phase:
207 * A small system of order (P-1) is formed representing the
208 * interaction of the larger blocks, and is stored (as are its
209 * factors) in the space AF. A parallel Block Cyclic Reduction
210 * algorithm is used. For a linear system, a parallel front solve
211 * followed by an analogous backsolve, both using the structure
212 * of the factored matrix, are performed.
213 * 3) Backsubstitution Phase:
214 * For a linear system, a local backsubstitution is performed on
215 * each processor in parallel.
216 *
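* As a concrete sketch of the reduced-system communication pattern
* used in the code below: LEVEL_DIST starts at 1 and doubles at
* each level of the tree, and a process column keeps receiving and
* combining contributions at a given level while
*
*    MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ) .EQ. 0
*
* holds; once this test fails, it solves with its diagonal block
* of the reduced system and sends its contribution to the process
* columns LEVEL_DIST away.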
217 *
218 * Descriptors
219 * ===========
220 *
221 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
222 *
223 * Note: tridiagonal codes can use either the old two dimensional
224 * or new one-dimensional descriptors, though the processor grid in
225 * both cases *must be one-dimensional*. We describe both types below.
226 *
227 * Each global data object is described by an associated description
228 * vector. This vector stores the information required to establish
229 * the mapping between an object element and its corresponding process
230 * and memory location.
231 *
232 * Let A be a generic term for any 2D block cyclically distributed array.
233 * Such a global array has an associated description vector DESCA.
234 * In the following comments, the character _ should be read as
235 * "of the global array".
236 *
237 * NOTATION STORED IN EXPLANATION
238 * --------------- -------------- --------------------------------------
239 * DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
240 * DTYPE_A = 1.
241 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
242 * the BLACS process grid A is distribu-
243 * ted over. The context itself is glo-
244 * bal, but the handle (the integer
245 * value) may vary.
246 * M_A (global) DESCA( M_ ) The number of rows in the global
247 * array A.
248 * N_A (global) DESCA( N_ ) The number of columns in the global
249 * array A.
250 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
251 * the rows of the array.
252 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
253 * the columns of the array.
254 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
255 * row of the array A is distributed.
256 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
257 * first column of the array A is
258 * distributed.
259 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
260 * array. LLD_A >= MAX(1,LOCr(M_A)).
261 *
262 * Let K be the number of rows or columns of a distributed matrix,
263 * and assume that its process grid has dimension p x q.
264 * LOCr( K ) denotes the number of elements of K that a process
265 * would receive if K were distributed over the p processes of its
266 * process column.
267 * Similarly, LOCc( K ) denotes the number of elements of K that a
268 * process would receive if K were distributed over the q processes of
269 * its process row.
270 * The values of LOCr() and LOCc() may be determined via a call to the
271 * ScaLAPACK tool function, NUMROC:
272 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
273 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
274 * An upper bound for these quantities may be computed by:
275 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
276 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
277 *
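* For instance (a sketch only; LOCR, LOCC and LLD_MIN are
* illustrative names, with MYROW, MYCOL, NPROW, NPCOL obtained
* from BLACS_GRIDINFO):
*
*    LOCR = NUMROC( M, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW )
*    LOCC = NUMROC( N, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL )
*    LLD_MIN = MAX( 1, LOCR )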
278 *
279 * One-dimensional descriptors:
280 *
281 * One-dimensional descriptors are a new addition to ScaLAPACK since
282 * version 1.0. They simplify and shorten the descriptor for 1D
283 * arrays.
284 *
285 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
286 * object, we allow 1D arrays to be distributed either over the
287 * first dimension of the array (as if the grid were P-by-1) or the
288 * 2nd dimension (as if the grid were 1-by-P). This choice is
289 * indicated by the descriptor type (501 or 502)
290 * as described below.
291 * However, for tridiagonal matrices, since the objects being
292 * distributed are the individual vectors storing the diagonals, we
293 * have adopted the convention that both the P-by-1 descriptor and
294 * the 1-by-P descriptor are allowed and are equivalent for
295 * tridiagonal matrices. Thus, for tridiagonal matrices,
296 * DTYPE_A = 501 or 502 can be used interchangeably
297 * without any other change.
298 * We require that the distributed vectors storing the diagonals of a
299 * tridiagonal matrix be aligned with each other. Because of this, a
300 * single descriptor, DESCA, serves to describe the distribution
301 * of all diagonals simultaneously.
302 *
303 * IMPORTANT NOTE: the actual BLACS grid represented by the
304 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
305 * irrespective of which one-dimensional descriptor type
306 * (501 or 502) is input.
307 * This routine will interpret the grid properly either way.
308 * ScaLAPACK routines *do not support intercontext operations* so that
309 * the grid passed to a single ScaLAPACK routine *must be the same*
310 * for all array descriptors passed to that routine.
311 *
312 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
313 * may also be used, since a one-dimensional array is a special case
314 * of a two-dimensional array with one dimension of size unity.
315 * The two-dimensional array used in this case *must* be of the
316 * proper orientation:
317 * If the appropriate one-dimensional descriptor is DTYPE_A=501
318 * (1 by P type), then the two dimensional descriptor must
319 * have a CTXT value that refers to a 1 by P BLACS grid;
320 * If the appropriate one-dimensional descriptor is DTYPE_A=502
321 * (P by 1 type), then the two dimensional descriptor must
322 * have a CTXT value that refers to a P by 1 BLACS grid.
323 *
324 *
325 * Summary of allowed descriptors, types, and BLACS grids:
326 * DTYPE 501 502 1 1
327 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
328 * -----------------------------------------------------
329 * A OK OK OK NO
330 * B NO OK NO OK
331 *
332 * Note that a consequence of this chart is that it is not possible
333 * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
334 * to opposite requirements for the orientation of the BLACS grid,
335 * and as noted before, the *same* BLACS context must be used in
336 * all descriptors in a single ScaLAPACK subroutine call.
337 *
338 * Let A be a generic term for any 1D block cyclically distributed array.
339 * Such a global array has an associated description vector DESCA.
340 * In the following comments, the character _ should be read as
341 * "of the global array".
342 *
343 * NOTATION STORED IN EXPLANATION
344 * --------------- ---------- ------------------------------------------
345 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
346 * TYPE_A = 501: 1-by-P grid.
347 * TYPE_A = 502: P-by-1 grid.
348 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
349 * the BLACS process grid A is distribu-
350 * ted over. The context itself is glo-
351 * bal, but the handle (the integer
352 * value) may vary.
353 * N_A (global) DESCA( 3 ) The size of the array dimension being
354 * distributed.
355 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
356 * the distributed dimension of the array.
357 * SRC_A (global) DESCA( 5 ) The process row or column over which the
358 * first row or column of the array
359 * is distributed.
360 * Ignored DESCA( 6 ) Ignored for tridiagonal matrices.
361 * Reserved DESCA( 7 ) Reserved for future use.
362 *
363 *
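* A hedged sketch of filling matching one-dimensional descriptors
* for the tridiagonal matrix (type 501) and the right hand sides
* (type 502), assuming JA = IB = 1, a one-dimensional BLACS grid
* with context handle ICTXT, and LLDB ( >= NB ) the local leading
* dimension of B:
*
*    DESCA( 1 ) = 501
*    DESCA( 2 ) = ICTXT
*    DESCA( 3 ) = N
*    DESCA( 4 ) = NB
*    DESCA( 5 ) = 0
*    DESCA( 6 ) = 0
*    DESCA( 7 ) = 0
*
*    DESCB( 1 ) = 502
*    DESCB( 2 ) = ICTXT
*    DESCB( 3 ) = N
*    DESCB( 4 ) = NB
*    DESCB( 5 ) = 0
*    DESCB( 6 ) = LLDB
*    DESCB( 7 ) = 0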
364 *
365 * =====================================================================
366 *
367 * Code Developer: Andrew J. Cleary, University of Tennessee.
368 * Current address: Lawrence Livermore National Labs.
369 *
370 * =====================================================================
371 *
372 * .. Parameters ..
373  REAL ONE
374  parameter( one = 1.0e+0 )
375  REAL ZERO
376  parameter( zero = 0.0e+0 )
377  INTEGER INT_ONE
378  parameter( int_one = 1 )
379  INTEGER DESCMULT, BIGNUM
380  parameter( descmult = 100, bignum = descmult*descmult )
381  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
382  $ lld_, mb_, m_, nb_, n_, rsrc_
383  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
384  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
385  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
386 * ..
387 * .. Local Scalars ..
388  INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE,
389  $ idum1, idum3, ja_new, level_dist, llda, lldb,
390  $ mycol, myrow, my_num_cols, nb, np, npcol,
391  $ nprow, np_save, odd_size, part_offset,
392  $ part_size, return_code, store_m_b, store_n_a,
393  $ temp, work_size_min
394 * ..
395 * .. Local Arrays ..
396  INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ),
397  $ param_check( 15, 3 )
398 * ..
399 * .. External Subroutines ..
400  EXTERNAL blacs_gridexit, blacs_gridinfo, desc_convert,
401  $ globchk, pxerbla, reshape, saxpy, sgemm,
402  $ sgerv2d, sgesd2d, smatadd, spttrsv, strtrs
403 * ..
404 * .. External Functions ..
405  LOGICAL LSAME
406  INTEGER NUMROC
407  EXTERNAL lsame, numroc
408 * ..
409 * .. Intrinsic Functions ..
410  INTRINSIC ichar, mod
411 * ..
412 * .. Executable Statements ..
413 *
414 * Test the input parameters
415 *
416  info = 0
417 *
418 * Convert descriptor into standard form for easy access to
419 * parameters, check that grid is of right shape.
420 *
421  desca_1xp( 1 ) = 501
422  descb_px1( 1 ) = 502
423 *
424  temp = desca( dtype_ )
425  IF( temp.EQ.502 ) THEN
426 * Temporarily set the descriptor type to 1xP type
427  desca( dtype_ ) = 501
428  END IF
429 *
430  CALL desc_convert( desca, desca_1xp, return_code )
431 *
432  desca( dtype_ ) = temp
433 *
434  IF( return_code.NE.0 ) THEN
435  info = -( 7*100+2 )
436  END IF
437 *
438  CALL desc_convert( descb, descb_px1, return_code )
439 *
440  IF( return_code.NE.0 ) THEN
441  info = -( 10*100+2 )
442  END IF
443 *
444 * Consistency checks for DESCA and DESCB.
445 *
446 * Context must be the same
447  IF( desca_1xp( 2 ).NE.descb_px1( 2 ) ) THEN
448  info = -( 10*100+2 )
449  END IF
450 *
451 * These are alignment restrictions that may or may not be removed
452 * in future releases. -Andy Cleary, April 14, 1996.
453 *
454 * Block sizes must be the same
455  IF( desca_1xp( 4 ).NE.descb_px1( 4 ) ) THEN
456  info = -( 10*100+4 )
457  END IF
458 *
459 * Source processor must be the same
460 *
461  IF( desca_1xp( 5 ).NE.descb_px1( 5 ) ) THEN
462  info = -( 10*100+5 )
463  END IF
464 *
465 * Get values out of descriptor for use in code.
466 *
467  ictxt = desca_1xp( 2 )
468  csrc = desca_1xp( 5 )
469  nb = desca_1xp( 4 )
470  llda = desca_1xp( 6 )
471  store_n_a = desca_1xp( 3 )
472  lldb = descb_px1( 6 )
473  store_m_b = descb_px1( 3 )
474 *
475 * Get grid parameters
476 *
477 *
478  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
479  np = nprow*npcol
480 *
481 *
482 *
483  IF( lsame( uplo, 'U' ) ) THEN
484  idum1 = ichar( 'U' )
485  ELSE IF( lsame( uplo, 'L' ) ) THEN
486  idum1 = ichar( 'L' )
487  ELSE
488  info = -1
489  END IF
490 *
491  IF( lwork.LT.-1 ) THEN
492  info = -14
493  ELSE IF( lwork.EQ.-1 ) THEN
494  idum3 = -1
495  ELSE
496  idum3 = 1
497  END IF
498 *
499  IF( n.LT.0 ) THEN
500  info = -2
501  END IF
502 *
503  IF( n+ja-1.GT.store_n_a ) THEN
504  info = -( 7*100+6 )
505  END IF
506 *
507  IF( n+ib-1.GT.store_m_b ) THEN
508  info = -( 10*100+3 )
509  END IF
510 *
511  IF( lldb.LT.nb ) THEN
512  info = -( 10*100+6 )
513  END IF
514 *
515  IF( nrhs.LT.0 ) THEN
516  info = -3
517  END IF
518 *
519 * Current alignment restriction
520 *
521  IF( ja.NE.ib ) THEN
522  info = -6
523  END IF
524 *
525 * Argument checking that is specific to Divide & Conquer routine
526 *
527  IF( nprow.NE.1 ) THEN
528  info = -( 7*100+2 )
529  END IF
530 *
531  IF( n.GT.np*nb-mod( ja-1, nb ) ) THEN
532  info = -( 2 )
533  CALL pxerbla( ictxt,
534  $ 'PSPTTRSV, D&C alg.: only 1 block per proc',
535  $ -info )
536  RETURN
537  END IF
538 *
539  IF( ( ja+n-1.GT.nb ) .AND. ( nb.LT.2*int_one ) ) THEN
540  info = -( 7*100+4 )
541  CALL pxerbla( ictxt, 'PSPTTRSV, D&C alg.: NB too small',
542  $ -info )
543  RETURN
544  END IF
545 *
546 *
547  work_size_min = int_one*nrhs
548 *
549  work( 1 ) = work_size_min
550 *
551  IF( lwork.LT.work_size_min ) THEN
552  IF( lwork.NE.-1 ) THEN
553  info = -14
554  CALL pxerbla( ictxt, 'PSPTTRSV: worksize error', -info )
555  END IF
556  RETURN
557  END IF
558 *
559 * Pack params and positions into arrays for global consistency check
560 *
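* (Column 1 below holds the values to be checked and column 2
*  their position codes in the INFO convention above: plain
*  argument numbers for scalars, e.g. 14 = LWORK, 9 = IB, and
*  i*100+j for descriptor entries, e.g. 705 = entry 5 of
*  argument 7 (DESCA), 1004 = entry 4 of argument 10 (DESCB),
*  so the consistency check can report which argument disagreed
*  across processes.)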
561  param_check( 15, 1 ) = descb( 5 )
562  param_check( 14, 1 ) = descb( 4 )
563  param_check( 13, 1 ) = descb( 3 )
564  param_check( 12, 1 ) = descb( 2 )
565  param_check( 11, 1 ) = descb( 1 )
566  param_check( 10, 1 ) = ib
567  param_check( 9, 1 ) = desca( 5 )
568  param_check( 8, 1 ) = desca( 4 )
569  param_check( 7, 1 ) = desca( 3 )
570  param_check( 6, 1 ) = desca( 1 )
571  param_check( 5, 1 ) = ja
572  param_check( 4, 1 ) = nrhs
573  param_check( 3, 1 ) = n
574  param_check( 2, 1 ) = idum3
575  param_check( 1, 1 ) = idum1
576 *
577  param_check( 15, 2 ) = 1005
578  param_check( 14, 2 ) = 1004
579  param_check( 13, 2 ) = 1003
580  param_check( 12, 2 ) = 1002
581  param_check( 11, 2 ) = 1001
582  param_check( 10, 2 ) = 9
583  param_check( 9, 2 ) = 705
584  param_check( 8, 2 ) = 704
585  param_check( 7, 2 ) = 703
586  param_check( 6, 2 ) = 701
587  param_check( 5, 2 ) = 6
588  param_check( 4, 2 ) = 3
589  param_check( 3, 2 ) = 2
590  param_check( 2, 2 ) = 14
591  param_check( 1, 2 ) = 1
592 *
593 * Want to find errors with MIN( ), so if no error, set it to a big
594 * number. If there already is an error, multiply by the
595 * descriptor multiplier.
596 *
597  IF( info.GE.0 ) THEN
598  info = bignum
599  ELSE IF( info.LT.-descmult ) THEN
600  info = -info
601  ELSE
602  info = -info*descmult
603  END IF
604 *
605 * Check consistency across processors
606 *
607  CALL globchk( ictxt, 15, param_check, 15, param_check( 1, 3 ),
608  $ info )
609 *
610 * Prepare output: set info = 0 if no error, and divide by DESCMULT
611 * if error is not in a descriptor entry.
612 *
613  IF( info.EQ.bignum ) THEN
614  info = 0
615  ELSE IF( mod( info, descmult ).EQ.0 ) THEN
616  info = -info / descmult
617  ELSE
618  info = -info
619  END IF
620 *
621  IF( info.LT.0 ) THEN
622  CALL pxerbla( ictxt, 'PSPTTRSV', -info )
623  RETURN
624  END IF
625 *
626 * Quick return if possible
627 *
628  IF( n.EQ.0 )
629  $ RETURN
630 *
631  IF( nrhs.EQ.0 )
632  $ RETURN
633 *
634 *
635 * Adjust addressing into matrix space to properly get into
636 * the beginning part of the relevant data
637 *
638  part_offset = nb*( ( ja-1 ) / ( npcol*nb ) )
639 *
640  IF( ( mycol-csrc ).LT.( ja-part_offset-1 ) / nb ) THEN
641  part_offset = part_offset + nb
642  END IF
643 *
644  IF( mycol.LT.csrc ) THEN
645  part_offset = part_offset - nb
646  END IF
647 *
648 * Form a new BLACS grid (the "standard form" grid) with only procs
649 * holding part of the matrix, of size 1xNP where NP is adjusted,
650 * starting at csrc=0, with JA modified to reflect dropped procs.
651 *
652 * First processor to hold part of the matrix:
653 *
654  first_proc = mod( ( ja-1 ) / nb+csrc, npcol )
655 *
656 * Calculate the new JA while dropping off unused processors.
657 *
658  ja_new = mod( ja-1, nb ) + 1
659 *
660 * Save and compute new value of NP
661 *
662  np_save = np
663  np = ( ja_new+n-2 ) / nb + 1
664 *
665 * Call utility routine that forms "standard-form" grid
666 *
667  CALL reshape( ictxt, int_one, ictxt_new, int_one, first_proc,
668  $ int_one, np )
669 *
670 * Use new context from standard grid as context.
671 *
672  ictxt_save = ictxt
673  ictxt = ictxt_new
674  desca_1xp( 2 ) = ictxt_new
675  descb_px1( 2 ) = ictxt_new
676 *
677 * Get information about new grid.
678 *
679  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
680 *
681 * Drop out processors that do not have part of the matrix.
682 *
683  IF( myrow.LT.0 ) THEN
684  GO TO 100
685  END IF
686 *
687 * ********************************
688 * Values reused throughout routine
689 *
690 * User-input value of partition size
691 *
692  part_size = nb
693 *
694 * Number of columns in each processor
695 *
696  my_num_cols = numroc( n, part_size, mycol, 0, npcol )
697 *
698 * Offset in columns to beginning of main partition in each proc
699 *
700  IF( mycol.EQ.0 ) THEN
701  part_offset = part_offset + mod( ja_new-1, part_size )
702  my_num_cols = my_num_cols - mod( ja_new-1, part_size )
703  END IF
704 *
705 * Size of main (or odd) partition in each processor
706 *
707  odd_size = my_num_cols
708  IF( mycol.LT.np-1 ) THEN
709  odd_size = odd_size - int_one
710  END IF
711 *
712 *
713 *
714 * Begin main code
715 *
716  IF( lsame( uplo, 'L' ) ) THEN
717 *
718 *
719 * Frontsolve
720 *
721 *
722 ******************************************
723 * Local computation phase
724 ******************************************
725 *
726 * Use main partition in each processor to solve locally
727 *
728  CALL spttrsv( 'N', odd_size, nrhs, d( part_offset+1 ),
729  $ e( part_offset+1 ), b( part_offset+1 ), lldb,
730  $ info )
731 *
732 *
733  IF( mycol.LT.np-1 ) THEN
734 * Use factorization of odd-even connection block to modify
735 * locally stored portion of right hand side(s)
736 *
737  CALL saxpy( nrhs, -e( part_offset+odd_size ),
738  $ b( part_offset+odd_size ), lldb,
739  $ b( part_offset+odd_size+1 ), lldb )
740 *
741  END IF
742 *
743 *
744  IF( mycol.NE.0 ) THEN
745 * Use the "spike" fillin to calculate contribution to previous
746 * processor's righthand side.
747 *
748  CALL sgemm( 'T', 'N', 1, nrhs, odd_size, -one, af( 1 ),
749  $ odd_size, b( part_offset+1 ), lldb, zero,
750  $ work( 1+int_one-1 ), int_one )
751  END IF
752 *
753 *
754 ************************************************
755 * Formation and solution of reduced system
756 ************************************************
757 *
758 *
759 * Send modifications to prior processor's right hand sides
760 *
761  IF( mycol.GT.0 ) THEN
762 *
763  CALL sgesd2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
764  $ mycol-1 )
765 *
766  END IF
767 *
768 * Receive modifications to processor's right hand sides
769 *
770  IF( mycol.LT.npcol-1 ) THEN
771 *
772  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
773  $ mycol+1 )
774 *
775 * Combine contribution to locally stored right hand sides
776 *
777  CALL smatadd( int_one, nrhs, one, work( 1 ), int_one, one,
778  $ b( part_offset+odd_size+1 ), lldb )
779 *
780  END IF
781 *
782 *
783 * The last processor does not participate in the solution of the
784 * reduced system, having sent its contribution already.
785  IF( mycol.EQ.npcol-1 ) THEN
786  GO TO 30
787  END IF
788 *
789 *
790 * *************************************
791 * Modification Loop
792 *
793 * The distance for sending and receiving for each level starts
794 * at 1 for the first level.
795  level_dist = 1
796 *
797 * Do until this proc is needed to modify other procs' equations
798 *
799  10 CONTINUE
800  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
801  $ GO TO 20
802 *
803 * Receive and add contribution to righthand sides from left
804 *
805  IF( mycol-level_dist.GE.0 ) THEN
806 *
807  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
808  $ mycol-level_dist )
809 *
810  CALL smatadd( int_one, nrhs, one, work( 1 ), int_one, one,
811  $ b( part_offset+odd_size+1 ), lldb )
812 *
813  END IF
814 *
815 * Receive and add contribution to righthand sides from right
816 *
817  IF( mycol+level_dist.LT.npcol-1 ) THEN
818 *
819  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
820  $ mycol+level_dist )
821 *
822  CALL smatadd( int_one, nrhs, one, work( 1 ), int_one, one,
823  $ b( part_offset+odd_size+1 ), lldb )
824 *
825  END IF
826 *
827  level_dist = level_dist*2
828 *
829  GO TO 10
830  20 CONTINUE
831 * [End of GOTO Loop]
832 *
833 *
834 *
835 * *********************************
836 * Calculate and use this proc's blocks to modify other procs
837 *
838 * Solve with diagonal block
839 *
840  CALL strtrs( 'L', 'N', 'U', int_one, nrhs, af( odd_size+2 ),
841  $ int_one, b( part_offset+odd_size+1 ), lldb, info )
842 *
843  IF( info.NE.0 ) THEN
844  GO TO 90
845  END IF
846 *
847 *
848 *
849 * *********
850  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
851 *
852 * Calculate contribution from this block to next diagonal block
853 *
854  CALL sgemm( 'T', 'N', int_one, nrhs, int_one, -one,
855  $ af( ( odd_size )*1+1 ), int_one,
856  $ b( part_offset+odd_size+1 ), lldb, zero,
857  $ work( 1 ), int_one )
858 *
859 * Send contribution to diagonal block's owning processor.
860 *
861  CALL sgesd2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
862  $ mycol+level_dist )
863 *
864  END IF
865 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
866 *
867 * ************
868  IF( ( mycol / level_dist.GT.0 ) .AND.
869  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) ) THEN
870 *
871 *
872 * Use offdiagonal block to calculate modification to diag block
873 * of processor to the left
874 *
875  CALL sgemm( 'N', 'N', int_one, nrhs, int_one, -one,
876  $ af( odd_size*1+2+1 ), int_one,
877  $ b( part_offset+odd_size+1 ), lldb, zero,
878  $ work( 1 ), int_one )
879 *
880 * Send contribution to diagonal block's owning processor.
881 *
882  CALL sgesd2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
883  $ mycol-level_dist )
884 *
885  END IF
886 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
887 *
888  30 CONTINUE
889 *
890  ELSE
891 *
892 ******************** BACKSOLVE *************************************
893 *
894 ********************************************************************
895 * .. Begin reduced system phase of algorithm ..
896 ********************************************************************
897 *
898 *
899 *
900 * The last processor does not participate in the solution of the
901 * reduced system and just waits to receive its solution.
902  IF( mycol.EQ.npcol-1 ) THEN
903  GO TO 80
904  END IF
905 *
906 * Determine number of steps in tree loop
907 *
908  level_dist = 1
909  40 CONTINUE
910  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
911  $ GO TO 50
912 *
913  level_dist = level_dist*2
914 *
915  GO TO 40
916  50 CONTINUE
917 *
918 *
919  IF( ( mycol / level_dist.GT.0 ) .AND.
920  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) ) THEN
921 *
922 * Receive solution from processor to left
923 *
924  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
925  $ mycol-level_dist )
926 *
927 *
928 * Use offdiagonal block to calculate modification to RHS stored
929 * on this processor
930 *
931  CALL sgemm( 'T', 'N', int_one, nrhs, int_one, -one,
932  $ af( odd_size*1+2+1 ), int_one, work( 1 ),
933  $ int_one, one, b( part_offset+odd_size+1 ),
934  $ lldb )
935  END IF
936 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
937 *
938 *
939  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
940 *
941 * Receive solution from processor to right
942 *
943  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
944  $ mycol+level_dist )
945 *
946 * Calculate contribution from this block to next diagonal block
947 *
948  CALL sgemm( 'N', 'N', int_one, nrhs, int_one, -one,
949  $ af( ( odd_size )*1+1 ), int_one, work( 1 ),
950  $ int_one, one, b( part_offset+odd_size+1 ),
951  $ lldb )
952 *
953  END IF
954 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
955 *
956 *
957 * Solve with diagonal block
958 *
959  CALL strtrs( 'L', 'T', 'U', int_one, nrhs, af( odd_size+2 ),
960  $ int_one, b( part_offset+odd_size+1 ), lldb, info )
961 *
962  IF( info.NE.0 ) THEN
963  GO TO 90
964  END IF
965 *
966 *
967 *
968 ***Modification Loop *******
969 *
970  60 CONTINUE
971  IF( level_dist.EQ.1 )
972  $ GO TO 70
973 *
974  level_dist = level_dist / 2
975 *
976 * Send solution to the right
977 *
978  IF( mycol+level_dist.LT.npcol-1 ) THEN
979 *
980  CALL sgesd2d( ictxt, int_one, nrhs,
981  $ b( part_offset+odd_size+1 ), lldb, 0,
982  $ mycol+level_dist )
983 *
984  END IF
985 *
986 * Send solution to left
987 *
988  IF( mycol-level_dist.GE.0 ) THEN
989 *
990  CALL sgesd2d( ictxt, int_one, nrhs,
991  $ b( part_offset+odd_size+1 ), lldb, 0,
992  $ mycol-level_dist )
993 *
994  END IF
995 *
996  GO TO 60
997  70 CONTINUE
998 * [End of GOTO Loop]
999 *
1000  80 CONTINUE
1001 * [Processor npcol - 1 jumped to here to await next stage]
1002 *
1003 *******************************
1004 * Reduced system has been solved, communicate solutions to nearest
1005 * neighbors in preparation for local computation phase.
1006 *
1007 *
1008 * Send elements of solution to next proc
1009 *
1010  IF( mycol.LT.npcol-1 ) THEN
1011 *
1012  CALL sgesd2d( ictxt, int_one, nrhs,
1013  $ b( part_offset+odd_size+1 ), lldb, 0,
1014  $ mycol+1 )
1015 *
1016  END IF
1017 *
1018 * Receive the solution element(s) from the previous processor
1019 *
1020  IF( mycol.GT.0 ) THEN
1021 *
1022  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one, 0,
1023  $ mycol-1 )
1024 *
1025  END IF
1026 *
1027 *
1028 *
1029 **********************************************
1030 * Local computation phase
1031 **********************************************
1032 *
1033  IF( mycol.NE.0 ) THEN
1034 * Use the "spike" fillin to calculate contribution from previous
1035 * processor's solution.
1036 *
1037  CALL sgemm( 'N', 'N', odd_size, nrhs, 1, -one, af( 1 ),
1038  $ odd_size, work( 1+int_one-1 ), int_one, one,
1039  $ b( part_offset+1 ), lldb )
1040 *
1041  END IF
1042 *
1043 *
1044  IF( mycol.LT.np-1 ) THEN
1045 * Use factorization of odd-even connection block to modify
1046 * locally stored portion of right hand side(s)
1047 *
1048  CALL saxpy( nrhs, -( e( part_offset+odd_size ) ),
1049  $ b( part_offset+odd_size+1 ), lldb,
1050  $ b( part_offset+odd_size ), lldb )
1051 *
1052  END IF
1053 *
1054 * Use main partition in each processor to solve locally
1055 *
1056  CALL spttrsv( 'T', odd_size, nrhs, d( part_offset+1 ),
1057  $ e( part_offset+1 ), b( part_offset+1 ), lldb,
1058  $ info )
1059 *
1060 *
1061  END IF
1062 * End of "IF( LSAME( UPLO, 'L' ) )"...
1063  90 CONTINUE
1064 *
1065 *
1066 * Free BLACS space used to hold standard-form grid.
1067 *
1068  IF( ictxt_save.NE.ictxt_new ) THEN
1069  CALL blacs_gridexit( ictxt_new )
1070  END IF
1071 *
1072  100 CONTINUE
1073 *
1074 * Restore saved input parameters
1075 *
1076  ictxt = ictxt_save
1077  np = np_save
1078 *
1079 * Output minimum worksize
1080 *
1081  work( 1 ) = work_size_min
1082 *
1083 *
1084  RETURN
1085 *
1086 * End of PSPTTRSV
1087 *
1088  END
globchk
subroutine globchk(ICTXT, N, X, LDX, IWORK, INFO)
Definition: pchkxmat.f:403
reshape
void reshape(int *context_in, int *major_in, int *context_out, int *major_out, int *first_proc, int *nprow_new, int *npcol_new)
Definition: reshape.c:77
desc_convert
subroutine desc_convert(DESC_IN, DESC_OUT, INFO)
Definition: desc_convert.f:2
smatadd
subroutine smatadd(M, N, ALPHA, A, LDA, BETA, C, LDC)
Definition: smatadd.f:2
pspttrsv
subroutine pspttrsv(UPLO, N, NRHS, D, E, JA, DESCA, B, IB, DESCB, AF, LAF, WORK, LWORK, INFO)
Definition: pspttrsv.f:3
spttrsv
subroutine spttrsv(TRANS, N, NRHS, D, E, B, LDB, INFO)
Definition: spttrsv.f:3
pxerbla
subroutine pxerbla(ICTXT, SRNAME, INFO)
Definition: pxerbla.f:2