ScaLAPACK 2.1
ScaLAPACK: Scalable Linear Algebra PACKage
pddbmv1.f
1  SUBROUTINE pdgbdcmv( LDBW, BWL, BWU, TRANS, N, A, JA, DESCA, NRHS,
2  $ B, IB, DESCB, X, WORK, LWORK, INFO )
3 *
4 *
5 *
6 * -- ScaLAPACK routine (version 1.7) --
7 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
8 * and University of California, Berkeley.
9 * November 15, 1997
10 *
11 * .. Scalar Arguments ..
12  CHARACTER TRANS
13  INTEGER BWL, BWU, IB, INFO, JA, LDBW, LWORK, N, NRHS
14 * ..
15 * .. Array Arguments ..
16  INTEGER DESCA( * ), DESCB( * )
17  DOUBLE PRECISION A( * ), B( * ), WORK( * ), X( * )
18 * ..
19 *
20 *
21 * Purpose
22 * =======
23 *
24 *  PDGBDCMV computes a distributed banded matrix-vector product:
*  X = A(1:N, JA:JA+N-1) * B if TRANS = 'N', or
*  X = A(1:N, JA:JA+N-1)^T * B if TRANS = 'T' or 'C',
*  where A is a banded matrix with BWL subdiagonals and BWU
*  superdiagonals stored in the distributed packed banded format
*  described in the Notes below.
*
25 * =====================================================================
26 *
27 * Arguments
28 * =========
29 *
30 *
31 * TRANS (global input) CHARACTER
32 * = 'N': Multiply by A(1:N, JA:JA+N-1);
33 * = 'T' or 'C': Multiply by A(1:N, JA:JA+N-1)^T;
34 *
35 * N (global input) INTEGER
36 * The number of rows and columns to be operated on, i.e. the
37 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
38 *
39 * BWL (global input) INTEGER
40 * Number of subdiagonals. 0 <= BWL <= N-1
41 *
42 * BWU (global input) INTEGER
43 * Number of superdiagonals. 0 <= BWU <= N-1
44 *
45 * A (local input) DOUBLE PRECISION pointer into
46 * local memory to an array with first dimension
47 * LLD_A >= (bwl+bwu+1) (stored in DESCA).
48 * On entry, this array contains the local pieces of the
49 * distributed band matrix. This local portion is stored in
50 * the packed banded format used in LAPACK. Please see the
51 * Notes below and the ScaLAPACK manual for more detail on
52 * the format of distributed matrices.
53 *
54 * JA (global input) INTEGER
55 * The index in the global array A that points to the start of
56 * the matrix to be operated on (which may be either all of A
57 * or a submatrix of A).
58 *
59 * DESCA (global and local input) INTEGER array of dimension DLEN.
60 * if 1D type (DTYPE_A=501), DLEN >= 7;
61 * if 2D type (DTYPE_A=1), DLEN >= 9 .
62 * The array descriptor for the distributed matrix A.
63 * Contains information of mapping of A to memory. Please
64 * see NOTES below for full description and options.
65 *
66 * AF (local output) DOUBLE PRECISION array, dimension LAF.
67 * Auxiliary Fillin Space.
68 * Fillin is created during the factorization routine
69 * PDDBTRF and this is stored in AF. If a linear system
70 * is to be solved using PDDBTRS after the factorization
71 * routine, AF *must not be altered* after the factorization.
72 *
73 * LAF (local input) INTEGER
74 * Size of user-input Auxiliary Fillin space AF. Must be >=
75 * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu)
76 * If LAF is not large enough, an error code will be returned
77 * and the minimum acceptable size will be returned in AF( 1 )
78 *
79 * WORK (local workspace/local output)
80 * DOUBLE PRECISION temporary workspace. This space may
81 * be overwritten in between calls to routines. WORK must be
82 * the size given in LWORK.
83 * On exit, WORK( 1 ) contains the minimal LWORK.
84 *
85 * LWORK (local input or global input) INTEGER
86 * Size of user-input workspace WORK.
87 * If LWORK is too small, the minimal acceptable size will be
88 * returned in WORK(1) and an error code is returned.
* LWORK >= (max(bwl,bwu)+2)*max(bwl,bwu).
89 *
90 * INFO (global output) INTEGER
91 * = 0: successful exit
92 * < 0: If the i-th argument is an array and the j-th entry had
93 * an illegal value, then INFO = -(i*100+j); if the i-th
94 * argument is a scalar and had an illegal value, then
95 * INFO = -i. For example, an error in DESCA( 4 ) is
* reported as INFO = -(8*100+4) = -804.
96 *
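*  As an illustration, a call that multiplies a single distributed
*  vector could look like the sketch below. It assumes LDBW is passed
*  as BWL+BWU+1 and that DESCA and DESCB are the type-501 and type-502
*  descriptors described in the Notes below:
*
*     LWORK = ( MAX( BWL, BWU )+2 )*MAX( BWL, BWU )
*     CALL PDGBDCMV( BWL+BWU+1, BWL, BWU, 'N', N, A, 1, DESCA,
*    $               1, B, 1, DESCB, X, WORK, LWORK, INFO )
*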
97 * =====================================================================
98 *
99 *
100 * Restrictions
101 * ============
102 *
103 * The following are restrictions on the input parameters. Some of these
104 * are temporary and will be removed in future releases, while others
105 * may reflect fundamental technical limitations.
106 *
107 * Non-cyclic restriction: VERY IMPORTANT!
108 * P*NB>= mod(JA-1,NB)+N.
109 * The mapping for matrices must be blocked, reflecting the nature
110 * of the divide and conquer algorithm as a task-parallel algorithm.
111 * This formula in words is: no processor may have more than one
112 * chunk of the matrix.
113 *
114 * Blocksize cannot be too small:
115 * If the matrix spans more than one processor, the following
116 * restriction on NB, the size of each block on each processor,
117 * must hold:
118 * NB >= 2*MAX(BWL,BWU)
119 * The bulk of parallel computation is done on the matrix of size
120 * O(NB) on each processor. If this is too small, divide and conquer
121 * is a poor choice of algorithm.
122 *
123 * Submatrix reference:
124 * JA = IB
125 * Alignment restriction that prevents unnecessary communication.
126 *
127 *
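*  As a sketch only, these restrictions can be verified on the calling
*  side before invoking the routine (assuming a 1-by-P grid with
*  P = NPCOL obtained from BLACS_GRIDINFO, and block size
*  NB = DESCA( 4 )):
*
*     LOGICAL OK
*     OK = ( NPCOL*NB .GE. MOD( JA-1, NB )+N )
*     OK = OK .AND. ( ( JA+N-1 .LE. NB ) .OR.
*    $                ( NB .GE. 2*MAX( BWL, BWU ) ) )
*     OK = OK .AND. ( JA .EQ. IB )
*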
128 * =====================================================================
129 *
130 *
131 * Notes
132 * =====
133 *
134 * If the factorization routine and the solve routine are to be called
135 * separately (to solve various sets of righthand sides using the same
136 * coefficient matrix), the auxiliary space AF *must not be altered*
137 * between calls to the factorization routine and the solve routine.
138 *
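*  For instance, a factor/solve sequence that reuses AF might look
*  like the sketch below (argument lists follow the PDDBTRF and
*  PDDBTRS documentation; LWORK and LAF are assumed to satisfy the
*  bounds given there):
*
*     CALL PDDBTRF( N, BWL, BWU, A, JA, DESCA, AF, LAF, WORK,
*    $              LWORK, INFO )
*     CALL PDDBTRS( 'N', N, BWL, BWU, NRHS, A, JA, DESCA, B, IB,
*    $              DESCB, AF, LAF, WORK, LWORK, INFO )
*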
139 * The best algorithm for solving banded and tridiagonal linear systems
140 * depends on a variety of parameters, especially the bandwidth.
141 * Currently, only algorithms designed for the case N/P >> bw are
142 * implemented. These go by many names, including Divide and Conquer,
143 * Partitioning, domain decomposition-type, etc.
144 *
145 * Algorithm description: Divide and Conquer
146 *
147 * The Divide and Conquer algorithm assumes the matrix is narrowly
148 * banded compared with the number of equations. In this situation,
149 * it is best to distribute the input matrix A one-dimensionally,
150 * with columns atomic and rows divided amongst the processes.
151 * The basic algorithm divides the banded matrix up into
152 * P pieces with one stored on each processor,
153 * and then proceeds in 2 phases for the factorization or 3 for the
154 * solution of a linear system.
155 * 1) Local Phase:
156 * The individual pieces are factored independently and in
157 * parallel. These factors are applied to the matrix creating
158 * fillin, which is stored in a non-inspectable way in auxiliary
159 * space AF. Mathematically, this is equivalent to reordering
160 * the matrix A as P A P^T and then factoring the principal
161 * leading submatrix of size equal to the sum of the sizes of
162 * the matrices factored on each processor. The factors of
163 * these submatrices overwrite the corresponding parts of A
164 * in memory.
165 * 2) Reduced System Phase:
166 * A small (max(bwl,bwu)* (P-1)) system is formed representing
167 * interaction of the larger blocks, and is stored (as are its
168 * factors) in the space AF. A parallel Block Cyclic Reduction
169 * algorithm is used. For a linear system, a parallel front solve
170 * followed by an analogous backsolve, both using the structure
171 * of the factored matrix, are performed.
172 * 3) Backsubstitution Phase:
173 * For a linear system, a local backsubstitution is performed on
174 * each processor in parallel.
175 *
176 *
177 * Descriptors
178 * ===========
179 *
180 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
181 *
182 * Note: banded codes can use either the old two dimensional
183 * or new one-dimensional descriptors, though the processor grid in
184 * both cases *must be one-dimensional*. We describe both types below.
185 *
186 * Each global data object is described by an associated description
187 * vector. This vector stores the information required to establish
188 * the mapping between an object element and its corresponding process
189 * and memory location.
190 *
191 * Let A be a generic term for any 2D block cyclically distributed array.
192 * Such a global array has an associated description vector DESCA.
193 * In the following comments, the character _ should be read as
194 * "of the global array".
195 *
196 * NOTATION STORED IN EXPLANATION
197 * --------------- -------------- --------------------------------------
198 * DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
199 * DTYPE_A = 1.
200 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
201 * the BLACS process grid A is distribu-
202 * ted over. The context itself is glo-
203 * bal, but the handle (the integer
204 * value) may vary.
205 * M_A (global) DESCA( M_ ) The number of rows in the global
206 * array A.
207 * N_A (global) DESCA( N_ ) The number of columns in the global
208 * array A.
209 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
210 * the rows of the array.
211 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
212 * the columns of the array.
213 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
214 * row of the array A is distributed.
215 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
216 * first column of the array A is
217 * distributed.
218 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
219 * array. LLD_A >= MAX(1,LOCr(M_A)).
220 *
221 * Let K be the number of rows or columns of a distributed matrix,
222 * and assume that its process grid has dimension p x q.
223 * LOCr( K ) denotes the number of elements of K that a process
224 * would receive if K were distributed over the p processes of its
225 * process column.
226 * Similarly, LOCc( K ) denotes the number of elements of K that a
227 * process would receive if K were distributed over the q processes of
228 * its process row.
229 * The values of LOCr() and LOCc() may be determined via a call to the
230 * ScaLAPACK tool function, NUMROC:
231 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
232 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
233 * An upper bound for these quantities may be computed by:
234 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
235 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
236 *
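*  For example (an illustrative sketch; MYCOL and NPCOL are assumed to
*  come from BLACS_GRIDINFO, with NB = DESCA( 4 ) and CSRC = DESCA( 5 )
*  of the 1D band descriptor), the number of matrix columns held
*  locally by this process is
*
*     NLOC = NUMROC( N, NB, MYCOL, CSRC, NPCOL )
*
*  where NLOC is a local INTEGER introduced only for this example.
*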
237 *
238 * One-dimensional descriptors:
239 *
240 * One-dimensional descriptors are a new addition to ScaLAPACK since
241 * version 1.0. They simplify and shorten the descriptor for 1D
242 * arrays.
243 *
244 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
245 * object, we allow 1D arrays to be distributed either over the
246 * first dimension of the array (as if the grid were P-by-1) or the
247 * 2nd dimension (as if the grid were 1-by-P). This choice is
248 * indicated by the descriptor type (501 or 502)
249 * as described below.
250 *
251 * IMPORTANT NOTE: the actual BLACS grid represented by the
252 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
253 * irrespective of which one-dimensional descriptor type
254 * (501 or 502) is input.
255 * This routine will interpret the grid properly either way.
256 * ScaLAPACK routines *do not support intercontext operations* so that
257 * the grid passed to a single ScaLAPACK routine *must be the same*
258 * for all array descriptors passed to that routine.
259 *
260 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
261 * may also be used, since a one-dimensional array is a special case
262 * of a two-dimensional array with one dimension of size unity.
263 * The two-dimensional array used in this case *must* be of the
264 * proper orientation:
265 * If the appropriate one-dimensional descriptor is DTYPE_A=501
266 * (1 by P type), then the two dimensional descriptor must
267 * have a CTXT value that refers to a 1 by P BLACS grid;
268 * If the appropriate one-dimensional descriptor is DTYPE_A=502
269 * (P by 1 type), then the two dimensional descriptor must
270 * have a CTXT value that refers to a P by 1 BLACS grid.
271 *
272 *
273 * Summary of allowed descriptors, types, and BLACS grids:
274 * DTYPE 501 502 1 1
275 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
276 * -----------------------------------------------------
277 * A OK NO OK NO
278 * B NO OK NO OK
279 *
280 * Let A be a generic term for any 1D block cyclically distributed array.
281 * Such a global array has an associated description vector DESCA.
282 * In the following comments, the character _ should be read as
283 * "of the global array".
284 *
285 * NOTATION STORED IN EXPLANATION
286 * --------------- ---------- ------------------------------------------
287 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
288 * TYPE_A = 501: 1-by-P grid.
289 * TYPE_A = 502: P-by-1 grid.
290 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
291 * the BLACS process grid A is distribu-
292 * ted over. The context itself is glo-
293 * bal, but the handle (the integer
294 * value) may vary.
295 * N_A (global) DESCA( 3 ) The size of the array dimension being
296 * distributed.
297 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
298 * the distributed dimension of the array.
299 * SRC_A (global) DESCA( 5 ) The process row or column over which the
300 * first row or column of the array
301 * is distributed.
302 * LLD_A (local) DESCA( 6 ) The leading dimension of the local array
303 * storing the local blocks of the distri-
304 * buted array A. Minimum value of LLD_A
305 * depends on TYPE_A.
306 * TYPE_A = 501: LLD_A >=
307 * max( size of undistributed dimension, 1 ).
308 * TYPE_A = 502: LLD_A >= max( NB_A, 1 ).
309 * Reserved DESCA( 7 ) Reserved for future use.
310 *
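*  As an illustration, a type-501 descriptor for the band matrix A can
*  be filled in by hand as below (ICTXT is assumed to be a BLACS
*  context handle for a 1-by-P grid, NB the chosen column block size,
*  and the leading dimension is set to the minimum BWL+BWU+1 given in
*  the description of A above):
*
*     DESCA( 1 ) = 501
*     DESCA( 2 ) = ICTXT
*     DESCA( 3 ) = N
*     DESCA( 4 ) = NB
*     DESCA( 5 ) = 0
*     DESCA( 6 ) = BWL+BWU+1
*     DESCA( 7 ) = 0
*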
311 *
312 *
313 * =====================================================================
314 *
315 * Code Developer: Andrew J. Cleary, University of Tennessee.
316 * Current address: Lawrence Livermore National Labs.
317 * This version released: August, 2001.
318 *
319 * =====================================================================
320 *
321 * ..
322 * .. Parameters ..
323  DOUBLE PRECISION ONE, ZERO
324  parameter( one = 1.0d+0 )
325  parameter( zero = 0.0d+0 )
326  INTEGER INT_ONE
327  parameter( int_one = 1 )
328  INTEGER DESCMULT, BIGNUM
329  parameter(descmult = 100, bignum = descmult * descmult)
330  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
331  $ lld_, mb_, m_, nb_, n_, rsrc_
332  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
333  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
334  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
335 * ..
336 * .. Local Scalars ..
337  INTEGER CSRC, DL_N_M, DL_N_N, DL_P_M, DL_P_N, DU_N_M,
338  $ du_n_n, du_p_m, du_p_n, first_proc, i, ictxt,
339  $ ictxt_new, ictxt_save, idum2, idum3, j, ja_new,
340  $ llda, lldb, max_bw, mycol, myrow, my_num_cols,
341  $ nb, np, npcol, nprow, np_save, odd_size, ofst,
342  $ part_offset, part_size, store_m_b, store_n_a
343  INTEGER NUMROC_SIZE
344 * ..
345 * .. Local Arrays ..
346  INTEGER PARAM_CHECK( 17, 3 )
347 * ..
348 * .. External Subroutines ..
349  EXTERNAL blacs_gridinfo, pxerbla, reshape
350 * ..
351 * .. External Functions ..
352  LOGICAL LSAME
353  INTEGER NUMROC
354  EXTERNAL lsame, numroc
355 * ..
356 * .. Intrinsic Functions ..
357  INTRINSIC ichar, min, mod
358 * ..
359 * .. Executable Statements ..
360 *
361 * Test the input parameters
362 *
363  info = 0
364 *
365  ictxt = desca( ctxt_ )
366  csrc = desca( csrc_ )
367  nb = desca( nb_ )
368  llda = desca( lld_ )
369  store_n_a = desca( n_ )
370  lldb = descb( lld_ )
371  store_m_b = descb( m_ )
372 *
373 *
374 * Size of separator blocks is maximum of bandwidths
375 *
376  max_bw = max(bwl,bwu)
377 *
378  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
379  np = nprow * npcol
380 *
381 *
382 *
383  IF( lsame( trans, 'N' ) ) THEN
384  idum2 = ichar( 'N' )
385  ELSE IF ( lsame( trans, 'T' ) ) THEN
386  idum2 = ichar( 'T' )
387  ELSE IF ( lsame( trans, 'C' ) ) THEN
388  idum2 = ichar( 'T' )
389  ELSE
390  info = -1
391  END IF
392 *
393  IF( lwork .LT. -1) THEN
394  info = -15
395  ELSE IF ( lwork .EQ. -1 ) THEN
396  idum3 = -1
397  ELSE
398  idum3 = 1
399  ENDIF
400 *
401  IF( n .LT. 0 ) THEN
402  info = -2
403  ENDIF
404 *
405  IF( n+ja-1 .GT. store_n_a ) THEN
406  info = -( 8*100 + 6 )
407  ENDIF
408 *
409  IF(( bwl .GT. n-1 ) .OR.
410  $ ( bwl .LT. 0 ) ) THEN
411  info = -3
412  ENDIF
413 *
414  IF(( bwu .GT. n-1 ) .OR.
415  $ ( bwu .LT. 0 ) ) THEN
416  info = -4
417  ENDIF
418 *
419  IF( llda .LT. (bwl+bwu+1) ) THEN
420  info = -( 8*100 + 6 )
421  ENDIF
422 *
423  IF( nb .LE. 0 ) THEN
424  info = -( 8*100 + 4 )
425  ENDIF
426 *
427 * Argument checking that is specific to Divide & Conquer routine
428 *
429  IF( nprow .NE. 1 ) THEN
430  info = -( 8*100+2 )
431  ENDIF
432 *
433  IF( n .GT. np*nb-mod( ja-1, nb )) THEN
434  info = -( 2 )
435  CALL pxerbla( ictxt,
436  $ 'PDDBDCMV, D&C alg.: only 1 block per proc',
437  $ -info )
438  RETURN
439  ENDIF
440 *
441  IF((ja+n-1.GT.nb) .AND. ( nb.LT.2*max(bwl,bwu) )) THEN
442  info = -( 8*100+4 )
443  CALL pxerbla( ictxt,
444  $ 'PDDBDCMV, D&C alg.: NB too small',
445  $ -info )
446  RETURN
447  ENDIF
448 *
449 *
450 * Pack params and positions into arrays for global consistency check
451 *
452  param_check( 17, 1 ) = descb(5)
453  param_check( 16, 1 ) = descb(4)
454  param_check( 15, 1 ) = descb(3)
455  param_check( 14, 1 ) = descb(2)
456  param_check( 13, 1 ) = descb(1)
457  param_check( 12, 1 ) = ib
458  param_check( 11, 1 ) = desca(5)
459  param_check( 10, 1 ) = desca(4)
460  param_check( 9, 1 ) = desca(3)
461  param_check( 8, 1 ) = desca(1)
462  param_check( 7, 1 ) = ja
463  param_check( 6, 1 ) = nrhs
464  param_check( 5, 1 ) = bwu
465  param_check( 4, 1 ) = bwl
466  param_check( 3, 1 ) = n
467  param_check( 2, 1 ) = idum3
468  param_check( 1, 1 ) = idum2
469 *
470  param_check( 17, 2 ) = 1105
471  param_check( 16, 2 ) = 1104
472  param_check( 15, 2 ) = 1103
473  param_check( 14, 2 ) = 1102
474  param_check( 13, 2 ) = 1101
475  param_check( 12, 2 ) = 10
476  param_check( 11, 2 ) = 805
477  param_check( 10, 2 ) = 804
478  param_check( 9, 2 ) = 803
479  param_check( 8, 2 ) = 801
480  param_check( 7, 2 ) = 7
481  param_check( 6, 2 ) = 5
482  param_check( 5, 2 ) = 4
483  param_check( 4, 2 ) = 3
484  param_check( 3, 2 ) = 2
485  param_check( 2, 2 ) = 15
486  param_check( 1, 2 ) = 1
487 *
488 * Want to find errors with MIN( ), so if no error, set it to a big
489 * number. If there already is an error, multiply by the
490 * descriptor multiplier.
491 *
492  IF( info.GE.0 ) THEN
493  info = bignum
494  ELSE IF( info.LT.-descmult ) THEN
495  info = -info
496  ELSE
497  info = -info * descmult
498  END IF
499 *
500 * Check consistency across processors
501 *
502  CALL globchk( ictxt, 17, param_check, 17,
503  $ param_check( 1, 3 ), info )
504 *
505 * Prepare output: set info = 0 if no error, and divide by DESCMULT
506 * if error is not in a descriptor entry.
507 *
508  IF( info.EQ.bignum ) THEN
509  info = 0
510  ELSE IF( mod( info, descmult ) .EQ. 0 ) THEN
511  info = -info / descmult
512  ELSE
513  info = -info
514  END IF
515 *
516  IF( info.LT.0 ) THEN
517  CALL pxerbla( ictxt, 'PDDBDCMV', -info )
518  RETURN
519  END IF
520 *
521 * Quick return if possible
522 *
523  IF( n.EQ.0 )
524  $ RETURN
525 *
526 *
527 * Adjust addressing into matrix space to properly get into
528 * the beginning part of the relevant data
529 *
530  part_offset = nb*( (ja-1)/(npcol*nb) )
531 *
532  IF ( (mycol-csrc) .LT. (ja-part_offset-1)/nb ) THEN
533  part_offset = part_offset + nb
534  ENDIF
535 *
536  IF ( mycol .LT. csrc ) THEN
537  part_offset = part_offset - nb
538  ENDIF
539 *
540 * Form a new BLACS grid (the "standard form" grid) with only procs
541 * holding part of the matrix, of size 1xNP where NP is adjusted,
542 * starting at csrc=0, with JA modified to reflect dropped procs.
543 *
544 * First processor to hold part of the matrix:
545 *
546  first_proc = mod( ( ja-1 )/nb+csrc, npcol )
547 *
548 * Calculate the new JA while dropping off unused processors.
549 *
550  ja_new = mod( ja-1, nb ) + 1
551 *
552 * Save and compute new value of NP
553 *
554  np_save = np
555  np = ( ja_new+n-2 )/nb + 1
556 *
557 * Call utility routine that forms "standard-form" grid
558 *
559  CALL reshape( ictxt, int_one, ictxt_new, int_one,
560  $ first_proc, int_one, np )
561 *
562 * Use new context from standard grid as context.
563 *
564  ictxt_save = ictxt
565  ictxt = ictxt_new
566 *
567 * Get information about new grid.
568 *
569  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
570 *
571 * Drop out processors that do not have part of the matrix.
572 *
573  IF( myrow .LT. 0 ) THEN
574  GOTO 1234
575  ENDIF
576 *
577 * ********************************
578 * Values reused throughout routine
579 *
580 * User-input value of partition size
581 *
582  part_size = nb
583 *
584 * Number of columns in each processor
585 *
586  my_num_cols = numroc( n, part_size, mycol, 0, npcol )
587 *
588 * Offset in columns to beginning of main partition in each proc
589 *
590  IF ( mycol .EQ. 0 ) THEN
591  part_offset = part_offset+mod( ja_new-1, part_size )
592  my_num_cols = my_num_cols - mod(ja_new-1, part_size )
593  ENDIF
594 *
595 * Offset in elements
596 *
597  ofst = part_offset*llda
598 *
599 * Size of main (or odd) partition in each processor
600 *
601  odd_size = my_num_cols
602  IF ( mycol .LT. np-1 ) THEN
603  odd_size = odd_size - max_bw
604  ENDIF
605 *
606 *
607 *
608 * Zero out solution to use to accumulate answer
609 *
610  numroc_size =
611  $ numroc( n, part_size, mycol, 0, npcol)
612 *
613  DO 2279 j=1,nrhs
614  DO 4502 i=1,numroc_size
615  x( (j-1)*lldb + i ) = zero
616  4502 CONTINUE
617  2279 CONTINUE
618 *
619  DO 5642 i=1, (max_bw+2)*max_bw
620  work( i ) = zero
621  5642 CONTINUE
622 *
623 * Begin main code
624 *
625 *
626 **************************************
627 *
628  IF ( lsame( trans, 'N' ) ) THEN
629 *
630 * Sizes of the extra triangles communicated between processors
631 *
632  IF( mycol .GT. 0 ) THEN
633 *
634  dl_p_m= min( bwl,
635  $ numroc( n, part_size, mycol, 0, npcol ) )
636  dl_p_n= min( bwl,
637  $ numroc( n, part_size, mycol-1, 0, npcol ) )
638 *
639  du_p_m= min( bwu,
640  $ numroc( n, part_size, mycol-1, 0, npcol ) )
641  du_p_n= min( bwu,
642  $ numroc( n, part_size, mycol, 0, npcol ) )
643  ENDIF
644 *
645  IF( mycol .LT. npcol-1 ) THEN
646 *
647  dl_n_m= min( bwl,
648  $ numroc( n, part_size, mycol+1, 0, npcol ) )
649  dl_n_n= min( bwl,
650  $ numroc( n, part_size, mycol, 0, npcol ) )
651 *
652  du_n_m= min( bwu,
653  $ numroc( n, part_size, mycol, 0, npcol ) )
654  du_n_n= min( bwu,
655  $ numroc( n, part_size, mycol+1, 0, npcol ) )
656  ENDIF
657 *
658 *
659 * Use main partition in each processor to multiply locally
660 *
661  CALL dgbmv( trans, numroc_size, numroc_size, bwl, bwu, one,
662  $ a( ofst+1 ), llda, b(part_offset+1), 1, zero,
663  $ x( part_offset+1 ), 1 )
664 *
665 *
666 *
667  IF ( mycol .LT. npcol-1 ) THEN
668 *
669 * Do the multiplication of the triangle in the lower half
670 *
671  CALL dcopy( dl_n_n,
672  $ b( numroc_size-dl_n_n+1 ),
673  $ 1, work( max_bw*max_bw+1+bwl-dl_n_n ), 1 )
674 *
675  CALL dtrmv( 'U', 'N', 'N', bwl,
676  $ a( llda*( numroc_size-bwl )+1+bwu+bwl ), llda-1,
677  $ work( max_bw*max_bw+1 ), 1)
678 *
679 * Zero out extraneous elements caused by TRMV if any
680 *
681  IF( dl_n_m .GT. dl_n_n ) THEN
682  DO 10 i = dl_n_m-dl_n_n, dl_n_m
683  work( max_bw*max_bw+i ) = 0
684  10 CONTINUE
685  ENDIF
686 *
687 * Send the result to the neighbor
688 *
689  CALL dgesd2d( ictxt, bwl, 1,
690  $ work( max_bw*max_bw+1 ), bwl, myrow, mycol+1 )
691 *
692  ENDIF
693 *
694  IF ( mycol .GT. 0 ) THEN
695 *
696  DO 20 i=1, max_bw*( max_bw+2 )
697  work( i ) = zero
698  20 CONTINUE
699 *
700 * Do the multiplication of the triangle in the upper half
701 *
702 * Copy vector to workspace
703 *
704  CALL dcopy( du_p_n, b( 1 ), 1,
705  $ work( max_bw*max_bw+1 ), 1)
706 *
707  CALL dtrmv(
708  $ 'L',
709  $ 'N',
710  $ 'N', bwu,
711  $ a( 1 ), llda-1,
712  $ work( max_bw*max_bw+1 ), 1 )
713 *
714 * Zero out extraneous results from TRMV if any
715 *
716  IF( du_p_n .GT. du_p_m ) THEN
717  DO 30 i=1, du_p_n-du_p_m
718  work( max_bw*max_bw+i ) = 0
719  30 CONTINUE
720  ENDIF
721 *
722 * Send result back
723 *
724  CALL dgesd2d( ictxt, bwu, 1, work(max_bw*max_bw+1 ),
725  $ bwu, myrow, mycol-1 )
726 *
727 * Receive vector result from neighboring processor
728 *
729  CALL dgerv2d( ictxt, bwl, 1, work( max_bw*max_bw+1 ),
730  $ bwl, myrow, mycol-1 )
731 *
732 * Do addition of received vector
733 *
734  CALL daxpy( bwl, one,
735  $ work( max_bw*max_bw+1 ), 1,
736  $ x( 1 ), 1 )
737 *
738  ENDIF
739 *
740 *
741 *
742  IF( mycol .LT. npcol-1 ) THEN
743 *
744 * Receive returned result
745 *
746  CALL dgerv2d( ictxt, bwu, 1, work( max_bw*max_bw+1 ),
747  $ bwu, myrow, mycol+1 )
748 *
749 * Do addition of received vector
750 *
751  CALL daxpy( bwu, one,
752  $ work( max_bw*max_bw+1 ), 1,
753  $ x( numroc_size-bwu+1 ), 1)
754 *
755  ENDIF
756 *
757 *
758  ENDIF
759 *
760 * End of LSAME if
761 *
762 **************************************
763 *
764  IF ( lsame( trans, 'T' ) ) THEN
765 *
766 * Sizes of the extra triangles communicated between processors
767 *
768  IF( mycol .GT. 0 ) THEN
769 *
770  dl_p_m= min( bwu,
771  $ numroc( n, part_size, mycol, 0, npcol ) )
772  dl_p_n= min( bwu,
773  $ numroc( n, part_size, mycol-1, 0, npcol ) )
774 *
775  du_p_m= min( bwl,
776  $ numroc( n, part_size, mycol-1, 0, npcol ) )
777  du_p_n= min( bwl,
778  $ numroc( n, part_size, mycol, 0, npcol ) )
779  ENDIF
780 *
781  IF( mycol .LT. npcol-1 ) THEN
782 *
783  dl_n_m= min( bwu,
784  $ numroc( n, part_size, mycol+1, 0, npcol ) )
785  dl_n_n= min( bwu,
786  $ numroc( n, part_size, mycol, 0, npcol ) )
787 *
788  du_n_m= min( bwl,
789  $ numroc( n, part_size, mycol, 0, npcol ) )
790  du_n_n= min( bwl,
791  $ numroc( n, part_size, mycol+1, 0, npcol ) )
792  ENDIF
793 *
794 *
795  IF( mycol .GT. 0 ) THEN
796 * ...must send triangle in lower half of matrix to left
797 *
798 * Transpose triangle in preparation for sending
799 *
800  CALL dlatcpy( 'L', bwu, bwu, a( ofst+1 ),
801  $ llda-1, work( 1 ), max_bw )
802 *
803 * Send the triangle to neighboring processor to left
804 *
805  CALL dtrsd2d(ictxt, 'U', 'N',
806  $ bwu, bwu,
807  $ work( 1 ),
808  $ max_bw, myrow, mycol-1 )
809 *
810  ENDIF
811 *
812  IF( mycol .LT. npcol-1 ) THEN
813 * ...must send triangle in upper half of matrix to right
814 *
815 * Transpose triangle in preparation for sending
816 *
817  CALL dlatcpy( 'U', bwl, bwl,
818  $ a( llda*( numroc_size-bwl )+1+bwu+bwl ),
819  $ llda-1, work( 1 ), max_bw )
820 *
821 * Send the triangle to neighboring processor to right
822 *
823  CALL dtrsd2d(ictxt, 'L', 'N',
824  $ bwl, bwl,
825  $ work( 1 ),
826  $ max_bw, myrow, mycol+1 )
827 *
828  ENDIF
829 *
830 * Use main partition in each processor to multiply locally
831 *
832  CALL dgbmv( trans, numroc_size, numroc_size, bwl, bwu, one,
833  $ a( ofst+1 ), llda, b(part_offset+1), 1, zero,
834  $ x( part_offset+1 ), 1 )
835 *
836 *
837 *
838  IF ( mycol .LT. npcol-1 ) THEN
839 *
840 * Do the multiplication of the triangle in the lower half
841 *
842  CALL dcopy( dl_n_n,
843  $ b( numroc_size-dl_n_n+1 ),
844  $ 1, work( max_bw*max_bw+1+bwu-dl_n_n ), 1 )
845 *
846 * Receive the triangle prior to multiplying by it.
847 *
848  CALL dtrrv2d(ictxt, 'U', 'N',
849  $ bwu, bwu,
850  $ work( 1 ), max_bw, myrow, mycol+1 )
851 *
852  CALL dtrmv( 'U', 'N', 'N', bwu,
853  $ work( 1 ), max_bw,
854  $ work( max_bw*max_bw+1 ), 1)
855 *
856 * Zero out extraneous elements caused by TRMV if any
857 *
858  IF( dl_n_m .GT. dl_n_n ) THEN
859  DO 40 i = dl_n_m-dl_n_n, dl_n_m
860  work( max_bw*max_bw+i ) = 0
861  40 CONTINUE
862  ENDIF
863 *
864 * Send the result to the neighbor
865 *
866  CALL dgesd2d( ictxt, bwu, 1,
867  $ work( max_bw*max_bw+1 ), bwu, myrow, mycol+1 )
868 *
869  ENDIF
870 *
871  IF ( mycol .GT. 0 ) THEN
872 *
873  DO 50 i=1, max_bw*( max_bw+2 )
874  work( i ) = zero
875  50 CONTINUE
876 *
877 * Do the multiplication of the triangle in the upper half
878 *
879 * Copy vector to workspace
880 *
881  CALL dcopy( du_p_n, b( 1 ), 1,
882  $ work( max_bw*max_bw+1 ), 1)
883 *
884 * Receive the triangle prior to multiplying by it.
885 *
886  CALL dtrrv2d(ictxt, 'L', 'N',
887  $ bwl, bwl,
888  $ work( 1 ), max_bw, myrow, mycol-1 )
889 *
890  CALL dtrmv(
891  $ 'L',
892  $ 'N',
893  $ 'N', bwl,
894  $ work( 1 ), max_bw,
895  $ work( max_bw*max_bw+1 ), 1 )
896 *
897 * Zero out extraneous results from TRMV if any
898 *
899  IF( du_p_n .GT. du_p_m ) THEN
900  DO 60 i=1, du_p_n-du_p_m
901  work( max_bw*max_bw+i ) = 0
902  60 CONTINUE
903  ENDIF
904 *
905 * Send result back
906 *
907  CALL dgesd2d( ictxt, bwl, 1, work(max_bw*max_bw+1 ),
908  $ bwl, myrow, mycol-1 )
909 *
910 * Receive vector result from neighboring processor
911 *
912  CALL dgerv2d( ictxt, bwu, 1, work( max_bw*max_bw+1 ),
913  $ bwu, myrow, mycol-1 )
914 *
915 * Do addition of received vector
916 *
917  CALL daxpy( bwu, one,
918  $ work( max_bw*max_bw+1 ), 1,
919  $ x( 1 ), 1 )
920 *
921  ENDIF
922 *
923 *
924 *
925  IF( mycol .LT. npcol-1 ) THEN
926 *
927 * Receive returned result
928 *
929  CALL dgerv2d( ictxt, bwl, 1, work( max_bw*max_bw+1 ),
930  $ bwl, myrow, mycol+1 )
931 *
932 * Do addition of received vector
933 *
934  CALL daxpy( bwl, one,
935  $ work( max_bw*max_bw+1 ), 1,
936  $ x( numroc_size-bwl+1 ), 1)
937 *
938  ENDIF
939 *
940 *
941  ENDIF
942 *
943 * End of LSAME if
944 *
945 *
946 * Free BLACS space used to hold standard-form grid.
947 *
948  IF( ictxt_save .NE. ictxt_new ) THEN
949  CALL blacs_gridexit( ictxt_new )
950  ENDIF
951 *
952  1234 CONTINUE
953 *
954 * Restore saved input parameters
955 *
956  ictxt = ictxt_save
957  np = np_save
958 *
959 *
960  RETURN
961 *
962 * End of PDGBDCMV
963 *
964  END