ScaLAPACK 2.1: Scalable Linear Algebra PACKage
pcgbmv1.f
1  SUBROUTINE pcgbdcmv( LDBW, BWL, BWU, TRANS, N, A, JA, DESCA, NRHS,
2  $ B, IB, DESCB, X, WORK, LWORK, INFO )
3 *
4 *
5 *
6 * -- ScaLAPACK routine (version 1.7) --
7 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
8 * and University of California, Berkeley.
9 * November 15, 1997
10 *
11 * .. Scalar Arguments ..
12  CHARACTER TRANS
13  INTEGER BWL, BWU, IB, INFO, JA, LDBW, LWORK, N, NRHS
14 * ..
15 * .. Array Arguments ..
16  INTEGER DESCA( * ), DESCB( * )
17  COMPLEX A( * ), B( * ), WORK( * ), X( * )
18 * ..
19 *
20 *
21 * Purpose
22 * =======
23 *  PCGBDCMV computes a distributed banded matrix-vector product,
24 *  X = A(1:N, JA:JA+N-1) * B  or  X = conjugate_transpose( A ) * B.
25 * =====================================================================
26 *
27 * Arguments
28 * =========
29 *
30 *
31 * TRANS (global input) CHARACTER
32 * = 'N': Multiply with A(1:N, JA:JA+N-1);
33 * = 'C': Multiply with conjugate_transpose( A(1:N, JA:JA+N-1) );
34 *
35 * N (global input) INTEGER
36 * The number of rows and columns to be operated on, i.e. the
37 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
38 *
39 * BWL (global input) INTEGER
40 * Number of subdiagonals. 0 <= BWL <= N-1
41 *
42 * BWU (global input) INTEGER
43 * Number of superdiagonals. 0 <= BWU <= N-1
44 *
45 * A (local input/local output) COMPLEX pointer into
46 * local memory to an array with first dimension
47 * LLD_A >=(bwl+bwu+1) (stored in DESCA).
48 * On entry, this array contains the local pieces of the
49 * distributed matrix, stored in the packed banded format
50 * used in LAPACK. Please see the Notes below and the
51 * ScaLAPACK manual for more detail on the format of
52 * distributed matrices.
53 *
54 * JA (global input) INTEGER
55 * The index in the global array A that points to the start of
56 * the matrix to be operated on (which may be either all of A
57 * or a submatrix of A).
58 *
59 * DESCA (global and local input) INTEGER array of dimension DLEN.
60 * if 1D type (DTYPE_A=501), DLEN >= 7;
61 * if 2D type (DTYPE_A=1), DLEN >= 9 .
62 * The array descriptor for the distributed matrix A.
63 * Contains information of mapping of A to memory. Please
64 * see NOTES below for full description and options.
65 *
66 * AF (local output) COMPLEX array, dimension LAF.
67 * Auxiliary Fillin Space.
68 * Fillin is created during the factorization routine
69 * PCDBTRF and this is stored in AF. If a linear system
70 * is to be solved using PCDBTRS after the factorization
71 * routine, AF *must not be altered* after the factorization.
72 *
73 * LAF (local input) INTEGER
74 * Size of user-input Auxiliary Fillin space AF. Must be >=
75 * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu)
76 * If LAF is not large enough, an error code will be returned
77 * and the minimum acceptable size will be returned in AF( 1 )
78 *
79 * WORK (local workspace/local output)
80 * COMPLEX temporary workspace. This space may
81 * be overwritten in between calls to routines. WORK must be
82 * the size given in LWORK.
83 * On exit, WORK( 1 ) contains the minimal LWORK.
84 *
85 * LWORK (local input or global input) INTEGER
86 * Size of user-input workspace WORK.
87 * If LWORK is too small, the minimal acceptable size will be
88 * returned in WORK(1) and an error code is returned. LWORK>=
89 *
90 * INFO (global output) INTEGER
91 * = 0: successful exit
92 * < 0: If the i-th argument is an array and the j-th entry had
93 * an illegal value, then INFO = -(i*100+j); if the i-th
94 * argument is a scalar and had an illegal value, then
95 * INFO = -i.
96 *
97 * =====================================================================
98 *
99 *
100 * Restrictions
101 * ============
102 *
103 * The following are restrictions on the input parameters. Some of these
104 * are temporary and will be removed in future releases, while others
105 * may reflect fundamental technical limitations.
106 *
107 * Non-cyclic restriction: VERY IMPORTANT!
108 * P*NB>= mod(JA-1,NB)+N.
109 * The mapping for matrices must be blocked, reflecting the nature
110 * of the divide and conquer algorithm as a task-parallel algorithm.
111 * This formula in words is: no processor may have more than one
112 * chunk of the matrix.
113 *
114 * Blocksize cannot be too small:
115 * If the matrix spans more than one processor, the following
116 * restriction on NB, the size of each block on each processor,
117 * must hold:
118 * NB >= 2*MAX(BWL,BWU)
119 * The bulk of parallel computation is done on the matrix of size
120 * O(NB) on each processor. If this is too small, divide and conquer
121 * is a poor choice of algorithm.
122 *
123 * Submatrix reference:
124 * JA = IB
125 * Alignment restriction that prevents unnecessary communication.
126 *
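*   As an illustration only (not part of this routine), a caller could
*   verify these restrictions before the call. The names OK, NPCOL, NB,
*   JA, IB, N, BWL and BWU below are assumed to hold the corresponding
*   values, with NPCOL the number of processes in the 1-by-P grid and
*   the matrix assumed to span more than one process:
*
*      LOGICAL OK
*      OK = ( NPCOL*NB .GE. MOD( JA-1, NB ) + N ) .AND.
*     $     ( NB .GE. 2*MAX( BWL, BWU ) ) .AND.
*     $     ( JA .EQ. IB )
*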
127 *
128 * =====================================================================
129 *
130 *
131 * Notes
132 * =====
133 *
134 * If the factorization routine and the solve routine are to be called
135 * separately (to solve various sets of righthand sides using the same
136 * coefficient matrix), the auxiliary space AF *must not be altered*
137 * between calls to the factorization routine and the solve routine.
138 *
139 * The best algorithm for solving banded and tridiagonal linear systems
140 * depends on a variety of parameters, especially the bandwidth.
141 * Currently, only algorithms designed for the case N/P >> bw are
142 * implemented. These go by many names, including Divide and Conquer,
143 * Partitioning, domain decomposition-type, etc.
144 *
145 * Algorithm description: Divide and Conquer
146 *
147 * The Divide and Conquer algorithm assumes the matrix is narrowly
148 * banded compared with the number of equations. In this situation,
149 * it is best to distribute the input matrix A one-dimensionally,
150 * with columns atomic and rows divided amongst the processes.
151 * The basic algorithm divides the banded matrix up into
152 * P pieces with one stored on each processor,
153 * and then proceeds in 2 phases for the factorization or 3 for the
154 * solution of a linear system.
155 * 1) Local Phase:
156 * The individual pieces are factored independently and in
157 * parallel. These factors are applied to the matrix creating
158 * fillin, which is stored in a non-inspectable way in auxiliary
159 * space AF. Mathematically, this is equivalent to reordering
160 * the matrix A as P A P^T and then factoring the principal
161 * leading submatrix of size equal to the sum of the sizes of
162 * the matrices factored on each processor. The factors of
163 * these submatrices overwrite the corresponding parts of A
164 * in memory.
165 * 2) Reduced System Phase:
166 * A small (max(bwl,bwu)* (P-1)) system is formed representing
167 * interaction of the larger blocks, and is stored (as are its
168 * factors) in the space AF. A parallel Block Cyclic Reduction
169 * algorithm is used. For a linear system, a parallel front solve
170 * followed by an analogous backsolve, both using the structure
171 * of the factored matrix, are performed.
172 * 3) Backsubstitution Phase:
173 * For a linear system, a local backsubstitution is performed on
174 * each processor in parallel.
175 *
176 *
177 * Descriptors
178 * ===========
179 *
180 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
181 *
182 * Note: banded codes can use either the old two dimensional
183 * or new one-dimensional descriptors, though the processor grid in
184 * both cases *must be one-dimensional*. We describe both types below.
185 *
186 * Each global data object is described by an associated description
187 * vector. This vector stores the information required to establish
188 * the mapping between an object element and its corresponding process
189 * and memory location.
190 *
191 * Let A be a generic term for any 2D block cyclically distributed array.
192 * Such a global array has an associated description vector DESCA.
193 * In the following comments, the character _ should be read as
194 * "of the global array".
195 *
196 * NOTATION STORED IN EXPLANATION
197 * --------------- -------------- --------------------------------------
198 * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case,
199 * DTYPE_A = 1.
200 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
201 * the BLACS process grid A is distribu-
202 * ted over. The context itself is glo-
203 * bal, but the handle (the integer
204 * value) may vary.
205 * M_A (global) DESCA( M_ ) The number of rows in the global
206 * array A.
207 * N_A (global) DESCA( N_ ) The number of columns in the global
208 * array A.
209 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
210 * the rows of the array.
211 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
212 * the columns of the array.
213 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
214 * row of the array A is distributed.
215 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
216 * first column of the array A is
217 * distributed.
218 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
219 * array. LLD_A >= MAX(1,LOCr(M_A)).
220 *
221 * Let K be the number of rows or columns of a distributed matrix,
222 * and assume that its process grid has dimension p x q.
223 * LOCr( K ) denotes the number of elements of K that a process
224 * would receive if K were distributed over the p processes of its
225 * process column.
226 * Similarly, LOCc( K ) denotes the number of elements of K that a
227 * process would receive if K were distributed over the q processes of
228 * its process row.
229 * The values of LOCr() and LOCc() may be determined via a call to the
230 * ScaLAPACK tool function, NUMROC:
231 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
232 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
233 * An upper bound for these quantities may be computed by:
234 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
235 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
236 *
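*   For example, the local part of the banded matrix A occupies at
*   least LLD_A * LOCc( N_A ) elements, which could be computed as
*   follows (sketch only; MYCOL and NPCOL as returned by
*   BLACS_GRIDINFO, NB_A and CSRC_A the descriptor entries above, and
*   LOCAL_SIZE a hypothetical variable):
*
*      LOCAL_SIZE = ( BWL+BWU+1 ) *
*     $             NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL )
*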
237 *
238 * One-dimensional descriptors:
239 *
240 * One-dimensional descriptors are a new addition to ScaLAPACK since
241 * version 1.0. They simplify and shorten the descriptor for 1D
242 * arrays.
243 *
244 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
245 * object, we allow 1D arrays to be distributed either over the
246 * first dimension of the array (as if the grid were P-by-1) or the
247 * 2nd dimension (as if the grid were 1-by-P). This choice is
248 * indicated by the descriptor type (501 or 502)
249 * as described below.
250 *
251 * IMPORTANT NOTE: the actual BLACS grid represented by the
252 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
253 * irrespective of which one-dimensional descriptor type
254 * (501 or 502) is input.
255 * This routine will interpret the grid properly either way.
256 * ScaLAPACK routines *do not support intercontext operations* so that
257 * the grid passed to a single ScaLAPACK routine *must be the same*
258 * for all array descriptors passed to that routine.
259 *
260 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
261 * may also be used, since a one-dimensional array is a special case
262 * of a two-dimensional array with one dimension of size unity.
263 * The two-dimensional array used in this case *must* be of the
264 * proper orientation:
265 * If the appropriate one-dimensional descriptor is DTYPEA=501
266 * (1 by P type), then the two dimensional descriptor must
267 * have a CTXT value that refers to a 1 by P BLACS grid;
268 * If the appropriate one-dimensional descriptor is DTYPEA=502
269 * (P by 1 type), then the two dimensional descriptor must
270 * have a CTXT value that refers to a P by 1 BLACS grid.
271 *
272 *
273 * Summary of allowed descriptors, types, and BLACS grids:
274 *    DTYPE          501          502          1           1
275 *    BLACS grid     1xP or Px1   1xP or Px1   1xP         Px1
276 *    -----------------------------------------------------
277 *    A              OK           NO           OK          NO
278 *    B              NO           OK           NO          OK
279 *
280 * Let A be a generic term for any 1D block cyclically distributed array.
281 * Such a global array has an associated description vector DESCA.
282 * In the following comments, the character _ should be read as
283 * "of the global array".
284 *
285 * NOTATION STORED IN EXPLANATION
286 * --------------- ---------- ------------------------------------------
287 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
288 * TYPE_A = 501: 1-by-P grid.
289 * TYPE_A = 502: P-by-1 grid.
290 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
291 * the BLACS process grid A is distribu-
292 * ted over. The context itself is glo-
293 * bal, but the handle (the integer
294 * value) may vary.
295 * N_A (global) DESCA( 3 ) The size of the array dimension being
296 * distributed.
297 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
298 * the distributed dimension of the array.
299 * SRC_A (global) DESCA( 5 ) The process row or column over which the
300 * first row or column of the array
301 * is distributed.
302 * LLD_A (local) DESCA( 6 ) The leading dimension of the local array
303 * storing the local blocks of the distri-
304 * buted array A. Minimum value of LLD_A
305 * depends on TYPE_A.
306 * TYPE_A = 501: LLD_A >= max( size of
307 * undistributed dimension, 1 ).
308 * TYPE_A = 502: LLD_A >= max( NB_A, 1 ).
309 * Reserved DESCA( 7 ) Reserved for future use.
310 *
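*   As an illustration only (not performed by this routine), a
*   type-501 descriptor for the banded matrix A could be filled in by
*   hand as follows, assuming ICTXT, N, NB, BWL and BWU hold the
*   corresponding values:
*
*      DESCA( 1 ) = 501
*      DESCA( 2 ) = ICTXT
*      DESCA( 3 ) = N
*      DESCA( 4 ) = NB
*      DESCA( 5 ) = 0
*      DESCA( 6 ) = BWL+BWU+1
*      DESCA( 7 ) = 0
*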
311 *
312 *
313 * =====================================================================
314 *
315 * Code Developer: Andrew J. Cleary, University of Tennessee.
316 * Current address: Lawrence Livermore National Labs.
317 * This version released: August, 2001.
318 *
319 * =====================================================================
320 *
321 * ..
322 * .. Parameters ..
323  REAL ONE, ZERO
324  parameter( one = 1.0e+0 )
325  parameter( zero = 0.0e+0 )
326  COMPLEX CONE, CZERO
327  parameter( cone = ( 1.0e+0, 0.0e+0 ) )
328  parameter( czero = ( 0.0e+0, 0.0e+0 ) )
329  INTEGER INT_ONE
330  parameter( int_one = 1 )
331  INTEGER DESCMULT, BIGNUM
332  parameter(descmult = 100, bignum = descmult * descmult)
333  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
334  $ lld_, mb_, m_, nb_, n_, rsrc_
335  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
336  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
337  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
338 * ..
339 * .. Local Scalars ..
340  INTEGER CSRC, DL_N_M, DL_N_N, DL_P_M, DL_P_N, DU_N_M,
341  $ du_n_n, du_p_m, du_p_n, first_proc, i, ictxt,
342  $ ictxt_new, ictxt_save, idum2, idum3, j, ja_new,
343  $ llda, lldb, max_bw, mycol, myrow, my_num_cols,
344  $ nb, np, npcol, nprow, np_save, odd_size, ofst,
345  $ part_offset, part_size, store_m_b, store_n_a
346  INTEGER NUMROC_SIZE
347 * ..
348 * .. Local Arrays ..
349  INTEGER PARAM_CHECK( 17, 3 )
350 * ..
351 * .. External Subroutines ..
352  EXTERNAL blacs_gridinfo, pxerbla, reshape
353 * ..
354 * .. External Functions ..
355  LOGICAL LSAME
356  INTEGER NUMROC
357  EXTERNAL lsame, numroc
358 * ..
359 * .. Intrinsic Functions ..
360  INTRINSIC ichar, min, mod
361 * ..
362 * .. Executable Statements ..
363 *
364 * Test the input parameters
365 *
366  info = 0
367 *
368  ictxt = desca( ctxt_ )
369  csrc = desca( csrc_ )
370  nb = desca( nb_ )
371  llda = desca( lld_ )
372  store_n_a = desca( n_ )
373  lldb = descb( lld_ )
374  store_m_b = descb( m_ )
375 *
376 *
377 * Size of separator blocks is maximum of bandwidths
378 *
379  max_bw = max(bwl,bwu)
380 *
381  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
382  np = nprow * npcol
383 *
384 *
385 *
386  IF( lsame( trans, 'N' ) ) THEN
387  idum2 = ichar( 'N' )
388  ELSE IF ( lsame( trans, 'C' ) ) THEN
389  idum2 = ichar( 'C' )
390  ELSE
391  info = -1
392  END IF
393 *
394  IF( lwork .LT. -1) THEN
395  info = -15
396  ELSE IF ( lwork .EQ. -1 ) THEN
397  idum3 = -1
398  ELSE
399  idum3 = 1
400  ENDIF
401 *
402  IF( n .LT. 0 ) THEN
403  info = -2
404  ENDIF
405 *
406  IF( n+ja-1 .GT. store_n_a ) THEN
407  info = -( 8*100 + 6 )
408  ENDIF
409 *
410  IF(( bwl .GT. n-1 ) .OR.
411  $ ( bwl .LT. 0 ) ) THEN
412  info = -3
413  ENDIF
414 *
415  IF(( bwu .GT. n-1 ) .OR.
416  $ ( bwu .LT. 0 ) ) THEN
417  info = -4
418  ENDIF
419 *
420  IF( llda .LT. (bwl+bwu+1) ) THEN
421  info = -( 8*100 + 6 )
422  ENDIF
423 *
424  IF( nb .LE. 0 ) THEN
425  info = -( 8*100 + 4 )
426  ENDIF
427 *
428 * Argument checking that is specific to Divide & Conquer routine
429 *
430  IF( nprow .NE. 1 ) THEN
431  info = -( 8*100+2 )
432  ENDIF
433 *
434  IF( n .GT. np*nb-mod( ja-1, nb )) THEN
435  info = -( 2 )
436  CALL pxerbla( ictxt,
437  $ 'PCGBDCMV, D&C alg.: only 1 block per proc',
438  $ -info )
439  RETURN
440  ENDIF
441 *
442  IF((ja+n-1.GT.nb) .AND. ( nb.LT.2*max(bwl,bwu) )) THEN
443  info = -( 8*100+4 )
444  CALL pxerbla( ictxt,
445  $ 'PCGBDCMV, D&C alg.: NB too small',
446  $ -info )
447  RETURN
448  ENDIF
449 *
450 *
451 * Pack params and positions into arrays for global consistency check
452 *
453  param_check( 17, 1 ) = descb(5)
454  param_check( 16, 1 ) = descb(4)
455  param_check( 15, 1 ) = descb(3)
456  param_check( 14, 1 ) = descb(2)
457  param_check( 13, 1 ) = descb(1)
458  param_check( 12, 1 ) = ib
459  param_check( 11, 1 ) = desca(5)
460  param_check( 10, 1 ) = desca(4)
461  param_check( 9, 1 ) = desca(3)
462  param_check( 8, 1 ) = desca(1)
463  param_check( 7, 1 ) = ja
464  param_check( 6, 1 ) = nrhs
465  param_check( 5, 1 ) = bwu
466  param_check( 4, 1 ) = bwl
467  param_check( 3, 1 ) = n
468  param_check( 2, 1 ) = idum3
469  param_check( 1, 1 ) = idum2
470 *
471  param_check( 17, 2 ) = 1105
472  param_check( 16, 2 ) = 1104
473  param_check( 15, 2 ) = 1103
474  param_check( 14, 2 ) = 1102
475  param_check( 13, 2 ) = 1101
476  param_check( 12, 2 ) = 10
477  param_check( 11, 2 ) = 805
478  param_check( 10, 2 ) = 804
479  param_check( 9, 2 ) = 803
480  param_check( 8, 2 ) = 801
481  param_check( 7, 2 ) = 7
482  param_check( 6, 2 ) = 5
483  param_check( 5, 2 ) = 4
484  param_check( 4, 2 ) = 3
485  param_check( 3, 2 ) = 2
486  param_check( 2, 2 ) = 15
487  param_check( 1, 2 ) = 1
488 *
489 * Want to find errors with MIN( ), so if no error, set it to a big
490 * number. If there already is an error, multiply by the
491 * descriptor multiplier.
492 *
493  IF( info.GE.0 ) THEN
494  info = bignum
495  ELSE IF( info.LT.-descmult ) THEN
496  info = -info
497  ELSE
498  info = -info * descmult
499  END IF
500 *
501 * Check consistency across processors
502 *
503  CALL globchk( ictxt, 17, param_check, 17,
504  $ param_check( 1, 3 ), info )
505 *
506 * Prepare output: set info = 0 if no error, and divide by DESCMULT
507 * if error is not in a descriptor entry.
508 *
509  IF( info.EQ.bignum ) THEN
510  info = 0
511  ELSE IF( mod( info, descmult ) .EQ. 0 ) THEN
512  info = -info / descmult
513  ELSE
514  info = -info
515  END IF
516 *
517  IF( info.LT.0 ) THEN
518  CALL pxerbla( ictxt, 'PCGBDCMV', -info )
519  RETURN
520  END IF
521 *
522 * Quick return if possible
523 *
524  IF( n.EQ.0 )
525  $ RETURN
526 *
527 *
528 * Adjust addressing into matrix space to properly get into
529 * the beginning part of the relevant data
530 *
531  part_offset = nb*( (ja-1)/(npcol*nb) )
532 *
533  IF ( (mycol-csrc) .LT. (ja-part_offset-1)/nb ) THEN
534  part_offset = part_offset + nb
535  ENDIF
536 *
537  IF ( mycol .LT. csrc ) THEN
538  part_offset = part_offset - nb
539  ENDIF
540 *
541 * Form a new BLACS grid (the "standard form" grid) with only procs
542 * holding part of the matrix, of size 1xNP where NP is adjusted,
543 * starting at csrc=0, with JA modified to reflect dropped procs.
544 *
545 * First processor to hold part of the matrix:
546 *
547  first_proc = mod( ( ja-1 )/nb+csrc, npcol )
548 *
549 * Calculate the new JA while dropping off unused processors.
550 *
551  ja_new = mod( ja-1, nb ) + 1
552 *
553 * Save and compute new value of NP
554 *
555  np_save = np
556  np = ( ja_new+n-2 )/nb + 1
557 *
558 * Call utility routine that forms "standard-form" grid
559 *
560  CALL reshape( ictxt, int_one, ictxt_new, int_one,
561  $ first_proc, int_one, np )
562 *
563 * Use new context from standard grid as context.
564 *
565  ictxt_save = ictxt
566  ictxt = ictxt_new
567 *
568 * Get information about new grid.
569 *
570  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
571 *
572 * Drop out processors that do not have part of the matrix.
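* (Descriptive note: on processes that are not members of the new
* context, BLACS_GRIDINFO returns negative coordinates, so the test
* below identifies and skips them.)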
573 *
574  IF( myrow .LT. 0 ) THEN
575  GOTO 1234
576  ENDIF
577 *
578 * ********************************
579 * Values reused throughout routine
580 *
581 * User-input value of partition size
582 *
583  part_size = nb
584 *
585 * Number of columns in each processor
586 *
587  my_num_cols = numroc( n, part_size, mycol, 0, npcol )
588 *
589 * Offset in columns to beginning of main partition in each proc
590 *
591  IF ( mycol .EQ. 0 ) THEN
592  part_offset = part_offset+mod( ja_new-1, part_size )
593  my_num_cols = my_num_cols - mod(ja_new-1, part_size )
594  ENDIF
595 *
596 * Offset in elements
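* (Descriptive note: each of the PART_OFFSET skipped columns occupies
* LLDA entries of the packed band array, hence the scaling below.)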
597 *
598  ofst = part_offset*llda
599 *
600 * Size of main (or odd) partition in each processor
601 *
602  odd_size = my_num_cols
603  IF ( mycol .LT. np-1 ) THEN
604  odd_size = odd_size - max_bw
605  ENDIF
606 *
607 *
608 *
609 * Zero out solution to use to accumulate answer
610 *
611  numroc_size =
612  $ numroc( n, part_size, mycol, 0, npcol)
613 *
614  DO 2279 j=1,nrhs
615  DO 4502 i=1,numroc_size
616  x( (j-1)*lldb + i ) = czero
617  4502 CONTINUE
618  2279 CONTINUE
619 *
620  DO 5642 i=1, (max_bw+2)*max_bw
621  work( i ) = czero
622  5642 CONTINUE
623 *
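* Descriptive note on the workspace layout used below:
* WORK( 1 : MAX_BW*MAX_BW ) serves as a buffer for the triangles
* exchanged between neighboring processes (transpose case), and
* WORK( MAX_BW*MAX_BW+1 : ) holds the short vector pieces of length
* BWL or BWU that are multiplied and communicated.
*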
624 * Begin main code
625 *
626 *
627 **************************************
628 *
629  IF ( lsame( trans, 'N' ) ) THEN
630 *
631 * Sizes of the extra triangles communicated between processors
632 *
633  IF( mycol .GT. 0 ) THEN
634 *
635  dl_p_m= min( bwl,
636  $ numroc( n, part_size, mycol, 0, npcol ) )
637  dl_p_n= min( bwl,
638  $ numroc( n, part_size, mycol-1, 0, npcol ) )
639 *
640  du_p_m= min( bwu,
641  $ numroc( n, part_size, mycol-1, 0, npcol ) )
642  du_p_n= min( bwu,
643  $ numroc( n, part_size, mycol, 0, npcol ) )
644  ENDIF
645 *
646  IF( mycol .LT. npcol-1 ) THEN
647 *
648  dl_n_m= min( bwl,
649  $ numroc( n, part_size, mycol+1, 0, npcol ) )
650  dl_n_n= min( bwl,
651  $ numroc( n, part_size, mycol, 0, npcol ) )
652 *
653  du_n_m= min( bwu,
654  $ numroc( n, part_size, mycol, 0, npcol ) )
655  du_n_n= min( bwu,
656  $ numroc( n, part_size, mycol+1, 0, npcol ) )
657  ENDIF
658 *
659 *
660 * Use main partition in each processor to multiply locally
661 *
662  CALL cgbmv( trans, numroc_size, numroc_size, bwl, bwu, cone,
663  $ a( ofst+1 ), llda, b(part_offset+1), 1, czero,
664  $ x( part_offset+1 ), 1 )
665 *
666 *
667 *
668  IF ( mycol .LT. npcol-1 ) THEN
669 *
670 * Do the multiplication of the triangle in the lower half
671 *
672  CALL ccopy( dl_n_n,
673  $ b( numroc_size-dl_n_n+1 ),
674  $ 1, work( max_bw*max_bw+1+bwl-dl_n_n ), 1 )
675 *
676  CALL ctrmv( 'U', 'N', 'N', bwl,
677  $ a( llda*( numroc_size-bwl )+1+bwu+bwl ), llda-1,
678  $ work( max_bw*max_bw+1 ), 1)
679 *
680 * Zero out extraneous elements caused by TRMV if any
681 *
682  IF( dl_n_m .GT. dl_n_n ) THEN
683  DO 10 i = dl_n_m-dl_n_n, dl_n_m
684  work( max_bw*max_bw+i ) = 0
685  10 CONTINUE
686  ENDIF
687 *
688 * Send the result to the neighbor
689 *
690  CALL cgesd2d( ictxt, bwl, 1,
691  $ work( max_bw*max_bw+1 ), bwl, myrow, mycol+1 )
692 *
693  ENDIF
694 *
695  IF ( mycol .GT. 0 ) THEN
696 *
697  DO 20 i=1, max_bw*( max_bw+2 )
698  work( i ) = czero
699  20 CONTINUE
700 *
701 * Do the multiplication of the triangle in the upper half
702 *
703 * Copy vector to workspace
704 *
705  CALL ccopy( du_p_n, b( 1 ), 1,
706  $ work( max_bw*max_bw+1 ), 1)
707 *
708  CALL ctrmv(
709  $ 'L',
710  $ 'N',
711  $ 'N', bwu,
712  $ a( 1 ), llda-1,
713  $ work( max_bw*max_bw+1 ), 1 )
714 *
715 * Zero out extraneous results from TRMV if any
716 *
717  IF( du_p_n .GT. du_p_m ) THEN
718  DO 30 i=1, du_p_n-du_p_m
719  work( max_bw*max_bw+i ) = 0
720  30 CONTINUE
721  ENDIF
722 *
723 * Send result back
724 *
725  CALL cgesd2d( ictxt, bwu, 1, work(max_bw*max_bw+1 ),
726  $ bwu, myrow, mycol-1 )
727 *
728 * Receive vector result from neighboring processor
729 *
730  CALL cgerv2d( ictxt, bwl, 1, work( max_bw*max_bw+1 ),
731  $ bwl, myrow, mycol-1 )
732 *
733 * Do addition of received vector
734 *
735  CALL caxpy( bwl, cone,
736  $ work( max_bw*max_bw+1 ), 1,
737  $ x( 1 ), 1 )
738 *
739  ENDIF
740 *
741 *
742 *
743  IF( mycol .LT. npcol-1 ) THEN
744 *
745 * Receive returned result
746 *
747  CALL cgerv2d( ictxt, bwu, 1, work( max_bw*max_bw+1 ),
748  $ bwu, myrow, mycol+1 )
749 *
750 * Do addition of received vector
751 *
752  CALL caxpy( bwu, cone,
753  $ work( max_bw*max_bw+1 ), 1,
754  $ x( numroc_size-bwu+1 ), 1)
755 *
756  ENDIF
757 *
758 *
759  ENDIF
760 *
761 * End of LSAME if
762 *
763 **************************************
764 *
765  IF ( lsame( trans, 'C' ) ) THEN
766 *
767 * Sizes of the extra triangles communicated between processors
768 *
769  IF( mycol .GT. 0 ) THEN
770 *
771  dl_p_m= min( bwu,
772  $ numroc( n, part_size, mycol, 0, npcol ) )
773  dl_p_n= min( bwu,
774  $ numroc( n, part_size, mycol-1, 0, npcol ) )
775 *
776  du_p_m= min( bwl,
777  $ numroc( n, part_size, mycol-1, 0, npcol ) )
778  du_p_n= min( bwl,
779  $ numroc( n, part_size, mycol, 0, npcol ) )
780  ENDIF
781 *
782  IF( mycol .LT. npcol-1 ) THEN
783 *
784  dl_n_m= min( bwu,
785  $ numroc( n, part_size, mycol+1, 0, npcol ) )
786  dl_n_n= min( bwu,
787  $ numroc( n, part_size, mycol, 0, npcol ) )
788 *
789  du_n_m= min( bwl,
790  $ numroc( n, part_size, mycol, 0, npcol ) )
791  du_n_n= min( bwl,
792  $ numroc( n, part_size, mycol+1, 0, npcol ) )
793  ENDIF
794 *
795 *
796  IF( mycol .GT. 0 ) THEN
797 * ...must send triangle in lower half of matrix to left
798 *
799 * Transpose triangle in preparation for sending
800 *
801  CALL clatcpy( 'L', bwu, bwu, a( ofst+1 ),
802  $ llda-1, work( 1 ), max_bw )
803 *
804 * Send the triangle to neighboring processor to left
805 *
806  CALL ctrsd2d(ictxt, 'U', 'N',
807  $ bwu, bwu,
808  $ work( 1 ),
809  $ max_bw, myrow, mycol-1 )
810 *
811  ENDIF
812 *
813  IF( mycol .LT. npcol-1 ) THEN
814 * ...must send triangle in upper half of matrix to right
815 *
816 * Transpose triangle in preparation for sending
817 *
818  CALL clatcpy( 'U', bwl, bwl,
819  $ a( llda*( numroc_size-bwl )+1+bwu+bwl ),
820  $ llda-1, work( 1 ), max_bw )
821 *
822 * Send the triangle to neighboring processor to right
823 *
824  CALL ctrsd2d(ictxt, 'L', 'N',
825  $ bwl, bwl,
826  $ work( 1 ),
827  $ max_bw, myrow, mycol+1 )
828 *
829  ENDIF
830 *
831 * Use main partition in each processor to multiply locally
832 *
833  CALL cgbmv( trans, numroc_size, numroc_size, bwl, bwu, cone,
834  $ a( ofst+1 ), llda, b(part_offset+1), 1, czero,
835  $ x( part_offset+1 ), 1 )
836 *
837 *
838 *
839  IF ( mycol .LT. npcol-1 ) THEN
840 *
841 * Do the multiplication of the triangle in the lower half
842 *
843  CALL ccopy( dl_n_n,
844  $ b( numroc_size-dl_n_n+1 ),
845  $ 1, work( max_bw*max_bw+1+bwu-dl_n_n ), 1 )
846 *
847 * Receive the triangle prior to multiplying by it.
848 *
849  CALL ctrrv2d(ictxt, 'U', 'N',
850  $ bwu, bwu,
851  $ work( 1 ), max_bw, myrow, mycol+1 )
852 *
853  CALL ctrmv( 'U', 'N', 'N', bwu,
854  $ work( 1 ), max_bw,
855  $ work( max_bw*max_bw+1 ), 1)
856 *
857 * Zero out extraneous elements caused by TRMV if any
858 *
859  IF( dl_n_m .GT. dl_n_n ) THEN
860  DO 40 i = dl_n_m-dl_n_n, dl_n_m
861  work( max_bw*max_bw+i ) = 0
862  40 CONTINUE
863  ENDIF
864 *
865 * Send the result to the neighbor
866 *
867  CALL cgesd2d( ictxt, bwu, 1,
868  $ work( max_bw*max_bw+1 ), bwu, myrow, mycol+1 )
869 *
870  ENDIF
871 *
872  IF ( mycol .GT. 0 ) THEN
873 *
874  DO 50 i=1, max_bw*( max_bw+2 )
875  work( i ) = czero
876  50 CONTINUE
877 *
878 * Do the multiplication of the triangle in the upper half
879 *
880 * Copy vector to workspace
881 *
882  CALL ccopy( du_p_n, b( 1 ), 1,
883  $ work( max_bw*max_bw+1 ), 1)
884 *
885 * Receive the triangle prior to multiplying by it.
886 *
887  CALL ctrrv2d(ictxt, 'L', 'N',
888  $ bwl, bwl,
889  $ work( 1 ), max_bw, myrow, mycol-1 )
890 *
891  CALL ctrmv(
892  $ 'L',
893  $ 'N',
894  $ 'N', bwl,
895  $ work( 1 ), max_bw,
896  $ work( max_bw*max_bw+1 ), 1 )
897 *
898 * Zero out extraneous results from TRMV if any
899 *
900  IF( du_p_n .GT. du_p_m ) THEN
901  DO 60 i=1, du_p_n-du_p_m
902  work( max_bw*max_bw+i ) = 0
903  60 CONTINUE
904  ENDIF
905 *
906 * Send result back
907 *
908  CALL cgesd2d( ictxt, bwl, 1, work(max_bw*max_bw+1 ),
909  $ bwl, myrow, mycol-1 )
910 *
911 * Receive vector result from neighboring processor
912 *
913  CALL cgerv2d( ictxt, bwu, 1, work( max_bw*max_bw+1 ),
914  $ bwu, myrow, mycol-1 )
915 *
916 * Do addition of received vector
917 *
918  CALL caxpy( bwu, cone,
919  $ work( max_bw*max_bw+1 ), 1,
920  $ x( 1 ), 1 )
921 *
922  ENDIF
923 *
924 *
925 *
926  IF( mycol .LT. npcol-1 ) THEN
927 *
928 * Receive returned result
929 *
930  CALL cgerv2d( ictxt, bwl, 1, work( max_bw*max_bw+1 ),
931  $ bwl, myrow, mycol+1 )
932 *
933 * Do addition of received vector
934 *
935  CALL caxpy( bwl, cone,
936  $ work( max_bw*max_bw+1 ), 1,
937  $ x( numroc_size-bwl+1 ), 1)
938 *
939  ENDIF
940 *
941 *
942  ENDIF
943 *
944 * End of LSAME if
945 *
946 *
947 * Free BLACS space used to hold standard-form grid.
948 *
949  IF( ictxt_save .NE. ictxt_new ) THEN
950  CALL blacs_gridexit( ictxt_new )
951  ENDIF
952 *
953  1234 CONTINUE
954 *
955 * Restore saved input parameters
956 *
957  ictxt = ictxt_save
958  np = np_save
959 *
960 *
961  RETURN
962 *
963 * End of PCGBDCMV
964 *
965  END