SCALAPACK 2.2.2
pzdbtrf.f
1 SUBROUTINE pzdbtrf( N, BWL, BWU, A, JA, DESCA, AF, LAF, WORK,
2 $ LWORK, INFO )
3*
4* -- ScaLAPACK routine (version 2.0.2) --
5* Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6* May 1 2012
7*
8* .. Scalar Arguments ..
9 INTEGER BWL, BWU, INFO, JA, LAF, LWORK, N
10* ..
11* .. Array Arguments ..
12 INTEGER DESCA( * )
13 COMPLEX*16 A( * ), AF( * ), WORK( * )
14* ..
15*
16*
17* Purpose
18* =======
19*
20* PZDBTRF computes an LU factorization
21* of an N-by-N complex banded
22* diagonally dominant-like distributed matrix with lower
23* bandwidth BWL and upper bandwidth BWU: A(1:N, JA:JA+N-1).
24* Reordering is used to increase parallelism in the factorization.
25* This reordering results in factors that are DIFFERENT from those
26* produced by equivalent sequential codes. These factors cannot
27* be used directly by users; however, they can be used in
28* subsequent calls to PZDBTRS to solve linear systems.
29*
30* The factorization has the form
31*
32* P A(1:N, JA:JA+N-1) P^T = L U
33*
34* where U is a banded upper triangular matrix and L is banded
35* lower triangular, and P is a permutation matrix.
36*
37* =====================================================================
38*
39* Arguments
40* =========
41*
42*
43* N (global input) INTEGER
44* The number of rows and columns to be operated on, i.e. the
45* order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
46*
47* BWL (global input) INTEGER
48* Number of subdiagonals. 0 <= BWL <= N-1
49*
50* BWU (global input) INTEGER
51* Number of superdiagonals. 0 <= BWU <= N-1
52*
53* A (local input/local output) COMPLEX*16 pointer into
54* local memory to an array with first dimension
55* LLD_A >=(bwl+bwu+1) (stored in DESCA).
56* On entry, this array contains the local pieces of the
57* N-by-N unsymmetric banded distributed matrix
58* A(1:N, JA:JA+N-1) to be factored.
59* This local portion is stored in the packed banded format
60* used in LAPACK. Please see the Notes below and the
61* ScaLAPACK manual for more detail on the format of
62* distributed matrices.
63* On exit, this array contains details of
64* the factorization.
65* Note that permutations are performed on the matrix, so that
66* the factors returned are different from those returned
67* by LAPACK.
68*
69* JA (global input) INTEGER
70* The index in the global array A that points to the start of
71* the matrix to be operated on (which may be either all of A
72* or a submatrix of A).
73*
74* DESCA (global and local input) INTEGER array of dimension DLEN.
75* if 1D type (DTYPE_A=501), DLEN >= 7;
76* if 2D type (DTYPE_A=1), DLEN >= 9 .
77* The array descriptor for the distributed matrix A.
78* Contains information of mapping of A to memory. Please
79* see NOTES below for full description and options.
80*
81* AF (local output) COMPLEX*16 array, dimension LAF.
82* Auxiliary Fillin Space.
83* Fillin is created during the factorization routine
84* PZDBTRF and this is stored in AF. If a linear system
85* is to be solved using PZDBTRS after the factorization
86* routine, AF *must not be altered* after the factorization.
87*
88* LAF (local input) INTEGER
89* Size of user-input Auxiliary Fillin space AF. Must be >=
90* NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu)
91* If LAF is not large enough, an error code will be returned
92* and the minimum acceptable size will be returned in AF( 1 )
93*
94* WORK (local workspace/local output)
95* COMPLEX*16 temporary workspace. This space may
96* be overwritten in between calls to routines. WORK must be
97* the size given in LWORK.
98* On exit, WORK( 1 ) contains the minimal LWORK.
99*
100* LWORK (local input or global input) INTEGER
101* Size of user-input workspace WORK.
102* If LWORK is too small, the minimal acceptable size will be
103* returned in WORK(1) and an error code is returned. LWORK>=
104* max(bwl,bwu)*max(bwl,bwu)
105*
106* INFO (global output) INTEGER
107* = 0: successful exit
108* < 0: If the i-th argument is an array and the j-th entry had
109* an illegal value, then INFO = -(i*100+j), if the i-th
110* argument is a scalar and had an illegal value, then
111* INFO = -i.
112* > 0: If INFO = K <= NPROCS, the submatrix stored on processor
113* K and factored locally was not
114* diagonally dominant-like, and
115* the factorization was not completed.
116* If INFO = K > NPROCS, the submatrix stored on processor
117* K-NPROCS representing interactions with other
118* processors was not
119* stably factorable without interchanges,
120* and the factorization was not completed.
121*
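* Illustration (a usage sketch only, not an additional part of the
* interface above): with NB taken from DESCA, the workspace sizes
* can be computed and the factorization called as
*
*    LAF   = NB*(BWL+BWU) + 6*MAX(BWL,BWU)*MAX(BWL,BWU)
*    LWORK = MAX(BWL,BWU)*MAX(BWL,BWU)
*    CALL PZDBTRF( N, BWL, BWU, A, JA, DESCA, AF, LAF, WORK,
*   $              LWORK, INFO )
*
* A negative INFO follows the encoding above; for example, an
* illegal blocking factor in entry 4 of descriptor argument 6
* (DESCA) gives INFO = -(6*100+4) = -604. If INFO = 0, the same
* A, AF and DESCA may then be passed to PZDBTRS.
*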
122* =====================================================================
123*
124*
125* Restrictions
126* ============
127*
128* The following are restrictions on the input parameters. Some of these
129* are temporary and will be removed in future releases, while others
130* may reflect fundamental technical limitations.
131*
132* Non-cyclic restriction: VERY IMPORTANT!
133* P*NB>= mod(JA-1,NB)+N.
134* The mapping for matrices must be blocked, reflecting the nature
135* of the divide and conquer algorithm as a task-parallel algorithm.
136* This formula in words is: no processor may have more than one
137* chunk of the matrix.
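* For example, with P = 4 processes, NB = 64 and JA = 1, this
* requires N <= 4*64 - mod(0,64) = 256.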
138*
139* Blocksize cannot be too small:
140* If the matrix spans more than one processor, the following
141* restriction on NB, the size of each block on each processor,
142* must hold:
143* NB >= 2*MAX(BWL,BWU)
144* The bulk of parallel computation is done on the matrix of size
145* O(NB) on each processor. If this is too small, divide and conquer
146* is a poor choice of algorithm.
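* For example, if BWL = 5 and BWU = 3 and the matrix spans more
* than one processor, NB >= 2*MAX(5,3) = 10 is required.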
147*
148* Submatrix reference:
149* JA = IB
150* Alignment restriction that prevents unnecessary communication.
151*
152*
153* =====================================================================
154*
155*
156* Notes
157* =====
158*
159* If the factorization routine and the solve routine are to be called
160* separately (to solve various sets of righthand sides using the same
161* coefficient matrix), the auxiliary space AF *must not be altered*
162* between calls to the factorization routine and the solve routine.
163*
164* The best algorithm for solving banded and tridiagonal linear systems
165* depends on a variety of parameters, especially the bandwidth.
166* Currently, only algorithms designed for the case N/P >> bw are
167* implemented. These go by many names, including Divide and Conquer,
168* Partitioning, domain decomposition-type, etc.
169*
170* Algorithm description: Divide and Conquer
171*
172* The Divide and Conquer algorithm assumes the matrix is narrowly
173* banded compared with the number of equations. In this situation,
174* it is best to distribute the input matrix A one-dimensionally,
175* with columns atomic and rows divided amongst the processes.
176* The basic algorithm divides the banded matrix up into
177* P pieces with one stored on each processor,
178* and then proceeds in 2 phases for the factorization or 3 for the
179* solution of a linear system.
180* 1) Local Phase:
181* The individual pieces are factored independently and in
182* parallel. These factors are applied to the matrix creating
183* fillin, which is stored in a non-inspectable way in auxiliary
184* space AF. Mathematically, this is equivalent to reordering
185* the matrix A as P A P^T and then factoring the principal
186* leading submatrix of size equal to the sum of the sizes of
187* the matrices factored on each processor. The factors of
188* these submatrices overwrite the corresponding parts of A
189* in memory.
190* 2) Reduced System Phase:
191* A small (max(bwl,bwu)* (P-1)) system is formed representing
192* interaction of the larger blocks, and is stored (as are its
193* factors) in the space AF. A parallel Block Cyclic Reduction
194* algorithm is used. For a linear system, a parallel front solve
195* followed by an analogous backsolve, both using the structure
196* of the factored matrix, are performed.
197* 3) Backsubstitution Phase:
198* For a linear system, a local backsubstitution is performed on
199* each processor in parallel.
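* For example, on P = 4 processors with BWL = 2 and BWU = 3, the
* reduced system has MAX(BWL,BWU)*(P-1) = 3*3 = 9 equations,
* independent of N.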
200*
201*
202* Descriptors
203* ===========
204*
205* Descriptors now have *types* and differ from ScaLAPACK 1.0.
206*
207* Note: banded codes can use either the old two dimensional
208* or new one-dimensional descriptors, though the processor grid in
209* both cases *must be one-dimensional*. We describe both types below.
210*
211* Each global data object is described by an associated description
212* vector. This vector stores the information required to establish
213* the mapping between an object element and its corresponding process
214* and memory location.
215*
216* Let A be a generic term for any 2D block cyclically distributed array.
217* Such a global array has an associated description vector DESCA.
218* In the following comments, the character _ should be read as
219* "of the global array".
220*
221* NOTATION STORED IN EXPLANATION
222* --------------- -------------- --------------------------------------
223* DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case,
224* DTYPE_A = 1.
225* CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
226* the BLACS process grid A is distribu-
227* ted over. The context itself is glo-
228* bal, but the handle (the integer
229* value) may vary.
230* M_A (global) DESCA( M_ ) The number of rows in the global
231* array A.
232* N_A (global) DESCA( N_ ) The number of columns in the global
233* array A.
234* MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
235* the rows of the array.
236* NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
237* the columns of the array.
238* RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
239* row of the array A is distributed.
240* CSRC_A (global) DESCA( CSRC_ ) The process column over which the
241* first column of the array A is
242* distributed.
243* LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
244* array. LLD_A >= MAX(1,LOCr(M_A)).
245*
246* Let K be the number of rows or columns of a distributed matrix,
247* and assume that its process grid has dimension p x q.
248* LOCr( K ) denotes the number of elements of K that a process
249* would receive if K were distributed over the p processes of its
250* process column.
251* Similarly, LOCc( K ) denotes the number of elements of K that a
252* process would receive if K were distributed over the q processes of
253* its process row.
254* The values of LOCr() and LOCc() may be determined via a call to the
255* ScaLAPACK tool function, NUMROC:
256* LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
257* LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
258* An upper bound for these quantities may be computed by:
259* LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
260* LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
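* For example, with M = 9, MB_A = 2 and NPROW = 2,
* LOCr( 9 ) <= ceil( ceil(9/2)/2 )*2 = ceil( 5/2 )*2 = 6.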
261*
262*
263* One-dimensional descriptors:
264*
265* One-dimensional descriptors are a new addition to ScaLAPACK since
266* version 1.0. They simplify and shorten the descriptor for 1D
267* arrays.
268*
269* Since ScaLAPACK supports two-dimensional arrays as the fundamental
270* object, we allow 1D arrays to be distributed either over the
271* first dimension of the array (as if the grid were P-by-1) or the
272* 2nd dimension (as if the grid were 1-by-P). This choice is
273* indicated by the descriptor type (501 or 502)
274* as described below.
275*
276* IMPORTANT NOTE: the actual BLACS grid represented by the
277* CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
278* irrespective of which one-dimensional descriptor type
279* (501 or 502) is input.
280* This routine will interpret the grid properly either way.
281* ScaLAPACK routines *do not support intercontext operations* so that
282* the grid passed to a single ScaLAPACK routine *must be the same*
283* for all array descriptors passed to that routine.
284*
285* NOTE: In all cases where 1D descriptors are used, 2D descriptors
286* may also be used, since a one-dimensional array is a special case
287* of a two-dimensional array with one dimension of size unity.
288* The two-dimensional array used in this case *must* be of the
289* proper orientation:
290* If the appropriate one-dimensional descriptor is DTYPEA=501
291* (1 by P type), then the two dimensional descriptor must
292* have a CTXT value that refers to a 1 by P BLACS grid;
293* If the appropriate one-dimensional descriptor is DTYPEA=502
294* (P by 1 type), then the two dimensional descriptor must
295* have a CTXT value that refers to a P by 1 BLACS grid.
296*
297*
298* Summary of allowed descriptors, types, and BLACS grids:
299* DTYPE 501 502 1 1
300* BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
301* -----------------------------------------------------
302* A OK NO OK NO
303* B NO OK NO OK
304*
305* Let A be a generic term for any 1D block cyclically distributed array.
306* Such a global array has an associated description vector DESCA.
307* In the following comments, the character _ should be read as
308* "of the global array".
309*
310* NOTATION STORED IN EXPLANATION
311* --------------- ---------- ------------------------------------------
312* DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
313* TYPE_A = 501: 1-by-P grid.
314* TYPE_A = 502: P-by-1 grid.
315* CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
316* the BLACS process grid A is distribu-
317* ted over. The context itself is glo-
318* bal, but the handle (the integer
319* value) may vary.
320* N_A (global) DESCA( 3 ) The size of the array dimension being
321* distributed.
322* NB_A (global) DESCA( 4 ) The blocking factor used to distribute
323* the distributed dimension of the array.
324* SRC_A (global) DESCA( 5 ) The process row or column over which the
325* first row or column of the array
326* is distributed.
327* LLD_A (local) DESCA( 6 ) The leading dimension of the local array
328* storing the local blocks of the distri-
329* buted array A. Minimum value of LLD_A
330* depends on TYPE_A.
331* TYPE_A = 501: LLD_A >=
332* size of undistributed dimension, 1.
333* TYPE_A = 502: LLD_A >=NB_A, 1.
334* Reserved DESCA( 7 ) Reserved for future use.
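*
* Illustration (a sketch of one way to fill a type-501 descriptor
* for the banded matrix A of this routine, assuming JA = 1 and an
* existing one-dimensional BLACS context handle ICTXT):
*
*    DESCA( 1 ) = 501
*    DESCA( 2 ) = ICTXT
*    DESCA( 3 ) = N
*    DESCA( 4 ) = NB
*    DESCA( 5 ) = 0
*    DESCA( 6 ) = LLDA
*    DESCA( 7 ) = 0
*
* where LLDA >= BWL+BWU+1, the leading dimension required for the
* local storage of A.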
335*
336*
337*
338* =====================================================================
339*
340* Code Developer: Andrew J. Cleary, University of Tennessee.
341* Current address: Lawrence Livermore National Labs.
342* This version released: August, 2001.
343*
344* =====================================================================
345*
346* ..
347* .. Parameters ..
348 DOUBLE PRECISION ONE, ZERO
349 parameter( one = 1.0d+0 )
350 parameter( zero = 0.0d+0 )
351 COMPLEX*16 CONE, CZERO
352 parameter( cone = ( 1.0d+0, 0.0d+0 ) )
353 parameter( czero = ( 0.0d+0, 0.0d+0 ) )
354 INTEGER INT_ONE
355 parameter( int_one = 1 )
356 INTEGER DESCMULT, BIGNUM
357 parameter(descmult = 100, bignum = descmult * descmult)
358 INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
359 $ lld_, mb_, m_, nb_, n_, rsrc_
360 parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
361 $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
362 $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
363* ..
364* .. Local Scalars ..
365 INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT,
366 $ ictxt_new, ictxt_save, idum3, ja_new, laf_min,
367 $ level_dist, llda, max_bw, mbw2, mycol, myrow,
368 $ my_num_cols, nb, next_tri_size_m,
369 $ next_tri_size_n, np, npcol, nprow, np_save,
370 $ odd_size, ofst, part_offset, part_size,
371 $ prev_tri_size_m, prev_tri_size_n, return_code,
372 $ store_n_a, up_prev_tri_size_m,
373 $ up_prev_tri_size_n, work_size_min, work_u
374* ..
375* .. Local Arrays ..
376 INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 )
377* ..
378* .. External Subroutines ..
379 EXTERNAL blacs_get, blacs_gridexit, blacs_gridinfo,
381 $ zgemm, zgerv2d, zgesd2d, zlamov, zlatcpy,
382 $ zpbtrf, zpotrf, zsyrk, ztbtrs, ztrmm, ztrrv2d,
383 $ ztrsd2d, ztrsm, ztrtrs
384* ..
385* .. External Functions ..
386 LOGICAL LSAME
387 INTEGER NUMROC
388 EXTERNAL lsame, numroc
389* ..
390* .. Intrinsic Functions ..
391 INTRINSIC ichar, min, mod
392* ..
393* .. Executable Statements ..
394*
395* Test the input parameters
396*
397 info = 0
398*
399* Convert descriptor into standard form for easy access to
400* parameters, check that grid is of right shape.
401*
402 desca_1xp( 1 ) = 501
403*
404 CALL desc_convert( desca, desca_1xp, return_code )
405*
406 IF( return_code .NE. 0) THEN
407 info = -( 6*100 + 2 )
408 ENDIF
409*
410* Get values out of descriptor for use in code.
411*
412 ictxt = desca_1xp( 2 )
413 csrc = desca_1xp( 5 )
414 nb = desca_1xp( 4 )
415 llda = desca_1xp( 6 )
416 store_n_a = desca_1xp( 3 )
417*
418* Get grid parameters
419*
420*
421* Size of separator blocks is maximum of bandwidths
422*
423 max_bw = max(bwl,bwu)
424 mbw2 = max_bw * max_bw
425*
426 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
427 np = nprow * npcol
428*
429*
430*
431 IF( lwork .LT. -1) THEN
432 info = -10
433 ELSE IF ( lwork .EQ. -1 ) THEN
434 idum3 = -1
435 ELSE
436 idum3 = 1
437 ENDIF
438*
439 IF( n .LT. 0 ) THEN
440 info = -1
441 ENDIF
442*
443 IF( n+ja-1 .GT. store_n_a ) THEN
444 info = -( 6*100 + 6 )
445 ENDIF
446*
447 IF(( bwl .GT. n-1 ) .OR.
448 $ ( bwl .LT. 0 ) ) THEN
449 info = -2
450 ENDIF
451*
452 IF(( bwu .GT. n-1 ) .OR.
453 $ ( bwu .LT. 0 ) ) THEN
454 info = -3
455 ENDIF
456*
457 IF( llda .LT. (bwl+bwu+1) ) THEN
458 info = -( 6*100 + 6 )
459 ENDIF
460*
461 IF( nb .LE. 0 ) THEN
462 info = -( 6*100 + 4 )
463 ENDIF
464*
465* Argument checking that is specific to Divide & Conquer routine
466*
467 IF( nprow .NE. 1 ) THEN
468 info = -( 6*100+2 )
469 ENDIF
470*
471 IF( n .GT. np*nb-mod( ja-1, nb )) THEN
472 info = -( 1 )
473 CALL pxerbla( ictxt,
474 $ 'PZDBTRF, D&C alg.: only 1 block per proc',
475 $ -info )
476 RETURN
477 ENDIF
478*
479 IF((ja+n-1.GT.nb) .AND. ( nb.LT.2*max(bwl,bwu) )) THEN
480 info = -( 6*100+4 )
481 CALL pxerbla( ictxt,
482 $ 'PZDBTRF, D&C alg.: NB too small',
483 $ -info )
484 RETURN
485 ENDIF
486*
487*
488* Check auxiliary storage size
489*
490 laf_min = nb*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu)
491*
492 IF( laf .LT. laf_min ) THEN
493 info = -8
494* put minimum value of laf into AF( 1 )
495 af( 1 ) = laf_min
496 CALL pxerbla( ictxt,
497 $ 'PZDBTRF: auxiliary storage error ',
498 $ -info )
499 RETURN
500 ENDIF
501*
502* Check worksize
503*
504 work_size_min = max(bwl,bwu)*max(bwl,bwu)
505*
506 work( 1 ) = work_size_min
507*
508 IF( lwork .LT. work_size_min ) THEN
509 IF( lwork .NE. -1 ) THEN
510 info = -10
511 CALL pxerbla( ictxt,
512 $ 'PZDBTRF: worksize error ',
513 $ -info )
514 ENDIF
515 RETURN
516 ENDIF
517*
518* Pack params and positions into arrays for global consistency check
519*
520 param_check( 9, 1 ) = desca(5)
521 param_check( 8, 1 ) = desca(4)
522 param_check( 7, 1 ) = desca(3)
523 param_check( 6, 1 ) = desca(1)
524 param_check( 5, 1 ) = ja
525 param_check( 4, 1 ) = bwu
526 param_check( 3, 1 ) = bwl
527 param_check( 2, 1 ) = n
528 param_check( 1, 1 ) = idum3
529*
530 param_check( 9, 2 ) = 605
531 param_check( 8, 2 ) = 604
532 param_check( 7, 2 ) = 603
533 param_check( 6, 2 ) = 601
534 param_check( 5, 2 ) = 5
535 param_check( 4, 2 ) = 3
536 param_check( 3, 2 ) = 2
537 param_check( 2, 2 ) = 1
538 param_check( 1, 2 ) = 10
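*
* (Column 2 holds the argument position codes used by the global
* consistency check below: a value 6xy denotes entry y of
* descriptor argument 6 (DESCA), while the small values denote
* scalar argument positions, matching the INFO = -(i*100+j)
* convention; 10 corresponds to the LWORK query flag IDUM3.)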
539*
540* Want to find errors with MIN( ), so if no error, set it to a big
541* number. If there already is an error, multiply by the
542* descriptor multiplier.
543*
544 IF( info.GE.0 ) THEN
545 info = bignum
546 ELSE IF( info.LT.-descmult ) THEN
547 info = -info
548 ELSE
549 info = -info * descmult
550 END IF
551*
552* Check consistency across processors
553*
554 CALL globchk( ictxt, 9, param_check, 9,
555 $ param_check( 1, 3 ), info )
556*
557* Prepare output: set info = 0 if no error, and divide by DESCMULT
558* if error is not in a descriptor entry.
559*
560 IF( info.EQ.bignum ) THEN
561 info = 0
562 ELSE IF( mod( info, descmult ) .EQ. 0 ) THEN
563 info = -info / descmult
564 ELSE
565 info = -info
566 END IF
567*
568 IF( info.LT.0 ) THEN
569 CALL pxerbla( ictxt, 'PZDBTRF', -info )
570 RETURN
571 END IF
572*
573* Quick return if possible
574*
575 IF( n.EQ.0 )
576 $ RETURN
577*
578*
579* Adjust addressing into matrix space to properly get into
580* the beginning part of the relevant data
581*
582 part_offset = nb*( (ja-1)/(npcol*nb) )
583*
584 IF ( (mycol-csrc) .LT. (ja-part_offset-1)/nb ) THEN
585 part_offset = part_offset + nb
586 ENDIF
587*
588 IF ( mycol .LT. csrc ) THEN
589 part_offset = part_offset - nb
590 ENDIF
591*
592* Form a new BLACS grid (the "standard form" grid) with only procs
593* holding part of the matrix, of size 1xNP where NP is adjusted,
594* starting at csrc=0, with JA modified to reflect dropped procs.
595*
596* First processor to hold part of the matrix:
597*
598 first_proc = mod( ( ja-1 )/nb+csrc, npcol )
599*
600* Calculate the new JA while dropping off unused processors.
601*
602 ja_new = mod( ja-1, nb ) + 1
603*
604* Save and compute new value of NP
605*
606 np_save = np
607 np = ( ja_new+n-2 )/nb + 1
608*
609* Call utility routine that forms "standard-form" grid
610*
611 CALL reshape( ictxt, int_one, ictxt_new, int_one,
612 $ first_proc, int_one, np )
613*
614* Use new context from standard grid as context.
615*
616 ictxt_save = ictxt
617 ictxt = ictxt_new
618 desca_1xp( 2 ) = ictxt_new
619*
620* Get information about new grid.
621*
622 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
623*
624* Drop out processors that do not have part of the matrix.
625*
626 IF( myrow .LT. 0 ) THEN
627 GOTO 1234
628 ENDIF
629*
630* ********************************
631* Values reused throughout routine
632*
633* User-input value of partition size
634*
635 part_size = nb
636*
637* Number of columns in each processor
638*
639 my_num_cols = numroc( n, part_size, mycol, 0, npcol )
640*
641* Offset in columns to beginning of main partition in each proc
642*
643 IF ( mycol .EQ. 0 ) THEN
644 part_offset = part_offset+mod( ja_new-1, part_size )
645 my_num_cols = my_num_cols - mod(ja_new-1, part_size )
646 ENDIF
647*
648* Offset in elements
649*
650 ofst = part_offset*llda
651*
652* Size of main (or odd) partition in each processor
653*
654 odd_size = my_num_cols
655 IF ( mycol .LT. np-1 ) THEN
656 odd_size = odd_size - max_bw
657 ENDIF
658*
659* Offset to workspace for Upper triangular factor
660*
661 work_u = bwu*odd_size + 3*mbw2
662*
663*
664* Zero out space for fillin
665*
666 DO 10 i=1, laf_min
667 af( i ) = czero
668 10 CONTINUE
669*
670* Zero out space for work
671*
672 DO 20 i=1, work_size_min
673 work( i ) = czero
674 20 CONTINUE
675*
676* Begin main code
677*
678*
679********************************************************************
680* PHASE 1: Local computation phase.
681********************************************************************
682*
683*
684* Sizes of the extra triangles communicated between processors
685*
686 IF( mycol .GT. 0 ) THEN
687 prev_tri_size_m= min( bwl,
688 $ numroc( n, part_size, mycol, 0, npcol ) )
689 prev_tri_size_n=min( bwl,
690 $ numroc( n, part_size, mycol-1, 0, npcol ) )
691 ENDIF
692*
693 IF( mycol .GT. 0 ) THEN
694 up_prev_tri_size_m= min( bwu,
695 $ numroc( n, part_size, mycol, 0, npcol ) )
696 up_prev_tri_size_n=min( bwu,
697 $ numroc( n, part_size, mycol-1, 0, npcol ) )
698 ENDIF
699*
700 IF( mycol .LT. npcol-1 ) THEN
701 next_tri_size_m=min( bwl,
702 $ numroc( n, part_size, mycol+1, 0, npcol ) )
703 next_tri_size_n= min( bwl,
704 $ numroc( n, part_size, mycol, 0, npcol ) )
705 ENDIF
706*
707 IF ( mycol .LT. np-1 ) THEN
708* Transfer last triangle D_i of local matrix to next processor
709* which needs it to calculate fillin due to factorization of
710* its main (odd) block A_i.
711* Overlap the send with the factorization of A_i.
712*
713 CALL ztrsd2d( ictxt, 'U', 'N', next_tri_size_m,
714 $ next_tri_size_n,
715 $ a( ofst+(my_num_cols-bwl)*llda+(bwl+bwu+1) ),
716 $ llda-1, 0, mycol+1 )
717*
718 ENDIF
719*
720*
721* Factor main partition A_i = L_i {U_i} in each processor
722*
723 CALL zdbtrf( odd_size, odd_size, bwl, bwu, a( ofst + 1),
724 $ llda, info )
725*
726 IF( info.NE.0 ) THEN
727 info = mycol+1
728 GOTO 1500
729 ENDIF
730*
731*
732 IF ( mycol .LT. np-1 ) THEN
733*
734* Apply factorization to lower connection block BL_i
735* conjugate transpose the connection block in preparation.
736* Apply factorization to upper connection block BU_i
737* Move the connection block in preparation.
738*
739 CALL zlatcpy( 'U', bwl, bwl,
740 $ a(( ofst+(bwl+bwu+1)+(odd_size-bwl)*llda )),
741 $ llda-1, af( odd_size*bwu+2*mbw2+1+max_bw-bwl ),
742 $ max_bw )
743 CALL zlamov( 'L', bwu, bwu, a( ( ofst+1+odd_size*llda ) ),
744 $ llda-1,
745 $ af( work_u+odd_size*bwl+2*mbw2+1+max_bw-bwu ),
746 $ max_bw )
747*
748* Perform the triangular system solve {L_i}{{BU'}_i} = {B_i}
749*
750 CALL ztbtrs( 'L', 'N', 'U', bwu, bwl, bwu,
751 $ a( ofst+bwu+1+(odd_size-bwu )*llda ), llda,
752 $ af( work_u+odd_size*bwl+2*mbw2+1+max_bw-bwu ),
753 $ max_bw, info )
754*
755* Perform the triangular solve {U_i}^C{BL'}_i^C = {BL_i}^C
756*
757 CALL ztbtrs( 'U', 'C', 'N', bwl, bwu, bwl,
758 $ a( ofst+1+(odd_size-bwl)*llda ), llda,
759 $ af( odd_size*bwu+2*mbw2+1+max_bw-bwl ), max_bw,
760 $ info )
761*
762* conjugate transpose resulting block to its location
763* in main storage.
764*
765 CALL zlatcpy( 'L', bwl, bwl,
766 $ af( odd_size*bwu+2*mbw2+1+max_bw-bwl ), max_bw,
767 $ a(( ofst+(bwl+bwu+1)+(odd_size-bwl)*llda )),
768 $ llda-1 )
769*
770* Move the resulting block back to its location in main storage.
771*
772 CALL zlamov( 'L', bwu, bwu,
773 $ af( work_u+odd_size*bwl+2*mbw2+1+max_bw-bwu ),
774 $ max_bw, a(( ofst+1+odd_size*llda )), llda-1 )
775*
776*
777* Compute contribution to diagonal block(s) of reduced system.
778* {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i}
779*
780* The following method uses more flops than necessary but
781* does not necessitate the writing of a new BLAS routine.
782*
783*
784 CALL zgemm( 'C', 'N', max_bw, max_bw, max_bw, -cone ,
785 $ af( odd_size*bwu+2*mbw2+1), max_bw,
786 $ af( work_u+odd_size*bwl+2*mbw2+1), max_bw, cone,
787 $ a( ofst+odd_size*llda+1+bwu ), llda-1 )
788*
789 ENDIF
790* End of "if ( MYCOL .lt. NP-1 )..." block
791*
792*
793 1500 CONTINUE
794* If the processor could not locally factor, it jumps here.
795*
796 IF ( mycol .NE. 0 ) THEN
797* Discard temporary matrix stored beginning in
798* AF( (odd_size+2*bwl, bwu)*bwl, bwu+1 ) and use for
799* off_diagonal block of reduced system.
800*
801* Receive previously transmitted matrix section, which forms
802* the right-hand-side for the triangular solve that calculates
803* the "spike" fillin.
804*
805*
806 CALL ztrrv2d( ictxt, 'U', 'N', prev_tri_size_m,
807 $ prev_tri_size_n, af( work_u+1 ), odd_size, 0,
808 $ mycol-1 )
809*
810 IF (info.EQ.0) THEN
811*
812* Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ .
813*
814 CALL ztbtrs( 'L', 'N', 'U', odd_size, bwl, bwl,
815 $ a( ofst + bwu+1 ), llda, af( work_u+1 ),
816 $ odd_size, info )
817*
818*
819* Calculate the "spike" fillin, ${U_i}^C {{GL}_i}^C = {DU_i}^C$
820*
821*
822* Copy D block into AF storage for solve.
823*
824 CALL zlatcpy( 'L', up_prev_tri_size_n, up_prev_tri_size_m,
825 $ a( ofst+1 ), llda-1, af( 1 ), odd_size )
826*
827 CALL ztbtrs( 'U', 'C', 'N', odd_size, bwu, bwu,
828 $ a( ofst + 1 ), llda,
829 $ af( 1 ), odd_size, info )
830*
831*
832* Calculate the update block for previous proc, E_i = GL_i{GU_i}
833*
834*
835* Zero out space in case result is smaller than storage block
836*
837 DO 30 i=1, mbw2
838 af( odd_size*bwu+2*mbw2+i ) = czero
839 30 CONTINUE
840*
841 CALL zgemm( 'C', 'N', bwu, bwl, odd_size,
842 $ -cone, af( 1 ), odd_size,
843 $ af( work_u+1 ), odd_size, czero,
844 $ af( 1+max(0,bwl-bwu)+odd_size*bwu+
845 $ (2*max_bw+max(0,bwu-bwl))*max_bw),
846 $ max_bw )
847*
848*
849* Initiate send of E_i to previous processor to overlap
850* with next computation.
851*
852 CALL zgesd2d( ictxt, max_bw, max_bw,
853 $ af( odd_size*bwu+2*mbw2+1 ), max_bw, 0,
854 $ mycol-1 )
855*
856*
857 IF ( mycol .LT. np-1 ) THEN
858*
859* Calculate off-diagonal block(s) of reduced system.
860* Note: for ease of use in solution of reduced system, store
861* L's off-diagonal block in conjugate transpose form.
862*
863* Copy matrix HU_i (the last bwl rows of GU_i) to AFL storage
864* as per requirements of BLAS routine ZTRMM.
865* Since we have GU_i stored,
866* conjugate transpose HU_i to HU_i^C.
867*
868 CALL zlatcpy( 'N', bwl, bwl,
869 $ af( work_u+odd_size-bwl+1 ), odd_size,
870 $ af( (odd_size)*bwu+1+(max_bw-bwl) ),
871 $ max_bw )
872*
873 CALL ztrmm( 'R', 'U', 'C', 'N', bwl, bwl, -cone,
874 $ a( ( ofst+(bwl+bwu+1)+(odd_size-bwl)*llda ) ),
875 $ llda-1, af( (odd_size)*bwu+1+(max_bw-bwl) ),
876 $ max_bw )
877*
878*
879* Copy matrix HL_i (the last bwu rows of GL_i^C) to AFU store
880* as per requirements of BLAS routine ZTRMM.
881* Since we have GL_i^C stored,
882* conjugate transpose HL_i^C to HL_i.
883*
884 CALL zlatcpy( 'N', bwu, bwu,
885 $ af( odd_size-bwu+1 ), odd_size,
886 $ af( work_u+(odd_size)*bwl+1+max_bw-bwu ),
887 $ max_bw )
888*
889 CALL ztrmm( 'R', 'L', 'N', 'N', bwu, bwu, -cone,
890 $ a( ( ofst+1+odd_size*llda ) ), llda-1,
891 $ af( work_u+(odd_size)*bwl+1+max_bw-bwu ),
892 $ max_bw )
893*
894 ENDIF
895*
896 ENDIF
897* End of "if ( MYCOL .ne. 0 )..."
898*
899 ENDIF
900* End of "if (info.eq.0) then"
901*
902*
903* Check to make sure no processors have found errors
904*
905 CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info,
906 $ -1, 0, 0 )
907*
908 IF( mycol.EQ.0 ) THEN
909 CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
910 ELSE
911 CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
912 ENDIF
913*
914 IF ( info.NE.0 ) THEN
915 GOTO 1000
916 ENDIF
917* No errors found, continue
918*
919*
920********************************************************************
921* PHASE 2: Formation and factorization of Reduced System.
922********************************************************************
923*
924* Gather up local sections of reduced system
925*
926*
927* The last processor does not participate in the factorization of
928* the reduced system, having sent its E_i already.
929 IF( mycol .EQ. npcol-1 ) THEN
930 GOTO 14
931 ENDIF
932*
933* Initiate send of off-diag block(s) to overlap with next part.
934* Off-diagonal block needed on neighboring processor to start
935* algorithm.
936*
937 IF( (mod( mycol+1, 2 ) .EQ. 0) .AND. ( mycol .GT. 0 ) ) THEN
938*
939 CALL zgesd2d( ictxt, max_bw, max_bw,
940 $ af( odd_size*bwu+1 ),
941 $ max_bw, 0, mycol-1 )
942*
943 CALL zgesd2d( ictxt, max_bw, max_bw,
944 $ af( work_u+odd_size*bwl+1 ),
945 $ max_bw, 0, mycol-1 )
946*
947 ENDIF
948*
949* Copy last diagonal block into AF storage for subsequent
950* operations.
951*
952 CALL zlamov( 'N', max_bw, max_bw,
953 $ a( ofst+odd_size*llda+bwu+1 ),
954 $ llda-1, af( odd_size*bwu+mbw2+1 ),
955 $ max_bw )
956*
957* Receive cont. to diagonal block that is stored on this proc.
958*
959 IF( mycol.LT. npcol-1 ) THEN
960*
961 CALL zgerv2d( ictxt, max_bw, max_bw,
962 $ af( odd_size*bwu+2*mbw2+1 ),
963 $ max_bw, 0,
964 $ mycol+1 )
965*
966* Add contribution to diagonal block
967*
968 CALL zaxpy( mbw2, cone,
969 $ af( odd_size*bwu+2*mbw2+1 ),
970 $ 1, af( odd_size*bwu+mbw2+1 ), 1 )
971*
972 ENDIF
973*
974*
975* *************************************
976* Modification Loop
977*
978* The distance for sending and receiving for each level starts
979* at 1 for the first level.
980 level_dist = 1
981*
982* Do until this proc is needed to modify other procs' equations
983*
984 12 CONTINUE
985 IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 11
986*
987* Receive and add contribution to diagonal block from the left
988*
989 IF( mycol-level_dist .GE. 0 ) THEN
990 CALL zgerv2d( ictxt, max_bw, max_bw, work( 1 ),
991 $ max_bw, 0, mycol-level_dist )
992*
993 CALL zaxpy( mbw2, cone, work( 1 ), 1,
994 $ af( odd_size*bwu+mbw2+1 ), 1 )
995*
996 ENDIF
997*
998* Receive and add contribution to diagonal block from the right
999*
1000 IF( mycol+level_dist .LT. npcol-1 ) THEN
1001 CALL zgerv2d( ictxt, max_bw, max_bw, work( 1 ),
1002 $ max_bw, 0, mycol+level_dist )
1003*
1004 CALL zaxpy( mbw2, cone, work( 1 ), 1,
1005 $ af( odd_size*bwu+mbw2+1 ), 1 )
1006*
1007 ENDIF
1008*
1009 level_dist = level_dist*2
1010*
1011 GOTO 12
1012 11 CONTINUE
1013* [End of GOTO Loop]
1014*
1015*
1016* *********************************
1017* Calculate and use this proc's blocks to modify other procs'...
1018*
1019* Factor diagonal block
1020*
1021 CALL zdbtrf( max_bw, max_bw, min( max_bw-1, bwl ),
1022 $ min( max_bw-1, bwu ), af( odd_size*bwu+mbw2+1
1023 $ -( min( max_bw-1, bwu ))), max_bw+1, info )
1024*
1025 IF( info.NE.0 ) THEN
1026 info = npcol + mycol
1027 ENDIF
1028*
1029* ****************************************************************
1030* Receive offdiagonal block from processor to right.
1031* If this is the first group of processors, the receive comes
1032* from a different processor than otherwise.
1033*
1034 IF( level_dist .EQ. 1 )THEN
1035 comm_proc = mycol + 1
1036*
1037* Move block into place that it will be expected to be for
1038* calcs.
1039*
1040 CALL zlamov( 'N', max_bw, max_bw, af( odd_size*bwu+1 ),
1041 $ max_bw, af( work_u+odd_size*bwl+2*mbw2+1 ),
1042 $ max_bw )
1043*
1044 CALL zlamov( 'N', max_bw, max_bw, af( work_u+odd_size*bwl+1 ),
1045 $ max_bw, af( odd_size*bwu+2*mbw2+1 ), max_bw )
1046*
1047 ELSE
1048 comm_proc = mycol + level_dist/2
1049 ENDIF
1050*
1051 IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
1052*
1053 CALL zgerv2d( ictxt, max_bw, max_bw,
1054 $ af( odd_size*bwu+1 ),
1055 $ max_bw, 0, comm_proc )
1056*
1057 CALL zgerv2d( ictxt, max_bw, max_bw,
1058 $ af( work_u+odd_size*bwl+1 ),
1059 $ max_bw, 0, comm_proc )
1060*
1061 IF( info .EQ. 0 ) THEN
1062*
1063*
1064* Modify upper off_diagonal block with diagonal block
1065*
1066*
1067 CALL ztbtrs(
1068 $ 'L', 'N', 'U', bwu, min( bwl, bwu-1 ), bwu,
1069 $ af( odd_size*bwu+
1070 $ mbw2+1+(max_bw+1)*(max_bw-bwu)), max_bw+1,
1071 $ af( work_u+odd_size*bwl+1+max_bw-bwu), max_bw, info )
1072*
1073* Modify lower off_diagonal block with diagonal block
1074*
1075*
1076 CALL ztbtrs(
1077 $ 'U', 'C', 'N', bwl, min( bwu, bwl-1 ), bwl,
1078 $ af( odd_size*bwu+
1079 $ mbw2+1-min( bwu, bwl-1 )+(max_bw+1)*(max_bw-bwl)), max_bw+1,
1080 $ af( odd_size*bwu+1+max_bw-bwl), max_bw, info )
1081*
1082 ENDIF
1083* End of "if ( info.eq.0 ) then"
1084*
1085* Calculate contribution from this block to next diagonal block
1086*
1087 CALL zgemm( 'C', 'N', max_bw, max_bw, max_bw, -cone,
1088 $ af( (odd_size)*bwu+1 ), max_bw,
1089 $ af( work_u+(odd_size)*bwl+1 ), max_bw, czero,
1090 $ work( 1 ), max_bw )
1091*
1092* Send contribution to diagonal block's owning processor.
1093*
1094 CALL zgesd2d( ictxt, max_bw, max_bw, work( 1 ), max_bw,
1095 $ 0, mycol+level_dist )
1096*
1097 ENDIF
1098* End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1099*
1100*
1101* ****************************************************************
1102* Receive off_diagonal block from left and use to finish with this
1103* processor.
1104*
1105 IF( (mycol/level_dist .GT. 0 ).AND.
1106 $ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
1107*
1108 IF( level_dist .GT. 1)THEN
1109*
1110* Receive offdiagonal block(s) from proc level_dist/2 to the
1111* left
1112*
1113 CALL zgerv2d( ictxt, max_bw, max_bw,
1114 $ af( work_u+odd_size*bwl+2*mbw2+1 ),
1115 $ max_bw, 0, mycol-level_dist/2 )
1116*
1117* Receive offdiagonal block(s) from proc level_dist/2 to the
1118* left
1119*
1120 CALL zgerv2d( ictxt, max_bw, max_bw,
1121 $ af( odd_size*bwu+2*mbw2+1 ),
1122 $ max_bw, 0, mycol-level_dist/2 )
1123*
1124 ENDIF
1125*
1126*
1127 IF( info.EQ.0 ) THEN
1128*
1129* Use diagonal block(s) to modify this offdiagonal block
1130*
1131*
1132* Since ZTBTRS has no "left-right" option, we must transpose
1133*
1134 CALL zlatcpy( 'N', max_bw, max_bw, af( work_u+odd_size*bwl+
1135 $ 2*mbw2+1), max_bw, work( 1 ), max_bw )
1136*
1137 CALL ztbtrs(
1138 $ 'L', 'N', 'U', max_bw, min( bwl, max_bw-1 ), bwl,
1139 $ af( odd_size*bwu+mbw2+1), max_bw+1,
1140 $ work( 1+max_bw*(max_bw-bwl) ), max_bw, info )
1141*
1142* Transpose back
1143*
1144 CALL zlatcpy(
1145 $ 'N', max_bw, max_bw, work( 1 ), max_bw,
1146 $ af( work_u+odd_size*bwl+
1147 $ 2*mbw2+1), max_bw )
1148*
1149*
1150*
1151* Since ZTBTRS has no "left-right" option, we must transpose
1152*
1153 CALL zlatcpy( 'N', max_bw, max_bw, af( odd_size*bwu+
1154 $ 2*mbw2+1), max_bw, work( 1 ), max_bw )
1155*
1156 CALL ztbtrs(
1157 $ 'U', 'C', 'N', max_bw, min( bwu, max_bw-1 ), bwu,
1158 $ af( odd_size*bwu+mbw2+1-min( bwu, max_bw-1 )), max_bw+1,
1159 $ work( 1+max_bw*(max_bw-bwu) ), max_bw, info )
1160*
1161* Transpose back
1162*
1163 CALL zlatcpy(
1164 $ 'N', max_bw, max_bw, work( 1 ), max_bw,
1165 $ af( odd_size*bwu+
1166 $ 2*mbw2+1), max_bw )
1167*
1168*
1169 ENDIF
1170* End of "if( info.eq.0 ) then"
1171*
1172* Use offdiag block(s) to calculate modification to diag block
1173* of processor to the left
1174*
1175 CALL zgemm( 'N', 'C', max_bw, max_bw, max_bw, -cone,
1176 $ af( (odd_size)*bwu+2*mbw2+1 ), max_bw,
1177 $ af( work_u+(odd_size)*bwl+2*mbw2+1 ), max_bw,
1178 $ czero, work( 1 ), max_bw )
1179*
1180* Send contribution to diagonal block's owning processor.
1181*
1182 CALL zgesd2d( ictxt, max_bw, max_bw, work( 1 ), max_bw,
1183 $ 0, mycol-level_dist )
1184*
1185* *******************************************************
1186*
1187 IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 ) THEN
1188*
1189* Decide which processor offdiagonal block(s) goes to
1190*
1191 IF( ( mod( mycol/( 2*level_dist ),2 )) .EQ.0 ) THEN
1192 comm_proc = mycol + level_dist
1193 ELSE
1194 comm_proc = mycol - level_dist
1195 ENDIF
1196*
1197* Use offdiagonal blocks to calculate offdiag
1198* block to send to neighboring processor. Depending
1199* on circumstances, may need to transpose the matrix.
1200*
1201 CALL zgemm( 'N', 'N', max_bw, max_bw, max_bw, -cone,
1202 $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw,
1203 $ af( odd_size*bwu+1 ), max_bw, czero,
1204 $ work( 1 ), max_bw )
1205*
1206* Send contribution to offdiagonal block's owning processor.
1207*
1208 CALL zgesd2d( ictxt, max_bw, max_bw, work( 1 ), max_bw,
1209 $ 0, comm_proc )
1210*
1211 CALL zgemm( 'N', 'N', max_bw, max_bw, max_bw, -cone,
1212 $ af( odd_size*bwu+2*mbw2+1 ), max_bw,
1213 $ af( work_u+odd_size*bwl+1 ), max_bw, czero,
1214 $ work( 1 ), max_bw )
1215*
1216* Send contribution to offdiagonal block's owning processor.
1217*
1218 CALL zgesd2d( ictxt, max_bw, max_bw, work( 1 ), max_bw,
1219 $ 0, comm_proc )
1220*
1221 ENDIF
1222*
1223 ENDIF
1224* End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1225*
1226 14 CONTINUE
1227*
1228*
1229 1000 CONTINUE
1230*
1231*
1232* Free BLACS space used to hold standard-form grid.
1233*
1234 IF( ictxt_save .NE. ictxt_new ) THEN
1235 CALL blacs_gridexit( ictxt_new )
1236 ENDIF
1237*
1238 1234 CONTINUE
1239*
1240* Restore saved input parameters
1241*
1242 ictxt = ictxt_save
1243 np = np_save
1244*
1245* Output minimum worksize
1246*
1247 work( 1 ) = work_size_min
1248*
1249* Make INFO consistent across processors
1250*
1251 CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info,
1252 $ -1, 0, 0 )
1253*
1254 IF( mycol.EQ.0 ) THEN
1255 CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
1256 ELSE
1257 CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
1258 ENDIF
1259*
1260*
1261 RETURN
1262*
1263* End of PZDBTRF
1264*
1265 END