SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
pddbtrf.f
1 SUBROUTINE pddbtrf( N, BWL, BWU, A, JA, DESCA, AF, LAF, WORK,
2 $ LWORK, INFO )
3*
4* -- ScaLAPACK routine (version 2.0.2) --
5* Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6* May 1 2012
7*
8* .. Scalar Arguments ..
9 INTEGER BWL, BWU, INFO, JA, LAF, LWORK, N
10* ..
11* .. Array Arguments ..
12 INTEGER DESCA( * )
13 DOUBLE PRECISION A( * ), AF( * ), WORK( * )
14* ..
15*
16* Purpose
17* =======
18*
19* PDDBTRF computes an LU factorization
20* of an N-by-N real banded
21* diagonally dominant-like distributed matrix
22* with bandwidths BWL and BWU: A(1:N, JA:JA+N-1).
23* Reordering is used to increase parallelism in the factorization.
24* This reordering results in factors that are DIFFERENT from those
25* produced by equivalent sequential codes. These factors cannot
26* be used directly by users; however, they can be used in
27* subsequent calls to PDDBTRS to solve linear systems.
28*
29* The factorization has the form
30*
31* P A(1:N, JA:JA+N-1) P^T = L U
32*
33* where U is a banded upper triangular matrix and L is banded
34* lower triangular, and P is a permutation matrix.
35*
36* =====================================================================
37*
38* Arguments
39* =========
40*
41*
42* N (global input) INTEGER
43* The number of rows and columns to be operated on, i.e. the
44* order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
45*
46* BWL (global input) INTEGER
47* Number of subdiagonals. 0 <= BWL <= N-1
48*
49* BWU (global input) INTEGER
50* Number of superdiagonals. 0 <= BWU <= N-1
51*
52* A (local input/local output) DOUBLE PRECISION pointer into
53* local memory to an array with first dimension
54* LLD_A >=(bwl+bwu+1) (stored in DESCA).
55* On entry, this array contains the local pieces of the
56* N-by-N unsymmetric banded distributed matrix
57* A(1:N, JA:JA+N-1) to be factored.
58* This local portion is stored in the packed banded format
59* used in LAPACK. Please see the Notes below and the
60* ScaLAPACK manual for more detail on the format of
61* distributed matrices.
62* On exit, this array contains information containing details
63* of the factorization.
64* Note that permutations are performed on the matrix, so that
65* the factors returned are different from those returned
66* by LAPACK.
67*
68* JA (global input) INTEGER
69* The index in the global array A that points to the start of
70* the matrix to be operated on (which may be either all of A
71* or a submatrix of A).
72*
73* DESCA (global and local input) INTEGER array of dimension DLEN.
74* if 1D type (DTYPE_A=501), DLEN >= 7;
75* if 2D type (DTYPE_A=1), DLEN >= 9.
76* The array descriptor for the distributed matrix A.
77* Contains information on the mapping of A to memory. Please
78* see NOTES below for full description and options.
79*
80* AF (local output) DOUBLE PRECISION array, dimension LAF.
81* Auxiliary Fillin Space.
82* Fillin is created during the factorization routine
83* PDDBTRF and this is stored in AF. If a linear system
84* is to be solved using PDDBTRS after the factorization
85* routine, AF *must not be altered* after the factorization.
86*
87* LAF (local input) INTEGER
88* Size of user-input Auxiliary Fillin space AF. Must be >=
89* NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu)
90* If LAF is not large enough, an error code will be returned
91* and the minimum acceptable size will be returned in AF( 1 )
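*          As a worked example (hypothetical values, not taken from
*          any particular run): with NB = 64, BWL = 3 and BWU = 2 this
*          gives LAF >= 64*(3+2) + 6*3*3 = 320 + 54 = 374.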
92*
93* WORK (local workspace/local output)
94* DOUBLE PRECISION temporary workspace. This space may
95* be overwritten in between calls to routines. WORK must be
96* the size given in LWORK.
97* On exit, WORK( 1 ) contains the minimal LWORK.
98*
99* LWORK (local input or global input) INTEGER
100* Size of user-input workspace WORK.
101* If LWORK is too small, the minimal acceptable size will be
102* returned in WORK(1) and an error code is returned. LWORK>=
103* max(bwl,bwu)*max(bwl,bwu)
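*          With the same hypothetical BWL = 3, BWU = 2 as in the LAF
*          example above, this gives LWORK >= 3*3 = 9.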
104*
105* INFO (global output) INTEGER
106* = 0: successful exit
107* < 0: If the i-th argument is an array and the j-entry had
108* an illegal value, then INFO = -(i*100+j), if the i-th
109* argument is a scalar and had an illegal value, then
110* INFO = -i.
111* > 0: If INFO = K<=NPROCS, the submatrix stored on processor
112* INFO and factored locally was not
113* diagonally dominant-like, and
114* the factorization was not completed.
115* If INFO = K>NPROCS, the submatrix stored on processor
116* INFO-NPROCS representing interactions with other
117* processors was not
118* stably factorable without interchanges,
119* and the factorization was not completed.
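*          As an illustration of the encoding for array arguments:
*          INFO = -602 would indicate that entry 2 of argument 6
*          (DESCA) had an illegal value, while INFO = -1 would
*          indicate an illegal value of N.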
120*
121* =====================================================================
122*
123* Restrictions
124* ============
125*
126* The following are restrictions on the input parameters. Some of these
127* are temporary and will be removed in future releases, while others
128* may reflect fundamental technical limitations.
129*
130* Non-cyclic restriction: VERY IMPORTANT!
131* P*NB>= mod(JA-1,NB)+N.
132* The mapping for matrices must be blocked, reflecting the nature
133* of the divide and conquer algorithm as a task-parallel algorithm.
134* This formula in words is: no processor may have more than one
135* chunk of the matrix.
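*   As a worked example with hypothetical sizes: for P = 4 processes,
*   NB = 100 and JA = 1, the restriction P*NB >= mod(JA-1,NB)+N
*   allows N up to 400, so that no process holds more than one block.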
136*
137* Blocksize cannot be too small:
138* If the matrix spans more than one processor, the following
139* restriction on NB, the size of each block on each processor,
140* must hold:
141* NB >= 2*MAX(BWL,BWU)
142* The bulk of parallel computation is done on the matrix of size
143* O(NB) on each processor. If this is too small, divide and conquer
144* is a poor choice of algorithm.
145*
146* Submatrix reference:
147* JA = IB
148* Alignment restriction that prevents unnecessary communication.
149*
150*
151* =====================================================================
152*
153* Notes
154* =====
155*
156* If the factorization routine and the solve routine are to be called
157* separately (to solve various sets of righthand sides using the same
158* coefficient matrix), the auxiliary space AF *must not be altered*
159* between calls to the factorization routine and the solve routine.
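*
*  A minimal sketch of this two-call sequence (hypothetical variable
*  names; see the PDDBTRS header for its exact argument list):
*
*     CALL PDDBTRF( N, BWL, BWU, A, JA, DESCA, AF, LAF, WORK, LWORK,
*    $              INFO )
*     IF( INFO.EQ.0 ) THEN
*        ... set up the right-hand side(s) and their descriptor ...
*        CALL PDDBTRS( ..., A, JA, DESCA, ..., AF, LAF, WORK, LWORK,
*    $                 INFO )
*     END IF
*
*  where A, DESCA, AF and LAF are passed to PDDBTRS exactly as
*  returned by PDDBTRF.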
160*
161* The best algorithm for solving banded and tridiagonal linear systems
162* depends on a variety of parameters, especially the bandwidth.
163* Currently, only algorithms designed for the case N/P >> bw are
164* implemented. These go by many names, including Divide and Conquer,
165* Partitioning, domain decomposition-type, etc.
166*
167* Algorithm description: Divide and Conquer
168*
169* The Divide and Conquer algorithm assumes the matrix is narrowly
170* banded compared with the number of equations. In this situation,
171* it is best to distribute the input matrix A one-dimensionally,
172* with columns atomic and rows divided amongst the processes.
173* The basic algorithm divides the banded matrix up into
174* P pieces with one stored on each processor,
175* and then proceeds in 2 phases for the factorization or 3 for the
176* solution of a linear system.
177* 1) Local Phase:
178* The individual pieces are factored independently and in
179* parallel. These factors are applied to the matrix creating
180* fillin, which is stored in a non-inspectable way in auxiliary
181* space AF. Mathematically, this is equivalent to reordering
182* the matrix A as P A P^T and then factoring the principal
183* leading submatrix of size equal to the sum of the sizes of
184* the matrices factored on each processor. The factors of
185* these submatrices overwrite the corresponding parts of A
186* in memory.
187* 2) Reduced System Phase:
188* A small (max(bwl,bwu)* (P-1)) system is formed representing
189* interaction of the larger blocks, and is stored (as are its
190* factors) in the space AF. A parallel Block Cyclic Reduction
191* algorithm is used. For a linear system, a parallel front solve
192* followed by an analogous backsolve, both using the structure
193* of the factored matrix, are performed.
194* 3) Backsubstitution Phase:
195* For a linear system, a local backsubstitution is performed on
196* each processor in parallel.
197*
198* Descriptors
199* ===========
200*
201* Descriptors now have *types* and differ from ScaLAPACK 1.0.
202*
203* Note: banded codes can use either the old two dimensional
204* or new one-dimensional descriptors, though the processor grid in
205* both cases *must be one-dimensional*. We describe both types below.
206*
207* Each global data object is described by an associated description
208* vector. This vector stores the information required to establish
209* the mapping between an object element and its corresponding process
210* and memory location.
211*
212* Let A be a generic term for any 2D block cyclically distributed array.
213* Such a global array has an associated description vector DESCA.
214* In the following comments, the character _ should be read as
215* "of the global array".
216*
217* NOTATION STORED IN EXPLANATION
218* --------------- -------------- --------------------------------------
219* DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
220* DTYPE_A = 1.
221* CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
222* the BLACS process grid A is distribu-
223* ted over. The context itself is glo-
224* bal, but the handle (the integer
225* value) may vary.
226* M_A (global) DESCA( M_ ) The number of rows in the global
227* array A.
228* N_A (global) DESCA( N_ ) The number of columns in the global
229* array A.
230* MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
231* the rows of the array.
232* NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
233* the columns of the array.
234* RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
235* row of the array A is distributed.
236* CSRC_A (global) DESCA( CSRC_ ) The process column over which the
237* first column of the array A is
238* distributed.
239* LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
240* array. LLD_A >= MAX(1,LOCr(M_A)).
241*
242* Let K be the number of rows or columns of a distributed matrix,
243* and assume that its process grid has dimension p x q.
244* LOCr( K ) denotes the number of elements of K that a process
245* would receive if K were distributed over the p processes of its
246* process column.
247* Similarly, LOCc( K ) denotes the number of elements of K that a
248* process would receive if K were distributed over the q processes of
249* its process row.
250* The values of LOCr() and LOCc() may be determined via a call to the
251* ScaLAPACK tool function, NUMROC:
252* LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
253* LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
254* An upper bound for these quantities may be computed by:
255* LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
256* LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
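* As a numerical illustration with hypothetical sizes: for M = 1000,
* MB_A = 64 and NPROW = 4,
* LOCr( M ) <= ceil( ceil(1000/64)/4 )*64 = ceil(16/4)*64 = 256.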
257*
258*
259* One-dimensional descriptors:
260*
261* One-dimensional descriptors are a new addition to ScaLAPACK since
262* version 1.0. They simplify and shorten the descriptor for 1D
263* arrays.
264*
265* Since ScaLAPACK supports two-dimensional arrays as the fundamental
266* object, we allow 1D arrays to be distributed either over the
267* first dimension of the array (as if the grid were P-by-1) or the
268* 2nd dimension (as if the grid were 1-by-P). This choice is
269* indicated by the descriptor type (501 or 502)
270* as described below.
271*
272* IMPORTANT NOTE: the actual BLACS grid represented by the
273* CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
274* irrespective of which one-dimensional descriptor type
275* (501 or 502) is input.
276* This routine will interpret the grid properly either way.
277* ScaLAPACK routines *do not support intercontext operations* so that
278* the grid passed to a single ScaLAPACK routine *must be the same*
279* for all array descriptors passed to that routine.
280*
281* NOTE: In all cases where 1D descriptors are used, 2D descriptors
282* may also be used, since a one-dimensional array is a special case
283* of a two-dimensional array with one dimension of size unity.
284* The two-dimensional array used in this case *must* be of the
285* proper orientation:
286* If the appropriate one-dimensional descriptor is DTYPE_A=501
287* (1 by P type), then the two dimensional descriptor must
288* have a CTXT value that refers to a 1 by P BLACS grid;
289* If the appropriate one-dimensional descriptor is DTYPE_A=502
290* (P by 1 type), then the two dimensional descriptor must
291* have a CTXT value that refers to a P by 1 BLACS grid.
292*
293*
294* Summary of allowed descriptors, types, and BLACS grids:
295* DTYPE 501 502 1 1
296* BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
297* -----------------------------------------------------
298* A OK NO OK NO
299* B NO OK NO OK
300*
301* Let A be a generic term for any 1D block cyclically distributed array.
302* Such a global array has an associated description vector DESCA.
303* In the following comments, the character _ should be read as
304* "of the global array".
305*
306* NOTATION STORED IN EXPLANATION
307* --------------- ---------- ------------------------------------------
308* DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
309* TYPE_A = 501: 1-by-P grid.
310* TYPE_A = 502: P-by-1 grid.
311* CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
312* the BLACS process grid A is distribu-
313* ted over. The context itself is glo-
314* bal, but the handle (the integer
315* value) may vary.
316* N_A (global) DESCA( 3 ) The size of the array dimension being
317* distributed.
318* NB_A (global) DESCA( 4 ) The blocking factor used to distribute
319* the distributed dimension of the array.
320* SRC_A (global) DESCA( 5 ) The process row or column over which the
321* first row or column of the array
322* is distributed.
323* LLD_A (local) DESCA( 6 ) The leading dimension of the local array
324* storing the local blocks of the distri-
325* buted array A. Minimum value of LLD_A
326* depends on TYPE_A.
327* TYPE_A = 501: LLD_A >= max( size of
328* undistributed dimension, 1 ).
329* TYPE_A = 502: LLD_A >= max( NB_A, 1 ).
330* Reserved DESCA( 7 ) Reserved for future use.
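*
*  A minimal sketch of filling a type-501 descriptor for use with this
*  routine (hypothetical code; assumes ICTXT holds a one-dimensional
*  BLACS grid and that the first column resides on process column 0):
*
*     DESCA( 1 ) = 501
*     DESCA( 2 ) = ICTXT
*     DESCA( 3 ) = N
*     DESCA( 4 ) = NB
*     DESCA( 5 ) = 0
*     DESCA( 6 ) = BWL+BWU+1
*     DESCA( 7 ) = 0
*
*  Here DESCA( 6 ) is the local leading dimension, which must be at
*  least BWL+BWU+1 as required by the description of A above.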
331*
332* =====================================================================
333*
334* Code Developer: Andrew J. Cleary, University of Tennessee.
335* Current address: Lawrence Livermore National Labs.
336* Last modified by: Peter Arbenz, Institute of Scientific Computing,
337* ETH, Zurich.
338*
339* =====================================================================
340*
341* .. Parameters ..
342 DOUBLE PRECISION ONE
343 parameter( one = 1.0d+0 )
344 DOUBLE PRECISION ZERO
345 parameter( zero = 0.0d+0 )
346 INTEGER INT_ONE
347 parameter( int_one = 1 )
348 INTEGER DESCMULT, BIGNUM
349 parameter( descmult = 100, bignum = descmult*descmult )
350 INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
351 $ lld_, mb_, m_, nb_, n_, rsrc_
352 parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
353 $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
354 $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
355* ..
356* .. Local Scalars ..
357 INTEGER COMM_PROC, CSRC, FIRST_PROC, I, I1, I2, ICTXT,
358 $ ictxt_new, ictxt_save, idum3, ja_new, laf_min,
359 $ level_dist, llda, max_bw, mbw2, mycol, myrow,
360 $ my_num_cols, nb, next_tri_size_m,
361 $ next_tri_size_n, np, npcol, nprow, np_save,
362 $ odd_size, ofst, part_offset, part_size,
363 $ prev_tri_size_m, prev_tri_size_n, return_code,
364 $ store_n_a, up_prev_tri_size_m,
365 $ up_prev_tri_size_n, work_size_min, work_u
366* ..
367* .. Local Arrays ..
368 INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 )
369* ..
370* .. External Subroutines ..
371 EXTERNAL blacs_gridexit, blacs_gridinfo, daxpy, ddbtrf,
372 $ desc_convert, dgemm, dgemv, dgerv2d, dgesd2d,
373 $ dlamov, dlatcpy, dtbtrs, dtrmm, dtrrv2d,
374 $ dtrsd2d, globchk, igamx2d, igebr2d, igebs2d,
375 $ pxerbla, reshape
376* ..
377* .. External Functions ..
378 INTEGER NUMROC
379 EXTERNAL numroc
380* ..
381* .. Intrinsic Functions ..
382 INTRINSIC max, min, mod
383* ..
384* .. Executable Statements ..
385*
386* Test the input parameters
387*
388 info = 0
389*
390* Convert descriptor into standard form for easy access to
391* parameters, check that grid is of right shape.
392*
393 desca_1xp( 1 ) = 501
394*
395 CALL desc_convert( desca, desca_1xp, return_code )
396*
397 IF( return_code.NE.0 ) THEN
398 info = -( 6*100+2 )
399 END IF
400*
401* Get values out of descriptor for use in code.
402*
403 ictxt = desca_1xp( 2 )
404 csrc = desca_1xp( 5 )
405 nb = desca_1xp( 4 )
406 llda = desca_1xp( 6 )
407 store_n_a = desca_1xp( 3 )
408*
409* Get grid parameters
410*
411*
412* Size of separator blocks is maximum of bandwidths
413*
414 max_bw = max( bwl, bwu )
415 mbw2 = max_bw*max_bw
416*
417 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
418 np = nprow*npcol
419*
420*
421*
422 IF( lwork.LT.-1 ) THEN
423 info = -10
424 ELSE IF( lwork.EQ.-1 ) THEN
425 idum3 = -1
426 ELSE
427 idum3 = 1
428 END IF
429*
430 IF( n.LT.0 ) THEN
431 info = -1
432 END IF
433*
434 IF( n+ja-1.GT.store_n_a ) THEN
435 info = -( 6*100+6 )
436 END IF
437*
438 IF( ( bwl.GT.n-1 ) .OR. ( bwl.LT.0 ) ) THEN
439 info = -2
440 END IF
441*
442 IF( ( bwu.GT.n-1 ) .OR. ( bwu.LT.0 ) ) THEN
443 info = -3
444 END IF
445*
446 IF( llda.LT.( bwl+bwu+1 ) ) THEN
447 info = -( 6*100+6 )
448 END IF
449*
450 IF( nb.LE.0 ) THEN
451 info = -( 6*100+4 )
452 END IF
453*
454* Argument checking that is specific to Divide & Conquer routine
455*
456 IF( nprow.NE.1 ) THEN
457 info = -( 6*100+2 )
458 END IF
459*
460 IF( n.GT.np*nb-mod( ja-1, nb ) ) THEN
461 info = -( 1 )
462 CALL pxerbla( ictxt, 'PDDBTRF, D&C alg.: only 1 block per proc'
463 $ , -info )
464 RETURN
465 END IF
466*
467 IF( ( ja+n-1.GT.nb ) .AND. ( nb.LT.2*max( bwl, bwu ) ) ) THEN
468 info = -( 6*100+4 )
469 CALL pxerbla( ictxt, 'PDDBTRF, D&C alg.: NB too small', -info )
470 RETURN
471 END IF
472*
473*
474* Check auxiliary storage size
475*
476 laf_min = nb*( bwl+bwu ) + 6*max( bwl, bwu )*max( bwl, bwu )
477*
478 IF( laf.LT.laf_min ) THEN
479 info = -8
480* put minimum value of laf into AF( 1 )
481 af( 1 ) = laf_min
482 CALL pxerbla( ictxt, 'PDDBTRF: auxiliary storage error ',
483 $ -info )
484 RETURN
485 END IF
486*
487* Check worksize
488*
489 work_size_min = max( bwl, bwu )*max( bwl, bwu )
490*
491 work( 1 ) = work_size_min
492*
493 IF( lwork.LT.work_size_min ) THEN
494 IF( lwork.NE.-1 ) THEN
495 info = -10
496 CALL pxerbla( ictxt, 'PDDBTRF: worksize error ', -info )
497 END IF
498 RETURN
499 END IF
500*
501* Pack params and positions into arrays for global consistency check
502*
503 param_check( 9, 1 ) = desca( 5 )
504 param_check( 8, 1 ) = desca( 4 )
505 param_check( 7, 1 ) = desca( 3 )
506 param_check( 6, 1 ) = desca( 1 )
507 param_check( 5, 1 ) = ja
508 param_check( 4, 1 ) = bwu
509 param_check( 3, 1 ) = bwl
510 param_check( 2, 1 ) = n
511 param_check( 1, 1 ) = idum3
512*
513 param_check( 9, 2 ) = 605
514 param_check( 8, 2 ) = 604
515 param_check( 7, 2 ) = 603
516 param_check( 6, 2 ) = 601
517 param_check( 5, 2 ) = 5
518 param_check( 4, 2 ) = 3
519 param_check( 3, 2 ) = 2
520 param_check( 2, 2 ) = 1
521 param_check( 1, 2 ) = 10
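*
*     (Column 2 of PARAM_CHECK holds the position codes used in the
*     INFO error encoding above: e.g. 604 refers to entry 4 of
*     argument 6, DESCA, and 1 refers to the scalar argument N.)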
522*
523* Want to find errors with MIN( ), so if no error, set it to a big
524* number. If there already is an error, multiply by the
525* descriptor multiplier.
526*
527 IF( info.GE.0 ) THEN
528 info = bignum
529 ELSE IF( info.LT.-descmult ) THEN
530 info = -info
531 ELSE
532 info = -info*descmult
533 END IF
534*
535* Check consistency across processors
536*
537 CALL globchk( ictxt, 9, param_check, 9, param_check( 1, 3 ),
538 $ info )
539*
540* Prepare output: set info = 0 if no error, and divide by DESCMULT
541* if error is not in a descriptor entry.
542*
543 IF( info.EQ.bignum ) THEN
544 info = 0
545 ELSE IF( mod( info, descmult ).EQ.0 ) THEN
546 info = -info / descmult
547 ELSE
548 info = -info
549 END IF
550*
551 IF( info.LT.0 ) THEN
552 CALL pxerbla( ictxt, 'PDDBTRF', -info )
553 RETURN
554 END IF
555*
556* Quick return if possible
557*
558 IF( n.EQ.0 )
559 $ RETURN
560*
561*
562* Adjust addressing into matrix space to properly get into
563* the beginning part of the relevant data
564*
565 part_offset = nb*( ( ja-1 ) / ( npcol*nb ) )
566*
567 IF( ( mycol-csrc ).LT.( ja-part_offset-1 ) / nb ) THEN
568 part_offset = part_offset + nb
569 END IF
570*
571 IF( mycol.LT.csrc ) THEN
572 part_offset = part_offset - nb
573 END IF
574*
575* Form a new BLACS grid (the "standard form" grid) with only procs
576* holding part of the matrix, of size 1xNP where NP is adjusted,
577* starting at csrc=0, with JA modified to reflect dropped procs.
578*
579* First processor to hold part of the matrix:
580*
581 first_proc = mod( ( ja-1 ) / nb+csrc, npcol )
582*
583* Calculate the new JA while dropping off unused processors.
584*
585 ja_new = mod( ja-1, nb ) + 1
586*
587* Save and compute new value of NP
588*
589 np_save = np
590 np = ( ja_new+n-2 ) / nb + 1
591*
592* Call utility routine that forms "standard-form" grid
593*
594 CALL reshape( ictxt, int_one, ictxt_new, int_one, first_proc,
595 $ int_one, np )
596*
597* Use new context from standard grid as context.
598*
599 ictxt_save = ictxt
600 ictxt = ictxt_new
601 desca_1xp( 2 ) = ictxt_new
602*
603* Get information about new grid.
604*
605 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
606*
607* Drop out processors that do not have part of the matrix.
608*
609 IF( myrow.LT.0 ) THEN
610 GO TO 140
611 END IF
612*
613* ********************************
614* Values reused throughout routine
615*
616* User-input value of partition size
617*
618 part_size = nb
619*
620* Number of columns in each processor
621*
622 my_num_cols = numroc( n, part_size, mycol, 0, npcol )
623*
624* Offset in columns to beginning of main partition in each proc
625*
626 IF( mycol.EQ.0 ) THEN
627 part_offset = part_offset + mod( ja_new-1, part_size )
628 my_num_cols = my_num_cols - mod( ja_new-1, part_size )
629 END IF
630*
631* Offset in elements
632*
633 ofst = part_offset*llda
634*
635* Size of main (or odd) partition in each processor
636*
637 odd_size = my_num_cols
638 IF( mycol.LT.np-1 ) THEN
639 odd_size = odd_size - max_bw
640 END IF
641*
642* Offset to workspace for Upper triangular factor
643*
644 work_u = bwu*odd_size + 3*mbw2
645*
646*
647* Zero out space for fillin
648*
649 DO 10 i = 1, laf_min
650 af( i ) = zero
651 10 CONTINUE
652*
653* Zero out space for work
654*
655 DO 20 i = 1, work_size_min
656 work( i ) = zero
657 20 CONTINUE
658*
659* Begin main code
660*
661*
662********************************************************************
663* PHASE 1: Local computation phase.
664********************************************************************
665*
666*
667* Sizes of the extra triangles communicated between processors
668*
669 IF( mycol.GT.0 ) THEN
670 prev_tri_size_m = min( bwl, numroc( n, part_size, mycol, 0,
671 $ npcol ) )
672 prev_tri_size_n = min( bwl, numroc( n, part_size, mycol-1, 0,
673 $ npcol ) )
674 END IF
675*
676 IF( mycol.GT.0 ) THEN
677 up_prev_tri_size_m = min( bwu,
678 $ numroc( n, part_size, mycol, 0, npcol ) )
679 up_prev_tri_size_n = min( bwu,
680 $ numroc( n, part_size, mycol-1, 0,
681 $ npcol ) )
682 END IF
683*
684 IF( mycol.LT.npcol-1 ) THEN
685 next_tri_size_m = min( bwl, numroc( n, part_size, mycol+1, 0,
686 $ npcol ) )
687 next_tri_size_n = min( bwl, numroc( n, part_size, mycol, 0,
688 $ npcol ) )
689 END IF
690*
691 IF( mycol.LT.np-1 ) THEN
692* Transfer last triangle D_i of local matrix to next processor
693* which needs it to calculate fillin due to factorization of
694* its main (odd) block A_i.
695* Overlap the send with the factorization of A_i.
696*
697 CALL dtrsd2d( ictxt, 'U', 'N', next_tri_size_m,
698 $ next_tri_size_n, a( ofst+( my_num_cols-bwl )*
699 $ llda+( bwl+bwu+1 ) ), llda-1, 0, mycol+1 )
700*
701 END IF
702*
703*
704* Factor main partition A_i = L_i {U_i} in each processor
705*
706 CALL ddbtrf( odd_size, odd_size, bwl, bwu, a( ofst+1 ), llda,
707 $ info )
708*
709 IF( info.NE.0 ) THEN
710 info = mycol + 1
711 GO TO 30
712 END IF
713*
714*
715 IF( mycol.LT.np-1 ) THEN
716*
717* Apply factorization to lower connection block BL_i;
718* transpose the connection block in preparation.
719* Apply factorization to upper connection block BU_i;
720* move the connection block in preparation.
721*
722 CALL dlatcpy( 'U', bwl, bwl, a( ( ofst+( bwl+bwu+1 )+
723 $ ( odd_size-bwl )*llda ) ), llda-1,
724 $ af( odd_size*bwu+2*mbw2+1+max_bw-bwl ), max_bw )
725 CALL dlamov( 'L', bwu, bwu, a( ( ofst+1+odd_size*llda ) ),
726 $ llda-1, af( work_u+odd_size*bwl+2*mbw2+1+max_bw-
727 $ bwu ), max_bw )
728*
729* Perform the triangular system solve {L_i}{{BU'}_i} = {B_i}
730*
731 CALL dtbtrs( 'L', 'N', 'U', bwu, bwl, bwu,
732 $ a( ofst+bwu+1+( odd_size-bwu )*llda ), llda,
733 $ af( work_u+odd_size*bwl+2*mbw2+1+max_bw-bwu ),
734 $ max_bw, info )
735*
736* Perform the triangular solve {U_i}^T{BL'}_i^T = {BL_i}^T
737*
738 CALL dtbtrs( 'U', 'T', 'N', bwl, bwu, bwl,
739 $ a( ofst+1+( odd_size-bwl )*llda ), llda,
740 $ af( odd_size*bwu+2*mbw2+1+max_bw-bwl ), max_bw,
741 $ info )
742*
743* transpose resulting block to its location
744* in main storage.
745*
746 CALL dlatcpy( 'L', bwl, bwl, af( odd_size*bwu+2*mbw2+1+max_bw-
747 $ bwl ), max_bw, a( ( ofst+( bwl+bwu+1 )+
748 $ ( odd_size-bwl )*llda ) ), llda-1 )
749*
750* Move the resulting block back to its location in main storage.
751*
752 CALL dlamov( 'L', bwu, bwu, af( work_u+odd_size*bwl+2*mbw2+1+
753 $ max_bw-bwu ), max_bw, a( ( ofst+1+odd_size*
754 $ llda ) ), llda-1 )
755*
756*
757* Compute contribution to diagonal block(s) of reduced system.
758* {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i}
759*
760* The following method uses more flops than necessary but
761* does not necessitate the writing of a new BLAS routine.
762*
763*
764 CALL dgemm( 'T', 'N', max_bw, max_bw, max_bw, -one,
765 $ af( odd_size*bwu+2*mbw2+1 ), max_bw,
766 $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw, one,
767 $ a( ofst+odd_size*llda+1+bwu ), llda-1 )
768*
769 END IF
770* End of "if ( MYCOL .lt. NP-1 )..." loop
771*
772*
773 30 CONTINUE
774* If the processor could not locally factor, it jumps here.
775*
776 IF( mycol.NE.0 ) THEN
777* Discard temporary matrix stored beginning in
778* AF( (odd_size+2*bwl, bwu)*bwl, bwu+1 ) and use for
779* off_diagonal block of reduced system.
780*
781* Receive previously transmitted matrix section, which forms
782* the right-hand-side for the triangular solve that calculates
783* the "spike" fillin.
784*
785*
786 CALL dtrrv2d( ictxt, 'U', 'N', prev_tri_size_m,
787 $ prev_tri_size_n, af( work_u+1 ), bwl, 0,
788 $ mycol-1 )
789*
790 IF( info.EQ.0 ) THEN
791*
792* Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ .
793*
794* Transpose transmitted triangular matrix $DL_i$
795*
796 DO 50 i1 = 1, bwl
797 DO 40 i2 = i1 + 1, bwl
798 af( work_u+i2+( i1-1 )*bwl ) = af( work_u+i1+( i2-1 )*
799 $ bwl )
800 af( work_u+i1+( i2-1 )*bwl ) = zero
801 40 CONTINUE
802 50 CONTINUE
803*
804 DO 60 i1 = 2, odd_size
805 i2 = min( i1-1, bwl )
806 CALL dgemv( 'N', bwl, i2, -one,
807 $ af( work_u+1+( i1-1-i2 )*bwl ), bwl,
808 $ a( ofst+bwu+1+i2+( i1-1-i2 )*llda ), llda-1,
809 $ one, af( work_u+1+( i1-1 )*bwl ), 1 )
810 60 CONTINUE
811*
812*
813* Calculate the "spike" fillin, ${U_i}^T {{GL}_i}^T = {DU_i}^T$
814*
815*
816* Copy D block into AF storage for solve.
817*
818 CALL dlamov( 'L', up_prev_tri_size_n, up_prev_tri_size_m,
819 $ a( ofst+1 ), llda-1, af( 1 ), bwu )
820*
821 DO 80 i1 = 1, odd_size
822 i2 = min( bwu, i1-1 )
823 CALL dgemv( 'N', bwu, i2, -one, af( ( i1-1-i2 )*bwu+1 ),
824 $ bwu, a( ofst+bwu+1-i2+( i1-1 )*llda ), 1,
825 $ one, af( ( i1-1 )*bwu+1 ), 1 )
826*
827 DO 70 i = 1, bwu
828 af( ( i1-1 )*bwu+i ) = af( ( i1-1 )*bwu+i ) /
829 $ a( ( i1-1 )*llda+bwu+1 )
830 70 CONTINUE
831 80 CONTINUE
832*
833* Calculate the update block for previous proc, E_i = GL_i{GU_i}
834*
835*
836* Zero out space in case result is smaller than storage block
837*
838 DO 90 i = 1, mbw2
839 af( odd_size*bwu+2*mbw2+i ) = zero
840 90 CONTINUE
841*
842 CALL dgemm( 'N', 'T', bwu, bwl, odd_size, -one, af( 1 ),
843 $ bwu, af( work_u+1 ), bwl, zero,
844 $ af( 1+max( 0, bwl-bwu )+odd_size*bwu+( 2*max_bw+
845 $ max( 0, bwu-bwl ) )*max_bw ), max_bw )
846*
847*
848* Initiate send of E_i to previous processor to overlap
849* with next computation.
850*
851 CALL dgesd2d( ictxt, max_bw, max_bw,
852 $ af( odd_size*bwu+2*mbw2+1 ), max_bw, 0,
853 $ mycol-1 )
854*
855*
856 IF( mycol.LT.np-1 ) THEN
857*
858* Calculate off-diagonal block(s) of reduced system.
859* Note: for ease of use in solution of reduced system, store
860* L's off-diagonal block in transpose form.
861*
862* Copy matrix HU_i (the last bwl rows of GU_i) to AFL storage
863* as per requirements of BLAS routine DTRMM.
864* Since we have GU_i stored,
865* transpose HU_i to HU_i^T.
866*
867 CALL dlamov( 'N', bwl, bwl,
868 $ af( work_u+( odd_size-bwl )*bwl+1 ), bwl,
869 $ af( ( odd_size )*bwu+1+( max_bw-bwl ) ),
870 $ max_bw )
871*
872 CALL dtrmm( 'R', 'U', 'T', 'N', bwl, bwl, -one,
873 $ a( ( ofst+( bwl+bwu+1 )+( odd_size-bwl )*
874 $ llda ) ), llda-1, af( ( odd_size )*bwu+1+
875 $ ( max_bw-bwl ) ), max_bw )
876*
877*
878* Copy matrix HL_i (the last bwu rows of GL_i^T) to AFU storage
879* as per requirements of BLAS routine DTRMM.
880* Since we have GL_i^T stored,
881* transpose HL_i^T to HL_i.
882*
883 CALL dlamov( 'N', bwu, bwu, af( ( odd_size-bwu )*bwu+1 ),
884 $ bwu, af( work_u+( odd_size )*bwl+1+max_bw-
885 $ bwu ), max_bw )
886*
887 CALL dtrmm( 'R', 'L', 'N', 'N', bwu, bwu, -one,
888 $ a( ( ofst+1+odd_size*llda ) ), llda-1,
889 $ af( work_u+( odd_size )*bwl+1+max_bw-bwu ),
890 $ max_bw )
891*
892 END IF
893*
894 END IF
895* End of "if ( MYCOL .ne. 0 )..."
896*
897 END IF
898* End of "if (info.eq.0) then"
899*
900*
901* Check to make sure no processors have found errors
902*
903 CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info, -1, 0,
904 $ 0 )
905*
906 IF( mycol.EQ.0 ) THEN
907 CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
908 ELSE
909 CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
910 END IF
911*
912 IF( info.NE.0 ) THEN
913 GO TO 130
914 END IF
915* No errors found, continue
916*
917*
918********************************************************************
919* PHASE 2: Formation and factorization of Reduced System.
920********************************************************************
921*
922* Gather up local sections of reduced system
923*
924*
925* The last processor does not participate in the factorization of
926* the reduced system, having sent its E_i already.
927 IF( mycol.EQ.npcol-1 ) THEN
928 GO TO 120
929 END IF
930*
931* Initiate send of off-diag block(s) to overlap with next part.
932* Off-diagonal block needed on neighboring processor to start
933* algorithm.
934*
935 IF( ( mod( mycol+1, 2 ).EQ.0 ) .AND. ( mycol.GT.0 ) ) THEN
936*
937 CALL dgesd2d( ictxt, max_bw, max_bw, af( odd_size*bwu+1 ),
938 $ max_bw, 0, mycol-1 )
939*
940 CALL dgesd2d( ictxt, max_bw, max_bw,
941 $ af( work_u+odd_size*bwl+1 ), max_bw, 0, mycol-1 )
942*
943 END IF
944*
945* Copy last diagonal block into AF storage for subsequent
946* operations.
947*
948 CALL dlamov( 'N', max_bw, max_bw, a( ofst+odd_size*llda+bwu+1 ),
949 $ llda-1, af( odd_size*bwu+mbw2+1 ), max_bw )
950*
951* Receive contribution to diagonal block that is stored on this proc.
952*
953 IF( mycol.LT.npcol-1 ) THEN
954*
955 CALL dgerv2d( ictxt, max_bw, max_bw,
956 $ af( odd_size*bwu+2*mbw2+1 ), max_bw, 0, mycol+1 )
957*
958* Add contribution to diagonal block
959*
960 CALL daxpy( mbw2, one, af( odd_size*bwu+2*mbw2+1 ), 1,
961 $ af( odd_size*bwu+mbw2+1 ), 1 )
962*
963 END IF
964*
965*
966* *************************************
967* Modification Loop
968*
969* The distance for sending and receiving for each level starts
970* at 1 for the first level.
971 level_dist = 1
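*
*     (LEVEL_DIST doubles on each pass through the loop below, giving
*     the successive levels of the block cyclic reduction described
*     in the header notes.)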
972*
973* Do until this proc is needed to modify other procs' equations
974*
975 100 CONTINUE
976 IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
977 $ GO TO 110
978*
979* Receive and add contribution to diagonal block from the left
980*
981 IF( mycol-level_dist.GE.0 ) THEN
982 CALL dgerv2d( ictxt, max_bw, max_bw, work( 1 ), max_bw, 0,
983 $ mycol-level_dist )
984*
985 CALL daxpy( mbw2, one, work( 1 ), 1, af( odd_size*bwu+mbw2+1 ),
986 $ 1 )
987*
988 END IF
989*
990* Receive and add contribution to diagonal block from the right
991*
992 IF( mycol+level_dist.LT.npcol-1 ) THEN
993 CALL dgerv2d( ictxt, max_bw, max_bw, work( 1 ), max_bw, 0,
994 $ mycol+level_dist )
995*
996 CALL daxpy( mbw2, one, work( 1 ), 1, af( odd_size*bwu+mbw2+1 ),
997 $ 1 )
998*
999 END IF
1000*
1001 level_dist = level_dist*2
1002*
1003 GO TO 100
1004 110 CONTINUE
1005* [End of GOTO Loop]
1006*
1007*
1008* *********************************
1009* Calculate and use this proc's blocks to modify other procs'...
1010*
1011* Factor diagonal block
1012*
1013 CALL ddbtrf( max_bw, max_bw, min( max_bw-1, bwl ),
1014 $ min( max_bw-1, bwu ), af( odd_size*bwu+mbw2+1-
1015 $ ( min( max_bw-1, bwu ) ) ), max_bw+1, info )
1016*
1017 IF( info.NE.0 ) THEN
1018 info = npcol + mycol
1019 END IF
1020*
1021* ****************************************************************
1022* Receive offdiagonal block from processor to right.
1023* If this is the first group of processors, the receive comes
1024* from a different processor than otherwise.
1025*
1026 IF( level_dist.EQ.1 ) THEN
1027 comm_proc = mycol + 1
1028*
1029* Move block into the place where it will be expected for
1030* later calculations.
1031*
1032 CALL dlamov( 'N', max_bw, max_bw, af( odd_size*bwu+1 ), max_bw,
1033 $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw )
1034*
1035 CALL dlamov( 'N', max_bw, max_bw, af( work_u+odd_size*bwl+1 ),
1036 $ max_bw, af( odd_size*bwu+2*mbw2+1 ), max_bw )
1037*
1038 ELSE
1039 comm_proc = mycol + level_dist / 2
1040 END IF
1041*
1042 IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1043*
1044 CALL dgerv2d( ictxt, max_bw, max_bw, af( odd_size*bwu+1 ),
1045 $ max_bw, 0, comm_proc )
1046*
1047 CALL dgerv2d( ictxt, max_bw, max_bw,
1048 $ af( work_u+odd_size*bwl+1 ), max_bw, 0,
1049 $ comm_proc )
1050*
1051 IF( info.EQ.0 ) THEN
1052*
1053*
1054* Modify upper off_diagonal block with diagonal block
1055*
1056*
1057 CALL dtbtrs( 'L', 'N', 'U', bwu, min( bwl, bwu-1 ), bwu,
1058 $ af( odd_size*bwu+mbw2+1+( max_bw+1 )*( max_bw-
1059 $ bwu ) ), max_bw+1, af( work_u+odd_size*bwl+1+
1060 $ max_bw-bwu ), max_bw, info )
1061*
1062* Modify lower off_diagonal block with diagonal block
1063*
1064*
1065 CALL dtbtrs( 'U', 'T', 'N', bwl, min( bwu, bwl-1 ), bwl,
1066 $ af( odd_size*bwu+mbw2+1-min( bwu,
1067 $ bwl-1 )+( max_bw+1 )*( max_bw-bwl ) ),
1068 $ max_bw+1, af( odd_size*bwu+1+max_bw-bwl ),
1069 $ max_bw, info )
1070*
1071 END IF
1072* End of "if ( info.eq.0 ) then"
1073*
1074* Calculate contribution from this block to next diagonal block
1075*
1076 CALL dgemm( 'T', 'N', max_bw, max_bw, max_bw, -one,
1077 $ af( ( odd_size )*bwu+1 ), max_bw,
1078 $ af( work_u+( odd_size )*bwl+1 ), max_bw, zero,
1079 $ work( 1 ), max_bw )
1080*
1081* Send contribution to diagonal block's owning processor.
1082*
1083 CALL dgesd2d( ictxt, max_bw, max_bw, work( 1 ), max_bw, 0,
1084 $ mycol+level_dist )
1085*
1086 END IF
1087* End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1088*
1089*
1090* ****************************************************************
1091* Receive off_diagonal block from left and use to finish with this
1092* processor.
1093*
1094 IF( ( mycol / level_dist.GT.0 ) .AND.
1095 $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) ) THEN
1096*
1097 IF( level_dist.GT.1 ) THEN
1098*
1099* Receive offdiagonal block(s) from proc level_dist/2 to the
1100* left
1101*
1102 CALL dgerv2d( ictxt, max_bw, max_bw,
1103 $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw, 0,
1104 $ mycol-level_dist / 2 )
1105*
1106* Receive offdiagonal block(s) from proc level_dist/2 to the
1107* left
1108*
1109 CALL dgerv2d( ictxt, max_bw, max_bw,
1110 $ af( odd_size*bwu+2*mbw2+1 ), max_bw, 0,
1111 $ mycol-level_dist / 2 )
1112*
1113 END IF
1114*
1115*
1116 IF( info.EQ.0 ) THEN
1117*
1118* Use diagonal block(s) to modify this offdiagonal block
1119*
1120*
1121* Since DTBTRS has no "left-right" option, we must transpose
1122*
1123 CALL dlatcpy( 'N', max_bw, max_bw,
1124 $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw,
1125 $ work( 1 ), max_bw )
1126*
1127 CALL dtbtrs( 'L', 'N', 'U', max_bw, min( bwl, max_bw-1 ),
1128 $ bwl, af( odd_size*bwu+mbw2+1 ), max_bw+1,
1129 $ work( 1+max_bw*( max_bw-bwl ) ), max_bw, info )
1130*
1131* Transpose back
1132*
1133 CALL dlatcpy( 'N', max_bw, max_bw, work( 1 ), max_bw,
1134 $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw )
1135*
1136*
1137*
1138* Since DTBTRS has no "left-right" option, we must transpose
1139*
1140 CALL dlatcpy( 'N', max_bw, max_bw,
1141 $ af( odd_size*bwu+2*mbw2+1 ), max_bw,
1142 $ work( 1 ), max_bw )
1143*
1144 CALL dtbtrs( 'U', 'T', 'N', max_bw, min( bwu, max_bw-1 ),
1145 $ bwu, af( odd_size*bwu+mbw2+1-min( bwu,
1146 $ max_bw-1 ) ), max_bw+1,
1147 $ work( 1+max_bw*( max_bw-bwu ) ), max_bw, info )
1148*
1149* Transpose back
1150*
1151 CALL dlatcpy( 'N', max_bw, max_bw, work( 1 ), max_bw,
1152 $ af( odd_size*bwu+2*mbw2+1 ), max_bw )
1153*
1154*
1155 END IF
1156* End of "if( info.eq.0 ) then"
1157*
1158* Use offdiag block(s) to calculate modification to diag block
1159* of processor to the left
1160*
1161 CALL dgemm( 'N', 'T', max_bw, max_bw, max_bw, -one,
1162 $ af( ( odd_size )*bwu+2*mbw2+1 ), max_bw,
1163 $ af( work_u+( odd_size )*bwl+2*mbw2+1 ), max_bw,
1164 $ zero, work( 1 ), max_bw )
1165*
1166* Send contribution to diagonal block's owning processor.
1167*
1168 CALL dgesd2d( ictxt, max_bw, max_bw, work( 1 ), max_bw, 0,
1169 $ mycol-level_dist )
1170*
1171* *******************************************************
1172*
1173 IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1174*
1175* Decide which processor offdiagonal block(s) goes to
1176*
1177 IF( ( mod( mycol / ( 2*level_dist ), 2 ) ).EQ.0 ) THEN
1178 comm_proc = mycol + level_dist
1179 ELSE
1180 comm_proc = mycol - level_dist
1181 END IF
1182*
1183* Use offdiagonal blocks to calculate offdiag
1184* block to send to neighboring processor. Depending
1185* on circumstances, may need to transpose the matrix.
1186*
1187 CALL dgemm( 'N', 'N', max_bw, max_bw, max_bw, -one,
1188 $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw,
1189 $ af( odd_size*bwu+1 ), max_bw, zero, work( 1 ),
1190 $ max_bw )
1191*
1192* Send contribution to offdiagonal block's owning processor.
1193*
1194 CALL dgesd2d( ictxt, max_bw, max_bw, work( 1 ), max_bw, 0,
1195 $ comm_proc )
1196*
1197 CALL dgemm( 'N', 'N', max_bw, max_bw, max_bw, -one,
1198 $ af( odd_size*bwu+2*mbw2+1 ), max_bw,
1199 $ af( work_u+odd_size*bwl+1 ), max_bw, zero,
1200 $ work( 1 ), max_bw )
1201*
1202* Send contribution to offdiagonal block's owning processor.
1203*
1204 CALL dgesd2d( ictxt, max_bw, max_bw, work( 1 ), max_bw, 0,
1205 $ comm_proc )
1206*
1207 END IF
1208*
1209 END IF
1210* End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1211*
1212 120 CONTINUE
1213*
1214*
1215 130 CONTINUE
1216*
1217*
1218* Free BLACS space used to hold standard-form grid.
1219*
1220 IF( ictxt_save.NE.ictxt_new ) THEN
1221 CALL blacs_gridexit( ictxt_new )
1222 END IF
1223*
1224 140 CONTINUE
1225*
1226* Restore saved input parameters
1227*
1228 ictxt = ictxt_save
1229 np = np_save
1230*
1231* Output minimum worksize
1232*
1233 work( 1 ) = work_size_min
1234*
1235* Make INFO consistent across processors
1236*
1237 CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info, -1, 0,
1238 $ 0 )
1239*
1240 IF( mycol.EQ.0 ) THEN
1241 CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
1242 ELSE
1243 CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
1244 END IF
1245*
1246*
1247 RETURN
1248*
1249* End of PDDBTRF
1250*
1251 END