ScaLAPACK 2.1  2.1 ScaLAPACK: Scalable Linear Algebra PACKage
pdlaqr0.f
Go to the documentation of this file.
1  RECURSIVE SUBROUTINE pdlaqr0( WANTT, WANTZ, N, ILO, IHI, H,
2  \$ DESCH, WR, WI, ILOZ, IHIZ, Z, DESCZ, WORK, LWORK,
3  \$ IWORK, LIWORK, INFO, RECLEVEL )
4 *
5 * Contribution from the Department of Computing Science and HPC2N,
6 * Umea University, Sweden
7 *
8 * -- ScaLAPACK auxiliary routine (version 2.0.1) --
9 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
10 * Univ. of Colorado Denver and University of California, Berkeley.
11 * January, 2012
12 *
13  IMPLICIT NONE
14 *
15 * .. Scalar Arguments ..
16  INTEGER ihi, ihiz, ilo, iloz, info, liwork, lwork, n,
17  \$ reclevel
18  LOGICAL wantt, wantz
19 * ..
20 * .. Array Arguments ..
21  INTEGER desch( * ), descz( * ), iwork( * )
22  DOUBLE PRECISION h( * ), wi( n ), work( * ), wr( n ),
23  \$ z( * )
24 * ..
25 *
26 * Purpose
27 * =======
28 *
29 * PDLAQR0 computes the eigenvalues of a Hessenberg matrix H
30 * and, optionally, the matrices T and Z from the Schur decomposition
31 * H = Z*T*Z**T, where T is an upper quasi-triangular matrix (the
32 * Schur form), and Z is the orthogonal matrix of Schur vectors.
33 *
34 * Optionally Z may be postmultiplied into an input orthogonal
35 * matrix Q so that this routine can give the Schur factorization
36 * of a matrix A which has been reduced to the Hessenberg form H
37 * by the orthogonal matrix Q:
38 * A = Q * H * Q**T = (QZ) * T * (QZ)**T.
39 *
40 * Notes
41 * =====
42 *
43 * Each global data object is described by an associated description
44 * vector. This vector stores the information required to establish
45 * the mapping between an object element and its corresponding process
46 * and memory location.
47 *
48 * Let A be a generic term for any 2D block cyclicly distributed array.
49 * Such a global array has an associated description vector DESCA.
50 * In the following comments, the character _ should be read as
51 * "of the global array".
52 *
53 * NOTATION STORED IN EXPLANATION
54 * --------------- -------------- --------------------------------------
55 * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case,
56 * DTYPE_A = 1.
57 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
58 * the BLACS process grid A is distribu-
59 * ted over. The context itself is glo-
60 * bal, but the handle (the integer
61 * value) may vary.
62 * M_A (global) DESCA( M_ ) The number of rows in the global
63 * array A.
64 * N_A (global) DESCA( N_ ) The number of columns in the global
65 * array A.
66 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
67 * the rows of the array.
68 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
69 * the columns of the array.
70 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
71 * row of the array A is distributed.
72 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
73 * first column of the array A is
74 * distributed.
75 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
76 * array. LLD_A >= MAX(1,LOCr(M_A)).
77 *
78 * Let K be the number of rows or columns of a distributed matrix,
79 * and assume that its process grid has dimension p x q.
80 * LOCr( K ) denotes the number of elements of K that a process
81 * would receive if K were distributed over the p processes of its
82 * process column.
83 * Similarly, LOCc( K ) denotes the number of elements of K that a
84 * process would receive if K were distributed over the q processes of
85 * its process row.
86 * The values of LOCr() and LOCc() may be determined via a call to the
87 * ScaLAPACK tool function, NUMROC:
88 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
89 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
90 * An upper bound for these quantities may be computed by:
91 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
92 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
93 *
94 * Arguments
95 * =========
96 *
97 * WANTT (global input) LOGICAL
98 * = .TRUE. : the full Schur form T is required;
99 * = .FALSE.: only eigenvalues are required.
100 *
101 * WANTZ (global input) LOGICAL
102 * = .TRUE. : the matrix of Schur vectors Z is required;
103 * = .FALSE.: Schur vectors are not required.
104 *
105 * N (global input) INTEGER
106 * The order of the Hessenberg matrix H (and Z if WANTZ).
107 * N >= 0.
108 *
109 * ILO (global input) INTEGER
110 * IHI (global input) INTEGER
111 * It is assumed that H is already upper triangular in rows
112 * and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
113 * set by a previous call to PDGEBAL, and then passed to PDGEHRD
114 * when the matrix output by PDGEBAL is reduced to Hessenberg
115 * form. Otherwise ILO and IHI should be set to 1 and N
116 * respectively. If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
117 * If N = 0, then ILO = 1 and IHI = 0.
118 *
119 * H (global input/output) DOUBLE PRECISION array, dimension
120 * (DESCH(LLD_),*)
121 * On entry, the upper Hessenberg matrix H.
122 * On exit, if JOB = 'S', H is upper quasi-triangular in
123 * rows and columns ILO:IHI, with 1-by-1 and 2-by-2 blocks on
124 * the main diagonal. The 2-by-2 diagonal blocks (corresponding
125 * to complex conjugate pairs of eigenvalues) are returned in
126 * standard form, with H(i,i) = H(i+1,i+1) and
127 * H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and JOB = 'E', the
128 * contents of H are unspecified on exit.
129 *
130 * DESCH (global and local input) INTEGER array of dimension DLEN_.
131 * The array descriptor for the distributed matrix H.
132 *
133 * WR (global output) DOUBLE PRECISION array, dimension (N)
134 * WI (global output) DOUBLE PRECISION array, dimension (N)
135 * The real and imaginary parts, respectively, of the computed
136 * eigenvalues ILO to IHI are stored in the corresponding
137 * elements of WR and WI. If two eigenvalues are computed as a
138 * complex conjugate pair, they are stored in consecutive
139 * elements of WR and WI, say the i-th and (i+1)th, with
140 * WI(i) > 0 and WI(i+1) < 0. If JOB = 'S', the
141 * eigenvalues are stored in the same order as on the diagonal
142 * of the Schur form returned in H.
143 *
144 * Z (global input/output) DOUBLE PRECISION array.
145 * If COMPZ = 'V', on entry Z must contain the current
146 * matrix Z of accumulated transformations from, e.g., PDGEHRD,
147 * and on exit Z has been updated; transformations are applied
148 * only to the submatrix Z(ILO:IHI,ILO:IHI).
149 * If COMPZ = 'N', Z is not referenced.
150 * If COMPZ = 'I', on entry Z need not be set and on exit,
151 * if INFO = 0, Z contains the orthogonal matrix Z of the Schur
152 * vectors of H.
153 *
154 * DESCZ (global and local input) INTEGER array of dimension DLEN_.
155 * The array descriptor for the distributed matrix Z.
156 *
157 * WORK (local workspace) DOUBLE PRECISION array, dimension(DWORK)
158 *
159 * LWORK (local input) INTEGER
160 * The length of the workspace array WORK.
161 *
162 * IWORK (local workspace) INTEGER array, dimension (LIWORK)
163 *
164 * LIWORK (local input) INTEGER
165 * The length of the workspace array IWORK.
166 *
167 * INFO (output) INTEGER
168 * = 0: successful exit
169 * .LT. 0: if INFO = -i, the i-th argument had an illegal
170 * value
171 * .GT. 0: if INFO = i, PDLAQR0 failed to compute all of
172 * the eigenvalues. Elements 1:ilo-1 and i+1:n of WR
173 * and WI contain those eigenvalues which have been
174 * successfully computed. (Failures are rare.)
175 *
176 * If INFO .GT. 0 and JOB = 'E', then on exit, the
177 * remaining unconverged eigenvalues are the eigen-
178 * values of the upper Hessenberg matrix rows and
179 * columns ILO through INFO of the final, output
180 * value of H.
181 *
182 * If INFO .GT. 0 and JOB = 'S', then on exit
183 *
184 * (*) (initial value of H)*U = U*(final value of H)
185 *
186 * where U is an orthogonal matrix. The final
187 * value of H is upper Hessenberg and quasi-triangular
188 * in rows and columns INFO+1 through IHI.
189 *
190 * If INFO .GT. 0 and COMPZ = 'V', then on exit
191 *
192 * (final value of Z) = (initial value of Z)*U
193 *
194 * where U is the orthogonal matrix in (*) (regard-
195 * less of the value of JOB.)
196 *
197 * If INFO .GT. 0 and COMPZ = 'I', then on exit
198 * (final value of Z) = U
199 * where U is the orthogonal matrix in (*) (regard-
200 * less of the value of JOB.)
201 *
202 * If INFO .GT. 0 and COMPZ = 'N', then Z is not
203 * accessed.
204 *
205 * ================================================================
206 * Based on contributions by
207 * Robert Granat, Department of Computing Science and HPC2N,
208 * Umea University, Sweden.
209 * ================================================================
210 *
211 * Restrictions: The block size in H and Z must be square and larger
212 * than or equal to six (6) due to restrictions in PDLAQR1, PDLAQR5
213 * and DLAQR6. Moreover, H and Z need to be distributed identically
214 * with the same context.
215 *
216 * ================================================================
217 * References:
218 * K. Braman, R. Byers, and R. Mathias,
219 * The Multi-Shift QR Algorithm Part I: Maintaining Well Focused
220 * Shifts, and Level 3 Performance.
221 * SIAM J. Matrix Anal. Appl., 23(4):929--947, 2002.
222 *
223 * K. Braman, R. Byers, and R. Mathias,
224 * The Multi-Shift QR Algorithm Part II: Aggressive Early
225 * Deflation.
226 * SIAM J. Matrix Anal. Appl., 23(4):948--973, 2002.
227 *
228 * R. Granat, B. Kagstrom, and D. Kressner,
229 * A Novel Parallel QR Algorithm for Hybrid Distributed Momory HPC
230 * Systems.
231 * SIAM J. Sci. Comput., 32(4):2345--2378, 2010.
232 *
233 * ================================================================
234 *
235 * .. Parameters ..
236 *
237 * ==== Exceptional deflation windows: try to cure rare
238 * . slow convergence by increasing the size of the
239 * . deflation window after KEXNW iterations. =====
240 *
241 * ==== Exceptional shifts: try to cure rare slow convergence
242 * . with ad-hoc exceptional shifts every KEXSH iterations.
243 * . The constants WILK1 and WILK2 are used to form the
244 * . exceptional shifts. ====
245 *
246  INTEGER block_cyclic_2d, csrc_, ctxt_, dlen_, dtype_,
247  \$ lld_, mb_, m_, nb_, n_, rsrc_
248  INTEGER recmax
249  PARAMETER ( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
250  \$ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
251  \$ rsrc_ = 7, csrc_ = 8, lld_ = 9, recmax = 3 )
252  INTEGER ntiny
253  PARAMETER ( ntiny = 11 )
254  INTEGER kexnw, kexsh
255  parameter( kexnw = 5, kexsh = 6 )
256  DOUBLE PRECISION wilk1, wilk2
257  parameter( wilk1 = 0.75d0, wilk2 = -0.4375d0 )
258  DOUBLE PRECISION zero, one
259  parameter( zero = 0.0d0, one = 1.0d0 )
260 * ..
261 * .. Local Scalars ..
262  DOUBLE PRECISION aa, bb, cc, cs, dd, sn, ss, swap, elem, t0,
263  \$ elem1, elem2, elem3, alpha, sdsum, stamp
264  INTEGER i, j, inf, it, itmax, k, kacc22, kbot, kdu, ks,
265  \$ kt, ktop, ku, kv, kwh, kwtop, kwv, ld, ls,
266  \$ lwkopt, ndfl, nh, nho, nibble, nmin, ns, nsmax,
267  \$ nsr, nve, nw, nwmax, nwr, lldh, lldz, ii, jj,
268  \$ ictxt, nprow, npcol, myrow, mycol, ipv, ipt,
269  \$ ipw, ipwrk, vrows, vcols, trows, tcols, wrows,
270  \$ wcols, hrsrc, hcsrc, nb, is, ie, nprocs, kk,
271  \$ iroffh, icoffh, hrsrc3, hcsrc3, nwin, totit,
272  \$ sweep, jw, totns, liwkopt, npmin, ictxt_new,
273  \$ myrow_new, mycol_new
274  LOGICAL nwinc, sorted, lquery, recursion
275  CHARACTER jbcmpz*2
276 * ..
277 * .. External Functions ..
278  INTEGER pilaenvx, numroc, indxg2p, iceil, blacs_pnum
279  EXTERNAL pilaenvx, numroc, indxg2p, iceil, blacs_pnum
280 * ..
281 * .. Local Arrays ..
282  INTEGER descv( dlen_ ), desct( dlen_ ), descw( dlen_ ),
283  \$ pmap( 64*64 )
284  DOUBLE PRECISION zdum( 1, 1 )
285 * ..
286 * .. External Subroutines ..
287  EXTERNAL pdlacpy, pdlaqr1, dlanv2, pdlaqr3, pdlaqr5,
288  \$ pdelget, dlaqr0, dlaset, pdgemr2d
289 * ..
290 * .. Intrinsic Functions ..
291  INTRINSIC abs, dble, int, max, min, mod
292 * ..
293 * .. Executable Statements ..
294  info = 0
295  ictxt = desch( ctxt_ )
296  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
297  nprocs = nprow*npcol
298  recursion = reclevel .LT. recmax
299 *
300 * Quick return for N = 0: nothing to do.
301 *
302  IF( n.EQ.0 ) THEN
303  work( 1 ) = one
304  iwork( 1 ) = 1
305  RETURN
306  END IF
307 *
308 * Set up job flags for PILAENV.
309 *
310  IF( wantt ) THEN
311  jbcmpz( 1: 1 ) = 'S'
312  ELSE
313  jbcmpz( 1: 1 ) = 'E'
314  END IF
315  IF( wantz ) THEN
316  jbcmpz( 2: 2 ) = 'V'
317  ELSE
318  jbcmpz( 2: 2 ) = 'N'
319  END IF
320 *
321 * Check if workspace query
322 *
323  lquery = lwork.EQ.-1 .OR. liwork.EQ.-1
324 *
325 * Extract local leading dimensions and block factors of matrices
326 * H and Z
327 *
328  lldh = desch( lld_ )
329  lldz = descz( lld_ )
330  nb = desch( mb_ )
331 *
332 * Tiny (sub-) matrices must use PDLAQR1. (Stops recursion)
333 *
334  IF( n.LE.ntiny ) THEN
335 *
336 * Estimate optimal workspace.
337 *
338  CALL pdlaqr1( wantt, wantz, n, ilo, ihi, h, desch, wr, wi,
339  \$ iloz, ihiz, z, descz, work, lwork, iwork, liwork, info )
340  lwkopt = int( work(1) )
341  liwkopt = iwork(1)
342 *
343 * Completely local matrices uses LAPACK. (Stops recursion)
344 *
345  ELSEIF( n.LE.nb ) THEN
346  IF( myrow.EQ.desch(rsrc_) .AND. mycol.EQ.desch(csrc_) ) THEN
347  CALL dlaqr0( wantt, wantz, n, ilo, ihi, h, desch(lld_),
348  \$ wr, wi, iloz, ihiz, z, descz(lld_), work, lwork, info )
349  IF( n.GT.2 )
350  \$ CALL dlaset( 'L', n-2, n-2, zero, zero, h(3),
351  \$ desch(lld_) )
352  lwkopt = int( work(1) )
353  liwkopt = 1
354  ELSE
355  lwkopt = 1
356  liwkopt = 1
357  END IF
358 *
359 * Do one more step of recursion
360 *
361  ELSE
362 *
363 * Zero out iteration and sweep counters for debugging purposes
364 *
365  totit = 0
366  sweep = 0
367  totns = 0
368 *
369 * Use small bulge multi-shift QR with aggressive early
370 * deflation on larger-than-tiny matrices.
371 *
372 * Hope for the best.
373 *
374  info = 0
375 *
376 * NWR = recommended deflation window size. At this
377 * point, N .GT. NTINY = 11, so there is enough
378 * subdiagonal workspace for NWR.GE.2 as required.
379 * (In fact, there is enough subdiagonal space for
380 * NWR.GE.3.)
381 *
382  nwr = pilaenvx( ictxt, 13, 'PDLAQR0', jbcmpz, n, ilo, ihi,
383  \$ lwork )
384  nwr = max( 2, nwr )
385  nwr = min( ihi-ilo+1, nwr )
386  nw = nwr
387 *
388 * NSR = recommended number of simultaneous shifts.
389 * At this point N .GT. NTINY = 11, so there is at
390 * enough subdiagonal workspace for NSR to be even
391 * and greater than or equal to two as required.
392 *
393  nwin = pilaenvx( ictxt, 19, 'PDLAQR0', jbcmpz, n, nb, nb, nb )
394  nsr = pilaenvx( ictxt, 15, 'PDLAQR0', jbcmpz, n, ilo, ihi,
395  \$ max(nwin,nb) )
396  nsr = min( nsr, ihi-ilo )
397  nsr = max( 2, nsr-mod( nsr, 2 ) )
398 *
399 * Estimate optimal workspace
400 *
401  lwkopt = 3*iceil(nwr,nprow)*iceil(nwr,npcol)
402 *
403 * Workspace query call to PDLAQR3
404 *
405  CALL pdlaqr3( wantt, wantz, n, ilo, ihi, nwr+1, h,
406  \$ desch, iloz, ihiz, z, descz, ls, ld, wr, wi, h,
407  \$ desch, n, h, desch, n, h, desch, work, -1, iwork,
408  \$ liwork, reclevel )
409  lwkopt = lwkopt + int( work( 1 ) )
410  liwkopt = iwork( 1 )
411 *
412 * Workspace query call to PDLAQR5
413 *
414  CALL pdlaqr5( wantt, wantz, 2, n, 1, n, n, wr, wi, h,
415  \$ desch, iloz, ihiz, z, descz, work, -1, iwork,
416  \$ liwork )
417 *
418 * Optimal workspace = MAX(PDLAQR3, PDLAQR5)
419 *
420  lwkopt = max( lwkopt, int( work( 1 ) ) )
421  liwkopt = max( liwkopt, iwork( 1 ) )
422 *
423 * Quick return in case of workspace query.
424 *
425  IF( lquery ) THEN
426  work( 1 ) = dble( lwkopt )
427  iwork( 1 ) = liwkopt
428  RETURN
429  END IF
430 *
431 * PDLAQR1/PDLAQR0 crossover point.
432 *
433  nmin = pilaenvx( ictxt, 12, 'PDLAQR0', jbcmpz, n, ilo, ihi,
434  \$ lwork )
435  nmin = max( ntiny, nmin )
436 *
437 * Nibble crossover point.
438 *
439  nibble = pilaenvx( ictxt, 14, 'PDLAQR0', jbcmpz, n, ilo, ihi,
440  \$ lwork )
441  nibble = max( 0, nibble )
442 *
443 * Accumulate reflections during ttswp? Use block
444 * 2-by-2 structure during matrix-matrix multiply?
445 *
446  kacc22 = pilaenvx( ictxt, 16, 'PDLAQR0', jbcmpz, n, ilo, ihi,
447  \$ lwork )
448  kacc22 = max( 1, kacc22 )
449  kacc22 = min( 2, kacc22 )
450 *
451 * NWMAX = the largest possible deflation window for
452 * which there is sufficient workspace.
453 *
454  nwmax = min( ( n-1 ) / 3, lwork / 2 )
455 *
456 * NSMAX = the Largest number of simultaneous shifts
457 * for which there is sufficient workspace.
458 *
459  nsmax = min( ( n+6 ) / 9, lwork - lwork/3 )
460  nsmax = nsmax - mod( nsmax, 2 )
461 *
462 * NDFL: an iteration count restarted at deflation.
463 *
464  ndfl = 1
465 *
466 * ITMAX = iteration limit
467 *
468  itmax = max( 30, 2*kexsh )*max( 10, ( ihi-ilo+1 ) )
469 *
470 * Last row and column in the active block.
471 *
472  kbot = ihi
473 *
474 * Main Loop.
475 *
476  DO 110 it = 1, itmax
477  totit = totit + 1
478 *
479 * Done when KBOT falls below ILO.
480 *
481  IF( kbot.LT.ilo )
482  \$ GO TO 120
483 *
484 * Locate active block.
485 *
486  DO 10 k = kbot, ilo + 1, -1
487  CALL infog2l( k, k-1, desch, nprow, npcol, myrow, mycol,
488  \$ ii, jj, hrsrc, hcsrc )
489  IF( myrow.EQ.hrsrc .AND. mycol.EQ.hcsrc ) THEN
490  IF( h( ii + (jj-1)*lldh ).EQ.zero )
491  \$ GO TO 20
492  END IF
493  10 CONTINUE
494  k = ilo
495  20 CONTINUE
496  ktop = k
497  IF( nprocs.GT.1 )
498  \$ CALL igamx2d( ictxt, 'All', '1-Tree', 1, 1, ktop, 1,
499  \$ -1, -1, -1, -1, -1 )
500 *
501 * Select deflation window size.
502 *
503  nh = kbot - ktop + 1
504  IF( nh.LE.ntiny ) THEN
505  nw = nh
506  ELSEIF( ndfl.LT.kexnw .OR. nh.LT.nw ) THEN
507 *
508 * Typical deflation window. If possible and
509 * advisable, nibble the entire active block.
510 * If not, use size NWR or NWR+1 depending upon
511 * which has the smaller corresponding subdiagonal
512 * entry (a heuristic).
513 *
514  nwinc = .true.
515  IF( nh.LE.min( nmin, nwmax ) ) THEN
516  nw = nh
517  ELSE
518  nw = min( nwr, nh, nwmax )
519  IF( nw.LT.nwmax ) THEN
520  IF( nw.GE.nh-1 ) THEN
521  nw = nh
522  ELSE
523  kwtop = kbot - nw + 1
524  CALL pdelget( 'All', '1-Tree', elem1, h, kwtop,
525  \$ kwtop-1, desch )
526  CALL pdelget( 'All', '1-Tree', elem2, h,
527  \$ kwtop-1, kwtop-2, desch )
528  IF( abs( elem1 ).GT.abs( elem2 ) ) nw = nw + 1
529  END IF
530  END IF
531  END IF
532  ELSE
533 *
534 * Exceptional deflation window. If there have
535 * been no deflations in KEXNW or more iterations,
536 * then vary the deflation window size. At first,
537 * because, larger windows are, in general, more
538 * powerful than smaller ones, rapidly increase the
539 * window up to the maximum reasonable and possible.
540 * Then maybe try a slightly smaller window.
541 *
542  IF( nwinc .AND. nw.LT.min( nwmax, nh ) ) THEN
543  nw = min( nwmax, nh, 2*nw )
544  ELSE
545  nwinc = .false.
546  IF( nw.EQ.nh .AND. nh.GT.2 )
547  \$ nw = nh - 1
548  END IF
549  END IF
550 *
551 * Aggressive early deflation:
552 * split workspace into
553 * - an NW-by-NW work array V for orthogonal matrix
554 * - an NW-by-at-least-NW-but-more-is-better
555 * (NW-by-NHO) horizontal work array for Schur factor
556 * - an at-least-NW-but-more-is-better (NVE-by-NW)
557 * vertical work array for matrix multiplications
558 * - align T, V and W with the deflation window
559 *
560  kv = n - nw + 1
561  kt = nw + 1
562  nho = ( n-nw-1 ) - kt + 1
563  kwv = nw + 2
564  nve = ( n-nw ) - kwv + 1
565 *
566  jw = min( nw, kbot-ktop+1 )
567  kwtop = kbot - jw + 1
568  iroffh = mod( kwtop - 1, nb )
569  icoffh = iroffh
570  hrsrc = indxg2p( kwtop, nb, myrow, desch(rsrc_), nprow )
571  hcsrc = indxg2p( kwtop, nb, mycol, desch(csrc_), npcol )
572  vrows = numroc( jw+iroffh, nb, myrow, hrsrc, nprow )
573  vcols = numroc( jw+icoffh, nb, mycol, hcsrc, npcol )
574  CALL descinit( descv, jw+iroffh, jw+icoffh, nb, nb,
575  \$ hrsrc, hcsrc, ictxt, max(1, vrows), info )
576 *
577  trows = numroc( jw+iroffh, nb, myrow, hrsrc, nprow )
578  tcols = numroc( jw+icoffh, nb, mycol, hcsrc, npcol )
579  CALL descinit( desct, jw+iroffh, jw+icoffh, nb, nb,
580  \$ hrsrc, hcsrc, ictxt, max(1, trows), info )
581  wrows = numroc( jw+iroffh, nb, myrow, hrsrc, nprow )
582  wcols = numroc( jw+icoffh, nb, mycol, hcsrc, npcol )
583  CALL descinit( descw, jw+iroffh, jw+icoffh, nb, nb,
584  \$ hrsrc, hcsrc, ictxt, max(1, wrows), info )
585 *
586  ipv = 1
587  ipt = ipv + descv( lld_ ) * vcols
588  ipw = ipt + desct( lld_ ) * tcols
589  ipwrk = ipw + descw( lld_ ) * wcols
590 *
591 * Aggressive early deflation
592 *
593  iwork(1) = it
594  CALL pdlaqr3( wantt, wantz, n, ktop, kbot, nw, h,
595  \$ desch, iloz, ihiz, z, descz, ls, ld, wr, wi,
596  \$ work(ipv), descv, nho, work(ipt), desct, nve,
597  \$ work(ipw), descw, work(ipwrk), lwork-ipwrk+1,
598  \$ iwork, liwork, reclevel )
599 *
600 * Adjust KBOT accounting for new deflations.
601 *
602  kbot = kbot - ld
603 *
604 * KS points to the shifts.
605 *
606  ks = kbot - ls + 1
607 *
608 * Skip an expensive QR sweep if there is a (partly
609 * heuristic) reason to expect that many eigenvalues
610 * will deflate without it. Here, the QR sweep is
611 * skipped if many eigenvalues have just been deflated
612 * or if the remaining active block is small.
613 *
614  IF( ( ld.EQ.0 ) .OR. ( ( 100*ld.LE.nw*nibble ) .AND. ( kbot-
615  \$ ktop+1.GT.min( nmin, nwmax ) ) ) ) THEN
616 *
617 * NS = nominal number of simultaneous shifts.
618 * This may be lowered (slightly) if PDLAQR3
619 * did not provide that many shifts.
620 *
621  ns = min( nsmax, nsr, max( 2, kbot-ktop ) )
622  ns = ns - mod( ns, 2 )
623 *
624 * If there have been no deflations
625 * in a multiple of KEXSH iterations,
626 * then try exceptional shifts.
627 * Otherwise use shifts provided by
628 * PDLAQR3 above or from the eigenvalues
629 * of a trailing principal submatrix.
630 *
631  IF( mod( ndfl, kexsh ).EQ.0 ) THEN
632  ks = kbot - ns + 1
633  DO 30 i = kbot, max( ks+1, ktop+2 ), -2
634  CALL pdelget( 'All', '1-Tree', elem1, h, i, i-1,
635  \$ desch )
636  CALL pdelget( 'All', '1-Tree', elem2, h, i-1, i-2,
637  \$ desch )
638  CALL pdelget( 'All', '1-Tree', elem3, h, i, i,
639  \$ desch )
640  ss = abs( elem1 ) + abs( elem2 )
641  aa = wilk1*ss + elem3
642  bb = ss
643  cc = wilk2*ss
644  dd = aa
645  CALL dlanv2( aa, bb, cc, dd, wr( i-1 ), wi( i-1 ),
646  \$ wr( i ), wi( i ), cs, sn )
647  30 CONTINUE
648  IF( ks.EQ.ktop ) THEN
649  CALL pdelget( 'All', '1-Tree', elem1, h, ks+1,
650  \$ ks+1, desch )
651  wr( ks+1 ) = elem1
652  wi( ks+1 ) = zero
653  wr( ks ) = wr( ks+1 )
654  wi( ks ) = wi( ks+1 )
655  END IF
656  ELSE
657 *
658 * Got NS/2 or fewer shifts? Use PDLAQR0 or
659 * PDLAQR1 on a trailing principal submatrix to
660 * get more.
661 *
662  IF( kbot-ks+1.LE.ns / 2 ) THEN
663  ks = kbot - ns + 1
664  kt = n - ns + 1
665  npmin = pilaenvx( ictxt, 23, 'PDLAQR0', 'EN', ns,
666  \$ nb, nprow, npcol )
667 c
668 c Temporarily force NPMIN <= 8 since only PDLAQR1 is used.
669 c
670  npmin = min(npmin, 8)
671  IF( min(nprow, npcol).LE.npmin+1 .OR.
672  \$ reclevel.GE.1 ) THEN
673 *
674 * The window is large enough. Compute the Schur
675 * decomposition with all processors.
676 *
677  iroffh = mod( ks - 1, nb )
678  icoffh = iroffh
679  IF( ns.GT.nmin ) THEN
680  hrsrc = indxg2p( ks, nb, myrow, desch(rsrc_),
681  \$ nprow )
682  hcsrc = indxg2p( ks, nb, myrow, desch(csrc_),
683  \$ npcol )
684  ELSE
685  hrsrc = 0
686  hcsrc = 0
687  END IF
688  trows = numroc( ns+iroffh, nb, myrow, hrsrc,
689  \$ nprow )
690  tcols = numroc( ns+icoffh, nb, mycol, hcsrc,
691  \$ npcol )
692  CALL descinit( desct, ns+iroffh, ns+icoffh, nb,
693  \$ nb, hrsrc, hcsrc, ictxt, max(1, trows),
694  \$ info )
695  ipt = 1
696  ipwrk = ipt + desct(lld_) * tcols
697 *
698  IF( ns.GT.nmin .AND. recursion ) THEN
699  CALL pdlacpy( 'All', ns, ns, h, ks, ks,
700  \$ desch, work(ipt), 1+iroffh, 1+icoffh,
701  \$ desct )
702  CALL pdlaqr0( .false., .false., iroffh+ns,
703  \$ 1+iroffh, iroffh+ns, work(ipt),
704  \$ desct, wr( ks-iroffh ),
705  \$ wi( ks-iroffh ), 1, 1, zdum,
706  \$ descz, work( ipwrk ),
707  \$ lwork-ipwrk+1, iwork, liwork,
708  \$ inf, reclevel+1 )
709  ELSE
710  CALL pdlamve( 'All', ns, ns, h, ks, ks,
711  \$ desch, work(ipt), 1+iroffh, 1+icoffh,
712  \$ desct, work(ipwrk) )
713  CALL pdlaqr1( .false., .false., iroffh+ns,
714  \$ 1+iroffh, iroffh+ns, work(ipt),
715  \$ desct, wr( ks-iroffh ),
716  \$ wi( ks-iroffh ), 1+iroffh, iroffh+ns,
717  \$ zdum, descz, work( ipwrk ),
718  \$ lwork-ipwrk+1, iwork, liwork, inf )
719  END IF
720  ELSE
721 *
722 * The window is too small. Redistribute the AED
723 * window to a subgrid and do the computation on
724 * the subgrid.
725 *
726  ictxt_new = ictxt
727  DO 50 i = 0, npmin-1
728  DO 40 j = 0, npmin-1
729  pmap( j+1+i*npmin ) =
730  \$ blacs_pnum( ictxt, i, j )
731  40 CONTINUE
732  50 CONTINUE
733  CALL blacs_gridmap( ictxt_new, pmap, npmin,
734  \$ npmin, npmin )
735  CALL blacs_gridinfo( ictxt_new, npmin, npmin,
736  \$ myrow_new, mycol_new )
737  IF( myrow.GE.npmin .OR. mycol.GE.npmin )
738  \$ ictxt_new = -1
739 *
740  IF( ictxt_new.GE.0 ) THEN
741  trows = numroc( ns, nb, myrow_new, 0, npmin )
742  tcols = numroc( ns, nb, mycol_new, 0, npmin )
743  CALL descinit( desct, ns, ns, nb, nb, 0, 0,
744  \$ ictxt_new, max(1,trows), info )
745  ipt = 1
746  ipwrk = ipt + desct(lld_) * tcols
747  ELSE
748  ipt = 1
749  ipwrk = 2
750  desct( ctxt_ ) = -1
751  inf = 0
752  END IF
753  CALL pdgemr2d( ns, ns, h, ks, ks, desch,
754  \$ work(ipt), 1, 1, desct, ictxt )
755 *
756 c
757 c This part is still not perfect.
758 c Either PDLAQR0 or PDLAQR1 can work, but not both.
759 c
760 c NMIN = PILAENVX( ICTXT_NEW, 12, 'PDLAQR0',
761 c \$ 'EN', NS, 1, NS, LWORK )
762  IF( ictxt_new.GE.0 ) THEN
763 c IF( NS.GT.NMIN .AND. RECLEVEL.LT.1 ) THEN
764 c CALL PDLAQR0( .FALSE., .FALSE., NS, 1,
765 c \$ NS, WORK(IPT), DESCT, WR( KS ),
766 c \$ WI( KS ), 1, 1, ZDUM, DESCT,
767 c \$ WORK( IPWRK ), LWORK-IPWRK+1, IWORK,
768 c \$ LIWORK, INF, RECLEVEL+1 )
769 c ELSE
770  CALL pdlaqr1( .false., .false., ns, 1,
771  \$ ns, work(ipt), desct, wr( ks ),
772  \$ wi( ks ), 1, ns, zdum, desct,
773  \$ work( ipwrk ), lwork-ipwrk+1, iwork,
774  \$ liwork, inf )
775 c END IF
776  CALL blacs_gridexit( ictxt_new )
777  END IF
778  IF( myrow+mycol.GT.0 ) THEN
779  DO 60 j = 0, ns-1
780  wr( ks+j ) = zero
781  wi( ks+j ) = zero
782  60 CONTINUE
783  END IF
784  CALL igamn2d( ictxt, 'All', '1-Tree', 1, 1, inf,
785  \$ 1, -1, -1, -1, -1, -1 )
786  CALL dgsum2d( ictxt, 'All', ' ', ns, 1, wr(ks),
787  \$ ns, -1, -1 )
788  CALL dgsum2d( ictxt, 'All', ' ', ns, 1, wi(ks),
789  \$ ns, -1, -1 )
790  END IF
791  ks = ks + inf
792 *
793 * In case of a rare QR failure use
794 * eigenvalues of the trailing 2-by-2
795 * principal submatrix.
796 *
797  IF( ks.GE.kbot ) THEN
798  CALL pdelget( 'All', '1-Tree', aa, h, kbot-1,
799  \$ kbot-1, desch )
800  CALL pdelget( 'All', '1-Tree', cc, h, kbot,
801  \$ kbot-1, desch )
802  CALL pdelget( 'All', '1-Tree', bb, h, kbot-1,
803  \$ kbot, desch )
804  CALL pdelget( 'All', '1-Tree', dd, h, kbot,
805  \$ kbot, desch )
806  CALL dlanv2( aa, bb, cc, dd, wr( kbot-1 ),
807  \$ wi( kbot-1 ), wr( kbot ),
808  \$ wi( kbot ), cs, sn )
809  ks = kbot - 1
810  END IF
811  END IF
812 *
813  IF( kbot-ks+1.GT.ns ) THEN
814 *
815 * Sort the shifts (helps a little)
816 * Bubble sort keeps complex conjugate
817 * pairs together.
818 *
819  sorted = .false.
820  DO 80 k = kbot, ks + 1, -1
821  IF( sorted )
822  \$ GO TO 90
823  sorted = .true.
824  DO 70 i = ks, k - 1
825  IF( abs( wr( i ) )+abs( wi( i ) ).LT.
826  \$ abs( wr( i+1 ) )+abs( wi( i+1 ) ) ) THEN
827  sorted = .false.
828 *
829  swap = wr( i )
830  wr( i ) = wr( i+1 )
831  wr( i+1 ) = swap
832 *
833  swap = wi( i )
834  wi( i ) = wi( i+1 )
835  wi( i+1 ) = swap
836  END IF
837  70 CONTINUE
838  80 CONTINUE
839  90 CONTINUE
840  END IF
841 *
842 * Shuffle shifts into pairs of real shifts
843 * and pairs of complex conjugate shifts
844 * assuming complex conjugate shifts are
846 * they are.)
847 *
848  DO 100 i = kbot, ks + 2, -2
849  IF( wi( i ).NE.-wi( i-1 ) ) THEN
850 *
851  swap = wr( i )
852  wr( i ) = wr( i-1 )
853  wr( i-1 ) = wr( i-2 )
854  wr( i-2 ) = swap
855 *
856  swap = wi( i )
857  wi( i ) = wi( i-1 )
858  wi( i-1 ) = wi( i-2 )
859  wi( i-2 ) = swap
860  END IF
861  100 CONTINUE
862  END IF
863 *
864 * If there are only two shifts and both are
865 * real, then use only one.
866 *
867  IF( kbot-ks+1.EQ.2 ) THEN
868  IF( wi( kbot ).EQ.zero ) THEN
869  CALL pdelget( 'All', '1-Tree', elem, h, kbot,
870  \$ kbot, desch )
871  IF( abs( wr( kbot )-elem ).LT.
872  \$ abs( wr( kbot-1 )-elem ) ) THEN
873  wr( kbot-1 ) = wr( kbot )
874  ELSE
875  wr( kbot ) = wr( kbot-1 )
876  END IF
877  END IF
878  END IF
879 *
880 * Use up to NS of the the smallest magnatiude
881 * shifts. If there aren't NS shifts available,
882 * then use them all, possibly dropping one to
883 * make the number of shifts even.
884 *
885  ns = min( ns, kbot-ks+1 )
886  ns = ns - mod( ns, 2 )
887  ks = kbot - ns + 1
888 *
889 * Small-bulge multi-shift QR sweep.
890 *
891  totns = totns + ns
892  sweep = sweep + 1
893  CALL pdlaqr5( wantt, wantz, kacc22, n, ktop, kbot,
894  \$ ns, wr( ks ), wi( ks ), h, desch, iloz, ihiz, z,
895  \$ descz, work, lwork, iwork, liwork )
896  END IF
897 *
898 * Note progress (or the lack of it).
899 *
900  IF( ld.GT.0 ) THEN
901  ndfl = 1
902  ELSE
903  ndfl = ndfl + 1
904  END IF
905 *
906 * End of main loop.
907  110 CONTINUE
908 *
909 * Iteration limit exceeded. Set INFO to show where
910 * the problem occurred and exit.
911 *
912  info = kbot
913  120 CONTINUE
914  END IF
915 *
916 * Return the optimal value of LWORK.
917 *
918  work( 1 ) = dble( lwkopt )
919  iwork( 1 ) = liwkopt
920  IF( .NOT. lquery ) THEN
921  iwork( 1 ) = totit
922  iwork( 2 ) = sweep
923  iwork( 3 ) = totns
924  END IF
925  RETURN
926 *
927 * End of PDLAQR0
928 *
929  END
indxg2p
integer function indxg2p(INDXGLOB, NB, IPROC, ISRCPROC, NPROCS)
Definition: indxg2p.f:2
max
#define max(A, B)
Definition: pcgemr.c:180
infog2l
subroutine infog2l(GRINDX, GCINDX, DESC, NPROW, NPCOL, MYROW, MYCOL, LRINDX, LCINDX, RSRC, CSRC)
Definition: infog2l.f:3
pilaenvx
integer function pilaenvx(ICTXT, ISPEC, NAME, OPTS, N1, N2, N3, N4)
Definition: pilaenvx.f:3
pdelget
subroutine pdelget(SCOPE, TOP, ALPHA, A, IA, JA, DESCA)
Definition: pdelget.f:2
pdlaqr1
recursive subroutine pdlaqr1(WANTT, WANTZ, N, ILO, IHI, A, DESCA, WR, WI, ILOZ, IHIZ, Z, DESCZ, WORK, LWORK, IWORK, ILWORK, INFO)
Definition: pdlaqr1.f:5
descinit
subroutine descinit(DESC, M, N, MB, NB, IRSRC, ICSRC, ICTXT, LLD, INFO)
Definition: descinit.f:3
pdlaqr3
recursive subroutine pdlaqr3(WANTT, WANTZ, N, KTOP, KBOT, NW, H, DESCH, ILOZ, IHIZ, Z, DESCZ, NS, ND, SR, SI, V, DESCV, NH, T, DESCT, NV, WV, DESCW, WORK, LWORK, IWORK, LIWORK, RECLEVEL)
Definition: pdlaqr3.f:6
pdlaqr0
recursive subroutine pdlaqr0(WANTT, WANTZ, N, ILO, IHI, H, DESCH, WR, WI, ILOZ, IHIZ, Z, DESCZ, WORK, LWORK, IWORK, LIWORK, INFO, RECLEVEL)
Definition: pdlaqr0.f:4
numroc
integer function numroc(N, NB, IPROC, ISRCPROC, NPROCS)
Definition: numroc.f:2
pdlamve
subroutine pdlamve(UPLO, M, N, A, IA, JA, DESCA, B, IB, JB, DESCB, DWORK)
Definition: pdlamve.f:3
pdlacpy
subroutine pdlacpy(UPLO, M, N, A, IA, JA, DESCA, B, IB, JB, DESCB)
Definition: pdlacpy.f:3
pdlaqr5
subroutine pdlaqr5(WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, SR, SI, H, DESCH, ILOZ, IHIZ, Z, DESCZ, WORK, LWORK, IWORK, LIWORK)
Definition: pdlaqr5.f:4
min
#define min(A, B)
Definition: pcgemr.c:181
iceil
integer function iceil(INUM, IDENOM)
Definition: iceil.f:2