SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
All Classes Files Functions Variables Typedefs Macros
pdlaqr5.f
Go to the documentation of this file.
1 SUBROUTINE pdlaqr5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
2 $ SR, SI, H, DESCH, ILOZ, IHIZ, Z, DESCZ, WORK,
3 $ LWORK, IWORK, LIWORK )
4*
5* Contribution from the Department of Computing Science and HPC2N,
6* Umea University, Sweden
7*
8* -- ScaLAPACK routine (version 2.0.2) --
9* Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
10* May 1 2012
11*
12 IMPLICIT NONE
13*
14* .. Scalar Arguments ..
15 INTEGER IHIZ, ILOZ, KACC22, KBOT, KTOP, N, NSHFTS,
16 $ LWORK, LIWORK
17 LOGICAL WANTT, WANTZ
18* ..
19* .. Array Arguments ..
20 INTEGER DESCH( * ), DESCZ( * ), IWORK( * )
21 DOUBLE PRECISION H( * ), SI( * ), SR( * ), Z( * ), WORK( * )
22* ..
23*
24* Purpose
25* =======
26*
27* This auxiliary subroutine called by PDLAQR0 performs a
28* single small-bulge multi-shift QR sweep by chasing separated
29* groups of bulges along the main block diagonal of H.
30*
31* WANTT (global input) logical scalar
32* WANTT = .TRUE. if the quasi-triangular Schur factor
33* is being computed. WANTT is set to .FALSE. otherwise.
34*
35* WANTZ (global input) logical scalar
36* WANTZ = .TRUE. if the orthogonal Schur factor is being
37* computed. WANTZ is set to .FALSE. otherwise.
38*
39* KACC22 (global input) integer with value 0, 1, or 2.
40* Specifies the computation mode of far-from-diagonal
41* orthogonal updates.
42* = 1: PDLAQR5 accumulates reflections and uses matrix-matrix
43* multiply to update the far-from-diagonal matrix entries.
44* = 2: PDLAQR5 accumulates reflections, uses matrix-matrix
45* multiply to update the far-from-diagonal matrix entries,
46* and takes advantage of 2-by-2 block structure during
47* matrix multiplies.
48*
49* N (global input) integer scalar
50* N is the order of the Hessenberg matrix H upon which this
51* subroutine operates.
52*
53* KTOP (global input) integer scalar
54* KBOT (global input) integer scalar
55* These are the first and last rows and columns of an
56* isolated diagonal block upon which the QR sweep is to be
57* applied. It is assumed without a check that
58* either KTOP = 1 or H(KTOP,KTOP-1) = 0
59* and
60* either KBOT = N or H(KBOT+1,KBOT) = 0.
61*
62* NSHFTS (global input) integer scalar
63* NSHFTS gives the number of simultaneous shifts. NSHFTS
64* must be positive and even.
65*
66* SR (global input) DOUBLE PRECISION array of size (NSHFTS)
67* SI (global input) DOUBLE PRECISION array of size (NSHFTS)
68* SR contains the real parts and SI contains the imaginary
69* parts of the NSHFTS shifts of origin that define the
70* multi-shift QR sweep.
71*
72* H (local input/output) DOUBLE PRECISION array of size
73* (DESCH(LLD_),*)
74* On input H contains a Hessenberg matrix. On output a
75* multi-shift QR sweep with shifts SR(J)+i*SI(J) is applied
76* to the isolated diagonal block in rows and columns KTOP
77* through KBOT.
78*
79* DESCH (global and local input) INTEGER array of dimension DLEN_.
80* The array descriptor for the distributed matrix H.
81*
82* ILOZ (global input) INTEGER
83* IHIZ (global input) INTEGER
84* Specify the rows of Z to which transformations must be
85* applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N
86*
87* Z (local input/output) DOUBLE PRECISION array of size
88* (DESCZ(LLD_),*)
89* If WANTZ = .TRUE., then the QR Sweep orthogonal
90* similarity transformation is accumulated into
91* Z(ILOZ:IHIZ,ILO:IHI) from the right.
92* If WANTZ = .FALSE., then Z is unreferenced.
93*
94* DESCZ (global and local input) INTEGER array of dimension DLEN_.
95* The array descriptor for the distributed matrix Z.
96*
97* WORK (local workspace) DOUBLE PRECISION array, dimension(DWORK)
98*
99* LWORK (local input) INTEGER
100* The length of the workspace array WORK.
101*
102* IWORK (local workspace) INTEGER array, dimension (LIWORK)
103*
104* LIWORK (local input) INTEGER
105* The length of the workspace array IWORK.
106*
107* ================================================================
108* Based on contributions by
109* Robert Granat, Department of Computing Science and HPC2N,
110* University of Umea, Sweden.
111*
112* ============================================================
113* References:
114* K. Braman, R. Byers, and R. Mathias,
115* The Multi-Shift QR Algorithm Part I: Maintaining Well Focused
116* Shifts, and Level 3 Performance.
117* SIAM J. Matrix Anal. Appl., 23(4):929--947, 2002.
118*
119* R. Granat, B. Kagstrom, and D. Kressner,
120* A Novel Parallel QR Algorithm for Hybrid Distributed Momory HPC
121* Systems.
122* SIAM J. Sci. Comput., 32(4):2345--2378, 2010.
123*
124* ============================================================
125* .. Parameters ..
126 INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
127 $ LLD_, MB_, M_, NB_, N_, RSRC_
128 parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
129 $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
130 $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
131 DOUBLE PRECISION ZERO, ONE
132 PARAMETER ( ZERO = 0.0d0, one = 1.0d0 )
133 INTEGER NTINY
134 parameter( ntiny = 11 )
135* ..
136* .. Local Scalars ..
137 DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM,
138 $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2,
139 $ ulp, tau, elem, stamp, ddum, orth
140 INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN,
141 $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS,
142 $ m, m22, mbot, mend, mstart, mtop, nbmps, ndcol,
143 $ ns, nu, lldh, lldz, lldu, lldv, lldw, lldwh,
144 $ info, ictxt, nprow, npcol, nb, iroffh, itop,
145 $ nwin, myrow, mycol, lns, numwin, lkacc22,
146 $ lchain, win, idonejob, ipnext, anmwin, lenrbuf,
147 $ lencbuf, ichoff, lrsrc, lcsrc, lktop, lkbot,
148 $ ii, jj, swin, ewin, lnwin, dim, llktop, llkbot,
149 $ ipv, ipu, iph, ipw, ku, kwh, kwv, nve, lks,
150 $ idum, nho, dir, winid, indx, iloc, jloc, rsrc1,
151 $ csrc1, rsrc2, csrc2, rsrc3, csrc3, rsrc4, ipuu,
152 $ csrc4, lrows, lcols, indxs, ks, jloc1, iloc1,
153 $ lktop1, lktop2, wchunk, numchunk, oddeven,
154 $ chunknum, dim1, dim4, ipw3, hrows, zrows,
155 $ hcols, ipw1, ipw2, rsrc, east, jloc4, iloc4,
156 $ west, csrc, south, norht, indxe, north,
157 $ ihh, ipiw, lkbot1, nprocs, liroffh,
158 $ winfin, rws3, cls3, indx2, hrows2,
159 $ zrows2, hcols2, mnrbuf,
160 $ mxrbuf, mncbuf, mxcbuf, lwkopt
161 LOGICAL BLK22, BMP22, INTRO, DONEJOB, ODDNPROW,
162 $ ODDNPCOL, LQUERY, BCDONE
163 CHARACTER JBCMPZ*2, JOB
164* ..
165* .. External Functions ..
166 LOGICAL LSAME
167 INTEGER PILAENVX, ICEIL, INDXG2P, INDXG2L, NUMROC
168 DOUBLE PRECISION DLAMCH, DLANGE
169 EXTERNAL dlamch, pilaenvx, iceil, indxg2p, indxg2l,
170 $ numroc, lsame, dlange
171* ..
172* .. Intrinsic Functions ..
173 INTRINSIC abs, dble, max, min, mod
174* ..
175* .. Local Arrays ..
176 DOUBLE PRECISION VT( 3 )
177* ..
178* .. External Subroutines ..
179 EXTERNAL dgemm, dlabad, dlamov, dlaqr1, dlarfg, dlaset,
180 $ dtrmm, dlaqr6
181* ..
182* .. Executable Statements ..
183*
184 info = 0
185 ictxt = desch( ctxt_ )
186 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
187 nprocs = nprow*npcol
188 lldh = desch( lld_ )
189 lldz = descz( lld_ )
190 nb = desch( mb_ )
191 iroffh = mod( ktop - 1, nb )
192 lquery = lwork.EQ.-1 .OR. liwork.EQ.-1
193*
194* If there are no shifts, then there is nothing to do.
195*
196 IF( .NOT. lquery .AND. nshfts.LT.2 )
197 $ RETURN
198*
199* If the active block is empty or 1-by-1, then there
200* is nothing to do.
201*
202 IF( .NOT. lquery .AND. ktop.GE.kbot )
203 $ RETURN
204*
205* Shuffle shifts into pairs of real shifts and pairs of
206* complex conjugate shifts assuming complex conjugate
207* shifts are already adjacent to one another.
208*
209 IF( .NOT. lquery ) THEN
210 DO 10 i = 1, nshfts - 2, 2
211 IF( si( i ).NE.-si( i+1 ) ) THEN
212*
213 swap = sr( i )
214 sr( i ) = sr( i+1 )
215 sr( i+1 ) = sr( i+2 )
216 sr( i+2 ) = swap
217*
218 swap = si( i )
219 si( i ) = si( i+1 )
220 si( i+1 ) = si( i+2 )
221 si( i+2 ) = swap
222 END IF
223 10 CONTINUE
224 END IF
225*
226* NSHFTS is supposed to be even, but if is odd,
227* then simply reduce it by one. The shuffle above
228* ensures that the dropped shift is real and that
229* the remaining shifts are paired.
230*
231 ns = nshfts - mod( nshfts, 2 )
232*
233* Extract the size of the computational window.
234*
235 nwin = pilaenvx( ictxt, 19, 'PDLAQR5', jbcmpz, n, nb, nb, nb )
236 nwin = min( nwin, kbot-ktop+1 )
237*
238* Adjust number of simultaneous shifts if it exceeds the limit
239* set by the number of diagonal blocks in the active submatrix
240* H(KTOP:KBOT,KTOP:KBOT).
241*
242 ns = max( 2, min( ns, iceil( kbot-ktop+1, nb )*nwin/3 ) )
243 ns = ns - mod( ns, 2 )
244
245*
246* Decide the number of simultaneous computational windows
247* from the number of shifts - each window should contain up to
248* (NWIN / 3) shifts. Also compute the number of shifts per
249* window and make sure that number is even.
250*
251 lns = min( max( 2, nwin / 3 ), max( 2, ns / min(nprow,npcol) ) )
252 lns = lns - mod( lns, 2 )
253 numwin = max( 1, min( iceil( ns, lns ),
254 $ iceil( kbot-ktop+1, nb ) - 1 ) )
255 IF( nprow.NE.npcol ) THEN
256 numwin = min( numwin, min(nprow,npcol) )
257 lns = min( lns, max( 2, ns / min(nprow,npcol) ) )
258 lns = lns - mod( lns, 2 )
259 END IF
260*
261* Machine constants for deflation.
262*
263 safmin = dlamch( 'SAFE MINIMUM' )
264 safmax = one / safmin
265 CALL dlabad( safmin, safmax )
266 ulp = dlamch( 'PRECISION' )
267 smlnum = safmin*( dble( n ) / ulp )
268*
269* Use accumulated reflections to update far-from-diagonal
270* entries on a local level?
271*
272 IF( lns.LT.14 ) THEN
273 lkacc22 = 1
274 ELSE
275 lkacc22 = 2
276 END IF
277*
278* If so, exploit the 2-by-2 block structure?
279* ( Usually it is not efficient to exploit the 2-by-2 structure
280* because the block size is too small. )
281*
282 blk22 = ( lns.GT.2 ) .AND. ( kacc22.EQ.2 )
283*
284* Clear trash.
285*
286 IF( .NOT. lquery .AND. ktop+2.LE.kbot )
287 $ CALL pdelset( h, ktop+2, ktop, desch, zero )
288*
289* NBMPS = number of 2-shift bulges in each chain
290*
291 nbmps = lns / 2
292*
293* KDU = width of slab
294*
295 kdu = 6*nbmps - 3
296*
297* LCHAIN = length of each chain
298*
299 lchain = 3 * nbmps + 1
300*
301* Check if workspace query.
302*
303 IF( lquery ) THEN
304 hrows = numroc( n, nb, myrow, desch(rsrc_), nprow )
305 hcols = numroc( n, nb, mycol, desch(csrc_), npcol )
306 lwkopt = (5+2*numwin)*nb**2 + 2*hrows*nb + hcols*nb +
307 $ max( hrows*nb, hcols*nb )
308 work(1) = dble(lwkopt)
309 iwork(1) = 5*numwin
310 RETURN
311 END IF
312*
313* Check if KTOP and KBOT are valid.
314*
315 IF( ktop.LT.1 .OR. kbot.GT.n ) RETURN
316*
317* Create and chase NUMWIN chains of NBMPS bulges.
318*
319* Set up window introduction.
320*
321 anmwin = 0
322 intro = .true.
323 ipiw = 1
324*
325* Main loop:
326* While-loop over the computational windows which is
327* terminated when all windows have been introduced,
328* chased down to the bottom of the considered submatrix
329* and chased off.
330*
331 20 CONTINUE
332*
333* Set up next window as long as we have less than the prescribed
334* number of windows. Each window is described an integer quadruple:
335* 1. Local value of KTOP (below denoted by LKTOP)
336* 2. Local value of KBOT (below denoted by LKBOT)
337* 3-4. Processor indices (LRSRC,LCSRC) associated with the window.
338* (5. Mark that decides if a window is fully processed or not)
339*
340* Notice - the next window is only introduced if the first block
341* in the active submatrix does not contain any other windows.
342*
343 IF( anmwin.GT.0 ) THEN
344 lktop = iwork( 1+(anmwin-1)*5 )
345 ELSE
346 lktop = ktop
347 END IF
348 IF( intro .AND. (anmwin.EQ.0 .OR. lktop.GT.iceil(ktop,nb)*nb) )
349 $ THEN
350 anmwin = anmwin + 1
351*
352* Structure of IWORK:
353* IWORK( 1+(WIN-1)*5 ): start position
354* IWORK( 2+(WIN-1)*5 ): stop position
355* IWORK( 3+(WIN-1)*5 ): processor row id
356* IWORK( 4+(WIN-1)*5 ): processor col id
357* IWORK( 5+(WIN-1)*5 ): window status (0, 1, or 2)
358*
359 iwork( 1+(anmwin-1)*5 ) = ktop
360 iwork( 2+(anmwin-1)*5 ) = ktop +
361 $ min( nwin,nb-iroffh,kbot-ktop+1 ) - 1
362 iwork( 3+(anmwin-1)*5 ) = indxg2p( iwork(1+(anmwin-1)*5), nb,
363 $ myrow, desch(rsrc_), nprow )
364 iwork( 4+(anmwin-1)*5 ) = indxg2p( iwork(2+(anmwin-1)*5), nb,
365 $ mycol, desch(csrc_), npcol )
366 iwork( 5+(anmwin-1)*5 ) = 0
367 ipiw = 6+(anmwin-1)*5
368 IF( anmwin.EQ.numwin ) intro = .false.
369 END IF
370*
371* Do-loop over the number of windows.
372*
373 ipnext = 1
374 donejob = .false.
375 idonejob = 0
376 lenrbuf = 0
377 lencbuf = 0
378 ichoff = 0
379 DO 40 win = 1, anmwin
380*
381* Extract window information to simplify the rest.
382*
383 lrsrc = iwork( 3+(win-1)*5 )
384 lcsrc = iwork( 4+(win-1)*5 )
385 lktop = iwork( 1+(win-1)*5 )
386 lkbot = iwork( 2+(win-1)*5 )
387 lnwin = lkbot - lktop + 1
388*
389* Check if anything to do for current window, i.e., if the local
390* chain of bulges has reached the next block border etc.
391*
392 IF( iwork(5+(win-1)*5).LT.2 .AND. lnwin.GT.1 .AND.
393 $ (lnwin.GT.lchain .OR. lkbot.EQ.kbot ) ) THEN
394 liroffh = mod(lktop-1,nb)
395 swin = lktop-liroffh
396 ewin = min(kbot,lktop-liroffh+nb-1)
397 dim = ewin-swin+1
398 IF( dim.LE.ntiny .AND. .NOT.lkbot.EQ.kbot ) THEN
399 iwork( 5+(win-1)*5 ) = 2
400 GO TO 45
401 END IF
402 idonejob = 1
403 IF( iwork(5+(win-1)*5).EQ.0 ) THEN
404 iwork(5+(win-1)*5) = 1
405 END IF
406*
407* Let the process that owns the corresponding window do the
408* local bulge chase.
409*
410 IF( myrow.EQ.lrsrc .AND. mycol.EQ.lcsrc ) THEN
411*
412* Set the kind of job to do in DLAQR6:
413* 1. JOB = 'I': Introduce and chase bulges in window WIN
414* 2. JOB = 'C': Chase bulges from top to bottom of window WIN
415* 3. JOB = 'O': Chase bulges off window WIN
416* 4. JOB = 'A': All of 1-3 above is done - this will for
417* example happen for very small active
418* submatrices (like 2-by-2)
419*
420 llkbot = llktop + lnwin - 1
421 IF( lktop.EQ.ktop .AND. lkbot.EQ.kbot ) THEN
422 job = 'All steps'
423 ichoff = 1
424 ELSEIF( lktop.EQ.ktop ) THEN
425 job = 'Introduce and chase'
426 ELSEIF( lkbot.EQ.kbot ) THEN
427 job = 'Off-chase bulges'
428 ichoff = 1
429 ELSE
430 job = 'Chase bulges'
431 END IF
432*
433* Copy submatrix of H corresponding to window WIN into
434* workspace and set out additional workspace for storing
435* orthogonal transformations. This submatrix must be at
436* least (NTINY+1)-by-(NTINY+1) to fit into DLAQR6 - if not,
437* abort and go for cross border bulge chasing with this
438* particular window.
439*
440 ii = indxg2l( swin, nb, myrow, desch(rsrc_), nprow )
441 jj = indxg2l( swin, nb, mycol, desch(csrc_), npcol )
442 llktop = 1 + liroffh
443 llkbot = llktop + lnwin - 1
444*
445 ipu = ipnext
446 iph = ipu + lnwin**2
447 ipuu = iph + max(ntiny+1,dim)**2
448 ipv = ipuu + max(ntiny+1,dim)**2
449 ipnext = iph
450*
451 IF( lsame( job, 'A' ) .OR. lsame( job, 'O' ) .AND.
452 $ dim.LT.ntiny+1 ) THEN
453 CALL dlaset( 'All', ntiny+1, ntiny+1, zero, one,
454 $ work(iph), ntiny+1 )
455 END IF
456 CALL dlamov( 'Upper', dim, dim, h(ii+(jj-1)*lldh), lldh,
457 $ work(iph), max(ntiny+1,dim) )
458 CALL dcopy( dim-1, h(ii+(jj-1)*lldh+1), lldh+1,
459 $ work(iph+1), max(ntiny+1,dim)+1 )
460 IF( lsame( job, 'C' ) .OR. lsame( job, 'O') ) THEN
461 CALL dcopy( dim-2, h(ii+(jj-1)*lldh+2), lldh+1,
462 $ work(iph+2), max(ntiny+1,dim)+1 )
463 CALL dcopy( dim-3, h(ii+(jj-1)*lldh+3), lldh+1,
464 $ work(iph+3), max(ntiny+1,dim)+1 )
465 CALL dlaset( 'Lower', dim-4, dim-4, zero,
466 $ zero, work(iph+4), max(ntiny+1,dim) )
467 ELSE
468 CALL dlaset( 'Lower', dim-2, dim-2, zero,
469 $ zero, work(iph+2), max(ntiny+1,dim) )
470 END IF
471*
472 ku = max(ntiny+1,dim) - kdu + 1
473 kwh = kdu + 1
474 nho = ( max(ntiny+1,dim)-kdu+1-4 ) - ( kdu+1 ) + 1
475 kwv = kdu + 4
476 nve = max(ntiny+1,dim) - kdu - kwv + 1
477 CALL dlaset( 'All', max(ntiny+1,dim),
478 $ max(ntiny+1,dim), zero, one, work(ipuu),
479 $ max(ntiny+1,dim) )
480*
481* Small-bulge multi-shift QR sweep.
482*
483 lks = max( 1, ns - win*lns + 1 )
484 CALL dlaqr6( job, wantt, .true., lkacc22,
485 $ max(ntiny+1,dim), llktop, llkbot, lns, sr( lks ),
486 $ si( lks ), work(iph), max(ntiny+1,dim), llktop,
487 $ llkbot, work(ipuu), max(ntiny+1,dim), work(ipu),
488 $ 3, work( iph+ku-1 ),
489 $ max(ntiny+1,dim), nve, work( iph+kwv-1 ),
490 $ max(ntiny+1,dim), nho, work( iph-1+ku+(kwh-1)*
491 $ max(ntiny+1,dim) ), max(ntiny+1,dim) )
492*
493* Copy submatrix of H back.
494*
495 CALL dlamov( 'Upper', dim, dim, work(iph),
496 $ max(ntiny+1,dim), h(ii+(jj-1)*lldh), lldh )
497 CALL dcopy( dim-1, work(iph+1), max(ntiny+1,dim)+1,
498 $ h(ii+(jj-1)*lldh+1), lldh+1 )
499 IF( lsame( job, 'I' ) .OR. lsame( job, 'C' ) ) THEN
500 CALL dcopy( dim-2, work(iph+2), dim+1,
501 $ h(ii+(jj-1)*lldh+2), lldh+1 )
502 CALL dcopy( dim-3, work(iph+3), dim+1,
503 $ h(ii+(jj-1)*lldh+3), lldh+1 )
504 ELSE
505 CALL dlaset( 'Lower', dim-2, dim-2, zero,
506 $ zero, h(ii+(jj-1)*lldh+2), lldh )
507 END IF
508*
509* Copy actual submatrix of U to the correct place
510* of the buffer.
511*
512 CALL dlamov( 'All', lnwin, lnwin,
513 $ work(ipuu+(max(ntiny+1,dim)*liroffh)+liroffh),
514 $ max(ntiny+1,dim), work(ipu), lnwin )
515 END IF
516*
517* In case the local submatrix was smaller than
518* (NTINY+1)-by-(NTINY+1) we go here and proceed.
519*
520 45 CONTINUE
521 ELSE
522 iwork( 5+(win-1)*5 ) = 2
523 END IF
524*
525* Increment counter for buffers of orthogonal transformations.
526*
527 IF( myrow.EQ.lrsrc .OR. mycol.EQ.lcsrc ) THEN
528 IF( idonejob.EQ.1 .AND. iwork(5+(win-1)*5).LT.2 ) THEN
529 IF( myrow.EQ.lrsrc ) lenrbuf = lenrbuf + lnwin*lnwin
530 IF( mycol.EQ.lcsrc ) lencbuf = lencbuf + lnwin*lnwin
531 END IF
532 END IF
533 40 CONTINUE
534*
535* Did some work in the above do-loop?
536*
537 CALL igsum2d( ictxt, 'All', '1-Tree', 1, 1, idonejob, 1, -1, -1 )
538 donejob = idonejob.GT.0
539*
540* Chased off bulges from first window?
541*
542 IF( nprocs.GT.1 )
543 $ CALL igamx2d( ictxt, 'All', '1-Tree', 1, 1, ichoff, 1, -1,
544 $ -1, -1, -1, -1 )
545*
546* If work was done in the do-loop over local windows, perform
547* updates, otherwise go for cross border bulge chasing and updates.
548*
549 IF( donejob ) THEN
550*
551* Broadcast orthogonal transformations.
552*
553 49 CONTINUE
554 IF( lenrbuf.GT.0 .OR. lencbuf.GT.0 ) THEN
555 DO 50 dir = 1, 2
556 bcdone = .false.
557 DO 60 win = 1, anmwin
558 IF( ( lenrbuf.EQ.0 .AND. lencbuf.EQ.0 ) .OR.
559 $ bcdone ) GO TO 62
560 lrsrc = iwork( 3+(win-1)*5 )
561 lcsrc = iwork( 4+(win-1)*5 )
562 IF( myrow.EQ.lrsrc .AND. mycol.EQ.lcsrc ) THEN
563 IF( dir.EQ.1 .AND. lenrbuf.GT.0 .AND.
564 $ npcol.GT.1 ) THEN
565 CALL dgebs2d( ictxt, 'Row', '1-Tree', lenrbuf,
566 $ 1, work, lenrbuf )
567 ELSEIF( dir.EQ.2 .AND. lencbuf.GT.0 .AND.
568 $ nprow.GT.1 ) THEN
569 CALL dgebs2d( ictxt, 'Col', '1-Tree', lencbuf,
570 $ 1, work, lencbuf )
571 END IF
572 IF( lenrbuf.GT.0 )
573 $ CALL dlamov( 'All', lenrbuf, 1, work, lenrbuf,
574 $ work(1+lenrbuf), lencbuf )
575 bcdone = .true.
576 ELSEIF( myrow.EQ.lrsrc .AND. dir.EQ.1 ) THEN
577 IF( lenrbuf.GT.0 .AND. npcol.GT.1 ) THEN
578 CALL dgebr2d( ictxt, 'Row', '1-Tree', lenrbuf,
579 $ 1, work, lenrbuf, lrsrc, lcsrc )
580 bcdone = .true.
581 END IF
582 ELSEIF( mycol.EQ.lcsrc .AND. dir.EQ.2 ) THEN
583 IF( lencbuf.GT.0 .AND. nprow.GT.1 ) THEN
584 CALL dgebr2d( ictxt, 'Col', '1-Tree', lencbuf,
585 $ 1, work(1+lenrbuf), lencbuf, lrsrc, lcsrc )
586 bcdone = .true.
587 END IF
588 END IF
589 62 CONTINUE
590 60 CONTINUE
591 50 CONTINUE
592 END IF
593*
594* Compute updates - make sure to skip windows that was skipped
595* regarding local bulge chasing.
596*
597 DO 65 dir = 1, 2
598 winid = 0
599 IF( dir.EQ.1 ) THEN
600 ipnext = 1
601 ELSE
602 ipnext = 1 + lenrbuf
603 END IF
604 DO 70 win = 1, anmwin
605 IF( iwork( 5+(win-1)*5 ).EQ.2 ) GO TO 75
606 lrsrc = iwork( 3+(win-1)*5 )
607 lcsrc = iwork( 4+(win-1)*5 )
608 lktop = iwork( 1+(win-1)*5 )
609 lkbot = iwork( 2+(win-1)*5 )
610 lnwin = lkbot - lktop + 1
611 IF( (myrow.EQ.lrsrc.AND.lenrbuf.GT.0.AND.dir.EQ.1) .OR.
612 $ (mycol.EQ.lcsrc.AND.lencbuf.GT.0.AND.dir.EQ.2 ) )
613 $ THEN
614*
615* Set up workspaces.
616*
617 ipu = ipnext
618 ipnext = ipu + lnwin*lnwin
619 ipw = 1 + lenrbuf + lencbuf
620 liroffh = mod(lktop-1,nb)
621 winid = winid + 1
622*
623* Recompute JOB to see if block structure of U could
624* possibly be exploited or not.
625*
626 IF( lktop.EQ.ktop .AND. lkbot.EQ.kbot ) THEN
627 job = 'All steps'
628 ELSEIF( lktop.EQ.ktop ) THEN
629 job = 'Introduce and chase'
630 ELSEIF( lkbot.EQ.kbot ) THEN
631 job = 'Off-chase bulges'
632 ELSE
633 job = 'Chase bulges'
634 END IF
635 END IF
636*
637* Use U to update far-from-diagonal entries in H.
638* If required, use U to update Z as well.
639*
640 IF( .NOT. blk22 .OR. .NOT. lsame(job,'C')
641 $ .OR. lns.LE.2 ) THEN
642*
643 IF( dir.EQ.2 .AND. lencbuf.GT.0 .AND.
644 $ mycol.EQ.lcsrc ) THEN
645 IF( wantt ) THEN
646 DO 80 indx = 1, lktop-liroffh-1, nb
647 CALL infog2l( indx, lktop, desch, nprow,
648 $ npcol, myrow, mycol, iloc, jloc, rsrc1,
649 $ csrc1 )
650 IF( myrow.EQ.rsrc1.AND.mycol.EQ.csrc1 ) THEN
651 lrows = min( nb, lktop-indx )
652 CALL dgemm('No transpose', 'No transpose',
653 $ lrows, lnwin, lnwin, one,
654 $ h((jloc-1)*lldh+iloc), lldh,
655 $ work( ipu ), lnwin, zero,
656 $ work(ipw),
657 $ lrows )
658 CALL dlamov( 'All', lrows, lnwin,
659 $ work(ipw), lrows,
660 $ h((jloc-1)*lldh+iloc), lldh )
661 END IF
662 80 CONTINUE
663 END IF
664 IF( wantz ) THEN
665 DO 90 indx = 1, n, nb
666 CALL infog2l( indx, lktop, descz, nprow,
667 $ npcol, myrow, mycol, iloc, jloc, rsrc1,
668 $ csrc1 )
669 IF( myrow.EQ.rsrc1.AND.mycol.EQ.csrc1 ) THEN
670 lrows = min(nb,n-indx+1)
671 CALL dgemm( 'No transpose',
672 $ 'No transpose', lrows, lnwin, lnwin,
673 $ one, z((jloc-1)*lldz+iloc), lldz,
674 $ work( ipu ), lnwin, zero,
675 $ work(ipw), lrows )
676 CALL dlamov( 'All', lrows, lnwin,
677 $ work(ipw), lrows,
678 $ z((jloc-1)*lldz+iloc), lldz )
679 END IF
680 90 CONTINUE
681 END IF
682 END IF
683*
684* Update the rows of H affected by the bulge-chase.
685*
686 IF( dir.EQ.1 .AND. lenrbuf.GT.0 .AND.
687 $ myrow.EQ.lrsrc ) THEN
688 IF( wantt ) THEN
689 IF( iceil(lkbot,nb).EQ.iceil(kbot,nb) ) THEN
690 lcols = min(iceil(kbot,nb)*nb,n) - kbot
691 ELSE
692 lcols = 0
693 END IF
694 IF( lcols.GT.0 ) THEN
695 indx = kbot + 1
696 CALL infog2l( lktop, indx, desch, nprow,
697 $ npcol, myrow, mycol, iloc, jloc,
698 $ rsrc1, csrc1 )
699 IF( myrow.EQ.rsrc1.AND.mycol.EQ.csrc1 ) THEN
700 CALL dgemm( 'Transpose', 'No Transpose',
701 $ lnwin, lcols, lnwin, one, work(ipu),
702 $ lnwin, h((jloc-1)*lldh+iloc), lldh,
703 $ zero, work(ipw), lnwin )
704 CALL dlamov( 'All', lnwin, lcols,
705 $ work(ipw), lnwin,
706 $ h((jloc-1)*lldh+iloc), lldh )
707 END IF
708 END IF
709 93 CONTINUE
710 indxs = iceil(lkbot,nb)*nb + 1
711 DO 95 indx = indxs, n, nb
712 CALL infog2l( lktop, indx,
713 $ desch, nprow, npcol, myrow, mycol,
714 $ iloc, jloc, rsrc1, csrc1 )
715 IF( myrow.EQ.rsrc1.AND.mycol.EQ.csrc1 ) THEN
716 lcols = min( nb, n-indx+1 )
717 CALL dgemm( 'Transpose', 'No Transpose',
718 $ lnwin, lcols, lnwin, one, work(ipu),
719 $ lnwin, h((jloc-1)*lldh+iloc), lldh,
720 $ zero, work(ipw),
721 $ lnwin )
722 CALL dlamov( 'All', lnwin, lcols,
723 $ work(ipw), lnwin,
724 $ h((jloc-1)*lldh+iloc), lldh )
725 END IF
726 95 CONTINUE
727 END IF
728 END IF
729 ELSE
730 ks = lnwin-lns/2*3
731*
732* The LNWIN-by-LNWIN matrix U containing the accumulated
733* orthogonal transformations has the following structure:
734*
735* [ U11 U12 ]
736* U = [ ],
737* [ U21 U22 ]
738*
739* where U21 is KS-by-KS upper triangular and U12 is
740* (LNWIN-KS)-by-(LNWIN-KS) lower triangular.
741* Here, KS = LNS.
742*
743* Update the columns of H and Z affected by the bulge
744* chasing.
745*
746* Compute H2*U21 + H1*U11 in workspace.
747*
748 IF( dir.EQ.2 .AND. lencbuf.GT.0 .AND.
749 $ mycol.EQ.lcsrc ) THEN
750 IF( wantt ) THEN
751 DO 100 indx = 1, lktop-liroffh-1, nb
752 CALL infog2l( indx, lktop, desch, nprow,
753 $ npcol, myrow, mycol, iloc, jloc, rsrc1,
754 $ csrc1 )
755 IF( myrow.EQ.rsrc1.AND.mycol.EQ.csrc1 ) THEN
756 jloc1 = indxg2l( lktop+lnwin-ks, nb,
757 $ mycol, desch( csrc_ ), npcol )
758 lrows = min( nb, lktop-indx )
759 CALL dlamov( 'All', lrows, ks,
760 $ h((jloc1-1)*lldh+iloc ), lldh,
761 $ work(ipw), lrows )
762 CALL dtrmm( 'Right', 'Upper',
763 $ 'No transpose','Non-unit', lrows,
764 $ ks, one, work( ipu+lnwin-ks ), lnwin,
765 $ work(ipw), lrows )
766 CALL dgemm('No transpose', 'No transpose',
767 $ lrows, ks, lnwin-ks, one,
768 $ h((jloc-1)*lldh+iloc), lldh,
769 $ work( ipu ), lnwin, one, work(ipw),
770 $ lrows )
771*
772* Compute H1*U12 + H2*U22 in workspace.
773*
774 CALL dlamov( 'All', lrows, lnwin-ks,
775 $ h((jloc-1)*lldh+iloc), lldh,
776 $ work( ipw+ks*lrows ), lrows )
777 CALL dtrmm( 'Right', 'Lower',
778 $ 'No transpose', 'Non-Unit',
779 $ lrows, lnwin-ks, one,
780 $ work( ipu+lnwin*ks ), lnwin,
781 $ work( ipw+ks*lrows ), lrows )
782 CALL dgemm('No transpose', 'No transpose',
783 $ lrows, lnwin-ks, ks, one,
784 $ h((jloc1-1)*lldh+iloc), lldh,
785 $ work( ipu+lnwin*ks+lnwin-ks ), lnwin,
786 $ one, work( ipw+ks*lrows ), lrows )
787*
788* Copy workspace to H.
789*
790 CALL dlamov( 'All', lrows, lnwin,
791 $ work(ipw), lrows,
792 $ h((jloc-1)*lldh+iloc), lldh )
793 END IF
794 100 CONTINUE
795 END IF
796*
797 IF( wantz ) THEN
798*
799* Compute Z2*U21 + Z1*U11 in workspace.
800*
801 DO 110 indx = 1, n, nb
802 CALL infog2l( indx, lktop, descz, nprow,
803 $ npcol, myrow, mycol, iloc, jloc, rsrc1,
804 $ csrc1 )
805 IF( myrow.EQ.rsrc1.AND.mycol.EQ.csrc1 ) THEN
806 jloc1 = indxg2l( lktop+lnwin-ks, nb,
807 $ mycol, descz( csrc_ ), npcol )
808 lrows = min(nb,n-indx+1)
809 CALL dlamov( 'All', lrows, ks,
810 $ z((jloc1-1)*lldz+iloc ), lldz,
811 $ work(ipw), lrows )
812 CALL dtrmm( 'Right', 'Upper',
813 $ 'No transpose', 'Non-unit',
814 $ lrows, ks, one, work( ipu+lnwin-ks ),
815 $ lnwin, work(ipw), lrows )
816 CALL dgemm( 'No transpose',
817 $ 'No transpose', lrows, ks, lnwin-ks,
818 $ one, z((jloc-1)*lldz+iloc), lldz,
819 $ work( ipu ), lnwin, one, work(ipw),
820 $ lrows )
821*
822* Compute Z1*U12 + Z2*U22 in workspace.
823*
824 CALL dlamov( 'All', lrows, lnwin-ks,
825 $ z((jloc-1)*lldz+iloc), lldz,
826 $ work( ipw+ks*lrows ), lrows)
827 CALL dtrmm( 'Right', 'Lower',
828 $ 'No transpose', 'Non-unit',
829 $ lrows, lnwin-ks, one,
830 $ work( ipu+lnwin*ks ), lnwin,
831 $ work( ipw+ks*lrows ), lrows )
832 CALL dgemm( 'No transpose',
833 $ 'No transpose', lrows, lnwin-ks, ks,
834 $ one, z((jloc1-1)*lldz+iloc), lldz,
835 $ work( ipu+lnwin*ks+lnwin-ks ), lnwin,
836 $ one, work( ipw+ks*lrows ),
837 $ lrows )
838*
839* Copy workspace to Z.
840*
841 CALL dlamov( 'All', lrows, lnwin,
842 $ work(ipw), lrows,
843 $ z((jloc-1)*lldz+iloc), lldz )
844 END IF
845 110 CONTINUE
846 END IF
847 END IF
848*
849 IF( dir.EQ.1 .AND. lenrbuf.GT.0 .AND.
850 $ myrow.EQ.lrsrc ) THEN
851 IF( wantt ) THEN
852 indxs = iceil(lkbot,nb)*nb + 1
853 DO 120 indx = indxs, n, nb
854 CALL infog2l( lktop, indx,
855 $ desch, nprow, npcol, myrow, mycol, iloc,
856 $ jloc, rsrc1, csrc1 )
857 IF( myrow.EQ.rsrc1.AND.mycol.EQ.csrc1 ) THEN
858*
859* Compute U21**T*H2 + U11**T*H1 in workspace.
860*
861 iloc1 = indxg2l( lktop+lnwin-ks, nb,
862 $ myrow, desch( rsrc_ ), nprow )
863 lcols = min( nb, n-indx+1 )
864 CALL dlamov( 'All', ks, lcols,
865 $ h((jloc-1)*lldh+iloc1), lldh,
866 $ work(ipw), lnwin )
867 CALL dtrmm( 'Left', 'Upper', 'Transpose',
868 $ 'Non-unit', ks, lcols, one,
869 $ work( ipu+lnwin-ks ), lnwin,
870 $ work(ipw), lnwin )
871 CALL dgemm( 'Transpose', 'No transpose',
872 $ ks, lcols, lnwin-ks, one, work(ipu),
873 $ lnwin, h((jloc-1)*lldh+iloc), lldh,
874 $ one, work(ipw), lnwin )
875*
876* Compute U12**T*H1 + U22**T*H2 in workspace.
877*
878 CALL dlamov( 'All', lnwin-ks, lcols,
879 $ h((jloc-1)*lldh+iloc), lldh,
880 $ work( ipw+ks ), lnwin )
881 CALL dtrmm( 'Left', 'Lower', 'Transpose',
882 $ 'Non-unit', lnwin-ks, lcols, one,
883 $ work( ipu+lnwin*ks ), lnwin,
884 $ work( ipw+ks ), lnwin )
885 CALL dgemm( 'Transpose', 'No Transpose',
886 $ lnwin-ks, lcols, ks, one,
887 $ work( ipu+lnwin*ks+lnwin-ks ), lnwin,
888 $ h((jloc-1)*lldh+iloc1), lldh,
889 $ one, work( ipw+ks ), lnwin )
890*
891* Copy workspace to H.
892*
893 CALL dlamov( 'All', lnwin, lcols,
894 $ work(ipw), lnwin,
895 $ h((jloc-1)*lldh+iloc), lldh )
896 END IF
897 120 CONTINUE
898 END IF
899 END IF
900 END IF
901*
902* Update position information about current window.
903*
904 IF( dir.EQ.2 ) THEN
905 IF( lkbot.EQ.kbot ) THEN
906 lktop = kbot+1
907 lkbot = kbot+1
908 iwork( 1+(win-1)*5 ) = lktop
909 iwork( 2+(win-1)*5 ) = lkbot
910 iwork( 5+(win-1)*5 ) = 2
911 ELSE
912 lktop = min( lktop + lnwin - lchain,
913 $ iceil( lktop, nb )*nb - lchain + 1,
914 $ kbot )
915 iwork( 1+(win-1)*5 ) = lktop
916 lkbot = min( lkbot + lnwin - lchain,
917 $ iceil( lkbot, nb )*nb, kbot )
918 iwork( 2+(win-1)*5 ) = lkbot
919 lnwin = lkbot-lktop+1
920 IF( lnwin.EQ.lchain ) iwork(5+(win-1)*5) = 2
921 END IF
922 END IF
923 75 CONTINUE
924 70 CONTINUE
925 65 CONTINUE
926*
927* If bulges were chasen off from first window, the window is
928* removed.
929*
930 IF( ichoff.GT.0 ) THEN
931 DO 128 win = 2, anmwin
932 iwork( 1+(win-2)*5 ) = iwork( 1+(win-1)*5 )
933 iwork( 2+(win-2)*5 ) = iwork( 2+(win-1)*5 )
934 iwork( 3+(win-2)*5 ) = iwork( 3+(win-1)*5 )
935 iwork( 4+(win-2)*5 ) = iwork( 4+(win-1)*5 )
936 iwork( 5+(win-2)*5 ) = iwork( 5+(win-1)*5 )
937 128 CONTINUE
938 anmwin = anmwin - 1
939 ipiw = 6+(anmwin-1)*5
940 END IF
941*
942* If we have no more windows, return.
943*
944 IF( anmwin.LT.1 ) RETURN
945*
946 ELSE
947*
948* Set up windows such that as many bulges as possible can be
949* moved over the border to the next block. Make sure that the
950* cross border window is at least (NTINY+1)-by-(NTINY+1), unless
951* we are chasing off the bulges from the last window. This is
952* accomplished by setting the bottom index LKBOT such that the
953* local window has the correct size.
954*
955* If LKBOT then becomes larger than KBOT, the endpoint of the whole
956* global submatrix, or LKTOP from a window located already residing
957* at the other side of the border, this is taken care of by some
958* dirty tricks.
959*
960 DO 130 win = 1, anmwin
961 lktop1 = iwork( 1+(win-1)*5 )
962 lkbot = iwork( 2+(win-1)*5 )
963 lnwin = max( 6, min( lkbot - lktop1 + 1, lchain ) )
964 lkbot1 = max( min( kbot, iceil(lktop1,nb)*nb+lchain),
965 $ min( kbot, min( lktop1+2*lnwin-1,
966 $ (iceil(lktop1,nb)+1)*nb ) ) )
967 iwork( 2+(win-1)*5 ) = lkbot1
968 130 CONTINUE
969 ichoff = 0
970*
971* Keep a record over what windows that were moved over the borders
972* such that we can delay some windows due to lack of space on the
973* other side of the border; we do not want to leave any of the
974* bulges behind...
975*
976* IWORK( 5+(WIN-1)*5 ) = 0: window WIN has not been processed
977* IWORK( 5+(WIN-1)*5 ) = 1: window WIN is being processed (need to
978* know for updates)
979* IWORK( 5+(WIN-1)*5 ) = 2: window WIN has been fully processed
980*
981* So, start by marking all windows as not processed.
982*
983 DO 135 win = 1, anmwin
984 iwork( 5+(win-1)*5 ) = 0
985 135 CONTINUE
986*
987* Do the cross border bulge-chase as follows: Start from the
988* first window (the one that is closest to be chased off the
989* diagonal of H) and take the odd windows first followed by the
990* even ones. To not get into hang-problems on processor meshes
991* with at least one odd dimension, the windows will in such a case
992* be processed in chunks of {the minimum odd process dimension}-1
993* windows to avoid overlapping processor scopes in forming the
994* cross border computational windows and the cross border update
995* regions.
996*
997 wchunk = max( 1, min( anmwin, nprow-1, npcol-1 ) )
998 numchunk = iceil( anmwin, wchunk )
999*
1000* Based on the computed chunk of windows, start working with
1001* crossborder bulge-chasing. Repeat this as long as there is
1002* still work left to do (137 is a kind of do-while statement).
1003*
1004 137 CONTINUE
1005*
1006* Zero out LENRBUF and LENCBUF each time we restart this loop.
1007*
1008 lenrbuf = 0
1009 lencbuf = 0
1010*
1011 DO 140 oddeven = 1, min( 2, anmwin )
1012 DO 150 chunknum = 1, numchunk
1013 ipnext = 1
1014 DO 160 win = oddeven+(chunknum-1)*wchunk,
1015 $ min(anmwin,max(1,oddeven+(chunknum)*wchunk-1)), 2
1016*
1017* Get position and size of the WIN:th active window and
1018* make sure that we skip the cross border bulge for this
1019* window if the window is not shared between several data
1020* layout blocks (and processors).
1021*
1022* Also, delay windows that do not have sufficient size of
1023* the other side of the border. Moreover, make sure to skip
1024* windows that was already processed in the last round of
1025* the do-while loop (137).
1026*
1027 IF( iwork( 5+(win-1)*5 ).EQ.2 ) GO TO 165
1028 lktop = iwork( 1+(win-1)*5 )
1029 lkbot = iwork( 2+(win-1)*5 )
1030 IF( win.GT.1 ) THEN
1031 lktop2 = iwork( 1+(win-2)*5 )
1032 ELSE
1033 lktop2 = kbot+1
1034 END IF
1035 IF( iceil(lktop,nb).EQ.iceil(lkbot,nb) .OR.
1036 $ lkbot.GE.lktop2 ) GO TO 165
1037 lnwin = lkbot - lktop + 1
1038 IF( lnwin.LE.ntiny .AND. lkbot.NE.kbot .AND.
1039 $ .NOT. mod(lkbot,nb).EQ.0 ) GO TO 165
1040*
1041* If window is going to be processed, mark it as processed.
1042*
1043 iwork( 5+(win-1)*5 ) = 1
1044*
1045* Extract processors for current cross border window,
1046* as below:
1047*
1048* 1 | 2
1049* --+--
1050* 3 | 4
1051*
1052 rsrc1 = iwork( 3+(win-1)*5 )
1053 csrc1 = iwork( 4+(win-1)*5 )
1054 rsrc2 = rsrc1
1055 csrc2 = mod( csrc1+1, npcol )
1056 rsrc3 = mod( rsrc1+1, nprow )
1057 csrc3 = csrc1
1058 rsrc4 = mod( rsrc1+1, nprow )
1059 csrc4 = mod( csrc1+1, npcol )
1060*
1061* Form group of four processors for cross border window.
1062*
1063 IF( ( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) .OR.
1064 $ ( myrow.EQ.rsrc2 .AND. mycol.EQ.csrc2 ) .OR.
1065 $ ( myrow.EQ.rsrc3 .AND. mycol.EQ.csrc3 ) .OR.
1066 $ ( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) ) THEN
1067*
1068* Compute the upper and lower parts of the active
1069* window.
1070*
1071 dim1 = nb - mod(lktop-1,nb)
1072 dim4 = lnwin - dim1
1073*
1074* Temporarily compute a new value of the size of the
1075* computational window that is larger than or equal to
1076* NTINY+1; call the *real* value DIM.
1077*
1078 dim = lnwin
1079 lnwin = max(ntiny+1,lnwin)
1080*
1081* Divide workspace.
1082*
1083 ipu = ipnext
1084 iph = ipu + dim**2
1085 ipuu = iph + lnwin**2
1086 ipv = ipuu + lnwin**2
1087 ipnext = iph
1088 IF( dim.LT.lnwin ) THEN
1089 CALL dlaset( 'All', lnwin, lnwin, zero,
1090 $ one, work( iph ), lnwin )
1091 ELSE
1092 CALL dlaset( 'All', dim, dim, zero,
1093 $ zero, work( iph ), lnwin )
1094 END IF
1095*
1096* Form the active window.
1097*
1098 IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) THEN
1099 iloc = indxg2l( lktop, nb, myrow,
1100 $ desch( rsrc_ ), nprow )
1101 jloc = indxg2l( lktop, nb, mycol,
1102 $ desch( csrc_ ), npcol )
1103 CALL dlamov( 'All', dim1, dim1,
1104 $ h((jloc-1)*lldh+iloc), lldh, work(iph),
1105 $ lnwin )
1106 IF( rsrc1.NE.rsrc4 .OR. csrc1.NE.csrc4 ) THEN
1107* Proc#1 <==> Proc#4
1108 CALL dgesd2d( ictxt, dim1, dim1,
1109 $ work(iph), lnwin, rsrc4, csrc4 )
1110 CALL dgerv2d( ictxt, dim4, dim4,
1111 $ work(iph+dim1*lnwin+dim1),
1112 $ lnwin, rsrc4, csrc4 )
1113 END IF
1114 END IF
1115 IF( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) THEN
1116 iloc = indxg2l( lktop+dim1, nb, myrow,
1117 $ desch( rsrc_ ), nprow )
1118 jloc = indxg2l( lktop+dim1, nb, mycol,
1119 $ desch( csrc_ ), npcol )
1120 CALL dlamov( 'All', dim4, dim4,
1121 $ h((jloc-1)*lldh+iloc), lldh,
1122 $ work(iph+dim1*lnwin+dim1),
1123 $ lnwin )
1124 IF( rsrc4.NE.rsrc1 .OR. csrc4.NE.csrc1 ) THEN
1125* Proc#4 <==> Proc#1
1126 CALL dgesd2d( ictxt, dim4, dim4,
1127 $ work(iph+dim1*lnwin+dim1),
1128 $ lnwin, rsrc1, csrc1 )
1129 CALL dgerv2d( ictxt, dim1, dim1,
1130 $ work(iph), lnwin, rsrc1, csrc1 )
1131 END IF
1132 END IF
1133 IF( myrow.EQ.rsrc2 .AND. mycol.EQ.csrc2 ) THEN
1134 iloc = indxg2l( lktop, nb, myrow,
1135 $ desch( rsrc_ ), nprow )
1136 jloc = indxg2l( lktop+dim1, nb, mycol,
1137 $ desch( csrc_ ), npcol )
1138 CALL dlamov( 'All', dim1, dim4,
1139 $ h((jloc-1)*lldh+iloc), lldh,
1140 $ work(iph+dim1*lnwin), lnwin )
1141 IF( rsrc2.NE.rsrc1 .OR. csrc2.NE.csrc1 ) THEN
1142* Proc#2 ==> Proc#1
1143 CALL dgesd2d( ictxt, dim1, dim4,
1144 $ work(iph+dim1*lnwin),
1145 $ lnwin, rsrc1, csrc1 )
1146 END IF
1147 END IF
1148 IF( myrow.EQ.rsrc2 .AND. mycol.EQ.csrc2 ) THEN
1149 IF( rsrc2.NE.rsrc4 .OR. csrc2.NE.csrc4 ) THEN
1150* Proc#2 ==> Proc#4
1151 CALL dgesd2d( ictxt, dim1, dim4,
1152 $ work(iph+dim1*lnwin),
1153 $ lnwin, rsrc4, csrc4 )
1154 END IF
1155 END IF
1156 IF( myrow.EQ.rsrc3 .AND. mycol.EQ.csrc3 ) THEN
1157 iloc = indxg2l( lktop+dim1, nb, myrow,
1158 $ desch( rsrc_ ), nprow )
1159 jloc = indxg2l( lktop+dim1-1, nb, mycol,
1160 $ desch( csrc_ ), npcol )
1161 CALL dlamov( 'All', 1, 1,
1162 $ h((jloc-1)*lldh+iloc), lldh,
1163 $ work(iph+(dim1-1)*lnwin+dim1),
1164 $ lnwin )
1165 IF( rsrc3.NE.rsrc1 .OR. csrc3.NE.csrc1 ) THEN
1166* Proc#3 ==> Proc#1
1167 CALL dgesd2d( ictxt, 1, 1,
1168 $ work(iph+(dim1-1)*lnwin+dim1),
1169 $ lnwin, rsrc1, csrc1 )
1170 END IF
1171 END IF
1172 IF( myrow.EQ.rsrc3 .AND. mycol.EQ.csrc3 ) THEN
1173 IF( rsrc3.NE.rsrc4 .OR. csrc3.NE.csrc4 ) THEN
1174* Proc#3 ==> Proc#4
1175 CALL dgesd2d( ictxt, 1, 1,
1176 $ work(iph+(dim1-1)*lnwin+dim1),
1177 $ lnwin, rsrc4, csrc4 )
1178 END IF
1179 END IF
1180 IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) THEN
1181 IF( rsrc1.NE.rsrc2 .OR. csrc1.NE.csrc2 ) THEN
1182* Proc#1 <== Proc#2
1183 CALL dgerv2d( ictxt, dim1, dim4,
1184 $ work(iph+dim1*lnwin),
1185 $ lnwin, rsrc2, csrc2 )
1186 END IF
1187 IF( rsrc1.NE.rsrc3 .OR. csrc1.NE.csrc3 ) THEN
1188* Proc#1 <== Proc#3
1189 CALL dgerv2d( ictxt, 1, 1,
1190 $ work(iph+(dim1-1)*lnwin+dim1),
1191 $ lnwin, rsrc3, csrc3 )
1192 END IF
1193 END IF
1194 IF( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) THEN
1195 IF( rsrc4.NE.rsrc2 .OR. csrc4.NE.csrc2 ) THEN
1196* Proc#4 <== Proc#2
1197 CALL dgerv2d( ictxt, dim1, dim4,
1198 $ work(iph+dim1*lnwin),
1199 $ lnwin, rsrc2, csrc2 )
1200 END IF
1201 IF( rsrc4.NE.rsrc3 .OR. csrc4.NE.csrc3 ) THEN
1202* Proc#4 <== Proc#3
1203 CALL dgerv2d( ictxt, 1, 1,
1204 $ work(iph+(dim1-1)*lnwin+dim1),
1205 $ lnwin, rsrc3, csrc3 )
1206 END IF
1207 END IF
1208*
1209* Prepare for call to DLAQR6 - it could happen that no
1210* bulges where introduced in the pre-cross border step
1211* since the chain was too long to fit in the top-left
1212* part of the cross border window. In such a case, the
1213* bulges are introduced here instead. It could also
1214* happen that the bottom-right part is too small to hold
1215* the whole chain -- in such a case, the bulges are
1216* chasen off immediately, as well.
1217*
1218 IF( (myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1) .OR.
1219 $ (myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4) ) THEN
1220 IF( lktop.EQ.ktop .AND. lkbot.EQ.kbot .AND.
1221 $ (dim1.LE.lchain .OR. dim1.LE.ntiny ) ) THEN
1222 job = 'All steps'
1223 ichoff = 1
1224 ELSEIF( lktop.EQ.ktop .AND.
1225 $ ( dim1.LE.lchain .OR. dim1.LE.ntiny ) ) THEN
1226 job = 'Introduce and chase'
1227 ELSEIF( lkbot.EQ.kbot ) THEN
1228 job = 'Off-chase bulges'
1229 ichoff = 1
1230 ELSE
1231 job = 'Chase bulges'
1232 END IF
1233 ku = lnwin - kdu + 1
1234 kwh = kdu + 1
1235 nho = ( lnwin-kdu+1-4 ) - ( kdu+1 ) + 1
1236 kwv = kdu + 4
1237 nve = lnwin - kdu - kwv + 1
1238 CALL dlaset( 'All', lnwin, lnwin,
1239 $ zero, one, work(ipuu), lnwin )
1240*
1241* Small-bulge multi-shift QR sweep.
1242*
1243 lks = max(1, ns - win*lns + 1)
1244 CALL dlaqr6( job, wantt, .true., lkacc22, lnwin,
1245 $ 1, dim, lns, sr( lks ), si( lks ),
1246 $ work(iph), lnwin, 1, dim,
1247 $ work(ipuu), lnwin, work(ipu), 3,
1248 $ work( iph+ku-1 ), lnwin, nve,
1249 $ work( iph+kwv-1 ), lnwin, nho,
1250 $ work( iph-1+ku+(kwh-1)*lnwin ), lnwin )
1251*
1252* Copy local submatrices of H back to global matrix.
1253*
1254 IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) THEN
1255 iloc = indxg2l( lktop, nb, myrow,
1256 $ desch( rsrc_ ), nprow )
1257 jloc = indxg2l( lktop, nb, mycol,
1258 $ desch( csrc_ ), npcol )
1259 CALL dlamov( 'All', dim1, dim1, work(iph),
1260 $ lnwin, h((jloc-1)*lldh+iloc),
1261 $ lldh )
1262 END IF
1263 IF( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) THEN
1264 iloc = indxg2l( lktop+dim1, nb, myrow,
1265 $ desch( rsrc_ ), nprow )
1266 jloc = indxg2l( lktop+dim1, nb, mycol,
1267 $ desch( csrc_ ), npcol )
1268 CALL dlamov( 'All', dim4, dim4,
1269 $ work(iph+dim1*lnwin+dim1),
1270 $ lnwin, h((jloc-1)*lldh+iloc), lldh )
1271 END IF
1272*
1273* Copy actual submatrix of U to the correct place of
1274* the buffer.
1275*
1276 CALL dlamov( 'All', dim, dim,
1277 $ work(ipuu), lnwin, work(ipu), dim )
1278 END IF
1279*
1280* Return data to process 2 and 3.
1281*
1282 rws3 = min(3,dim4)
1283 cls3 = min(3,dim1)
1284 IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) THEN
1285 IF( rsrc1.NE.rsrc3 .OR. csrc1.NE.csrc3 ) THEN
1286* Proc#1 ==> Proc#3
1287 CALL dgesd2d( ictxt, rws3, cls3,
1288 $ work( iph+(dim1-cls3)*lnwin+dim1 ),
1289 $ lnwin, rsrc3, csrc3 )
1290 END IF
1291 END IF
1292 IF( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) THEN
1293 IF( rsrc4.NE.rsrc2 .OR. csrc4.NE.csrc2 ) THEN
1294* Proc#4 ==> Proc#2
1295 CALL dgesd2d( ictxt, dim1, dim4,
1296 $ work( iph+dim1*lnwin),
1297 $ lnwin, rsrc2, csrc2 )
1298 END IF
1299 END IF
1300 IF( myrow.EQ.rsrc2 .AND. mycol.EQ.csrc2 ) THEN
1301 iloc = indxg2l( lktop, nb, myrow,
1302 $ desch( rsrc_ ), nprow )
1303 jloc = indxg2l( lktop+dim1, nb, mycol,
1304 $ desch( csrc_ ), npcol )
1305 IF( rsrc2.NE.rsrc4 .OR. csrc2.NE.csrc4 ) THEN
1306* Proc#2 <== Proc#4
1307 CALL dgerv2d( ictxt, dim1, dim4,
1308 $ work(iph+dim1*lnwin),
1309 $ lnwin, rsrc4, csrc4 )
1310 END IF
1311 CALL dlamov( 'All', dim1, dim4,
1312 $ work( iph+dim1*lnwin ), lnwin,
1313 $ h((jloc-1)*lldh+iloc), lldh )
1314 END IF
1315 IF( myrow.EQ.rsrc3 .AND. mycol.EQ.csrc3 ) THEN
1316 iloc = indxg2l( lktop+dim1, nb, myrow,
1317 $ desch( rsrc_ ), nprow )
1318 jloc = indxg2l( lktop+dim1-cls3, nb, mycol,
1319 $ desch( csrc_ ), npcol )
1320 IF( rsrc3.NE.rsrc1 .OR. csrc3.NE.csrc1 ) THEN
1321* Proc#3 <== Proc#1
1322 CALL dgerv2d( ictxt, rws3, cls3,
1323 $ work( iph+(dim1-cls3)*lnwin+dim1 ),
1324 $ lnwin, rsrc1, csrc1 )
1325 END IF
1326 CALL dlamov( 'Upper', rws3, cls3,
1327 $ work( iph+(dim1-cls3)*lnwin+dim1 ),
1328 $ lnwin, h((jloc-1)*lldh+iloc),
1329 $ lldh )
1330 IF( rws3.GT.1 .AND. cls3.GT.1 ) THEN
1331 elem = work( iph+(dim1-cls3)*lnwin+dim1+1 )
1332 IF( elem.NE.zero ) THEN
1333 CALL dlamov( 'Lower', rws3-1, cls3-1,
1334 $ work( iph+(dim1-cls3)*lnwin+dim1+1 ),
1335 $ lnwin, h((jloc-1)*lldh+iloc+1), lldh )
1336 END IF
1337 END IF
1338 END IF
1339*
1340* Restore correct value of LNWIN.
1341*
1342 lnwin = dim
1343*
1344 END IF
1345*
1346* Increment counter for buffers of orthogonal
1347* transformations.
1348*
1349 IF( myrow.EQ.rsrc1 .OR. mycol.EQ.csrc1 .OR.
1350 $ myrow.EQ.rsrc4 .OR. mycol.EQ.csrc4 ) THEN
1351 IF( myrow.EQ.rsrc1 .OR. myrow.EQ.rsrc4 )
1352 $ lenrbuf = lenrbuf + lnwin*lnwin
1353 IF( mycol.EQ.csrc1 .OR. mycol.EQ.csrc4 )
1354 $ lencbuf = lencbuf + lnwin*lnwin
1355 END IF
1356*
1357* If no cross border bulge chasing was performed for the
1358* current WIN:th window, the processor jump to this point
1359* and consider the next one.
1360*
1361 165 CONTINUE
1362*
1363 160 CONTINUE
1364*
1365* Broadcast orthogonal transformations -- this will only happen
1366* if the buffer associated with the orthogonal transformations
1367* is not empty (controlled by LENRBUF, for row-wise
1368* broadcasts, and LENCBUF, for column-wise broadcasts).
1369*
1370 DO 170 dir = 1, 2
1371 bcdone = .false.
1372 DO 180 win = oddeven+(chunknum-1)*wchunk,
1373 $ min(anmwin,max(1,oddeven+(chunknum)*wchunk-1)), 2
1374 IF( ( lenrbuf.EQ.0 .AND. lencbuf.EQ.0 ) .OR.
1375 $ bcdone ) GO TO 185
1376 rsrc1 = iwork( 3+(win-1)*5 )
1377 csrc1 = iwork( 4+(win-1)*5 )
1378 rsrc4 = mod( rsrc1+1, nprow )
1379 csrc4 = mod( csrc1+1, npcol )
1380 IF( ( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) .OR.
1381 $ ( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) ) THEN
1382 IF( dir.EQ.1 .AND. lenrbuf.GT.0 .AND.
1383 $ npcol.GT.1 .AND. nprocs.GT.2 ) THEN
1384 IF( myrow.EQ.rsrc1 .OR. ( myrow.EQ.rsrc4
1385 $ .AND. rsrc4.NE.rsrc1 ) ) THEN
1386 CALL dgebs2d( ictxt, 'Row', '1-Tree',
1387 $ lenrbuf, 1, work, lenrbuf )
1388 ELSE
1389 CALL dgebr2d( ictxt, 'Row', '1-Tree',
1390 $ lenrbuf, 1, work, lenrbuf, rsrc1,
1391 $ csrc1 )
1392 END IF
1393 ELSEIF( dir.EQ.2 .AND. lencbuf.GT.0 .AND.
1394 $ nprow.GT.1 .AND. nprocs.GT.2 ) THEN
1395 IF( mycol.EQ.csrc1 .OR. ( mycol.EQ.csrc4
1396 $ .AND. csrc4.NE.csrc1 ) ) THEN
1397 CALL dgebs2d( ictxt, 'Col', '1-Tree',
1398 $ lencbuf, 1, work, lencbuf )
1399 ELSE
1400 CALL dgebr2d( ictxt, 'Col', '1-Tree',
1401 $ lencbuf, 1, work(1+lenrbuf), lencbuf,
1402 $ rsrc1, csrc1 )
1403 END IF
1404 END IF
1405 IF( lenrbuf.GT.0 .AND. ( mycol.EQ.csrc1 .OR.
1406 $ ( mycol.EQ.csrc4 .AND. csrc4.NE.csrc1 ) ) )
1407 $ CALL dlamov( 'All', lenrbuf, 1, work, lenrbuf,
1408 $ work(1+lenrbuf), lencbuf )
1409 bcdone = .true.
1410 ELSEIF( myrow.EQ.rsrc1 .AND. dir.EQ.1 ) THEN
1411 IF( lenrbuf.GT.0 .AND. npcol.GT.1 )
1412 $ CALL dgebr2d( ictxt, 'Row', '1-Tree', lenrbuf,
1413 $ 1, work, lenrbuf, rsrc1, csrc1 )
1414 bcdone = .true.
1415 ELSEIF( mycol.EQ.csrc1 .AND. dir.EQ.2 ) THEN
1416 IF( lencbuf.GT.0 .AND. nprow.GT.1 )
1417 $ CALL dgebr2d( ictxt, 'Col', '1-Tree', lencbuf,
1418 $ 1, work(1+lenrbuf), lencbuf, rsrc1, csrc1 )
1419 bcdone = .true.
1420 ELSEIF( myrow.EQ.rsrc4 .AND. dir.EQ.1 ) THEN
1421 IF( lenrbuf.GT.0 .AND. npcol.GT.1 )
1422 $ CALL dgebr2d( ictxt, 'Row', '1-Tree', lenrbuf,
1423 $ 1, work, lenrbuf, rsrc4, csrc4 )
1424 bcdone = .true.
1425 ELSEIF( mycol.EQ.csrc4 .AND. dir.EQ.2 ) THEN
1426 IF( lencbuf.GT.0 .AND. nprow.GT.1 )
1427 $ CALL dgebr2d( ictxt, 'Col', '1-Tree', lencbuf,
1428 $ 1, work(1+lenrbuf), lencbuf, rsrc4, csrc4 )
1429 bcdone = .true.
1430 END IF
1431 185 CONTINUE
1432 180 CONTINUE
1433 170 CONTINUE
1434*
1435* Prepare for computing cross border updates by exchanging
1436* data in cross border update regions in H and Z.
1437*
1438 DO 190 dir = 1, 2
1439 winid = 0
1440 ipw3 = 1
1441 DO 200 win = oddeven+(chunknum-1)*wchunk,
1442 $ min(anmwin,max(1,oddeven+(chunknum)*wchunk-1)), 2
1443 IF( iwork( 5+(win-1)*5 ).NE.1 ) GO TO 205
1444*
1445* Make sure this part of the code is only executed when
1446* there has been some work performed on the WIN:th
1447* window.
1448*
1449 lktop = iwork( 1+(win-1)*5 )
1450 lkbot = iwork( 2+(win-1)*5 )
1451*
1452* Extract processor indices associated with
1453* the current window.
1454*
1455 rsrc1 = iwork( 3+(win-1)*5 )
1456 csrc1 = iwork( 4+(win-1)*5 )
1457 rsrc4 = mod( rsrc1+1, nprow )
1458 csrc4 = mod( csrc1+1, npcol )
1459*
1460* Compute local number of rows and columns
1461* of H and Z to exchange.
1462*
1463 IF(((mycol.EQ.csrc1.OR.mycol.EQ.csrc4).AND.dir.EQ.2)
1464 $ .OR.((myrow.EQ.rsrc1.OR.myrow.EQ.rsrc4).AND.
1465 $ dir.EQ.1)) THEN
1466 winid = winid + 1
1467 lnwin = lkbot - lktop + 1
1468 ipu = ipnext
1469 dim1 = nb - mod(lktop-1,nb)
1470 dim4 = lnwin - dim1
1471 ipnext = ipu + lnwin*lnwin
1472 IF( dir.EQ.2 ) THEN
1473 IF( wantz ) THEN
1474 zrows = numroc( n, nb, myrow, descz( rsrc_ ),
1475 $ nprow )
1476 ELSE
1477 zrows = 0
1478 END IF
1479 IF( wantt ) THEN
1480 hrows = numroc( lktop-1, nb, myrow,
1481 $ desch( rsrc_ ), nprow )
1482 ELSE
1483 hrows = 0
1484 END IF
1485 ELSE
1486 zrows = 0
1487 hrows = 0
1488 END IF
1489 IF( dir.EQ.1 ) THEN
1490 IF( wantt ) THEN
1491 hcols = numroc( n - (lktop+dim1-1), nb,
1492 $ mycol, csrc4, npcol )
1493 IF( mycol.EQ.csrc4 ) hcols = hcols - dim4
1494 ELSE
1495 hcols = 0
1496 END IF
1497 ELSE
1498 hcols = 0
1499 END IF
1500 ipw = max( 1 + lenrbuf + lencbuf, ipw3 )
1501 ipw1 = ipw + hrows * lnwin
1502 IF( wantz ) THEN
1503 ipw2 = ipw1 + lnwin * hcols
1504 ipw3 = ipw2 + zrows * lnwin
1505 ELSE
1506 ipw3 = ipw1 + lnwin * hcols
1507 END IF
1508 END IF
1509*
1510* Let each process row and column involved in the updates
1511* exchange data in H and Z with their neighbours.
1512*
1513 IF( dir.EQ.2 .AND. wantt .AND. lencbuf.GT.0 ) THEN
1514 IF( mycol.EQ.csrc1 .OR. mycol.EQ.csrc4 ) THEN
1515 DO 210 indx = 1, nprow
1516 IF( mycol.EQ.csrc1 ) THEN
1517 CALL infog2l( 1+(indx-1)*nb, lktop, desch,
1518 $ nprow, npcol, myrow, mycol, iloc,
1519 $ jloc1, rsrc, csrc1 )
1520 IF( myrow.EQ.rsrc ) THEN
1521 CALL dlamov( 'All', hrows, dim1,
1522 $ h((jloc1-1)*lldh+iloc), lldh,
1523 $ work(ipw), hrows )
1524 IF( npcol.GT.1 ) THEN
1525 east = mod( mycol + 1, npcol )
1526 CALL dgesd2d( ictxt, hrows, dim1,
1527 $ work(ipw), hrows, rsrc, east )
1528 CALL dgerv2d( ictxt, hrows, dim4,
1529 $ work(ipw+hrows*dim1), hrows,
1530 $ rsrc, east )
1531 END IF
1532 END IF
1533 END IF
1534 IF( mycol.EQ.csrc4 ) THEN
1535 CALL infog2l( 1+(indx-1)*nb, lktop+dim1,
1536 $ desch, nprow, npcol, myrow, mycol,
1537 $ iloc, jloc4, rsrc, csrc4 )
1538 IF( myrow.EQ.rsrc ) THEN
1539 CALL dlamov( 'All', hrows, dim4,
1540 $ h((jloc4-1)*lldh+iloc), lldh,
1541 $ work(ipw+hrows*dim1), hrows )
1542 IF( npcol.GT.1 ) THEN
1543 west = mod( mycol - 1 + npcol,
1544 $ npcol )
1545 CALL dgesd2d( ictxt, hrows, dim4,
1546 $ work(ipw+hrows*dim1), hrows,
1547 $ rsrc, west )
1548 CALL dgerv2d( ictxt, hrows, dim1,
1549 $ work(ipw), hrows, rsrc, west )
1550 END IF
1551 END IF
1552 END IF
1553 210 CONTINUE
1554 END IF
1555 END IF
1556*
1557 IF( dir.EQ.1 .AND. wantt .AND. lenrbuf.GT.0 ) THEN
1558 IF( myrow.EQ.rsrc1 .OR. myrow.EQ.rsrc4 ) THEN
1559 DO 220 indx = 1, npcol
1560 IF( myrow.EQ.rsrc1 ) THEN
1561 IF( indx.EQ.1 ) THEN
1562 IF( lkbot.LT.n ) THEN
1563 CALL infog2l( lktop, lkbot+1, desch,
1564 $ nprow, npcol, myrow, mycol,
1565 $ iloc1, jloc, rsrc1, csrc )
1566 ELSE
1567 csrc = -1
1568 END IF
1569 ELSEIF( mod(lkbot,nb).NE.0 ) THEN
1570 CALL infog2l( lktop,
1571 $ (iceil(lkbot,nb)+(indx-2))*nb+1,
1572 $ desch, nprow, npcol, myrow, mycol,
1573 $ iloc1, jloc, rsrc1, csrc )
1574 ELSE
1575 CALL infog2l( lktop,
1576 $ (iceil(lkbot,nb)+(indx-1))*nb+1,
1577 $ desch, nprow, npcol, myrow, mycol,
1578 $ iloc1, jloc, rsrc1, csrc )
1579 END IF
1580 IF( mycol.EQ.csrc ) THEN
1581 CALL dlamov( 'All', dim1, hcols,
1582 $ h((jloc-1)*lldh+iloc1), lldh,
1583 $ work(ipw1), lnwin )
1584 IF( nprow.GT.1 ) THEN
1585 south = mod( myrow + 1, nprow )
1586 CALL dgesd2d( ictxt, dim1, hcols,
1587 $ work(ipw1), lnwin, south,
1588 $ csrc )
1589 CALL dgerv2d( ictxt, dim4, hcols,
1590 $ work(ipw1+dim1), lnwin, south,
1591 $ csrc )
1592 END IF
1593 END IF
1594 END IF
1595 IF( myrow.EQ.rsrc4 ) THEN
1596 IF( indx.EQ.1 ) THEN
1597 IF( lkbot.LT.n ) THEN
1598 CALL infog2l( lktop+dim1, lkbot+1,
1599 $ desch, nprow, npcol, myrow,
1600 $ mycol, iloc4, jloc, rsrc4,
1601 $ csrc )
1602 ELSE
1603 csrc = -1
1604 END IF
1605 ELSEIF( mod(lkbot,nb).NE.0 ) THEN
1606 CALL infog2l( lktop+dim1,
1607 $ (iceil(lkbot,nb)+(indx-2))*nb+1,
1608 $ desch, nprow, npcol, myrow, mycol,
1609 $ iloc4, jloc, rsrc4, csrc )
1610 ELSE
1611 CALL infog2l( lktop+dim1,
1612 $ (iceil(lkbot,nb)+(indx-1))*nb+1,
1613 $ desch, nprow, npcol, myrow, mycol,
1614 $ iloc4, jloc, rsrc4, csrc )
1615 END IF
1616 IF( mycol.EQ.csrc ) THEN
1617 CALL dlamov( 'All', dim4, hcols,
1618 $ h((jloc-1)*lldh+iloc4), lldh,
1619 $ work(ipw1+dim1), lnwin )
1620 IF( nprow.GT.1 ) THEN
1621 north = mod( myrow - 1 + nprow,
1622 $ nprow )
1623 CALL dgesd2d( ictxt, dim4, hcols,
1624 $ work(ipw1+dim1), lnwin, north,
1625 $ csrc )
1626 CALL dgerv2d( ictxt, dim1, hcols,
1627 $ work(ipw1), lnwin, north,
1628 $ csrc )
1629 END IF
1630 END IF
1631 END IF
1632 220 CONTINUE
1633 END IF
1634 END IF
1635*
1636 IF( dir.EQ.2 .AND. wantz .AND. lencbuf.GT.0) THEN
1637 IF( mycol.EQ.csrc1 .OR. mycol.EQ.csrc4 ) THEN
1638 DO 230 indx = 1, nprow
1639 IF( mycol.EQ.csrc1 ) THEN
1640 CALL infog2l( 1+(indx-1)*nb, lktop,
1641 $ descz, nprow, npcol, myrow, mycol,
1642 $ iloc, jloc1, rsrc, csrc1 )
1643 IF( myrow.EQ.rsrc ) THEN
1644 CALL dlamov( 'All', zrows, dim1,
1645 $ z((jloc1-1)*lldz+iloc), lldz,
1646 $ work(ipw2), zrows )
1647 IF( npcol.GT.1 ) THEN
1648 east = mod( mycol + 1, npcol )
1649 CALL dgesd2d( ictxt, zrows, dim1,
1650 $ work(ipw2), zrows, rsrc,
1651 $ east )
1652 CALL dgerv2d( ictxt, zrows, dim4,
1653 $ work(ipw2+zrows*dim1),
1654 $ zrows, rsrc, east )
1655 END IF
1656 END IF
1657 END IF
1658 IF( mycol.EQ.csrc4 ) THEN
1659 CALL infog2l( 1+(indx-1)*nb,
1660 $ lktop+dim1, descz, nprow, npcol,
1661 $ myrow, mycol, iloc, jloc4, rsrc,
1662 $ csrc4 )
1663 IF( myrow.EQ.rsrc ) THEN
1664 CALL dlamov( 'All', zrows, dim4,
1665 $ z((jloc4-1)*lldz+iloc), lldz,
1666 $ work(ipw2+zrows*dim1), zrows )
1667 IF( npcol.GT.1 ) THEN
1668 west = mod( mycol - 1 + npcol,
1669 $ npcol )
1670 CALL dgesd2d( ictxt, zrows, dim4,
1671 $ work(ipw2+zrows*dim1),
1672 $ zrows, rsrc, west )
1673 CALL dgerv2d( ictxt, zrows, dim1,
1674 $ work(ipw2), zrows, rsrc,
1675 $ west )
1676 END IF
1677 END IF
1678 END IF
1679 230 CONTINUE
1680 END IF
1681 END IF
1682*
1683* If no exchanges was performed for the current window,
1684* all processors jump to this point and try the next
1685* one.
1686*
1687 205 CONTINUE
1688*
1689 200 CONTINUE
1690*
1691* Compute crossborder bulge-chase updates.
1692*
1693 winid = 0
1694 IF( dir.EQ.1 ) THEN
1695 ipnext = 1
1696 ELSE
1697 ipnext = 1 + lenrbuf
1698 END IF
1699 ipw3 = 1
1700 DO 240 win = oddeven+(chunknum-1)*wchunk,
1701 $ min(anmwin,max(1,oddeven+(chunknum)*wchunk-1)), 2
1702 IF( iwork( 5+(win-1)*5 ).NE.1 ) GO TO 245
1703*
1704* Only perform this part of the code if there was really
1705* some work performed on the WIN:th window.
1706*
1707 lktop = iwork( 1+(win-1)*5 )
1708 lkbot = iwork( 2+(win-1)*5 )
1709 lnwin = lkbot - lktop + 1
1710*
1711* Extract the processor indices associated with
1712* the current window.
1713*
1714 rsrc1 = iwork( 3+(win-1)*5 )
1715 csrc1 = iwork( 4+(win-1)*5 )
1716 rsrc4 = mod( rsrc1+1, nprow )
1717 csrc4 = mod( csrc1+1, npcol )
1718*
1719 IF(((mycol.EQ.csrc1.OR.mycol.EQ.csrc4).AND.dir.EQ.2)
1720 $ .OR.((myrow.EQ.rsrc1.OR.myrow.EQ.rsrc4).AND.
1721 $ dir.EQ.1)) THEN
1722*
1723* Set up workspaces.
1724*
1725 winid = winid + 1
1726 lktop = iwork( 1+(win-1)*5 )
1727 lkbot = iwork( 2+(win-1)*5 )
1728 lnwin = lkbot - lktop + 1
1729 dim1 = nb - mod(lktop-1,nb)
1730 dim4 = lnwin - dim1
1731 ipu = ipnext + (winid-1)*lnwin*lnwin
1732 IF( dir.EQ.2 ) THEN
1733 IF( wantz ) THEN
1734 zrows = numroc( n, nb, myrow, descz( rsrc_ ),
1735 $ nprow )
1736 ELSE
1737 zrows = 0
1738 END IF
1739 IF( wantt ) THEN
1740 hrows = numroc( lktop-1, nb, myrow,
1741 $ desch( rsrc_ ), nprow )
1742 ELSE
1743 hrows = 0
1744 END IF
1745 ELSE
1746 zrows = 0
1747 hrows = 0
1748 END IF
1749 IF( dir.EQ.1 ) THEN
1750 IF( wantt ) THEN
1751 hcols = numroc( n - (lktop+dim1-1), nb,
1752 $ mycol, csrc4, npcol )
1753 IF( mycol.EQ.csrc4 ) hcols = hcols - dim4
1754 ELSE
1755 hcols = 0
1756 END IF
1757 ELSE
1758 hcols = 0
1759 END IF
1760*
1761* IPW = local copy of overlapping column block of H
1762* IPW1 = local copy of overlapping row block of H
1763* IPW2 = local copy of overlapping column block of Z
1764* IPW3 = workspace for right hand side of matrix
1765* multiplication
1766*
1767 ipw = max( 1 + lenrbuf + lencbuf, ipw3 )
1768 ipw1 = ipw + hrows * lnwin
1769 IF( wantz ) THEN
1770 ipw2 = ipw1 + lnwin * hcols
1771 ipw3 = ipw2 + zrows * lnwin
1772 ELSE
1773 ipw3 = ipw1 + lnwin * hcols
1774 END IF
1775*
1776* Recompute job to see if special structure of U
1777* could possibly be exploited.
1778*
1779 IF( lktop.EQ.ktop .AND. lkbot.EQ.kbot ) THEN
1780 job = 'All steps'
1781 ELSEIF( lktop.EQ.ktop .AND.
1782 $ ( dim1.LT.lchain+1 .OR. dim1.LE.ntiny ) )
1783 $ THEN
1784 job = 'Introduce and chase'
1785 ELSEIF( lkbot.EQ.kbot ) THEN
1786 job = 'Off-chase bulges'
1787 ELSE
1788 job = 'Chase bulges'
1789 END IF
1790 END IF
1791*
1792* Test if to exploit sparsity structure of
1793* orthogonal matrix U.
1794*
1795 ks = dim1+dim4-lns/2*3
1796 IF( .NOT. blk22 .OR. dim1.NE.ks .OR.
1797 $ dim4.NE.ks .OR. lsame(job,'I') .OR.
1798 $ lsame(job,'O') .OR. lns.LE.2 ) THEN
1799*
1800* Update the columns of H and Z.
1801*
1802 IF( dir.EQ.2 .AND. wantt .AND. lencbuf.GT.0 ) THEN
1803 DO 250 indx = 1, min(lktop-1,1+(nprow-1)*nb), nb
1804 IF( mycol.EQ.csrc1 ) THEN
1805 CALL infog2l( indx, lktop, desch, nprow,
1806 $ npcol, myrow, mycol, iloc, jloc,
1807 $ rsrc, csrc1 )
1808 IF( myrow.EQ.rsrc ) THEN
1809 CALL dgemm( 'No transpose',
1810 $ 'No transpose', hrows, dim1,
1811 $ lnwin, one, work( ipw ), hrows,
1812 $ work( ipu ), lnwin, zero,
1813 $ work(ipw3), hrows )
1814 CALL dlamov( 'All', hrows, dim1,
1815 $ work(ipw3), hrows,
1816 $ h((jloc-1)*lldh+iloc), lldh )
1817 END IF
1818 END IF
1819 IF( mycol.EQ.csrc4 ) THEN
1820 CALL infog2l( indx, lktop+dim1, desch,
1821 $ nprow, npcol, myrow, mycol, iloc,
1822 $ jloc, rsrc, csrc4 )
1823 IF( myrow.EQ.rsrc ) THEN
1824 CALL dgemm( 'No transpose',
1825 $ 'No transpose', hrows, dim4,
1826 $ lnwin, one, work( ipw ), hrows,
1827 $ work( ipu+lnwin*dim1 ), lnwin,
1828 $ zero, work(ipw3), hrows )
1829 CALL dlamov( 'All', hrows, dim4,
1830 $ work(ipw3), hrows,
1831 $ h((jloc-1)*lldh+iloc), lldh )
1832 END IF
1833 END IF
1834 250 CONTINUE
1835 END IF
1836*
1837 IF( dir.EQ.2 .AND. wantz .AND. lencbuf.GT.0 ) THEN
1838 DO 260 indx = 1, min(n,1+(nprow-1)*nb), nb
1839 IF( mycol.EQ.csrc1 ) THEN
1840 CALL infog2l( indx, lktop, descz, nprow,
1841 $ npcol, myrow, mycol, iloc, jloc,
1842 $ rsrc, csrc1 )
1843 IF( myrow.EQ.rsrc ) THEN
1844 CALL dgemm( 'No transpose',
1845 $ 'No transpose', zrows, dim1,
1846 $ lnwin, one, work( ipw2 ),
1847 $ zrows, work( ipu ), lnwin,
1848 $ zero, work(ipw3), zrows )
1849 CALL dlamov( 'All', zrows, dim1,
1850 $ work(ipw3), zrows,
1851 $ z((jloc-1)*lldz+iloc), lldz )
1852 END IF
1853 END IF
1854 IF( mycol.EQ.csrc4 ) THEN
1855 CALL infog2l( indx, lktop+dim1, descz,
1856 $ nprow, npcol, myrow, mycol, iloc,
1857 $ jloc, rsrc, csrc4 )
1858 IF( myrow.EQ.rsrc ) THEN
1859 CALL dgemm( 'No transpose',
1860 $ 'No transpose', zrows, dim4,
1861 $ lnwin, one, work( ipw2 ),
1862 $ zrows,
1863 $ work( ipu+lnwin*dim1 ), lnwin,
1864 $ zero, work(ipw3), zrows )
1865 CALL dlamov( 'All', zrows, dim4,
1866 $ work(ipw3), zrows,
1867 $ z((jloc-1)*lldz+iloc), lldz )
1868 END IF
1869 END IF
1870 260 CONTINUE
1871 END IF
1872*
1873* Update the rows of H.
1874*
1875 IF( dir.EQ.1 .AND. wantt .AND. lenrbuf.GT.0 ) THEN
1876 IF( lkbot.LT.n ) THEN
1877 IF( myrow.EQ.rsrc1.AND.mycol.EQ.csrc4 .AND.
1878 $ mod(lkbot,nb).NE.0 ) THEN
1879 indx = lkbot + 1
1880 CALL infog2l( lktop, indx, desch, nprow,
1881 $ npcol, myrow, mycol, iloc, jloc,
1882 $ rsrc1, csrc4 )
1883 CALL dgemm( 'Transpose', 'No Transpose',
1884 $ dim1, hcols, lnwin, one, work(ipu),
1885 $ lnwin, work( ipw1 ), lnwin, zero,
1886 $ work(ipw3), dim1 )
1887 CALL dlamov( 'All', dim1, hcols,
1888 $ work(ipw3), dim1,
1889 $ h((jloc-1)*lldh+iloc), lldh )
1890 END IF
1891 IF( myrow.EQ.rsrc4.AND.mycol.EQ.csrc4 .AND.
1892 $ mod(lkbot,nb).NE.0 ) THEN
1893 indx = lkbot + 1
1894 CALL infog2l( lktop+dim1, indx, desch,
1895 $ nprow, npcol, myrow, mycol, iloc,
1896 $ jloc, rsrc4, csrc4 )
1897 CALL dgemm( 'Transpose', 'No Transpose',
1898 $ dim4, hcols, lnwin, one,
1899 $ work( ipu+dim1*lnwin ), lnwin,
1900 $ work( ipw1), lnwin, zero,
1901 $ work(ipw3), dim4 )
1902 CALL dlamov( 'All', dim4, hcols,
1903 $ work(ipw3), dim4,
1904 $ h((jloc-1)*lldh+iloc), lldh )
1905 END IF
1906 indxs = iceil(lkbot,nb)*nb + 1
1907 IF( mod(lkbot,nb).NE.0 ) THEN
1908 indxe = min(n,indxs+(npcol-2)*nb)
1909 ELSE
1910 indxe = min(n,indxs+(npcol-1)*nb)
1911 END IF
1912 DO 270 indx = indxs, indxe, nb
1913 IF( myrow.EQ.rsrc1 ) THEN
1914 CALL infog2l( lktop, indx, desch,
1915 $ nprow, npcol, myrow, mycol, iloc,
1916 $ jloc, rsrc1, csrc )
1917 IF( mycol.EQ.csrc ) THEN
1918 CALL dgemm( 'Transpose',
1919 $ 'No Transpose', dim1, hcols,
1920 $ lnwin, one, work( ipu ), lnwin,
1921 $ work( ipw1 ), lnwin, zero,
1922 $ work(ipw3), dim1 )
1923 CALL dlamov( 'All', dim1, hcols,
1924 $ work(ipw3), dim1,
1925 $ h((jloc-1)*lldh+iloc), lldh )
1926 END IF
1927 END IF
1928 IF( myrow.EQ.rsrc4 ) THEN
1929 CALL infog2l( lktop+dim1, indx, desch,
1930 $ nprow, npcol, myrow, mycol, iloc,
1931 $ jloc, rsrc4, csrc )
1932 IF( mycol.EQ.csrc ) THEN
1933 CALL dgemm( 'Transpose',
1934 $ 'No Transpose', dim4, hcols,
1935 $ lnwin, one,
1936 $ work( ipu+lnwin*dim1 ), lnwin,
1937 $ work( ipw1 ), lnwin,
1938 $ zero, work(ipw3), dim4 )
1939 CALL dlamov( 'All', dim4, hcols,
1940 $ work(ipw3), dim4,
1941 $ h((jloc-1)*lldh+iloc), lldh )
1942 END IF
1943 END IF
1944 270 CONTINUE
1945 END IF
1946 END IF
1947 ELSE
1948*
1949* Update the columns of H and Z.
1950*
1951* Compute H2*U21 + H1*U11 on the left side of the border.
1952*
1953 IF( dir.EQ.2 .AND. wantt .AND. lencbuf.GT.0 ) THEN
1954 indxe = min(lktop-1,1+(nprow-1)*nb)
1955 DO 280 indx = 1, indxe, nb
1956 IF( mycol.EQ.csrc1 ) THEN
1957 CALL infog2l( indx, lktop, desch, nprow,
1958 $ npcol, myrow, mycol, iloc, jloc,
1959 $ rsrc, csrc1 )
1960 IF( myrow.EQ.rsrc ) THEN
1961 CALL dlamov( 'All', hrows, ks,
1962 $ work( ipw+hrows*dim4), hrows,
1963 $ work(ipw3), hrows )
1964 CALL dtrmm( 'Right', 'Upper',
1965 $ 'No transpose',
1966 $ 'Non-unit', hrows, ks, one,
1967 $ work( ipu+dim4 ), lnwin,
1968 $ work(ipw3), hrows )
1969 CALL dgemm( 'No transpose',
1970 $ 'No transpose', hrows, ks, dim4,
1971 $ one, work( ipw ), hrows,
1972 $ work( ipu ), lnwin, one,
1973 $ work(ipw3), hrows )
1974 CALL dlamov( 'All', hrows, ks,
1975 $ work(ipw3), hrows,
1976 $ h((jloc-1)*lldh+iloc), lldh )
1977 END IF
1978 END IF
1979*
1980* Compute H1*U12 + H2*U22 on the right side of
1981* the border.
1982*
1983 IF( mycol.EQ.csrc4 ) THEN
1984 CALL infog2l( indx, lktop+dim1, desch,
1985 $ nprow, npcol, myrow, mycol, iloc,
1986 $ jloc, rsrc, csrc4 )
1987 IF( myrow.EQ.rsrc ) THEN
1988 CALL dlamov( 'All', hrows, dim4,
1989 $ work(ipw), hrows, work( ipw3 ),
1990 $ hrows )
1991 CALL dtrmm( 'Right', 'Lower',
1992 $ 'No transpose',
1993 $ 'Non-unit', hrows, dim4, one,
1994 $ work( ipu+lnwin*ks ), lnwin,
1995 $ work( ipw3 ), hrows )
1996 CALL dgemm( 'No transpose',
1997 $ 'No transpose', hrows, dim4, ks,
1998 $ one, work( ipw+hrows*dim4),
1999 $ hrows,
2000 $ work( ipu+lnwin*ks+dim4 ), lnwin,
2001 $ one, work( ipw3 ), hrows )
2002 CALL dlamov( 'All', hrows, dim4,
2003 $ work(ipw3), hrows,
2004 $ h((jloc-1)*lldh+iloc), lldh )
2005 END IF
2006 END IF
2007 280 CONTINUE
2008 END IF
2009*
2010 IF( dir.EQ.2 .AND. wantz .AND. lencbuf.GT.0 ) THEN
2011*
2012* Compute Z2*U21 + Z1*U11 on the left side
2013* of border.
2014*
2015 indxe = min(n,1+(nprow-1)*nb)
2016 DO 290 indx = 1, indxe, nb
2017 IF( mycol.EQ.csrc1 ) THEN
2018 CALL infog2l( indx, i, descz, nprow,
2019 $ npcol, myrow, mycol, iloc, jloc,
2020 $ rsrc, csrc1 )
2021 IF( myrow.EQ.rsrc ) THEN
2022 CALL dlamov( 'All', zrows, ks,
2023 $ work( ipw2+zrows*dim4),
2024 $ zrows, work(ipw3), zrows )
2025 CALL dtrmm( 'Right', 'Upper',
2026 $ 'No transpose',
2027 $ 'Non-unit', zrows, ks, one,
2028 $ work( ipu+dim4 ), lnwin,
2029 $ work(ipw3), zrows )
2030 CALL dgemm( 'No transpose',
2031 $ 'No transpose', zrows, ks,
2032 $ dim4, one, work( ipw2 ),
2033 $ zrows, work( ipu ), lnwin,
2034 $ one, work(ipw3), zrows )
2035 CALL dlamov( 'All', zrows, ks,
2036 $ work(ipw3), zrows,
2037 $ z((jloc-1)*lldz+iloc), lldz )
2038 END IF
2039 END IF
2040*
2041* Compute Z1*U12 + Z2*U22 on the right side
2042* of border.
2043*
2044 IF( mycol.EQ.csrc4 ) THEN
2045 CALL infog2l( indx, i+dim1, descz,
2046 $ nprow, npcol, myrow, mycol, iloc,
2047 $ jloc, rsrc, csrc4 )
2048 IF( myrow.EQ.rsrc ) THEN
2049 CALL dlamov( 'All', zrows, dim4,
2050 $ work(ipw2), zrows,
2051 $ work( ipw3 ), zrows )
2052 CALL dtrmm( 'Right', 'Lower',
2053 $ 'No transpose',
2054 $ 'Non-unit', zrows, dim4,
2055 $ one, work( ipu+lnwin*ks ),
2056 $ lnwin, work( ipw3 ), zrows )
2057 CALL dgemm( 'No transpose',
2058 $ 'No transpose', zrows, dim4,
2059 $ ks, one,
2060 $ work( ipw2+zrows*(dim4)),
2061 $ zrows,
2062 $ work( ipu+lnwin*ks+dim4 ),
2063 $ lnwin, one, work( ipw3 ),
2064 $ zrows )
2065 CALL dlamov( 'All', zrows, dim4,
2066 $ work(ipw3), zrows,
2067 $ z((jloc-1)*lldz+iloc), lldz )
2068 END IF
2069 END IF
2070 290 CONTINUE
2071 END IF
2072*
2073 IF( dir.EQ.1 .AND. wantt .AND. lenrbuf.GT.0) THEN
2074 IF ( lkbot.LT.n ) THEN
2075*
2076* Compute U21**T*H2 + U11**T*H1 on the upper
2077* side of the border.
2078*
2079 IF( myrow.EQ.rsrc1.AND.mycol.EQ.csrc4.AND.
2080 $ mod(lkbot,nb).NE.0 ) THEN
2081 indx = lkbot + 1
2082 CALL infog2l( lktop, indx, desch, nprow,
2083 $ npcol, myrow, mycol, iloc, jloc,
2084 $ rsrc1, csrc4 )
2085 CALL dlamov( 'All', ks, hcols,
2086 $ work( ipw1+dim4 ), lnwin,
2087 $ work(ipw3), ks )
2088 CALL dtrmm( 'Left', 'Upper', 'Transpose',
2089 $ 'Non-unit', ks, hcols, one,
2090 $ work( ipu+dim4 ), lnwin,
2091 $ work(ipw3), ks )
2092 CALL dgemm( 'Transpose', 'No transpose',
2093 $ ks, hcols, dim4, one, work(ipu),
2094 $ lnwin, work(ipw1), lnwin,
2095 $ one, work(ipw3), ks )
2096 CALL dlamov( 'All', ks, hcols,
2097 $ work(ipw3), ks,
2098 $ h((jloc-1)*lldh+iloc), lldh )
2099 END IF
2100*
2101* Compute U12**T*H1 + U22**T*H2 one the lower
2102* side of the border.
2103*
2104 IF( myrow.EQ.rsrc4.AND.mycol.EQ.csrc4.AND.
2105 $ mod(lkbot,nb).NE.0 ) THEN
2106 indx = lkbot + 1
2107 CALL infog2l( lktop+dim1, indx, desch,
2108 $ nprow, npcol, myrow, mycol, iloc,
2109 $ jloc, rsrc4, csrc4 )
2110 CALL dlamov( 'All', dim4, hcols,
2111 $ work( ipw1 ), lnwin,
2112 $ work( ipw3 ), dim4 )
2113 CALL dtrmm( 'Left', 'Lower', 'Transpose',
2114 $ 'Non-unit', dim4, hcols, one,
2115 $ work( ipu+lnwin*ks ), lnwin,
2116 $ work( ipw3 ), dim4 )
2117 CALL dgemm( 'Transpose', 'No Transpose',
2118 $ dim4, hcols, ks, one,
2119 $ work( ipu+lnwin*ks+dim4 ), lnwin,
2120 $ work( ipw1+dim1 ), lnwin,
2121 $ one, work( ipw3), dim4 )
2122 CALL dlamov( 'All', dim4, hcols,
2123 $ work(ipw3), dim4,
2124 $ h((jloc-1)*lldh+iloc), lldh )
2125 END IF
2126*
2127* Compute U21**T*H2 + U11**T*H1 on upper side
2128* on border.
2129*
2130 indxs = iceil(lkbot,nb)*nb+1
2131 IF( mod(lkbot,nb).NE.0 ) THEN
2132 indxe = min(n,indxs+(npcol-2)*nb)
2133 ELSE
2134 indxe = min(n,indxs+(npcol-1)*nb)
2135 END IF
2136 DO 300 indx = indxs, indxe, nb
2137 IF( myrow.EQ.rsrc1 ) THEN
2138 CALL infog2l( lktop, indx, desch,
2139 $ nprow, npcol, myrow, mycol, iloc,
2140 $ jloc, rsrc1, csrc )
2141 IF( mycol.EQ.csrc ) THEN
2142 CALL dlamov( 'All', ks, hcols,
2143 $ work( ipw1+dim4 ), lnwin,
2144 $ work(ipw3), ks )
2145 CALL dtrmm( 'Left', 'Upper',
2146 $ 'Transpose', 'Non-unit',
2147 $ ks, hcols, one,
2148 $ work( ipu+dim4 ), lnwin,
2149 $ work(ipw3), ks )
2150 CALL dgemm( 'Transpose',
2151 $ 'No transpose', ks, hcols,
2152 $ dim4, one, work(ipu), lnwin,
2153 $ work(ipw1), lnwin, one,
2154 $ work(ipw3), ks )
2155 CALL dlamov( 'All', ks, hcols,
2156 $ work(ipw3), ks,
2157 $ h((jloc-1)*lldh+iloc), lldh )
2158 END IF
2159 END IF
2160*
2161* Compute U12**T*H1 + U22**T*H2 on lower
2162* side of border.
2163*
2164 IF( myrow.EQ.rsrc4 ) THEN
2165 CALL infog2l( lktop+dim1, indx, desch,
2166 $ nprow, npcol, myrow, mycol, iloc,
2167 $ jloc, rsrc4, csrc )
2168 IF( mycol.EQ.csrc ) THEN
2169 CALL dlamov( 'All', dim4, hcols,
2170 $ work( ipw1 ), lnwin,
2171 $ work( ipw3 ), dim4 )
2172 CALL dtrmm( 'Left', 'Lower',
2173 $ 'Transpose','Non-unit',
2174 $ dim4, hcols, one,
2175 $ work( ipu+lnwin*ks ), lnwin,
2176 $ work( ipw3 ), dim4 )
2177 CALL dgemm( 'Transpose',
2178 $ 'No Transpose', dim4, hcols,
2179 $ ks, one,
2180 $ work( ipu+lnwin*ks+dim4 ),
2181 $ lnwin, work( ipw1+dim1 ),
2182 $ lnwin, one, work( ipw3),
2183 $ dim4 )
2184 CALL dlamov( 'All', dim4, hcols,
2185 $ work(ipw3), dim4,
2186 $ h((jloc-1)*lldh+iloc), lldh )
2187 END IF
2188 END IF
2189 300 CONTINUE
2190 END IF
2191 END IF
2192 END IF
2193*
2194* Update window information - mark processed windows are
2195* completed.
2196*
2197 IF( dir.EQ.2 ) THEN
2198 IF( lkbot.EQ.kbot ) THEN
2199 lktop = kbot+1
2200 lkbot = kbot+1
2201 iwork( 1+(win-1)*5 ) = lktop
2202 iwork( 2+(win-1)*5 ) = lkbot
2203 ELSE
2204 lktop = min( lktop + lnwin - lchain,
2205 $ min( kbot, iceil( lkbot, nb )*nb ) -
2206 $ lchain + 1 )
2207 iwork( 1+(win-1)*5 ) = lktop
2208 lkbot = min( max( lkbot + lnwin - lchain,
2209 $ lktop + nwin - 1), min( kbot,
2210 $ iceil( lkbot, nb )*nb ) )
2211 iwork( 2+(win-1)*5 ) = lkbot
2212 END IF
2213 IF( iwork( 5+(win-1)*5 ).EQ.1 )
2214 $ iwork( 5+(win-1)*5 ) = 2
2215 iwork( 3+(win-1)*5 ) = rsrc4
2216 iwork( 4+(win-1)*5 ) = csrc4
2217 END IF
2218*
2219* If nothing was done for the WIN:th window, all
2220* processors come here and consider the next one
2221* instead.
2222*
2223 245 CONTINUE
2224 240 CONTINUE
2225 190 CONTINUE
2226 150 CONTINUE
2227 140 CONTINUE
2228*
2229* Chased off bulges from first window?
2230*
2231 IF( nprocs.GT.1 )
2232 $ CALL igamx2d( ictxt, 'All', '1-Tree', 1, 1, ichoff, 1,
2233 $ -1, -1, -1, -1, -1 )
2234*
2235* If the bulge was chasen off from first window it is removed.
2236*
2237 IF( ichoff.GT.0 ) THEN
2238 DO 198 win = 2, anmwin
2239 iwork( 1+(win-2)*5 ) = iwork( 1+(win-1)*5 )
2240 iwork( 2+(win-2)*5 ) = iwork( 2+(win-1)*5 )
2241 iwork( 3+(win-2)*5 ) = iwork( 3+(win-1)*5 )
2242 iwork( 4+(win-2)*5 ) = iwork( 4+(win-1)*5 )
2243 198 CONTINUE
2244 anmwin = anmwin - 1
2245 ipiw = 6+(anmwin-1)*5
2246 END IF
2247*
2248* If we have no more windows, return.
2249*
2250 IF( anmwin.LT.1 ) RETURN
2251*
2252* Check for any more windows to bring over the border.
2253*
2254 winfin = 0
2255 DO 199 win = 1, anmwin
2256 winfin = winfin+iwork( 5+(win-1)*5 )
2257 199 CONTINUE
2258 IF( winfin.LT.2*anmwin ) GO TO 137
2259*
2260* Zero out process mark for each window - this is legal now when
2261* the process starts over with local bulge-chasing etc.
2262*
2263 DO 201 win = 1, anmwin
2264 iwork( 5+(win-1)*5 ) = 0
2265 201 CONTINUE
2266*
2267 END IF
2268*
2269* Go back to local bulge-chase and see if there is more work to do.
2270*
2271 GO TO 20
2272*
2273* End of PDLAQR5
2274*
2275 END
subroutine dlaqr6(job, wantt, wantz, kacc22, n, ktop, kbot, nshfts, sr, si, h, ldh, iloz, ihiz, z, ldz, v, ldv, u, ldu, nv, wv, ldwv, nh, wh, ldwh)
Definition dlaqr6.f:4
subroutine infog2l(grindx, gcindx, desc, nprow, npcol, myrow, mycol, lrindx, lcindx, rsrc, csrc)
Definition infog2l.f:3
#define max(A, B)
Definition pcgemr.c:180
#define min(A, B)
Definition pcgemr.c:181
subroutine pdelset(a, ia, ja, desca, alpha)
Definition pdelset.f:2
subroutine pdlaqr5(wantt, wantz, kacc22, n, ktop, kbot, nshfts, sr, si, h, desch, iloz, ihiz, z, descz, work, lwork, iwork, liwork)
Definition pdlaqr5.f:4