ScaLAPACK 2.1: Scalable Linear Algebra PACKage
pzhettrd.f
1  SUBROUTINE pzhettrd( UPLO, N, A, IA, JA, DESCA, D, E, TAU, WORK,
2  $ LWORK, INFO )
3 *
4 * -- ScaLAPACK routine (version 2.0.2) --
5 * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6 * May 1 2012
7 *
8 * .. Scalar Arguments ..
9  CHARACTER UPLO
10  INTEGER IA, INFO, JA, LWORK, N
11 * ..
12 * .. Array Arguments ..
13  INTEGER DESCA( * )
14  DOUBLE PRECISION D( * ), E( * )
15  COMPLEX*16 A( * ), TAU( * ), WORK( * )
16 * ..
17 *
18 * Purpose
19 * =======
20 *
21 *
22 * PZHETTRD reduces a complex Hermitian matrix sub( A ) to Hermitian
23 * tridiagonal form T by a unitary similarity transformation:
24 * Q' * sub( A ) * Q = T, where sub( A ) = A(IA:IA+N-1,JA:JA+N-1).
25 *
26 * Notes
27 * =====
28 *
29 * Each global data object is described by an associated description
30 * vector. This vector stores the information required to establish
31 * the mapping between an object element and its corresponding
32 * process and memory location.
33 *
34 * Let A be a generic term for any 2D block cyclically distributed
35 * array.
36 * Such a global array has an associated description vector DESCA.
37 * In the following comments, the character _ should be read as
38 * "of the global array".
39 *
40 * NOTATION         STORED IN       EXPLANATION
41 * ---------------- --------------- -----------------------------------
42 * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. In this case,
43 *                                  DTYPE_A = 1.
44 * CTXT_A (global)  DESCA( CTXT_ )  The BLACS context handle,
45 *                                  indicating the BLACS process grid
46 *                                  A is distributed over. The context
47 *                                  itself is global, but the handle
48 *                                  (the integer value) may vary.
49 * M_A (global)     DESCA( M_ )     The number of rows in the global
50 *                                  array A.
51 * N_A (global)     DESCA( N_ )     The number of columns in the global
52 *                                  array A.
53 * MB_A (global)    DESCA( MB_ )    The blocking factor used to
54 *                                  distribute the rows of the array.
55 * NB_A (global)    DESCA( NB_ )    The blocking factor used to
56 *                                  distribute the columns of the array.
57 * RSRC_A (global)  DESCA( RSRC_ )  The process row over which the
58 *                                  first row of the array A is distributed.
59 * CSRC_A (global)  DESCA( CSRC_ )  The process column over which the
60 *                                  first column of the array A is
61 *                                  distributed.
62 * LLD_A (local)    DESCA( LLD_ )   The leading dimension of the local
63 *                                  array. LLD_A >= MAX(1,LOCp(M_A)).
64 *
65 * Let K be the number of rows or columns of a distributed matrix,
66 * and assume that its process grid has dimension p x q.
67 * LOCp( K ) denotes the number of elements of K that a process
68 * would receive if K were distributed over the p processes of its
69 * process column.
70 * Similarly, LOCq( K ) denotes the number of elements of K that a
71 * process would receive if K were distributed over the q processes
72 * of its process row.
73 * The values of LOCp() and LOCq() may be determined via a call to
74 * the ScaLAPACK tool function, NUMROC:
75 * LOCp( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
76 * LOCq( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
77 * An upper bound for these quantities may be computed by:
78 * LOCp( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
79 * LOCq( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
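*
* As an illustration, here is a sketch of the block-cyclic counting
* that NUMROC performs (MYDIST, NBLOCKS, LOCP and EXTRABLK are
* scratch names for this sketch only, not variables of this
* routine). LOCp( M ) for the process in grid row MYROW could be
* computed as:
*
*    MYDIST   = MOD( NPROW+MYROW-RSRC_A, NPROW )
*    NBLOCKS  = M / MB_A
*    LOCP     = ( NBLOCKS / NPROW )*MB_A
*    EXTRABLK = MOD( NBLOCKS, NPROW )
*    IF( MYDIST.LT.EXTRABLK ) THEN
*       LOCP = LOCP + MB_A
*    ELSE IF( MYDIST.EQ.EXTRABLK ) THEN
*       LOCP = LOCP + MOD( M, MB_A )
*    END IF
*
* Whole blocks are dealt round-robin from row RSRC_A; the process
* holding the partial block receives the MOD( M, MB_A ) leftover.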
80 *
81 * Arguments
82 * =========
83 *
84 * UPLO (global input) CHARACTER
85 * Specifies whether the upper or lower triangular part of the
86 * Hermitian matrix sub( A ) is stored:
87 * = 'U': Upper triangular
88 * = 'L': Lower triangular
89 *
90 * N (global input) INTEGER
91 * The number of rows and columns to be operated on, i.e. the
92 * order of the distributed submatrix sub( A ). N >= 0.
93 *
94 * A (local input/local output) COMPLEX*16 pointer into the
95 * local memory to an array of dimension (LLD_A,LOCq(JA+N-1)).
96 * On entry, this array contains the local pieces of the
97 * Hermitian distributed matrix sub( A ). If UPLO = 'U', the
98 * leading N-by-N upper triangular part of sub( A ) contains
99 * the upper triangular part of the matrix, and its strictly
100 * lower triangular part is not referenced. If UPLO = 'L', the
101 * leading N-by-N lower triangular part of sub( A ) contains the
102 * lower triangular part of the matrix, and its strictly upper
103 * triangular part is not referenced. On exit, if UPLO = 'U',
104 * the diagonal and first superdiagonal of sub( A ) are over-
105 * written by the corresponding elements of the tridiagonal
106 * matrix T, and the elements above the first superdiagonal,
107 * with the array TAU, represent the unitary matrix Q as a
108 * product of elementary reflectors; if UPLO = 'L', the diagonal
109 * and first subdiagonal of sub( A ) are overwritten by the
110 * corresponding elements of the tridiagonal matrix T, and the
111 * elements below the first subdiagonal, with the array TAU,
112 * represent the unitary matrix Q as a product of elementary
113 * reflectors. See Further Details.
114 *
115 * IA (global input) INTEGER
116 * The row index in the global array A indicating the first
117 * row of sub( A ).
118 *
119 * JA (global input) INTEGER
120 * The column index in the global array A indicating the
121 * first column of sub( A ).
122 *
123 * DESCA (global and local input) INTEGER array of dimension DLEN_.
124 * The array descriptor for the distributed matrix A.
125 *
126 * D (local output) DOUBLE PRECISION array, dimension LOCq(JA+N-1)
127 * The diagonal elements of the tridiagonal matrix T:
128 * D(i) = A(i,i). D is tied to the distributed matrix A.
129 *
130 * E (local output) DOUBLE PRECISION array, dimension LOCq(JA+N-1)
131 * if UPLO = 'U', LOCq(JA+N-2) otherwise. The off-diagonal
132 * elements of the tridiagonal matrix T: E(i) = A(i,i+1) if
133 * UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. E is tied to the
134 * distributed matrix A.
135 *
136 * TAU (local output) COMPLEX*16 array, dimension
137 * LOCq(JA+N-1). This array contains the scalar factors TAU of
138 * the elementary reflectors. TAU is tied to the distributed
139 * matrix A.
140 *
141 * WORK (local workspace) COMPLEX*16 array, dimension (LWORK)
142 * On exit, WORK( 1 ) returns the minimal and optimal workspace size.
143 *
144 * LWORK (local input) INTEGER
145 * The dimension of the array WORK.
146 * LWORK >= 2*( ANB+1 )*( 4*NPS+2 ) + NPS
147 * Where:
148 * NPS = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB )
149 * ANB = PJLAENV( DESCA( CTXT_ ), 3, 'PZHETTRD', 'L', 0, 0,
150 * 0, 0 )
151 *
152 * NUMROC is a ScaLAPACK tool function;
153 * PJLAENV is a ScaLAPACK environmental inquiry function;
154 * MYROW, MYCOL, NPROW and NPCOL can be determined by calling
155 * the subroutine BLACS_GRIDINFO.
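*
* For example, a caller could size WORK with the same arithmetic
* the routine uses internally (a sketch; it assumes ICTXT, N and
* NPROW have already been obtained via BLACS_GRIDINFO):
*
*    ANB   = PJLAENV( ICTXT, 3, 'PZHETTRD', 'L', 0, 0, 0, 0 )
*    NPS   = MAX( NUMROC( N, 1, 0, 0, NPROW ), 2*ANB )
*    LWMIN = 2*( ANB+1 )*( 4*NPS+2 ) + NPS
*    LWORK = LWMIN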
156 *
157 * INFO (global output) INTEGER
158 * = 0: successful exit
159 * < 0: If the i-th argument is an array and the j-th entry had
160 * an illegal value, then INFO = -(i*100+j); if the i-th
161 * argument is a scalar and had an illegal value, then
162 * INFO = -i.
163 *
164 * Further Details
165 * ===============
166 *
167 * If UPLO = 'U', the matrix Q is represented as a product of
168 * elementary reflectors
169 *
170 * Q = H(n-1) . . . H(2) H(1).
171 *
172 * Each H(i) has the form
173 *
174 * H(i) = I - tau * v * v'
175 *
176 * where tau is a complex scalar, and v is a complex vector with
177 * v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in
178 * A(ia:ia+i-2,ja+i), and tau in TAU(ja+i-1).
179 *
180 * If UPLO = 'L', the matrix Q is represented as a product of
181 * elementary reflectors
182 *
183 * Q = H(1) H(2) . . . H(n-1).
184 *
185 * Each H(i) has the form
186 *
187 * H(i) = I - tau * v * v'
188 *
189 * where tau is a complex scalar, and v is a complex vector with
190 * v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in
191 * A(ia+i+1:ia+n-1,ja+i-1), and tau in TAU(ja+i-1).
192 *
193 * The contents of sub( A ) on exit are illustrated by the following
194 * examples with n = 5:
195 *
196 * if UPLO = 'U': if UPLO = 'L':
197 *
198 * ( d e v2 v3 v4 ) ( d )
199 * ( d e v3 v4 ) ( e d )
200 * ( d e v4 ) ( v1 e d )
201 * ( d e ) ( v1 v2 e d )
202 * ( d ) ( v1 v2 v3 e d )
203 *
204 * where d and e denote diagonal and off-diagonal elements of T, and
205 * vi denotes an element of the vector defining H(i).
206 *
207 * Data storage requirements
208 * =========================
209 *
210 * PZHETTRD is not intended to be called directly. All users are
211 * encouraged to call PZHETRD, which will then call PZHETTRD if
212 * appropriate. A must be in cyclic format (i.e. MB = NB = 1),
213 * the process grid must be square ( i.e. NPROW = NPCOL ) and
214 * only lower triangular storage is supported.
215 *
216 * Local variables
217 * ===============
218 *
219 * PZHETTRD uses five local arrays:
220 * WORK ( InV ) dimension ( NP, ANB+1): array V
221 * WORK ( InH ) dimension ( NP, ANB+1): array H
222 * WORK ( InVT ) dimension ( NQ, ANB+1): transpose of the array V
223 * WORK ( InHT ) dimension ( NQ, ANB+1): transpose of the array H
224 * WORK ( InVTT ) dimension ( NQ, 1): transpose of the array VT
225 *
226 * Arrays V and H are replicated across all processor columns.
227 * Arrays V^T and H^T are replicated across all processor rows.
228 *
229 * WORK ( InVT ), or V^T, is stored as a tall skinny
230 * array ( NQ x ANB-1 ) for efficiency. Since only the lower
231 * triangular portion of A is updated, Av is computed as:
232 * tril(A) * v + v^T * tril(A,-1). This is performed as
233 * two local triangular matrix-vector multiplications (both in
234 * MVR2) followed by a transpose and a sum across the columns.
235 * In the local computation, WORK( InVT ) is used to compute
236 * tril(A) * v and WORK( InV ) is used to compute
237 * v^T * tril(A,-1)
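*
* In serial terms this is the familiar trick for multiplying by a
* Hermitian matrix held only in its lower triangle. A hedged sketch
* using standard Level 2 BLAS (Y and W are scratch vectors assumed
* for the illustration; ZTRMVT fuses both products into one pass
* over A):
*
*    CALL ZCOPY( N, V, 1, Y, 1 )
*    CALL ZTRMV( 'L', 'N', 'N', N, A, LDA, Y, 1 )
*    CALL ZCOPY( N, V, 1, W, 1 )
*    CALL ZTRMV( 'L', 'C', 'U', N, A, LDA, W, 1 )
*
* Here Y = tril(A)*v and W = v + tril(A,-1)'*v (unit diagonal), so
* A*v = Y + W - V, since A = tril(A) + tril(A,-1)' for Hermitian A.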
238 *
239 * The following variables are global indices into A:
240 * INDEX: The current global row and column number.
241 * MAXINDEX: The global row and column for the first row and
242 * column in the trailing block of A.
243 * LIIB, LIJB: The first local row, column in the trailing block of A.
244 *
245 * The following variables point into the arrays A, V, H, V^T, H^T:
246 * BINDEX = INDEX - MININDEX: The column index in V, H, V^T, H^T.
247 * LII: local index I: The local row number for row INDEX
248 * LIJ: local index J: The local column number for column INDEX
249 * LIIP1: local index I+1: The local row number for row INDEX+1
250 * LIJP1: local index J+1: The local col number for col INDEX+1
251 * LTLI: lower triangular local index I: The local row for the
252 * upper left entry in tril( A(INDEX, INDEX) )
253 * LTLIP1: lower triangular local index I+1: The local row for the
254 * upper left entry in tril( A(INDEX+1, INDEX+1) )
255 *
256 * Details: The distinction between LII and LTLI (and between
257 * LIIP1 and LTLIP1) is subtle. Within the current processor
258 * column (i.e. MYCOL .eq. CURCOL) they are the same. However,
259 * on some processors, A( LII, LIJ ) points to an element
260 * above the diagonal; on these processors, LTLI = LII+1.
261 *
262 * The following variables give the number of rows and/or columns
263 * in various matrices:
264 * NP: The number of local rows in A( 1:N, 1:N )
265 * NQ: The number of local columns in A( 1:N, 1:N )
266 * NPM0: The number of local rows in A( INDEX:N, INDEX:N )
267 * NQM0: The number of local columns in A( INDEX:N, INDEX:N )
268 * NPM1: The number of local rows in A( INDEX+1:N, INDEX:N )
269 * NQM1: The number of local columns in A( INDEX+1:N, INDEX:N )
270 * LTNM0: The number of local rows & columns in
271 * tril( A( INDEX:N, INDEX:N ) )
272 * LTNM1: The number of local rows & columns in
273 * tril( A( INDEX+1:N, INDEX+1:N ) )
274 * NOTE: LTNM0 == LTNM1 on all processors except the diagonal
275 * processors, i.e. those where MYCOL == MYROW.
276 *
277 * Invariants:
278 * NP = NPM0 + LII - 1
279 * NQ = NQM0 + LIJ - 1
280 * NP = NPM1 + LIIP1 - 1
281 * NQ = NQM1 + LIJP1 - 1
282 * NP = LTLI + LTNM0 - 1
283 * NP = LTLIP1 + LTNM1 - 1
284 *
285 * Temporary variables. The following variables are used within
286 * a few lines after they are set and do not hold state from one loop
287 * iteration to the next:
288 *
289 * The matrix A:
290 * The matrix A does not hold the same values that it would
291 * in an unblocked code nor the values that it would hold
292 * in a blocked code.
293 *
294 * The value of A is confusing. It is easiest to state the
295 * difference between trueA and A at the point that MVR2 is called,
296 * so we will start there.
297 *
298 * Let trueA be the value that A would
299 * have at a given point in an unblocked code and A
300 * be the value that A has in this code at the same point.
301 *
302 * At the time of the call to MVR2,
303 * trueA = A + V' * H + H' * V
304 * where H = H( MAXINDEX:N, 1:BINDEX ) and
305 * V = V( MAXINDEX:N, 1:BINDEX ).
306 *
307 * At the bottom of the inner loop,
308 * trueA = A + V' * H + H' * V + v' * h + h' * v
309 * where H = H( MAXINDEX:N, 1:BINDEX ) and
310 * V = V( MAXINDEX:N, 1:BINDEX ) and
311 * v = V( liip1:N, BINDEX+1 ) and
312 * h = H( liip1:N, BINDEX+1 )
313 *
314 * At the top of the loop, BINDEX gets incremented, hence:
315 * trueA = A + V' * H + H' * V + v' * h + h' * v
316 * where H = H( MAXINDEX:N, 1:BINDEX-1 ) and
317 * V = V( MAXINDEX:N, 1:BINDEX-1 ) and
318 * v = V( liip1:N, BINDEX ) and
319 * h = H( liip1:N, BINDEX )
320 *
321 *
322 * A gets updated at the bottom of the outer loop
323 * After this update, trueA = A + v' * h + h' * v
324 * where v = V( liip1:N, BINDEX ) and
325 * h = H( liip1:N, BINDEX ) and BINDEX = 0
326 * Indeed, the previous loop invariant as stated above for the
327 * top of the loop still holds, but with BINDEX = 0, H and V
328 * are null matrices.
329 *
330 * After the current column of A is updated,
331 * trueA( INDEX, INDEX:N ) = A( INDEX, INDEX:N )
332 * the rest of A is untouched.
333 *
334 * After the current block column of A is updated,
335 * trueA = A + V' * H + H' * V
336 * where H = H( MAXINDEX:N, 1:BINDEX ) and
337 * V = V( MAXINDEX:N, 1:BINDEX )
338 *
339 * This brings us back to the point at which MVR2 is called.
340 *
341 *
342 * Details of the parallelization:
343 *
344 * We delay spreading v across to all processor columns (which
345 * would naturally happen at the bottom of the loop) in order to
346 * combine the spread of v( : , i-1 ) with the spread of h( : , i )
347 *
348 * In order to compute h( :, i ), we must update A( :, i )
349 * which means that the processor column owning A( :, i ) must
350 * have: c, tau, v( i, i ) and h( i, i ).
351 *
352 * The traditional
353 * way of computing v (and the one used in pzlatrd.f and
354 * zlatrd.f) is:
355 * v = tau * v
356 * c = v' * h
357 * alpha = - tau * c / 2
358 * v = v + alpha * h
359 * However, the traditional way of computing v requires that tau
360 * be broadcast to all processors in the current column (to compute
361 * v = tau * v) and then a sum-to-all is required (to
362 * compute v' * h ). We use the following formula instead:
363 * c = v' * h
364 * v = tau * ( v - c * tau' * h / 2 )
365 * The above formula allows tau to be spread down in the
366 * same call to DGSUM2D which performs the sum-to-all of c.
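*
* The two formulations agree: with c = v' * h, the formula used
* here expands to
*    v := tau * v - tau * conjg( tau ) * c * h / 2
* which is exactly what the traditional sequence produces, since
* its c equals conjg( tau ) * ( v' * h ) after the initial scaling
* v = tau * v, so that alpha * h = -tau * conjg( tau ) * (v'*h) * h / 2.
* The only difference is when tau must be known, not what is
* computed.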
367 *
368 * The computation of v, which could be performed in any processor
369 * column (or other processor subsets), is performed in the
370 * processor column that owns A( :, i+1 ) so that A( :, i+1 )
371 * can be updated prior to spreading v across.
372 *
373 * We keep the block column of A up-to-date to minimize the
374 * work required in updating the current column of A. Updating
375 * the block column of A is reasonably load balanced whereas
376 * updating the current column of A is not (only the current
377 * processor column is involved).
378 *
379 * In the following overview of the steps performed, M in the
380 * margin indicates message traffic and C indicates O(n^2 nb/sqrt(p))
381 * or more flops per processor.
382 *
383 * Inner loop:
384 * A( index:n, index ) -= ( v * ht(bindex) + h * vt( bindex) )
385 *M h = house( A(index:n, index) )
386 *M Spread v, h across
387 *M vt = v^T; ht = h^T
388 * A( index+1:n, index+1:maxindex ) -=
389 * ( v * ht(index+1:maxindex) + h *vt(index+1:maxindex) )
390 *C v = tril(A) * h; vt = ht * tril(A,-1)
391 *MorC v = v - H*V*h - V*H*h
392 *M v = v + vt^T
393 *M c = v' * h
394 * v = tau * ( v - c * tau' * h / 2 )
395 *C A = A - H*V - V*H
396 *
397 *
398 *
399 * =================================================================
400 *
401 * .. Parameters ..
402  INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_,
403  $ mb_, nb_, rsrc_, csrc_, lld_
404  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
405  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
406  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
407  DOUBLE PRECISION ONE
408  parameter( one = 1.0d0 )
409  COMPLEX*16 Z_ONE, Z_NEGONE, Z_ZERO
410  parameter( z_one = 1.0d0, z_negone = -1.0d0,
411  $ z_zero = 0.0d0 )
412  DOUBLE PRECISION ZERO
413  parameter( zero = 0.0d+0 )
414 * ..
415 *
416 *
417 * .. Local Scalars ..
418 *
419 *
420  LOGICAL BALANCED, INTERLEAVE, TWOGEMMS, UPPER
421  INTEGER ANB, BINDEX, CURCOL, CURROW, I, ICTXT, INDEX,
422  $ indexa, indexinh, indexinv, inh, inhb, inht,
423  $ inhtb, intmp, inv, invb, invt, invtb, j, lda,
424  $ ldv, ldzg, lii, liib, liip1, lij, lijb, lijp1,
425  $ ltlip1, ltnm1, lwmin, maxindex, minindex,
426  $ mycol, myfirstrow, myrow, mysetnum, nbzg, np,
427  $ npb, npcol, npm0, npm1, nprow, nps, npset, nq,
428  $ nqb, nqm1, numrows, nxtcol, nxtrow, pbmax,
429  $ pbmin, pbsize, pnb, rowsperproc
430  DOUBLE PRECISION NORM, SAFMAX, SAFMIN
431  COMPLEX*16 ALPHA, BETA, C, CONJTOPH, CONJTOPV,
432  $ oneoverbeta, toph, topnv, toptau, topv
433 * ..
434 * .. Local Arrays ..
435 *
436 *
437 *
438 *
439  INTEGER IDUM1( 1 ), IDUM2( 1 )
440  DOUBLE PRECISION DTMP( 5 )
441  COMPLEX*16 CC( 3 )
442 * ..
443 * .. External Subroutines ..
444  EXTERNAL blacs_gridinfo, chk1mat, dcombnrm2, dgebr2d,
445  $ dgebs2d, dgsum2d, pchk1mat, pdtreecomb,
446  $ pxerbla, zgebr2d, zgebs2d, zgemm, zgemv,
447  $ zgerv2d, zgesd2d, zgsum2d, zlamov, zscal,
448  $ ztrmvt
449 * ..
450 * .. External Functions ..
451 *
452  LOGICAL LSAME
453  INTEGER ICEIL, NUMROC, PJLAENV
454  DOUBLE PRECISION DZNRM2, PDLAMCH
455  EXTERNAL lsame, iceil, numroc, pjlaenv, dznrm2, pdlamch
456 * ..
457 * .. Intrinsic Functions ..
458  INTRINSIC dble, dcmplx, dconjg, dimag, ichar, max, min,
459  $ mod, sign, sqrt
460 * ..
461 *
462 *
463 * .. Executable Statements ..
464 * This is just to keep ftnchek and toolpack/1 happy
465  IF( block_cyclic_2d*csrc_*ctxt_*dlen_*dtype_*lld_*mb_*m_*nb_*n_*
466  $ rsrc_.LT.0 )RETURN
467 *
468 *
469 *
470 * Further details
471 * ===============
472 *
473 * At the top of the loop, v and nh have been computed but not
474 * spread across. Hence, A is out-of-date even after the
475 * rank 2k update. Furthermore, we compute the next v before
476 * nh is spread across.
477 *
478 * I claim that if we used a sum-to-all on NV, by summing CC within
479 * each column, we could compute NV locally and could avoid
480 * spreading V across. Bruce claims that sum-to-all can be made
481 * to cost no more than sum-to-one on the Paragon. If that is
482 * true, this would be a win. But the BLACS sum-to-all is just
483 * a sum-to-one followed by a broadcast, and hence the present
484 * scheme is better for now.
485 *
486 * Get grid parameters
487 *
488  ictxt = desca( ctxt_ )
489  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
490 *
491  safmax = sqrt( pdlamch( ictxt, 'O' ) ) / n
492  safmin = sqrt( pdlamch( ictxt, 'S' ) )
493 *
494 * Test the input parameters
495 *
496  info = 0
497  IF( nprow.EQ.-1 ) THEN
498  info = -( 600+ctxt_ )
499  ELSE
500 *
501 * Here we set execution options for PZHETTRD
502 *
503  pnb = pjlaenv( ictxt, 2, 'PZHETTRD', 'L', 0, 0, 0, 0 )
504  anb = pjlaenv( ictxt, 3, 'PZHETTRD', 'L', 0, 0, 0, 0 )
505 *
506  interleave = ( pjlaenv( ictxt, 4, 'PZHETTRD', 'L', 1, 0, 0,
507  $ 0 ).EQ.1 )
508  twogemms = ( pjlaenv( ictxt, 4, 'PZHETTRD', 'L', 2, 0, 0,
509  $ 0 ).EQ.1 )
510  balanced = ( pjlaenv( ictxt, 4, 'PZHETTRD', 'L', 3, 0, 0,
511  $ 0 ).EQ.1 )
512 *
513  CALL chk1mat( n, 2, n, 2, ia, ja, desca, 6, info )
514 *
515 *
516  upper = lsame( uplo, 'U' )
517  IF( info.EQ.0 .AND. desca( nb_ ).NE.1 )
518  $ info = 600 + nb_
519  IF( info.EQ.0 ) THEN
520 *
521 *
522 * Here is the arithmetic:
523 * Let maxnpq = max( np, nq, 2 * ANB )
524 * LDV = 4 * max( np, nq ) + 2
525 * LWMIN = 2 * ( ANB + 1 ) * LDV + MAX( np, 2 * ANB )
526 * = 2 * ( ANB + 1 ) * ( 4 * NPS + 2 ) + NPS
527 *
528 * This overestimates memory requirements when ANB > NP/2
529 * Memory requirements are lower when interleave = .false.
530 * Hence, we could have two sets of memory requirements,
531 * one for interleave and one for non-interleaved storage.
532 *
533 *
534  nps = max( numroc( n, 1, 0, 0, nprow ), 2*anb )
535  lwmin = 2*( anb+1 )*( 4*nps+2 ) + nps
536 *
537  work( 1 ) = dcmplx( lwmin )
538  IF( .NOT.lsame( uplo, 'L' ) ) THEN
539  info = -1
540  ELSE IF( ia.NE.1 ) THEN
541  info = -4
542  ELSE IF( ja.NE.1 ) THEN
543  info = -5
544  ELSE IF( nprow.NE.npcol ) THEN
545  info = -( 600+ctxt_ )
546  ELSE IF( desca( dtype_ ).NE.1 ) THEN
547  info = -( 600+dtype_ )
548  ELSE IF( desca( mb_ ).NE.1 ) THEN
549  info = -( 600+mb_ )
550  ELSE IF( desca( nb_ ).NE.1 ) THEN
551  info = -( 600+nb_ )
552  ELSE IF( desca( rsrc_ ).NE.0 ) THEN
553  info = -( 600+rsrc_ )
554  ELSE IF( desca( csrc_ ).NE.0 ) THEN
555  info = -( 600+csrc_ )
556  ELSE IF( lwork.LT.lwmin ) THEN
557  info = -11
558  END IF
559  END IF
560  IF( upper ) THEN
561  idum1( 1 ) = ichar( 'U' )
562  ELSE
563  idum1( 1 ) = ichar( 'L' )
564  END IF
565  idum2( 1 ) = 1
566 *
567  CALL pchk1mat( n, 2, n, 2, ia, ja, desca, 6, 1, idum1, idum2,
568  $ info )
569  END IF
570 *
571  IF( info.NE.0 ) THEN
572  CALL pxerbla( ictxt, 'PZHETTRD', -info )
573  RETURN
574  END IF
575 *
576 * Quick return if possible
577 *
578  IF( n.EQ.0 )
579  $ RETURN
580 *
581 *
582 *
583 * Reduce the lower triangle of sub( A )
584  np = numroc( n, 1, myrow, 0, nprow )
585  nq = numroc( n, 1, mycol, 0, npcol )
586 *
587  nxtrow = 0
588  nxtcol = 0
589 *
590  liip1 = 1
591  lijp1 = 1
592  npm1 = np
593  nqm1 = nq
594 *
595  lda = desca( lld_ )
596  ictxt = desca( ctxt_ )
597 *
598 *
599 *
600 * Miscellaneous details:
601 * Put tau, D and E in the right places
602 * Check signs
603 * Place all the arrays in WORK, control their placement
604 * in memory.
605 *
606 *
607 *
608 * Loop invariants
609 * A(LIIP1, LIJ) points to the first element of A(I+1,J)
610 * NPM1,NQM1 = the number of rows, cols in A( LII+1:N,LIJ+1:N )
611 * A(LII:N,LIJ:N) is one step out of date.
612 * proc( CURROW, CURCOL ) owns A(LII,LIJ)
613 * proc( NXTROW, CURCOL ) owns A(LIIP1,LIJ)
614 *
615  inh = 1
616 *
617  IF( interleave ) THEN
618 *
619 * H and V are interleaved to minimize memory movement
620 * LDV has to be twice as large to accommodate interleaving.
621 * In addition, LDV is doubled again to allow v, h and
622 * toptau to be spread across and transposed in a
623 * single communication operation with minimum memory
624 * movement.
625 *
626 * We could reduce LDV back to 2*MAX(NPM1,NQM1)
627 * by increasing the memory movement required in
628 * the spread and transpose of v, h and toptau.
629 * However, since the non-interleaved path already
630 * provides a near-minimum memory requirement option,
631 * we did not provide this additional path.
632 *
633  ldv = 4*( max( npm1, nqm1 ) ) + 2
634 *
635  inh = 1
636 *
637  inv = inh + ldv / 2
638  invt = inh + ( anb+1 )*ldv
639 *
640  inht = invt + ldv / 2
641  intmp = invt + ldv*( anb+1 )
642 *
643  ELSE
644  ldv = max( npm1, nqm1 )
645 *
646  inht = inh + ldv*( anb+1 )
647  inv = inht + ldv*( anb+1 )
648 *
649 * The code works without this +1, but only because of a
650 * coincidence. Without the +1, WORK(INVT) gets trashed, but
651 * WORK(INVT) is only used once and when it is used, it is
652 * multiplied by WORK( INH ) which is zero. Hence, the fact
653 * that WORK(INVT) is trashed has no effect.
654 *
655  invt = inv + ldv*( anb+1 ) + 1
656  intmp = invt + ldv*( 2*anb )
657 *
658  END IF
659 *
660  IF( info.NE.0 ) THEN
661  CALL pxerbla( ictxt, 'PZHETTRD', -info )
662  work( 1 ) = dcmplx( lwmin )
663  RETURN
664  END IF
665 *
666 *
667 * The code below maintains the loop invariant: trueA = A - V * HT - H * VT,
668 * (where V, H, VT and HT all have BINDEX+1 rows/columns)
669 * the first ANB times through the loop.
670 *
671 *
672 *
673 * Setting either ( InH and InHT ) or InV to Z_ZERO
674 * is adequate except in the face of NaNs.
675 *
676 *
677  DO 10 i = 1, np
678  work( inh+i-1 ) = z_zero
679  work( inv+i-1 ) = z_zero
680  10 CONTINUE
681  DO 20 i = 1, nq
682  work( inht+i-1 ) = z_zero
683  20 CONTINUE
684 *
685 *
686 *
687  topnv = z_zero
688 *
689  ltlip1 = lijp1
690  ltnm1 = npm1
691  IF( mycol.GT.myrow ) THEN
692  ltlip1 = ltlip1 + 1
693  ltnm1 = ltnm1 - 1
694  END IF
695 *
696 *
697  DO 210 minindex = 1, n - 1, anb
698 *
699 *
700  maxindex = min( minindex+anb-1, n )
701  lijb = numroc( maxindex, 1, mycol, 0, npcol ) + 1
702  liib = numroc( maxindex, 1, myrow, 0, nprow ) + 1
703 *
704  nqb = nq - lijb + 1
705  npb = np - liib + 1
706  inhtb = inht + lijb - 1
707  invtb = invt + lijb - 1
708  inhb = inh + liib - 1
709  invb = inv + liib - 1
710 *
711 *
712 *
713 *
714  DO 160 index = minindex, min( maxindex, n-1 )
715 *
716  bindex = index - minindex
717 *
718  currow = nxtrow
719  curcol = nxtcol
720 *
721  nxtrow = mod( currow+1, nprow )
722  nxtcol = mod( curcol+1, npcol )
723 *
724  lii = liip1
725  lij = lijp1
726  npm0 = npm1
727 *
728  IF( myrow.EQ.currow ) THEN
729  npm1 = npm1 - 1
730  liip1 = liip1 + 1
731  END IF
732  IF( mycol.EQ.curcol ) THEN
733  nqm1 = nqm1 - 1
734  lijp1 = lijp1 + 1
735  ltlip1 = ltlip1 + 1
736  ltnm1 = ltnm1 - 1
737  END IF
738 *
739 *
740 *
741 *
742 * V = NV, VT = NVT, H = NH, HT = NHT
743 *
744 *
745 * Update the current column of A
746 *
747 *
748  IF( mycol.EQ.curcol ) THEN
749 *
750  indexa = lii + ( lij-1 )*lda
751  indexinv = inv + lii - 1 + ( bindex-1 )*ldv
752  indexinh = inh + lii - 1 + ( bindex-1 )*ldv
753  conjtoph = dconjg( work( inht+lij-1+bindex*ldv ) )
754  conjtopv = dconjg( topnv )
755 *
756  IF( index.GT.1 ) THEN
757  DO 30 i = 0, npm0 - 1
758 * A( INDEXA+I ) = A( INDEXA+I )
759  a( indexa+i ) = a( indexa+i ) -
760  $ work( indexinv+ldv+i )*conjtoph -
761  $ work( indexinh+ldv+i )*conjtopv
762  30 CONTINUE
763  END IF
764 *
765 *
766  END IF
767 *
768 *
769  IF( mycol.EQ.curcol ) THEN
770 *
771 * Compute the householder vector
772 *
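* The quantities needed by every process in this column are
* packed into DTMP: DTMP(1) holds this process's share of the
* squared column norm, DTMP(2) the (real) diagonal entry,
* DTMP(3:4) the real and imaginary parts of the subdiagonal
* entry, and DTMP(5) a flag requesting a safe recomputation of
* the norm via PDTREECOMB.
*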
773  IF( myrow.EQ.currow ) THEN
774  dtmp( 2 ) = dble( a( lii+( lij-1 )*lda ) )
775  ELSE
776  dtmp( 2 ) = zero
777  END IF
778  IF( myrow.EQ.nxtrow ) THEN
779  dtmp( 3 ) = dble( a( liip1+( lij-1 )*lda ) )
780  dtmp( 4 ) = dimag( a( liip1+( lij-1 )*lda ) )
781  ELSE
782  dtmp( 3 ) = zero
783  dtmp( 4 ) = zero
784  END IF
785 *
786  norm = dznrm2( npm1, a( liip1+( lij-1 )*lda ), 1 )
787  dtmp( 1 ) = norm
788 *
789 * IF DTMP(5) = 1.0, NORM is too large and might cause
790 * overflow, hence PDTREECOMB must be called. IF DTMP(5)
791 * is zero on output, DTMP(1) can be trusted.
792 *
793  dtmp( 5 ) = zero
794  IF( dtmp( 1 ).GE.safmax .OR. dtmp( 1 ).LT.safmin ) THEN
795  dtmp( 5 ) = one
796  dtmp( 1 ) = zero
797  END IF
798 *
799  dtmp( 1 ) = dtmp( 1 )*dtmp( 1 )
800  CALL dgsum2d( ictxt, 'C', ' ', 5, 1, dtmp, 5, -1,
801  $ curcol )
802  IF( dtmp( 5 ).EQ.zero ) THEN
803  dtmp( 1 ) = sqrt( dtmp( 1 ) )
804  ELSE
805  dtmp( 1 ) = norm
806  CALL pdtreecomb( ictxt, 'C', 1, dtmp, -1, mycol,
807  $ dcombnrm2 )
808  END IF
809 *
810  norm = dtmp( 1 )
811 *
812  d( lij ) = dtmp( 2 )
813  IF( myrow.EQ.currow .AND. mycol.EQ.curcol ) THEN
814  a( lii+( lij-1 )*lda ) = dcmplx( d( lij ), zero )
815  END IF
816 *
817 *
818  alpha = dcmplx( dtmp( 3 ), dtmp( 4 ) )
819 *
820  norm = sign( norm, dble( alpha ) )
821 *
822  IF( norm.EQ.zero ) THEN
823  toptau = zero
824  ELSE
825  beta = norm + alpha
826  toptau = beta / norm
827  oneoverbeta = 1.0d0 / beta
828 *
829  CALL zscal( npm1, oneoverbeta,
830  $ a( liip1+( lij-1 )*lda ), 1 )
831  END IF
832 *
833  IF( myrow.EQ.nxtrow ) THEN
834  a( liip1+( lij-1 )*lda ) = z_one
835  END IF
836 *
837  tau( lij ) = toptau
838  e( lij ) = -norm
839 *
840  END IF
841 *
842 *
843 * Spread v, nh, toptau across
844 *
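* The previous v (already in the first NPM1 slots), the new
* Householder vector taken from the current column of A, and
* TOPTAU are packed into one contiguous buffer of length
* 2*NPM1+1, so a single row broadcast spreads all three at once.
*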
845  DO 40 i = 0, npm1 - 1
846  work( inv+liip1-1+bindex*ldv+npm1+i ) = a( liip1+i+
847  $ ( lij-1 )*lda )
848  40 CONTINUE
849 *
850  IF( mycol.EQ.curcol ) THEN
851  work( inv+liip1-1+bindex*ldv+npm1+npm1 ) = toptau
852  CALL zgebs2d( ictxt, 'R', ' ', npm1+npm1+1, 1,
853  $ work( inv+liip1-1+bindex*ldv ),
854  $ npm1+npm1+1 )
855  ELSE
856  CALL zgebr2d( ictxt, 'R', ' ', npm1+npm1+1, 1,
857  $ work( inv+liip1-1+bindex*ldv ),
858  $ npm1+npm1+1, myrow, curcol )
859  toptau = work( inv+liip1-1+bindex*ldv+npm1+npm1 )
860  END IF
861  DO 50 i = 0, npm1 - 1
862  work( inh+liip1-1+( bindex+1 )*ldv+i ) = work( inv+liip1-
863  $ 1+bindex*ldv+npm1+i )
864  50 CONTINUE
865 *
866  IF( index.LT.n ) THEN
867  IF( myrow.EQ.nxtrow .AND. mycol.EQ.curcol )
868  $ a( liip1+( lij-1 )*lda ) = e( lij )
869  END IF
870 *
871 * Transpose v, nh
872 *
873 *
874  IF( myrow.EQ.mycol ) THEN
875  DO 60 i = 0, npm1 + npm1
876  work( invt+lijp1-1+bindex*ldv+i ) = work( inv+liip1-1+
877  $ bindex*ldv+i )
878  60 CONTINUE
879  ELSE
880  CALL zgesd2d( ictxt, npm1+npm1, 1,
881  $ work( inv+liip1-1+bindex*ldv ), npm1+npm1,
882  $ mycol, myrow )
883  CALL zgerv2d( ictxt, nqm1+nqm1, 1,
884  $ work( invt+lijp1-1+bindex*ldv ), nqm1+nqm1,
885  $ mycol, myrow )
886  END IF
887 *
888  DO 70 i = 0, nqm1 - 1
889  work( inht+lijp1-1+( bindex+1 )*ldv+i ) = work( invt+
890  $ lijp1-1+bindex*ldv+nqm1+i )
891  70 CONTINUE
892 *
893 *
894 * Update the current block column of A
895 *
896  IF( index.GT.1 ) THEN
897  DO 90 j = lijp1, lijb - 1
898  DO 80 i = 0, npm1 - 1
899 *
900  a( liip1+i+( j-1 )*lda ) = a( liip1+i+( j-1 )*lda )
901  $ - work( inv+liip1-1+bindex*ldv+i )*
902  $ dconjg( work( inht+j-1+bindex*ldv ) ) -
903  $ work( inh+liip1-1+bindex*ldv+i )*
904  $ dconjg( work( invt+j-1+bindex*ldv ) )
905  80 CONTINUE
906  90 CONTINUE
907  END IF
908 *
909 *
910 *
911 * Compute NV = A * NHT; NVT = A * NH
912 *
913 * These two lines are necessary because these elements
914 * are not always involved in the calls to ZTRMVT
915 * for two reasons:
916 * 1) On diagonal processors, the call to ZTRMVT
917 * involves only LTNM1-1 elements
918 * 2) On some processes, NQM1 < LTNM1 or LIIP1 < LTLIP1
919 * and when the results are combined across all processes,
920 * uninitialized values may be included.
921  work( inv+liip1-1+( bindex+1 )*ldv ) = z_zero
922  work( invt+lijp1-1+( bindex+1 )*ldv+nqm1-1 ) = z_zero
923 *
924 *
925  IF( myrow.EQ.mycol ) THEN
926  IF( ltnm1.GT.1 ) THEN
927  CALL ztrmvt( 'L', ltnm1-1,
928  $ a( ltlip1+1+( lijp1-1 )*lda ), lda,
929  $ work( invt+lijp1-1+( bindex+1 )*ldv ), 1,
930  $ work( inh+ltlip1+1-1+( bindex+1 )*ldv ),
931  $ 1, work( inv+ltlip1+1-1+( bindex+1 )*
932  $ ldv ), 1, work( inht+lijp1-1+( bindex+
933  $ 1 )*ldv ), 1 )
934  END IF
935  DO 100 i = 1, ltnm1
936  work( invt+lijp1+i-1-1+( bindex+1 )*ldv )
937  $ = work( invt+lijp1+i-1-1+( bindex+1 )*ldv ) +
938  $ a( ltlip1+i-1+( lijp1+i-1-1 )*lda )*
939  $ work( inh+ltlip1+i-1-1+( bindex+1 )*ldv )
940  100 CONTINUE
941  ELSE
942  IF( ltnm1.GT.0 )
943  $ CALL ztrmvt( 'L', ltnm1, a( ltlip1+( lijp1-1 )*lda ),
944  $ lda, work( invt+lijp1-1+( bindex+1 )*
945  $ ldv ), 1, work( inh+ltlip1-1+( bindex+
946  $ 1 )*ldv ), 1, work( inv+ltlip1-1+
947  $ ( bindex+1 )*ldv ), 1,
948  $ work( inht+lijp1-1+( bindex+1 )*ldv ),
949  $ 1 )
950 *
951  END IF
952 *
953 *
954 * We take advantage of the fact that:
955 * A * sum( B ) = sum ( A * B ) for matrices A,B
956 *
957 * trueA = A + V * HT + H * VT
958 * hence: (trueA)v = Av + V * HT * v + H * VT * v
959 * VT * v = sum_p_in_NPROW ( VTp * v )
960 * H * VT * v = H * sum (VTp * v) = sum ( H * VTp * v )
961 *
962 * v = v + V * HT * h + H * VT * h
963 *
964 *
965 *
966 * tmp = HT * nh1
967  DO 110 i = 1, 2*( bindex+1 )
968  work( intmp-1+i ) = 0
969  110 CONTINUE
970 *
971  IF( balanced ) THEN
972  npset = nprow
973  mysetnum = myrow
974  rowsperproc = iceil( nqb, npset )
975  myfirstrow = min( nqb+1, 1+rowsperproc*mysetnum )
976  numrows = min( rowsperproc, nqb-myfirstrow+1 )
977 *
978 *
979 * tmp = HT * v
980 *
981  CALL zgemv( 'C', numrows, bindex+1, z_one,
982  $ work( inhtb+myfirstrow-1 ), ldv,
983  $ work( inhtb+myfirstrow-1+( bindex+1 )*ldv ),
984  $ 1, z_zero, work( intmp ), 1 )
985 * tmp2 = VT * v
986  CALL zgemv( 'C', numrows, bindex+1, z_one,
987  $ work( invtb+myfirstrow-1 ), ldv,
988  $ work( inhtb+myfirstrow-1+( bindex+1 )*ldv ),
989  $ 1, z_zero, work( intmp+bindex+1 ), 1 )
990 *
991 *
992  CALL zgsum2d( ictxt, 'C', ' ', 2*( bindex+1 ), 1,
993  $ work( intmp ), 2*( bindex+1 ), -1, -1 )
994  ELSE
995 * tmp = HT * v
996 *
997  CALL zgemv( 'C', nqb, bindex+1, z_one, work( inhtb ),
998  $ ldv, work( inhtb+( bindex+1 )*ldv ), 1,
999  $ z_zero, work( intmp ), 1 )
1000 * tmp2 = VT * v
1001  CALL zgemv( 'C', nqb, bindex+1, z_one, work( invtb ),
1002  $ ldv, work( inhtb+( bindex+1 )*ldv ), 1,
1003  $ z_zero, work( intmp+bindex+1 ), 1 )
1004 *
1005  END IF
1006 *
1007 *
1008 *
1009  IF( balanced ) THEN
1010  mysetnum = mycol
1011 *
1012  rowsperproc = iceil( npb, npset )
1013  myfirstrow = min( npb+1, 1+rowsperproc*mysetnum )
1014  numrows = min( rowsperproc, npb-myfirstrow+1 )
1015 *
1016  CALL zgsum2d( ictxt, 'R', ' ', 2*( bindex+1 ), 1,
1017  $ work( intmp ), 2*( bindex+1 ), -1, -1 )
1018 *
1019 *
1020 * v = v + V * tmp
1021  IF( index.GT.1 ) THEN
1022  CALL zgemv( 'N', numrows, bindex+1, z_negone,
1023  $ work( invb+myfirstrow-1 ), ldv,
1024  $ work( intmp ), 1, z_one,
1025  $ work( invb+myfirstrow-1+( bindex+1 )*
1026  $ ldv ), 1 )
1027 *
1028 * v = v + H * tmp2
1029  CALL zgemv( 'N', numrows, bindex+1, z_negone,
1030  $ work( inhb+myfirstrow-1 ), ldv,
1031  $ work( intmp+bindex+1 ), 1, z_one,
1032  $ work( invb+myfirstrow-1+( bindex+1 )*
1033  $ ldv ), 1 )
1034  END IF
1035 *
1036  ELSE
1037 * v = v + V * tmp
1038  CALL zgemv( 'N', npb, bindex+1, z_negone, work( invb ),
1039  $ ldv, work( intmp ), 1, z_one,
1040  $ work( invb+( bindex+1 )*ldv ), 1 )
1041 *
1042 *
1043 * v = v + H * tmp2
1044  CALL zgemv( 'N', npb, bindex+1, z_negone, work( inhb ),
1045  $ ldv, work( intmp+bindex+1 ), 1, z_one,
1046  $ work( invb+( bindex+1 )*ldv ), 1 )
1047 *
1048  END IF
1049 *
1050 *
1051 * Transpose NV and add it back into NVT
1052 *
1053  IF( myrow.EQ.mycol ) THEN
1054  DO 120 i = 0, nqm1 - 1
1055  work( intmp+i ) = work( invt+lijp1-1+( bindex+1 )*ldv+
1056  $ i )
1057  120 CONTINUE
1058  ELSE
1059  CALL zgesd2d( ictxt, nqm1, 1,
1060  $ work( invt+lijp1-1+( bindex+1 )*ldv ),
1061  $ nqm1, mycol, myrow )
1062  CALL zgerv2d( ictxt, npm1, 1, work( intmp ), npm1, mycol,
1063  $ myrow )
1064 *
1065  END IF
1066  DO 130 i = 0, npm1 - 1
1067  work( inv+liip1-1+( bindex+1 )*ldv+i ) = work( inv+liip1-
1068  $ 1+( bindex+1 )*ldv+i ) + work( intmp+i )
1069  130 CONTINUE
1070 *
1071 * Sum-to-one NV rowwise (within a row)
1072 *
1073  CALL zgsum2d( ictxt, 'R', ' ', npm1, 1,
1074  $ work( inv+liip1-1+( bindex+1 )*ldv ), npm1,
1075  $ myrow, nxtcol )
1076 *
1077 *
1078 * Dot product c = NV * NH
1079 * Sum-to-all c within next processor column
1080 *
1081 *
1082  IF( mycol.EQ.nxtcol ) THEN
1083  cc( 1 ) = z_zero
1084  DO 140 i = 0, npm1 - 1
1085  cc( 1 ) = cc( 1 ) + dconjg( work( inv+liip1-1+
1086  $ ( bindex+1 )*ldv+i ) )*
1087  $ work( inh+liip1-1+( bindex+1 )*ldv+i )
1088  140 CONTINUE
1089  IF( myrow.EQ.nxtrow ) THEN
1090  cc( 2 ) = work( inv+liip1-1+( bindex+1 )*ldv )
1091  cc( 3 ) = work( inh+liip1-1+( bindex+1 )*ldv )
1092  ELSE
1093  cc( 2 ) = z_zero
1094  cc( 3 ) = z_zero
1095  END IF
1096  CALL zgsum2d( ictxt, 'C', ' ', 3, 1, cc, 3, -1, nxtcol )
1097 *
1098  topv = cc( 2 )
1099  c = cc( 1 )
1100  toph = cc( 3 )
1101 *
1102  topnv = toptau*( topv-c*dconjg( toptau ) / 2*toph )
1103 *
1104 *
1105 * Compute V = Tau * (V - C * Tau' / 2 * H )
1106 *
1107 *
1108  DO 150 i = 0, npm1 - 1
1109  work( inv+liip1-1+( bindex+1 )*ldv+i ) = toptau*
1110  $ ( work( inv+liip1-1+( bindex+1 )*ldv+i )-c*
1111  $ dconjg( toptau ) / 2*work( inh+liip1-1+( bindex+
1112  $ 1 )*ldv+i ) )
1113  150 CONTINUE
1114 *
1115  END IF
1116 *
1117 *
1118  160 CONTINUE
1119 *
1120 *
1121 * Perform the rank2k update
1122 *
1123  IF( maxindex.LT.n ) THEN
1124 *
1125  DO 170 i = 0, npm1 - 1
1126  work( intmp+i ) = work( inh+liip1-1+anb*ldv+i )
1127  170 CONTINUE
1128 *
1129 *
1130 *
1131  IF( .NOT.twogemms ) THEN
1132  IF( interleave ) THEN
1133  ldzg = ldv / 2
1134  ELSE
1135  CALL zlamov( 'A', ltnm1, anb, work( inht+lijp1-1 ),
1136  $ ldv, work( invt+lijp1-1+anb*ldv ), ldv )
1137 *
1138  CALL zlamov( 'A', ltnm1, anb, work( inv+ltlip1-1 ),
1139  $ ldv, work( inh+ltlip1-1+anb*ldv ), ldv )
1140  ldzg = ldv
1141  END IF
1142  nbzg = anb*2
1143  ELSE
1144  ldzg = ldv
1145  nbzg = anb
1146  END IF
1147 *
1148 *
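* The update proceeds in panels of PNB rows; PBMAX caps the
* number of columns at the panel's last row so that, up to the
* blocking granularity, only the lower triangle of A is written.
*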
1149  DO 180 pbmin = 1, ltnm1, pnb
1150 *
1151  pbsize = min( pnb, ltnm1-pbmin+1 )
1152  pbmax = min( ltnm1, pbmin+pnb-1 )
1153  CALL zgemm( 'N', 'C', pbsize, pbmax, nbzg, z_negone,
1154  $ work( inh+ltlip1-1+pbmin-1 ), ldzg,
1155  $ work( invt+lijp1-1 ), ldzg, z_one,
1156  $ a( ltlip1+pbmin-1+( lijp1-1 )*lda ), lda )
1157  IF( twogemms ) THEN
1158  CALL zgemm( 'N', 'C', pbsize, pbmax, anb, z_negone,
1159  $ work( inv+ltlip1-1+pbmin-1 ), ldzg,
1160  $ work( inht+lijp1-1 ), ldzg, z_one,
1161  $ a( ltlip1+pbmin-1+( lijp1-1 )*lda ), lda )
1162  END IF
1163  180 CONTINUE
1164 *
1165 *
1166 *
1167  DO 190 i = 0, npm1 - 1
1168  work( inv+liip1-1+i ) = work( inv+liip1-1+anb*ldv+i )
1169  work( inh+liip1-1+i ) = work( intmp+i )
1170  190 CONTINUE
1171  DO 200 i = 0, nqm1 - 1
1172  work( inht+lijp1-1+i ) = work( inht+lijp1-1+anb*ldv+i )
1173  200 CONTINUE
1174 *
1175 *
1176  END IF
1177 *
1178 * End of the update A code
1179 *
1180  210 CONTINUE
1181 *
1182  IF( mycol.EQ.nxtcol ) THEN
1183  IF( myrow.EQ.nxtrow ) THEN
1184 *
1185  d( nq ) = dble( a( np+( nq-1 )*lda ) )
1186  a( np+( nq-1 )*lda ) = d( nq )
1187 *
1188  CALL dgebs2d( ictxt, 'C', ' ', 1, 1, d( nq ), 1 )
1189  ELSE
1190  CALL dgebr2d( ictxt, 'C', ' ', 1, 1, d( nq ), 1, nxtrow,
1191  $ nxtcol )
1192  END IF
1193  END IF
1194 *
1195 *
1196 *
1197 *
1198  work( 1 ) = dcmplx( lwmin )
1199  RETURN
1200 *
1201 * End of PZHETTRD
1202 *
1203 *
1204  END