SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
Loading...
Searching...
No Matches
PB_Cpsyr2kA.c
Go to the documentation of this file.
1/* ---------------------------------------------------------------------
2*
3* -- PBLAS auxiliary routine (version 2.0) --
4* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5* and University of California, Berkeley.
6* April 1, 1998
7*
8* ---------------------------------------------------------------------
9*/
10/*
11* Include files
12*/
13#include "../pblas.h"
14#include "../PBpblas.h"
15#include "../PBtools.h"
16#include "../PBblacs.h"
17#include "../PBblas.h"
18
19#ifdef __STDC__
20void PB_Cpsyr2kA( PBTYP_T * TYPE, char * DIRECAB, char * CONJUG,
21 char * UPLO, char * TRANS, Int N, Int K, char * ALPHA,
22 char * A, Int IA, Int JA, Int * DESCA, char * B,
23 Int IB, Int JB, Int * DESCB, char * BETA, char * C,
24 Int IC, Int JC, Int * DESCC )
25#else
26void PB_Cpsyr2kA( TYPE, DIRECAB, CONJUG, UPLO, TRANS, N, K, ALPHA, A, IA,
27 JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC )
28/*
29* .. Scalar Arguments ..
30*/
31 char * CONJUG, * DIRECAB, * TRANS, * UPLO;
32 Int IA, IB, IC, JA, JB, JC, K, N;
33 char * ALPHA, * BETA;
34 PBTYP_T * TYPE;
35/*
36* .. Array Arguments ..
37*/
38 Int * DESCA, * DESCB, * DESCC;
39 char * A, * B, * C;
40#endif
41{
42/*
43* Purpose
44* =======
45*
46* PB_Cpsyr2kA performs one of the following symmetric or Hermitian rank
47* 2k operations
48*
49* sub( C ) := alpha*sub( A )*sub( B )' + alpha*sub( B )*sub( A )' +
50* beta*sub( C ),
51* or
52* sub( C ) := alpha*sub( A )*conjg( sub( B ) )' +
53* conjg( alpha )*sub( B )*conjg( sub( A ) )' +
54* beta*sub( C ),
55* or
56* sub( C ) := alpha*sub( A )'*sub( B ) + alpha*sub( B )'*sub( A ) +
57* beta*sub( C ),
58* or
59* sub( C ) := alpha*conjg( sub( A )' )*sub( B ) +
60* conjg( alpha )*conjg( sub( B )' )*sub( A ) +
61* beta*sub( C ),
62*
63* where
64*
65* sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1),
66*
67* sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N',
68* A(IA:IA+K-1,JA:JA+N-1) otherwise, and,
69*
70* sub( B ) denotes B(IB:IB+N-1,JB:JB+K-1) if TRANS = 'N',
71* B(IB:IB+K-1,JB:JB+N-1) otherwise.
72*
73* Alpha and beta are scalars, sub( C ) is an n by n symmetric or
74* Hermitian submatrix and sub( A ) and sub( B ) are n by k submatrices
75* in the first case and k by n submatrices in the second case.
76*
77* This is the outer-product algorithm using the logical LCM hybrid
78* and static blocking technique. The submatrix operand sub( C ) stays
79* in place.
80*
81* Notes
82* =====
83*
84* A description vector is associated with each 2D block-cyclicly dis-
85* tributed matrix. This vector stores the information required to
86* establish the mapping between a matrix entry and its corresponding
87* process and memory location.
88*
89* In the following comments, the character _ should be read as
90* "of the distributed matrix". Let A be a generic term for any 2D
91* block cyclicly distributed matrix. Its description vector is DESC_A:
92*
93* NOTATION STORED IN EXPLANATION
94* ---------------- --------------- ------------------------------------
95* DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
96* CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
97* the NPROW x NPCOL BLACS process grid
98* A is distributed over. The context
99* itself is global, but the handle
100* (the integer value) may vary.
101* M_A (global) DESCA[ M_ ] The number of rows in the distribu-
102* ted matrix A, M_A >= 0.
103* N_A (global) DESCA[ N_ ] The number of columns in the distri-
104* buted matrix A, N_A >= 0.
105* IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
106* block of the matrix A, IMB_A > 0.
107* INB_A (global) DESCA[ INB_ ] The number of columns of the upper
108* left block of the matrix A,
109* INB_A > 0.
110* MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
111* bute the last M_A-IMB_A rows of A,
112* MB_A > 0.
113* NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
114* bute the last N_A-INB_A columns of
115* A, NB_A > 0.
116* RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
117* row of the matrix A is distributed,
118* NPROW > RSRC_A >= 0.
119* CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
120* first column of A is distributed.
121* NPCOL > CSRC_A >= 0.
122* LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
123* array storing the local blocks of
124* the distributed matrix A,
125* IF( Lc( 1, N_A ) > 0 )
126* LLD_A >= MAX( 1, Lr( 1, M_A ) )
127* ELSE
128* LLD_A >= 1.
129*
130* Let K be the number of rows of a matrix A starting at the global in-
131* dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
132* that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
133* receive if these K rows were distributed over NPROW processes. If K
134* is the number of columns of a matrix A starting at the global index
135* JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
136* lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
137* these K columns were distributed over NPCOL processes.
138*
139* The values of Lr() and Lc() may be determined via a call to the func-
140* tion PB_Cnumroc:
141* Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
142* Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
143*
144* Arguments
145* =========
146*
147* TYPE (local input) pointer to a PBTYP_T structure
148* On entry, TYPE is a pointer to a structure of type PBTYP_T,
149* that contains type information (See pblas.h).
150*
151* DIRECAB (global input) pointer to CHAR
152* On entry, DIRECAB specifies the direction in which the rows
153* or columns of sub( A ) and sub( B ) should be looped over as
154* follows:
155* DIRECAB = 'F' or 'f' forward or increasing,
156* DIRECAB = 'B' or 'b' backward or decreasing.
157*
158* CONJUG (global input) pointer to CHAR
159* On entry, CONJUG specifies whether sub( C ) is a symmetric or
160* Hermitian submatrix operand as follows:
161* CONJUG = 'N' or 'n' sub( C ) is symmetric,
162* CONJUG = 'Z' or 'z' sub( C ) is Hermitian.
163*
164* UPLO (global input) pointer to CHAR
165* On entry, UPLO specifies whether the local pieces of
166* the array C containing the upper or lower triangular part
167* of the submatrix sub( C ) are to be referenced as follows:
168* UPLO = 'U' or 'u' Only the local pieces corresponding to
169* the upper triangular part of the
170* submatrix sub( C ) are referenced,
171* UPLO = 'L' or 'l' Only the local pieces corresponding to
172* the lower triangular part of the
173* submatrix sub( C ) are referenced.
174*
175* TRANS (global input) pointer to CHAR
176* On entry, TRANS specifies the operation to be performed as
177* follows:
178*
179* TRANS = 'N' or 'n'
180* sub( C ) := alpha*sub( A )*sub( B )' +
181* alpha*sub( B )*sub( A )' +
182* beta*sub( C ),
183* or
184* sub( C ) := alpha*sub( A )*sub( B )' +
185* alpha*sub( B )*sub( A )' +
186* beta*sub( C ),
187* or
188* sub( C ) := alpha*sub( A )*conjg( sub( B )' ) +
189* conjg( alpha )*sub( B )*conjg( sub( A )' ) +
190* beta*sub( C ),
191*
192* TRANS = 'T' or 't'
193* sub( C ) := alpha*sub( B )'*sub( A ) +
194* alpha*sub( A )'*sub( B ) +
195* beta*sub( C ),
196* or
197* sub( C ) := alpha*sub( B )'*sub( A ) +
198* alpha*sub( A )'*sub( B ) +
199* beta*sub( C ),
200*
201* TRANS = 'C' or 'c'
202* sub( C ) := alpha*sub( B )'*sub( A ) +
203* alpha*sub( A )'*sub( B ) +
204* beta*sub( C ),
205* or
206* sub( C ) := alpha*conjg( sub( A )' )*sub( B ) +
207* conjg( alpha )*conjg( sub( B )' )*sub( A ) +
208* beta*sub( C ).
209*
210* N (global input) INTEGER
211* On entry, N specifies the order of the submatrix sub( C ).
212* N must be at least zero.
213*
214* K (global input) INTEGER
215* On entry with TRANS = 'N' or 'n', K specifies the number of
216* columns of the submatrices sub( A ) and sub( B ), and on
217* entry with TRANS = 'T' or 't' or 'C' or 'c', K specifies the
218* number of rows of the submatrices sub( A ) and sub( B ).
219* K must be at least zero.
220*
221* ALPHA (global input) pointer to CHAR
222* On entry, ALPHA specifies the scalar alpha. When ALPHA is
223* supplied as zero then the local entries of the arrays A
224* and B corresponding to the entries of the submatrices
225* sub( A ) and sub( B ) respectively need not be set on input.
226*
227* A (local input) pointer to CHAR
228* On entry, A is an array of dimension (LLD_A, Ka), where Ka is
229* at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at
230* least Lc( 1, JA+N-1 ) otherwise. Before entry, this array
231* contains the local entries of the matrix A.
232* Before entry with TRANS = 'N' or 'n', this array contains the
233* local entries corresponding to the entries of the n by k sub-
234* matrix sub( A ), otherwise the local entries corresponding to
235* the entries of the k by n submatrix sub( A ).
236*
237* IA (global input) INTEGER
238* On entry, IA specifies A's global row index, which points to
239* the beginning of the submatrix sub( A ).
240*
241* JA (global input) INTEGER
242* On entry, JA specifies A's global column index, which points
243* to the beginning of the submatrix sub( A ).
244*
245* DESCA (global and local input) INTEGER array
246* On entry, DESCA is an integer array of dimension DLEN_. This
247* is the array descriptor for the matrix A.
248*
249* B (local input) pointer to CHAR
250* On entry, B is an array of dimension (LLD_B, Kb), where Kb is
251* at least Lc( 1, JB+K-1 ) when TRANS = 'N' or 'n', and is at
252* least Lc( 1, JB+N-1 ) otherwise. Before entry, this array
253* contains the local entries of the matrix B.
254* Before entry with TRANS = 'N' or 'n', this array contains the
255* local entries corresponding to the entries of the n by k sub-
256* matrix sub( B ), otherwise the local entries corresponding to
257* the entries of the k by n submatrix sub( B ).
258*
259* IB (global input) INTEGER
260* On entry, IB specifies B's global row index, which points to
261* the beginning of the submatrix sub( B ).
262*
263* JB (global input) INTEGER
264* On entry, JB specifies B's global column index, which points
265* to the beginning of the submatrix sub( B ).
266*
267* DESCB (global and local input) INTEGER array
268* On entry, DESCB is an integer array of dimension DLEN_. This
269* is the array descriptor for the matrix B.
270*
271* BETA (global input) pointer to CHAR
272* On entry, BETA specifies the scalar beta. When BETA is
273* supplied as zero then the local entries of the array C
274* corresponding to the entries of the submatrix sub( C ) need
275* not be set on input.
276*
277* C (local input/local output) pointer to CHAR
278* On entry, C is an array of dimension (LLD_C, Kc), where Kc is
279* at least Lc( 1, JC+N-1 ). Before entry, this array contains
280* the local entries of the matrix C.
281* Before entry with UPLO = 'U' or 'u', this array contains
282* the local entries corresponding to the upper triangular part
283* of the symmetric or Hermitian submatrix sub( C ), and the
284* local entries corresponding to the strictly lower triangular
285* of sub( C ) are not referenced. On exit, the upper triangular
286* part of sub( C ) is overwritten by the upper triangular part
287* of the updated submatrix.
288* Before entry with UPLO = 'L' or 'l', this array contains
289* the local entries corresponding to the lower triangular part
290* of the symmetric or Hermitian submatrix sub( C ), and the
291* local entries corresponding to the strictly upper triangular
292* of sub( C ) are not referenced. On exit, the lower triangular
293* part of sub( C ) is overwritten by the lower triangular part
294* of the updated submatrix.
295* Note that the imaginary parts of the local entries corres-
296* ponding to the diagonal elements of sub( C ) need not be
297* set, they are assumed to be zero, and on exit they are set
298* to zero.
299*
300* IC (global input) INTEGER
301* On entry, IC specifies C's global row index, which points to
302* the beginning of the submatrix sub( C ).
303*
304* JC (global input) INTEGER
305* On entry, JC specifies C's global column index, which points
306* to the beginning of the submatrix sub( C ).
307*
308* DESCC (global and local input) INTEGER array
309* On entry, DESCC is an integer array of dimension DLEN_. This
310* is the array descriptor for the matrix C.
311*
312* -- Written on April 1, 1998 by
313* Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
314*
315* ---------------------------------------------------------------------
316*/
317/*
318* .. Local Scalars ..
319*/
320 char * one, * talpha, * zero;
321 Int ABfwd, ABmyprocD, ABmyprocR, ABnprocsD, ABnprocsR, ABrocs,
322 Abufld, AcurrocR, Afr, AiD, AiR, AiiD, AiiR, AinbD, AinbR,
323 Ainb1D, Ainb1R, AisR, AkkR, Ald, AnbD, AnbR, AnpD, AnpR,
324 Aoff, ArocD, ArocR, AsrcR, Bbufld, BcurrocR, Bfr, BiD, BiR,
325 BiiD, BiiR, BinbD, BinbR, Binb1D, Binb1R, BisR, BkkR, Bld,
326 BnbD, BnbR, BnpD,
327 BnpR, Boff, BrocD, BrocR, BsrcR, Ccol, Cii, Cimb1,
328 Cinb1, Cjj, Clcmb, Cld, Clp, Clq, Cnq0, Cmb, Cmp, Cmp0,
329 Cnb, Cnq, Crow,
330 WACfr, WACld, WACsum, WARfr, WARld, WARsum,
331 WBCfr, WBCld, WBCsum, WBRfr, WBRld, WBRsum, Wkbb=0,
332 conjg, ctxt, k, kb, kbb, l, lb,
333 lcmb, ltmp, maxp, maxpm1, maxq, mycol, myrow, ncpq, notran,
334 npcol, npq, nprow, nrpq, p=0, q=0, size, tmp, upper;
335 GEMM_T gemm;
336 TZSYR2_T tzsyr2k;
337/*
338* .. Local Arrays ..
339*/
340 PB_VM_T VM;
341 Int Cd0 [DLEN_], DBUFA[DLEN_], DBUFB[DLEN_], WACd0[DLEN_],
342 WARd0[DLEN_], WBCd0[DLEN_], WBRd0[DLEN_];
343 char * Abuf = NULL, * Bbuf = NULL, * Cptr = NULL, * WAC = NULL,
344 * WAR = NULL, * WBC = NULL, * WBR = NULL;
345/* ..
346* .. Executable Statements ..
347*
348*/
349/*
350* sub( C ) := beta * sub( C )
351*/
352 PB_Cplascal( TYPE, UPLO, CONJUG, N, N, BETA, C, IC, JC, DESCC );
353/*
354* Retrieve process grid information
355*/
356 Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
357
358 conjg = ( Mupcase( CONJUG [0] ) == CCONJG );
359
360 size = TYPE->size; one = TYPE->one; zero = TYPE->zero; gemm = TYPE->Fgemm;
361 kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
362/*
363* Compute descriptor Cd0 for sub( C )
364*/
365 PB_Cdescribe( N, N, IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj,
366 &Cld, &Cimb1, &Cinb1, &Cmb, &Cnb, &Crow, &Ccol, Cd0 );
367
368 Cmp = PB_Cnumroc( N, 0, Cimb1, Cmb, myrow, Crow, nprow );
369 Cnq = PB_Cnumroc( N, 0, Cinb1, Cnb, mycol, Ccol, npcol );
370
371 if( ( Cmp > 0 ) && ( Cnq > 0 ) )
372 {
373 Cptr = Mptr( C, Cii, Cjj, Cld, size );
374
375 if( conjg )
376 {
377 talpha = PB_Cmalloc( size ); PB_Cconjg( TYPE, ALPHA, talpha );
378 tzsyr2k = PB_Ctzher2k;
379 }
380 else { talpha = ALPHA; tzsyr2k = PB_Ctzsyr2k; }
381/*
382* Computational partitioning size is computed as the product of the logical
383* value returned by pilaenv_ and 2 * lcm( nprow, npcol ).
384*/
385 Clcmb = 2 * kb * PB_Clcm( ( Crow >= 0 ? nprow : 1 ),
386 ( Ccol >= 0 ? npcol : 1 ) );
387 }
388/*
389* Retrieve local information for sub( A ) and sub( B )
390*/
391 if( ( notran = ( Mupcase( TRANS[0] ) == CNOTRAN ) ) != 0 )
392 {
393 ABnprocsR = npcol;
394 AiR = JA; AinbR = DESCA[INB_]; AnbR = DESCA[NB_]; AsrcR = DESCA[CSRC_];
395 BiR = JB; BinbR = DESCB[INB_]; BnbR = DESCB[NB_]; BsrcR = DESCB[CSRC_];
396 }
397 else
398 {
399 ABnprocsR = nprow;
400 AiR = IA; AinbR = DESCA[IMB_]; AnbR = DESCA[MB_]; AsrcR = DESCA[RSRC_];
401 BiR = IB; BinbR = DESCB[IMB_]; BnbR = DESCB[MB_]; BsrcR = DESCB[RSRC_];
402 }
403/*
404* If sub( A ) and sub( B ) only spans one process row or column, then there is
405* no need to pack the data.
406*/
407 if( !( PB_Cspan( K, AiR, AinbR, AnbR, AsrcR, ABnprocsR ) ) &&
408 !( PB_Cspan( K, BiR, BinbR, BnbR, BsrcR, ABnprocsR ) ) )
409 {
410/*
411* Replicate sub( A ) in process rows and columns spanned by sub( C ): WAR, WAC
412* Replicate sub( B ) in process rows and columns spanned by sub( C ): WBR, WBC
413*/
414 if( notran )
415 {
416 PB_CInV( TYPE, NOCONJG, COLUMN, N, N, Cd0, K, A, IA, JA, DESCA, COLUMN,
417 &WAC, WACd0, &WACfr );
418 PB_CInV( TYPE, CONJUG, ROW, N, N, Cd0, K, WAC, 0, 0, WACd0, COLUMN,
419 &WAR, WARd0, &WARfr );
420
421 PB_CInV( TYPE, NOCONJG, COLUMN, N, N, Cd0, K, B, IB, JB, DESCB, COLUMN,
422 &WBC, WBCd0, &WBCfr );
423 PB_CInV( TYPE, CONJUG, ROW, N, N, Cd0, K, WBC, 0, 0, WBCd0, COLUMN,
424 &WBR, WBRd0, &WBRfr );
425 }
426 else
427 {
428 PB_CInV( TYPE, NOCONJG, ROW, N, N, Cd0, K, A, IA, JA, DESCA, ROW,
429 &WAR, WARd0, &WARfr );
430 PB_CInV( TYPE, CONJUG, COLUMN, N, N, Cd0, K, WAR, 0, 0, WARd0, ROW,
431 &WAC, WACd0, &WACfr );
432
433 PB_CInV( TYPE, NOCONJG, ROW, N, N, Cd0, K, B, IB, JB, DESCB, ROW,
434 &WBR, WBRd0, &WBRfr );
435 PB_CInV( TYPE, CONJUG, COLUMN, N, N, Cd0, K, WBR, 0, 0, WBRd0, ROW,
436 &WBC, WBCd0, &WBCfr );
437 }
438/*
439* Perform the local update if I own some data
440*/
441 if( ( Cmp > 0 ) && ( Cnq > 0 ) )
442 {
443 WACld = WACd0[LLD_]; WBCld = WBCd0[LLD_];
444 WARld = WARd0[LLD_]; WBRld = WBRd0[LLD_];
445
446 if( Mupcase( UPLO[0] ) == CUPPER )
447 {
448 for( l = 0; l < N; l += Clcmb )
449 {
450 lb = N - l; lb = MIN( lb, Clcmb );
451 Clp = PB_Cnumroc( l, 0, Cimb1, Cmb, myrow, Crow, nprow );
452 Clq = PB_Cnumroc( l, 0, Cinb1, Cnb, mycol, Ccol, npcol );
453 Cnq0 = PB_Cnumroc( lb, l, Cinb1, Cnb, mycol, Ccol, npcol );
454 if( Clp > 0 && Cnq0 > 0 )
455 {
456 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Clp, &Cnq0, &K,
457 ALPHA, WAC, &WACld, Mptr( WBR, 0, Clq, WBRld, size ),
458 &WBRld, one, Mptr( Cptr, 0, Clq, Cld, size ), &Cld );
459 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Clp, &Cnq0, &K,
460 talpha, WBC, &WBCld, Mptr( WAR, 0, Clq, WARld, size ),
461 &WARld, one, Mptr( Cptr, 0, Clq, Cld, size ), &Cld );
462 }
463 PB_Cpsyr2( TYPE, UPPER, lb, K, ALPHA, Mptr( WAC, Clp, 0, WACld,
464 size ), WACld, Mptr( WAR, 0, Clq, WARld, size ),
465 WARld, Mptr( WBC, Clp, 0, WBCld, size ), WBCld,
466 Mptr( WBR, 0, Clq, WBRld, size ), WBRld, Cptr, l, l,
467 Cd0, tzsyr2k );
468 }
469 }
470 else
471 {
472 for( l = 0; l < N; l += Clcmb )
473 {
474 lb = N - l; ltmp = l + ( lb = MIN( lb, Clcmb ) );
475 Clp = PB_Cnumroc( l, 0, Cimb1, Cmb, myrow, Crow, nprow );
476 Clq = PB_Cnumroc( l, 0, Cinb1, Cnb, mycol, Ccol, npcol );
477 PB_Cpsyr2( TYPE, LOWER, lb, K, ALPHA, Mptr( WAC, Clp, 0, WACld,
478 size ), WACld, Mptr( WAR, 0, Clq, WARld, size ),
479 WARld, Mptr( WBC, Clp, 0, WBCld, size ), WBCld,
480 Mptr( WBR, 0, Clq, WBRld, size ), WBRld, Cptr, l, l,
481 Cd0, tzsyr2k );
482 Clp = PB_Cnumroc( ltmp, 0, Cimb1, Cmb, myrow, Crow, nprow );
483 Cmp0 = Cmp - Clp;
484 Cnq0 = PB_Cnumroc( lb, l, Cinb1, Cnb, mycol, Ccol, npcol );
485 if( Cmp0 > 0 && Cnq0 > 0 )
486 {
487 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq0,
488 &K, ALPHA, Mptr( WAC, Clp, 0, WACld, size ), &WACld,
489 Mptr( WBR, 0, Clq, WBRld, size ), &WBRld, one,
490 Mptr( Cptr, Clp, Clq, Cld, size ), &Cld );
491 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq0,
492 &K, talpha, Mptr( WBC, Clp, 0, WBCld, size ), &WBCld,
493 Mptr( WAR, 0, Clq, WARld, size ), &WARld, one,
494 Mptr( Cptr, Clp, Clq, Cld, size ), &Cld );
495 }
496 }
497 }
498 if( conjg ) free( talpha );
499 }
500
501 if( WACfr ) free( WAC );
502 if( WARfr ) free( WAR );
503 if( WBCfr ) free( WBC );
504 if( WBRfr ) free( WBR );
505
506 return;
507 }
508/*
509* Otherwise sub( A ) and sub( B ) spans more than one process row or columns
510*/
511 ABfwd = ( Mupcase( DIRECAB[0] ) == CFORWARD );
512 upper = ( Mupcase( UPLO [0] ) == CUPPER );
513
514 if( notran )
515 {
516 ABmyprocD = myrow; ABmyprocR = mycol; ABnprocsD = nprow;
517 AiD = IA; AinbD = DESCA[IMB_]; AnbD = DESCA[MB_]; Ald = DESCA[LLD_];
518 BiD = IB; BinbD = DESCB[IMB_]; BnbD = DESCB[MB_]; Bld = DESCB[LLD_];
519 PB_Cinfog2l( IA, JA, DESCA, ABnprocsD, ABnprocsR, ABmyprocD, ABmyprocR,
520 &AiiD, &AiiR, &ArocD, &ArocR );
521 PB_Cinfog2l( IB, JB, DESCB, ABnprocsD, ABnprocsR, ABmyprocD, ABmyprocR,
522 &BiiD, &BiiR, &BrocD, &BrocR );
523 }
524 else
525 {
526 ABmyprocD = mycol; ABmyprocR = myrow; ABnprocsD = npcol;
527 AiD = JA; AinbD = DESCA[INB_]; AnbD = DESCA[NB_]; Ald = DESCA[LLD_];
528 BiD = JB; BinbD = DESCB[INB_]; BnbD = DESCB[NB_]; Bld = DESCB[LLD_];
529 PB_Cinfog2l( IA, JA, DESCA, ABnprocsR, ABnprocsD, ABmyprocR, ABmyprocD,
530 &AiiR, &AiiD, &ArocR, &ArocD );
531 PB_Cinfog2l( IB, JB, DESCB, ABnprocsR, ABnprocsD, ABmyprocR, ABmyprocD,
532 &BiiR, &BiiD, &BrocR, &BrocD );
533 }
534 Ainb1D = PB_Cfirstnb( N, AiD, AinbD, AnbD );
535 AnpD = PB_Cnumroc( N, 0, Ainb1D, AnbD, ABmyprocD, ArocD, ABnprocsD );
536 Ainb1R = PB_Cfirstnb( K, AiR, AinbR, AnbR );
537 AisR = ( ( AsrcR < 0 ) || ( ABnprocsR == 1 ) );
538
539 Binb1D = PB_Cfirstnb( N, BiD, BinbD, BnbD );
540 BnpD = PB_Cnumroc( N, 0, Binb1D, BnbD, ABmyprocD, BrocD, ABnprocsD );
541 Binb1R = PB_Cfirstnb( K, BiR, BinbR, BnbR );
542 BisR = ( ( BsrcR < 0 ) || ( ABnprocsR == 1 ) );
543/*
544* When sub( A ) is not replicated and backward pass on sub( A ), find the
545* virtual process q owning the last row or column of sub( A ).
546*/
547 if( !( AisR ) && !( ABfwd ) )
548 {
549 tmp = PB_Cindxg2p( K - 1, Ainb1R, AnbR, ArocR, ArocR, ABnprocsR );
550 q = MModSub( tmp, ArocR, ABnprocsR );
551 }
552/*
553* When sub( B ) is not replicated and backward pass on sub( B ), find the
554* virtual process p owning the last row or column of sub( B ).
555*/
556 if( !( BisR ) && !( ABfwd ) )
557 {
558 tmp = PB_Cindxg2p( K - 1, Binb1R, BnbR, BrocR, BrocR, ABnprocsR );
559 p = MModSub( tmp, BrocR, ABnprocsR );
560 }
561/*
562* Allocate work space in process rows and columns spanned by sub( C )
563*/
564 PB_COutV( TYPE, COLUMN, NOINIT, N, N, Cd0, kb, &WAC, WACd0, &WACfr,
565 &WACsum );
566 PB_COutV( TYPE, ROW, NOINIT, N, N, Cd0, kb, &WAR, WARd0, &WARfr,
567 &WARsum );
568 PB_COutV( TYPE, COLUMN, NOINIT, N, N, Cd0, kb, &WBC, WBCd0, &WBCfr,
569 &WBCsum );
570 PB_COutV( TYPE, ROW, NOINIT, N, N, Cd0, kb, &WBR, WBRd0, &WBRfr,
571 &WBRsum );
572/*
573* Loop over the virtual process grid induced by the rows or columns of
574* sub( A ) and sub( B )
575*/
576 lcmb = PB_Clcm( ( maxp = ( BisR ? 1 : ABnprocsR ) ) * BnbR,
577 ( maxq = ( AisR ? 1 : ABnprocsR ) ) * AnbR );
578 maxpm1 = maxp - 1;
579/*
580* Find out process coordinates corresponding to first virtual process (p,q)
581*/
582 AcurrocR = ( AisR ? -1 : MModAdd( ArocR, q, ABnprocsR ) );
583 AkkR = PB_Cg2lrem( AiR, AinbR, AnbR, AcurrocR, AsrcR, ABnprocsR );
584 AnpR = PB_Cnumroc( K, 0, Ainb1R, AnbR, AcurrocR, ArocR, ABnprocsR );
585
586 BcurrocR = ( BisR ? -1 : MModAdd( BrocR, p, ABnprocsR ) );
587 BkkR = PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR, ABnprocsR );
588 BnpR = PB_Cnumroc( K, 0, Binb1R, BnbR, BcurrocR, BrocR, ABnprocsR );
589/*
590* Find out how many diagonals this virtual process (p,q) has
591*/
592 PB_CVMinit( &VM, 0, BnpR, AnpR, Binb1R, Ainb1R, BnbR, AnbR, p, q,
593 maxp, maxq, lcmb );
594 npq = PB_CVMnpq( &VM );
595
596 for( k = 0; k < K; k += kb )
597 {
598 kbb = K - k; kbb = MIN( kbb, kb );
599
600 while( Wkbb != kbb )
601 {
602/*
603* Ensure that the current virtual process (p,q) has something to contribute
604* to the replicated buffers WA and WB.
605*/
606 while( npq == 0 )
607 {
608 if( ( ABfwd && ( p == maxpm1 ) ) ||
609 ( !( ABfwd ) && ( p == 0 ) ) )
610 q = ( ABfwd ? MModAdd1( q, maxq ) : MModSub1( q, maxq ) );
611 p = ( ABfwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) );
612
613 AcurrocR = ( AisR ? -1 : MModAdd( ArocR, q, ABnprocsR ) );
614 AkkR = PB_Cg2lrem( AiR, AinbR, AnbR, AcurrocR, AsrcR,
615 ABnprocsR );
616 AnpR = PB_Cnumroc( K, 0, Ainb1R, AnbR, AcurrocR, ArocR,
617 ABnprocsR );
618
619 BcurrocR = ( BisR ? -1 : MModAdd( BrocR, p, ABnprocsR ) );
620 BkkR = PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR,
621 ABnprocsR );
622 BnpR = PB_Cnumroc( K, 0, Binb1R, BnbR, BcurrocR, BrocR,
623 ABnprocsR );
624
625 PB_CVMinit( &VM, 0, BnpR, AnpR, Binb1R, Ainb1R, BnbR, AnbR,
626 p, q, maxp, maxq, lcmb );
627 npq = PB_CVMnpq( &VM );
628 }
629/*
630* Current virtual process (p,q) has something, find out how many rows or
631* columns could be used: ABrocs.
632*/
633 if( Wkbb == 0 ) { ABrocs = ( npq < kbb ? npq : kbb ); }
634 else { ABrocs = kbb - Wkbb; ABrocs = MIN( ABrocs, npq ); }
635/*
636* Find out how many rows or columns of sub( A ) and sub( B ) are contiguous
637*/
638 PB_CVMcontig( &VM, &nrpq, &ncpq, &Boff, &Aoff );
639
640 if( notran )
641 {
642/*
643* Compute the descriptor DBUFA for the buffer that will contained the packed
644* columns of sub( A ).
645*/
646 if( ( Afr = ( ncpq < ABrocs ) ) != 0 )
647 {
648/*
649* If columns of sub( A ) are not contiguous, then allocate the buffer and
650* pack the ABrocs columns of sub( A ).
651*/
652 Abufld = MAX( 1, AnpD );
653 if( AisR || ( ABmyprocR == AcurrocR ) )
654 {
655 Abuf = PB_Cmalloc( AnpD * ABrocs * size );
657 ABrocs, AnpD, one, Mptr( A, AiiD, AkkR, Ald,
658 size ), Ald, zero, Abuf, Abufld );
659 }
660 }
661 else
662 {
663/*
664* Otherwise, re-use sub( A ) directly.
665*/
666 Abufld = Ald;
667 if( AisR || ( ABmyprocR == AcurrocR ) )
668 Abuf = Mptr( A, AiiD, AkkR + Aoff, Ald, size );
669 }
670 PB_Cdescset( DBUFA, N, ABrocs, Ainb1D, ABrocs, AnbD, ABrocs,
671 ArocD, AcurrocR, ctxt, Abufld );
672/*
673* Replicate panels of columns of sub( A ) over sub( C )
674*/
675 PB_CInV2( TYPE, NOCONJG, COLUMN, N, N, Cd0, ABrocs, Abuf, 0, 0,
676 DBUFA, COLUMN, WAC, Wkbb, WACd0 );
677 if( Afr & ( AisR || ( ABmyprocR == AcurrocR ) ) )
678 if( Abuf ) free( Abuf );
679
680/*
681* Compute the descriptor DBUFB for the buffer that will contained the packed
682* columns of sub( B ).
683*/
684 if( ( Bfr = ( nrpq < ABrocs ) ) != 0 )
685 {
686/*
687* If columns of sub( B ) are not contiguous, then allocate the buffer and
688* pack the ABrocs columns of sub( B ).
689*/
690 Bbufld = MAX( 1, BnpD );
691 if( BisR || ( ABmyprocR == BcurrocR ) )
692 {
693 Bbuf = PB_Cmalloc( BnpD * ABrocs * size );
695 ABrocs, BnpD, one, Mptr( B, BiiD, BkkR, Bld,
696 size ), Bld, zero, Bbuf, Bbufld );
697 }
698 }
699 else
700 {
701/*
702* Otherwise, re-use sub( B ) directly.
703*/
704 Bbufld = Bld;
705 if( BisR || ( ABmyprocR == BcurrocR ) )
706 Bbuf = Mptr( B, BiiD, BkkR + Boff, Bld, size );
707 }
708 PB_Cdescset( DBUFB, N, ABrocs, Binb1D, ABrocs, BnbD, ABrocs,
709 BrocD, BcurrocR, ctxt, Bbufld );
710/*
711* Replicate panels of columns of sub( A ) over sub( C )
712*/
713 PB_CInV2( TYPE, NOCONJG, COLUMN, N, N, Cd0, ABrocs, Bbuf, 0, 0,
714 DBUFB, COLUMN, WBC, Wkbb, WBCd0 );
715 if( Bfr & ( BisR || ( ABmyprocR == BcurrocR ) ) )
716 if( Bbuf ) free( Bbuf );
717 }
718 else
719 {
720/*
721* Compute the descriptor DBUFA for the buffer that will contained the packed
722* rows of sub( A ).
723*/
724 if( ( Afr = ( ncpq < ABrocs ) ) != 0 )
725 {
726/*
727* If rows of sub( A ) are not contiguous, then allocate the buffer and
728* pack the ABrocs rows of sub( A ).
729*/
730 Abufld = ABrocs;
731 if( AisR || ( ABmyprocR == AcurrocR ) )
732 {
733 Abuf = PB_Cmalloc( AnpD * ABrocs * size );
735 ABrocs, AnpD, one, Mptr( A, AkkR, AiiD, Ald,
736 size ), Ald, zero, Abuf, Abufld );
737 }
738 }
739 else
740 {
741/*
742* Otherwise, re-use sub( A ) directly.
743*/
744 Abufld = Ald;
745 if( AisR || ( ABmyprocR == AcurrocR ) )
746 Abuf = Mptr( A, AkkR + Aoff, AiiD, Ald, size );
747 }
748 PB_Cdescset( DBUFA, ABrocs, N, ABrocs, Ainb1D, ABrocs, AnbD,
749 AcurrocR, ArocD, ctxt, Abufld );
750/*
751* Replicate panels of rows of sub( A ) over sub( C )
752*/
753 PB_CInV2( TYPE, NOCONJG, ROW, N, N, Cd0, ABrocs, Abuf, 0, 0,
754 DBUFA, ROW, WAR, Wkbb, WARd0 );
755 if( Afr & ( AisR || ( ABmyprocR == AcurrocR ) ) )
756 if( Abuf ) free( Abuf );
757/*
758* Compute the descriptor DBUFB for the buffer that will contained the packed
759* rows of sub( B ).
760*/
761 if( ( Bfr = ( nrpq < ABrocs ) ) != 0 )
762 {
763/*
764* If rows of sub( B ) are not contiguous, then allocate the buffer and
765* pack the ABrocs rows of sub( B ).
766*/
767 Bbufld = ABrocs;
768 if( BisR || ( ABmyprocR == BcurrocR ) )
769 {
770 Bbuf = PB_Cmalloc( BnpD * ABrocs * size );
772 ABrocs, BnpD, one, Mptr( B, BkkR, BiiD, Bld,
773 size ), Bld, zero, Bbuf, Bbufld );
774 }
775 }
776 else
777 {
778/*
779* Otherwise, re-use sub( B ) directly.
780*/
781 Bbufld = Bld;
782 if( BisR || ( ABmyprocR == BcurrocR ) )
783 Bbuf = Mptr( B, BkkR + Boff, BiiD, Bld, size );
784 }
785 PB_Cdescset( DBUFB, ABrocs, N, ABrocs, Binb1D, ABrocs, BnbD,
786 BcurrocR, BrocD, ctxt, Bbufld );
787/*
788* Replicate panels of rows of sub( B ) over sub( C )
789*/
790 PB_CInV2( TYPE, NOCONJG, ROW, N, N, Cd0, ABrocs, Bbuf, 0, 0,
791 DBUFB, ROW, WBR, Wkbb, WBRd0 );
792 if( Bfr & ( BisR || ( ABmyprocR == BcurrocR ) ) )
793 if( Bbuf ) free( Bbuf );
794 }
795/*
796* Update the local indexes of sub( A ) and sub( B )
797*/
798 PB_CVMupdate( &VM, ABrocs, &BkkR, &AkkR );
799/*
800* ABrocs rows or columns of sub( A ) and sub( B ) have been replicated,
801* update the number of diagonals in this virtual process as well as the
802* number of rows or columns of sub( A ) and sub( B ) that are in WA, WB.
803*/
804 npq -= ABrocs;
805 Wkbb += ABrocs;
806 }
807
808 if( notran )
809 {
810/*
811* WAR := WAC'
812*/
813 PB_CInV2( TYPE, CONJUG, ROW, N, N, Cd0, kbb, WAC, 0, 0, WACd0,
814 COLUMN, WAR, 0, WARd0 );
815/*
816* WBR := WBC'
817*/
818 PB_CInV2( TYPE, CONJUG, ROW, N, N, Cd0, kbb, WBC, 0, 0, WBCd0,
819 COLUMN, WBR, 0, WBRd0 );
820 }
821 else
822 {
823/*
824* WAC := WAR'
825*/
826 PB_CInV2( TYPE, CONJUG, COLUMN, N, N, Cd0, kbb, WAR, 0, 0, WARd0,
827 ROW, WAC, 0, WACd0 );
828/*
829* WBC := WBR'
830*/
831 PB_CInV2( TYPE, CONJUG, COLUMN, N, N, Cd0, kbb, WBR, 0, 0, WBRd0,
832 ROW, WBC, 0, WBCd0 );
833 }
834/*
835* Perform the local update if I own some data
836*/
837 if( ( Cmp > 0 ) && ( Cnq > 0 ) )
838 {
839 WACld = WACd0[LLD_]; WBCld = WBCd0[LLD_];
840 WARld = WARd0[LLD_]; WBRld = WBRd0[LLD_];
841
842 if( upper )
843 {
844 for( l = 0; l < N; l += Clcmb )
845 {
846 lb = N - l; lb = MIN( lb, Clcmb );
847 Clp = PB_Cnumroc( l, 0, Cimb1, Cmb, myrow, Crow, nprow );
848 Clq = PB_Cnumroc( l, 0, Cinb1, Cnb, mycol, Ccol, npcol );
849 Cnq0 = PB_Cnumroc( lb, l, Cinb1, Cnb, mycol, Ccol, npcol );
850 if( Clp > 0 && Cnq0 > 0 )
851 {
852 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Clp, &Cnq0,
853 &kbb, ALPHA, WAC, &WACld, Mptr( WBR, 0, Clq, WBRld,
854 size ), &WBRld, one, Mptr( Cptr, 0, Clq, Cld, size ),
855 &Cld );
856 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Clp, &Cnq0,
857 &kbb, talpha, WBC, &WBCld, Mptr( WAR, 0, Clq, WARld,
858 size ), &WARld, one, Mptr( Cptr, 0, Clq, Cld, size ),
859 &Cld );
860 }
861 PB_Cpsyr2( TYPE, UPPER, lb, kbb, ALPHA, Mptr( WAC, Clp, 0, WACld,
862 size ), WACld, Mptr( WAR, 0, Clq, WARld, size ),
863 WARld, Mptr( WBC, Clp, 0, WBCld, size ), WBCld,
864 Mptr( WBR, 0, Clq, WBRld, size ), WBRld, Cptr, l, l,
865 Cd0, tzsyr2k );
866 }
867 }
868 else
869 {
870 for( l = 0; l < N; l += Clcmb )
871 {
872 lb = N - l; ltmp = l + ( lb = MIN( lb, Clcmb ) );
873 Clp = PB_Cnumroc( l, 0, Cimb1, Cmb, myrow, Crow, nprow );
874 Clq = PB_Cnumroc( l, 0, Cinb1, Cnb, mycol, Ccol, npcol );
875
876 PB_Cpsyr2( TYPE, LOWER, lb, kbb, ALPHA, Mptr( WAC, Clp, 0, WACld,
877 size ), WACld, Mptr( WAR, 0, Clq, WARld, size ),
878 WARld, Mptr( WBC, Clp, 0, WBCld, size ), WBCld,
879 Mptr( WBR, 0, Clq, WBRld, size ), WBRld, Cptr, l, l,
880 Cd0, tzsyr2k );
881 Clp = PB_Cnumroc( ltmp, 0, Cimb1, Cmb, myrow, Crow, nprow );
882 Cmp0 = Cmp - Clp;
883 Cnq0 = PB_Cnumroc( lb, l, Cinb1, Cnb, mycol, Ccol, npcol );
884 if( Cmp0 > 0 && Cnq0 > 0 )
885 {
886 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq0,
887 &kbb, ALPHA, Mptr( WAC, Clp, 0, WACld, size ), &WACld,
888 Mptr( WBR, 0, Clq, WBRld, size ), &WBRld, one,
889 Mptr( Cptr, Clp, Clq, Cld, size ), &Cld );
890 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq0,
891 &kbb, talpha, Mptr( WBC, Clp, 0, WBCld, size ), &WBCld,
892 Mptr( WAR, 0, Clq, WARld, size ), &WARld, one,
893 Mptr( Cptr, Clp, Clq, Cld, size ), &Cld );
894 }
895 }
896 }
897 }
898
899 Wkbb = 0;
900 }
901
902 if( WACfr ) free( WAC );
903 if( WARfr ) free( WAR );
904 if( WBCfr ) free( WBC );
905 if( WBRfr ) free( WBR );
906
907 if( conjg && ( Cmp > 0 ) && ( Cnq > 0 ) ) free( talpha );
908/*
909* End of PB_Cpsyr2kA
910*/
911}
#define Int
Definition Bconfig.h:22
void(* TZSYR2_T)()
Definition pblas.h:430
F_VOID_FCT(* GEMM_T)()
Definition pblas.h:317
#define C2F_CHAR(a)
Definition pblas.h:125
#define COLUMN
Definition PBblacs.h:45
#define ROW
Definition PBblacs.h:46
void Cblacs_gridinfo()
#define NOTRAN
Definition PBblas.h:44
#define CCONJG
Definition PBblas.h:21
#define NOINIT
Definition PBblas.h:62
#define NOCONJG
Definition PBblas.h:45
#define CUPPER
Definition PBblas.h:26
#define CNOTRAN
Definition PBblas.h:18
#define LOWER
Definition PBblas.h:51
#define UPPER
Definition PBblas.h:52
#define CFORWARD
Definition PBblas.h:38
#define pilaenv_
Definition PBpblas.h:44
#define CTXT_
Definition PBtools.h:38
void PB_CVMinit()
Int PB_Cfirstnb()
#define MAX(a_, b_)
Definition PBtools.h:77
#define MB_
Definition PBtools.h:43
void PB_CInV2()
char * PB_Cmalloc()
void PB_Cinfog2l()
#define MModSub(I1, I2, d)
Definition PBtools.h:102
#define PACKING
Definition PBtools.h:53
#define MIN(a_, b_)
Definition PBtools.h:76
#define Mptr(a_, i_, j_, lda_, siz_)
Definition PBtools.h:132
void PB_Ctzsyr2k()
#define LLD_
Definition PBtools.h:47
Int PB_Cnumroc()
Int PB_CVMpack()
void PB_CInV()
void PB_Cpsyr2kA()
void PB_Cplascal()
void PB_CVMupdate()
#define RSRC_
Definition PBtools.h:45
void PB_Cdescset()
void PB_COutV()
#define MModAdd1(I, d)
Definition PBtools.h:100
#define MModAdd(I1, I2, d)
Definition PBtools.h:97
#define INB_
Definition PBtools.h:42
Int PB_CVMnpq()
#define MModSub1(I, d)
Definition PBtools.h:105
void PB_Ctzher2k()
#define CSRC_
Definition PBtools.h:46
Int PB_Clcm()
#define IMB_
Definition PBtools.h:41
Int PB_Cindxg2p()
Int PB_Cg2lrem()
void PB_Cpsyr2()
#define Mupcase(C)
Definition PBtools.h:83
#define DLEN_
Definition PBtools.h:48
#define NB_
Definition PBtools.h:44
void PB_Cconjg()
void PB_CVMcontig()
Int PB_Cspan()
void PB_Cdescribe()
#define TYPE
Definition clamov.c:7