SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
Loading...
Searching...
No Matches
PB_CptrsmB.c
Go to the documentation of this file.
1/* ---------------------------------------------------------------------
2*
3* -- PBLAS auxiliary routine (version 2.0) --
4* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5* and University of California, Berkeley.
6* April 1, 1998
7*
8* ---------------------------------------------------------------------
9*/
10/*
11* Include files
12*/
13#include "../pblas.h"
14#include "../PBpblas.h"
15#include "../PBtools.h"
16#include "../PBblacs.h"
17#include "../PBblas.h"
18
19#ifdef __STDC__
20void PB_CptrsmB( PBTYP_T * TYPE, char * DIRECB, char * SIDE,
21 char * UPLO, char * TRANSA, char * DIAG, Int M, Int N,
22 char * ALPHA, char * A, Int IA, Int JA, Int * DESCA,
23 char * B, Int IB, Int JB, Int * DESCB )
24#else
25void PB_CptrsmB( TYPE, DIRECB, SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A,
26 IA, JA, DESCA, B, IB, JB, DESCB )
27/*
28* .. Scalar Arguments ..
29*/
30 char * DIAG, * DIRECB, * SIDE, * TRANSA, * UPLO;
31 Int IA, IB, JA, JB, M, N;
32 char * ALPHA;
33 PBTYP_T * TYPE;
34/*
35* .. Array Arguments ..
36*/
37 Int * DESCA, * DESCB;
38 char * A, * B;
39#endif
40{
41/*
42* Purpose
43* =======
44*
45* PB_CptrsmB solves one of the matrix equations
46*
47* op( sub( A ) )*X = alpha*sub( B ), or
48*
49* X*op( sub( A ) ) = alpha*sub( B ),
50*
51* where
52*
53* sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L',
54* A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and,
55*
56* sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1).
57*
58* Alpha is a scalar, X and sub( B ) are m by n submatrices, sub( A ) is
59* a unit, or non-unit, upper or lower triangular submatrix and op( Y )
60* is one of
61*
62* op( Y ) = Y or op( Y ) = Y' or op( Y ) = conjg( Y' ).
63*
64* The submatrix X is overwritten on sub( B ).
65*
66* This is the inner-product algorithm using the logical LCM hybrid
67* and static blocking techniques. The submatrix operand sub( A ) stays
68* in place.
69*
70* Notes
71* =====
72*
73* A description vector is associated with each 2D block-cyclicly dis-
74* tributed matrix. This vector stores the information required to
75* establish the mapping between a matrix entry and its corresponding
76* process and memory location.
77*
78* In the following comments, the character _ should be read as
79* "of the distributed matrix". Let A be a generic term for any 2D
80* block cyclicly distributed matrix. Its description vector is DESC_A:
81*
82* NOTATION STORED IN EXPLANATION
83* ---------------- --------------- ------------------------------------
84* DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
85* CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
86* the NPROW x NPCOL BLACS process grid
87* A is distributed over. The context
88* itself is global, but the handle
89* (the integer value) may vary.
90* M_A (global) DESCA[ M_ ] The number of rows in the distribu-
91* ted matrix A, M_A >= 0.
92* N_A (global) DESCA[ N_ ] The number of columns in the distri-
93* buted matrix A, N_A >= 0.
94* IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
95* block of the matrix A, IMB_A > 0.
96* INB_A (global) DESCA[ INB_ ] The number of columns of the upper
97* left block of the matrix A,
98* INB_A > 0.
99* MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
100* bute the last M_A-IMB_A rows of A,
101* MB_A > 0.
102* NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
103* bute the last N_A-INB_A columns of
104* A, NB_A > 0.
105* RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
106* row of the matrix A is distributed,
107* NPROW > RSRC_A >= 0.
108* CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
109* first column of A is distributed.
110* NPCOL > CSRC_A >= 0.
111* LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
112* array storing the local blocks of
113* the distributed matrix A,
114* IF( Lc( 1, N_A ) > 0 )
115* LLD_A >= MAX( 1, Lr( 1, M_A ) )
116* ELSE
117* LLD_A >= 1.
118*
119* Let K be the number of rows of a matrix A starting at the global in-
120* dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
121* that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
122* receive if these K rows were distributed over NPROW processes. If K
123* is the number of columns of a matrix A starting at the global index
124* JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
125* lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
126* these K columns were distributed over NPCOL processes.
127*
128* The values of Lr() and Lc() may be determined via a call to the func-
129* tion PB_Cnumroc:
130* Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
131* Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
132*
133* Arguments
134* =========
135*
136* TYPE (local input) pointer to a PBTYP_T structure
137* On entry, TYPE is a pointer to a structure of type PBTYP_T,
138* that contains type information (See pblas.h).
139*
140* DIRECB (global input) pointer to CHAR
141* On entry, DIRECB specifies the direction in which the rows
142* or columns of sub( B ) should be looped over as follows:
143* DIRECB = 'F' or 'f' forward or increasing,
144* DIRECB = 'B' or 'b' backward or decreasing.
145*
146* SIDE (global input) pointer to CHAR
147* On entry, SIDE specifies whether op( sub( A ) ) appears on
148* the left or right of X as follows:
149*
150* SIDE = 'L' or 'l' op( sub( A ) )*X = alpha*sub( B ),
151*
152* SIDE = 'R' or 'r' X*op( sub( A ) ) = alpha*sub( B ).
153*
154* UPLO (global input) pointer to CHAR
155* On entry, UPLO specifies whether the submatrix sub( A ) is
156* an upper or lower triangular submatrix as follows:
157*
158* UPLO = 'U' or 'u' sub( A ) is an upper triangular
159* submatrix,
160*
161* UPLO = 'L' or 'l' sub( A ) is a lower triangular
162* submatrix.
163*
164* TRANSA (global input) pointer to CHAR
165* On entry, TRANSA specifies the form of op( sub( A ) ) to be
166* used in the matrix multiplication as follows:
167*
168* TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ),
169*
170* TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )',
171*
172* TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ).
173*
174* DIAG (global input) pointer to CHAR
175* On entry, DIAG specifies whether or not sub( A ) is unit
176* triangular as follows:
177*
178* DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian-
179* gular,
180*
181* DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri-
182* angular.
183*
184* M (global input) INTEGER
185* On entry, M specifies the number of rows of the submatrix
186* sub( B ). M must be at least zero.
187*
188* N (global input) INTEGER
189* On entry, N specifies the number of columns of the submatrix
190* sub( B ). N must be at least zero.
191*
192* ALPHA (global input) pointer to CHAR
193* On entry, ALPHA specifies the scalar alpha. When ALPHA is
194* supplied as zero then the local entries of the array B
195* corresponding to the entries of the submatrix sub( B ) need
196* not be set on input.
197*
198* A (local input) pointer to CHAR
199* On entry, A is an array of dimension (LLD_A, Ka), where Ka is
200* at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at
201* least Lc( 1, JA+N-1 ) otherwise. Before entry, this array
202* contains the local entries of the matrix A.
203* Before entry with UPLO = 'U' or 'u', this array contains the
204* local entries corresponding to the entries of the upper tri-
205* angular submatrix sub( A ), and the local entries correspon-
206* ding to the entries of the strictly lower triangular part of
207* the submatrix sub( A ) are not referenced.
208* Before entry with UPLO = 'L' or 'l', this array contains the
209* local entries corresponding to the entries of the lower tri-
210* angular submatrix sub( A ), and the local entries correspon-
211* ding to the entries of the strictly upper triangular part of
212* the submatrix sub( A ) are not referenced.
213* Note that when DIAG = 'U' or 'u', the local entries corres-
214* ponding to the diagonal elements of the submatrix sub( A )
215* are not referenced either, but are assumed to be unity.
216*
217* IA (global input) INTEGER
218* On entry, IA specifies A's global row index, which points to
219* the beginning of the submatrix sub( A ).
220*
221* JA (global input) INTEGER
222* On entry, JA specifies A's global column index, which points
223* to the beginning of the submatrix sub( A ).
224*
225* DESCA (global and local input) INTEGER array
226* On entry, DESCA is an integer array of dimension DLEN_. This
227* is the array descriptor for the matrix A.
228*
229* B (local input/local output) pointer to CHAR
230* On entry, B is an array of dimension (LLD_B, Kb), where Kb is
231* at least Lc( 1, JB+N-1 ). Before entry, this array contains
232* the local entries of the matrix B.
233* On exit, the local entries of this array corresponding to the
234* to the entries of the submatrix sub( B ) are overwritten by
235* the local entries of the m by n solution submatrix.
236*
237* IB (global input) INTEGER
238* On entry, IB specifies B's global row index, which points to
239* the beginning of the submatrix sub( B ).
240*
241* JB (global input) INTEGER
242* On entry, JB specifies B's global column index, which points
243* to the beginning of the submatrix sub( B ).
244*
245* DESCB (global and local input) INTEGER array
246* On entry, DESCB is an integer array of dimension DLEN_. This
247* is the array descriptor for the matrix B.
248*
249* -- Written on April 1, 1998 by
250* Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
251*
252* ---------------------------------------------------------------------
253*/
254/*
255* .. Local Scalars ..
256*/
257 char Broc, TranOp, conjg, * negone, * one, * talpha, * talph0, top,
258 * zero;
259 Int Acol, Aii, Aimb1, Ainb1, Ajj, Akp, Akq, Alcmb, Ald, Amb, An,
260 Anb, Anp, Anp0, Anq, Anq0, Arow, Asrc, Astart, BcurrocR, Bfwd,
261 BiiD, BiiR, Binb1D, Binb1R, BisR, Bld, BmyprocD, BmyprocR,
262 BnD, BnR, BnbD, BnbR, BnpR, BnprocsD, BnprocsR, BrocD, BrocR,
263 BsrcR, LNorRT, WBCfr, WBCld, WBCapbX, WBCsum, WBRfr, WBRld,
264 WBRapbX, WBRsum, ctxt, izero=0, k, kb, kbnext, kbprev, ktmp,
265 lside, mycol, myrow, n, nb, nbb, notran, npcol, nprow, p=0,
266 size, tmp, upper;
267 TZPAD_T pad;
268 GEMM_T gemm;
269 GSUM2D_T gsum2d;
270/*
271* .. Local Arrays ..
272*/
273 Int Ad0[DLEN_], DBUFB[DLEN_], WBCd[DLEN_], WBRd[DLEN_];
274 char * Aptr = NULL, * Bptr = NULL, * WBC = NULL, * WBR = NULL;
275/* ..
276* .. Executable Statements ..
277*
278*/
279/*
280* Retrieve process grid information
281*/
282 Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
283
284 Bfwd = ( Mupcase( DIRECB[0] ) == CFORWARD );
285 lside = ( Mupcase( SIDE [0] ) == CLEFT );
286 upper = ( Mupcase( UPLO [0] ) == CUPPER );
287 notran = ( ( TranOp = Mupcase( TRANSA[0] ) ) == CNOTRAN );
288 LNorRT = ( lside && notran ) || ( !( lside ) && !( notran ) );
289
290 size = TYPE->size;
291 one = TYPE->one; zero = TYPE->zero; negone = TYPE->negone;
292 pad = TYPE->Ftzpad; gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d;
293 nb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
294/*
295* Compute local information for sub( A ) and sub( B )
296*/
297 if( lside )
298 {
299 BnD = An = M; BnR = N; Broc = CCOLUMN;
300 BmyprocD = myrow; BnprocsD = nprow;
301 BmyprocR = mycol; BnprocsR = npcol;
302 BnbD = DESCB[MB_ ]; BnbR = DESCB[NB_ ];
303 BsrcR = DESCB[CSRC_]; Bld = DESCB[LLD_];
304 PB_Cinfog2l( IB, JB, DESCB, BnprocsD, BnprocsR, BmyprocD, BmyprocR,
305 &BiiD, &BiiR, &BrocD, &BrocR );
306 Binb1D = PB_Cfirstnb( BnD, IB, DESCB[IMB_], BnbD );
307 Binb1R = PB_Cfirstnb( BnR, JB, DESCB[INB_], BnbR );
308 }
309 else
310 {
311 BnD = An = N; BnR = M; Broc = CROW;
312 BmyprocD = mycol; BnprocsD = npcol;
313 BmyprocR = myrow; BnprocsR = nprow;
314 BnbR = DESCB[MB_ ]; BnbD = DESCB[NB_ ];
315 BsrcR = DESCB[RSRC_]; Bld = DESCB[LLD_];
316 PB_Cinfog2l( IB, JB, DESCB, BnprocsR, BnprocsD, BmyprocR, BmyprocD,
317 &BiiR, &BiiD, &BrocR, &BrocD );
318 Binb1D = PB_Cfirstnb( BnD, JB, DESCB[INB_], BnbD );
319 Binb1R = PB_Cfirstnb( BnR, IB, DESCB[IMB_], BnbR );
320 }
321/*
322* Compute descriptor Ad0 for sub( A )
323*/
324 PB_Cdescribe( An, An, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj,
325 &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 );
326/*
327* Compute conjugate of alpha for the conjugate transpose cases
328*/
329 if( TranOp == CCOTRAN )
330 {
331 conjg = CCONJG; talpha = PB_Cmalloc( size );
332 PB_Cconjg( TYPE, ALPHA, talpha );
333 }
334 else { conjg = CNOCONJG; talpha = ALPHA; }
335/*
336* Retrieve BLACS combine topology, select backward ot forward substitution.
337*/
338 if( LNorRT )
339 {
340 top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
341 Astart = ( upper ? An - 1 : 0 );
342 }
343 else
344 {
345 top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
346 Astart = ( upper ? 0 : An - 1 );
347 }
348/*
349* Computational partitioning size is computed as the product of the logical
350* value returned by pilaenv_ and 2 * lcm( nprow, npcol ).
351*/
352 Alcmb = 2 * nb * PB_Clcm( ( Arow >= 0 ? nprow : 1 ),
353 ( Acol >= 0 ? npcol : 1 ) );
354/*
355* When sub( B ) is not replicated and backward pass on sub( B ), find the
356* virtual process p owning the last row or column of sub( B ).
357*/
358 if( !( BisR = ( ( BsrcR < 0 ) || ( BnprocsR == 1 ) ) ) && !( Bfwd ) )
359 {
360 tmp = PB_Cindxg2p( BnR - 1, Binb1R, BnbR, BrocR, BrocR, BnprocsR );
361 p = MModSub( tmp, BrocR, BnprocsR );
362 }
363/*
364* Loop over the processes rows or columns owning the BnR rows or columns of
365* sub( B ) to be processed.
366*/
367 n = BnR;
368
369 while( n > 0 )
370 {
371/*
372* Find out who is the active process row or column as well as the number of
373* rows or columns of sub( B ) it owns.
374*/
375 BcurrocR = ( BisR ? -1 : MModAdd( BrocR, p, BnprocsR ) );
376 BnpR = PB_Cnumroc( BnR, 0, Binb1R, BnbR, BcurrocR, BrocR, BnprocsR );
377
378 n -= BnpR;
379/*
380* Re-adjust the number of rows or columns to be handled at each step, in order
381* to average the message sizes and the computational granularity.
382*/
383 if( BnpR ) nbb = BnpR / ( ( BnpR - 1 ) / nb + 1 );
384
385 while( BnpR )
386 {
387 nbb = MIN( nbb, BnpR );
388/*
389* Describe the local contiguous panel of sub( B )
390*/
391 if( lside )
392 {
393 PB_Cdescset( DBUFB, BnD, nbb, Binb1D, nbb, BnbD, BnbR, BrocD,
394 BcurrocR, ctxt, Bld );
395 if( BisR || ( BmyprocR == BcurrocR ) )
396 Bptr = Mptr( B, BiiD, BiiR, Bld, size );
397 }
398 else
399 {
400 PB_Cdescset( DBUFB, nbb, BnD, nbb, Binb1D, BnbR, BnbD, BcurrocR,
401 BrocD, ctxt, Bld );
402 if( BisR || ( BmyprocR == BcurrocR ) )
403 Bptr = Mptr( B, BiiR, BiiD, Bld, size );
404 }
405
406 talph0 = talpha;
407
408 if( LNorRT )
409 {
410/*
411* Reuse sub( B ) and/or create vector WBC in process column owning the first
412* or last column of sub( A )
413*/
414 PB_CInOutV2( TYPE, &conjg, COLUMN, An, An, Astart, Ad0, nbb, Bptr,
415 0, 0, DBUFB, &Broc, &WBC, WBCd, &WBCfr, &WBCsum,
416 &WBCapbX );
417/*
418* Create WBR in process rows spanned by sub( A )
419*/
420 PB_COutV( TYPE, ROW, INIT, An, An, Ad0, nbb, &WBR, WBRd, &WBRfr,
421 &WBRsum );
422/*
423* Retrieve local quantities related to sub( A ) -> Ad0
424*/
425 Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
426 Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
427 Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
428
429 Anp = PB_Cnumroc( An, 0, Aimb1, Amb, myrow, Arow, nprow );
430 Anq = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol );
431 if( ( Anp > 0 ) && ( Anq > 0 ) )
432 Aptr = Mptr( A, Aii, Ajj, Ald, size );
433
434 WBCld = WBCd[LLD_]; WBRld = WBRd[LLD_];
435
436 if( upper )
437 {
438/*
439* sub( A ) is upper triangular
440*/
441 for( k = ( Astart / Alcmb ) * Alcmb; k >= 0; k -= Alcmb )
442 {
443 ktmp = An - k; kb = MIN( ktmp, Alcmb );
444/*
445* Solve logical diagonal block, WBC contains the solution scattered in multiple
446* process columns and WBR contains the solution replicated in the process rows.
447*/
448 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
449 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
450 PB_Cptrsm( TYPE, WBRsum, SIDE, UPLO, TRANSA, DIAG, kb, nbb,
451 talph0, Aptr, k, k, Ad0, Mptr( WBC, Akp, 0, WBCld,
452 size ), WBCld, Mptr( WBR, 0, Akq, WBRld, size ),
453 WBRld );
454/*
455* Update: only the part of sub( B ) to be solved at the next step is locally
456* updated and combined, the remaining part of the matrix to be solved later
457* is only locally updated.
458*/
459 if( Akp > 0 )
460 {
461 Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
462 if( WBCsum )
463 {
464 kbprev = MIN( k, Alcmb );
465 ktmp = PB_Cnumroc( kbprev, k-kbprev, Aimb1, Amb,
466 myrow, Arow, nprow );
467 Akp -= ktmp;
468
469 if( ktmp > 0 )
470 {
471 if( Anq0 > 0 )
472 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &ktmp,
473 &nbb, &Anq0, negone, Mptr( Aptr, Akp, Akq,
474 Ald, size ), &Ald, Mptr( WBR, 0, Akq, WBRld,
475 size ), &WBRld, talph0, Mptr( WBC, Akp, 0,
476 WBCld, size ), &WBCld );
477 Asrc = PB_Cindxg2p( k-1, Ainb1, Anb, Acol, Acol,
478 npcol );
479 gsum2d( ctxt, ROW, &top, ktmp, nbb, Mptr( WBC, Akp,
480 0, WBCld, size ), WBCld, myrow, Asrc );
481 if( mycol != Asrc )
482 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &ktmp,
483 &nbb, &izero, zero, zero, Mptr( WBC, Akp, 0,
484 WBCld, size ), &WBCld );
485 }
486 if( ( Akp > 0 ) && ( Anq0 > 0 ) )
487 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &Akp,
488 &nbb, &Anq0, negone, Mptr( Aptr, 0, Akq, Ald,
489 size ), &Ald, Mptr( WBR, 0, Akq, WBRld, size ),
490 &WBRld, talph0, WBC, &WBCld );
491 }
492 else
493 {
494 if( Anq0 > 0 )
495 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &Akp,
496 &nbb, &Anq0, negone, Mptr( Aptr, 0, Akq, Ald,
497 size ), &Ald, Mptr( WBR, 0, Akq, WBRld, size ),
498 &WBRld, talph0, WBC, &WBCld );
499 }
500 }
501 talph0 = one;
502 }
503 }
504 else
505 {
506/*
507* sub( A ) is lower triangular
508*/
509 for( k = 0; k < An; k += Alcmb )
510 {
511 ktmp = An - k; kb = MIN( ktmp, Alcmb );
512/*
513* Solve logical diagonal block, WBC contains the solution scattered in multiple
514* process columns and WBR contains the solution replicated in the process rows.
515*/
516 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
517 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
518 PB_Cptrsm( TYPE, WBRsum, SIDE, UPLO, TRANSA, DIAG, kb, nbb,
519 talph0, Aptr, k, k, Ad0, Mptr( WBC, Akp, 0, WBCld,
520 size ), WBCld, Mptr( WBR, 0, Akq, WBRld, size ),
521 WBRld );
522/*
523* Update: only the part of sub( B ) to be solved at the next step is locally
524* updated and combined, the remaining part of the matrix to be solved later is
525* only locally updated.
526*/
527 Akp = PB_Cnumroc( k+kb, 0, Aimb1, Amb, myrow, Arow, nprow );
528 if( ( Anp0 = Anp - Akp ) > 0 )
529 {
530 Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
531 if( WBCsum )
532 {
533 kbnext = ktmp - kb;
534 kbnext = MIN( kbnext, Alcmb );
535 ktmp = PB_Cnumroc( kbnext, k+kb, Aimb1, Amb, myrow,
536 Arow, nprow );
537 Anp0 -= ktmp;
538
539 if( ktmp > 0 )
540 {
541 if( Anq0 > 0 )
542 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &ktmp,
543 &nbb, &Anq0, negone, Mptr( Aptr, Akp, Akq,
544 Ald, size ), &Ald, Mptr( WBR, 0, Akq, WBRld,
545 size ), &WBRld, talph0, Mptr( WBC, Akp, 0,
546 WBCld, size ), &WBCld );
547 Asrc = PB_Cindxg2p( k+kb, Ainb1, Anb, Acol, Acol,
548 npcol );
549 gsum2d( ctxt, ROW, &top, ktmp, nbb, Mptr( WBC, Akp,
550 0, WBCld, size ), WBCld, myrow, Asrc );
551 if( mycol != Asrc )
552 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &ktmp,
553 &nbb, &izero, zero, zero, Mptr( WBC, Akp, 0,
554 WBCld, size ), &WBCld );
555 }
556 if( ( Anp0 > 0 ) && ( Anq0 > 0 ) )
557 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &Anp0,
558 &nbb, &Anq0, negone, Mptr( Aptr, Akp+ktmp, Akq,
559 Ald, size ), &Ald, Mptr( WBR, 0, Akq, WBRld,
560 size ), &WBRld, talph0, Mptr( WBC, Akp+ktmp, 0,
561 WBCld, size ), &WBCld );
562 }
563 else
564 {
565 if( Anq0 > 0 )
566 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &Anp0,
567 &nbb, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald,
568 size ), &Ald, Mptr( WBR, 0, Akq, WBRld, size ),
569 &WBRld, talph0, Mptr( WBC, Akp, 0, WBCld,
570 size ), &WBCld );
571 }
572 }
573 talph0 = one;
574 }
575 }
576/*
577* Combine the scattered resulting matrix WBC
578*/
579 if( WBCsum && ( Anp > 0 ) )
580 gsum2d( ctxt, ROW, &top, Anp, nbb, WBC, WBCld, myrow,
581 WBCd[CSRC_] );
582/*
583* sub( B ) := WBC (if necessary)
584*/
585 if( WBCapbX )
586 PB_Cpaxpby( TYPE, &conjg, An, nbb, one, WBC, 0, 0, WBCd, COLUMN,
587 zero, Bptr, 0, 0, DBUFB, &Broc );
588 }
589 else
590 {
591/*
592* Reuse sub( B ) and/or create vector WBR in process row owning the first or
593* last row of sub( A )
594*/
595 PB_CInOutV2( TYPE, &conjg, ROW, An, An, Astart, Ad0, nbb, Bptr,
596 0, 0, DBUFB, &Broc, &WBR, WBRd, &WBRfr, &WBRsum,
597 &WBRapbX );
598/*
599* Create WBC in process columns spanned by sub( A )
600*/
601 PB_COutV( TYPE, COLUMN, INIT, An, An, Ad0, nbb, &WBC, WBCd, &WBCfr,
602 &WBCsum );
603/*
604* Retrieve local quantities related to Ad0 -> sub( A )
605*/
606 Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
607 Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
608 Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
609
610 Anp = PB_Cnumroc( An, 0, Aimb1, Amb, myrow, Arow, nprow );
611 Anq = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol );
612 if( ( Anp > 0 ) && ( Anq > 0 ) )
613 Aptr = Mptr( A, Aii, Ajj, Ald, size );
614
615 WBCld = WBCd[LLD_]; WBRld = WBRd[LLD_];
616
617 if( upper )
618 {
619/*
620* sub( A ) is upper triangular
621*/
622 for( k = 0; k < An; k += Alcmb )
623 {
624 ktmp = An - k; kb = MIN( ktmp, Alcmb );
625/*
626* Solve logical diagonal block, WBR contains the solution scattered in multiple
627* process rows and WBC contains the solution replicated in the process columns.
628*/
629 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
630 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
631 PB_Cptrsm( TYPE, WBCsum, SIDE, UPLO, TRANSA, DIAG, nbb, kb,
632 talph0, Aptr, k, k, Ad0, Mptr( WBC, Akp, 0, WBCld,
633 size ), WBCld, Mptr( WBR, 0, Akq, WBRld, size ),
634 WBRld );
635/*
636* Update: only the part of sub( B ) to be solved at the next step is locally
637* updated and combined, the remaining part of the matrix to be solved later is
638* only locally updated.
639*/
640 Akq = PB_Cnumroc( k+kb, 0, Ainb1, Anb, mycol, Acol, npcol );
641 if( ( Anq0 = Anq - Akq ) > 0 )
642 {
643 Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow );
644 if( WBRsum )
645 {
646 kbnext = ktmp - kb;
647 kbnext = MIN( kbnext, Alcmb );
648 ktmp = PB_Cnumroc( kbnext, k+kb, Ainb1, Anb, mycol,
649 Acol, npcol );
650 Anq0 -= ktmp;
651
652 if( ktmp > 0 )
653 {
654 if( Anp0 > 0 )
655 gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
656 &ktmp, &Anp0, negone, Mptr( WBC, Akp, 0,
657 WBCld, size ), &WBCld, Mptr( Aptr, Akp, Akq,
658 Ald, size ), &Ald, talph0, Mptr( WBR, 0,
659 Akq, WBRld, size ), &WBRld );
660 Asrc = PB_Cindxg2p( k+kb, Aimb1, Amb, Arow, Arow,
661 nprow );
662 gsum2d( ctxt, COLUMN, &top, nbb, ktmp, Mptr( WBR, 0,
663 Akq, WBRld, size ), WBRld, Asrc, mycol );
664 if( myrow != Asrc )
665 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &nbb,
666 &ktmp, &izero, zero, zero, Mptr( WBR, 0, Akq,
667 WBRld, size ), &WBRld );
668 }
669 if( ( Anp0 > 0 ) && ( Anq0 > 0 ) )
670 gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
671 &Anq0, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld,
672 size ), &WBCld, Mptr( Aptr, Akp, Akq+ktmp, Ald,
673 size ), &Ald, talph0, Mptr( WBR, 0, Akq+ktmp,
674 WBRld, size ), &WBRld );
675 }
676 else
677 {
678 if( Anp0 > 0 )
679 gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
680 &Anq0, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld,
681 size ), &WBCld, Mptr( Aptr, Akp, Akq, Ald,
682 size ), &Ald, talph0, Mptr( WBR, 0, Akq, WBRld,
683 size ), &WBRld );
684 }
685 }
686 talph0 = one;
687 }
688 }
689 else
690 {
691/*
692* sub( A ) is lower triangular
693*/
694 for( k = ( Astart / Alcmb ) * Alcmb; k >= 0; k -= Alcmb )
695 {
696 ktmp = An - k; kb = MIN( ktmp, Alcmb );
697/*
698* Solve logical diagonal block, WBR contains the solution scattered in multiple
699* process rows and WBC contains the solution replicated in the process columns.
700*/
701 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
702 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
703 PB_Cptrsm( TYPE, WBCsum, SIDE, UPLO, TRANSA, DIAG, nbb, kb,
704 talph0, Aptr, k, k, Ad0, Mptr( WBC, Akp, 0, WBCld,
705 size ), WBCld, Mptr( WBR, 0, Akq, WBRld, size ),
706 WBRld );
707/*
708* Update: only the part of sub( B ) to be solved at the next step is locally
709* updated and combined, the remaining part of the matrix to be solved later
710* is only locally updated.
711*/
712 if( Akq > 0 )
713 {
714 Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow );
715 if( WBRsum )
716 {
717 kbprev = MIN( k, Alcmb );
718 ktmp = PB_Cnumroc( kbprev, k-kbprev, Ainb1, Anb,
719 mycol, Acol, npcol );
720 Akq -= ktmp;
721
722 if( ktmp > 0 )
723 {
724 if( Anp0 > 0 )
725 gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
726 &ktmp, &Anp0, negone, Mptr( WBC, Akp, 0,
727 WBCld, size ), &WBCld, Mptr( Aptr, Akp, Akq,
728 Ald, size ), &Ald, talph0, Mptr( WBR, 0,
729 Akq, WBRld, size ), &WBRld );
730 Asrc = PB_Cindxg2p( k-1, Aimb1, Amb, Arow, Arow,
731 nprow );
732 gsum2d( ctxt, COLUMN, &top, nbb, ktmp, Mptr( WBR, 0,
733 Akq, WBRld, size ), WBRld, Asrc, mycol );
734 if( myrow != Asrc )
735 pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &nbb,
736 &ktmp, &izero, zero, zero, Mptr( WBR, 0, Akq,
737 WBRld, size ), &WBRld );
738 }
739 if( ( Anp0 > 0 ) && ( Akq > 0 ) )
740 gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
741 &Akq, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld,
742 size ), &WBCld, Mptr( Aptr, Akp, 0, Ald,
743 size ), &Ald, talph0, WBR, &WBRld );
744 }
745 else
746 {
747 if( Anp0 > 0 )
748 gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
749 &Akq, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld,
750 size ), &WBCld, Mptr( Aptr, Akp, 0, Ald,
751 size ), &Ald, talph0, WBR, &WBRld );
752 }
753 }
754 talph0 = one;
755 }
756 }
757/*
758* Combine the scattered resulting matrix WBR
759*/
760 if( WBRsum && ( Anq > 0 ) )
761 gsum2d( ctxt, COLUMN, &top, nbb, Anq, WBR, WBRld, WBRd[RSRC_],
762 mycol );
763/*
764* sub( B ) := WBR (if necessary)
765*/
766 if( WBRapbX )
767 PB_Cpaxpby( TYPE, &conjg, nbb, An, one, WBR, 0, 0, WBRd, ROW,
768 zero, Bptr, 0, 0, DBUFB, &Broc );
769 }
770
771 if( WBCfr ) free( WBC );
772 if( WBRfr ) free( WBR );
773/*
774* Go to the next contiguous panel if any residing in this process row or column
775*/
776 BnpR -= nbb;
777
778 if( BisR || ( BmyprocR == BcurrocR ) ) BiiR += nbb;
779 }
780/*
781* Go to next or previous process row or column owning some of sub( B )
782*/
783 if( !( BisR ) )
784 p = ( Bfwd ? MModAdd1( p, BnprocsR ) : MModSub1( p, BnprocsR ) );
785 }
786
787 if( TranOp == CCOTRAN ) free( talpha );
788/*
789* End of PB_CptrsmB
790*/
791}
#define Int
Definition Bconfig.h:22
F_VOID_FCT(* GEMM_T)()
Definition pblas.h:317
void(* GSUM2D_T)()
Definition pblas.h:286
#define C2F_CHAR(a)
Definition pblas.h:125
F_VOID_FCT(* TZPAD_T)()
Definition pblas.h:292
#define CCOLUMN
Definition PBblacs.h:20
#define TOP_GET
Definition PBblacs.h:50
#define COLUMN
Definition PBblacs.h:45
#define COMBINE
Definition PBblacs.h:49
#define CROW
Definition PBblacs.h:21
#define ROW
Definition PBblacs.h:46
void Cblacs_gridinfo()
#define NOTRAN
Definition PBblas.h:44
#define CCONJG
Definition PBblas.h:21
#define ALL
Definition PBblas.h:50
#define CLEFT
Definition PBblas.h:29
#define TRAN
Definition PBblas.h:46
#define CNOCONJG
Definition PBblas.h:19
#define NOCONJG
Definition PBblas.h:45
#define CUPPER
Definition PBblas.h:26
#define CNOTRAN
Definition PBblas.h:18
#define CCOTRAN
Definition PBblas.h:22
#define INIT
Definition PBblas.h:61
#define CFORWARD
Definition PBblas.h:38
#define pilaenv_
Definition PBpblas.h:44
#define CTXT_
Definition PBtools.h:38
Int PB_Cfirstnb()
#define MB_
Definition PBtools.h:43
char * PB_Cmalloc()
void PB_Cinfog2l()
void PB_CptrsmB()
#define MModSub(I1, I2, d)
Definition PBtools.h:102
#define MIN(a_, b_)
Definition PBtools.h:76
#define Mptr(a_, i_, j_, lda_, siz_)
Definition PBtools.h:132
#define LLD_
Definition PBtools.h:47
Int PB_Cnumroc()
char * PB_Ctop()
void PB_CInOutV2()
#define RSRC_
Definition PBtools.h:45
void PB_Cdescset()
void PB_COutV()
#define MModAdd1(I, d)
Definition PBtools.h:100
#define MModAdd(I1, I2, d)
Definition PBtools.h:97
#define INB_
Definition PBtools.h:42
#define MModSub1(I, d)
Definition PBtools.h:105
void PB_Cptrsm()
#define CSRC_
Definition PBtools.h:46
Int PB_Clcm()
#define IMB_
Definition PBtools.h:41
Int PB_Cindxg2p()
#define Mupcase(C)
Definition PBtools.h:83
#define DLEN_
Definition PBtools.h:48
#define NB_
Definition PBtools.h:44
void PB_Cconjg()
void PB_Cpaxpby()
void PB_Cdescribe()
#define TYPE
Definition clamov.c:7