C      ALGORITHM 711, COLLECTED ALGORITHMS FROM ACM.
C      THIS WORK PUBLISHED IN TRANSACTIONS ON MATHEMATICAL SOFTWARE,
C      VOL. 18, NO. 4, DECEMBER, 1992, PP. 414-428.
[READ.ME]

This subdirectory contains the version of the BTN software suitable for the
Intel iPSC/2 and iPSC/860 hypercube computers (these are distributed memory
parallel computers).

THE FOLLOWING FILES ARE INCLUDED:

   read.me      - this file
   btnlib.f     - subroutines for optimization
   chklib.f     - subroutines for checking derivatives
   d1mach.f     - routine to evaluate machine constants
                  [already set up for the Intel iPSC/2]
   host1.f      - sample host program 1
   host2.f      - sample host program 2
   host3.f      - sample host program 3
   host3.d      - data file for host3.f
   host4.f      - sample host program 4
   node1.f      - sample node program 1
   node2.f      - sample node program 2
   node3.f      - sample node program 3
   node4.f      - sample node program 4
   makefile     - "make" file for parallel execution on the Intel iPSC/2
   makevec      - "make" file for parallel execution on the Intel iPSC/2
                  using vector co-processors for vector operations
   make860      - "make" file for parallel execution on the Intel i860

Main programs 1-4 are the sample programs in the guide to the BTN software.

TO RUN THE SOFTWARE ON THE IPSC/2 HYPERCUBE:

    Copy the desired host program to "host.f"
    Copy the desired node program to "node.f"
    Type "make"
    Type "getcube"
    Type "host"

WARNINGS:

1.  When run in parallel, two consecutive runs of the same program can produce
different results, since different rounding errors can occur.  This does not
indicate a problem with the program, but rather is an artifact of parallel
computing.
2.  The parallel computing software was prepared using release 3.2 of the
Unix/386 operating system on the Intel iPSC/2 hypercube computer. The
"parallel" commands and other calls to system software are not part of the
Fortran 77 language and are not standardized. They may not work correctly
under different versions of the operating system.
3.  The sample main programs declare work arrays for problems with at most 100
variables, and for computers with at most 16 processors.  The PARAMETER
statements should be adjusted, increasing the values of N and NDMX, if these
values are not appropriate.

[END OF FILE READ.ME]
=========================================================================
all: 	host   node

HOBJS =	host.o

NOBJS =	node.o btnlib.o chklib.o d1mach.o

.SUFFIXES: .f .o

host.o:	host.f
	f77 -c -vx host.f

node.o: node.f
	f77 -c -vx node.f

.f.o:
	f77 -c -vx $<

host:	$(HOBJS)
	f77 $(HOBJS) -o host -host -vec

node:	$(NOBJS)
	f77 $(NOBJS) -o node -node -vec
=========================================================================
all: 	host   node

HOBJS =	host.o

NOBJS =	node.o btnlib.o chklib.o d1mach.o

.SUFFIXES: .f .o

host.o:	host.f
	f77 -c -vx host.f

node.o: node.f
	f77 -c -vx node.f

.f.o:
	f77 -c -vx $<

host:	$(HOBJS)
	f77 $(HOBJS) -o host -host -vec

node:	$(NOBJS)
	f77 $(NOBJS) -o node -node -vec -vx
=========================================================================
all: 	host   node

HOBJS =	host.o

NOBJS =	node.o btnlib.o chklib.o d1mach.o

.SUFFIXES: .f .o

host.o:	host.f
	f77 -c host.f

node.o: node.f
	if77 -i860 -c node.f

.f.o:
	if77 -c -i860 $<

host:	$(HOBJS)
	f77 $(HOBJS) -o host -host -vec

node:	$(NOBJS)
	if77 $(NOBJS) -o node -node -lvec
=========================================================================
n        =  100    % number of variables    format (10x, i5)
ichk     =    0    % 0 to minimize, 1 to check derivatives
istart   =    1    % CHKDER: starting index
iend     =   86    % CHKDER: starting index
istep    =    5    % CHKDER: starting index
msglvl   =    1    % print level
iunit    =   10    % unit number for output
kblock   =    4    % block size (default: k = cubesize)
rnktol   = 1.d-9          % tolerance for rank test
accrcy   = 1.d-6          % convergence tolerance
c********************************************************************
c BTN: parallel software for unconstrained optimization
c last changed: 09/30/91
c********************************************************************
c
      subroutine btnez (n, x, f, g, w, lw, iw, sfun, iflag)
c
c parallel block truncated-Newton routine for unconstrained optimization
c
c Parameters
c n        -> integer, number of variables
c x       <-> double precision, size n, nonlinear variables (initial guess
c                 and solution)
c f       <-  double precision, nonlinear function value (output)
c g       <-  double precision, size n, gradient (output)
c w        -  double precision, size lw, work space; currently must be of
c                 length at least 10n + 9k + 4k^2 + 2(n/k + 1),
c                 where k = # of processors on hypercube
c lw       -> integer, length of w
c iw       -  integer, size 3k, array for storing index information for
c                 work arrays (k = # of processors on hypercube)
c sfun     -  subroutine to evaluation nonlinear function:
c                 subroutine sfun (n, x, f, g, active, kblock)
c             Routine sfun must be declared EXTERNAL in the calling program.
c             The parameters n, x, f, and g are as above; x and n must
c             not be modified by routine sfun; sfun must return the
c             nonlinear function value in f and the gradient vector in g.
c             The logical variable active indicates if the processor is
c             working (active = .true.) or idle (active = .false.).  When
c             BTN is used via BTNEZ, then routine sfun can begin with the line
c                     if (.not. active) return
c             since in this case idle processors are ignored. [For more
c             detailed information, see routine BTN.] Parameter kblock is the
c             current block size (this parameter can be ignored when using
c             BTNEZ).
c iflag   <-  integer, error code:
c                 0 => normal return: successful termination
c                 1 => linesearch failed to find lower point
c                 2 => search direction points uphill
c                 3 => function appears to be unbounded below
c                 4 => more than maxfun evaluations of f(x) in linesearch
c                 5 => not converged after nlmax iterations
c                 6 => n <= 0
c               900 => insufficient work space provided
c               999 => disaster (see file btnout.iunit for message)
c             See the user manual for more information about the meaning
c             of these error codes.
c
      implicit         double precision (a-h,o-z)
      double precision x(*), g(*), w(*)
      integer          iw(*), pid
      external         sfun
c
c set parameters for BTN
c
      call btnpar (nodemx, kmax, maxit, msglvl, pid, iprec, nlmax,
     *   initv, tolq, iunit, rnktol, maxfun, accrcy, stepmx, eta,
     *   ireset, indstp)
c
c solve optimization problem
c
      call btn (n, x, f, g, w, lw, iw, sfun, iflag,
     *          nodemx, kmax, maxit, msglvl, pid, iprec, nlmax,
     *          initv, tolq, iunit, rnktol, maxfun, accrcy,
     *          stepmx, eta, ireset, indstp, sfun)
c
      return
      end
c
c
      subroutine btnpar (nodemx, kmax, maxit, msglvl, pid, iprec,
     *    nlmax, initv, tolq, iunit, rnktol, maxfun, accrcy, stepmx,
     *    eta, ireset, indstp)
      implicit double precision (a-h,o-z)
      integer  pid
c
c Sets parameters for BTN
c
c Parameters:
c
c nodemx   integer, size of hypercube
c kmax     integer, block size
c          default:  kmax = numnodes()
c maxit    integer, maximum number of inner iterations allowed
c          default:  maxit = 20
c msglvl   integer,  amount of printing desired
c          (<-1: none,
c            -1: warning messages only
c             0: one line per outer iteration).
c          default:  msglvl = 0
c pid      integer, process i.d. number (used for message passing).
c          Unless multiple processes are being run simultaneously, pid
c          can be set to 0.
c          default:  pid = 0
c iprec    integer, type of preconditioner
c          (0 = none, 1 = BFGS, 2 = approx. BFGS)
c          default:  iprec = 2.
c nlmax    integer, maximum number of outer iterations allowed
c          default:  nlmax = 500
c initv    integer, type of initialization
c           (1 = random,2 = rhs + random, 3 = limited memory)
c          default:  initv = 3
c tolq     double precision, tolerance for quadratic-based test
c          default:  tolq =  5.d-1
c iunit    integer, output unit # for printing from this processor
c          (if this unit has not already been opened, messages go to
c          file btnout.<iunit>)
c rnktol   double precision, tolerance for rank test in inner iteration
c          default:  rnktol = 1.d-9]
c maxfun   integer, maximum number of function evaluations allowed in
c          the linesearch.
c          default:  maxfun = 2*nlmax
c accrcy   double precision, user-chosen convergence tolerance (stop the
c          algorithm if the infinity-norm of the gradient is  <=
c          accrcy*(1.d0 + |f(x)|).
c          default:  accrcy = 0.d0
c stepmx   double precision, maximum step allowed in line search
c          default:  stepmx = 1.d3
c eta      double precision, accuracy parameter for line search (must be
c          strictly between 0 and 1, and it is recommended that
c          it be chosen between .1 and .9)
c          default:  eta = 0.2
c ireset   integer, reset preconditioner after ireset iterations.
c          If ireset <= 0, then no resetting is done.
c          default:  ireset = 0 (no resetting)
c indstp   integer, indicates if the user wishes to control the
c          maximum step size in the line search with a user provided
c          subroutine MAXSTP.  If indstp=0, BTN automatically determines
c          the maximum step.  If indstp<>0, then subroutine MAXSTP must
c          be provided.
c          default:  indstp = 0 (automatic setting of step size)
c
      nodemx  = numnodes()
      kmax    = numnodes()
      maxit   = 20
      msglvl  = 0
      pid     = 0
      iprec   = 2
      nlmax   = 500
      initv   = 3
      tolq    = 5.0d-1
      iunit   = 10
      rnktol  = 1.0d-9
      maxfun  = 2*nlmax
      accrcy  = 0.d0
      stepmx  = 1.d3
      eta     = 0.2d0
      ireset  = 0
      indstp  = 0
c
      return
      end
c
c
      subroutine btn (n, x, f, g, w, lw, rowinf, sfun, iflag,
     *            nodemx, kmax, maxit, msglvl, pid, iprec, nlmax,
     *            initv, tolq, iunit, rnktol, maxfun, accrcy,
     *            stepmx, eta, ireset, indstp, maxstp)
c
c parallel block truncated-Newton routine
c
c Parameters
c n        -> integer, number of variables
c x        -> double precision, size n, nonlinear variables (initial guess
c                  and solution)
c f       <-  double precision, nonlinear function value (output)
c g       <-  double precision, size n, gradient (output)
c w        -  double precision, size lw, work space; currently must be of
c                 length at least 10n + 9kmax + 4kmax^2 + 2(n/kmax + 1);
c lw       -> integer, length of w
c rowinf   -  integer, size nodemx*3, array for storing index
c                 information for work arrays
c sfun     -  subroutine to evaluation nonlinear function:
c                 subroutine sfun (n, x, f, g, active, kblock)
c             Routine sfun must be declared EXTERNAL in the calling program.
c             The parameters n, x, f, and g are as above; x and n must
c             not be modified by routine sfun; sfun must return the
c             nonlinear function value in f and the gradient vector in g.
c             The logical variable active indicates if the processor is
c             working (active = .true.) or idle (active = .false.).  Under
c             normal circumstances (i.e., if each function evaluation is
c             performed using only one processor) then routine sfun can
c             begin with the line
c                     if (.not. active) return
c             since in this case idle processors are ignored.
c
c             If each function evaluation is being performed in parallel by
c             several processors, then active, kblock (the current block size),
c             and kmax can be used to determine what each processor should
c             do.  In this case, processors 0, 1, ..., (kblock-1) will call
c             sfun with active=.true., and all other processors will use
c             active=.false.  An idle processor # (id) will call sfun
c             at the same time as processor # (mod(id,kmax)).  Only the
c             active processor will be passed the vector x, and only
c             this processor need return the values of f and g.  An example
c             illustrating this idea is provided in the user manual.
c iflag   <-  integer, error code:
c                 0 => ok
c                 1 => linesearch failed to find lower point
c                 2 => search direction points uphill
c                 3 => function appears to be unbounded below
c                 4 => more than maxfun evaluations of f(x)
c                 5 => not converged after nlmax iterations
c                 6 => n <= 0
c               900 => insufficient work space provided
c               999 => disaster (see file btnout.iunit for message)
c            See the user manual for more information about the meaning
c            of these error codes.
c nodemx   -> integer, size of hypercube
c kmax     -> integer, block size [usually equal to nodemx]
c maxit    -> integer, maximum number of inner iterations allowed
c                 [typical value: 20]
c msglvl   -> integer, amount of printing desired (<0 = none;
c                 0 = 1 line per outer iteration)
c pid      -> integer, process id number [typical value: 0]
c iprec    -> integer, type of preconditioner (0 = none, 1 = BFGS,
c                 2 = approx. BFGS) [it is strongly suggested that
c                 iprec = 2 be used]
c nlmax    -> integer, maximum number of outer iterations allowed
c                 [typical value: 500]
c initv    -> integer, type of initialization (1 = random,
c                 (2 = rhs + random, 3 = limited memory) [it is
c                 suggested that initv = 3 be used]
c tolq     -> double precision, tolerance for quadratic-based test
c                 [typical value: 5.d-1]
c iunit    -> integer, output unit # for printing from this processor
c                 (if this unit has not already been opened, output
c                 will go to file btnout.<iunit>); iunit must be
c                 less than 1000
c rnktol   -> double precision, tolerance for rank test in inner iteration
c                 [typical value: 1.d-9]
c maxfun   -> integer, maximum number of function evaluations allowed in
c                 the linesearch [typical value: 2*nlmax]
c accrcy   -> double precision, user-chosen convergence tolerance (stop the
c                 algorithm if the infinity-norm of the gradient is  <=
c                 accrcy*(1.d0 + |f(x)|).  Under normal circumstances,
c                 the user should set accrcy=0.0 and let the stringent
c                 convergence test built-in to BTN be used.  If the function
c                 f(x) or its gradient cannot be obtained accurately, then
c                 the user may wish to override this test by setting
c                 accrcy = 1.d-6 (say), or perhaps some slightly larger value.
c                 [typical value: 0.d0]
c stepmx   -> double precision, maximum step allowed in line search
c                 [typical value: 1.d3]
c eta      -> double precision, accuracy parameter for line search (must be
c                 strictly between 0 and 1, and it is recommended that
c                 it be chosen between .1 and .9) [typical value: 0.2]
c ireset   -> integer, reset preconditioner after ireset iterations; if
c                 no resetting is desired, set ireset=0; normally it
c                 will not be necessary to reset the preconditioner based
c                 on the iteration count, but it can occasionally improve
c                 performance on certain specially structured problems; see
c                 the user manual for further guidance.
c                 [recommended value: 0]
c indstp   -> integer, indicates if the user wishes to select the maximum
c                 step in the line search manually at every iteration.
c                 See sample main program 4 for an example.
c                 If indstp=0, then BTN sets the maximum step automatically.
c                 If indstp<>0, then the user-supplied routine MAXSTP is
c                 called (see below).
c                 [typical value: 0]
c maxstp   -  (optional) subroutine to set the maximum step for the line
c                 search.  It must have the following calling sequence:
c                     subroutine maxstp (n, x, d, stepmx)
c                 where n and x are as for subroutine sfun, d is the current
c                 search direction (the new vector of parameters will be of
c                 the form x+a*d for some scalar a), and stepmx is the largest
c                 allowable step (computed by the user within subroutine
c                 maxstp).  The values of n, x, and d must not be modified by
c                 the user, and stepmx must be defined by the user.  Subroutine
c                 maxstp must be declared EXTERNAL in the calling program.
c                 NOTE: most applications will not require this feature, and
c                 should set indstp=0 when calling BTN (see above); when
c                 indstp=0, BTN will not attempt to call maxstp, and no
c                 subroutine need be provided.  In this case, the call to BTN
c                 can have the form:  call btn (..., indstp, sfun), where sfun
c                 is the name of the function evaluation routine described
c                 above.  See the sample main programs for further examples.
c                 [typical value: routine maxstp not provided]
c
      implicit         double precision (a-h,o-z)
      double precision x(*), g(*), w(*)
      integer          rowinf(nodemx,3), pid, idpsze, type0,
     *                 flgbcg, cubesz
      logical          active, msg, idzero, inuse
      character        outfle*10
      external         sfun
c
c get cube information
c
      cubesz   = numnodes()
      id       = mynode ()
c
c check for inappropriate parameter settings
c
      if (maxit .lt. 0) then
          if (id .eq. 0 .and. msglvl .ge. -1) then
              write (*,*) ' BTN - Warning: maxit < 0'
              write (*,*) '       maxit = ', maxit
              write (*,*) '       Resetting maxit = 20'
          end if
          maxit = 20
      end if
      if (iprec .lt. 0 .or. iprec .gt. 2) then
          if (id .eq. 0 .and. msglvl .ge. -1) then
              write (*,*) ' BTN - Warning: iprec out of range'
              write (*,*) '       iprec = ', iprec
              write (*,*) '       Resetting iprec = 2'
          end if
          iprec = 2
      end if
      if (initv .lt. 1 .or. initv .gt. 3) then
          if (id .eq. 0 .and. msglvl .ge. -1) then
              write (*,*) ' BTN - Warning: initv out of range'
              write (*,*) '       initv = ', initv
              write (*,*) '       Resetting initv = 3'
          end if
          initv = 3
      end if
      if (tolq .le. 0.d0 .or. tolq .ge. 1.d0) then
          if (id .eq. 0 .and. msglvl .ge. -1) then
              write (*,*) ' BTN - Warning: tolq out of range'
              write (*,*) '       tolq = ', tolq
              write (*,*) '       Resetting tolq = 0.5'
          end if
          tolq = 5.d-1
      end if
      if (rnktol .lt. 0.d0) then
          if (id .eq. 0 .and. msglvl .ge. -1) then
              write (*,*) ' BTN - Warning: rnktol out of range'
              write (*,*) '       rnktol = ', rnktol
              write (*,*) '       Resetting rnktol = 1.d-9'
          end if
          rnktol = 1.d-9
      end if
      if (eta .le. 0.d0 .or. eta .ge. 1.d0) then
          if (id .eq. 0 .and. msglvl .ge. -1) then
              write (*,*) ' BTN - Warning: eta out of range'
              write (*,*) '       eta = ', eta
              write (*,*) '       Resetting eta = 0.2'
          end if
          eta = 2.d-1
      end if
c
c set up
c
      inquire (iunit, opened = inuse)
      if (.not. inuse) then
          if (iunit .gt. 99) then
              write (outfle,830) iunit
          else if (iunit .gt. 9) then
              write (outfle,840) iunit
          else
              write (outfle,850) iunit
          end if
          open (iunit, file = outfle, status = 'unknown')
      end if
      if (n .le. 0) then
          iflag = 6
          go to 500
      end if
      mmax     = n/cubesz + 1
      maxit1   = maxit
      idpsze   = 8
      idzero   = id .eq. 0
c
c compare n with blocksize
c
      if (n .lt. kmax) then
          if (id .eq. 0 .and. msglvl .ge. -1) then
              write (*,*) ' BTN - Warning: blocksize too small'
              write (*,*) '       kmax = ', kmax
              write (*,*) '       n    = ', n
              write (*,*) '       Setting kmax = n'
          end if
          kmax = n
      end if
      itsave   = 0
      type0    = 11002
      kxk      = kmax * kmax
      k        = kmax
      nk       = n + k
      nf       = 1
c
c subdivide the work array
c
      lp      = 1
      lpre    = lp     + n
      ivinit  = lpre   + mmax
      iv0     = ivinit + nk
      iv1     = iv0    + nk
      ir      = iv1    + nk
      iar     = ir     + nk
      ip      = iar    + nk
      ialpha  = ip     + nk
      ibeta   = ialpha + kxk
      il1     = ibeta  + kxk
      il2     = il1    + kxk
      id1     = il2    + kxk
      id2     = id1    + kmax
      is      = id2    + kmax
      iwk     = is     + kmax
      iwk1    = iwk    + n
      iwk2    = iwk1   + n
      lprenw  = iwk2   + n
      lwtest  = lprenw + mmax - 1
      if (lw .lt. lwtest) then
          call disast ('BTN', 'INSUFFICIENT STORAGE', iunit, iflag)
          write (iunit,*) ' # of locations required = ', lwtest
          iflag = 900
          go to 500
      end if
c
      msg     = msglvl .ge.  0
      ncg     = 0
      iter    = 0
      iflag   = 0
      flgbcg  = 0
      intflg  = 0
c
c determine the columns under control of each node
c    m defines the number of rows handled by each processor
c    myrows is the starting location of rows handled.
c    rowinf(i,1) and rowinf(i,2) define myrows and m for processor i
c
      call getmr (n, cubesz, rowinf, nodemx, 1, n, 1)
      myrows = rowinf(id+1,1)
      m      = rowinf(id+1,2)
      mm     = m
c
c initial printouts
c
      if (id .lt. kmax) then
          active = .true.
      else
          active = .false.
      end if
      call sfun (n, x, f, g, active, kmax)
      fsave = abs(f)
      if (kmax .lt. cubesz) then
          itest = cubesz - kmax - 1
          if (id .le. itest) then
              idtmp = id + kmax
              call csend (type0  , g, n*idpsze, idtmp, pid)
              call csend (type0+1, f,   idpsze, idtmp, pid)
          else if (id .ge. kmax) then
              call crecv (type0  , g, n*idpsze)
              call crecv (type0+1, f,   idpsze)
          end if
          type0 = type0 + 2
      end if
      gnrm = pdnrmi (m, g(myrows), 1,
     *               cubesz, id, type0, rowinf(1,3), pid)
      if (idzero .and. msg) then
          write (*,800)
          write (*,810) iter, nf, ncg, f, gnrm
      end if
c
c set tolerances for convergence tests
c
      call settol (tol0,tol1,tol2,tol3,tol4,id)
c
c convergence test at initial point
c
      ftest = 1.d0 + dabs(f)
      if (gnrm .lt. tol0*ftest) go to 500
      type0   = 10003
c
c initialize preconditioner to the identity
c
      if (iprec .gt. 0) call dfill (m, 1.d0, w(lprenw), 1)
c
c  main loop
c
      do 10 iter = 1,nlmax
c
c  set the preconditioner
c
          itsave = itsave + 1
          if (iprec .eq. 0) then
              call dfill (m, 1.d0, w(lpre), 1)
          else
              call dcopy (m, w(lprenw), 1, w(lpre), 1)
          end if
c
c  get search direction
c  direction resides on w(lp) ... w(lp+n-1)
c
          call precg (w(iv1), m, m, myrows, g, kmax, iter,
     *        initv, flgbcg)
          call bcg (n, w(lp), g, gnrm, k, kmax, flgbcg,
     *        maxit1, x, nbl, w(lpre), w(lprenw), pid,
     *        tolq, mm, kmax, w(iv0),  w(iv1),  w(ir),
     *        w(iar), w(ip), w(ialpha),  w(ibeta),  w(il1),
     *        w(il2), w(id1),  w(id2),  w(is), w(iwk), w(iwk1),
     *        w(iwk2), rowinf, nodemx, myrows, m, iprec,
     *        rowinf(1,3), sfun, type0, iunit, rnktol)
          k = kmax
          if (iflag .eq. 999) go to 500
          call dneg (m, w(lp+myrows-1), 1)
          dg = pddot (m, w(lp+myrows-1), 1, g(myrows), 1,
     *        cubesz, id, type0, rowinf(1,3), pid)
          ncg = ncg + nbl
          if (initv .eq. 3 .and.
     *        flgbcg .gt. 0 .and. intflg .eq. flgbcg)
     *        initv = 2
          call postcg (w(iv1), m, n, m, myrows, k, iter, initv,
     *        w(ivinit), m, w(lp), g, gnrm, flgbcg, id, cubesz,
     *        rowinf(1,3), type0, pid)
          if (flgbcg .ge. 0) intflg = flgbcg
c
c collect the search direction on all nodes (even those not
c involved in computing the search direction)
c
          type0 = 10001
          call xgcol (w(lp+myrows-1), m*idpsze, w(lp), n*idpsze, ncnt,
     *            cubesz, m, id, type0, n, rowinf(1,3), pid)
c
c parallel line search
c
          oldf = f
          if (indstp .ne. 0) call maxstp (n, x, w(lp), stepmx)
          call psrch (iflag, n, x, f, g, w(lp), w(iwk), w(iwk1),
     *               w(iwk2), sfun, nf, eta, stepmx, id, pid, kmax,
     *               cubesz, rowinf(1,3), type0, dg, alpha)
          if (iflag .lt. 0) then
              maxit1 = 1 + 3/kmax
          else
              maxit1 = maxit
          end if
          if (iflag .lt. 0) iflag = 0
          gnrm = pdnrmi (m, g(myrows), 1,
     *              cubesz, id, type0, rowinf(1,3), pid)
          if (idzero .and. msg) then
              write (*,810) iter, nf, ncg, f, gnrm
          end if
          if (iflag .ne. 0) go to 500
c
c test for convergence
c
          diff  = abs(oldf - f)
          ftest = 1.d0 + dabs(f)
          xnrm = pdnrmi (m, x, 1,
     *              cubesz, id, type0, rowinf(1,3), pid)
          xtest = 1.d0 + xnrm
          pnrm = pdnrmi (m, w(lp), 1,
     *              cubesz, id, type0, rowinf(1,3), pid)
c
          if (alpha*pnrm  .lt.   tol1*xtest
     *        .and. diff  .lt.   tol2*ftest
     *        .and. gnrm  .lt.   tol3*ftest) go to 500
          if (      gnrm  .lt.   tol4*ftest) go to 500
          if (      gnrm  .lt. accrcy*ftest) go to 500
          if (nf .gt. maxfun) then
              iflag = 4
              go to 500
          end if
          ftest = abs(f)
          if (ftest .le. fsave*1.d-5 .or. itsave .eq. ireset) then
              if (iprec .gt. 0) call dfill (n, 1.d0, w(lprenw), 1)
              fsave = ftest
              itsave = 0
          end if
10    continue
      iflag = 5
c
500   if (idzero .and. msg) write (*,820) iflag
      if (.not. inuse) close (iunit)
      return
800   format (/, 3x, 'it', 3x, 'nf', 2x, 'ncg', 11x, 'f', 14x, '|g|')
810   format (' ', i4, 1x, i4, 1x, i4, 2x, 1pd16.8, 2x, 1pd12.4)
820   format (' BTN terminating with error code = ', i4)
830   format ('btnout.',   i3)
840   format ('btnout.0',  i2)
850   format ('btnout.00', i1)
      end
c
c
      subroutine settol (tol0,tol1,tol2,tol3,tol4,id)
      implicit double precision (a-h,o-z)
c
c set tolerances for convergence tests
c
      eps    = d1mach(3)
      if (1.d0 + 2.d0*eps .eq. 1.d0) then
          if (id .eq. 0) then
              write (*,*) ' ### BTN: ERROR ###'
              write (*,*) '     Machine epsilon too small'
              write (*,*) '     Value = ', eps
              write (*,*) '     Modify routine D1MACH'
              write (*,*) '     Terminating execution'
          end if
          stop
      end if
      rteps  = sqrt(eps)
      rtol   = 1.d1*rteps
      rtolsq = rtol*rtol
      peps   = eps**0.6666d0
c
      tol0 = 1.d-2  * rteps
      tol1 = rtol   + rteps
      tol2 = rtolsq + eps
      tol3 = sqrt(peps)
      tol4 = 1.d-2*rtol
c
      return
      end
c
c
      subroutine bcg (n, x, b, bnrm, k, kmax, iflag, maxit,
     *                xnl, ncg, PC, PCnew, pid,
     *                tolq, mm, kk, V0, V1, R, AR, U,
     *                alpha, beta, L1, L2, D1, D2, s,
     *                wk, wk1, wk2, rowprc, nodemx, myrows, m,
     *                iprec, nodenm, sfun, type0, iunit, rnktol)
c
c Parallel block conjugate-gradient iteration (to compute a
c search direction for a truncated-Newton optimization algorithm)
c
c PARAMETERS
c n          -> integer, dimension of problem
c x         <-  double precision, size n, search direction
c b          -> double precision, size n, right-hand side vector
c bnrm       -> double precision, infinity-norm of right-hand side
c k         <-> integer, current block size
c kmax       -> integer, maximum block size
c iflag     <-  integer, flag:
c                       0 => okay
c                      -1 => steepest-descent direction
c                       i => linear depend. in col. i at iteration 1
c                     999 => disaster (message to unit iunit)
c maxit      -> integer, maximum number of inner iterations allowed
c                     (if maxit<0, then nonlinearity test is also used)
c xnl        -> double precision, size n, vector of nonlinear parameters
c ncg       <-  integer, counter for number of inner iterations
c PC        <-> double precision, size m, current and new preconditioner
c PCnew      -  double precision, size m, work array (new PC)
c pid        -> integer, process i.d. number
c tolq       -> double precision, tolerance for quadratic test
c mm         -> integer, leading dimension of m*k matrices
c kk         -> integer, leading dimension of k*k matrices
c V0         -  double precision, size m*k, work array (old Lanczos vectors)
c V1         -> double precision, size m*k, work array (current Lanczos vectors)
c R          -  double precision, size m*k, work array (preconditioned V1)
c AR         -  double precision, size m*k, work array (Hessian times R)
c U          -  double precision, size m*k, work array (inner search directions)
c alpha      -  double precision, size k*k, work array (diagonal block of V'AV)
c beta       -  double precision, size k*k, work array (off-diag block of V'AV)
c L1         -  double precision, size k*k, work array (Cholesky factor of V'AV)
c L2         -  double precision, size k*k, work array (Cholesky factor of V'AV)
c D1         -  double precision, size   k, work array (Cholesky factor of V'AV)
c D2         -  double precision, size   k, work array (Cholesky factor of V'AV)
c s          -  double precision, size   k, work array (inner step sizes)
c wk         -  double precision, size   n, work array
c wk1        -  double precision, size   n, work array
c wk2        -  double precision, size   n, work array
c rowprc     -> integer, size kmax*2, information on processor allocation
c nodemx     -> integer, maximum number of processors available
c myrows     -> integer, index of first row on this processor
c m          -> integer, number of rows on this processor
c iprec      -> integer, choice of preconditioner:
c                       0 => no preconditioner
c                       1 => LMQN diagonal PC
c                       2 => approximate LMQN diagonal PC
c nodenm     -  integer, size kmax, array used by lower-level routines
c sfun       -> external, subroutine sfun (n,x,f,g,a,k), evaluate f(x)
c type0     <-> integer, type number for message passing
c iunit      -> integer, output unit # for printing
c               (see file btnout.iunit for output)
c rnktol     -> double precision, tolerance for rank test in inner iteration
c
      implicit         double precision (a-h,o-z)
      parameter        (idpsze = 8)
      double precision b(n), xnl(n), x(n), PC(m), PCnew(m),
     *                 V0(mm,kk), V1(mm,kk), R(mm,kk), AR(mm,kk),
     *                 U(mm,kk), alpha(kk,kk), beta(kk,kk),
     *                 L1(kk,kk), L2(kk,kk), D1(kk), D2(kk), s(kk),
     *                 wk(n), wk1(n), wk2(n)
      integer          nodenm(*), rowprc(nodemx,2),
     *                 type0, kmax, pid, cubesz
      logical          active, indef
      external         sfun
c
c set up
c
      cubesz = numnodes ()
      id     = mynode ()
      kold   = k
      iflag  = 0
      info   = 0
      iend   = 0
      indef  = .false.
      xAx    = 0.d0
      nblock = n/k
      if (nblock*k .lt. n) nblock = nblock + 1
c
c**********************************************************************
c Initialization
c**********************************************************************
c
      call dfill (m, 0.d0, x(myrows), 1)
      call dfill (k, 0.d0, s, 1)
      if (bnrm .eq. 0.d0) then
          call disast ('BCG', 'bnrm = 0', iunit, iflag)
          go to 500
      end if
      call dcopy (m, b(myrows), 1, V1, 1)
      call dscal (m, 1.d0/bnrm, V1, 1)
c
c**********************************************************************
c Main loop
c**********************************************************************
c
      maxnbl = nblock
      if (maxit  .eq.     0) maxit  = 1
      if (maxnbl .gt. maxit) maxnbl = maxit
c
      do 100 nbl = 1, maxnbl
          ncg  = nbl
          kold = k
c
c**********************************************************************
c Get QR of V1
c**********************************************************************
c
          call msolve (V1, mm, m, k, R, mm, PC)
          call pgetch (V1, mm, R, mm, beta, kk, m, cubesz, kold, id,
     *           L1, kk, k, type0, nodenm, iunit, iflag, rnktol, pid)
          if (iflag .eq. 999) return
          if (nbl .eq. 1) then
              if (k .eq. 0) then
                  call disast ('BCG', 'K=0 AT ITER=1', iunit, iflag)
                  iflag = -1
                  call dcopy  (m, b(myrows), 1, x(myrows), 1)
                  return
              else
                  if (k. lt. kold) iflag = k+1
                  kold = k
              endif
          else
              if (k .eq. 0) go to 500
          endif
c
c Compute V1 x (Beta')^(-1)
c
          call vltinv (V1, mm, m, k, V1, mm, beta, kk, iunit, iflag)
          if (iflag .eq. 999) return
          call msolve (V1, mm, m, k, R, mm, PC)
          if (nbl .eq. 1) then
              s(1) = pddot (m, b(myrows), 1, R(1,1), 1,
     *            cubesz, id, type0, nodenm, pid)
              s(1) = s(1) / bnrm
          end if
c
c**********************************************************************
c Compute AR, alpha
c**********************************************************************
c
          call getar(R, mm, k, AR, mm, cubesz, m, myrows, rowprc,
     *           nodemx, wk, wk1, wk2, type0, id, pid, n, xnl, b,
     *           sfun)
          call AtrBA (R, mm, k, AR, mm, alpha, kk, L1, kk,
     *           id, m, cubesz, type0, nodenm, pid)
c
c update diagonal preconditioner (if desired)
c
          if (iprec .eq. 1) then
              call frmpc (PCnew, alpha, R, AR, mm, m, kk, k,
     *            id, iunit, cubesz, wk, wk1, nodenm, type0, pid)
          endif
          if (iprec .eq. 2) then
              call frmpc1 (PCnew, alpha, R, AR, mm, m, kk, k,
     *            id, iunit, cubesz, wk, wk1, nodenm, type0, pid)
          endif
c
c**********************************************************************
c Cholesky for L1
c set D1 = D2 (kold * kold), then solve for L1 (kold * k)
c L2 * D1 * L1 = beta, where L2, D1 are of dimension kold
c**********************************************************************
c
          if (nbl .gt. 1) then
              call dcopy (kold, D2, 1, D1, 1)
              do 30 i = 1,k
                  call lsol (L2(i,i), kk, kold-i+1, beta(i,i),
     *                  L1(i,i), iunit, iflag)
                  if (iflag .eq. 999) return
                  call dvdiv (kold-i+1,L1(i,i),1,D1(i),1,L1(i,i),1)
30            continue
          end if
c
c**********************************************************************
c Cholesky for alpha
c Cholesky factorization of block tridiagonal matrix.
c In the following L2 will be the factor of the diagonal block.
c All processors compute the Cholesky simultaneously.
c**********************************************************************
c
          call matcpy (alpha, kk, L2, kk, k, k)
          if (nbl .gt. 1) then
              do 50 i = 1,k
              do 50 j = 1,i
                  do 40 l = i,kold
                      L2(i,j) = L2(i,j) - L1(l,i) * D1(l) * L1(l,j)
40                continue
                  if (j .lt. i) L2(j,i) = L2 (i,j)
50            continue
          end if
          call dpofa2 (L2, kk, k, info, id, iunit, iflag, rnktol)
          if (iflag .eq. 999) return
          if (info .ne. 0) then
              indef = .true.
              k = abs(info) - 1
              if (nbl .eq. 1 .and. k .eq. 0) then
                  call dcopy (m, b(myrows), 1, x(myrows), 1)
                  iflag = -1
                  return
              endif
              if (nbl .gt. 1 .and. k .eq. 0) go to 500
          endif
c
c get L2 and D2
c
          call getld2 (L2, kk, k, D2, iunit, iflag)
          if (iflag .eq. 999) return
c
c**********************************************************************
c Search direction U
c**********************************************************************
c compute  V1 - U L1
c     V1 (n * k), U (n * kold), L1 (kold * k) lower triangular
c     V1 - U L1 will be stored in R
c     U*L1 is stored in U (=U)
c
          if (nbl .gt. 1) then
              call timesl (U, mm, m, kold, L1, kk, k)
              call aminsb (R, mm, U, mm, R, mm, m, k)
          end if
c
c solve for U:
c    U * L2' = V1
c    V1 is n * k
c    L2 is k * k lower triangular
c
          call vltinv (R, mm, m, k, U, mm, L2, kk, iunit, iflag)
          if (iflag .eq. 999) return
c
c**********************************************************************
c Step size
c**********************************************************************
c
          if (nbl .eq. 1) then
              call lsol  (L2, kk, k, s, s, iunit, iflag)
              if (iflag .eq. 999) return
              call dvdiv (k, s, 1, D2, 1, s, 1)
          else
              call dvmul  (kold, D1, 1, s, 1, s, 1)
              call premlt (s, mm, kold, 1, L1, kk, k, wk2, n)
              call lsol   (L2, kk, k, wk2, s, iunit, iflag)
              if (iflag .eq. 999) return
              call dvdiv  (k, s, 1, D2, 1, s, 1)
              call dscal  (k, -1.d0, s, 1)
          end if
c
c**********************************************************************
c Update x
c**********************************************************************
c
          call maxpy (x(myrows), n, m, 1, 1.d0, U, mm, k, s, kk)
c
c**********************************************************************
c compute termination criterion using quadratic test
c**********************************************************************
c
          if (indef) go to 500
          if (nbl .eq. maxnbl) go to 500
          if (nbl .gt. 1 .and. k .eq. 0) go to 500
c
c quadratic test
c
          call dvmul  (k, D2, 1, s, 1, wk2, 1)
          xAx  = xAx + ddot(k, s, 1, wk2, 1)
          bx   = pddot (m, b(myrows), 1, x(myrows), 1, cubesz, id,
     *                type0, nodenm, pid)
          qnew = xAx * 0.5d0 - bx
          if (nbl .gt. 1) then
              diff = qnew - qold
              if (qnew .eq. 0.d0) then
                  call disast ('BCG','Q(P) = 0',iunit, iflag)
                  go to 500
              end if
              if ((nbl * diff / qnew) .lt. tolq) iend = 1
          end if
          qold = qnew
c
          if (iend .eq. 1) go to 500
c
c**********************************************************************
c Update for next iteration:  form new V
c**********************************************************************
c AR:= AR - V0 * beta; V0 (n * kold), beta (kold * k)
c
          if (nbl .gt. 1) then
              call timesl (V0, mm, m, kold, beta, kk, k)
              call aminsb (AR, mm, V0, mm, AR, mm, m, k)
          end if
c
c V1:= AR - V1 * alpha; update V0
c
          call matcpy (V1, mm,  V0, mm,  m, k)
          call atimsb (V0, mm, m, k, alpha, kk, k, V1, mm)
          call aminsb (AR, mm, V1, mm, V1, mm, m, k)
c
100   continue
c
500   call dscal (m, bnrm, x(myrows), 1)
      return
      end
c
c
      subroutine pgetch (V, ldv, Av, ldav, Beta, ldb, m, kmax, k,
     *             id, buf, ldbf, kr, type0, nodenm, iunit,
     *             iflag, rnktol, pid)
c
c Cholesky factorization of V'A V, where
c
c PARAMETERS
c V         -> double precision, size m*k, input matrix
c ldv       -> integer, leading dimension of V
c Av        -> double precision, size m*k, input matrix A*V
c ldav      -> integer, leading dimension of AV
c Beta     <-  double precision, size k*k, Cholesky factor
c ldb       -> integer, leading dimension of Beta
c m         -> integer, number of rows on this processor
c kmax      -> integer, number of active processors
c k         -> integer, block size (current)
c id        -> integer, i.d. number of this processor
c buf       -  double precision, size k*k, work matrix
c ldbf      -> integer, leading dimension of buf
c kr       <-  integer, rank of Beta (new block size)
c type0    <-> integer, message type for message passing
c nodenm    -  integer, size p, array for lower-level routines
c iunit     -> integer, output unit # for printing
c iflag    <-  integer, flag (=999 in case of disaster)
c rnktol    -> double precision, tolerance for rank test in inner iteration
c pid       -> integer, process i.d. number
c
      implicit    double precision (a-h,o-z)
      parameter   (izero = 0)
      dimension   V(ldv,*), Av(ldav,*), Beta(ldb,*), buf(ldbf,*)
      integer     nodenm(*), kmax, type0, pid
c
c form V'AV
c
      call AtrBA (V, ldv, k, Av, ldav, Beta, ldb, buf, ldbf,
     *            id, m, kmax, type0, nodenm, pid)
c
c find Cholesky factorization of V'AV (simultaneously on all nodes)
c and determine new block size (i.e., rank of Beta)
c
      call dpofa2 (Beta, ldb, k, info, id, iunit, iflag, rnktol)
      if (iflag .eq. 999) return
      if (info .ne. 0) then
          kr = abs(info) - 1
      else if (info .eq. 0) then
          kr = k
      endif
c
      return
      end
c
c
      subroutine AtrBA (A, lda, k, BA, ldb, C, ldc, buf, ldbf,
     *                   id, m, kmax, type0, nodenm, pid)
c
c computes C = A'* BA, where BA is B*A with B symmetric.
c A (n x k), BA (n * k) and C (k * k)
c This node controls m rows of A and the respective m rows of BA
c The result is explicitly symmetrized, and both halves are computed.
c
c PARAMETERS
c A         -> double precision, size m*k, input matrix
c lda       -> integer, leading dimension of A
c k         -> integer, block size
c BA        -> double precision, size m*k, input matrix B*A
c ldb       -> integer, leading dimension of BA
c C        <-  double precision, size k*k, result matrix
c ldc       -> integer, leading dimension of C
c buf       -  double precision, size k*k, work matrix
c ldbf      -> integer, leading dimension of buf
c id        -> integer, i.d. number of this processor
c m         -> integer, number of rows on this processor
c kmax      -> integer, number of active processors
c type0    <-> integer, type number for message passing
c nodenm    -  integer, size kmax, array used by lower-level routines
c pid       -> integer, process i.d. number
c
      implicit         double precision (a-h,o-z)
      parameter        (idpsze = 8, izero = 0)
      double precision A(lda,*), BA(ldb,*), C(ldc,*)
      double precision buf(ldbf,*)
      integer          nodenm(*), kmax, type0, pid
c
      msgl = ldc * k
c
c form piece of A'BA
c
      do 10 i = 1, k
      do 10 j = 1, k
          C(i,j) = ddot(m, A(1,i), 1, BA(1,j), 1)
10    continue
c
c symmetrize result
c
      do 15 i = 1,k
      do 15 j = 1,i-1
           C(j,i) = (C(i,j)+C(j,i))*5.d-1
           C(i,j) = C(j,i)
15    continue
c
c  get sum of C from all nodes
c
      call xgdsum (C, msgl, buf, kmax, id, type0, nodenm, pid)
c
      return
      end
c
c
      subroutine getar (R, ldr, k, AR, ldar, kmax, m, myrows, rowprc,
     *                  nodemx, wk, wk1, wk2, type0, id, pid, n,
     *                  xnl, g, sfun)
c
c getar: forms A*R, where R is a block matrix, by finite differencing
c
c PARAMETERS
c
c R             -> double precision, size m*k, original block matrix
c ldr           -> integer, leading dimension of R
c k             -> integer, block size
c AR           <-  double precision, size m*k, result: A*R (A = Hessian)
c ldar          -> integer, leading dimension of AR
c kmax          -> integer, # of active processors
c m             -> integer, # of rows in R and AR
c myrows        -> integer, initial row of block on this processor
c rowprc        -> integer, size kmax*2, blocking information
c nodemx        -> integer, leading dimension of rowprc
c wk, wk1, wk2  -  double precision, size n, work arrays
c type0         -> integer, type # for sending messages
c id            -> integer, index of this processor
c pid           -> integer, process id
c n             -> integer, # of variables
c xnl           -> double precision, current estimate of nonlinear variables
c g             -> double precision, current gradient
c
      implicit         double precision (a-h,o-z)
      parameter        (idpsze = 8)
      double precision R(ldr,*), AR(ldar,*), wk(*), wk1(*),
     *                 wk2(*), g(*), xnl(*)
      integer          rowprc(nodemx,2), kmax, pid, type0, type1
      logical          active
      external         sfun
c
c transpose R (sending one sub-column at a time)
c
      type1 = type0 + kmax
      do 10 j = 1,k
          if (j .ne. id+1) then
              call csend (type0+id, R(1,j), m*idpsze, j-1, pid)
          else
              call dcopy (m, R(1,j),1, wk(myrows),1)
          endif
10    continue
c
c receive one column of R in wk, and form one column of AR in wk1
c
      if (id .lt. k) then
          do 20 j = 1,kmax
          if (j .ne. id+1)
     *         call crecv (type0+j-1, wk(rowprc(j,1)),
     *                  rowprc(j,2)*idpsze)
20        continue
          active = .true.
          call atimes  (wk, wk1, n, xnl, g, wk2, sfun, active, k)
c
c 'untranspose' the result wk1, sending sub-columns to rest of cube
c
          do 30 j = 1,kmax
              if (j. ne. id+1) then
                  call csend (type1+id, wk1(rowprc(j,1)),
     *                    rowprc(j,2)*idpsze, j-1, pid)
              else
                  call dcopy(m, wk1(myrows), 1, AR(1,j), 1)
              endif
30        continue
      else
          active = .false.
          call atimes  (wk, wk1, n, xnl, g, wk2, sfun, active, k)
      endif
c
c store result in AR
c
      do 40 j = 1,k
          if (j .ne. id+1) call crecv(type1+j-1, AR(1,j), m*idpsze)
40    continue
      type0 = type0 + 2*kmax
c
      return
      end
c
c
      subroutine atimes (v, av, n, x, g, wk, sfun, active, k)
      implicit         double precision  (a-h,o-z)
      double precision v(*), av(*), x(*), g(*), wk(*)
      logical          active
c
c compute matrix/vector product via finite-differencing
c
c PARAMETERS
c v       -> double precision, size n, input for matrix/vector product
c av      <- double precision, size n, result of matrix/vector product (Av)
c n       -> integer, dimension of problem
c x       -> double precision, size n, current nonlinear parameter vector
c g       -> double precision, size n, current nonlinear gradient
c wk      -  double precision, size n, work array
c sfun    -> subroutine sfun (n,x,f,g,a,k) to evaluate nonlinear function
c active  -> logical, true if this processor must evaluate f(x)
c k       -> integer, the current block size
c
c REQUIRES
c d1mach -  function to specify machine constants
c BLAS   -  basic linear algebra subroutines
c
      if (.not. active) then
          call sfun (n, wk, fw, av, active, k)
          return
      end if
      eps = d1mach(3)
      h   = 1.d1*sqrt(eps)
      rh  = 1.d0 / h
      call dcopy (n, x, 1, wk, 1)
      call daxpy (n, h, v, 1, wk, 1)
      call sfun  (n, wk, fw, av, active, k)
      call daxpy (n, -1.d0, g, 1, av, 1)
      call dscal (n, rh, av, 1)
c
      return
      end
c
c
      subroutine frmpc (D, alpha, R, AR, mm, m, kk, k,
     *            id, iunit, kmax, wk, wk1,
     *            nodenm, type0, pid)
c
c Update the diagonal preconditioner, based on BFGS formula
c
c PARAMETERS
c D        <-> double precision, size m, diagonal preconditioner
c alpha     -> double precision, size k*k, matrix from routine BCG
c R         -> double precision, size m*k, matrix from routine BCG
c AR        -> double precision, size m*k, matrix from routine BCG
c mm        -> integer, leading dimension of R and AR
c m         -> integer, number of rows handled by this processor
c kk        -> integer, leading dimension of alpha
c k         -> integer, current block size
c id        -> integer, i.d. number of this processor
c iunit     -> integer, unit number for output
c kmax      -> integer, number of processors in use
c wk        -  double precision, size 1, work variable
c wk1       -  double precision, size 1, dummy argument (to match
c              with FORMM1)
c nodenm    -  integer, size kmax, work array for lower routines
c type0    <-> integer, records current message type
c pid       -> integer, process i.d. number
c
      implicit         double precision (a-h,o-z)
      double precision D(*), R(mm,*), AR(mm,*), alpha(kk,*),
     *                 wk(*), wk1(*)
      integer          nodenm(*), type0, pid
c
c update D for each vector in the block
c
      do 30 ind = 1,k
c
c form v'Dv for this node, and then produce global result
c
          vdv = 0.d0
          do 10 i = 1,m
              vdv = vdv + r(i,ind)*D(i)*r(i,ind)
10        continue
          call xgdsum (vdv, 1, wk, kmax, id, type0, nodenm, pid)
c
c update D
c
          vgv = alpha(ind,ind)
          if (abs(vdv) .le. 1.d-12) go to 30
          if (abs(vgv) .le. 1.d-12) go to 30
          do 20 i = 1,m
              t1 = D(i)*R(i,ind)
              t2 = AR(i,ind)
              D(i) = D(i) - t1*t1/vdv + t2*t2/vgv
              if (D(i) .le. 1.d-6) D(i) = 1.d0
20        continue
c
30    continue
c
      return
      end
c
c
      subroutine frmpc1 (D, alpha, R, AR, mm, m, kk, k,
     *            id, iunit, kmax, wk, wk1,
     *            nodenm, type0, pid)
c
c Update the diagonal preconditioner, BFGS (approximate) formula
c
c PARAMETERS
c D        <-> double precision, size m, diagonal preconditioner
c alpha     -> double precision, size k*k, matrix from routine BCG
c R         -> double precision, size m*k, matrix from routine BCG
c AR        -> double precision, size m*k, matrix from routine BCG
c mm        -> integer, leading dimension of R and AR
c m         -> integer, number of rows handled by this processor
c kk        -> integer, leading dimension of alpha
c k         -> integer, current block size
c id        -> integer, i.d. number of this processor
c iunit     -> integer, unit number for output
c kmax      -> integer, number of processors in use
c wk        -  double precision, size k, work array
c wk1       -  double precision, size k, work array
c nodenm    -  integer, size kmax, work array for lower routines
c type0    <-> integer, records current message type
c pid       -> integer, process i.d. number
c
      implicit         double precision (a-h,o-z)
      double precision D(*), R(mm,*), AR(mm,*), alpha(kk,*),
     *                 wk(*), wk1(*)
      integer          nodenm(*), type0, pid
c
c form v'Dv for this node, and then produce global result
c
      do 20 ind = 1,k
          vdv = 0.d0
          do 10 i = 1,m
              vdv = vdv + r(i,ind)*D(i)*r(i,ind)
10        continue
          wk(ind) = vdv
20    continue
      call xgdsum (wk, k, wk1, kmax, id, type0, nodenm, pid)
c
c update D
c
      do 40  ind = 1,k
          vgv = alpha(ind,ind)
          vdv = wk(ind)
          if (abs(vdv) .le. 1.d-12) go to 40
          if (abs(vgv) .le. 1.d-12) go to 40
          do 30 i = 1,m
              t1 = D(i)*R(i,ind)
              t2 = AR(i,ind)
              D(i) = D(i) - t1*t1/vdv + t2*t2/vgv
              if (D(i) .le. 1.d-6) D(i) = 1.d0
30        continue
c
40    continue
c
      return
      end
c
c
      subroutine precg (V, ldv, m, myrows, b, kmax, iter, initv, iflag)
c
c initialize the matrix V
c      initv = 1  col 1 = b, all others random
c      initv = 2  col 1 = b, col 2 = previous direc, all others random
c      initv = 3  col 1 = b, alternate previous direc and previous grad
c
c PARAMETERS
c V      <-> double precision, size m*kmax, initial matrix for block CG
c            iteration
c ldv     -> integer, leading dimension of V
c m       -> integer, # of rows on this processor
c myrows  -> integer, index of first row on this processor
c b       -> double precision, size n [size of problem], right-hand side
c            vector
c kmax    -> integer, # of active processors
c iter    -> integer, outer iteration number
c initv   -> integer, type of initialization desired
c iflag   -> integer, indicator flag (tests if initv=3 is failing)
c
      implicit         double precision (a-h,o-z)
      double precision V(ldv,*), b(*)
c
c Store b as first column of V
c
      call dcopy (m, b(myrows), 1, V, 1)
c
c INITV=1,2,3:  Dynamic (and random) initialization
c
      krand = 2
      if (initv .eq. 2) then
          if (iter .gt. 1) krand = 3
      else if (initv .eq. 3) then
          if (iflag .eq. -1) then
              krand = 3
              go to 20
          endif
          krand = 2*iter
          if (krand .gt. kmax) go to 40
      endif
20    do 30 j = krand,kmax
          call rancol (V, ldv, j, m, myrows)
30    continue
c
40    return
      end
c
c
      subroutine postcg (V, ldv, n, m, myrows, k, iter, initv,
     *                  Vinit, ldvi, x, g, gnrm, iflag, id, kmax,
     *                  nodenm, type0, pid)
c
c initialize the matrix V
c    initv = 2  col 1 = b, col 2 = previous direc(x), all others random
c    initv = 3  col 1 = b, alternate previous direcs(x) and grads(g)
c If directions are linearly dependent (at the first inner iteration)
c the offending column is replaced by a random column.  If this happens
c for the same column two outer iterations in a row, subroutine btn
c switches to initialization strategy 3.
c
c V        <-  double precision, size m*k, new initialization matrix
c ldv       -> integer, leading dimension of Vf
c n         -> integer, size of problem
c m         -> integer, number of rows on this processor
c myrows    -> integer, index of first row on this processor
c k         -> integer, block size
c iter      -> integer, outer iteration number
c initv     -> integer, choice of initialization scheme
c Vinit    <-> double precision, size m*k, storage of initialization vectors
c ldvi      -> integer, leading dimension of Vinit
c x         -> double precision, size n, search-direction vector
c g         -> double precision, size n, current gradient
c gnrm      -> double precision, infinity-norm of g
c iflag     -> integer, flag from BCG (indicates linear dependence)
c id        -> integer, i.d. number of this processor
c kmax      -> integer, number of active processors
c nodenm    -  integer, size kmax, array used by lower-level routines
c type0    <-> integer, type number for message passing
c pid       -> integer, process i.d. number
c
      implicit         double precision (a-h,o-z)
      double precision V(ldv,*), Vinit(ldvi,*), x(*), g(*)
      integer          nodenm(*), type0, pid
c
      if (k .eq. 1) return
      if (initv .eq. 1) return
c
c store search direction in column 2
c
      xnrm = pdnrmi (m, x(myrows), 1, kmax,
     *            id, type0, nodenm, pid)
      call dcopy (m, x(myrows), 1, V(1,2), 1)
      call dscal (m, 1.d0/xnrm, V(1,2), 1)
      if (initv .eq.  2) return
      if (iflag .eq. -1) return
      if (k     .eq.  2) return
c
c store current gradient in column 3
c
      call dcopy (m, g(myrows), 1, V(1,3), 1)
      call dscal (m, 1.d0/gnrm, V(1,3), 1)
      if (k .eq. 3) return
      if (iflag .gt. 0)
     *     call rancol (Vinit, ldvi, iflag, m, myrows)
c
c update remaining columns
c
      do 10 j = k, 4, -1
          call dcopy (m, Vinit(1,j-2), 1, Vinit(1,j), 1)
          call dcopy (m, Vinit(1,j  ), 1,     V(1,j), 1)
10    continue
c
      call dcopy (m, V(1,2), 1, Vinit(1,2), 1)
      call dcopy (m, V(1,3), 1, Vinit(1,3), 1)
c
      return
      end
c
c
      subroutine psrch (iflag, n, x, f, g, d, x1, g1, gopt,
     *            sfun, nf, eta, stepmx, id, pid, kmax, cubesz,
     *            nodenm, type0, dg, alpha)
c
c Parallel line search
c      Using Armijo convergence test based on function values only
c      See Luenberger (2nd edition), p. 212
c
c Parameters:
c iflag  <--  integer, error code:
c                 0 => okay
c                -1 => okay, but alpha <> 1
c                 1 => no acceptable point found
c                 2 => d is a direction of ascent
c                 3 => function may be unbounded below
c n       --> integer, number of variables
c x      <--> double precision, size n, current and new vector of parameters
c f      <--> double precision, current and new function value
c g      <--> double precision, size n, current and new gradient vector
c d       --> double precision, size n, search direction vector
c x1      --  double precision, size n, work array to store temporary x
c g1      --  double precision, size n, work array to store temporary g
c gopt    --  double precision, size n, work array to store temporary g
c sfun    --> subroutine to evaluate f(x) (call sfun (n,x,f,g,a,k))
c nf     <--> integer, total number of function/gradient evaluations
c eta     --> double precision, parameter for line search
c stepmx  --> double precision, maximum step allowed
c id      --> integer, id number of processor
c pid     --> integer, process id number
c kmax    --> integer, number of processors available to evaluate f(x)
c cubesz  --> integer, number of processors
c alpha  <--  double precision, final step size
c
      implicit         double precision (a-h, o-z)
      double precision d(*), g(*), g1(*), gopt(*),
     *                 x(*), x1(*), ww(3), wx(3)
      integer          nodenm(*), kmax, pid, type0, cubesz
      logical          active, aup, adown
      external         fcomp
c
c  set up
c
      id      = mynode ()
      aup     = .true.
      adown   = .true.
      idpsze  = 8
      nidpsze = n * idpsze
      alfopt  = 0.d0
      fopt    = f
      itmax   = 2 + 30/kmax
c
      iflag   = 0
      if (dg .ge. 0.d0) then
          iflag = 2
          return
      endif
      icount = 0
c
c  get maximum step and initialize alpha
c
      if (kmax .eq. 1) then
          alpha = 1.d0
          if (stepmx .lt. alpha) alpha = stepmx
          amax = alpha
          amin = amax
      else
          amax = kmax
          if (amax .gt. stepmx) amax = stepmx
          if (amax .gt. 1.d0) then
              amin = 1.d0 / dfloat(kmax)
              if (kmax .eq. 2) amin = 1.d0
          else
              amin = amax / dfloat(kmax)
          end if
          call setalf (amax, amin, alpha, kmax, id, 1)
      endif
c
c  test function values at initial points
c
20    nf = nf + 1
      if (id .lt. kmax) then
          active = .true.
          call dsvtvp (n, alpha, d, 1, x, 1, x1, 1)
          call sfun (n, x1, f1, g1, active, kmax)
      else
          active = .false.
          call sfun (n, x1, f1, g1, active, kmax)
          f1 = f + 1.d0
      end if
      ftest = f + eta * alpha * dg
c
c determine minimum function value using the subroutine gopf
c
      fmin = f
      f2   = f1
      if (f2 .gt. ftest) f2 = fmin + 1.d0
      if (cubesz .eq. 1) then
          fmin = f2
          imin = 1
      else
          wx(1) = f2
          wx(2) = alpha
          wx(3) = id + 1
          nw = 3 * idpsze
          call gopf (wx, nw, ww, fcomp)
          fmin  = wx(1)
          alpha = wx(2)
          imin  = wx(3)
      endif
      if (fmin .ge. f) then
          imin  = 0
          fmin  = f
          alpha = 0.d0
      end if
      icount = icount + 1
c
c Test for success at this iteration and failure at last iteration.
c In this case, use the previous optimal step and exit.
c
      if (fmin .ge. fopt .and. alfopt .ne. 0.d0) then
          imin  = kmax
          alpha = alfopt
          fmin  = fopt
          if (id .eq. kmax-1) call dcopy (n, gopt, 1, g1, 1)
          go to 30
      end if
c
c Test for too many iterations
c
      if (icount .ge. itmax .and. imin .eq. 0) then
          iflag = 1
          return
      endif
      if (icount .ge. itmax .and. imin .eq. kmax) then
          iflag = 3
      endif
c
c Test to see if step must be reduced
c
      if (imin .eq. 0 .and. adown) then
          aup  = .false.
          amax = 0.9d0 * amin
          amin = amax / dfloat (kmax)
          if (kmax .eq. 1) then
              amax = amin / 2.d0
              amin = amax
          end if
          call setalf (amax, amin, alpha, kmax, id, 2)
          go to 20
      endif
c
c Test to see if step must be increased
c
      if (kmax .gt. 1 .and. imin .eq. kmax
     *          .and. amax .lt. stepmx .and. aup) then
          adown = .false.
          if (fmin .lt. fopt) then
              alfopt = alpha
              fopt   = fmin
              if (id .eq. kmax-1) call dcopy (n, g1, 1, gopt, 1)
          end if
          amin = amax * 1.1d0
          amax = amin * kmax
          if (amax .gt. stepmx) amax = stepmx
          call setalf (amax, amin, alpha, kmax, id, 1)
          go to 20
      endif
c
c form the new x, and send g to all the nodes
c
30    call dsvtvp (n, alpha, d, 1, x, 1, x, 1)
      if (alpha .ne. 1.d0) iflag = -1
      f = fmin
      if (imin .eq. id + 1) then
          call dcopy (n, g1, 1, g, 1)
          do 60 j = 1, cubesz-1
              nodenm(j) = j-1
              if (j .gt. id) nodenm(j) = nodenm(j) + 1
60        continue
          call gsendx (type0, g, nidpsze, nodenm, cubesz-1)
      else
          call crecv (type0, g, nidpsze)
      endif
      type0 = type0 + 1
c
      return
      end
c
c
      subroutine setalf (amax, amin, alpha, cubesz, id, ind)
c
c determine a set of steplengths alpha for the line search
c
c Parameters
c amax    --> maximum value of alpha
c amin   <--  minimum value of alpha
c alpha  <--  value of alpha for this processor
c cubesz  --> number of processors
c id      --> id number of this processor
c ind     --> indicator: =1 (normal) =2 (shrinking step)
c
      implicit   double precision (a-h, o-z)
      integer    id, cubesz
c
c Set up
c
      if (id .ge. cubesz) then
          alpha = 0.0d0
          return
      end if
      np1 = cubesz - 1
      if (amin .ge. amax) amin = amax / dfloat(cubesz)
      if (ind .eq. 2) go to 100
c
      if (cubesz .gt. 2) then
          if (amin .lt. 1.d0 .and. amax .gt. 1.d0) then
              imid = cubesz/2
              if (id .lt. imid) then
                  alpha = amin + id * (1.d0-amin)/dfloat(imid)
              else if (id .gt. imid) then
                  alpha = 1.d0 +
     *                  (id-imid) * (amax-1.d0)/dfloat(np1-imid)
              else if (id .eq. imid) then
                  alpha = 1.d0
              end if
          else
              alpha = amin + id * (amax-amin)/dfloat(np1)
          end if
      else
          if (id .eq. 0) alpha = amin
          if (id .eq. 1) alpha = amax
      end if
      return
c
c Shrinking of step (more rapid reduction of step to zero)
c
100   if (cubesz .ge. 2) then
          alpha = amax / 2.d0**(cubesz-id-1)
          amin  = amax / 2.d0**(cubesz-1)
      else
          alpha = amax
          amin  = amax
      end if
      return
c
      end
c
c
      subroutine fcomp (wx, ww)
c
c Comparison routine used by gopf above.
c Test for smallest function value, and set related parameters
c
      double precision wx(*), ww(*)
c
      if (ww(1) .lt. wx(1)) then
          wx(1) = ww(1)
          wx(2) = ww(2)
          wx(3) = ww(3)
      else if (ww(1) .eq. wx(1) .and. ww(3) .lt. wx(3)) then
          wx(1) = ww(1)
          wx(2) = ww(2)
          wx(3) = ww(3)
      end if
c
      return
      end
c
c
      subroutine aminsb (A, lda, B, ldb, C, ldc, m, n)
      double precision A(lda,*), B(ldb,*), C(ldc,*)
c
c computes C = A - B, for m*n matrices A, B, and C
c [C can overwrite either A or B]
c
      do 10 j = 1, n
          call dvsub (m, A(1,j), 1, B(1,j), 1, C(1,j), 1)
10    continue
c
      return
      end
c
c
      subroutine atimsb (A, lda, n, m, B, ldb, k, C, ldc)
      implicit         double precision (a-h, o-z)
      double precision A(lda,*), B(ldb,*), C(ldc,*)
c
c forms C = A x B,  for A(n*m), B(m*k), and C(n*k)
c
      do 10 j = 1,k
      do 10 i = 1,n
          C(i,j) = ddot (m,A(i,1),lda,B(1,j),1)
10    continue
c
      return
      end
c
c
      subroutine disast (routne, msg, iunit, iflag)
c
c print fatal error message to unit iunit, then return
c
c PARAMETERS
c routne -> character*(*), name of routine where error was detected
c msg    -> character*(*), error message
c iunit  -> integer, unit for output
c iflag <-  integer, flag=999 upon return from disaster
c
      character*(*) routne, msg
c
      write (iunit,800) routne, msg
      iflag = 999
c
      return
800   format (' ********************',         /,
     *        ' ERROR, ERROR, ERROR',          /,
     *        ' Fatal error in routine ', a10, /,
     *        ' ', a40,                        /,
     *        ' Terminating',                  /,
     *        ' ********************')
      end
c
c
      subroutine dpofa2 (A, lda, n, info, id, iunit, iflag, rnktol)
      implicit         double precision (a-h,o-z)
      double precision A(lda,*),  t, s, tol, mnorm
      integer          info, n
c
c Cholesky factor of a double precision positive definite matrix.
c this is a modification of the LINPACK routine DPOFA, designed to
c produce the lower triangular matrix a column at a time.
c
c info:   integer = 0 for normal return
c                 = k signals an error condition - the leading minor
c                     of order k is not positive definite.
c
      tol = rnktol * mnorm (A, lda, n, n)
c
c  get LDL decomposition from a Cholesky factorization
c
      do 30 i = 1,n
          s = ddot (i-1, A(i,1), lda, A(i,1), lda)
          s = A(i,i) - s
c
          info = i
          if (s .lt. -tol) return
          if (s .le.  tol) then
              info = - info
              return
          endif
          A(i,i) = sqrt(s)
c
          do 20 k = i+1,n
              t = a(k,i)  - ddot (i-1, A(i,1), lda, A(k,1), lda)
              if (A(i,i) .eq. 0.d0) then
                  call disast ('DPOFA2', 'A(i,i) = 0', iunit, iflag)
                  return
              end if
              t = t / A(i,i)
              A(k,i)  = t
20        continue
30    continue
      info = 0
c
      return
      end
c
c
      subroutine getld2 (L, ldl, k, D, iunit, iflag)
      implicit         double precision (a-h, o-z)
      double precision L(ldl,*), D(*)
c
c  get LDL decomposition from a Cholesky factorization
c
      do 10 i = 1,k
          D(i) = L(i,i)
          if (D(i) .eq. 0.d0) then
              call disast ('GETLD2', 'D(i) = 0', iunit, iflag)
              return
          end if
          do 10 j = 1,k
              if (j .gt. i) L(i,j) = 0.d0
              if (j .le. i) L(i,j) = L(i,j) / D(j)
10    continue
      call dvmul (k, D, 1, D, 1, D, 1)
c
      return
      end
c
c
      subroutine getmr (n, p, rowprc, nodemx, istart, iend, istep)
      integer rowprc(nodemx,2), p
c
c determine the rows handled by each processor
c
c PARAMETERS
c n        -> # of variables
c p        -> # of processors
c rowprc  <-  array to store indexing information
c nodemx   -> declared dimension of rowprc
c istart   -> beginning index
c iend     -> ending index
c istep    -> stride for index
c
      nn = (iend - istart + istep) / istep
c     m = n / p
      m = nn / p
c     rowprc(1,1) = 1
      rowprc(1,1) = istart
      rowprc(1,2) = m
      mnp = mod(nn,p)
c
      do 10 i = 2,p
c         rowprc(i,1) = rowprc(i-1,1) + rowprc(i-1,2)
          rowprc(i,1) = rowprc(i-1,1) + rowprc(i-1,2)*istep
          if (mnp .ge. i-1) then
              rowprc(i,2) = m + 1
          else
              rowprc(i,2) = m
          endif
10    continue
c
      return
      end
c
c
      subroutine lsol (l, ldl, k, y, x, iunit, iflag)
      implicit         double precision (a-h, o-z)
      double precision l(ldl,*), y(*), x(*)
c
c solve Lx = y where L is lower triangular
c the vectors x and y may be the same (overwrite solution on rhs)
c
      do 10 i = 1,k
          x(i) = y(i) - ddot (i-1, l(i,1), ldl, x, 1)
          if (l(i,i) .eq. 0.d0) then
              call disast ('LSOL', 'l(i,i) = 0', iunit, iflag)
              return
          end if
          x(i) = x(i) / l(i,i)
10    continue
c
      return
      end
c
c
      subroutine matcpy (A, lda, B, ldb, m, n)
      double precision A(lda,*), B(ldb,*)
c
c copies matrix A  (m x n) onto matrix B
c
      do 10 j = 1,n
          call dcopy (m, A(1,j), 1, B(1,j), 1)
10    continue
c
      return
      end
c
c
      subroutine maxpy (y, ldy, n, k, alpha, a, lda, m, x, ldx)
      double precision y(ldy,*), a(lda,*), x(ldx,*), alpha, t
c
c form y = y + alpha * Ax
c      y      n x k
c      A      n x m
c      x      m x k
c      alpha  scalar
c
      do 10 i = 1,n
      do 10 l = 1,m
          t = alpha*A(i,l)
          call daxpy (k,t,x(l,1),ldx,y(i,1),ldy)
10    continue
c
      return
      end
c
c
      double precision function mnorm (A, lda, m, n)
      implicit double precision (a-h,o-z)
      double precision A(lda,*), colnrm, zero
      data   zero /0.d0/
c
c compute the 1-norm of the m x n matrix A
c
      mnorm = zero
      do 10 j = 1,n
          colnrm = dasum (m, A(1,j), 1)
          if (mnorm .lt. colnrm) mnorm = colnrm
10    continue
c
      return
      end
c
c
      subroutine msolve (A, lda, m, n, Am, ldam, Rm)
      double precision A(lda,*), Am(ldam,*), Rm(*)
c
c  preconditions the m*n matrix A, using the vector Rm.
c  resulting matrix is in Am [A and Am can be the same]
c
      do 10 j = 1,n
          call dvdiv (m, A(1,j), 1, Rm, 1, Am(1,j), 1)
10    continue
c
      return
      end
c
c
      double precision function pddot (m, x, lx, y, ly, p, id,
     *                  type0, nodenm, pid)
c
c Parallel inner product routine
c Each node forms the inner product of its part of x and y in parallel
c Node 0 accumulates the inner products and sends result to other nodes.
c
c PARAMETERS
c m         -> integer, size of arrays on each processor
c x         -> double precision, size m, input array 1
c lx        -> integer, stride for x (as in DDOT)
c y         -> double precision, size m, input array 2
c ly        -> integer, stride for y (as in DDOT)
c p         -> integer, number of active processors
c id        -> integer, i.d. number of this processor
c type0    <-> integer, message number for communication
c nodenm    -  integer, size p, array for lower-level routines
c pid       -> integer, process i.d. number
c
      implicit         double precision (a-h,o-z)
      double precision x(*), y(*)
      integer          nodenm(*), p, type0, pid
c
      prod  = ddot (m, x, lx, y, ly)
      call xgdsum (prod, 1, sval, p, id, type0, nodenm, pid)
      pddot = prod
c
      return
      end
c
c
      double precision function pdnrmi (m, x, lx, p, id,
     *                  type0, nodenm, pid)
c
c Parallel infinity-norm routine
c Each node computes the norm of its portion of x in parallel
c Node 0 accumulates the norms and sends result to other nodes.
c
c PARAMETERS
c m         -> integer, size of arrays on each processor
c x         -> double precision, size m, input array
c lx        -> integer, stride for x (as in DNRM2)
c p         -> integer, number of active processors
c id        -> integer, i.d. number of this processor
c type0    <-> integer, message number for communication
c nodenm    -  integer, size p, array for lower-level routines
c pid       -> integer, process i.d. number
c
      implicit         double precision (a-h,o-z)
      double precision x(*)
      integer          nodenm(*), p, type0, pid
c
      if (m .gt. 0) then
          ix   = idamax (m, x, lx)
          xnrm = abs(x(ix))
      else
          xnrm = 0.d0
      end if
      call xgdmax (xnrm, 1, sval, p, id, type0, nodenm, pid)
      pdnrmi = xnrm
c
      return
      end
c
c
      subroutine premlt (A, lda, k1, n,  L, ldl, k2, B,ldb)
      implicit         double precision (a-h,o-z)
      double precision A(lda,*), L(ldl,*), B(ldb,*)
c
c forms B =  L' x A
c for A(k1*n) and L(k1*k2) lower triangular (with k1 .ge. k2)
c
      do 10 j = 1,n
      do 10 i = 1,k2
          B(i,j) = ddot (k1-i+1,A(i,j),1,L(i,i),1)
10    continue
c
      return
      end
c
c
      subroutine rancol (V, ldv, j, m, myrows)
c
c  generate a random vector for the j-th column of the matrix V
c
c PARAMETERS
c V      <-  double precision, size m*j, initialization matrix (see
c            PREBCG)
c ldv     -> integer, leading dimension of V
c j       -> integer, column to be randomized
c m       -> integer, # of rows on this processor
c myrows  -> integer, index of first row on this processor
c
      implicit         double precision (a-h,o-z)
      double precision V(ldv,*)
      integer          click, seed, rand
c
c setup
c
      click = 52343
      icol  = j
      seed  = mod(2*(icol)*click+1,65536)
      rand  = seed
c
c increment seed
c
      do 15 i = 1, myrows-1
         rand = mod(3125*rand,65536)
15    continue
c
c generate random column
c
      do 20 i = 1, m
          rand = mod(3125*rand,65536)
          V(i,j) = (rand - 32768.0d0)/16384.0d0
20    continue
c
      return
      end
c
c
      subroutine timesl (A, lda, n, k1, L, ldl, k2)
      implicit         double precision (a-h,o-z)
      double precision A(lda,*), L(ldl,*), t
c
c performs A = A x L
c for A(n*k1) and L(k1*k2) lower triangular (with k1 .ge. k2)
c
      do 20 j = 1,k2
      do 20 i = 1,n
          t      = ddot(k1-j+1,A(i,j),lda,L(j,j),1)
          A(i,j) = t
20    continue
c
      return
      end
c
c
      subroutine vltinv (V, ldv, m, k, P, ldp, L, ldl, iunit, iflag)
      implicit         double precision (a-h,o-z)
      double precision L(ldl,*),  V(ldv,*), P(ldp,*), t
c
c Solves P L' = V, for L(k*k) lower triangular, P(m*k) and V(m*k)
c V may be overwritten
c
      do 20 i = 1,m
          do 10 j = 1,k
            t = V(i,j) - ddot (j-1,P(i,1),ldp,L(j,1),ldl)
            if (L(j,j) .eq. 0.d0) then
                call disast ('VLTINV', 'L(j,j) = 0', iunit, iflag)
                return
            end if
            P(i,j) = t / L(j,j)
10        continue
20    continue
c
      return
      end
c
c
      subroutine xgcol (x, lx, y, ly, ncnt, p, m, id, type0, n,
     *                  nodenm, pid)
c
c If p=cubesz, then call GCOL.
c Otherwise, collect a vector x in the vector y on node 0, and then
c     send y to all nodes.
c
c PARAMETERS
c x         -> double precision, size lx, input vector (one on each
c              processor)
c lx        -> integer, length of x (in bytes)
c y         -> double precision, size ly, output vector
c ly        -> integer, length of y (in bytes)
c ncnt      -  integer, dummy used by GCOL
c p         -> integer, number of active processors
c m         -> integer, dimension of x
c id        -> integer, i.d. number of this processor
c type0    <-> integer, type number for messages
c n         -> integer, dimension of y
c nodenm    -  integer, size p, array for global send
c pid       -> integer, process i.d. number
c
      implicit         double precision (a-h,o-z)
      double precision x(*), y(*)
      integer          nodenm(*), type0, p, pid, cubesz
      parameter        (izero = 0, idpsze=8)
c
      cubesz = numnodes()
      if (p .eq. cubesz) then
          call gcol (x, lx, y, ly, ncnt)
          return
      end if
c
      if (id .eq. izero) then
          call dcopy (m, x, 1, y, 1)
          mnp = mod(n,p)
          loc = m + 1
          do 20 iproc = 1, p-1
              msgl = m
              if (iproc .le. mnp) msgl = m + 1
              call crecv (type0+iproc, y(loc), msgl*idpsze)
              loc = loc + msgl
20        continue
          do 30 i = 1,cubesz-1
              nodenm(i) = i
30        continue
          ndpsize = n * idpsze
          call gsendx (type0, y, ndpsize, nodenm, cubesz-1)
      else if (id .lt. p) then
          call csend (type0+id, x, lx, izero, pid)
          call crecv (type0,  y, ly)
      else
          call crecv (type0,  y, ly)
      endif
      type0 = type0 + p
c
      return
      end
c
c
      subroutine xgdmax (x, n, work, p, id, type0, nodenm, pid)
c
c If p=cubesz, call GDHIGH.
c Otherwise, maximize the components of the vectors x on the nodes, and
c    store the result on all the nodes.
c
c PARAMETERS
c x        <-> double precision, size n, vector to be maxed, as well
c              as the result
c n         -> integer, dimension of x
c work      -  double precision, size n, work array
c p         -> integer, number of active processors
c id        -> integer, i.d. number of processor
c type0    <-> integer, type number for messages
c nodenm    -  integer, size p, array for global send
c pid       -> integer, process i.d. number
c
      implicit         double precision (a-h, o-z)
      double precision x(*), work(*)
      integer          nodenm(*), p, type0, pid, cubesz
      parameter        (izero=0, idpsze=8)
c
      cubesz = numnodes()
      if (p .eq. cubesz) then
          call gdhigh (x, n, work)
          return
      end if
c
      ndpsize = n * idpsze
      if (id .eq. izero) then
          do 10 iproc = 1, p-1
              call crecv (type0+iproc, work, ndpsize)
              call dvmax (n, x, 1, work, 1, x, 1)
10        continue
          do 20 i = 1,p-1
              nodenm(i) = i
20        continue
          call gsendx (type0, x, ndpsize, nodenm, p-1)
      else
          call csend (type0+id, x, ndpsize, izero, pid)
          call crecv (type0,  x, ndpsize)
      endif
      type0 = type0 + p
c
      return
      end
c
c
      subroutine xgdsum (x, n, work, p, id, type0, nodenm, pid)
c
c If p=cubesz, call GDSUM.
c Otherwise, sum the components of the vectors x on the nodes, and
c    store the result on all the nodes.
c
c PARAMETERS
c x        <-> double precision, size n, vector to be summed and
c              the result
c n         -> integer, dimension of x
c work      -  double precision, size n, work array
c p         -> integer, number of active processors
c id        -> integer, i.d. number of processor
c type0    <-> integer, type number for messages
c nodenm    -  integer, size p, array for global send
c pid       -> integer, process i.d. number
c
      implicit         double precision (a-h, o-z)
      double precision x(*), work(*)
      integer          nodenm(*), p, type0, pid, cubesz
      parameter        (izero=0, idpsze=8)
c
      cubesz = numnodes()
      if (p .eq. cubesz) then
          call gdsum (x, n, work)
          return
      end if
c
      ndpsize = n * idpsze
      if (id .eq. izero) then
          do 10 iproc = 1, p-1
              call crecv (type0+iproc, work, ndpsize)
              call dvadd (n, x, 1, work, 1, x, 1)
10        continue
          do 20 i = 1,p-1
              nodenm(i) = i
20        continue
          call gsendx (type0, x, ndpsize, nodenm, p-1)
      else
          call csend (type0+id, x, ndpsize, izero, pid)
          call crecv (type0,  x, ndpsize)
      endif
      type0 = type0 + p
c
      return
      end
c********************************************************************
c BTN: CHKDER, parallel derivative checker for nonlinear functions
c last changed: 09/27/91
c********************************************************************
c
      subroutine chkez (n, x, f, g, w, lw, iw, sfun, iflag, errmax,
     *                  imax)
      implicit         double precision (a-h,o-z)
      double precision x(*), g(*), w(*)
      integer          iw(*)
      external         sfun
c
c Check derivatives of objective function via finite-differencing.
c Easy to use version
c
c PARAMETERS
c n       -> integer, dimension of problem
c x       -> double precision, size n, point where derivatives are checked
c f      <-  double precision, value of function at the point x (from sfun)
c g      <-  double precision, size n, gradient at the point x (from sfun)
c w       -  double precision, size n, work array
c lw      -> integer, declared length of array w [not used]
c iw      -> integer, size 2k, integer work array (k = # of processors
c            on hypercube)
c sfun    -> subroutine sfun (n,x,f,g) to evaluate f(x), g(x);
c            see subroutine BTNEZ for details. Routine SFUN must
c            be declared EXTERNAL in the calling program.
c iflag  <-  integer, error code:   0 => gradient values appear accurate
c                                 100 => gradient error > .01
c errmax <-  double precision, size of largest error in the gradient
c imax   <-  integer, index where errmax was obtained
c
c set default values of customizing parameters
c
      call chkpar (n, kmax, msglvl, iunit, istart, iend,
     *      istep)
      nodemx  = numnodes()
c
c check selected gradient values
c
      call chkder (n, x, f, g, w, lw, iw, sfun, iflag,
     *             nodemx, kmax, errmax, imax,
     *             msglvl, iunit, istart, iend, istep)
      return
      end
c
c
      subroutine chkpar (n, kmax, msglvl, iunit, istart, iend,
     *                   istep)
c
c
c  sets parameters for  derivative checker
c
c n         -> integer, number of variables
c kmax     <-  integer, block size
c              default:   kmax = numnodes()
c msglvl   <-  integer,  amount of printing desired,
c              (<-1 => none
c                -1 => warning messages only
c                 0 => one line with maximum error
c                 1 => individual components are printed.
c                      Processor i will print to iunit + i
c                      (if unit is not already opened, then
c                      output will be sent to file chkout.<iunit+i>)
c              default:   msglvl = 0
c iunit    <-  see discussion for msglvl
c              default:   iunit = 10
c istart   <-  integer, index of the first partial derivative
c              to be checked
c              default:  istart = 1
c iend     <-  integer, index of the last partial derivative to be
c              checked
c              default:  iend = istep*kmax
c istep    <-  integer, gradient is tested every istep components.
c              default: istep = n/kmax
c              Warning: if n is large or if gradient values are
c                       expensive, checking the gradient values can
c                       be very time consuming.  The default values
c                       result in each processor checking exactly
c                       one gradient component.
c
      kmax    = numnodes()
      msglvl  = 0
      iunit   = 10
      istart  = 1
      istep   = n/kmax
      if (istep .le. 0) istep = 1
      iend    = istart + istep*kmax - 1
      if (iend .gt. n) iend = n
c
      return
      end
c
c
      subroutine chkder (n, x, f, g, w, lw, rowinf, sfun, iflag,
     *                   nproc, kmax, errmax, imax,
     *                   msglvl, iunit, istart, iend, istep)
c
c Check derivatives of objective function via finite-differencing.
c
c PARAMETERS
c n       -> integer, dimension of problem
c x       -> double precision, size n, point where derivatives are checked
c f      <-  double precision, value of function at the point x (from sfun)
c g      <-  double precision, size n, gradient at the point x (from sfun)
c w       -  double precision, size n, work array
c lw      -> integer, declared length of array w [not used]
c rowinf <-  integer, size (nproc,2), information on processor allocation
c sfun    -> subroutine sfun (n,x,f,g,a,k) to evaluate f(x), g(x);
c            see subroutine BTNEZ for details. [Note: if more than one
c            processor is being used to compute each function value,
c            then an 'inactive' processor # (id) will call sfun at the
c            same time as processor # (mod(id,kmax)).] Routine sfun must
c            be declared EXTERNAL in the calling program.
c iflag  <-  integer, error code:   0 => gradient values appear accurate
c                                 100 => gradient error > .01
c nproc   -> integer, maximum number of processors allowed (also the
c            leading dimension of rowinf); need not equal actual number
c            of processors
c kmax    -> integer, block size [normally equal to nproc, unless more
c            than one processor is being used to compute each function
c            value; see subroutine BTN for further details]
c errmax <-  double precision, size of largest error in the gradient
c imax   <-  integer, index where errmax was obtained
c msglvl  -> integer, specify amount of printing desired:
c                   <0 => none
c                  <-1 => none
c                   -1 => warning messages only
c                    1 => individual components to unit (iunit+id)
c                         (if unit is not already opened, then output
c                         will be sent to file chkout.<iunit+id>)
c            where id is the processor #
c iunit   -> integer, see discussion of parameter msglvl
c istart  -> integer, test gradient starting at component istart
c iend    -> integer, test gradient starting at component iend
c istep   -> integer, test gradient at every istep component
c                    e.g., if istep=2 then test every 2nd component
c
c REQUIRES
c d1mach  -  function to specify machine constants
c
      implicit         double precision (a-h,o-z)
      double precision x(*), g(*), w(*), ww(2), wx(2), f
      integer          rowinf(nproc,2), cubesz
      logical          active, msg0, msg1, inuse
      character        outfle*10
      external         fcomp1
c
c Setup: obtain cube and processor information
c
      iflag  = 0
      id     = mynode()
      cubesz = numnodes()
      idpsze = 8
      eps = d1mach(3)
      if (1.d0 + 2.d0*eps .eq. 1.d0) then
          if (id .eq. 0) then
              write (*,*) ' ### CHKDER: ERROR ###'
              write (*,*) '     Machine epsilon too small'
              write (*,*) '     Value = ', eps
              write (*,*) '     Modify routine D1MACH'
              write (*,*) '     Terminating execution'
          end if
          stop
      end if
      h      = sqrt(eps)
      errmax = 0.d0
      imax   = 0
      msg0   = msglvl .ge. 0
      msg1   = msglvl .ge. 1
      iu     = iunit + id
      inquire (iu, opened = inuse)
      if (msg1 .and. .not. inuse) then
          if (id .gt. 99) then
              write (outfle,830) id
          else if (id .gt. 9) then
              write (outfle,840) id
          else
              write (outfle,850) id
          end if
          open (iu, file = outfle, status = 'unknown')
      end if
      if (msg1) write (iu,800)
c
c compare n with blocksize
c
      if (n .lt. kmax) then
          if (id .eq. 0 .and. msglvl .ge. -1) then
              write (*,*) ' CHKDER - Warning: blocksize too small'
              write (*,*) '          kmax = ', kmax
              write (*,*) '          n    = ', n
              write (*,*) '          Setting kmax = n'
          end if
          kmax = n
      end if
      if (id .lt. kmax) then
          active = .true.
      else
          active = .false.
          id0 = mod (id,kmax)
      end if
c
c get row information for this processor
c
      call getmr (n, kmax, rowinf, nproc, istart, iend, istep)
      if (active) then
          i1 = rowinf(id+1,1)
          if (id+1 .lt. kmax) then
              i2 = rowinf(id+2,1)-1
          else
              i2 = iend
          end if
c
c evaluate function and gradient at x
c
          call sfun (n, x, f, g, active, kmax)
c
c compute finite-difference estimate of gradient
c
          do 10 i = i1, i2, istep
              xi = x(i)
              x(i) = x(i) + h
              call sfun (n, x, fx, w, active, kmax)
              gi = (fx - f) / h
              erri = abs(gi - g(i)) / (1.d0 + abs(g(i)))
              if (msg1) write (iu,810) i, g(i), gi, erri
              if (erri .gt. errmax) then
                  errmax = erri
                  imax = i
              end if
              x(i) = xi
10        continue
      else
          i1 = rowinf(id0+1,1)
          if (id0+1 .lt. kmax) then
              i2 = rowinf(id0+2,1)-1
          else
              i2 = iend
          end if
          call sfun (n, x, f, g, active, kmax)
          do 15 i = i1, i2, istep
              call sfun (n, x, fx, w, active, kmax)
15        continue
      end if
c
c determine maximum error (and its index) using the subroutine GOPF
c with subroutine FCOMP1 used to make the comparison
c
      if (cubesz .gt. 1) then
          wx(1) = errmax
          wx(2) = imax
          nw = 2 * idpsze
          call gopf (wx, nw, ww, fcomp1)
          errmax  = wx(1)
          imax    = wx(2)
      endif
      if (errmax .gt. 1.d-2) iflag = 100
      if (msg0 .and. id .eq. 0) write (*,820) errmax, imax
      if (msg1) write (iu,820) errmax, imax
c
      if (.not. inuse) close (iu)
      return
800   format (' CHKDER:  Testing Derivatives', //,
     *        '   i', 9x, 'g(i)', 14x, 'gi', 13x, 'error')
810   format (' ', i3, 2x, d16.8, 2x, d16.8, 2x, d12.4)
820   format (/, ' CHKDER:  Max. error in gradient = ', d12.4, /
     *           '          observed at component ', i6)
830   format ('chkout.',   i3)
840   format ('chkout.0',  i2)
850   format ('chkout.00', i1)
      end
c
c
      subroutine fcomp1 (wx, ww)
c
c Comparison subroutine used by GOPF to find maximum error
c See subroutine CHKDER
c
      double precision wx(*), ww(*)
c
      if (ww(1) .gt. wx(1)) then
          wx(1) = ww(1)
          wx(2) = ww(2)
      end if
c
      return
      end
      DOUBLE PRECISION FUNCTION D1MACH(I)
C***BEGIN PROLOGUE  D1MACH
C***DATE WRITTEN   750101   (YYMMDD)
C***REVISION DATE  901002   (YYMMDD)
C***CATEGORY NO.  R1
C***KEYWORDS  MACHINE CONSTANTS
C***AUTHOR  FOX, P. A., (BELL LABS)
C           HALL, A. D., (BELL LABS)
C           SCHRYER, N. L., (BELL LABS)
C***PURPOSE  Returns double precision machine dependent constants
C***DESCRIPTION
C
C     D1MACH can be used to obtain machine-dependent parameters
C     for the local machine environment.  It is a function
C     subprogram with one (input) argument, and can be called
C     as follows, for example
C
C          D = D1MACH(I)
C
C     where I=1,...,5.  The (output) value of D above is
C     determined by the (input) value of I.  The results for
C     various values of I are discussed below.
C
C  Double-precision machine constants
C  D1MACH( 1) = B**(EMIN-1), the smallest positive magnitude.
C  D1MACH( 2) = B**EMAX*(1 - B**(-T)), the largest magnitude.
C  D1MACH( 3) = B**(-T), the smallest relative spacing.
C  D1MACH( 4) = B**(1-T), the largest relative spacing.
C  D1MACH( 5) = LOG10(B)
C***REFERENCES  FOX P.A., HALL A.D., SCHRYER N.L.,*FRAMEWORK FOR A
C                 PORTABLE LIBRARY*, ACM TRANSACTIONS ON MATHEMATICAL
C                 SOFTWARE, VOL. 4, NO. 2, JUNE 1978, PP. 177-188.
C***ROUTINES CALLED  NONE
C***END PROLOGUE  D1MACH
C
      INTEGER SMALL(4)
      INTEGER LARGE(4)
      INTEGER RIGHT(4)
      INTEGER DIVER(4)
      INTEGER LOG10(4)
C
      DOUBLE PRECISION DMACH(5)
C
      EQUIVALENCE (DMACH(1),SMALL(1))
      EQUIVALENCE (DMACH(2),LARGE(1))
      EQUIVALENCE (DMACH(3),RIGHT(1))
      EQUIVALENCE (DMACH(4),DIVER(1))
      EQUIVALENCE (DMACH(5),LOG10(1))
C
C
C     MACHINE CONSTANTS FOR THE CDC CYBER 170 SERIES (FTN5).
C
C     DATA SMALL(1) / O"00604000000000000000" /
C     DATA SMALL(2) / O"00000000000000000000" /
C
C     DATA LARGE(1) / O"37767777777777777777" /
C     DATA LARGE(2) / O"37167777777777777777" /
C
C     DATA RIGHT(1) / O"15604000000000000000" /
C     DATA RIGHT(2) / O"15000000000000000000" /
C
C     DATA DIVER(1) / O"15614000000000000000" /
C     DATA DIVER(2) / O"15010000000000000000" /
C
C     DATA LOG10(1) / O"17164642023241175717" /
C     DATA LOG10(2) / O"16367571421742254654" /
C
C     MACHINE CONSTANTS FOR THE CDC CYBER 200 SERIES
C
C     DATA SMALL(1) / X'9000400000000000' /
C     DATA SMALL(2) / X'8FD1000000000000' /
C
C     DATA LARGE(1) / X'6FFF7FFFFFFFFFFF' /
C     DATA LARGE(2) / X'6FD07FFFFFFFFFFF' /
C
C     DATA RIGHT(1) / X'FF74400000000000' /
C     DATA RIGHT(2) / X'FF45000000000000' /
C
C     DATA DIVER(1) / X'FF75400000000000' /
C     DATA DIVER(2) / X'FF46000000000000' /
C
C     DATA LOG10(1) / X'FFD04D104D427DE7' /
C     DATA LOG10(2) / X'FFA17DE623E2566A' /
C
C
C     MACHINE CONSTANTS FOR THE CDC 6000/7000 SERIES.
C
C     DATA SMALL(1) / 00564000000000000000B /
C     DATA SMALL(2) / 00000000000000000000B /
C
C     DATA LARGE(1) / 37757777777777777777B /
C     DATA LARGE(2) / 37157777777777777777B /
C
C     DATA RIGHT(1) / 15624000000000000000B /
C     DATA RIGHT(2) / 00000000000000000000B /
C
C     DATA DIVER(1) / 15634000000000000000B /
C     DATA DIVER(2) / 00000000000000000000B /
C
C     DATA LOG10(1) / 17164642023241175717B /
C     DATA LOG10(2) / 16367571421742254654B /
C
C     MACHINE CONSTANTS FOR THE CRAY 1
C
C     DATA SMALL(1) / 201354000000000000000B /
C     DATA SMALL(2) / 000000000000000000000B /
C
C     DATA LARGE(1) / 577767777777777777777B /
C     DATA LARGE(2) / 000007777777777777774B /
C
C     DATA RIGHT(1) / 376434000000000000000B /
C     DATA RIGHT(2) / 000000000000000000000B /
C
C     DATA DIVER(1) / 376444000000000000000B /
C     DATA DIVER(2) / 000000000000000000000B /
C
C     DATA LOG10(1) / 377774642023241175717B /
C     DATA LOG10(2) / 000007571421742254654B /
C
C
C     MACHINE CONSTANTS FOR THE IBM 360/370 SERIES,
C     THE XEROX SIGMA 5/7/9, THE SEL SYSTEMS 85/86, AND
C     THE PERKIN ELMER (INTERDATA) 7/32.
C
C     DATA SMALL(1),SMALL(2) / Z00100000, Z00000000 /
C     DATA LARGE(1),LARGE(2) / Z7FFFFFFF, ZFFFFFFFF /
C     DATA RIGHT(1),RIGHT(2) / Z33100000, Z00000000 /
C     DATA DIVER(1),DIVER(2) / Z34100000, Z00000000 /
C     DATA LOG10(1),LOG10(2) / Z41134413, Z509F79FF /
C
C     MACHINE CONSTATNS FOR THE IBM PC FAMILY (D. KAHANER NBS)
C
C     DATA DMACH/2.23D-308,1.79D+308,1.11D-16,2.22D-16,
C    *  0.301029995663981195D0/
C
C     MACHINE CONSTATNS FOR THE SEQUENT BALANCE (S. NASH)
C
C     DATA DMACH/5.562684646268003D-309,
C    *           8.988465674311579D+307,
C    *           1.110223024625157D-16,
C    *           2.2204460492503131D-16,
C    *           0.3010299956639812D+00/
C
C     MACHINE CONSTATNS FOR THE IPSC/2 HYPERCUBE (S. NASH)
C
      DATA DMACH/5.562684646268003D-309,
     *           8.988465674311579D+307,
     *           1.110223024625157D-16,
     *           2.2204460492503131D-16,
     *           0.3010299956639812D+00/
C
C     MACHINE CONSTANTS FOR THE PDP-10 (KA PROCESSOR).
C
C     DATA SMALL(1),SMALL(2) / "033400000000, "000000000000 /
C     DATA LARGE(1),LARGE(2) / "377777777777, "344777777777 /
C     DATA RIGHT(1),RIGHT(2) / "113400000000, "000000000000 /
C     DATA DIVER(1),DIVER(2) / "114400000000, "000000000000 /
C     DATA LOG10(1),LOG10(2) / "177464202324, "144117571776 /
C
C     MACHINE CONSTANTS FOR THE PDP-10 (KI PROCESSOR).
C
C     DATA SMALL(1),SMALL(2) / "000400000000, "000000000000 /
C     DATA LARGE(1),LARGE(2) / "377777777777, "377777777777 /
C     DATA RIGHT(1),RIGHT(2) / "103400000000, "000000000000 /
C     DATA DIVER(1),DIVER(2) / "104400000000, "000000000000 /
C     DATA LOG10(1),LOG10(2) / "177464202324, "476747767461 /
C
C
C     MACHINE CONSTANTS FOR THE SUN-3 (INCLUDES THOSE WITH 68881 CHIP,
C       OR WITH FPA BOARD. ALSO INCLUDES SUN-2 WITH SKY BOARD. MAY ALSO
C       WORK WITH SOFTWARE FLOATING POINT ON EITHER SYSTEM.)
C
C     DATA SMALL(1),SMALL(2) / X'00100000', X'00000000' /
C     DATA LARGE(1),LARGE(2) / X'7FEFFFFF', X'FFFFFFFF' /
C     DATA RIGHT(1),RIGHT(2) / X'3CA00000', X'00000000' /
C     DATA DIVER(1),DIVER(2) / X'3CB00000', X'00000000' /
C     DATA LOG10(1),LOG10(2) / X'3FD34413', X'509F79FF' /
C
C
C     MACHINE CONSTANTS FOR VAX 11/780
C     (EXPRESSED IN INTEGER AND HEXADECIMAL)
C    *** THE INTEGER FORMAT SHOULD BE OK FOR UNIX SYSTEMS***
C
C     DATA SMALL(1), SMALL(2) /        128,           0 /
C     DATA LARGE(1), LARGE(2) /     -32769,          -1 /
C     DATA RIGHT(1), RIGHT(2) /       9344,           0 /
C     DATA DIVER(1), DIVER(2) /       9472,           0 /
C     DATA LOG10(1), LOG10(2) /  546979738,  -805796613 /
C
C    ***THE HEX FORMAT BELOW MAY NOT BE SUITABLE FOR UNIX SYSYEMS***
C     DATA SMALL(1), SMALL(2) / Z00000080, Z00000000 /
C     DATA LARGE(1), LARGE(2) / ZFFFF7FFF, ZFFFFFFFF /
C     DATA RIGHT(1), RIGHT(2) / Z00002480, Z00000000 /
C     DATA DIVER(1), DIVER(2) / Z00002500, Z00000000 /
C     DATA LOG10(1), LOG10(2) / Z209A3F9A, ZCFF884FB /
C
C   MACHINE CONSTANTS FOR VAX 11/780 (G-FLOATING)
C     (EXPRESSED IN INTEGER AND HEXADECIMAL)
C    *** THE INTEGER FORMAT SHOULD BE OK FOR UNIX SYSTEMS***
C
C     DATA SMALL(1), SMALL(2) /         16,           0 /
C     DATA LARGE(1), LARGE(2) /     -32769,          -1 /
C     DATA RIGHT(1), RIGHT(2) /      15552,           0 /
C     DATA DIVER(1), DIVER(2) /      15568,           0 /
C     DATA LOG10(1), LOG10(2) /  1142112243, 2046775455 /
C
C    ***THE HEX FORMAT BELOW MAY NOT BE SUITABLE FOR UNIX SYSYEMS***
C     DATA SMALL(1), SMALL(2) / Z00000010, Z00000000 /
C     DATA LARGE(1), LARGE(2) / ZFFFF7FFF, ZFFFFFFFF /
C     DATA RIGHT(1), RIGHT(2) / Z00003CC0, Z00000000 /
C     DATA DIVER(1), DIVER(2) / Z00003CD0, Z00000000 /
C     DATA LOG10(1), LOG10(2) / Z44133FF3, Z79FF509F /
C
C
C***FIRST EXECUTABLE STATEMENT  D1MACH
      IF (I .LT. 1  .OR.  I .GT. 5) THEN
          WRITE (*,*) 'D1MACH -- I OUT OF BOUNDS'
          STOP
      END IF
C
      D1MACH = DMACH(I)
      RETURN
C
      END
c********************************************************************
c BTN: Sample problem 1 (simple usage of BTNEZ, CHKDER)
c main host program
c last changed: 7/30/90
c********************************************************************
c
c problem declarations
c
      program    host
      integer    pid
c
c set up hypercube
c
      pid = 0
      call setpid   (pid)
      call killcube (-1, pid)
      call load     ('node', -1, pid)
c
      stop
      end
c********************************************************************
c BTN: Sample problem 2 (simple customization of BTN, CHKDER)
c main host program
c last changed: 7/30/90
c********************************************************************
c
c problem declarations
c
      program    host
      integer    pid
c
c set up hypercube
c
      pid = 0
      call setpid   (pid)
      call killcube (-1, pid)
      call load     ('node', -1, pid)
c
      stop
      end
c********************************************************************
c BTN: Sample problem 4 (using BTN to solve a constrained problem)
c main host program
c last changed: 7/30/90
c********************************************************************
c
c problem declarations
c
      program    host
      integer    pid
c
c set up hypercube
c
      pid = 0
      call setpid   (pid)
      call killcube (-1, pid)
      call load     ('node', -1, pid)
c
      stop
      end
c********************************************************************
c BTN: Sample problem 3 (complex usage of BTN, CHKDER)
c main host program
c last changed: 10/5/90
c********************************************************************
c
c problem declarations
c
      program          host
      parameter        (nmax = 100, idpsze = 8,
     *                 nmsgi = 8, lmsgi = 4*nmsgi,
     *                 nmsgr = 2, lmsgr = idpsze*nmsgr)
      implicit         double precision (a-h,o-z)
      integer          msgi(nmsgi), pid
      double precision x(nmax), g(nmax), msgr(nmsgr)
c
c set up hypercube
c
      pid = 0
      call setpid   (pid)
      call killcube (-1, pid)
      call load     ('node', -1, pid)
c
c read data from file
c
      open (7, file = 'host3.d', status = 'old')
      read (7,800) n
      read (7,800) ichk
      read (7,800) istart
      read (7,800) iend
      read (7,800) istep
      read (7,800) msglvl
      read (7,800) iunit
      read (7,800) kblock
c
      read (7,810) rnktol
      read (7,810) accrcy
c
c Send problem specification to the cube
c
      msgi(1)  = n
      msgi(2)  = ichk
      msgi(3)  = istart
      msgi(4)  = iend
      msgi(5)  = istep
      msgi(6)  = msglvl
      msgi(7)  = iunit
      msgi(8)  = kblock
      call csend (101, msgi, lmsgi, -1, pid)
c
      msgr(1)  = rnktol
      msgr(2)  = accrcy
      call csend (102, msgr, lmsgr, -1, pid)
c
      if (ichk .ne. 0) stop
c
c receive solution
c
      call crecv (103, iflag, 4)
      call crecv (104, f, idpsze)
      call crecv (105, x, n*idpsze)
      call crecv (106, g, n*idpsze)
c
c write results to file
c
      open  (iunit, file = 'host.out', status = 'unknown')
      write (iunit,820)
      write (iunit,830) iflag
      write (iunit,840) f
      write (iunit,850)
      write (iunit,860) (x(i), i = 1,n)
      write (iunit,870)
      write (iunit,860) (g(i), i = 1,n)
      close (iunit)
c
      stop
800   format (10x, i5)
810   format (10x, d16.4)
820   format (' Results of BTN', /)
830   format (' Error code = ', i4)
840   format (' Function value = ', 1pd24.16)
850   format (/, ' x = ')
860   format (3(1pd16.8))
870   format (/, ' g = ')
      end
c********************************************************************
c BTN: Sample problem 1 (simple usage of BTNEZ, CHKDER)
c main node program
c last changed: 10/5/90
c********************************************************************
c
      program           node
      implicit          double precision (a-h,o-z)
      parameter         (n  = 100, ndmx = 16,
     *                  liw = 3*ndmx,
     *                  lw  = 14*n + ndmx*(4*ndmx + 9))
      double precision  x(n), g(n), w(lw)
      integer           iw(liw)
      logical           idzero
      external          sfun
c
c set up node information
c
      id     = mynode ()
      idzero = id .eq. 0
c
c specify initial guess of parameters
c
      call xstart (x, n)
c
c check derivative values at initial point
c
      call chkez (n, x, f, g, w, lw, iw, sfun, iflag, errmax, imax)
      if (iflag .ne. 0) stop
c
c solve optimization problem
c
      call btnez (n, x, f, g, w, lw, iw, sfun, iflag)
      if (.not. idzero) stop
c
c Print results (node 0 only)
c
      if (iflag .eq. 999) write (*,*) ' DISASTER'
      iunit = 9
      open (iunit, file = 'node.out', status = 'unknown')
      write (iunit,810)
      if (iflag .ne.   0) then
          write (    *,*) ' Error code = ', iflag
          write (iunit,*) ' Error code = ', iflag
      end if
      ig   = idamax (n, g, 1)
      write (iunit,820) f
      write (iunit,830) abs(g(ig))
      write (iunit,840)
      write (iunit,850) (x(i), i = 1,n)
      close (iunit)
c
      stop
810   format (//, ' Results of optimization problem', /)
820   format (' Final function value = ', 1pd24.16)
830   format (' Norm of gradient     = ', 1pd12.4)
840   format (' Parameters')
850   format (4(1pd16.8))
      end
c
c----------------------------------------------------------------------
c initial guess
c----------------------------------------------------------------------
c
      subroutine xstart (x, n)
      double precision x(n)
c
c specify initial values of variables
c
      do 10 i = 1,n
          x(i) = 0.d0
10    continue
c
      return
      end
c
c----------------------------------------------------------------------
c function evaluation
c----------------------------------------------------------------------
c
      subroutine sfun (n ,x, f, g, active, k)
c
c evaluate nonlinear function and gradient
c
      implicit         double precision (a-h,o-z)
      double precision x(n), f, g(n)
      logical          active
c
      if (.not. active) return
c
      f = 0.d0
      do 30 i = 1,n
          b = i
          if (i .gt. n/2) b = -b
          f = f + 5.d-1*i*x(i)*x(i) - b*x(i)
          g(i) = i*x(i) - b
30    continue
c
      return
      end
c********************************************************************
c BTN: Sample problem 2 (simple customization of BTN, CHKDER)
c main node program
c last changed: 10/5/90
c********************************************************************
c
      program          node
      implicit         double precision (a-h,o-z)
      parameter        (n  = 100, ndmx = 16,
     *                 liw = 3*ndmx,
     *                 lw  = 14*n + ndmx*(4*ndmx + 9))
      double precision x(n), g(n), w(lw)
      integer          iw(liw), pid
      logical          idzero
      external         sfun
c
c set up node information
c
      id     = mynode ()
      idzero = id .eq. 0
c
c set initial guess of parameters
c
      call xstart (x, n)
c
c check derivative values (checking every 5th component)
c
      call chkpar (n, kmax, msglvl, iunit, istart, iend, istep)
      iend   = n
      istep  = 5
      call chkder (n, x, f, g, w, lw, iw, sfun, iflag,
     *             ndmx, kmax, errmax, imax,
     *             msglvl, iunit, istart, iend, istep)
      if (iflag .ne. 0) stop
c
c Set customizing parameters for BTN
c
      call btnpar (nodemx, kmax, maxit, msglvl, pid, iprec, nlmax,
     *    initv, tolq, iunit, rnktol, maxfun, accrcy, stepmx, eta,
     *    ireset, indstp)
      accrcy  = 1.d-6
c
c solve optimization problem
c
      call btn (n, x, f, g, w, lw, iw, sfun, iflag,
     *          nodemx, kmax, maxit, msglvl, pid, iprec, nlmax,
     *          initv, tolq, iunit, rnktol, maxfun, accrcy,
     *          stepmx, eta, ireset, indstp, sfun)
c
      if (.not. idzero) stop
c
c Print results
c
      if (iflag .eq. 999) write (*,*) ' DISASTER'
      iunit = 9
      open (iunit, file = 'node.out', status = 'unknown')
      write (iunit,810)
      if (iflag .ne.   0) then
          write (    *,*) ' Error code = ', iflag
          write (iunit,*) ' Error code = ', iflag
      end if
      ig   = idamax (n, g, 1)
      write (iunit,820) f
      write (iunit,830) abs(g(ig))
      write (iunit,840)
      write (iunit,850) (x(i), i = 1,n)
      close (iunit)
c
      stop
810   format (//, ' Results of optimization problem', /)
820   format (' Final function value = ', 1pd24.16)
830   format (' Norm of gradient     = ', 1pd12.4)
840   format (' Parameters')
850   format (4(1pd16.8))
      end
c
c----------------------------------------------------------------------
c initial guess
c----------------------------------------------------------------------
c
      subroutine xstart (x, n)
      double precision x(n)
c
c specify initial values of variables
c
      do 10 i = 1,n
          x(i) = 0.d0
10    continue
c
      return
      end
c
c----------------------------------------------------------------------
c function evaluation
c----------------------------------------------------------------------
c
      subroutine sfun (n ,x, f, g, active, k)
c
c evaluate nonlinear function and gradient
c
      implicit         double precision (a-h,o-z)
      double precision x(n), f, g(n)
      logical          active
c
      if (.not. active) return
c
      f = 0.d0
      do 30 i = 1,n
          b = i
          if (i .gt. n/2) b = -b
          f = f + 5.d-1*i*x(i)*x(i) - b*x(i)
          g(i) = i*x(i) - b
30    continue
c
      return
      end
c********************************************************************
c BTN: Sample problem 3 (complex usage of BTN, CHKDER)
c main node program
c last changed: 10/5/90
c********************************************************************
c
      program          node
      implicit         double precision (a-h,o-z)
      parameter        (nn = 100, ndmx = 16,
     *                 liw = 3*ndmx,
     *                 lw = 14*nn + ndmx*(4*ndmx + 9),
     *                 nmsgi = 8, lmsgi = 4*nmsgi, idpsze = 8,
     *                 nmsgr = 2, lmsgr = idpsze*nmsgr)
      double precision x(nn), g(nn), w(lw), msgr(nmsgr)
      integer          iw(liw), msgi(nmsgi), hid, pid
      logical          idzero
      external         sfun
      common /cube/    kmax, pid
c
c set up node information
c
      id     = mynode ()
      hid    = myhost ()
      idzero = id .eq. 0
c
c Receive problem specification from host processor
c
      call crecv (101, msgi, lmsgi)
c
c set default parameters for chkder and btn
c
      n      = msgi(1)
      call chkpar (n, kmax, msglvl, iunit, istart, iend, istep)
      call btnpar (nodemx, kmax, maxit, msglvl, pid, iprec, nlmax,
     *    initv, tolq, iunit, rnktol, maxfun, accrcy, stepmx, eta,
     *    ireset, indstp)
c
c modify selected parameters
c
      ichk   = msgi(2)
      istart = msgi(3)
      iend   = msgi(4)
      istep  = msgi(5)
      msglvl = msgi(6)
      iunit  = msgi(7)
      kmax   = msgi(8)
c
      call crecv (102, msgr, lmsgr)
      rnktol = msgr(1)
      accrcy = msgr(2)
c
c check derivative values (if desired)
c
      call xstart (x, n)
      if (ichk .ne. 0) then
          call chkder (n, x, f, g, w, lw, iw, sfun, iflag,
     *                 nodemx, kmax, errmax, imax,
     *                 msglvl, iunit, istart, iend, istep)
          if (idzero) then
              open  (iunit, file = 'node.out', status = 'unknown')
              write (iunit,800) errmax, imax
              close (iunit)
          end if
          stop
      end if
c
c solve optimization problem
c
      call btn (n, x, f, g, w, lw, iw, sfun, iflag,
     *          nodemx, kmax, maxit, msglvl, pid, iprec, nlmax,
     *          initv, tolq, iunit, rnktol, maxfun, accrcy,
     *          stepmx, eta, ireset, indstp, sfun)
      if (.not. idzero) stop
c
c Send results to host processor (node 0 only)
c
      call csend (103, iflag, 4, hid, pid)
      call csend (104, f, idpsze, hid, pid)
      call csend (105, x, n*idpsze, hid, pid)
      call csend (106, g, n*idpsze, hid, pid)
c
      stop
800   format (' CHKDER: Max. error in gradient = ', 1pd12.4,
     *        ' at index ', i5)
      end
c
c----------------------------------------------------------------------
c initial guess
c----------------------------------------------------------------------
c
      subroutine xstart (x, n)
      double precision x(n)
c
c specify initial values of the variables
c
      do 10 i = 1,n
          x(i) = 0.d0
10    continue
c
      return
      end
c
c----------------------------------------------------------------------
c function evaluation
c----------------------------------------------------------------------
c This example illustrates how the function evaluation can be
c spread over several processors (with blocksize less than cubesize).
c
c This example assumes that the cube size (cubesz) is exactly divisible
c by kmax (the maximum block size), and that n (the number of variables)
c is exactly divisible by m (the number of processors working on a
c function evaluation).
c
c The block size and the cube size can be varied.
c
c This example uses message passing.  The message types used are
c 200, 201, ..., 200+(cubesize), 401, 402, ..., 400+(cubesize)
c----------------------------------------------------------------------
c
      subroutine sfun (n ,x, f, g, active, kblock)
c
c evaluate the nonlinear function and its gradient [in parallel]
c
      parameter        (idpsze = 8)
      implicit         double precision (a-h,o-z)
      double precision x(*), g(*)
      integer          cubesz, pid
      logical          active
      common /cube/    kmax, pid
c
c get cube information, and set up indexes for this processor
c    kblock = current block size
c    id     = # of this processor
c    cubesz = # of processors in hypercube
c    itotal = # of processors working together to compute f(x)
c    icount = counter for the processors working to compute f(x)
c             [icount = 1, 2, ..., itotal]
c    nt     = # of variables handled by this processor
c    i1     = subscript of first variable handled by this processor
c    i2     = subscript of  last variable handled by this processor
c    id0    = # of processor that accumulates f(x) and g(x)
c
      id     = mynode ()
      cubesz = numnodes ()
      itotal = cubesz / kmax
      icount = id/kmax + 1
      nt     = n / itotal
      i1     = nt*(icount-1) + 1
      i2     = nt*icount
      id0    = mod (id,kmax)
c
c determine if a function evaluation is necessary
c
      if (id0 .ge. kblock) return
c
c send portion of the x vector to the other nodes computing f(x)
c
      if (id+1 .le. kmax) then
          icount = 1
          do 20 id1 = id+kmax,cubesz-1,kmax
              icount = icount + 1
              j1     = nt*(icount-1) + 1
              call csend (200,  x(j1), nt*idpsze, id1, pid)
20        continue
          icount = 1
      else
          call crecv (200,  x(i1), nt*idpsze)
      end if
c
c compute portion of function value and gradient
c
      f = 0.d0
      do 30 i = i1,i2
          b = i
          if (i .gt. n/2) b = -b
          f = f + 5.d-1*i*x(i)*x(i) - b*x(i)
          g(i) = i*x(i) - b
30    continue
c
c send partial results to main processor
c
      if (id+1 .le. kmax) then
          icount = 1
          do 40 id1 = id+kmax,cubesz-1,kmax
              icount = icount + 1
              i1 = nt*(icount-1) + 1
              call crecv (201+id1,    f1,    idpsze)
              call crecv (401+id1, g(i1), nt*idpsze)
              f = f + f1
40        continue
      else
          call csend (201+id,     f,    idpsze, id0, pid)
          call csend (401+id, g(i1), nt*idpsze, id0, pid)
      end if
c
      return
      end
c********************************************************************
c BTN: Sample problem 4 (using BTN to solve a constrained problem)
c
c A logarithmic barrier method is used to minimize a nonlinear
c function subject to simple bounds on the variables ("box" constraints)
c
c main node program
c last changed: 10/10/90
c********************************************************************
c
      program          node
      implicit         double precision (a-h,o-z)
      parameter        (n = 100, ndmx = 16, liw = 3*ndmx,
     *                 lw = 14*n + ndmx*(4*ndmx + 9))
      double precision x(n), g(n), w(lw), lb(100), ub(100), mu
      integer          iw(liw), pid
      logical          idzero
      common /bnd/     lb, ub
      common /bar/     mu
      external         sfun, maxstp
c
c set up node information
c
      id     = mynode ()
      pid    = 0
      idzero = id .eq. 0
      iu = 9
      open (iu, file = 'node.out', status = 'unknown')
c
c Initialize nonlinear parameters x and barrier parameter mu
c
      call xstart (x, n)
      mu    = 1.d0
c
c set up parameters for derivative checker
c
      call chkpar (n, kmax, msglvl, iunit, istart, iend, istep)
c
c check derivative values at initial point
c
      call chkder (n, x, f, g, w, lw, iw, sfun, iflag,
     *             ndmx, kmax, errmax, imax,
     *             msglvl, iunit, istart, iend, istep)
      if (idzero) write (iu,800) errmax, imax
      if (iflag .ne. 0) stop
c
c Set customizing parameters for BTN (the user-supplied routine
c maxstp will be used to bound the step length in the line search)
c
      call btnpar (nodemx, kmax, maxit, msglvl, pid, iprec, nlmax,
     *          initv, tolq, iunit, rnktol, maxfun, accrcy, stepmx,
     *          eta, ireset, indstp)
      indstp = 1
c
c solve optimization problem
c
      scale = 10.d0
10    if (idzero) write (*,860) mu
      call btn (n, x, f, g, w, lw, iw, sfun, iflag,
     *          nodemx, kmax, maxit, msglvl, pid, iprec, nlmax,
     *          initv, tolq, iunit, rnktol, maxfun, accrcy,
     *          stepmx, eta, ireset, indstp, maxstp)
      if (iflag .eq. 999) then
          if (idzero) write (*,*) ' Fatal error in BTN'
          stop
      end if
c
c update barrier parameter
c
      mu = mu / scale
      if (mu .gt. 1.d-6) go to 10
c
c Print results (node 0 only)
c
      if (.not. idzero) stop
      write (iu,810)
      if (iflag .ne.   0) then
          write ( *,*) ' Error code = ', iflag
          write (iu,*) ' Error code = ', iflag
      end if
      ig   = idamax (n, g, 1)
      write (iu,820) f
      write (iu,830) abs(g(ig))
      write (iu,840)
      write (iu,850) (x(i), i = 1,n)
      close (iu)
      stop
800   format (' CHKDER: Max. error in gradient = ', 1pd12.4,
     *        ' at index ', i5)
810   format (//, ' Results of optimization problem', /)
820   format (' Final function value = ', 1pd24.16)
830   format (' Norm of gradient     = ', 1pd12.4)
840   format (' Parameters')
850   format (4(1pd16.8))
860   format (//,' mu =', 1pd12.4)
      end
c
c----------------------------------------------------------------------
c initial guess
c----------------------------------------------------------------------
c
      subroutine xstart (x, n)
      double precision x(n)
      double precision lb(100), ub(100), mu
      common /bnd/     lb, ub
      common /bar/     mu
c
c specify initial values of variables and their bounds
c
      do 10 i = 1,n
          x(i)  = 0.5d0
          lb(i) = 0.d0
          ub(i) = 1.d0
10    continue
c
      return
      end
c
c----------------------------------------------------------------------
c function evaluation
c----------------------------------------------------------------------
c
      subroutine sfun (n ,x, f, g, active, k)
c
c evaluate nonlinear function and gradient
c
      implicit         double precision (a-h,o-z)
      double precision x(n), f, g(n)
      double precision lb(100), ub(100), mu
      common /bnd/     lb, ub
      common /bar/     mu
      logical          active
c
      if (.not. active) return
c
      f = 0.d0
      do 30 i = 1,n
          b = 1.d0
          if (i .gt. n/2) b = -b
          d1 = x(i) - lb(i)
          d2 = ub(i) - x(i)
          if (d1 .lt. ep) write (*,800) d1
          if (d2 .lt. ep) write (*,800) d2
          f = f + b * x(i)
     *       - mu * (log (d1)) - mu * (log (d2))
          g(i) =  b - mu / d1 + mu / d2
30    continue
c
      return
800   format (' Warning: negative slack value ', d16.8, /,
     *        ' Parameter mu may be too small')
      end
c
c********************************************************************
c maxstp
c computes maximum feasible step in a given direction d
c under box constraints
c********************************************************************
c
      subroutine maxstp (n, x, d, stepmx)
c
c Computes maximum stepsize moving from the (feasible) point x
c in direction d, under box constraints
c
c Parameters:
c n       --> integer, number of variables
c x       --> double precision, size n, current vector of parameters
c d       --> double precision, size n, search direction vector
c stepmx <--  double precision, maximum step allowed
c
      implicit         double precision (a-h, o-z)
      double precision d(*), x(*), lb(100), ub(100)
      common /bnd/     lb, ub
c
c  set up
c
      alpha   = 1.d8
      t       = alpha
      ep      = 1.d-8
c
c find distance to nearest bound
c (this is done on all processors, but could be done in parallel)
c
      do 10 i = 1,n
          if (d(i) .gt. ep) then
              t = (ub(i) - x(i)) / d(i)
           else if (d(i) .lt. -ep) then
              t = (x(i) - lb(i)) /(-d(i))
           endif
           if (t .lt. alpha) alpha = t
10    continue
      stepmx = alpha*0.99999
c
      return
      end
[READ.ME]

This subdirectory contains the version of the BTN software suitable for both
scalar (ordinary) computers and for the Sequent Balance and Symmetry (shared
memory) parallel computers.  The commands that control parallelism on the
Sequent will be interpreted as comments by Fortran compilers.

THE FOLLOWING FILES ARE INCLUDED:

   read.me      - this file
   btnlib.f     - subroutines for optimization
   chklib.f     - subroutines for checking derivatives
   basics.f     - lower-level routines
   d1mach.f     - routine to evaluate machine constants
   main1.f      - sample main program 1
   main2.f      - sample main program 2
   main3.f      - sample main program 3
   main3.d      - data file for main3.f
   main4.f      - sample main program 4
   makep        - "make" file for parallel execution on the Sequent computers
   makes        - "make" file for scalar   execution on the Sequent computers
   makeu        - "make" file for scalar   execution on many Unix computers
   install.com  - "command" file to compile and link a main program for
                  VAX VMS computers

Main programs 1-4 are analogs of the sample programs in the guide to the BTN
software.

TO RUN THE SOFTWARE:

   On a Unix system:  Set the machine constants in d1mach.f [see below]
                      Copy the appropriate "make" file to "makefile"
                      Copy the desired main program to "main.f"
                      Type "make"
                      Type "main"
   On a VMS system:   Set the machine constants in d1mach.f [see below]
                      Copy the desired main program to "main.f"
                      Type "@install"
                      Type "run main"


SETTING THE MACHINE CONSTANTS:

Routine D1MACH in file d1mach.f sets five machine constants associated with
double-precision arithmetic.  Constants for many computers are already listed
in the file - if yours is there, you need only "uncomment" the DATA statements
for your computer and "comment out" the DATA statements for the Sequent
Balance.

If your computer is not listed, then you must determine the appropriate
machine constants.  BTN only used D1MACH(3), and (on binary computers) an
adequate value for D1MACH(3) can usually be determined using the following
Fortran program:
      IMPLICIT DOUBLE PRECISION (A-H,O-Z)
      EPS    = 5.D-1
      OLDEPS = 1.0D0
10    EPS1   = 1.D0 + EPS
      IF (EPS1 .EQ. 1.D0) GO TO 20
      OLDEPS = EPS
      EPS    = EPS * 5.D-1
      GO TO 10
20    WRITE (*,*) ' D1MACH(3) = ', OLDEPS
      STOP
      END

WARNINGS:

1.  If run on a "scalar" computer, BTN will be simulating the performance of a
parallel computer, with the work of multiple processors being done
sequentially by a single processor.
2.  When run in parallel, two consecutive runs of the same program can produce
different results, since different rounding errors can occur.  This does not
indicate a problem with the program, but is rather an artifact of parallel
computing.
3.  The parallel computing software was prepared using version 3.0 of the
Dynix operating system on the Sequent Balance computer (Dynix V3.0.17.9).
It was compiled using the ATS fortran compiler (version 1.8.0). The "parallel"
commands are not part of the Fortran 77 language and are not standardized.
They may not work correctly under different versions of the operating system,
or with different fortran compilers.
4.  The fortran compiler mentioned in item 3 did not always produce correct
executable code when the floating-point accelerator on the Sequent Symmetry
was used.  In particular, it did not compile correctly when the following
three compiler options were simultaneously selected (however, any two of these
options can be selected without problems):
    a) [-mp] parallel processing
    b) [-fpa] the floating point accelerator
    c) [-O1 or higher] optimized compiling at higher than level 0 (since
       default optimization is at higher than level 0, you must compile
       using (say): fortran -c main.f -mp -fpa -O0, if you want to run
       in parallel using the floating-point accelerator)
5.  The sample main programs declare work arrays for problems with at most 100
variables, and for computers with at most 10 processors.  The PARAMETER
statements should be adjusted, increasing the values of N and NDMX, if these
values are not appropriate.
6.  Usage of the CUBE and SEQUENT versions is slightly different.  A list of
differences is given below.

DIFFERENCES BETWEEN CUBE AND SEQUENT VERSIONS

1.  The CUBE version requires an integer work array: INTEGER IW(3*NDMX)
    [NDMX is the maximum number of processors available]
2.  The CUBE version requires a real work array of different length:
    CUBE:    REAL W(14*N+NDMX*(4*NDMX+9))
    SEQUENT: REAL W((3+7*NDMX)*(N+NDMX))
    [N is the number of variables in the problem]
3.  The calls to the subroutines are slightly different:
    CUBE:
        CALL CHKEZ  (N, X, F, G, W, LW, IW, SFUN, IFLAG, ERRMAX, IMAX)
        CALL CHKDER (N, X, F, G, W, LW, IW, SFUN, IFLAG, NDMX, KMAX, ...)
        CALL BTNEZ  (N, X, F, G, W, LW, IW, SFUN, IFLAG)
        CALL BTNPAR (NODEMX, KMAX, MAXIT, MSGLVL, PID, IPREC, ...)
        CALL BTN    (N, X, F, G, W, LW, IW, SFUN, IFLAG,
                     NODEMX, KMAX, MAXIT, MSGLVL, PID, IPREC, ...
    SEQUENT:
        CALL CHKEZ  (N, X, F, G, W, LW,     SFUN, IFLAG, ERRMAX, IMAX)
        CALL CHKDER (N, X, F, G, W, LW,     SFUN, IFLAG,       KMAX, ...)
        CALL BTNEZ  (N, X, F, G, W, LW,     SFUN, IFLAG)
        CALL BTNPAR (NODEMX, KMAX, MAXIT, MSGLVL,      IPREC, ...)
        CALL BTN    (N, X, F, G, W, LW,     SFUN, IFLAG,
                     NODEMX, KMAX, MAXIT, MSGLVL,      IPREC, ...
4. The calling sequence for the user-supplied subroutine is different:
   CUBE:
        SUBROUTINE SFUN (N ,X, F, G, ACTIVE, KBLOCK)
   SEQUENT
        SUBROUTINE SFUN (N ,X, F, G)
   In addition, on the Sequent computers, the SFUN routine must contain the
   line
   C$    PRIVATE
   before its first executable statement.

CONVERSION TO OTHER (SHARED MEMORY) PARALLEL COMPUTERS:

All the commands involving parallelism have the characters "C$" in columns 1
and 2.  Virtually all of them are "DOACROSS" commands, meaning that the
iterations of a Fortran DO loop are to be spread among the processors.  These
"C$" commands must be appropriately converted to corresponding commands on the
target computer.  No other changes should be required.

[END OF FILE READ.ME]
=========================================================================
all:	main

GOBJS =	btnlib.o chklib.o basics.o d1mach.o

G1 =	main.o $(GOBJS)

.SUFFIXES: .f .o

main.o: main.f
	fortran -mp -c main.f

.f.o:
	fortran -mp -c $<

main:	$(G1)
	fortran -mp $(G1) -o main
=========================================================================
all:	main

GOBJS =	btnlib.o chklib.o basics.o d1mach.o

G1 =	main.o $(GOBJS)

.SUFFIXES: .f .o

main.o: main.f
	fortran -c main.f

.f.o:
	fortran -c $<

main:	$(G1)
	fortran $(G1) -o main
=========================================================================
all:	main

GOBJS =	btnlib.o chklib.o basics.o d1mach.o

G1 =	main.o $(GOBJS)

.SUFFIXES: .f .o

main.o: main.f
	f77 -c main.f

.f.o:
	f77 -c $<

main:	$(G1)
	f77 $(G1) -o main
=========================================================================
n        =  100    % number of variables    format (10x, i5)
ichk     =    0    % 0 to minimize, 1 to check derivatives
istart   =    1    % CHKDER: starting index
iend     =   86    % CHKDER: starting index
istep    =    5    % CHKDER: starting index
msglvl   =    1    % print level
iunit    =   10    % unit number for output
kblock   =    4    % block size (default: k = [# procs]/2 on Sequent)
rnktol   = 1.d-9          % tolerance for rank test
accrcy   = 1.d-6          % convergence tolerance
c********************************************************************
c BTN: BASICS, low level routines (including Linpack BLAS)
c last changed: 11/14/90
c for Sequent Balance (parallel) and scalar computers
c******************************************************************
c
      subroutine dsvtvp (n, a, y, ly, x, lx, x1, lx1)
      double precision y(*), x(*), x1(*), a
C$    private
c
c Form x1 = x + a*y (with strides ly, lx, and lx1, as in Linpack)
c
      iy = 1
      ix = 1
      ix1 = 1
      do 10 i = 1,n
          x1(ix1) = x(ix) + a*y(iy)
          iy = iy + ly
          ix = ix + lx
          ix1 = ix1 + lx1
10    continue
c
      return
      end
c
c
      subroutine dfill (n, a, x, lx)
      double precision x(*), a
c
c Set x(i) = a (with stride lx, as in Linpack)
c
      ix = 1
      do 10 i = 1,n
          x(ix) = a
          ix = ix + lx
10    continue
c
      return
      end
c
c
      subroutine dneg (n, x, lx)
      double precision x(*)
c
c Set x(i) = -x(i) (with stride lx, as in Linpack)
c
      ix = 1
      do 10 i = 1,n
          x(ix) = -x(ix)
          ix = ix + lx
10    continue
c
      return
      end
c
c
      subroutine dvsub (n, x, lx, y, ly, z, lz)
      double precision x(*), y(*), z(*)
C$    private
c
c Form z(i) = x(i) - y(i) (with strides, as in Linpack)
c
      ix = 1
      iy = 1
      iz = 1
      do 10 i = 1,n
          z(iz) = x(ix) - y(iy)
          ix = ix + lx
          iy = iy + ly
          iz = iz + lz
10    continue
c
      return
      end
c
c
      subroutine dvdiv (n, x, lx, y, ly, z, lz)
      double precision x(*), y(*), z(*)
C$    private
c
c Form z(i) = x(i) / y(i) (with strides, as in Linpack)
c
      ix = 1
      iy = 1
      iz = 1
      do 10 i = 1,n
          z(iz) = x(ix) / y(iy)
          ix = ix + lx
          iy = iy + ly
          iz = iz + lz
10    continue
c
      return
      end
c
c
      subroutine dvmul (n, x, lx, y, ly, z, lz)
      double precision x(*), y(*), z(*)
c
c Form z(i) = x(i) * y(i) (with strides, as in Linpack)
c
      ix = 1
      iy = 1
      iz = 1
      do 10 i = 1,n
          z(iz) = x(ix) * y(iy)
          ix = ix + lx
          iy = iy + ly
          iz = iz + lz
10    continue
c
      return
      end
c
c
      subroutine pdsvtv (n, a, y, x, x1)
c     subroutine pdsvtvp (n, a, y, x, x1) [NOTE CHANGE FROM INTEL NAME]
      double precision y(*), x(*), x1(*), a
c
c Form x1 = x + a*y
c
C$DOACROSS SHARE(x1, x, y, a)
      do 10 i = 1,n
          x1(i) = x(i) + a*y(i)
10    continue
c
      return
      end
c
c
      subroutine pdfill (n, a, x)
      double precision x(*), a
c
c Set x(i) = a (with stride = 1)
c
C$DOACROSS SHARE(x, a)
      do 10 i = 1,n
          x(i) = a
10    continue
c
      return
      end
c
c
      subroutine pdneg (n, x)
      double precision x(*)
c
c Set x(i) = -x(i) (with stride = 1)
c
C$DOACROSS SHARE(x)
      do 10 i = 1,n
          x(i) = -x(i)
10    continue
c
      return
      end
c
c
      double precision function pddot (n, x, y)
c
c Parallel inner product routine
c Forms the inner product of its part of x and y in parallel
c
c PARAMETERS
c n         -> integer, size of arrays
c x         -> double precision, size n, input array 1
c y         -> double precision, size n, input array 2
c
      implicit         double precision (a-h,o-z)
      double precision x(*), y(*)
c
      t = 0.d0
C$DOACROSS SHARE(x,y), REDUCTION(t)
      do 10 i = 1,n
          t = t + x(i)*y(i)
10    continue
c
      pddot = t
c
      return
      end
c
c
      double precision function pdnrmi (n, x)
c
c Parallel infinity-norm routine
c
c PARAMETERS
c n         -> integer, size of arrays on each processor
c x         -> double precision, size n, input array
c
      implicit         double precision (a-h,o-z)
      double precision x(*), t
c
      t = 0.d0
C$DOACROSS SHARE(x), LOCAL(absxi), REDUCTION(t)
      do 10 i = 1,n
          absxi = abs(x(i))
          t = max(t,absxi)
10    continue
c
      pdnrmi = t
c
      return
      end
c
c
      subroutine pdcopy (n, x, y)
c
c Parallel vector copy routine
c Copies: y(i) = x(i)
c
c PARAMETERS
c n         -> integer, size of arrays
c x         -> double precision, size n, input array 1
c y         -> double precision, size n, input array 2
c
      implicit         double precision (a-h,o-z)
      double precision x(*), y(*)
c
C$DOACROSS SHARE(x,y)
      do 10 i = 1,n
          y(i) = x(i)
10    continue
c
      return
      end
c
c
      subroutine pdscal (n, a, x)
c
c Parallel vector scaling routine
c Scales: x(i) = a*x(i)
c
c PARAMETERS
c n         -> integer, size of arrays
c a         -> double precision, scaling factor
c x         -> double precision, size n, input/output array
c
      implicit         double precision (a-h,o-z)
      double precision x(*), a
c
C$DOACROSS SHARE(x,a)
      do 10 i = 1,n
          x(i) = a*x(i)
10    continue
c
      return
      end
c
c
      subroutine  dscal(n,da,dx,incx)
c
c     scales a vector by a constant.
c     uses unrolled loops for increment equal to one.
c     jack dongarra, linpack, 3/11/78.
c
      double precision da,dx(*)
      integer i,incx,m,mp1,n,nincx
C$    private
c
      if(n.le.0)return
      if(incx.eq.1)go to 20
c
c        code for increment not equal to 1
c
      nincx = n*incx
      do 10 i = 1,nincx,incx
        dx(i) = da*dx(i)
   10 continue
      return
c
c        code for increment equal to 1
c
c
c        clean-up loop
c
   20 m = mod(n,5)
      if( m .eq. 0 ) go to 40
      do 30 i = 1,m
        dx(i) = da*dx(i)
   30 continue
      if( n .lt. 5 ) return
   40 mp1 = m + 1
      do 50 i = mp1,n,5
        dx(i) = da*dx(i)
        dx(i + 1) = da*dx(i + 1)
        dx(i + 2) = da*dx(i + 2)
        dx(i + 3) = da*dx(i + 3)
        dx(i + 4) = da*dx(i + 4)
   50 continue
      return
      end
c
c***********************************************************************
c
      double precision function ddot(n,dx,incx,dy,incy)
c
c     forms the dot product of two vectors.
c     uses unrolled loops for increments equal to one.
c     jack dongarra, linpack, 3/11/78.
c
      double precision dx(*),dy(*),dtemp
      integer i,incx,incy,ix,iy,m,mp1,n
C$    private
c
      ddot = 0.0d0
      dtemp = 0.0d0
      if(n.le.0)return
      if(incx.eq.1.and.incy.eq.1)go to 20
c
c        code for unequal increments or equal increments
c          not equal to 1
c
      ix = 1
      iy = 1
      if(incx.lt.0)ix = (-n+1)*incx + 1
      if(incy.lt.0)iy = (-n+1)*incy + 1
      do 10 i = 1,n
        dtemp = dtemp + dx(ix)*dy(iy)
        ix = ix + incx
        iy = iy + incy
   10 continue
      ddot = dtemp
      return
c
c        code for both increments equal to 1
c
c
c        clean-up loop
c
   20 m = mod(n,5)
      if( m .eq. 0 ) go to 40
      do 30 i = 1,m
        dtemp = dtemp + dx(i)*dy(i)
   30 continue
      if( n .lt. 5 ) go to 60
   40 mp1 = m + 1
      do 50 i = mp1,n,5
        dtemp = dtemp + dx(i)*dy(i) + dx(i + 1)*dy(i + 1) +
     *   dx(i + 2)*dy(i + 2) + dx(i + 3)*dy(i + 3) + dx(i + 4)*dy(i + 4)
   50 continue
   60 ddot = dtemp
      return
      end
c
c***********************************************************************
c
      subroutine daxpy(n,da,dx,incx,dy,incy)
c
c     constant times a vector plus a vector.
c     uses unrolled loops for increments equal to one.
c     jack dongarra, linpack, 3/11/78.
c
      double precision dx(*),dy(*),da
      integer i,incx,incy,ix,iy,m,mp1,n
C$    private
c
      if(n.le.0)return
      if (da .eq. 0.0d0) return
      if(incx.eq.1.and.incy.eq.1)go to 20
c
c        code for unequal increments or equal increments
c          not equal to 1
c
      ix = 1
      iy = 1
      if(incx.lt.0)ix = (-n+1)*incx + 1
      if(incy.lt.0)iy = (-n+1)*incy + 1
      do 10 i = 1,n
        dy(iy) = dy(iy) + da*dx(ix)
        ix = ix + incx
        iy = iy + incy
   10 continue
      return
c
c        code for both increments equal to 1
c
c
c        clean-up loop
c
   20 m = mod(n,4)
      if( m .eq. 0 ) go to 40
      do 30 i = 1,m
        dy(i) = dy(i) + da*dx(i)
   30 continue
      if( n .lt. 4 ) return
   40 mp1 = m + 1
      do 50 i = mp1,n,4
        dy(i) = dy(i) + da*dx(i)
        dy(i + 1) = dy(i + 1) + da*dx(i + 1)
        dy(i + 2) = dy(i + 2) + da*dx(i + 2)
        dy(i + 3) = dy(i + 3) + da*dx(i + 3)
   50 continue
      return
      end
c
c***********************************************************************
c
      subroutine  dswap (n,dx,incx,dy,incy)
c
c     interchanges two vectors.
c     uses unrolled loops for increments equal one.
c     jack dongarra, linpack, 3/11/78.
c
      double precision dx(*),dy(*),dtemp
      integer i,incx,incy,ix,iy,m,mp1,n
C$    private
c
      if(n.le.0)return
      if(incx.eq.1.and.incy.eq.1)go to 20
c
c       code for unequal increments or equal increments not equal
c         to 1
c
      ix = 1
      iy = 1
      if(incx.lt.0)ix = (-n+1)*incx + 1
      if(incy.lt.0)iy = (-n+1)*incy + 1
      do 10 i = 1,n
        dtemp = dx(ix)
        dx(ix) = dy(iy)
        dy(iy) = dtemp
        ix = ix + incx
        iy = iy + incy
   10 continue
      return
c
c       code for both increments equal to 1
c
c
c       clean-up loop
c
   20 m = mod(n,3)
      if( m .eq. 0 ) go to 40
      do 30 i = 1,m
        dtemp = dx(i)
        dx(i) = dy(i)
        dy(i) = dtemp
   30 continue
      if( n .lt. 3 ) return
   40 mp1 = m + 1
      do 50 i = mp1,n,3
        dtemp = dx(i)
        dx(i) = dy(i)
        dy(i) = dtemp
        dtemp = dx(i + 1)
        dx(i + 1) = dy(i + 1)
        dy(i + 1) = dtemp
        dtemp = dx(i + 2)
        dx(i + 2) = dy(i + 2)
        dy(i + 2) = dtemp
   50 continue
      return
      end
c
c***********************************************************************
c
      integer function idamax(n,dx,incx)
c
c     finds the index of element having max. absolute value.
c     jack dongarra, linpack, 3/11/78.
c
      double precision dx(*),dmax
      integer i,incx,ix,n
C$    private
c
      idamax = 0
      if( n .lt. 1 ) return
      idamax = 1
      if(n.eq.1)return
      if(incx.eq.1)go to 20
c
c        code for increment not equal to 1
c
      ix = 1
      dmax = dabs(dx(1))
      ix = ix + incx
      do 10 i = 2,n
         if(dabs(dx(ix)).le.dmax) go to 5
         idamax = i
         dmax = dabs(dx(ix))
    5    ix = ix + incx
   10 continue
      return
c
c        code for increment equal to 1
c
   20 dmax = dabs(dx(1))
      do 30 i = 2,n
         if(dabs(dx(i)).le.dmax) go to 30
         idamax = i
         dmax = dabs(dx(i))
   30 continue
      return
      end
c
c***********************************************************************
c
      double precision function dasum(n,dx,incx)
c
c     takes the sum of the absolute values.
c     jack dongarra, linpack, 3/11/78.
c
      double precision dx(*),dtemp
      integer i,incx,m,mp1,n,nincx
C$    private
c
      dasum = 0.0d0
      dtemp = 0.0d0
      if(n.le.0)return
      if(incx.eq.1)go to 20
c
c        code for increment not equal to 1
c
      nincx = n*incx
      do 10 i = 1,nincx,incx
        dtemp = dtemp + dabs(dx(i))
   10 continue
      dasum = dtemp
      return
c
c        code for increment equal to 1
c
c
c        clean-up loop
c
   20 m = mod(n,6)
      if( m .eq. 0 ) go to 40
      do 30 i = 1,m
        dtemp = dtemp + dabs(dx(i))
   30 continue
      if( n .lt. 6 ) go to 60
   40 mp1 = m + 1
      do 50 i = mp1,n,6
        dtemp = dtemp + dabs(dx(i)) + dabs(dx(i + 1)) + dabs(dx(i + 2))
     *  + dabs(dx(i + 3)) + dabs(dx(i + 4)) + dabs(dx(i + 5))
   50 continue
   60 dasum = dtemp
      return
      end
c
c***********************************************************************
c
      double precision function dnrm2 ( n, dx, incx)
      integer          next
      double precision   dx(*), cutlo, cuthi, hitest, sum,
     *                   xmax, zero, one
      data   zero, one /0.0d0, 1.0d0/
C$    private
c
c     euclidean norm of the n-vector stored in dx() with storage
c     increment incx .
c     if    n .le. 0 return with result = 0.
c     if n .ge. 1 then incx must be .ge. 1
c
c           c.l.lawson, 1978 jan 08
c
c     four phase method     using two built-in constants that are
c     hopefully applicable to all machines.
c         cutlo = maximum of  dsqrt(u/eps)  over all known machines.
c         cuthi = minimum of  dsqrt(v)      over all known machines.
c     where
c         eps = smallest no. such that eps + 1. .gt. 1.
c         u   = smallest positive no.   (underflow limit)
c         v   = largest  no.            (overflow  limit)
c
c     brief outline of algorithm..
c
c     phase 1    scans zero components.
c     move to phase 2 when a component is nonzero and .le. cutlo
c     move to phase 3 when a component is .gt. cutlo
c     move to phase 4 when a component is .ge. cuthi/m
c     where m = n for x() real and m = 2*n for complex.
c
c     values for cutlo and cuthi..
c     from the environmental parameters listed in the imsl converter
c     document the limiting values are as follows..
c     cutlo, s.p.   u/eps = 2**(-102) for  honeywell.  close seconds are
c                   univac and dec at 2**(-103)
c                   thus cutlo = 2**(-51) = 4.44089e-16
c     cuthi, s.p.   v = 2**127 for univac, honeywell, and dec.
c                   thus cuthi = 2**(63.5) = 1.30438e19
c     cutlo, d.p.   u/eps = 2**(-67) for honeywell and dec.
c                   thus cutlo = 2**(-33.5) = 8.23181d-11
c     cuthi, d.p.   same as s.p.  cuthi = 1.30438d19
c     data cutlo, cuthi / 8.232d-11,  1.304d19 /
c     data cutlo, cuthi / 4.441e-16,  1.304e19 /
      data cutlo, cuthi / 8.232d-11,  1.304d19 /
c
      if(n .gt. 0) go to 10
         dnrm2  = zero
         go to 300
c
   10 assign 30 to next
      sum = zero
      nn = n * incx
c                                                 begin main loop
      i = 1
   20    go to next,(30, 50, 70, 110)
   30 if( dabs(dx(i)) .gt. cutlo) go to 85
      assign 50 to next
      xmax = zero
c
c                        phase 1.  sum is zero
c
   50 if( dx(i) .eq. zero) go to 200
      if( dabs(dx(i)) .gt. cutlo) go to 85
c
c                                prepare for phase 2.
      assign 70 to next
      go to 105
c
c                                prepare for phase 4.
c
  100 i = j
      assign 110 to next
      sum = (sum / dx(i)) / dx(i)
  105 xmax = dabs(dx(i))
      go to 115
c
c                   phase 2.  sum is small.
c                             scale to avoid destructive underflow.
c
   70 if( dabs(dx(i)) .gt. cutlo ) go to 75
c
c                     common code for phases 2 and 4.
c                     in phase 4 sum is large.  scale to avoid overflow.
c
  110 if( dabs(dx(i)) .le. xmax ) go to 115
         sum = one + sum * (xmax / dx(i))**2
         xmax = dabs(dx(i))
         go to 200
c
  115 sum = sum + (dx(i)/xmax)**2
      go to 200
c
c
c                  prepare for phase 3.
c
   75 sum = (sum * xmax) * xmax
c
c
c     for real or d.p. set hitest = cuthi/n
c     for complex      set hitest = cuthi/(2*n)
c
   85 hitest = cuthi/float( n )
c
c                   phase 3.  sum is mid-range.  no scaling.
c
      do 95 j =i,nn,incx
      if(dabs(dx(j)) .ge. hitest) go to 100
   95    sum = sum + dx(j)**2
      dnrm2 = dsqrt( sum )
      go to 300
c
  200 continue
      i = i + incx
      if ( i .le. nn ) go to 20
c
c              end of main loop.
c
c              compute square root and adjust for scaling.
c
      dnrm2 = xmax * dsqrt(sum)
  300 continue
      return
      end
c
c***********************************************************************
c
      subroutine  dcopy(n,dx,incx,dy,incy)
c
c     copies a vector, x, to a vector, y.
c     uses unrolled loops for increments equal to one.
c     jack dongarra, linpack, 3/11/78.
c
      double precision dx(*),dy(*)
      integer i,incx,incy,ix,iy,m,mp1,n
C$    private
c
      if(n.le.0)return
      if(incx.eq.1.and.incy.eq.1)go to 20
c
c        code for unequal increments or equal increments
c          not equal to 1
c
      ix = 1
      iy = 1
      if(incx.lt.0)ix = (-n+1)*incx + 1
      if(incy.lt.0)iy = (-n+1)*incy + 1
      do 10 i = 1,n
        dy(iy) = dx(ix)
        ix = ix + incx
        iy = iy + incy
   10 continue
      return
c
c        code for both increments equal to 1
c
c
c        clean-up loop
c
   20 m = mod(n,7)
      if( m .eq. 0 ) go to 40
      do 30 i = 1,m
        dy(i) = dx(i)
   30 continue
      if( n .lt. 7 ) return
   40 mp1 = m + 1
      do 50 i = mp1,n,7
        dy(i) = dx(i)
        dy(i + 1) = dx(i + 1)
        dy(i + 2) = dx(i + 2)
        dy(i + 3) = dx(i + 3)
        dy(i + 4) = dx(i + 4)
        dy(i + 5) = dx(i + 5)
        dy(i + 6) = dx(i + 6)
   50 continue
      return
      end
c********************************************************************
c BTN: parallel software for unconstrained optimization
c last changed: 10/02/91
c For: Sequent Balance (parallel) and scalar computers
c********************************************************************
c
      subroutine btnez (n, x, f, g, w, lw, sfun, iflag)
c
c parallel block truncated-Newton routine for unconstrained optimization
c
c Parameters
c n        -> integer, number of variables
c x       <-> double precision, size n, nonlinear variables (initial guess
c                 and solution)
c f       <-  double precision, nonlinear function value (output)
c g       <-  double precision, size n, gradient (output)
c w        -  double precision, size lw, work space; currently must be of
c                 length at least 3n + 3k + 4k^2 + 7(n*k);
c                 where k = # of processors being used
c lw       -> integer, length of w
c sfun     -  subroutine to evaluation nonlinear function:
c                 subroutine sfun (n, x, f, g)
c             Routine sfun must be declared EXTERNAL in the calling program.
c             The parameters n, x, f, and g are as above; x and n must
c             not be modified by routine sfun; sfun must return the
c             nonlinear function value in f and the gradient vector in g.
c             To run in parallel on the Sequent, this routine (and any
c             routine that it calls) must have the command
c             C$    PRIVATE
c             as its first executable statement.
c iflag   <-  integer, error code:
c                 0 => normal return: successful termination
c                 1 => linesearch failed to find lower point
c                 2 => search direction points uphill
c                 3 => function appears to be unbounded below
c                 4 => more than maxfun evaluations of f(x) in linesearch
c                 5 => not converged after nlmax iterations
c                 6 => n <= 0
c               900 => insufficient work space provided
c               999 => disaster (see file fort.iunit for message)
c             See the user manual for more information about the meaning
c             of these error codes.
c
      implicit         double precision (a-h,o-z)
      double precision x(*), g(*), w(*)
      external         sfun
c
c set parameters for BTN
c
      call btnpar (nodemx, kmax, maxit, msglvl, iprec, nlmax,
     *    initv, tolq, iunit, rnktol, maxfun, accrcy, stepmx, eta,
     *    ireset, indstp)
c
c solve optimization problem
c
      call btn (n, x, f, g, w, lw, sfun, iflag,
     *          nodemx, kmax, maxit, msglvl, iprec, nlmax,
     *          initv, tolq, iunit, rnktol, maxfun, accrcy,
     *          stepmx, eta, ireset, indstp, sfun)
c
      return
      end
c
c
      subroutine btnpar (nodemx, kmax, maxit, msglvl, iprec, nlmax,
     *    initv, tolq, iunit, rnktol, maxfun, accrcy, stepmx, eta,
     *    ireset, indstp)
      implicit double precision (a-h,o-z)
C$    integer  cpus_online
c
c Sets parameters for BTN
c
c Parameters:
c nodemx  <-  integer, maximum number of processors used
c             default:   nodemx = [# of processors]/2 on Sequent
c kmax    <-  integer, block size
c             default:   kmax = nodemx
c maxit   <-  integer, maximum number of inner iterations allowed.
c             default:   maxit = 20
c msglvl  <-  integer,  amount of printing desired,
c             (<-1: none
c               -1: warning messages only
c                0: one line per outer iteration)
c             default:   msglvl = 0
c iprec   <-  integer, type of preconditioner
c             (0 = none, 1 = BFGS, 2 = approx. BFGS)
c             default:      iprec = 1.
c nlmax   <-  integer, maximum number of outer iterations allowed
c             default:   nlmax = 500
c initv   <-  integer, type of initialization
c             (1 = random,2 = rhs + random, 3 = limited memory)
c             default:   initv = 3
c tolq    <-  double precision, tolerance for quadratic-based test
c             default:   tolq =  5.d-1
c iunit   <-  integer, output unit # for printing from this processor
c rnktol  <-  double precision, tolerance for rank test in inner iteration
c             default: rnktol = 1.d-9]
c maxfun  <-  integer, maximum number of function evaluations allowed in
c             the linesearch.
c             default:  maxfun = 2*nlmax
c accrcy  <-  double precision, user-chosen convergence tolerance (stop the
c             algorithm if the infinity-norm of the gradient is  <=
c             accrcy*(1.d0 + |f(x)|).
c             default:  accrcy = 0.d0
c stepmx  <-  double precision, maximum step allowed in line search
c             default:   stepmx = 1.d3
c eta     <-  double precision, accuracy parameter for line search (must be
c             strictly between 0 and 1, and it is recommended that
c             it be chosen between .1 and .9)
c             default:  eta = 0.2
c ireset  <-  integer, reset preconditioner after ireset iterations.
c             If ireset <= 0, then no resetting is done.
c             default:  ireset = 0 (no resetting)
c indstp  <-  integer, indicates if the user wishes to control the
c             maximum step size in the line search with a user provided
c             subroutine MAXSTP.  If indstp=0, BTN automatically determines
c             the maximum step.  If indstp<>0, then subroutine MAXSTP must
c             be provided.
c             default:  indstp = 0 (automatic setting of step size)
c
      nodemx  = 5
C$    nodemx  = cpus_online()/2
      kmax    = nodemx
      maxit   = 20
      msglvl  = 0
      iprec   = 1
      nlmax   = 500
      initv   = 3
      tolq    = 5.0d-1
      iunit   = 10
      rnktol  = 1.0d-9
      maxfun  = 2*nlmax
      accrcy  = 0.d0
      stepmx  = 1.d3
      eta     = 0.2d0
      ireset  = 0
      indstp  = 0
c
      return
      end
c
c
      subroutine btn (n, x, f, g, w, lw, sfun, iflag,
     *            nodemx, kmax, maxit, msglvl, iprec, nlmax,
     *            initv, tolq, iunit, rnktol, maxfun, accrcy,
     *            stepmx, eta, ireset, indstp, maxstp)
c
c parallel block truncated-Newton routine
c
c Parameters
c n        -> integer, number of variables
c x        -> double precision, size n, nonlinear variables (initial
c                  guess and solution)
c f       <-  double precision, nonlinear function value (output)
c g       <-  double precision, size n, gradient (output)
c w        -  double precision, size lw, work space; currently must be of
c                 length at least 3n + 3kmax + 4kmax^2 + 7(n*kmax);
c lw       -> integer, length of w
c sfun     -  subroutine to evaluation nonlinear function:
c                 subroutine sfun (n, x, f, g)
c             Routine sfun must be declared EXTERNAL in the calling program.
c             The parameters n, x, f, and g are as above; x and n must
c             not be modified by routine sfun; sfun must return the
c             nonlinear function value in f and the gradient vector in g.
c             To run in parallel on the Sequent, this routine (and any
c             routine that it calls) must have the command
c             C$    PRIVATE
c             as its first executable statement.
c iflag   <-  integer, error code:
c                 0 => ok
c                 1 => linesearch failed to find lower point
c                 2 => search direction points uphill
c                 3 => function appears to be unbounded below
c                 4 => more than maxfun evaluations of f(x)
c                 5 => not converged after nlmax iterations
c                 6 => n <= 0
c               900 => insufficient work space provided
c               999 => disaster (see file fort.iunit for message)
c            See the user manual for more information about the meaning
c            of these error codes.
c nodemx   -> integer, number of processors used
c kmax     -> integer, block size [usually equal to nodemx]
c maxit    -> integer, maximum number of inner iterations allowed
c                 [typical value: 20]
c msglvl   -> integer, amount of printing desired (<0 = none;
c                 0 = 1 line per outer iteration)
c iprec    -> integer, type of preconditioner (0 = none, 1 = BFGS,
c                 2 = approx. BFGS) [it is strongly suggested that
c                 iprec = 1 be used]
c nlmax    -> integer, maximum number of outer iterations allowed
c                 [typical value: 500]
c initv    -> integer, type of initialization (1 = random,
c                 (2 = rhs + random, 3 = limited memory) [it is
c                 suggested that initv = 3 be used]
c tolq     -> double precision, tolerance for quadratic-based test
c                 [typical value: 5.d-1]
c iunit    -> integer, output unit # for printing from this processor
c rnktol   -> double precision, tolerance for rank test in inner iteration
c                 [typical value: 1.d-9]
c maxfun   -> integer, maximum number of function evaluations allowed in
c                 the linesearch [typical value: 2*nlmax]
c accrcy   -> double precision, user-chosen convergence tolerance (stop
c                 the algorithm if the infinity-norm of the gradient is
c                 <= accrcy*(1.d0 + |f(x)|).  Under normal circumstances,
c                 the user should set accrcy=0.0 and let the stringent
c                 convergence test built-in to BTN be used.  If the function
c                 f(x) or its gradient cannot be obtained accurately, then
c                 the user may wish to override this test by setting
c                 accrcy = 1.d-6 (say), or perhaps some slightly larger value.
c                 [typical value: 0.d0]
c stepmx   -> double precision, maximum step allowed in line search
c                 [typical value: 1.d3]
c eta      -> double precision, accuracy parameter for line search (must be
c                 strictly between 0 and 1, and it is recommended that
c                 it be chosen between .1 and .9) [typical value: 0.2]
c ireset   -> integer, reset preconditioner after ireset iterations; if
c                 no resetting is desired, set ireset=0; normally it
c                 will not be necessary to reset the preconditioner based
c                 on the iteration count, but it can occasionally improve
c                 performance on certain specially structured problems; see
c                 the user manual for further guidance.
c                 [recommended value: 0]
c indstp   -> integer, indicates if the user wishes to select the maximum
c                 step in the line search manually at every iteration.
c                 See sample main program 4 for an example.
c                 If indstp=0, then BTN sets the maximum step automatically.
c                 If indstp<>0, then the user-supplied routine MAXSTP is
c                 called (see below).
c                 [typical value: 0]
c maxstp   -  (optional) subroutine to set the maximum step for the line
c             search.  It must have the following calling sequence:
c                 subroutine maxstp (n, x, d, stepmx)
c             where n and x are as for subroutine sfun, d is the current
c             search direction (the new vector of parameters will be of
c             the form x+a*d for some scalar a), and stepmx is the largest
c             allowable step (computed by the user within subroutine
c             maxstp).  The values of n, x, and d must not be modified by
c             the user, and stepmx must be defined by the user.  Subroutine
c             maxstp must be declared EXTERNAL in the calling program.
c             NOTE: most applications will not require this feature, and
c             should set indstp=0 when calling BTN (see above); when
c             indstp=0, BTN will not attempt to call maxstp, and no subroutine
c             need be provided.  In this case, the call to BTN can have the
c             form:  call btn (..., indstp, sfun), where sfun is the name
c             of the function evaluation routine described above.  See the
c             sample main programs for further examples.
c
      implicit         double precision (a-h,o-z)
      double precision x(*), g(*), w(*)
      integer          flgbcg, cubesz
C$    integer*4        m_get_numprocs
      logical          msg
      external         sfun
c
c check for inappropriate parameter settings
c
      if (maxit .lt. 0) then
          if (msglvl .ge. -1) then
              write (*,*) ' BTN - Warning: maxit < 0'
              write (*,*) '       maxit = ', maxit
              write (*,*) '       Resetting maxit = 20'
          end if
          maxit = 20
      end if
      if (iprec .lt. 0 .or. iprec .gt. 2) then
          if (msglvl .ge. -1) then
              write (*,*) ' BTN - Warning: iprec out of range'
              write (*,*) '       iprec = ', iprec
              write (*,*) '       Resetting iprec = 1'
          end if
          iprec = 1
      end if
      if (initv .lt. 1 .or. initv .gt. 3) then
          if (msglvl .ge. -1) then
              write (*,*) ' BTN - Warning: initv out of range'
              write (*,*) '       initv = ', initv
              write (*,*) '       Resetting initv = 3'
          end if
          initv = 3
      end if
      if (tolq .le. 0.d0 .or. tolq .ge. 1.d0) then
          if (msglvl .ge. -1) then
              write (*,*) ' BTN - Warning: tolq out of range'
              write (*,*) '       tolq = ', tolq
              write (*,*) '       Resetting tolq = 0.5'
          end if
          tolq = 5.d-1
      end if
      if (rnktol .lt. 0.d0) then
          if (msglvl .ge. -1) then
              write (*,*) ' BTN - Warning: rnktol out of range'
              write (*,*) '       rnktol = ', rnktol
              write (*,*) '       Resetting rnktol = 1.d-9'
          end if
          rnktol = 1.d-9
      end if
      if (eta .le. 0.d0 .or. eta .ge. 1.d0) then
          if (msglvl .ge. -1) then
              write (*,*) ' BTN - Warning: eta out of range'
              write (*,*) '       eta = ', eta
              write (*,*) '       Resetting eta = 0.2'
          end if
          eta = 2.d-1
      end if
c
c set up
c
      if (n .le. 0) then
          iflag = 6
          go to 500
      end if
      cubesz = kmax
C$    cubesz = m_get_numprocs()
c
c compare n with blocksize
c
      if (n .lt. kmax) then
          if (msglvl .ge. -1) then
              write (*,*) ' BTN - Warning: blocksize too small'
              write (*,*) '       kmax = ', kmax
              write (*,*) '       n    = ', n
              write (*,*) '       Setting kmax = n'
          end if
          kmax = n
      end if
      maxit1   = maxit
      kxk      = kmax * kmax
      k        = kmax
      nk       = n * k
      nf       = 1
c
c subdivide the work array
c
      lp      = 1
      lpre    = lp     + n
      ivinit  = lpre   + n
      iv0     = ivinit + nk
      iv1     = iv0    + nk
      ir      = iv1    + nk
      iar     = ir     + nk
      ip      = iar    + nk
      ialpha  = ip     + nk
      ibeta   = ialpha + kxk
      il1     = ibeta  + kxk
      il2     = il1    + kxk
      id1     = il2    + kxk
      id2     = id1    + kmax
      is      = id2    + kmax
      iwk     = is     + kmax
      lprenw  = iwk    + nk
      lwtest  = lprenw + n - 1
      if (lw .lt. lwtest) then
          call disast ('BTN', 'INSUFFICIENT STORAGE', iunit, iflag)
          write (iunit,*) ' # of locations required = ', lwtest
          iflag = 900
          go to 500
      end if
c
      msg     = msglvl .ge.  0
      ncg     = 0
      iter    = 0
      iflag   = 0
      flgbcg  = 0
      intflg  = 0
      itsave  = 0
c
c initial printouts
c
      call sfun (n, x, f, g)
      fsave = abs(f)
      gnrm = pdnrmi (n, g)
      if (msg) then
          write (*,800)
          write (*,810) iter, nf, ncg, f, gnrm
      end if
c
c set tolerances for convergence tests
c
      call settol (tol0,tol1,tol2,tol3,tol4)
c
c convergence test at initial point
c
      ftest = 1.d0 + dabs(f)
      if (gnrm .lt. tol0*ftest) go to 500
c
c initialize preconditioner to the identity
c
      if (iprec .gt. 0) call pdfill (n, 1.d0, w(lprenw))
c
c  main loop
c
      do 10 iter = 1,nlmax
c
c  set the preconditioner
c
          itsave = itsave + 1
          if (iprec .eq. 0) then
              call pdfill (n, 1.d0, w(lpre))
          else
              call pdcopy (n, w(lprenw), w(lpre))
          end if
c
c  get search direction
c
          call precg (w(iv1), n, n, g, kmax, iter,
     *        initv, flgbcg)
          call bcg (n, w(lp), g, gnrm, k, kmax, flgbcg,
     *        maxit1, x, nbl, w(lpre), w(lprenw),
     *        tolq, n, kmax, w(iv0), w(iv1), w(ir),
     *        w(iar), w(ip), w(ialpha), w(ibeta), w(il1),
     *        w(il2), w(id1), w(id2), w(is), w(iwk),
     *        nodemx, iprec, sfun, iunit, rnktol)
          k = kmax
          if (iflag .eq. 999) go to 500
          call pdneg (n, w(lp))
          dg = pddot (n, w(lp), g)
          ncg = ncg + nbl
          if (initv .eq. 3 .and.
     *        flgbcg .gt. 0 .and. intflg .eq. flgbcg)
     *        initv = 2
          call postcg (w(iv1), n, n, k, iter, initv,
     *        w(ivinit), n, w(lp), g, gnrm, flgbcg, cubesz)
          if (flgbcg .ge. 0) intflg = flgbcg
          if (indstp .ne. 0) call maxstp (n, x, w(lp), stepmx)
c
c parallel line search
c
          oldf = f
          call psrch (iflag, n, x, f, g, w(lp), w(ip), w(ir),
     *               w(iar), sfun, nf, eta, stepmx, kmax,
     *               cubesz, dg, w(iv0), w(iwk), alpha)
          if (iflag .lt. 0) then
              maxit1 = 1 + 3/kmax
          else
              maxit1 = maxit
          end if
          if (iflag .lt. 0) iflag = 0
          gnrm = pdnrmi (n, g)
          if (msg) write (*,810) iter, nf, ncg, f, gnrm
          if (iflag .ne. 0) go to 500
c
c test for convergence
c
          diff  = abs(oldf - f)
          ftest = 1.d0 + dabs(f)
          xnrm = pdnrmi (n, x)
          xtest = 1.d0 + xnrm
          pnrm = pdnrmi (n, w(lp))
c
          if ((alpha*pnrm  .lt.   tol1*xtest
     *         .and. diff  .lt.   tol2*ftest
     *         .and. gnrm  .lt.   tol3*ftest)
     *         .or.  gnrm  .lt.   tol4*ftest
     *         .or.  gnrm  .lt. accrcy*ftest) go to 500
          if (nf .gt. maxfun) then
              iflag = 4
              go to 500
          end if
          ftest = abs(f)
          if (ftest .le. fsave*1.d-5 .or. itsave .eq. ireset) then
              if (iprec .gt. 0) call pdfill (n, 1.d0, w(lprenw))
              fsave = ftest
              itsave = 0
          end if
10    continue
      iflag = 5
c
500   if (msg) write (*,820) iflag
      return
800   format (/, 3x, 'it', 3x, 'nf', 2x, 'ncg', 11x, 'f', 14x, '|g|')
810   format (' ', i4, 1x, i4, 1x, i4, 2x, 1pd16.8, 2x, 1pd12.4)
820   format (' BTN terminating with error code = ', i4)
      end
c
c
      subroutine settol (tol0,tol1,tol2,tol3,tol4)
      implicit double precision (a-h,o-z)
c
c set tolerances for convergence tests
c
      eps    = d1mach(3)
      if (1.d0 + 2.d0*eps .eq. 1.d0) then
          write (*,*) ' ### BTN: ERROR ###'
          write (*,*) '     Machine epsilon too small'
          write (*,*) '     Value = ', eps
          write (*,*) '     Modify routine D1MACH'
          write (*,*) '     Terminating execution'
          stop
      end if
      rteps  = sqrt(eps)
      rtol   = 1.d1*rteps
      rtolsq = rtol*rtol
      peps   = eps**0.6666d0
c
      tol0 = 1.d-2  * rteps
      tol1 = rtol   + rteps
      tol2 = rtolsq + eps
      tol3 = sqrt(peps)
      tol4 = 1.d-2*rtol
c
      return
      end
c
c
      subroutine bcg (n, x, b, bnrm, k, kmax, iflag, maxit,
     *                xnl, ncg, PC, PCnew,
     *                tolq, nn, kk, V0, V1, R, AR, U,
     *                alpha, beta, L1, L2, D1, D2, s,
     *                wk, nodemx, iprec, sfun, iunit, rnktol)
c
c Parallel block conjugate-gradient iteration (to compute a
c search direction for a truncated-Newton optimization algorithm)
c
c PARAMETERS
c n          -> integer, dimension of problem
c x         <-  double precision, size n, search direction
c b          -> double precision, size n, right-hand side vector
c bnrm       -> double precision, infinity-norm of right-hand side
c k         <-> integer, current block size
c kmax       -> integer, maximum block size
c iflag     <-  integer, flag:
c                       0 => okay
c                      -1 => steepest-descent direction
c                       i => linear depend. in col. i at iteration 1
c                     999 => disaster (message to unit iunit)
c maxit      -> integer, maximum number of inner iterations allowed
c                     (if maxit<0, then nonlinearity test is also used)
c xnl        -> double precision, size n, vector of nonlinear parameters
c ncg       <-  integer, counter for number of inner iterations
c PC        <-> double precision, size n, current and new preconditioner
c PCnew      -  double precision, size n, work array (new PC)
c tolq       -> double precision, tolerance for quadratic test
c nn         -> integer, leading dimension of n*k matrices
c kk         -> integer, leading dimension of k*k matrices
c V0         -  double precision, size n*k, work array (old Lanczos
c               vectors)
c V1         -> double precision, size n*k, work array (current Lanczos
c               vectors)
c R          -  double precision, size n*k, work array (preconditioned V1)
c AR         -  double precision, size n*k, work array (Hessian times R)
c U          -  double precision, size n*k, work array (inner search
c               directions)
c alpha      -  double precision, size k*k, work array (diagonal block
c               of V'AV)
c beta       -  double precision, size k*k, work array (off-diag block
c               of V'AV)
c L1         -  double precision, size k*k, work array (Cholesky factor
c               of V'AV)
c L2         -  double precision, size k*k, work array (Cholesky factor
c               of V'AV)
c D1         -  double precision, size   k, work array (Cholesky factor
c               of V'AV)
c D2         -  double precision, size   k, work array (Cholesky factor
c               of V'AV)
c s          -  double precision, size   k, work array (inner step sizes)
c wk         -  double precision, size n*k, work array
c nodemx     -> integer, maximum number of processors available
c iprec      -> integer, choice of preconditioner:
c                       0 => no preconditioner
c                       1 => LMQN diagonal PC
c                       2 => approximate LMQN diagonal PC
c sfun       -> external, subroutine sfun (n,x,f,g), evaluate f(x)
c               To run in parallel on the Sequent, this routine (and any
c               routine that it calls) must have the command
c               C$    PRIVATE
c               as its first executable statement.
c iunit      -> integer, output unit # for printing
c rnktol     -> double precision, tolerance for rank test in inner iteration
c
      implicit         double precision (a-h,o-z)
      double precision b(n), xnl(n), x(n), PC(n), PCnew(n),
     *                 V0(nn,kk), V1(nn,kk), R(nn,kk), AR(nn,kk),
     *                 U(nn,kk), alpha(kk,kk), beta(kk,kk),
     *                 L1(kk,kk), L2(kk,kk), D1(kk), D2(kk), s(kk),
     *                 wk(nn,kk)
      integer          kmax, cubesz
      logical          indef
      external         sfun
c
c set up
c
      cubesz = kmax
      kold   = k
      iflag  = 0
      info   = 0
      iend   = 0
      indef  = .false.
      xAx    = 0.d0
      nblock = n/k
      if (nblock*k .lt. n) nblock = nblock + 1
c
c**********************************************************************
c Initialization
c**********************************************************************
c
      call pdfill (n, 0.d0, x)
      call dfill (k, 0.d0, s, 1)
      if (bnrm .eq. 0.d0) then
          call disast ('BCG', 'bnrm = 0', iunit, iflag)
          go to 500
      end if
      call pdcopy (n, b, V1)
      call pdscal (n, 1.d0/bnrm, V1)
c
c**********************************************************************
c Main loop
c**********************************************************************
c
      maxnbl = nblock
      if (maxit  .eq.     0) maxit  = 1
      if (maxnbl .gt. maxit) maxnbl = maxit
c
      do 100 nbl = 1, maxnbl
          ncg  = nbl
          kold = k
c
c**********************************************************************
c Get QR of V1
c**********************************************************************
c
          call msolve (V1, nn, n, k, R, nn, PC)
          call pgetch (V1, nn, R, nn, beta, kk, n, kold,
     *            k, iflag, rnktol, iunit)
          if (iflag .eq. 999) return
          if (nbl .eq. 1) then
              if (k .eq. 0) then
                  call disast ('BCG', 'K=0 AT ITER=1', iunit, iflag)
                  iflag = -1
                  call pdcopy  (n, b, x)
                  return
              else
                  if (k. lt. kold) iflag = k+1
                  kold = k
              endif
          else
              if (k .eq. 0) go to 500
          endif
c
c Compute V1 x (Beta')^(-1)
c
          call vltnv1 (V1, nn, n, k, beta, kk, iunit, iflag)
          if (iflag .eq. 999) return
          call msolve (V1, nn, n, k, R, nn, PC)
          if (nbl .eq. 1) then
              s(1) = pddot (n, b, R(1,1))
              s(1) = s(1) / bnrm
          end if
c
c**********************************************************************
c Compute AR, alpha
c**********************************************************************
c
          call getar (R, nn, k, AR, nn, cubesz, wk, nn, n, xnl,
     *                b, sfun)
          call AtrBA (R, nn, k, AR, nn, alpha, kk, n)
c
c update diagonal preconditioner (if desired)
c
          if (iprec .eq. 1) then
              call frmpc (PCnew, alpha, R, AR, nn, n, kk, k, wk)
          endif
          if (iprec .eq. 2) then
              call frmpc1 (PCnew, alpha, R, AR, nn, n, kk, k, wk)
          endif
c
c**********************************************************************
c Cholesky for L1
c set D1 = D2 (kold * kold), then solve for L1 (kold * k)
c L2 * D1 * L1 = beta, where L2, D1 are of dimension kold
c**********************************************************************
c
          if (nbl .gt. 1) then
              call dcopy (kold, D2, 1, D1, 1)
              do 29 i = 1,k
                  if (L2(i,i) .eq. 0.d0) then
                      call disast ('LSOL', 'l(i,i) = 0',
     *                             iunit, iflag)
                      return
                  end if
29            continue
C$DOACROSS SHARE(L2,L1,beta,D1,kk,kold,iunit,iflag)
              do 30 i = 1,k
                  call lsol (L2(i,i), kk, kold-i+1, beta(i,i),
     *                  L1(i,i), iunit, iflag)
                  call dvdiv (kold-i+1,L1(i,i),1,D1(i),1,L1(i,i),1)
30            continue
          end if
c
c**********************************************************************
c Cholesky for alpha
c Cholesky factorization of block tridiagonal matrix.
c In the following L2 will be the factor of the diagonal block.
c All processors compute the Cholesky simultaneously.
c**********************************************************************
c
          call matcpy (alpha, kk, L2, kk, k, k)
          if (nbl .gt. 1) then
C$DOACROSS SHARE(L2,L1,D1,kold), LOCAL(j,l)
              do 50 i = 1,k
              do 50 j = 1,i
                  do 40 l = i,kold
                      L2(i,j) = L2(i,j) - L1(l,i) * D1(l) * L1(l,j)
40                continue
                  if (j .lt. i) L2(j,i) = L2 (i,j)
50            continue
          end if
          call dpofa2 (L2, kk, k, info, iunit, iflag, rnktol)
          if (iflag .eq. 999) return
          if (info .ne. 0) then
              indef = .true.
              k = abs(info) - 1
              if (nbl .eq. 1 .and. k .eq. 0) then
                  call pdcopy (n, b, x)
                  iflag = -1
                  return
              endif
              if (nbl .gt. 1 .and. k .eq. 0) go to 500
          endif
c
c get L2 and D2
c
          call getld2 (L2, kk, k, D2, iunit, iflag)
          if (iflag .eq. 999) return
c
c**********************************************************************
c Search direction U
c**********************************************************************
c compute  V1 - U L1
c     V1 (n * k), U (n * kold), L1 (kold * k) lower triangular
c     V1 - U L1 will be stored in R
c     U*L1 is stored in U (=U)
c
          if (nbl .gt. 1) then
              call timsl1 (U, nn, n, kold, L1, kk, k)
              call aminsb (R, nn, U, nn, R, nn, n, k)
          end if
c
c solve for U:
c    U * L2' = V1
c    V1 is n * k
c    L2 is k * k lower triangular
c
          call matcpy (R, nn, U, nn, n, k)
          call vltnv2 (U, nn, n, k, L2, kk, iunit, iflag)
          if (iflag .eq. 999) return
c
c**********************************************************************
c Step size
c**********************************************************************
c
          if (nbl .eq. 1) then
              call lsol  (L2, kk, k, s, s, iunit, iflag)
              if (iflag .eq. 999) return
              call dvdiv (k, s, 1, D2, 1, s, 1)
          else
              call dvmul  (kold, D1, 1, s, 1, s, 1)
              call premlt (s, nn, kold, 1, L1, kk, k, wk, n)
              call lsol   (L2, kk, k, wk, s, iunit, iflag)
              if (iflag .eq. 999) return
              call dvdiv  (k, s, 1, D2, 1, s, 1)
              call dscal  (k, -1.d0, s, 1)
          end if
c
c**********************************************************************
c Update x
c**********************************************************************
c
          call maxpy (x, n, n, 1, 1.d0, U, nn, k, s, kk)
c
c**********************************************************************
c compute termination criterion using quadratic test
c**********************************************************************
c
          if (indef) go to 500
          if (nbl .eq. maxnbl) go to 500
          if (nbl .gt. 1 .and. k .eq. 0) go to 500
c
c quadratic test
c
          call dvmul  (k, D2, 1, s, 1, wk, 1)
          xAx  = xAx + ddot(k, s, 1, wk, 1)
          bx = pddot (n, b, x)
          qnew = xAx * 0.5d0 - bx
          if (nbl .gt. 1) then
              diff = qnew - qold
              if (qnew .eq. 0.d0) then
                  call disast ('BCG','Q(P) = 0',iunit, iflag)
                  go to 500
              end if
              if ((nbl * diff / qnew) .lt. tolq) iend = 1
          end if
          qold = qnew
c
          if (iend .eq. 1) go to 500
c
c**********************************************************************
c Update for next iteration:  form new V
c**********************************************************************
c AR:= AR - V0 * beta; V0 (n * kold), beta (kold * k)
c
          if (nbl .gt. 1) then
              call timsl1 (V0, nn, n, kold, beta, kk, k)
              call aminsb (AR, nn, V0, nn, AR, nn, n, k)
          end if
c
c V1:= AR - V1 * alpha; update V0
c
          call matcpy (V1, nn,  V0, nn, n, k)
          call atimsb (V0, nn, n, k, alpha, kk, k, V1, nn)
          call aminsb (AR, nn, V1, nn, V1, nn, n, k)
c
100   continue
c
500   call pdscal (n, bnrm, x)
      return
      end
c
c
      subroutine pgetch (V, ldv, Av, ldav, Beta, ldb, n, k,
     *             kr, iflag, rnktol, iunit)
c
c Cholesky factorization of V'A V, where
c
c PARAMETERS
c V         -> double precision, size n*k, input matrix
c ldv       -> integer, leading dimension of V
c Av        -> double precision, size n*k, input matrix A*V
c ldav      -> integer, leading dimension of AV
c Beta     <-  double precision, size k*k, Cholesky factor
c ldb       -> integer, leading dimension of Beta
c n         -> integer, size of arrays
c k         -> integer, block size (current)
c kr       <-  integer, rank of Beta (new block size)
c iflag    <-  integer, flag (=999 in case of disaster)
c rnktol    -> double precision, tolerance for rank test in inner
c              iteration
c iunit     -> integer, unit # for disaster messages
c
      implicit    double precision (a-h,o-z)
      dimension   V(ldv,*), Av(ldav,*), Beta(ldb,*)
c
c form V'AV
c
      call AtrBA (V, ldv, k, Av, ldav, Beta, ldb, n)
c
c find Cholesky factorization of V'AV (simultaneously on all processors)
c and determine new block size (i.e., rank of Beta)
c
      call dpofa2 (Beta, ldb, k, info, iunit, iflag, rnktol)
      if (iflag .eq. 999) return
      if (info .ne. 0) then
          kr = abs(info) - 1
      else if (info .eq. 0) then
          kr = k
      endif
c
      return
      end
c
c
      subroutine AtrBA (A, lda, k, BA, ldb, C, ldc, n)
c
c computes C = A'* BA, where BA is B*A with B symmetric.
c A (n x k), BA (n * k) and C (k * k)
c The result is explicitly symmetrized, and both halves are computed.
c
c PARAMETERS
c A         -> double precision, size n*k, input matrix
c lda       -> integer, leading dimension of A
c k         -> integer, block size
c BA        -> double precision, size n*k, input matrix B*A
c ldb       -> integer, leading dimension of BA
c C        <-  double precision, size k*k, result matrix
c ldc       -> integer, leading dimension of C
c n         -> integer, size of arrays
c
      implicit         double precision (a-h,o-z)
      double precision A(lda,*), BA(ldb,*), C(ldc,*)
c
c form piece of A'BA
c
C$DOACROSS SHARE(C,A,BA,k,n), LOCAL(j)
      do 10 i = 1, k
          do 5 j = 1, k
              C(i,j) = ddot(n, A(1,i), 1, BA(1,j), 1)
5         continue
10    continue
c
c symmetrize result
c
C$DOACROSS SHARE(C), LOCAL(j)
      do 20 i = 1,k
          do 15 j = 1,i-1
              C(j,i) = (C(i,j)+C(j,i))*5.d-1
              C(i,j) = C(j,i)
15        continue
20    continue
c
      return
      end
c
c
      subroutine getar (R, ldr, k, AR, ldar, kmax, wk, ldwk,
     *                  n, xnl, g, sfun)
c
c getar: forms A*R, where R is a block matrix, by finite differencing
c
c PARAMETERS
c
c R             -> double precision, size n*k, original block matrix
c ldr           -> integer, leading dimension of R
c k             -> integer, block size
c AR           <-  double precision, size n*k, result: A*R (A = Hessian)
c ldar          -> integer, leading dimension of AR
c kmax          -> integer, # of active processors
c wk            -  double precision, size n*k, work array
c ldwk          -> integer, leading dimension of wk
c n             -> integer, # of variables
c xnl           -> double precision, current estimate of nonlinear
c                  variables
c g             -> double precision, current gradient
c sfun          -  subroutine to evaluation nonlinear function
c                  To run in parallel on the Sequent, this routine (and
c                  any routine that it calls) must have the command
c                  C$    PRIVATE
c                  as its first executable statement.
c
      implicit         double precision (a-h,o-z)
      double precision R(ldr,*), AR(ldar,*), wk(ldwk,*), g(*), xnl(*)
      integer          kmax
      external         sfun
c
C$DOACROSS SHARE(R,AR,xnl,g,wk,n,k,sfun)
      do 20 j = 1,kmax
          call atimes (R(1,j), AR(1,j), n, xnl, g, wk(1,j),
     *                 sfun, k)
20    continue
c
      return
      end
c
c
      subroutine atimes (v, av, n, x, g, wk, sfun, k)
      implicit         double precision  (a-h,o-z)
      double precision v(*), av(*), x(*), g(*), wk(*)
C$    private
c
c compute matrix/vector product via finite-differencing
c
c PARAMETERS
c v       -> double precision, size n, input for matrix/vector product
c av      <- double precision, size n, result of matrix/vector product
c n       -> integer, dimension of problem
c x       -> double precision, size n, current nonlinear parameter vector
c g       -> double precision, size n, current nonlinear gradient
c wk      -  double precision, size n, work array
c sfun    -> subroutine sfun (n,x,f,g) to evaluate nonlinear function
c            To run in parallel on the Sequent, this routine (and any
c            routine that it calls) must have the command
c            C$    PRIVATE
c            as its first executable statement.
c k       -> integer, the current block size
c
c REQUIRES
c d1mach -  function to specify machine epsilon [d1mach(3)]
c BLAS   -  basic linear algebra subroutines
c
      eps = d1mach(3)
      h   = 1.d1*sqrt(eps)
      rh  = 1.d0 / h
      call dcopy (n, x, 1, wk, 1)
      call daxpy (n, h, v, 1, wk, 1)
      call sfun  (n, wk, fw, av)
      call daxpy (n, -1.d0, g, 1, av, 1)
      call dscal (n, rh, av, 1)
c
      return
      end
c
c
      subroutine frmpc  (D, alpha, R, AR, nn, n, kk, k, wk)
c
c Update the diagonal preconditioner, based on BFGS formula
c
c PARAMETERS
c D        <-> double precision, size n, diagonal preconditioner
c alpha     -> double precision, size k*k, matrix from routine BCG
c R         -> double precision, size n*k, matrix from routine BCG
c AR        -> double precision, size n*k, matrix from routine BCG
c nn        -> integer, leading dimension of R and AR
c n         -> integer, size of arrays
c kk        -> integer, leading dimension of alpha
c k         -> integer, current block size
c wk        -  double precision, size 1, work variable
c
      implicit         double precision (a-h,o-z)
      double precision D(*), R(nn,*), AR(nn,*), alpha(kk,*),
     *                 wk(*)
c
c update D for each vector in the block
c
      do 30 ind = 1,k
c
c form v'Dv
c
          vdv = 0.d0
C$DOACROSS SHARE(R,D,ind), REDUCTION(vdv)
          do 10 i = 1,n
              vdv = vdv + R(i,ind)*D(i)*R(i,ind)
10        continue
c
c update D
c
          vgv = alpha(ind,ind)
          if (abs(vdv) .le. 1.d-12) go to 30
          if (abs(vgv) .le. 1.d-12) go to 30
C$DOACROSS SHARE(R,AR,D,vdv,vgv,ind), LOCAL(t1,t2)
          do 20 i = 1,n
              t1 = D(i)*R(i,ind)
              t2 = AR(i,ind)
              D(i) = D(i) - t1*t1/vdv + t2*t2/vgv
              if (D(i) .le. 1.d-6) D(i) = 1.d0
20        continue
c
30    continue
c
      return
      end
c
c
      subroutine frmpc1 (D, alpha, R, AR, nn, n, kk, k, wk)
c
c Update the diagonal preconditioner, BFGS (approximate) formula
c
c PARAMETERS
c D        <-> double precision, size n, diagonal preconditioner
c alpha     -> double precision, size k*k, matrix from routine BCG
c R         -> double precision, size n*k, matrix from routine BCG
c AR        -> double precision, size n*k, matrix from routine BCG
c nn        -> integer, leading dimension of R and AR
c n         -> integer, size of arrays
c kk        -> integer, leading dimension of alpha
c k         -> integer, current block size
c wk        -  double precision, size k, work array
c
      implicit         double precision (a-h,o-z)
      double precision D(*), R(nn,*), AR(nn,*), alpha(kk,*),
     *                 wk(*)
c
c form v'Dv for this processor, and then produce global result
c
C$DOACROSS SHARE(r,D,wk,n), LOCAL(vdv,i)
      do 20 ind = 1,k
          vdv = 0.d0
          do 10 i = 1,n
              vdv = vdv + r(i,ind)*D(i)*r(i,ind)
10        continue
          wk(ind) = vdv
20    continue
c
c update D
c
C$DOACROSS SHARE(R,AR,alpha,wk,D,k), LOCAL(vgv,vdv,t1,t2,ind)
      do 40 i = 1,n
          do 30  ind = 1,k
              vgv = alpha(ind,ind)
              vdv = wk(ind)
              if (abs(vdv) .le. 1.d-12) go to 30
              if (abs(vgv) .le. 1.d-12) go to 30
              t1 = D(i)*R(i,ind)
              t2 = AR(i,ind)
              D(i) = D(i) - t1*t1/vdv + t2*t2/vgv
              if (D(i) .le. 1.d-6) D(i) = 1.d0
30        continue
40    continue
c
      return
      end
c
c
      subroutine precg (V, ldv, n, b, kmax, iter, initv, iflag)
c
c initialize the matrix V
c      initv = 1  col 1 = b, all others random
c      initv = 2  col 1 = b, col 2 = previous direc, all others random
c      initv = 3  col 1 = b, alternate previous direc and previous grad
c
c PARAMETERS
c V      <-> double precision, size n*kmax, initial matrix for block
c            CG iteration
c ldv     -> integer, leading dimension of V
c n       -> integer, size of arrays
c b       -> double precision, size n, right-hand side vector
c kmax    -> integer, # of active processors
c iter    -> integer, outer iteration number
c initv   -> integer, type of initialization desired
c iflag   -> integer, indicator flag (tests if initv=3 is failing)
c
      implicit         double precision (a-h,o-z)
      double precision V(ldv,*), b(*)
c
c Store b as first column of V
c
      call pdcopy (n, b, V)
c
c INITV=1,2,3:  Dynamic (and random) initialization
c
      krand = 2
      if (initv .eq. 2) then
          if (iter .gt. 1) krand = 3
      else if (initv .eq. 3) then
          if (iflag .eq. -1) then
              krand = 3
              go to 20
          endif
          krand = 2*iter
          if (krand .gt. kmax) go to 40
      endif
20    do 30 j = krand,kmax
          call rancol (V, ldv, j, n)
30    continue
c
40    return
      end
c
c
      subroutine postcg (V, ldv, n, k, iter, initv,
     *                  Vinit, ldvi, x, g, gnrm, iflag, kmax)
c
c initialize the matrix V
c    initv = 2  col 1 = b, col 2 = previous direc(x), all others random
c    initv = 3  col 1 = b, alternate previous direcs(x) and grads(g)
c If directions are linearly dependent (at the first inner iteration)
c the offending column is replaced by a random column.  If this happens
c for the same column two outer iterations in a row, subroutine btn
c switches to initialization strategy 3.
c
c V        <-  double precision, size n*k, new initialization matrix
c ldv       -> integer, leading dimension of V
c n         -> integer, size of problem
c k         -> integer, block size
c iter      -> integer, outer iteration number
c initv     -> integer, choice of initialization scheme
c Vinit    <-> double precision, size n*k, storage of initialization
c              vectors
c ldvi      -> integer, leading dimension of Vinit
c x         -> double precision, size n, search-direction vector
c g         -> double precision, size n, current gradient
c gnrm      -> double precision, infinity-norm of g
c iflag     -> integer, flag from BCG (indicates linear dependence)
c kmax      -> integer, number of active processors
c
      implicit         double precision (a-h,o-z)
      double precision V(ldv,*), Vinit(ldvi,*), x(*), g(*)
c
      if (k .eq. 1) return
      if (initv .eq. 1) return
c
c store search direction in column 2
c
      xnrm = pdnrmi (n, x)
      call pdcopy (n, x, V(1,2))
      call pdscal (n, 1.d0/xnrm, V(1,2))
      if (initv .eq.  2) return
      if (iflag .eq. -1) return
      if (k     .eq.  2) return
c
c store current gradient in column 3
c
      call pdcopy (n, g, V(1,3))
      call pdscal (n, 1.d0/gnrm, V(1,3))
      if (k .eq. 3) return
      if (iflag .gt. 0)
     *     call rancol (Vinit, ldvi, iflag, n)
c
c update remaining columns
c
      do 10 j = k, 4, -1
          call pdcopy (n, Vinit(1,j-2), Vinit(1,j))
          call pdcopy (n, Vinit(1,j  ), V(1,j))
10    continue
c
      call pdcopy (n, V(1,2), Vinit(1,2))
      call pdcopy (n, V(1,3), Vinit(1,3))
c
      return
      end
c
c
      subroutine psrch (iflag, n, x, f, g, d, x1, g1, gopt,
     *            sfun, nf, eta, stepmx, kmax, cubesz,
     *            dg, alf, f1, alpha)
c
c Parallel line search
c      Using Armijo convergence test based on function values only
c      See Luenberger (2nd edition), p. 212
c
c Parameters:
c iflag  <--  integer, error code:
c                 0 => okay
c                -1 => okay, but alpha <> 1
c                 1 => no acceptable point found
c                 2 => d is a direction of ascent
c                 3 => function may be unbounded below
c n       --> integer, number of variables
c x      <--> double precision, size n, current and new vector of
c             parameters
c f      <--> double precision, current and new function value
c g      <--> double precision, size n, current and new gradient vector
c d       --> double precision, size n, search direction vector
c x1      --  double precision, size n*k, work array to store temporary x
c g1      --  double precision, size n*k, work array to store temporary g
c gopt    --  double precision, size   n, work array to store temporary g
c             (optimal)
c sfun    --> subroutine to evaluate f(x) (call sfun (n,x,f,g))
c             To run in parallel on the Sequent, this routine (and any
c             routine that it calls) must have the command
c             C$    PRIVATE
c             as its first executable statement.
c nf     <--> integer, total number of function/gradient evaluations
c eta     --> double precision, parameter for line search
c stepmx  --> double precision, maximum step allowed
c kmax    --> integer, number of processors available to evaluate f(x)
c cubesz  --> integer, number of processors
c alf     --  double precision, length cubesz, array for step lengths
c f1      --  double precision, length cubesz, array for function values
c alpha  <--  double precision, step length
c
      implicit         double precision (a-h, o-z)
      double precision d(*), g(*), g1(n,*), gopt(*),
     *                 x(*), x1(n,*), alf(*), f1(*)
      integer          kmax, cubesz
      logical          aup, adown
      external         sfun
c
c  set up
c
      aup     = .true.
      adown   = .true.
      alfopt  = 0.d0
      fopt    = f
      itmax   = 2 + 30/kmax
c
      iflag   = 0
      if (dg .ge. 0.d0) then
          iflag = 2
          return
      endif
      icount = 0
c
c  get maximum step and initialize alpha
c
      if (kmax .eq. 1) then
          alf(1) = 1.d0
          if (stepmx .lt. alf(1)) alf(1) = stepmx
          amax = alf(1)
          amin = amax
      else
          amax = kmax
          if (amax .gt. stepmx) amax = stepmx
          if (amax .gt. 1.d0) then
              amin = 1.d0 / dfloat(kmax)
              if (kmax .eq. 2) amin = 1.d0
          else
              amin = amax / dfloat(kmax)
          end if
          call setalf (amax, amin, alf, kmax, 1)
      endif
c
c  test function values at initial points
c
20    nf = nf + 1
      fmin = f
      imin = 0
      icount = icount + 1
C$DOACROSS SHARE(n,alf,d,x,x1,f1,g1,f,eta,dg,fmin,alpha,
C$&        ftest,imin,sfun),
C$&        LOCAL(f2)
      do 700 id = 1,kmax
          call dsvtvp (n, alf(id), d, 1, x, 1, x1(1,id), 1)
          call feval (n, x1(1,id), f1(id), g1(1,id), sfun)
          ftest = f + eta * alf(id) * dg
c
c determine minimum function value using the subroutine gopf
c
          f2   = f1(id)
          if (f2 .gt. ftest) f2 = fmin + 1.d0
C$        call m_lock
          if (f2 .lt. fmin) then
              fmin  = f2
              alpha = alf(id)
              imin  = id
          endif
C$        call m_unlock
700    continue
c
c Test for failure at this iteration and success at last iteration.
c In this case, use the previous optimal step and exit.
c
      if (fmin .ge. fopt .and. alfopt .ne. 0.d0) then
          imin  = kmax
          alpha = alfopt
          fmin  = fopt
          call pdcopy (n, gopt, g1(1,imin))
          go to 30
      end if
c
c Test for too many iterations
c
      if (icount .ge. itmax .and. imin .eq. 0) then
          iflag = 1
          return
      endif
      if (icount .ge. itmax .and. imin .eq. kmax) then
          iflag = 3
      endif
c
c Test to see if step must be reduced
c
      if (imin .eq. 0 .and. adown) then
          aup  = .false.
          amax = 0.9d0 * amin
          amin = amax / dfloat (kmax)
          if (kmax .eq. 1) then
              amax = amin / 2.d0
              amin = amax
          end if
          call setalf (amax, amin, alf, kmax, 2)
          go to 20
      endif
c
c Test to see if step must be increased
c
      if (kmax .gt. 1 .and. imin .eq. kmax
     *          .and. amax .lt. stepmx .and. aup) then
          adown = .false.
          if (fmin .lt. fopt) then
              alfopt = alpha
              fopt   = fmin
              call pdcopy (n, g1(1,imin), gopt)
          end if
          amin = amax * 1.1d0
          amax = amin * kmax
          if (amax .gt. stepmx) amax = stepmx
          call setalf (amax, amin, alf, kmax, 1)
          go to 20
      endif
c
c form the new x, and retrieve g
c
30    call pdsvtv (n, alpha, d, x, x)
c     call pdsvtvp (n, alpha, d, x, x) [NOTE CHANGE FROM INTEL NAME]
      if (alpha .ne. 1.d0) iflag = -1
      f = fmin
      call pdcopy (n, g1(1,imin), g)
c
      return
      end
c
c
      subroutine setalf (amax, amin, alf, cubesz, ind)
c
c determine a set of steplengths alpha for the line search
c
c Parameters
c amax    --> maximum value of alpha
c amin   <--  minimum value of alpha
c alf     --  double precision, length cubesz, array for step lengths
c cubesz  --> number of processors
c ind     --> indicator: =1 (normal) =2 (shrinking step)
c
      implicit         double precision (a-h, o-z)
      double precision alf(*)
      integer          cubesz
c
c Set up
c
      np1 = cubesz - 1
      if (amin .ge. amax) amin = amax / dfloat(cubesz)
c
c loop
c
      do 200 id = 1,cubesz
         id1 = id - 1
         if (ind .eq. 2) go to 100
c
         if (cubesz .gt. 2) then
             if (amin .lt. 1.d0 .and. amax .gt. 1.d0) then
                 imid = cubesz/2
                 if (id1 .lt. imid) then
                     alf(id) = amin + id1 * (1.d0-amin)/dfloat(imid)
                 else if (id1 .gt. imid) then
                     alf(id) = 1.d0 +
     *                  (id1-imid) * (amax-1.d0)/dfloat(np1-imid)
                 else if (id1 .eq. imid) then
                     alf(id) = 1.d0
                 end if
             else
                 alf(id) = amin + id1 * (amax-amin)/dfloat(np1)
             end if
         else
             if (id .eq. 1) alf(id) = amin
             if (id .eq. 2) alf(id) = amax
         end if
         go to 200
c
c Shrinking of step (more rapid reduction of step to zero)
c
100      if (cubesz .ge. 2) then
             alf(id) = amax / 2.d0**(cubesz-id1-1)
             amin  = amax / 2.d0**(cubesz-1)
         else
             alf(id) = amax
             amin  = amax
         end if
200   continue
c
      return
      end
c
c
      subroutine aminsb (A, lda, B, ldb, C, ldc, m, n)
      double precision A(lda,*), B(ldb,*), C(ldc,*)
c
c computes C = A - B, for m*n matrices A, B, and C
c [C can overwrite either A or B]
c
C$DOACROSS SHARE(A,B,C,m)
      do 10 j = 1, n
          call dvsub (m, A(1,j), 1, B(1,j), 1, C(1,j), 1)
10    continue
c
      return
      end
c
c
      subroutine atimsb (A, lda, n, m, B, ldb, k, C, ldc)
      implicit         double precision (a-h, o-z)
      double precision A(lda,*), B(ldb,*), C(ldc,*)
c
c forms C = A x B,  for A(n*m), B(m*k), and C(n*k)
c
C$DOACROSS SHARE(A,B,C,m,n,lda), LOCAL(i)
      do 10 j = 1,k
      do 10 i = 1,n
          C(i,j) = ddot (m,A(i,1),lda,B(1,j),1)
10    continue
c
      return
      end
c
c
      subroutine disast (routne, msg, iunit, iflag)
c
c print fatal error message on fort.iunit, then return
c
c PARAMETERS
c routne -> character*(*), name of routine where error was detected
c msg       -> character*(*), error message
c iunit       -> integer, unit for output
c iflag <-  integer, flag=999 upon return from disaster
c
      character*(*) routne, msg
c
      write (iunit,800) routne, msg
      iflag = 999
c
      return
800   format (' ********************',         /,
     *        ' ERROR, ERROR, ERROR',          /,
     *        ' Fatal error in routine ', a10, /,
     *        ' ', a40,                        /,
     *        ' Terminating',                  /,
     *        ' ********************')
      end
c
c
      subroutine dpofa2 (A, lda, n, info, iunit, iflag, rnktol)
      implicit         double precision (a-h,o-z)
      double precision A(lda,*),  t, s, tol, mnorm
      integer          info, n
c
c Cholesky factor of a double precision positive definite matrix.
c this is a modification of the LINPACK routine DPOFA, designed to
c produce the lower triangular matrix a column at a time.
c
c info:   integer = 0 for normal return
c                 = k signals an error condition - the leading minor
c                     of order k is not positive definite.
c
      tol = rnktol * mnorm (A, lda, n, n)
c
c  get LDL decomposition from a Cholesky factorization
c
      do 30 i = 1,n
          s = ddot (i-1, A(i,1), lda, A(i,1), lda)
          s = A(i,i) - s
c
          info = i
          if (s .lt. -tol) return
          if (s .le.  tol) then
              info = - info
              return
          endif
          A(i,i) = sqrt(s)
c
          do 20 k = i+1,n
              t = a(k,i)  - ddot (i-1, A(i,1), lda, A(k,1), lda)
              if (A(i,i) .eq. 0.d0) then
                  call disast ('DPOFA2', 'A(i,i) = 0', iunit, iflag)
                  return
              end if
              t = t / A(i,i)
              A(k,i)  = t
20        continue
30    continue
      info = 0
c
      return
      end
c
c
      subroutine getld2 (L, ldl, k, D, iunit, iflag)
      implicit         double precision (a-h, o-z)
      double precision L(ldl,*), D(*)
c
c  get LDL decomposition from a Cholesky factorization
c
      do 10 i = 1,k
          D(i) = L(i,i)
          if (D(i) .eq. 0.d0) then
              call disast ('GETLD2', 'D(i) = 0', iunit, iflag)
              return
          end if
          do 10 j = 1,k
              if (j .gt. i) L(i,j) = 0.d0
              if (j .le. i) L(i,j) = L(i,j) / D(j)
10    continue
      call dvmul (k, D, 1, D, 1, D, 1)
c
      return
      end
c
c
      subroutine lsol (l, ldl, k, y, x, iunit, iflag)
      double precision l(ldl,*), y(*), x(*), ddot
C$    private
c
c solve Lx = y where L is lower triangular
c the vectors x and y may be the same (overwrite solution on rhs)
c
      do 10 i = 1,k
          x(i) = y(i) - ddot (i-1, l(i,1), ldl, x, 1)
          if (l(i,i) .eq. 0.d0) then
              call disast ('LSOL', 'l(i,i) = 0', iunit, iflag)
              return
          end if
          x(i) = x(i) / l(i,i)
10    continue
c
      return
      end
c
c
      subroutine matcpy (A, lda, B, ldb, m, n)
      double precision A(lda,*), B(ldb,*)
c
c copies matrix A  (m x n) onto matrix B
c
C$DOACROSS SHARE(A,B,m)
      do 10 j = 1,n
          call dcopy (m, A(1,j), 1, B(1,j), 1)
10    continue
c
      return
      end
c
c
      subroutine maxpy (y, ldy, n, k, alpha, a, lda, m, x, ldx)
      implicit         double precision (a-h,o-z)
      double precision y(ldy,*), a(lda,*), x(ldx,*)
c
c form y = y + alpha * Ax
c      y      n x k
c      A      n x m
c      x      m x k
c      alpha  scalar
c
C$DOACROSS SHARE(A,x,y,alpha,lda,m,n), LOCAL(j)
      do 20 i = 1,k
          do 10 j = 1,n
              y(j,i) = y(j,i) +
     *                 alpha * ddot(m,A(j,1),lda,x(1,i),1)
10        continue
20    continue
c
      return
      end
c
c
      double precision function mnorm (A, lda, m, n)
      implicit         double precision (a-h,o-z)
      double precision A(lda,*), colnrm, zero
      data             zero /0.d0/
c
c compute the 1-norm of the m x n matrix A
c
      mnorm = zero
      do 10 j = 1,n
          colnrm = dasum (m, A(1,j), 1)
          if (mnorm .lt. colnrm) mnorm = colnrm
10    continue
c
      return
      end
c
c
      subroutine msolve (A, lda, n, k, Am, ldam, Rm)
      double precision A(lda,*), Am(ldam,*), Rm(*)
c
c  preconditions the n*k matrix A, using the vector Rm.
c  resulting matrix is in Am [A and Am can be the same]
c
C$DOACROSS SHARE(Am,A,Rm,n)
      do 10 j = 1,k
          call dvdiv (n, A(1,j), 1, Rm, 1, Am(1,j), 1)
10    continue
c
      return
      end
c
c
      subroutine premlt (A, lda, k1, n,  L, ldl, k2, B,ldb)
      implicit         double precision (a-h,o-z)
      double precision A(lda,*), L(ldl,*), B(ldb,*)
c
c forms B =  L' x A
c for A(k1*n) and L(k1*k2) lower triangular (with k1 .ge. k2)
c
C$DOACROSS SHARE(A,B,L,k1,k2), LOCAL(i)
      do 10 j = 1,n
      do 10 i = 1,k2
          B(i,j) = ddot (k1-i+1,A(i,j),1,L(i,i),1)
10    continue
c
      return
      end
c
c
      subroutine rancol (V, ldv, j, m)
c
c  generate a random vector for the j-th column of the matrix V
c
c PARAMETERS
c V      <-  double precision, size m*j, initialization matrix
c            (see PREBCG)
c ldv     -> integer, leading dimension of V
c j       -> integer, column to be randomized
c m       -> integer, length of row
c
      implicit         double precision (a-h,o-z)
      double precision V(ldv,*)
      integer          click, seed, rand
c
c setup
c
      click = 52343
      icol  = j
      seed  = mod(2*(icol)*click+1,65536)
      rand  = seed
c
c generate random column
c
      do 20 i = 1, m
          rand = mod(3125*rand,65536)
          V(i,j) = (rand - 32768.0d0)/16384.0d0
20    continue
c
      return
      end
c
c
      subroutine timsl1 (A, lda, n, k1, L, ldl, k2)
      implicit         double precision (a-h,o-z)
      double precision A(lda,*), L(ldl,*)
c
c performs A = A x L
c for A(n*k1) and L(k1*k2) lower triangular (with k1 .ge. k2)
c
      do 40 i = 1,k2
          call dscal (n, L(i,i), A(1,i), 1)
          do 30 k = i+1,k1
              call daxpy (n, L(k,i), A(1,k), 1, A(1,i), 1)
30        continue
40    continue
c
      return
      end
c
c
      subroutine vltnv1 (V, ldv, m, k, L, ldl, iunit, iflag)
      implicit         double precision (a-h,o-z)
      double precision L(ldl,*), V(ldv,*)
c
c Solves P L' = V, for L(k*k) lower triangular, P(m*k) and V(m*k)
c V is overwritten with P
c
      do 40 i = 1,k
          do 20 j = 1,i-1
              call daxpy (m, -L(i,j), V(1,j), 1, V(1,i), 1)
20        continue
          rl = 1.d0 / L(i,i)
          call dscal (m, rl, V(1,i), 1)
40    continue
c
      return
      end
c
c
      subroutine vltnv2 (V, ldv, m, k, L, ldl, iunit, iflag)
      implicit         double precision (a-h,o-z)
      double precision L(ldl,*), V(ldv,*)
c
c Solves P L' = V, for L(k*k) unit lower triangular, P(m*k) and V(m*k)
c V is overwritten with P
c
      do 40 i = 1,k
          do 20 j = 1,i-1
              call daxpy (m, -L(i,j), V(1,j), 1, V(1,i), 1)
20        continue
40    continue
c
      return
      end
c
c
      subroutine feval (n, x, f, g, sfun)
c
c compute function evaluation within line search
c
c PARAMETERS
c n       -> integer, dimension of problem
c x       -> double precision, size n, nonlinear parameter vector
c f      <-  double precision, function value at x
c g      <-  double precision, size n, gradient at x
c sfun    -> subroutine sfun (n,x,f,g) to evaluate nonlinear function
c            To run in parallel on the Sequent, this routine (and any
c            routine that it calls) must have the command
c            C$    PRIVATE
c            as its first executable statement.
c
      implicit         double precision  (a-h,o-z)
      double precision x(*), g(*), f
C$    private
c
c perform function evaluation
c
      call sfun  (n, x, f, g)
c
      return
      end
c********************************************************************
c BTN: CHKDER, parallel derivative checker for nonlinear functions
c last changed: 0927/91
c for Sequent Balance (parallel) and scalar computers
c******************************************************************
c
      subroutine chkez (n, x, f, g, w, lw, sfun, iflag, errmax,
     *                  imax)
      implicit         double precision (a-h,o-z)
      double precision x(*), g(*), w(*)
      external         sfun
c
c Check derivatives of objective function via finite-differencing.
c Easy to use version
c
c PARAMETERS
c n       -> integer, dimension of problem
c x       -> double precision, size n, point where derivatives are checked
c f      <-  double precision, value of function at the point x (from sfun)
c g      <-  double precision, size n, gradient at the point x (from sfun)
c w       -  double precision, size n*(2k), work array
c lw      -> integer, declared length of array w [not used]
c sfun    -> subroutine sfun (n,x,f,g) to evaluate f(x), g(x);
c            see subroutine BTNEZ for details. Routine SFUN must
c            be declared EXTERNAL in the calling program.
c            To run in parallel on the Sequent, this routine (and any
c            routine that it calls) must have the command
c            C$    PRIVATE
c            as its first executable statement.
c iflag  <-  integer, error code:   0 => gradient values appear accurate
c                                 100 => gradient error > .01
c errmax <-  double precision, size of largest error in the gradient
c imax   <-  integer, index where errmax was obtained
c
c set default values of customizing parameters
c
      call chkpar (n, kmax, msglvl, iunit, istart, iend,
     *      istep)
c
c check selected gradient values
c
      call chkder (n, x, f, g, w, lw, sfun, iflag,
     *             kmax, errmax, imax,
     *             msglvl, iunit, istart, iend, istep)
      return
      end
c
c
      subroutine chkpar (n, kmax, msglvl, iunit, istart, iend,
     *      istep)
C$    integer*4 cpus_online
c
c  sets parameters for  derivative checker
c
c n         -> integer, number of variables
c kmax     <-  integer, block size = # of processors used
c              default: kmax = [# of processors]/2 on Sequent
c msglvl   <-  integer, amount of printing desired,
c              (<-1 =>  none
c              ( -1 =>  warning messages only
c                 0 =>  one line with maximum error
c                 1 =>  individual components are printed. Processor i
c                       will print to iunit + i
c              default: msglvl = 0
c iunit    <-  integer, see discussion for msglvl
c              default: iunit = 10
c istart   <-  integer, index of the first partial derivative
c              to be checked
c              default: istart = 1
c iend     <-  integer, index of the last partial derivative to be
c              checked
c              default: iend = istep*kmax
c istep    <-  integer, gradient is tested every istep components.
c              default: istep = n/kmax
c              Warning: if n is large or if gradient values are
c                       expensive, checking the gradient values can
c                       be very time consuming.  The default values
c                       result in each processor checking exactly
c                       one gradient component.
c
      nodemx  = 5
C$    nodemx  = cpus_online()/2
      kmax    = nodemx
      if (kmax .gt. n) kmax = n
      msglvl  = 0
      iunit   = 10
      istart  = 1
      istep   = n/kmax
      iend    = istart + istep*kmax - 1
      if (iend .gt. n) iend = n
c
      return
      end
c
c
      subroutine chkder (n, x, f, g, w, lw, sfun, iflag,
     *                   kmax, errmax, imax,
     *                   msglvl, iu, istart, iend, istep)
c
c Check derivatives of objective function via finite-differencing.
c
c PARAMETERS
c n       -> integer, dimension of problem
c x       -> double precision, size n, point where derivatives are checked
c f      <-  double precision, value of function at the point x (from sfun)
c g      <-  double precision, size n, gradient at the point x (from sfun)
c w       -  double precision, size n*(2kmax), work array
c lw      -> integer, length of work array w [not used]
c sfun    -> subroutine sfun (n,x,f,g) to evaluate f(x), g(x);
c            see subroutine BTNEZ for details. Routine SFUN must
c            be declared EXTERNAL in the calling program.
c            To run in parallel on the Sequent, this routine (and any
c            routine that it calls) must have the command
c            C$    PRIVATE
c            as its first executable statement.
c iflag  <-  integer, error code:   0 => gradient values appear accurate
c                                 100 => gradient error > .01
c kmax    -> integer, block size [normally equal to number of processors,
c            unless more than one processor is being used to compute each
c            function value; see subroutine BTN for further details]
c errmax <-  double precision, size of largest error in the gradient
c imax   <-  integer, index where errmax was obtained
c msglvl  -> integer, specify amount of printing desired:
c                  <-1 => none
c                   -1 => warning messages only
c                    0 => one line with maximum error
c                    1 => output to unit (iu)
c iu      -> integer, see discussion of parameter msglvl
c istart  -> integer, test gradient starting at component istart
c iend    -> integer, test gradient starting at component iend
c istep   -> integer, test gradient at every istep component
c                    e.g., if istep=2 then test every 2nd component
c
c REQUIRES
c d1mach  -  function to specify machine epsilon [d1mach(3)]
c
      implicit         double precision (a-h,o-z)
      double precision x(*), g(*), w(n,*)
C$    integer*4        m_get_myid
      logical          msg0, msg1
c
c Check value of machine epsilon
c
      eps = d1mach(3)
      if (1.d0 + 2.d0*eps .eq. 1.d0) then
          write (*,*) ' ### CHKDER: ERROR ###'
          write (*,*) '     Machine epsilon too small'
          write (*,*) '     Value = ', eps
          write (*,*) '     Modify routine D1MACH'
          write (*,*) '     Terminating execution'
          stop
      end if
c
c Setup: obtain processor information
c
      iflag  = 0
      h      = sqrt(eps)
      imax   = 0
      msg0   = msglvl .ge. 0
      msg1   = msglvl .ge. 1
      if (msg1) write (iu,800)
      errmax = 0.d0
c
c evaluate function and gradient at x
c
      call sfun (n, x, f, g)
c
c compute finite-difference estimate of gradient
c
C$DOACROSS SHARE(w,x,g,f,h,errmax,imax,istart,istep,
C$&              n,sfun,kmax),
C$&        LOCAL(fx,gi,erri,i1,i2)
      do 10 i = istart, iend, istep
          i1 = (i-istart)/istep + 1
C$        i1 = m_get_myid() + 1
          i2 = i1 + kmax
          call dcopy (n, x, 1, w(1,i2), 1)
          w(i,i2) = w(i,i2) + h
          call sfun (n, w(1,i2), fx, w(1,i1))
          gi = (fx - f) / h
          erri = abs(gi - g(i)) / (1.d0 + abs(g(i)))
c         if (msg1) write (iu,810) i, g(i), gi, erri
C$        call m_lock
          if (erri .gt. errmax) then
              errmax = erri
              imax   = i
          end if
C$        call m_unlock
10    continue
c
      if (errmax .gt. 1.d-2) iflag = 100
      if (msg0) write ( *,820) errmax, imax
      if (msg1) write (iu,820) errmax, imax
c
      return
800   format (' Testing Derivatives', //,
     *        '   i', 9x, 'g(i)', 14x, 'gi', 13x, 'error')
810   format (' ', i3, 2x, d16.8, 2x, d16.8, 2x, d12.4)
820   format (/, ' CHKDER:  Max. error in gradient = ', d12.4, /,
     *           '          observed at component = ', i6)
      end
      DOUBLE PRECISION FUNCTION D1MACH(I)
C***BEGIN PROLOGUE  D1MACH
C***DATE WRITTEN   750101   (YYMMDD)
C***REVISION DATE  901002   (YYMMDD)
C***CATEGORY NO.  R1
C***KEYWORDS  MACHINE CONSTANTS
C***AUTHOR  FOX, P. A., (BELL LABS)
C           HALL, A. D., (BELL LABS)
C           SCHRYER, N. L., (BELL LABS)
C***PURPOSE  Returns double precision machine dependent constants
C***DESCRIPTION
C
C     D1MACH can be used to obtain machine-dependent parameters
C     for the local machine environment.  It is a function
C     subprogram with one (input) argument, and can be called
C     as follows, for example
C
C          D = D1MACH(I)
C
C     where I=1,...,5.  The (output) value of D above is
C     determined by the (input) value of I.  The results for
C     various values of I are discussed below.
C
C  Double-precision machine constants
C  D1MACH( 1) = B**(EMIN-1), the smallest positive magnitude.
C  D1MACH( 2) = B**EMAX*(1 - B**(-T)), the largest magnitude.
C  D1MACH( 3) = B**(-T), the smallest relative spacing.
C  D1MACH( 4) = B**(1-T), the largest relative spacing.
C  D1MACH( 5) = LOG10(B)
C***REFERENCES  FOX P.A., HALL A.D., SCHRYER N.L.,*FRAMEWORK FOR A
C                 PORTABLE LIBRARY*, ACM TRANSACTIONS ON MATHEMATICAL
C                 SOFTWARE, VOL. 4, NO. 2, JUNE 1978, PP. 177-188.
C***ROUTINES CALLED  NONE
C***END PROLOGUE  D1MACH
C
      INTEGER SMALL(4)
      INTEGER LARGE(4)
      INTEGER RIGHT(4)
      INTEGER DIVER(4)
      INTEGER LOG10(4)
C$    PRIVATE
C
      DOUBLE PRECISION DMACH(5)
C
      EQUIVALENCE (DMACH(1),SMALL(1))
      EQUIVALENCE (DMACH(2),LARGE(1))
      EQUIVALENCE (DMACH(3),RIGHT(1))
      EQUIVALENCE (DMACH(4),DIVER(1))
      EQUIVALENCE (DMACH(5),LOG10(1))
C
C
C     MACHINE CONSTANTS FOR THE CDC CYBER 170 SERIES (FTN5).
C
C     DATA SMALL(1) / O"00604000000000000000" /
C     DATA SMALL(2) / O"00000000000000000000" /
C
C     DATA LARGE(1) / O"37767777777777777777" /
C     DATA LARGE(2) / O"37167777777777777777" /
C
C     DATA RIGHT(1) / O"15604000000000000000" /
C     DATA RIGHT(2) / O"15000000000000000000" /
C
C     DATA DIVER(1) / O"15614000000000000000" /
C     DATA DIVER(2) / O"15010000000000000000" /
C
C     DATA LOG10(1) / O"17164642023241175717" /
C     DATA LOG10(2) / O"16367571421742254654" /
C
C     MACHINE CONSTANTS FOR THE CDC CYBER 200 SERIES
C
C     DATA SMALL(1) / X'9000400000000000' /
C     DATA SMALL(2) / X'8FD1000000000000' /
C
C     DATA LARGE(1) / X'6FFF7FFFFFFFFFFF' /
C     DATA LARGE(2) / X'6FD07FFFFFFFFFFF' /
C
C     DATA RIGHT(1) / X'FF74400000000000' /
C     DATA RIGHT(2) / X'FF45000000000000' /
C
C     DATA DIVER(1) / X'FF75400000000000' /
C     DATA DIVER(2) / X'FF46000000000000' /
C
C     DATA LOG10(1) / X'FFD04D104D427DE7' /
C     DATA LOG10(2) / X'FFA17DE623E2566A' /
C
C
C     MACHINE CONSTANTS FOR THE CDC 6000/7000 SERIES.
C
C     DATA SMALL(1) / 00564000000000000000B /
C     DATA SMALL(2) / 00000000000000000000B /
C
C     DATA LARGE(1) / 37757777777777777777B /
C     DATA LARGE(2) / 37157777777777777777B /
C
C     DATA RIGHT(1) / 15624000000000000000B /
C     DATA RIGHT(2) / 00000000000000000000B /
C
C     DATA DIVER(1) / 15634000000000000000B /
C     DATA DIVER(2) / 00000000000000000000B /
C
C     DATA LOG10(1) / 17164642023241175717B /
C     DATA LOG10(2) / 16367571421742254654B /
C
C     MACHINE CONSTANTS FOR THE CRAY 1
C
C     DATA SMALL(1) / 201354000000000000000B /
C     DATA SMALL(2) / 000000000000000000000B /
C
C     DATA LARGE(1) / 577767777777777777777B /
C     DATA LARGE(2) / 000007777777777777774B /
C
C     DATA RIGHT(1) / 376434000000000000000B /
C     DATA RIGHT(2) / 000000000000000000000B /
C
C     DATA DIVER(1) / 376444000000000000000B /
C     DATA DIVER(2) / 000000000000000000000B /
C
C     DATA LOG10(1) / 377774642023241175717B /
C     DATA LOG10(2) / 000007571421742254654B /
C
C
C     MACHINE CONSTANTS FOR THE IBM 360/370 SERIES,
C     THE XEROX SIGMA 5/7/9, THE SEL SYSTEMS 85/86, AND
C     THE PERKIN ELMER (INTERDATA) 7/32.
C
C     DATA SMALL(1),SMALL(2) / Z00100000, Z00000000 /
C     DATA LARGE(1),LARGE(2) / Z7FFFFFFF, ZFFFFFFFF /
C     DATA RIGHT(1),RIGHT(2) / Z33100000, Z00000000 /
C     DATA DIVER(1),DIVER(2) / Z34100000, Z00000000 /
C     DATA LOG10(1),LOG10(2) / Z41134413, Z509F79FF /
C
C     MACHINE CONSTATNS FOR THE IBM PC FAMILY (D. KAHANER NBS)
C
C     DATA DMACH/2.23D-308,1.79D+308,1.11D-16,2.22D-16,
C    *  0.301029995663981195D0/
C
C     MACHINE CONSTATNS FOR THE SEQUENT BALANCE (S. NASH)
C
      DATA DMACH/5.562684646268003D-309,
     *           8.988465674311579D+307,
     *           1.110223024625157D-16,
     *           2.2204460492503131D-16,
     *           0.3010299956639812D+00/
C
C     MACHINE CONSTATNS FOR THE IPSC/2 HYPERCUBE (S. NASH)
C
C     DATA DMACH/5.562684646268003D-309,
C    *           8.988465674311579D+307,
C    *           1.110223024625157D-16,
C    *           2.2204460492503131D-16,
C    *           0.3010299956639812D+00/
C
C     MACHINE CONSTANTS FOR THE PDP-10 (KA PROCESSOR).
C
C     DATA SMALL(1),SMALL(2) / "033400000000, "000000000000 /
C     DATA LARGE(1),LARGE(2) / "377777777777, "344777777777 /
C     DATA RIGHT(1),RIGHT(2) / "113400000000, "000000000000 /
C     DATA DIVER(1),DIVER(2) / "114400000000, "000000000000 /
C     DATA LOG10(1),LOG10(2) / "177464202324, "144117571776 /
C
C     MACHINE CONSTANTS FOR THE PDP-10 (KI PROCESSOR).
C
C     DATA SMALL(1),SMALL(2) / "000400000000, "000000000000 /
C     DATA LARGE(1),LARGE(2) / "377777777777, "377777777777 /
C     DATA RIGHT(1),RIGHT(2) / "103400000000, "000000000000 /
C     DATA DIVER(1),DIVER(2) / "104400000000, "000000000000 /
C     DATA LOG10(1),LOG10(2) / "177464202324, "476747767461 /
C
C
C     MACHINE CONSTANTS FOR THE SUN-3 (INCLUDES THOSE WITH 68881 CHIP,
C       OR WITH FPA BOARD. ALSO INCLUDES SUN-2 WITH SKY BOARD. MAY ALSO
C       WORK WITH SOFTWARE FLOATING POINT ON EITHER SYSTEM.)
C
C     DATA SMALL(1),SMALL(2) / X'00100000', X'00000000' /
C     DATA LARGE(1),LARGE(2) / X'7FEFFFFF', X'FFFFFFFF' /
C     DATA RIGHT(1),RIGHT(2) / X'3CA00000', X'00000000' /
C     DATA DIVER(1),DIVER(2) / X'3CB00000', X'00000000' /
C     DATA LOG10(1),LOG10(2) / X'3FD34413', X'509F79FF' /
C
C
C     MACHINE CONSTANTS FOR VAX 11/780
C     (EXPRESSED IN INTEGER AND HEXADECIMAL)
C    *** THE INTEGER FORMAT SHOULD BE OK FOR UNIX SYSTEMS***
C
C     DATA SMALL(1), SMALL(2) /        128,           0 /
C     DATA LARGE(1), LARGE(2) /     -32769,          -1 /
C     DATA RIGHT(1), RIGHT(2) /       9344,           0 /
C     DATA DIVER(1), DIVER(2) /       9472,           0 /
C     DATA LOG10(1), LOG10(2) /  546979738,  -805796613 /
C
C    ***THE HEX FORMAT BELOW MAY NOT BE SUITABLE FOR UNIX SYSYEMS***
C     DATA SMALL(1), SMALL(2) / Z00000080, Z00000000 /
C     DATA LARGE(1), LARGE(2) / ZFFFF7FFF, ZFFFFFFFF /
C     DATA RIGHT(1), RIGHT(2) / Z00002480, Z00000000 /
C     DATA DIVER(1), DIVER(2) / Z00002500, Z00000000 /
C     DATA LOG10(1), LOG10(2) / Z209A3F9A, ZCFF884FB /
C
C   MACHINE CONSTANTS FOR VAX 11/780 (G-FLOATING)
C     (EXPRESSED IN INTEGER AND HEXADECIMAL)
C    *** THE INTEGER FORMAT SHOULD BE OK FOR UNIX SYSTEMS***
C
C     DATA SMALL(1), SMALL(2) /         16,           0 /
C     DATA LARGE(1), LARGE(2) /     -32769,          -1 /
C     DATA RIGHT(1), RIGHT(2) /      15552,           0 /
C     DATA DIVER(1), DIVER(2) /      15568,           0 /
C     DATA LOG10(1), LOG10(2) /  1142112243, 2046775455 /
C
C    ***THE HEX FORMAT BELOW MAY NOT BE SUITABLE FOR UNIX SYSYEMS***
C     DATA SMALL(1), SMALL(2) / Z00000010, Z00000000 /
C     DATA LARGE(1), LARGE(2) / ZFFFF7FFF, ZFFFFFFFF /
C     DATA RIGHT(1), RIGHT(2) / Z00003CC0, Z00000000 /
C     DATA DIVER(1), DIVER(2) / Z00003CD0, Z00000000 /
C     DATA LOG10(1), LOG10(2) / Z44133FF3, Z79FF509F /
C
C
C***FIRST EXECUTABLE STATEMENT  D1MACH
      IF (I .LT. 1  .OR.  I .GT. 5) THEN
          WRITE (*,*) 'D1MACH -- I OUT OF BOUNDS'
          STOP
      END IF
C
      D1MACH = DMACH(I)
      RETURN
C
      END
c********************************************************************
c BTN: Sample problem 1 (simple usage of BTNEZ, CHKDER)
c main program
c last changed: 10/10/90
c********************************************************************
c
      program          main
      implicit         double precision (a-h,o-z)
      parameter        (n = 100, ndmx = 10,
     *                 lw = (3+7*ndmx)*(n+ndmx))
      double precision x(n), g(n), w(lw)
      external         sfun
c
c specify initial guess of parameters
c
      call xstart (x, n)
c
c check derivative values at initial point
c
      call chkez (n, x, f, g, w, lw, sfun, iflag, errmax, imax)
c
c solve optimization problem
c
      call btnez (n, x, f, g, w, lw, sfun, iflag)
c
c Print results (using output unit 10)
c
      if (iflag .eq. 999) write (*,*) ' DISASTER'
      iunit = 10
      write (iunit,810)
      if (iflag .ne.   0) then
          write (    *,*) ' Error code = ', iflag
          write (iunit,*) ' Error code = ', iflag
      end if
      ig   = idamax (n, g, 1)
      write (iunit,820) f
      write (iunit,830) abs(g(ig))
      write (iunit,840)
      write (iunit,850) (x(i), i = 1,n)
c
      stop
810   format (' Results of optimization problem', /)
820   format (' Final function value = ', 1pd24.16)
830   format (' Norm of gradient     = ', 1pd12.4)
840   format (' Parameters')
850   format (4(1pd16.8))
      end
c
c----------------------------------------------------------------------
c initial guess
c----------------------------------------------------------------------
c
      subroutine xstart (x, n)
      double precision    x(n)
c
c specify initial values of variables
c
      do 10 i = 1,n
          x(i) = 0.d0
10    continue
c
      return
      end
c
c----------------------------------------------------------------------
c function evaluation
c----------------------------------------------------------------------
c
      subroutine sfun (n ,x, f, g)
c
c evaluate nonlinear function and gradient
c
      implicit  double precision (a-h,o-z)
      double precision    x(n), f, g(n)
C$    PRIVATE
c
      f = 0.d0
      do 30 i = 1,n
          b = i
          if (i .gt. n/2) b = -b
          f = f + 5.d-1*i*x(i)*x(i) - b*x(i)
          g(i) = i*x(i) - b
30    continue
c
      return
      end
c********************************************************************
c BTN: Sample problem 2 (simple customization of BTN, CHKDER)
c main program
c last changed: 10/10/90
c********************************************************************
c
      program          main
      implicit         double precision (a-h,o-z)
      parameter        (n = 100, ndmx = 10,
     *                 lw = (3+7*ndmx)*(n+ndmx))
      double precision x(n), g(n), w(lw)
      external         sfun
c
c set initial guess of parameters
c
      call xstart (x, n)
c
c check derivative values (checking every 5th component)
c
      call chkpar (n, kmax, msglvl, iunit, istart, iend, istep)
      iend  = n
      istep = 5
      call chkder (n, x, f, g, w, lw, sfun, iflag,
     *             kmax, errmax, imax,
     *             msglvl, iunit, istart, iend, istep)
      if (iflag .ne. 0) stop
c
c Set customizing parameters for BTN
c
      call btnpar (nodemx, kmax, maxit, msglvl, iprec, nlmax,
     *    initv, tolq, iunit, rnktol, maxfun, accrcy, stepmx, eta,
     *    ireset, indstp)
      accrcy  = 1.d-6
c
c solve optimization problem
c
      call btn (n, x, f, g, w, lw, sfun, iflag,
     *          nodemx, kmax, maxit, msglvl, iprec, nlmax,
     *          initv, tolq, iunit, rnktol, maxfun, accrcy,
     *          stepmx, eta, ireset, indstp, sfun)
c
c Print results
c
      if (iflag .eq. 999) write (*,*) ' DISASTER'
      write (iunit,810)
      if (iflag .ne.   0) then
          write (    *,*) ' Error code = ', iflag
          write (iunit,*) ' Error code = ', iflag
      end if
      ig   = idamax (n, g, 1)
      write (iunit,820) f
      write (iunit,830) abs(g(ig))
      write (iunit,840)
      write (iunit,850) (x(i), i = 1,n)
c
      stop
810   format (//, ' Results of optimization problem', /)
820   format (' Final function value = ', 1pd24.16)
830   format (' Norm of gradient     = ', 1pd12.4)
840   format (' Parameters')
850   format (4(1pd16.8))
      end
c
c----------------------------------------------------------------------
c initial guess
c----------------------------------------------------------------------
c
      subroutine xstart (x, n)
      double precision x(n)
c
c specify initial values of variables
c
      do 10 i = 1,n
          x(i) = 0.d0
10    continue
c
      return
      end
c
c----------------------------------------------------------------------
c function evaluation
c----------------------------------------------------------------------
c
      subroutine sfun (n ,x, f, g)
c
c evaluate nonlinear function and gradient
c
      implicit         double precision (a-h,o-z)
      double precision x(n), f, g(n)
C$    PRIVATE
c
      f = 0.d0
      do 30 i = 1,n
          b = i
          if (i .gt. n/2) b = -b
          f = f + 5.d-1*i*x(i)*x(i) - b*x(i)
          g(i) = i*x(i) - b
30    continue
c
      return
      end
c********************************************************************
c BTN: Sample problem 3 (complex usage of BTN, CHKDER)
c main program
c last changed: 10/10/90
c********************************************************************
c
      program          main
      implicit         double precision (a-h,o-z)
      parameter        (nn = 100, ndmx = 10,
     *                 lw = (3+7*ndmx)*(nn+ndmx))
      double precision x(nn), g(nn), w(lw)
      external         sfun
c
c Read data from file
c
      open (7, file = 'main3.d', status = 'old')
      read (7,800) n
c
c Set default values of parameters
c
      call chkpar (n, kmax, msglvl, iunit, istart, iend, istep)
      call btnpar (nodemx, kmax, maxit, msglvl, iprec, nlmax,
     *    initv, tolq, iunit, rnktol, maxfun, accrcy, stepmx, eta,
     *    ireset, indstp)
c
c Read in new values of selected parameters
c
      read (7,800) ichk
      read (7,800) istart
      read (7,800) iend
      read (7,800) istep
      read (7,800) msglvl
      read (7,800) iunit
      read (7,800) kmax
c
      read (7,810) rnktol
      read (7,810) accrcy
c
c check derivative values (if desired)
c
      call xstart (x, n)
      if (ichk .ne. 0) then
          call chkder (n, x, f, g, w, lw, sfun, iflag,
     *                 kmax, errmax, imax,
     *                 msglvl, iunit, istart, iend, istep)
          write (iunit,815) errmax, imax
          stop
      end if
c
c solve optimization problem
c
      call btn (n, x, f, g, w, lw, sfun, iflag,
     *          nodemx, kmax, maxit, msglvl, iprec, nlmax,
     *          initv, tolq, iunit, rnktol, maxfun, accrcy,
     *          stepmx, eta, ireset, indstp, sfun)
c
c write results to file
c
      write (iunit,820)
      write (iunit,830) iflag
      write (iunit,840) f
      write (iunit,850)
      write (iunit,860) (x(i), i = 1,n)
      write (iunit,870)
      write (iunit,860) (g(i), i = 1,n)
c
      stop
800   format (10x, i5)
810   format (10x, d16.4)
815   format (' CHKDER: Max. error in gradient = ', 1pd12.4,
     *        ' at index ', i5)
820   format (' Results of BTN', /)
830   format (' Error code = ', i4)
840   format (' Function value = ', 1pd24.16)
850   format (/, ' x = ')
860   format (3(1pd16.8))
870   format (/, ' g = ')
      end
c
c----------------------------------------------------------------------
c initial guess
c----------------------------------------------------------------------
c
      subroutine xstart (x, n)
      double precision x(n)
c
c specify initial values of the variables
c
      do 10 i = 1,n
          x(i) = 0.d0
10    continue
c
      return
      end
c
c----------------------------------------------------------------------
c function evaluation
c----------------------------------------------------------------------
c
      subroutine sfun (n ,x, f, g)
c
c evaluate the nonlinear function and its gradient
c
      implicit         double precision (a-h,o-z)
      double precision x(*), g(*)
C$    PRIVATE
c
c compute function value and gradient
c
      f = 0.d0
      do 30 i = 1,n
          b = i
          if (i .gt. n/2) b = -b
          f = f + 5.d-1*i*x(i)*x(i) - b*x(i)
          g(i) = i*x(i) - b
30    continue
c
      return
      end
c********************************************************************
c BTN: Sample problem 4 (using BTN to solve a constrained problem)
c
c A logarithmic barrier method is used to minimize a nonlinear
c function subject to simple bounds on the variables ("box" constraints)
c
c main program
c last changed: 05/20/91
c********************************************************************
c
      program          main
      implicit         double precision (a-h,o-z)
      parameter        (n = 100, ndmx = 10,
     *                 lw = (3+7*ndmx)*(n+ndmx))
      double precision x(n), g(n), w(lw), lb(n), ub(n), mu
      common /bnd/     lb, ub
      common /bar/     mu
      external         sfun, maxstp
c
c Initialize nonlinear parameters x and barrier parameter mu
c
      call xstart (x, n)
      mu    = 1.d0
c
c set up parameters for derivative checker
c
      call chkpar (n, kmax, msglvl, iunit, istart, iend, istep)
c
c check derivative values at initial point
c
      call chkder (n, x, f, g, w, lw, sfun, iflag,
     *             kmax, errmax, imax,
     *             msglvl, iunit, istart, iend, istep)
      if (iflag .ne. 0) stop
c
c Set customizing parameters for BTN (the user-supplied routine
c maxstp will be used to bound the step length in the line search)
c
      call btnpar (nodemx, kmax, maxit, msglvl, iprec, nlmax, initv,
     *          tolq, iunit, rnktol, maxfun, accrcy, stepmx, eta,
     *          ireset, indstp)
      indstp = 1
c
c solve optimization problem
c
      scale = 10.d0
10    write (*,860) mu
      call btn (n, x, f, g, w, lw, sfun, iflag,
     *          nodemx, kmax, maxit, msglvl, iprec, nlmax,
     *          initv, tolq, iunit, rnktol, maxfun, accrcy,
     *          stepmx, eta, ireset, indstp, maxstp)
c
c Print results
c
      if (iflag .eq. 999) then
          write (*,*) ' Fatal error in BTN'
          stop
      end if
c
c update barrier parameter
c
      mu = mu / scale
      if (mu .gt. 5.d-6) go to 10
      write (iunit,810)
      if (iflag .ne.   0) then
          write (    *,*) ' Error code = ', iflag
          write (iunit,*) ' Error code = ', iflag
      end if
      ig   = idamax (n, g, 1)
      write (iunit,820) f
      write (iunit,830) abs(g(ig))
      write (iunit,840)
      write (iunit,850) (x(i), i = 1,n)
      stop
810   format (//, ' Results of optimization problem', /)
820   format (' Final function value = ', 1pd24.16)
830   format (' Norm of gradient     = ', 1pd12.4)
840   format (' Parameters')
850   format (4(1pd16.8))
860   format (//,' mu =', 1pd12.4)
      end
c
c----------------------------------------------------------------------
c initial guess
c----------------------------------------------------------------------
c
      subroutine xstart (x, n)
      double precision x(n)
      double precision lb(100), ub(100), mu
      common /bnd/     lb, ub
      common /bar/     mu
c
c specify initial values of variables and their bounds
c
      do 10 i = 1,n
          x(i)  = 0.5d0
          lb(i) = 0.d0
          ub(i) = 1.d0
10    continue
c
      return
      end
c
c----------------------------------------------------------------------
c function evaluation
c----------------------------------------------------------------------
c
      subroutine sfun (n ,x, f, g)
c
c evaluate nonlinear function and gradient
c
      implicit         double precision (a-h,o-z)
      double precision x(n), f, g(n)
      double precision lb(100), ub(100), mu
      common /bnd/     lb, ub
      common /bar/     mu
C$    PRIVATE
c
      eps = 1.d-20
      f = 0.d0
      do 30 i = 1,n
          b = 1.d0
          if (i .gt. n/2) b = -b
          d1 = x(i) - lb(i)
          d2 = ub(i) - x(i)
          if (d1 .lt. eps) write (*,800) d1
          if (d2 .lt. eps) write (*,800) d2
          f = f + b * x(i)
     *       - mu * (log (d1)) - mu * (log (d2))
          g(i) =  b - mu / d1 + mu / d2
30    continue
c
      return
800   format (' Warning: negative slack value ', d16.8, /,
     *        ' Parameter mu may be too small')
      end
c
c
      subroutine maxstp (n, x, d, stepmx)
c
c computes maximum stepsize moving from the (feasible) point x
c in direction d, under box constraints
c
c Parameters:
c n       --> integer, number of variables
c x       --> double precision, size n, current and new vector
c             of parameters
c d       --> double precision, size n, search direction vector
c stepmx <--  double precision, maximum step allowed
c
      implicit         double precision (a-h, o-z)
      double precision d(*), x(*), lb(100), ub(100)
      common /bnd/     lb, ub
c
c  set up
c
      alpha   = 1.d8
      ep = 1.d-8
      i1 = 1
      i2 = n
      do 10 i = i1,i2
          if (d(i) .gt. ep) then
              t = (ub(i) - x(i)) / d(i)
           else if (d(i) .lt. -ep) then
              t = (x(i) - lb(i)) /(-d(i))
           endif
           if (t .lt. alpha) alpha = t
10    continue
      stepmx = alpha*0.99999d0
c
      return
      end