(********************* EXTREE version 1.5 ********************************) (* *) (* EXTREE copyright 1982-1995 by James E. Corter *) (* All rights reserved. *) (* *) (* EXTREE is a program in the PASCAL language for fitting the extended *) (* tree model (Corter & Tversky, 1986) to proximity data. The data may *) (* be either similarity or distance data. A listing of parameters and *) (* their estimates, drawing of the extended tree, and summary statistics *) (* are reported. Various other output options are available. A help file *) (* accompanies this source file. See it for more information on using *) (* the program. This program was written by James E. Corter. *) (* Questions or comments regarding the program may be addressed to him: *) (* INTERNET: jec34@columbia.edu *) (* Box 41, Teachers College, Columbia University, New York, NY 10027 *) (* *) (* NOTICE REGARDING RETRIEVAL AND USE OF EXTREE AND ALL ASSOCIATED *) (* MATERIAL: *) (* By retrieving or using any of the files in this directory *) (* pertaining to EXTREE, you are agreeing to the following *) (* terms and conditions: *) (* *) (* 1. All the material you retrieved is and will remain the property *) (* of the author, James E. Corter. It is provided only for not-for- *) (* profit research and teaching purposes. The material to be supplied *) (* is not to be published or redistributed, with or without modifi- *) (* cation, or resold, or used for any business or commercial purpose *) (* without the express written permission of the author. *) (* *) (* 2. THIS MATERIAL IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR *) (* IMPLIED WARRANTY. IN PARTICULAR, BUT WITHOUT LIMITATION, NEITHER THE*) (* AUTHOR NOR AT&T MAKE ANY REPRESENTATION OR WARRANTY OF ANY KIND *) (* CONCERNING THE MERCHANTABILITY OF THIS MATERIAL, ITS FITNESS FOR ANY *) (* PARTICULAR PURPOSE, ITS FREEDOM FROM INFRINGEMENT OF ANY PATENT OR *) (* COPYRIGHT, ITS ACCURACY, OR PROVISION OF TECHNICAL SUPPORT. *) (* *) (********************** EXTREE version 1.5 *******************************) program extree(input,output,infile,outfile); (* change these constants for bigger datasets, or for different compilers *) (* for IBM-PC Turbo PASCAL 6.0 version, set to 32,64,96,496,(80) *) const maxobject = 32; (* = maximum number of objects *) twiceobject = 64; (* = 2n = max # of tree nodes *) thriceobject = 96; (* = 3n = max # of tree + marked features *) maxsize = 496; (* = n*(n-1)/2 (number of distances) *) linelength = 80; (* set to local printer linelength *) type grouptype = set of 1..maxobject; matrixarray = array[1..maxsize] of real; nodearray = array[1..thriceobject] of real; rootingtype = (asis,minvar,specified); var prox: matrixarray; (* raw input data *) d: matrixarray; (* transformed data *) c: array[1..maxsize] of integer; numobject: integer; (* number of objects, n *) numprox: integer; (* number of distances (=n*(n-1)/2) *) n,size: integer; (* same as numobject, numprox *) proxcode: real; (* =-1.0 for similarity data, 1.0 for dis*) field,decplaces: integer; (* describes output data fields *) addconstant,maxdis,diff,sum,sum1,sum2: real; group: array[1..maxobject] of grouptype; numgroup,nreduce: integer; numnode,rootnode: integer; child: array[1..thriceobject,1..2] of 0..thriceobject; leafcount: array[1..thriceobject] of integer; height: array[1..thriceobject] of real; parent: array[1..thriceobject] of 0..thriceobject; length: nodearray; index: array[1..thriceobject] of 1..thriceobject; use: array[1..thriceobject] of boolean; name: array[1..thriceobject,1..20] of char; branchset: array[1..thriceobject] of grouptype; leafss: nodearray; numfeat: integer; setzero,zeroleft: integer; iteration: integer; spillover: array[1..thriceobject] of boolean; host: array[1..thriceobject,1..6] of 0..thriceobject; printdata,printtransformed,printnumbers, specifyorder,specifytree,specifymarked,printpatternm, printheight,printtree,printmodeldis,printresiduals: boolean; (* output options requests *) similarities,lowerhalf: boolean; (* similarities or distances; lowerhalf or full matrix *) subtractconst: boolean; (* is data adjusted to EXACTLY satisfy triangle inequality? *) rooting: rootingtype; (* three options are available for selection of root: asis=just put root at last two nodes minvar=find root with min. variance of root-to-leaf distances specified=root at specified nodes *) rootchild: array[1..2] of integer; (* "children" of root node *) linesize: integer; (* output linelength for crt, etc. *) namelength: integer; (* length of longest name *) symbol: array[1..thriceobject] of char; feed: integer; (* signals when output is about to run off line *) column,fld: integer; (* hold column & data field info *) threshold, (* threshold for eliminating small marked segmentss *) tthreshold, (* " " " small tree branches *) cthreshold: real; (* relative threshold for defining cliques *) numberhosts: array[1..thriceobject] of integer; maxspillover,newconstant,rootarclength: real; nummarked: integer; depthofsearch: integer; saveroot,rootd: real; nodeposition: nodearray; flagzero: boolean; i,j,k,l,m,ij,ik,jk,il,jl,kl,q,x,y,int,whichgroup,hold: integer; ch: char; infile,outfile: text; infilnam,outfilnam: string[24]; (***********************************************************************) (* ijfunction: converts the row and column of a lowerhalf matrix *) (* to the ordinal position of that entry in a vector (matrixarray) *) (***********************************************************************) function ijfunction(i:integer; j:integer): integer; begin if (i>j) then ijfunction:=(((i-1)*(i-2)) div 2) + j else if (j>i) then ijfunction:=(((j-1)*(j-2)) div 2) + i else writeln('error: i=j in ijfunction'); end; (* ijfunction *) (***********************************************************************) (* writematrix: write a matrixarray as lowerhalf matrix; *) (* n = size of the matrix, *) (* field = number of digits for each entry, and *) (* decplaces = number of digits after the decimal place *) (***********************************************************************) procedure writematrix(mtx: matrixarray; n,field,decplaces: integer); var row,col: integer; begin ij:=0; for row:=2 to n do begin for col:=1 to (row-1) do begin ij:=ij+1; write(outfile,mtx[ij]:field:decplaces); if (col=feed) then writeln(outfile); end; writeln(outfile); end; writeln(outfile); writeln(outfile); end; (* writematrix *) (***********************************************************************) (* qsort: does quicksort of array dd, saves new order in array c *) (***********************************************************************) procedure qsort(var dd: matrixarray; lower,upper: integer); begin if (upper>lower+1) then begin x:=lower+1; y:=upper; while (x<=y) do if (dd[c[x]]>=dd[c[lower]]) then x:=x+1 else begin while (dd[c[y]]dd[c[upper]]) then begin hold:=c[lower]; c[lower]:=c[upper]; c[upper]:=hold; end; end; (* qsort *) (************************************************************************) (* readin: read program control information and data *) (************************************************************************) procedure readin; var i,j,k,ij,count: integer; dumreal: real; ch: char; word: array[1..3] of char; procedure readword; (* read a word of input control keywords *) begin ch:=' '; while ((ch=' ') or (ch=',')) do if not(eoln(infile)) then read(infile,ch); i:=0; while not((ch=' ') or (ch=',') or (eoln(infile))) do begin i:=i+1; if not(i=1) then read(infile,ch); j:=ord(ch); if ((j>=65) and (j<=90)) then j:=j+32; ch:=chr(j); if not(i>3) then word[i]:=ch; end; end; (* readword *) begin (* readin *) (* initialize & set defaults *) printdata:=false; printtransformed:=false; printheight:=false; printmodeldis:=false; printresiduals:=false; printtree:=true; printnumbers:=true; printpatternm:=true; specifyorder:=false; specifytree:=false; specifymarked:=false; linesize:=linelength; rooting:=asis; subtractconst:=true; cthreshold:=0.5; threshold:=-9999.9; tthreshold:=-9999.9; nummarked:=-9999; (* reset later *) (* read analysis & output option line *) while not (eoln(infile)) do begin readword; if ((word[1]='d')and(word[2]='a')and(word[3]='t')) then printdata:=true; if ((word[1]='t')and(word[2]='r')and(word[3]='a')) then printtransformed:=true; if ((word[1]='m')and(word[2]='o')and(word[3]='d')) then printmodeldis:=true; if ((word[1]='r')and(word[2]='e')and(word[3]='s')) then printresiduals:=true; if ((word[1]='n')and(word[2]='o')and(word[3]='t')) then printtree:=false; if ((word[1]='c')and(word[2]='r')and(word[3]='t')) then linesize:=80; if ((word[1]='n')and(word[2]='o')and(word[3]='n')) then printnumbers:=false; if ((word[1]='s')and(word[2]='p')and(word[3]='e')) then specifyorder:=true; if ((word[1]='n')and(word[2]='o')and(word[3]='s')) then subtractconst:=false; if ((word[1]='h')and(word[2]='e')and(word[3]='i')) then printheight:=true; if ((word[1]='t')and(word[2]='r')and(word[3]='e')) then specifytree:=true; if ((word[1]='m')and(word[2]='a')and(word[3]='r')) then specifymarked:=true; if ((word[1]='n')and(word[2]='o')and(word[3]='s')) then subtractconst:=false; if ((word[1]='n')and(word[2]='o')and(word[3]='p')) then printpatternm:=false; if ((word[1]='l')and(word[2]='i')and(word[3]='n')) then read(infile,linesize); if ((word[1]='t')and(word[2]='h')and(word[3]='r')) then read(infile,threshold); if ((word[1]='t')and(word[2]='t')and(word[3]='h')) then read(infile,tthreshold); if ((word[1]='r')and(word[2]='h')and(word[3]='o')) then read(infile,cthreshold); if ((word[1]='p')and(word[2]='a')and(word[3]='r')) then read(infile,nummarked); if ((word[1]='m')and(word[2]='i')and(word[3]='n')) then rooting:=minvar; if ((word[1]='d')and(word[2]='e')and(word[3]='e')) then writeln(outfile,'deepest rooting no longer supported: option ignored'); if ((word[1]='r')and(word[2]='o')and(word[3]='o')) then begin rooting:=specified; read(infile,rootchild[1],rootchild[2]); end; end; readln(infile); (* if order of leaves is to be specified, then read that (optional) line *) count:=0; if (specifyorder) then begin while not(eoln(infile)) do begin count:=count+1; read(infile,i); nodeposition[i]:=count end; readln(infile); end; writeln(outfile); writeln(outfile,'extree analysis (EXTREE version 1.5):'); (* read & write comment line *) for i:=1 to linelength do if not(eoln(infile)) then begin read(infile,ch); write(outfile,ch) end; readln(infile); writeln(outfile); writeln(outfile); writeln(outfile); (* begin reading data description line *) read(infile,numobject); if (numobject>maxobject) then writeln(outfile,'error: maximum number of objects allowed is ',maxobject:4); if (nummarked>maxobject) then writeln('error: no more than',maxobject:4, ' overlapping features can be specified'); if ((specifyorder) and not(count=numobject)) then writeln(outfile,'error: optional line specifying order of leaves ', 'has wrong number of entries'); readword; if ((word[1]='s')and(word[2]='i')and(word[3]='m')) then similarities:=true else if ((word[1]='d')and(word[2]='i')and(word[3]='s')) then similarities:=false else writeln(outfile,' can''t tell if similarities or distances'); readword; if ((word[1]='l')and(word[2]='o')and(word[3]='w')) then lowerhalf:=true else if ((word[1]='f')and(word[2]='u')and(word[3]='l')) then lowerhalf:=false else writeln(outfile,' can''t tell if full or lowerhalf matrix'); if not(eoln(infile)) then read(infile,field) else writeln(outfile,' can''t find field-width specification'); if not(eoln(infile)) then read(infile,decplaces) else writeln(outfile,' can''t find digits-after-decimal-point specification'); readln(infile); numprox:=(numobject*(numobject-1)) div 2; feed:= linesize div field; (* begin reading actual data *) (* read full matrix, treating as asymmetric; print data if requested *) if (lowerhalf=false) then begin if (printdata=true) then writeln(outfile,'data matrix:'); for ij:=1 to maxsize do prox[ij]:=0.0; for i:=1 to numobject do begin for j:=1 to numobject do begin if (eoln(infile)) then readln(infile); read(infile,dumreal); if (printdata) then write(outfile,dumreal: field: decplaces); if ((printdata) and (j=feed)) then writeln(outfile); if not(i=j) then begin ij:=ijfunction(i,j); prox[ij]:=prox[ij]+(dumreal/2) end; end; readln(infile); if (printdata) then writeln(outfile); end; writeln(outfile); writeln(outfile,'full matrix symmetrized'); writeln(outfile); writeln(outfile); end; (* read lowerhalf matrix *) if (lowerhalf) then begin ij:=0; for i:=2 to numobject do begin if (eoln(infile)) then readln(infile); for j:=1 to (i-1) do begin ij:=ij+1; if (eoln(infile)) then readln(infile); read(infile,prox[ij]); end; end; readln(infile); end; (* read in names if they are given *) j:=1; namelength:=0; while not((eof(infile)) or (j>numobject)) do begin i:=1; while not((eoln(infile)) or (i>20)) do begin read(infile,name[j,i]); i:=i+1; end; k:=i-1; if not(k=0) then while (name[j,k]=' ') do k:=k-1; if (namelengthmaxdis) then maxdis:=prox[ij]; end; if (min<0.0) then addconstant:=-min; writeln(outfile); writeln(outfile); writeln(outfile,'(',addconstant:field:decplaces, ' needed for positivity of distances )'); (* check every triple of objects for triangle inequality *) min:=maxdis; xxij:=1; for i:=1 to (numobject-2) do begin xxij:=xxij+i-1; ij:=xxij; xxjk:=xxij; for j:=(i+1) to (numobject-1) do begin ij:=ij+(j-2); ik:=ij; xxjk:=xxjk+j-1; jk:=xxjk; for k:=(j+1) to (numobject) do begin ik:=ik+(k-2); jk:=jk+(k-2); if ((prox[jk]>=prox[ij]) and (prox[jk]>=prox[ik])) then diff:=(prox[ij]+prox[ik])-prox[jk] else if ((prox[ik]>=prox[ij]) and (prox[ik]>=prox[jk])) then diff:=(prox[ij]+prox[jk])-prox[ik] else diff:=(prox[ik]+prox[jk])-prox[ij]; if (difflinelength) then begin linesize:=linelength; writeln(outfile,'warning: linesize must be <= ',linelength:3,' (resetting it)') end; if (nummarked=-9999) then nummarked:=(numobject div 2); if (cthreshold<0.0) then cthreshold:=0.0; if (cthreshold>1.0) then cthreshold:=1.0; if (threshold=-9999.9) then threshold:=(maxdis+addconstant)/50.0 else if (threshold<0.0) then begin threshold:=0.0; writeln(outfile,'warning: threshold must be >= 0.0 (resetting it)') end; if (tthreshold=-9999.9) then tthreshold:=threshold else if (tthreshold<0.0) then tthreshold:=0.0; end; (* initialize *) (************************************************************************) (* quadruples: check every quadruple of objects, finding the pairs that *) (* are closest; increment their "neighbor score", c *) (************************************************************************) procedure quadruples; var i,j,k,l,ij,ik,jk,il,jl,kl: integer; xxij,xxjk,xxkl: integer; sum1,sum2,sum3: real; begin for ij:=1 to size do c[ij]:=0; (* check every quadruple of objects *) xxij:=1; for i:=1 to (n-3) do begin xxij:=xxij+i-1; ij:=xxij; xxjk:=xxij; for j:=(i+1) to (n-2) do begin ij:=ij+(j-2); ik:=ij; xxjk:=xxjk+j-1; jk:=xxjk; xxkl:=xxjk; for k:=(j+1) to (n-1) do begin ik:=ik+(k-2); jk:=jk+(k-2); il:=ik; jl:=jk; xxkl:=xxkl+k-1; kl:=xxkl; for l:=(k+1) to n do begin il:=il+(l-2); jl:=jl+(l-2); kl:=kl+(l-2); sum1:=d[ij]+d[kl]; sum2:=d[ik]+d[jl]; sum3:=d[il]+d[jk]; (* object pairs in smallest of these sums receive a nearest-neighbor score of 2,those in the next smallest 1, those in largest 0 *) if (sum1maxloser) then maxloser:=losercount; end; (* findcloserneighbor *) begin (* findgroups *) (* find nearest-neighbors of each object & calc. rowsumdis. for tiebreaking *) holdcount:=0; for i:=1 to n do begin rowmaxcount:=-1; rowsumdistance[i]:=0.0; for j:=1 to n do if not(i=j) then begin ij:=ijfunction(i,j); rowsumdistance[i]:=rowsumdistance[i]+d[ij]; if (c[ij]>rowmaxcount) then begin rowmaxcount:=c[ij]; numnn:=1; neighbors[1]:=j; end else if (c[ij]=rowmaxcount) then begin numnn:=numnn+1; neighbors[numnn]:=j; end; end; (* loop on j *) (* check if any pair made from a new neighbor of i is already on list *) for new:=1 to numnn do begin goodone[new]:=true; for k:=1 to holdcount do if (([i]+[neighbors[new]])=([pair[k,1]]+[pair[k,2]])) then goodone[new]:=false; end; numnew:=0; for new:=1 to numnn do begin if (goodone[new]) then begin numnew:=numnew+1; pair[holdcount+numnew,1]:=i; pair[holdcount+numnew,2]:=neighbors[new]; end; end; holdcount:=holdcount+numnew; end; (* loop on each object, i *) for i:=1 to holdcount do goodone[i]:=true; maxloser:=0; (* compare every pair of pairs with a common member to see which is better *) for j:=2 to holdcount do for i:=1 to (j-1) do begin pairi:=[pair[i,1]]+[pair[i,2]]; pairj:=[pair[j,1]]+[pair[j,2]]; common:= pairi*pairj; if (common<>[]) (* if pairs overlap, then resolve by l.s. test *) then begin otheri:=pairi-common; otherj:=pairj-common; for int:=1 to n do begin if (int in common) then com:=int; if (int in otheri) then oth1:=int; if (int in otherj) then oth2:=int; end; findcloserneighbor(com,oth1,oth2); if (neighbor=oth1) then goodone[j]:=false else goodone[i]:=false; end; end; (* loop on pairs j,i of object pairs *) (* now each object i should have only a single nearest neighbor *) numgroup:=0; maxgood:=0; for i:=1 to holdcount do begin ij:=ijfunction(pair[i,1],pair[i,2]); if (goodone[i] and (c[ij]>maxgood)) then maxgood:=c[ij]; if ((goodone[i]) and (c[ij]>=maxloser)) then begin numgroup:=numgroup+1; group[numgroup]:=[pair[i,1]]+[pair[i,2]]; end; end; if (numgroup=0) (* if no good pair was > maxloser, take max good one(s) *) then for i:=1 to holdcount do if goodone[i] then begin ij:=ijfunction(pair[i,1],pair[i,2]); if (c[ij]=maxgood) then begin numgroup:=numgroup+1; group[numgroup]:=[pair[i,1]]+[pair[i,2]] end; end; end; (* findgroups *) (*************************************************************************) (* makenode: using list of groups from "findgroups", make new nodes from *) (* groups. Compute length of arcs between new node and its *) (* children, and define distances from new node as weighted *) (* average of distances from children. *) (*************************************************************************) procedure makenode; var i,j,k,ij,node,holdcount,sumcount: integer; nodegroup: array[1..2] of integer; (* holds pair being made into node *) procedure estimate; (* compute arc lengths *) var sum,avedis,diff: array[1..2] of real; dis,avelength: real; i,j,holdcount: integer; begin ij:=ijfunction(nodegroup[1],nodegroup[2]); dis:=d[ij]; for i:=1 to 2 do begin holdcount:=0; sum[i]:=0.0; for j:=1 to n do if (use[j]) then begin ij:=ijfunction(nodegroup[i],j); sum[i]:=sum[i]+leafcount[index[j]]*d[ij]; holdcount:=holdcount+leafcount[index[j]]; end; avedis[i]:=sum[i]/holdcount; end; avelength:=(avedis[1]+avedis[2])/2; for i:=1 to 2 do begin diff[i]:=avedis[i]-avelength; length[index[nodegroup[i]]]:=dis/2+diff[i]-height[index[nodegroup[i]]]; if (length[index[nodegroup[i]]]<0.0) then length[index[nodegroup[i]]]:=0.0; end; end; (* estimate *) begin (* makenode *) numnode:=numnode+1; node:=numnode; k:=0; for i:=1 to n do if (i in group[whichgroup]) then begin k:=k+1; use[i]:=false; parent[index[i]]:=node; nodegroup[k]:=i end; if not(k=2) then writeln(outfile,'error: wrong number of children: node',node:4); for i:=1 to k do begin leafcount[node]:=leafcount[node]+leafcount[index[nodegroup[i]]]; branchset[node]:=branchset[node]+branchset[index[nodegroup[i]]]; end; (* order children according to specified or input order *) if (nodeposition[index[nodegroup[2]]]rootarclength) then diff:=rootarclength; if (diff<-rootarclength) then diff:=-rootarclength; length[child[rootnode,1]]:=(rootarclength-diff)/2.0; length[child[rootnode,2]]:=(rootarclength+diff)/2.0; sum1:=height[child[rootnode,1]]+length[child[rootnode,1]]; sum2:=height[child[rootnode,2]]+length[child[rootnode,2]]; height[rootnode]:=((sum1*leafcount[child[rootnode,1]]) +(sum2*leafcount[child[rootnode,2]])) / leafcount[rootnode]; leafss[rootnode]:=leafss[child[rootnode,1]]+leafss[child[rootnode,2]] +leafcount[child[rootnode,1]]*sqr(height[rootnode]-sum1) +leafcount[child[rootnode,2]]*sqr(height[rootnode]-sum2); if (rooting=minvar) then begin writeln(outfile); writeln(outfile,'var. of dis. from root to leaves =', leafss[rootnode]:9:4); end; if (rootarclength>0.0) then saveroot:=length[child[rootnode,1]]/rootarclength else saveroot:=0.0; end; (* hangroot *) (************************************************************************) (* findbestroot: find optimal rooting of the tree *) (* (for rooting=minvar or rooting=specified) *) (************************************************************************) procedure findbestroot; var parentlength,parentheight,parentss,minscore,psum,bsum,othersum,fooheight, newparentheight,newparentss,foo1,foo2: real; score: nodearray; parentcount,newparentcount,goodnode,badnode,goodchild,badchild, ch1,ch2: integer; foundroot: boolean; procedure checkchild(gp,bp,nde,othernde: integer); begin if ((nde>0) and (othernde>0)) then begin if (rooting=specified) then if ((branchset[rootchild[1]]*branchset[nde]<>[]) and (branchset[rootchild[2]]*branchset[nde]<>[])) then score[nde]:=-1.0 else score[nde]:=0.0; if (rooting=minvar) then begin psum:=parentheight+parentlength; othersum:=height[othernde]+length[othernde]; newparentcount:=parentcount+leafcount[othernde]; newparentheight:=((psum*parentcount) + (othersum*leafcount[othernde])) / newparentcount; newparentss:=parentss+leafss[othernde] +(parentcount*sqr(psum-newparentheight)) +(leafcount[othernde]*sqr(othersum-newparentheight)); diff:=height[nde]-newparentheight; if (diff>length[nde]) then diff:=length[nde]; if (diff<-length[nde]) then diff:=-length[nde]; foo1:=(length[nde]-diff)/2.0; foo2:=(length[nde]+diff)/2.0; sum1:=height[nde]+foo1; sum2:=newparentheight+foo2; fooheight:=((sum1*leafcount[nde])+(sum2*newparentcount))/numobject; score[nde]:=leafss[nde]+newparentss +leafcount[nde]*sqr(fooheight-sum1) +newparentcount*sqr(fooheight-sum2); writeln(outfile,'node',nde:3,' score=',score[nde]:8:3); end; (* minvar section *) if (score[nde]',nde:3,')'); parent[nde]:=rootnode; parent[bp]:=gp; length[bp]:=rootarclength; branchset[gp]:=branchset[bp]+branchset[othernde]; leafcount[gp]:=leafcount[bp]+leafcount[othernde]; height[gp]:=((leafcount[bp]*(height[bp]+length[bp])) +(leafcount[othernde]*(height[othernde]+length[othernde]))) / leafcount[gp]; othersum:=height[bp]+rootarclength; bsum:=height[othernde]+length[othernde]; leafss[gp]:=leafss[bp]+leafss[othernde] +(leafcount[bp]*sqr(othersum-height[gp])) +(leafcount[othernde]*sqr(bsum-height[gp])); nodeposition[gp]:=((leafcount[bp]*nodeposition[bp]) +(leafcount[othernde]*nodeposition[othernde])) / leafcount[gp]; if (nodeposition[gp]=score[rootnode]) then foundroot:=true else changeroot(goodnode,badnode,goodchild,badchild); until (foundroot); end; (* findbestroot *) (*********************************************************************) (* report: describe nodes, tree structure, and draw tree *) (*********************************************************************) procedure report; procedure drawtree; const vertchar = '|'; (* characters used to draw tree; change if desired *) horizchar = '-'; var start,stop,maxstop,linestop,middle,node,numleaf: integer; maxxposition,stretch: real; line: array[1..linelength] of char; xposition: array[1..thriceobject] of real; yposition: array[1..thriceobject] of integer; topkid,bottomkid: array[1..thriceobject] of integer; writej: boolean; (* descend calculates: vertical position of a branch ("yposition") horizontal position of branch ("xposition") endpoints of vertical connecting lines ("topkid","bottomkid") *) procedure descend(nde: integer); var i: integer; begin if not(nde=rootnode) then xposition[nde]:=xposition[parent[nde]]+length[nde]; if (nde<=numobject) (* if nde is a leaf *) then begin numleaf:=numleaf+1; yposition[nde]:=(numleaf*2)-1 end else begin (* if nde is an interior node *) for i:=1 to 2 do begin descend(child[nde,i]); yposition[nde]:=yposition[nde]+ (yposition[child[nde,i]]*leafcount[child[nde,i]]); end; yposition[nde]:=yposition[nde] div leafcount[nde]; topkid[nde]:=yposition[child[nde,1]]; bottomkid[nde]:=yposition[child[nde,2]] end; end; (* descend *) begin (* drawtree *) xposition[rootnode]:=0.0; numleaf:=0; for i:=(numobject+1) to numnode do yposition[i]:=0; descend(rootnode); if not(numleaf=numobject) then writeln(outfile,'error in tree structure: numleaf=',numleaf); maxxposition:=0.0; for i:=1 to numobject do if (xposition[i]>maxxposition) then maxxposition:=xposition[i]; maxstop:=(linesize-namelength-7); if ((printpatternm=true) and (numfeat>numnode)) then begin maxstop:=maxstop-2-(2*nummarked); for i:=1 to nummarked do if not(use[numnode+i]) then maxstop:=maxstop+2; for i:=1 to (linesize-15) do write(outfile,' '); writeln(outfile,'marked feature'); for i:=1 to (linesize-15) do write(outfile,' '); writeln(outfile,'pattern matrix'); for i:=1 to (linesize-15) do write(outfile,' '); writeln(outfile,'--------------'); writeln(outfile); end; stretch:=maxstop/maxxposition; (* begin loop on printer lines *) for y:=1 to ((2*numobject)-1) do begin linestop:=0; for i:=1 to linesize do line[i]:=' '; for node:=1 to numnode do begin (* place tree features *) if (yposition[node]=y) then begin if (parent[node]<>0) then start:=trunc(stretch*xposition[parent[node]])+1 else start:=1; stop:=trunc(stretch*xposition[node])+1; if (stop>linestop) then linestop:=stop; if (start<=stop) then for x:=start to stop do line[x]:=horizchar; (* place marked features *) if (numfeat>numnode) then begin for j:=1 to (numfeat-numnode) do for k:=1 to numberhosts[j] do if (host[j,k]=node) then begin if (length[j+numnode]>0.0) then middle:=start+trunc(stretch*length[j+numnode]) else middle:=start; if (use[j+numnode]=true) then for x:=start to middle do if (x<=maxstop) then line[x]:=symbol[j]; if (length[j+numnode]>0.0) then start:=middle+1; end; (* loop when host=node *) end; (* marked feature section *) end; (* loop that is performed only when yposition[node] equals y *) (* place vertical connecting lines *) if (node>numobject) then if ((y>topkid[node]) and (ylinestop) then linestop:=stop; line[stop]:=vertchar; end; end; (* loop on node *) (* write out computed line *) for i:=1 to linestop do if not(i>maxstop) then write(outfile,line[i]); if ((numfeat>numnode) and not(odd(y)) and printpatternm) then begin for i:=1 to (maxstop-linestop+4+namelength) do write(outfile,' '); if (printnumbers=true) then write(outfile,' '); for i:=1 to nummarked do if (use[numnode+i]) then write(outfile,' .'); end; for i:=1 to numobject do if (yposition[i]=y) then begin if (printnumbers=true) then write(outfile,i:4); write(outfile,' '); for j:=1 to namelength do write(outfile,name[i,j]); if ((numfeat>numnode) and (printpatternm)) then begin write(outfile,' '); if (maxstop>linestop) then for j:=1 to (maxstop-linestop) do write(outfile,' '); for j:=1 to (numfeat-numnode) do begin writej:=false; for k:=1 to numberhosts[j] do if ((i in branchset[host[j,k]]) and (use[numnode+j]=true)) then writej:=true; if (writej=true) then write(outfile,' ',symbol[j]) else if (use[numnode+j]=true) then write(outfile,' .'); end; end; end; (* loop on i *) writeln(outfile); end; writeln(outfile); writeln(outfile); end; (* drawtree *) begin (* report *) writeln(outfile); writeln(outfile,' node length children label'); writeln(outfile); for i:=1 to numfeat do begin write(outfile,i:4,' '); write(outfile,length[i]:field:decplaces,' '); if (i<=numobject) (* if i is a leaf *) then begin write(outfile,' '); for j:=1 to 20 do write(outfile,name[i,j]) end else if (i<=numnode) (* if i is an internal tree node *) then for j:=1 to 2 do write(outfile,child[i,j]:4) else begin (* if i is a marked feature *) k:=i-numnode; for j:=1 to numberhosts[k] do write(outfile,host[k,j]:4); write(outfile,' "',symbol[k],'"'); end; writeln(outfile); end; writeln(outfile); writeln(outfile); writeln(outfile); writeln(outfile); if (printtree=true) then drawtree; end; (* report *) (***************************************************************************) (* stats: calculate stress and r-squared between data & model distances *) (***************************************************************************) procedure stats; var modeldis: matrixarray; procedure derivedistances; begin ij:=0; for i:=2 to numobject do begin for j:=1 to (i-1) do begin ij:=ij+1; modeldis[ij]:=0.0; for k:=1 to numnode do if (((i in branchset[k]) and not(j in branchset[k])) or ((j in branchset[k]) and not(i in branchset[k]))) then modeldis[ij]:=modeldis[ij]+length[k]; if (numfeat>numnode) then for k:=1 to (numfeat-numnode) do for l:=1 to (numberhosts[k]-1) do for m:=(l+1) to numberhosts[k] do if (((i in branchset[host[k,l]]) and (j in branchset[host[k,m]])) or((j in branchset[host[k,l]]) and (i in branchset[host[k,m]]))) then modeldis[ij]:=modeldis[ij]-2.0*length[k+numnode]; end; end; end; (* derivedistances *) procedure lsmonotransform(var modeldis,d: matrixarray); var i,j,place,ordered: integer; bin: real; endcondition: integer; begin repeat ordered:=1; i:=1; repeat if (d[c[i+1]]>d[c[i]]) then begin ordered:=0; endcondition:=0; bin:=d[c[i]]; place:=i; repeat place:=place+1; bin:=bin+d[c[place]]; if (place=numprox) then endcondition:=1 else if (d[c[place+1]]<(bin/(place-i+1))) then endcondition:=1; until (endcondition=1); for j:=i to place do d[c[j]]:=(bin/(place-i+1)); i:=place end; i:=i+1 until (i>=numprox); until (ordered=1); end; (* lsmonotransform *) procedure stresscalc(modeldis,d: matrixarray); var i: integer; stress,stress1,stress2,denom, varn,varob,nmean,obmean,propor,rmonsq: real; begin stress:=0.0; denom:=0.0; varn:=0.0; varob:=0.0; nmean:=0.0; obmean:=0.0; propor:=0.0; for i:=1 to numprox do begin obmean:=obmean+prox[i]; nmean:=nmean+modeldis[i] end; nmean:=nmean/numprox; obmean:=obmean/numprox; for i:=1 to numprox do begin stress:=stress+sqr(d[i]-modeldis[i]); propor:=propor+(prox[i]*modeldis[i]); denom:=denom+sqr(modeldis[i]); varn:=varn+sqr(modeldis[i]-nmean); varob:=varob+sqr(prox[i]-obmean); end; stress1:=sqrt(stress/denom); stress2:=sqrt(stress/varn); rmonsq:=1.0-(stress/varn); varn:=varn/numprox; varob:=varob/numprox; propor:=(sqr(propor/numprox - nmean*obmean))/(varn*varob); writeln(outfile,'stress formula 1 =',stress1:7:4); writeln(outfile,'stress formula 2 =',stress2:7:4); writeln(outfile,'r(monotonic) squared=',rmonsq:6:4); writeln(outfile,'r-squared (p.v.a.f.)=',propor:6:4); writeln(outfile); writeln(outfile); writeln(outfile); writeln(outfile); end; (* stresscalc *) begin (* stats *) derivedistances; (* re-use arrays c & d for lsmt transform of model distances *) for ij:=1 to numprox do begin d[ij]:=modeldis[ij]; c[ij]:=ij end; qsort(prox,1,numprox); lsmonotransform(modeldis,d); stresscalc(modeldis,d); (* now re-use array d to store (untransformed) tree distances *) for ij:=1 to numprox do d[ij]:=modeldis[ij]; end; (* stats *) (****************************************************************************) (* beginning of extree section -- look for marked features *) (****************************************************************************) (* findfeatures: check all possible marked features, save those > threshold *) (****************************************************************************) procedure findfeatures; var est: matrixarray; (* use to store estimates of marked segments *) row,col: array[1..maxsize] of 1..twiceobject; leaf,ileaf,jleaf,icount,jcount,count,branch1,branch2: integer; ileaflist,jleaflist: array[1..maxobject] of 1..maxobject; avedisij,avedisik,avedisil,avedisjk,avedisjl,diskl: real; treedisij,treedisik,treedisil,treedisjk,treedisjl,treediskl: real; node,othernode,nmarked,hold,pq,ii,jj,iijj: integer; sum,sum1,sum2,sum3,diff,estimate, treesum,treesum1,treesum2,treesum3: real; nerf: array[1..2] of integer; procedure findbestset; type intarray = array[0..thriceobject] of integer; var useit: array[1..thriceobject] of boolean; connected: array[1..thriceobject,1..thriceobject] of boolean; all,compsub: intarray; weight: array[1..thriceobject] of real; feature,group,othergroup,clique,r,cc,nc,ig,io,nh: integer; groupset,othergroupset: grouptype; hght1,hght2,test: real; (* cliques is the recursive part of the clique-finding algorithm *) (* uses Bron-Kerbosch algorithm (Alg. 457, CACM) *) procedure cliques(old: intarray; ne,ce: integer); var new: intarray; nod, fixp: integer; newne,newce,i,j,count,pos,p,s,sel,minnod: integer; loc: integer; begin minnod:=ce; i:=0; nod:=0; (* determine each counter value and look for minimum *) while ((i0)) do begin i:=i+1; p:=old[i]; count:=0; j:=ne; (* count points not connected to p *) while ((j2)) (* report only cliques with more than 2 members *) then begin clique:=clique+1; write(outfile,' clique = '); for loc:=1 to cc do begin host[clique,loc]:=compsub[loc]; write(outfile,compsub[loc]:3) end; numberhosts[clique]:=cc; writeln(outfile) end else if ((newce<>0) and (newne1) then begin (* select a candidate not connected to the fixed point *) s:=ne; repeat s:=s+1 until not(connected[fixp,old[s]]); end; end; (* backtrackcycle *) end; (* procedure cliques *) begin (* procedure findbestset *) for i:=1 to depthofsearch do useit[i]:=true; (* throw out "false" children & parents *) if (depthofsearch>=2) then for group:=1 to (depthofsearch-1) do for othergroup:=(group+1) to depthofsearch do begin ig:=c[group]; io:=c[othergroup]; if (row[ig]=row[io]) then if (((parent[col[ig]]=col[io]) and (est[io]<(0.5*est[ig]))) or (col[ig]=parent[col[io]])) then useit[othergroup]:=false; if (col[ig]=col[io]) then if (((parent[row[ig]]=row[io]) and (est[io]<(0.5*est[ig]))) or (row[ig]=parent[row[io]])) then useit[othergroup]:=false; if (row[ig]=col[io]) then if (((parent[col[ig]]=row[io]) and (est[io]<(0.5*est[ig]))) or (col[ig]=parent[row[io]])) then useit[othergroup]:=false; if (col[ig]=row[io]) then if (((parent[row[ig]]=col[io]) and (est[io]<(0.5*est[ig]))) or (row[ig]=parent[col[io]])) then useit[othergroup]:=false; end; for q:=1 to numnode do (* prepare for clique-finding *) begin for r:=1 to numnode do connected[q,r]:=false; connected[q,q]:=true end; for i:=1 to depthofsearch do if useit[i] then begin pq:=c[i]; connected[row[pq],col[pq]]:=true; connected[col[pq],row[pq]]:=true; end; (* look through binary feature list for maximal-weight complete subgraphs *) clique:=0; cc:=0; for r:=1 to numnode do all[r]:=r; cliques(all,0,numnode); if (clique>0) then (* estimate clique weights *) for nc:=1 to clique do begin weight[nc]:=0.0; for i:=1 to depthofsearch do begin count:=0; for j:=1 to numberhosts[nc] do if ((row[c[i]]=host[nc,j]) or (col[c[i]]=host[nc,j])) then count:=count+1; if (count>1) then weight[nc]:=weight[nc]+est[c[i]] end; nh:=numberhosts[nc]; test:=weight[nc]/(nh*(nh-1)/2); writeln(outfile,'clique',nc:3,' ave. weight =',test:field:decplaces); (* eliminate each pairwise feature i that went into clique, unless it's twice as large as clique *) for i:=1 to depthofsearch do begin count:=0; for j:=1 to numberhosts[nc] do if ((row[c[i]]=host[nc,j]) or (col[c[i]]=host[nc,j])) then count:=count+1; if (count>1) then begin nh:=numberhosts[nc]; test:=(weight[nc]-est[c[i]])/((nh*(nh-1)/2)-1); if ((est[c[i]]*cthreshold)=2) then for group:=1 to (depthofsearch-1) do if useit[group] then begin groupset:=[]; for i:=1 to numberhosts[group] do groupset:=groupset+branchset[host[group,i]]; for othergroup:=(group+1) to (depthofsearch) do if useit[othergroup] then begin othergroupset:=[]; for i:=1 to numberhosts[othergroup] do othergroupset:=othergroupset+branchset[host[othergroup,i]]; if ((groupset+othergroupset)=branchset[rootnode]) then begin (* represent as feature of objects that are nearer in tree approx. of data *) hght1:=0.0; hght2:=0.0; for i:=1 to numberhosts[group] do hght1:=hght1+height[host[group,i]]+length[host[group,i]]; for i:=1 to numberhosts[othergroup] do hght2:=hght2+height[host[othergroup,i]]+length[host[othergroup,i]]; if ((hght1/numberhosts[group])<(hght2/numberhosts[othergroup])) then useit[othergroup]:=false else useit[group]:=false; end; end; end; (* clean up arrays, throwing out features deleted as redundant *) if (depthofsearch>=1) then begin feature:=0; for i:=1 to depthofsearch do if (useit[i] and ((feature+numnode) threshold *) for node:=(numobject+1) to (numnode-2) do (* don't consider rootnode *) begin write('.'); for othernode:=(node+1) to (numnode-1) do (* ditto *) begin for i:=1 to 2 do for j:=1 to 2 do if ((branchset[child[node,i]]*branchset[child[othernode,j]])=[]) then begin branch1:=child[node,i]; branch2:=child[othernode,j]; count:=0; estimate:=0.0; sum:=0.0; treesum:=0.0; icount:=0; jcount:=0; for leaf:=1 to numobject do begin if (leaf in branchset[branch1]) then begin icount:=icount+1; ileaflist[icount]:=leaf end; if (leaf in branchset[branch2]) then begin jcount:=jcount+1; jleaflist[jcount]:=leaf end end; for ii:=1 to icount do for jj:=1 to jcount do begin iijj:=ijfunction(ileaflist[ii],jleaflist[jj]); sum:=sum+prox[iijj]; treesum:=treesum+d[iijj]; end; avedisij:=sum/(icount*jcount); treedisij:=treesum/(icount*jcount); for k:=1 to (numobject-1) do if not((k in branchset[branch1]) or (k in branchset[branch2])) then begin avedisik:=0.0; treedisik:=0.0; for ileaf:=1 to icount do begin avedisik:=avedisik+prox[ijfunction(ileaflist[ileaf],k)]; treedisik:=treedisik+d[ijfunction(ileaflist[ileaf],k)]; end; avedisik:=avedisik/icount; treedisik:=treedisik/icount; avedisjk:=0.0; treedisjk:=0.0; for jleaf:=1 to jcount do begin avedisjk:=avedisjk+prox[ijfunction(jleaflist[jleaf],k)]; treedisjk:=treedisjk+d[ijfunction(jleaflist[jleaf],k)]; end; avedisjk:=avedisjk/jcount; treedisjk:=treedisjk/jcount; for l:=(k+1) to numobject do if not((l in branchset[branch1]) or (l in branchset[branch2])) then begin avedisil:=0.0; treedisil:=0.0; for ileaf:=1 to icount do begin avedisil:=avedisil+prox[ijfunction(ileaflist[ileaf],l)]; treedisil:=treedisil+d[ijfunction(ileaflist[ileaf],l)]; end; avedisil:=avedisil/icount; treedisil:=treedisil/icount; avedisjl:=0.0; treedisjl:=0.0; for jleaf:=1 to jcount do begin avedisjl:=avedisjl+prox[ijfunction(jleaflist[jleaf],l)]; treedisjl:=treedisjl+d[ijfunction(jleaflist[jleaf],l)]; end; avedisjl:=avedisjl/jcount; treedisjl:=treedisjl/jcount; diskl:=prox[ijfunction(k,l)]; treediskl:=d[ijfunction(k,l)]; sum1:=avedisij+diskl; treesum1:=treedisij+treediskl; sum2:=avedisik+avedisjl; treesum2:=treedisik+treedisjl; sum3:=avedisil+avedisjk; treesum3:=treedisil+treedisjk; (* if tree structure is ((i j)(k l)), don't use to estimate feat. for i&j *) if not((treesum10) then estimate:=estimate/(count*2); if (nmarkedthreshold) then begin nmarked:=nmarked+1; (* re-use array c to sort marked segments *) c[nmarked]:=nmarked; est[nmarked]:=estimate; row[nmarked]:=branch1; col[nmarked]:=branch2; end end else begin writeln(outfile,'WARNING: too many marked features found;', ' increase threshold and re-run'); writeln(outfile) end; end; (* loop on children of i and j *) end; (* loop on j *) end; (* loop on i *) if (nmarkedlinesize)) then begin writeln(outfile); column:=18+field; for l:=1 to column do write(outfile,' ') end; write(outfile,j:3); column:=column+3; end; write(outfile,' ]'); column:=column+1; end; writeln(outfile); end; writeln(outfile); writeln(outfile); (* find cliques & eliminate redundant features *) writeln(outfile, 'checking for cliques & redundant patterns of marked features'); if (nmarked>thriceobject) then nmarked:=thriceobject; findbestset end; (* if nmarked > 0 *) writeln(outfile); writeln(outfile); end; (* findfeatures *) (*********************************************************************) (* lsq: find least-squares solution of model equations *) (*********************************************************************) procedure lsq(var xvar: nodearray); var ata: array[1..thriceobject,1..thriceobject] of real; arow,perm: array[1..thriceobject] of integer; atb,scale: array[1..thriceobject] of real; rowmax,big,sizepivot: real; pivotrow,hold: integer; sum,pivot,rowfactor: real; numparms,kk: integer; usetwo: boolean; (* set up one row of model matrix, a, and use to make aTa *) procedure setup; begin (* can't estimate a features shared by both children of root *) if ((use[child[rootnode,1]]=true) and (use[child[rootnode,2]]=true)) then begin usetwo:=true; use[child[rootnode,2]]:=false; setzero:=setzero+1 end else usetwo:=false; numparms:=numfeat-setzero; (* initialize for setup of A'A matrix and A'b array *) for k:=1 to numparms do begin for l:=1 to numparms do ata[k,l]:=0.0; atb[k]:=0.0; end; (* now loop on each distance, ij, to see which parameters are involved *) ij:=0; for i:=2 to numobject do for j:=1 to (i-1) do begin ij:=ij+1; (* first set up tree features *) for l:=1 to numfeat do arow[l]:=0; kk:=0; for k:=1 to numnode do if (use[k]=true) then begin kk:=kk+1; if (((i in branchset[k]) and not(j in branchset[k])) or ((j in branchset[k]) and not(i in branchset[k]))) then arow[kk]:=1 else arow[kk]:=0; end; (* now set up marked features *) for k:=1 to (numfeat-numnode) do if (use[k+numnode]=true) then begin kk:=kk+1; arow[kk]:=0; for l:=1 to numberhosts[k] do if ((spillover[host[k,l]]=true) and (((i in branchset[host[k,l]]) and not(j in branchset[host[k,l]])) or ((j in branchset[host[k,l]]) and not(i in branchset[host[k,l]])))) then arow[kk]:=1; for l:=1 to (numberhosts[k]-1) do for m:=(l+1) to numberhosts[k] do if (((i in branchset[host[k,l]]) and (j in branchset[host[k,m]])) or((j in branchset[host[k,l]]) and (i in branchset[host[k,m]]))) then arow[kk]:=arow[kk]-2; end; (* now use both tree and marked features to make A'A matrix *) (* also set up A'b array *) for k:=1 to numparms do begin for l:=1 to numparms do ata[k,l]:=ata[k,l]+(arow[k]*arow[l]); atb[k]:=atb[k]+(prox[ij]*arow[k]) end; end; (* loop on ij *) end; (* setup *) procedure eliminate; begin (* invert ata *) for i:=1 to numparms do begin perm[i]:=i; rowmax:=0.0; for j:=1 to numparms do if (rowmax0.0) then scale[i]:=1.0/rowmax else writeln(outfile,' zero row encountered',i:4); end; for k:=1 to (numparms-1) do begin big:=0.0; for i:=k to numparms do begin sizepivot:=abs(ata[perm[i],k]*scale[perm[i]]); if (sizepivot>big) then begin big:=sizepivot; pivotrow:=i end; end; if (big<=0.0) then writeln(outfile,' zero pivot, col:',k:4); if (pivotrow<>k) then begin hold:=perm[k]; perm[k]:=perm[pivotrow]; perm[pivotrow]:=hold end; pivot:=ata[perm[k],k]; for i:=(k+1) to numparms do begin rowfactor:=ata[perm[i],k]/pivot; ata[perm[i],k]:=rowfactor; for j:=(k+1) to numparms do ata[perm[i],j]:=ata[perm[i],j]-rowfactor*ata[perm[k],j]; end; end; (* loop on k *) if (ata[perm[numparms],numparms]=0.0) then writeln(outfile,' last pivot=0'); end; (* eliminate *) procedure backsubstitute; begin xvar[1]:=atb[perm[1]]; for i:=2 to numparms do begin sum:=0.0; for j:=1 to (i-1) do sum:=sum+ata[perm[i],j]*xvar[j]; xvar[i]:=atb[perm[i]]-sum; end; xvar[numparms]:=xvar[numparms]/ata[perm[numparms],numparms]; for i:=(numparms-1) downto 1 do begin sum:=0.0; for j:=(i+1) to numparms do sum:=sum+ata[perm[i],j]*xvar[j]; xvar[i]:=(xvar[i]-sum)/ata[perm[i],i]; end; end; (* backsubstitute *) procedure adjustlengths; var h1,h2: real; begin zeroleft:=setzero; for k:=numfeat downto 1 do if (use[k]=true) then length[k]:=length[k-zeroleft] else begin length[k]:=0.0; zeroleft:=zeroleft-1 end; for i:=(numobject+1) to numnode do if (spillover[i]=true) then for j:=1 to (numfeat-numnode) do for k:=1 to numberhosts[j] do if (host[j,k]=i) then length[i]:=length[i]+length[j+numnode]; rootarclength:=length[child[rootnode,1]]+length[child[rootnode,2]]; length[child[rootnode,1]]:=saveroot*rootarclength; length[child[rootnode,2]]:=(1.0-saveroot)*rootarclength; h1:=height[child[rootnode,1]]+length[child[rootnode,1]]; h2:=height[child[rootnode,2]]+length[child[rootnode,2]]; if (h1>h2) then height[rootnode]:=h1 else height[rootnode]:=h2; if (usetwo=true) then begin use[child[rootnode,2]]:=true; setzero:=setzero-1 end; end; (* adjustlengths *) begin (* lsq *) setup; eliminate; backsubstitute; adjustlengths; end; (* lsq *) (*********************************************************************) (* checkspillover: check for small and negative parameter estimates *) (*********************************************************************) procedure checkspillover; var markedfsum: array[1..thriceobject] of real; branch,which,markedf: integer; thresh: real; begin flagzero:=false; (* check if marked features overflow their host arc *) for branch:=1 to numnode do markedfsum[branch]:=0.0; for markedf:=1 to nummarked do if ((use[markedf+numnode]=true) and (length[markedf+numnode]>=threshold)) then for which:=1 to numberhosts[markedf] do begin branch:=host[markedf,which]; markedfsum[branch]:=markedfsum[branch]+length[markedf+numnode]; end; write(outfile,'spillover of marked features on arcs: '); for branch:=(numobject+1) to numnode do if (markedfsum[branch]>length[branch]) then begin spillover[branch]:=true; write(outfile,branch:4); if (use[branch]=true) then begin flagzero:=true; setzero:=setzero+1; use[branch]:=false; end; end; writeln(outfile); (* eliminate interior tree arcs & marked features smaller than threshold *) write(outfile,'features smaller than threshold: '); for i:=(numobject+1) to numfeat do begin if (i<=numnode) then thresh:=tthreshold else thresh:=threshold; if ((use[i]=true) and (length[i]0) then fld:=namelength else fld:=3; for feature:=1 to nummarked do if use[feature+numnode] then begin writeln(outfile); write(outfile,' ',symbol[feature],' [ '); column:=12; for j:=1 to numberhosts[feature] do for k:=1 to numobject do if (k in branchset[host[feature,j]]) then begin if ((column+fld)>linesize) then begin writeln(outfile); column:=12; for l:=1 to column do write(outfile,' ') end; if (namelength=0) then begin write(outfile,k:3); column:=column+3 end else for l:=1 to namelength do if (name[k,l]<>' ') then begin write(outfile,name[k,l]); column:=column+1 end; write(outfile,', '); column:=column+2; end; writeln(outfile,']'); end; end; (* describemarked *) (****************************************************************************) (**************** start of main program ***********************************) (****************************************************************************) begin (* main *) writeln; writeln('extree version 1.5'); writeln; (* VAX VMS PASCAL FILE DEFINITION STATEMENTS *) (* open(infile,'extree.raw',old); *) (* open(outfile,'extree.out',new); *) (* PC TURBO PASCAL FILE DEFINITION STATEMENTS *) writeln('enter input file name:'); readln(infilnam); writeln('enter output file name:'); readln(outfilnam); assign(infile,infilnam); assign(outfile,outfilnam); (* Turbo PASCAL compiler directive to increase stack size *) {$M 65520,0,655360} reset(infile); rewrite(outfile); writeln; writeln('reading data'); readin; if (lowerhalf and printdata) then begin writeln(outfile,'data matrix:'); writematrix(prox,numobject,field,decplaces); end; chkmetric; if printtransformed (* prox has been transformed to satisfy m. axioms *) then begin writeln(outfile,'transformed data distances:'); writematrix(prox,numobject,field,decplaces); end; initialize; write('fitting tree..'); (***** begin addtree section *********************************************) while (n>=4) do begin if (specifytree=false) then begin (* find objects (branches) to be joined *) quadruples; numgroup:=0; nreduce:=0; findgroups; end else begin (* only if user-specified tree structure *) readln(infile,i,j); numgroup:=1; nreduce:=0; group[1]:=[]; for int:=1 to n do if ((index[int]=i) or (index[int]=j)) then group[1]:=group[1]+[int]; end; for int:=1 to n do use[int]:=true; for whichgroup:=1 to numgroup do makenode; cleanup; end; (* end main addtree loop ****************************************************) (* choose a root for the tree (there are now only 2 or 3 nodes left) *) if (n=3) then begin numgroup:=1; whichgroup:=1; if (height[index[1]]>height[index[2]]) and (height[index[1]]>height[index[3]]) then group[1]:=[2]+[3]; if (height[index[2]]>height[index[1]]) and (height[index[2]]>height[index[3]]) then group[1]:=[1]+[3]; if (height[index[3]]>height[index[1]]) and (height[index[3]]>height[index[2]]) then group[1]:=[1]+[2]; for int:=1 to 3 do use[int]:=true; makenode; cleanup; end; writeln; (* now n=2 *) makeroot; hangroot; if not(rooting=asis) then begin findbestroot; (* specified rooting or minvarroot *) hangroot end; (* end root-choosing section *) numfeat:=numnode; report; (* draw addtree *) writeln('calculating fit statistics'); stats; (* & report fit statistics *) (****** begin extree section ************************************************) writeln(outfile); writeln(outfile); writeln(outfile,'extree analysis:'); writeln(outfile); if not(specifymarked) then writeln(outfile,nummarked:4,' marked features will be tried'); writeln(outfile,'features smaller than ',threshold:field:decplaces, ' will be eliminated'); writeln(outfile); for int:=1 to numnode do use[int]:=true; for int:=(numnode+1) to thriceobject do use[int]:=false; write('estimating marked segments...'); if not(specifymarked) then findfeatures (* check & estimate marked features *) else begin (* (unless they are to be specified by user) *) numgroup:=0; repeat readln(infile,ch) until ch='*'; while not(eof(infile)) do begin numgroup:=numgroup+1; int:=0; while not(eoln(infile)) do begin int:=int+1; read(infile,host[numgroup,int]); end; numberhosts[numgroup]:=int; use[numgroup+numnode]:=true; if (numberhosts[numgroup]<2) then writeln(outfile,'error in reading marked feature # ',numgroup:3); readln(infile); end; nummarked:=numgroup; end; writeln; (* describe initial set of marked features *) if (nummarked>0) then begin for i:=1 to nummarked do begin writeln(outfile,'feature ',symbol[i]); for j:=1 to numberhosts[i] do begin write(outfile,host[i,j]:4); write(outfile,' ('); for k:=1 to numobject do if (k in branchset[host[i,j]]) then write(outfile,k:3); writeln(outfile,' )'); end; end; writeln(outfile); writeln(outfile); numfeat:=numfeat+nummarked; for int:=1 to numnode do spillover[int]:=false; use[rootnode]:=false; setzero:=1; writeln('simultaneous estimation of parameters'); repeat (* estimate & prune: loop till all feature estimates >= 0.0 *) iteration:=iteration+1; writeln(outfile); writeln(outfile); writeln(outfile,'iteration:',iteration:4); writeln(outfile); lsq(length); checkspillover; until (flagzero=false); for int:=1 to numobject do length[int]:=length[int]+maxspillover; report; (* draw extree *) writeln('calculating fit statistics'); stats; (* & report fit statistics *) end; (* if nummarked > 0 *) (* describe any marked features that have been retained *) if (nummarked>0) then describemarked; writeln(outfile); writeln(outfile); writeln(outfile); writeln(outfile); if printmodeldis (* note that array d now contains tree distances *) then begin writeln(outfile,'model distances:'); writematrix(d,numobject,field,decplaces); end; if printresiduals (* note that array d now contains tree distances *) then begin for int:=1 to numprox do prox[int]:=prox[int]-d[int]+newconstant; writeln(outfile,'residual distances:'); writematrix(prox,numobject,field,decplaces); end; close(infile); close(outfile); writeln; writeln('end extree run'); end. (* main *)