C hiclust.f C This software implements the method called HICLUST. C The author of this software is Stephen C Johnson. C Note that this program is DIFFERENT from the same-named program in C icicle.shar, though the two programs are very similar C Copyright (c) 1993 by AT&T. C Permission to use, copy, modify, and distribute this software for any C purpose without fee is hereby granted, provided that this entire C notice is included in all copies of any software which is or includes C a copy or modification of this software and in all copies of the C supporting documentation for such software. C THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMP- C LIED WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR AT&T MAKE ANY C REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY C OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. C This software comes from the FIRST MDS Package of AT&T Bell Laboratories C For explanation of the method this software implements, see C "Hierarchical Clustering Schemes" by Stephen C Johnson in C Psychometrika, 1967, vol. 32, no. 3, pp 241-254 C----+----@----+----@----+----@----+----@----+----@----+----@----+----@ C HIERARCHICAL CLUSTERING REVISITED C NOTE -- THIS TAKES 28 K ON THE GE 645--MODEL A C SMALLER DIMENSIONS, LESS CORE. C THE GE 645 HAS 36 BIT WORDS C IF YOUR MACHINE HAS LONGER OR SHORTER WORDS, YOU WILL C WANT TO MODIFY ARRAY FORM AND THE ASSOCIATED READ STATEMENT C ACCORDINGLY C THIS WILL TREAT UP TO 100 BY 100 DATA INPUT MATRICES C IF YOU CANT MORE, YOU WILL HAVE TO CHANGE THE OUTPUT ROUTINES C AND POSSIBLY NEED SECONDARY STORAGE C CURRENTLY, THE MAXIMUM AND MINIMUM METHODS ARE OUTPUTED C OTHER METHODS (SUCH AS AVERAGEING) MAY BE ADDED C I HAVE TRIED TO INDICATE WHERE YOU WOULD ADD THESE DIMENSION D(100,100), FORM(11), E(100,100) DIMENSIONXX(100),LK(100),LKI(100),LKJ(100),IP(200) DIMENSIONP(200) LOGICAL W(100) DIMENSIONSIG(100) DATA XXXX/1HX/, BLANK/1H /, PERIOD/1H./ C READ IN N = NUMBER OF VARIABLES C IS = +1 IF MATRIX IS MATRIX OF DISTANCES C -1 IF MATRIX IS MATRIX OF PROXIMITIES 10 READ11,N,IS C IF N = NEGATIVE OR 0, STOP IF (N .LE. 0) STOP S = IS N1=N-1 N2=N1-1 XN=N C READ IN FORMAT READ12,(FORM(I),I=1,11) E(1,1)=0.0 C READ IN TRIANGULAR MATRIX WITHOUT DIAGONAL C MAKE IT INTO A SYMMETRIC ARRAY WITH 0 DIAG., E(I,J) DO20I=2,N E(I,I)=0.0 I1=I-1 READFORM,(E(I,J),J=1,I1) DO20J=1,I1 20 E(J,I)=E(I,J) C **** NOTE **** C THIS IS EXPERIMENTAL AND MAY BE OMMITTED C COMPUTE SUMS, GAMMAS, AND SIGMA FOR EACH I FORM 2 TO N-1 C S1 = SUM OF ENTRIES, SQUARED C S2 = SUM OF SQUARED ROW SUMS C C S3 EQUALS SUM OF SQUARES S1=0. S2=0. S3=0. DO30I=1,N T=0. DO25J=1,N S3=S3+E(I,J)**2 25 T=T+E(I,J) S2=S2+T**2 30 S1=S1+T S1=S1**2 DO40I=2,N1 XK=I T1=(XK-3.)/(XK-1.)+(XK-1.)/(XN-XK) T2=(XN-2.)/(XN-XK)-4./(XK-1.) T3=2./(XK-1.)+1./(XN-XK) XX3=XN*(XN-1.)*XK XX2=XX3*(XN-2.) XX1=XX2*(XN-3.) G1=(-T1)/XX1 G2=T2/XX2-4.*G1 G3=T3/XX3-G2-2.*G1 40 SIG(I)=S*SQRT(S1*G1+S2*G2+S3*G3) C **** END OF EXPERIMENTAL SECTION C INITIALIZE TO FIRST METHOD K=1 C START A METHOD C PRINT A TITLE 50 GOTO(51,52),K 51 PRINT61 GOTO98 52 PRINT62 C SET W(I) = .FALSE. C LK(I)=0 C W(I) = .TRUE. IF LI I HAS BEEN MERGED WITH SOME OTHER PT C LK(I) = LIST OF PTS MERGED, FOR OUTPUT PURPOSES---SEE BELOW 98 DO99I=1,N W(I) = .FALSE. LK(I)=0 C COPY OFF THE ORIGINAL MATRIX INTO TEMPORARY STORAGE C **** NOTE **** C YOU MAY NOT WANT TO DO THIS IF PRESSED FOR SPACE C YOU COULD READ THE DATA DIRECTLY INTO THE D ARRAY, AND NOT C NEED THE SPACE FOR THE E ARRAY AT ALL C THEN USING TWO METHODS WOULD REQUIRE READING THE DATA TWICE DO99J=1,N 99 D(I,J)=E(I,J) C II COUNTS THE NUMBER OF PAIRS MERGED DO181II=1,N1 C FIND MIN PAIR DISTANCE C FIRST FIND TWO UNMERGED POINTS -- INITIALIZE X C X IS THE RUNNING MINIMUM DISTANCE 100 DO102NI=2,N IF (W(NI)) GO TO 102 NI1=NI-1 DO101NJ=1,NI1 101 IF (.NOT. W(NJ)) GO TO 104 102 CONTINUE C MUST BE ONE LEFT STOP 104 X=D(NI,NJ) C BEGIN THE REAL BUSINESS OF FINDING THE CLOSEST PAIR C ONLY LOOK AT UNMERGED POINTS DO150I=2,N IF (W(I)) GO TO 150 I1 = I-1 DO149J=1,I1 IF (W(J)) GO TO 149 C THIS IS THE CENTRAL COMPARE STATEMENT IF((D(I,J)-X)*S .GT. 0.0) GO TO 149 C WE HAVE FOUND A CLOSER PAIR -- UPDATE X, NI, NJ NI = I NJ=J X=D(I,J) 149 CONTINUE 150 CONTINUE C WE HAVE NOW LOOKED AT EVERY PAIR OF POINTS -- C STORE X IN XX, NI AND NJ IN ARRAYS LKI AND LKJ XX(II)=X LKI(II)=NI LKJ(II)=NJ C POINT NJ IS CONSIDERED 'MERGED', WHERE BY THIS IS MEANT THAT C IT NO LONGER POSSESSES A SEPARATE ROW OF THE MATRIX, C BUT IS ENTIRELY SUBSUMED TO NI, (WHICH IS LARGER). W(NJ) = .TRUE. C SELECT UNMERGED POINTS, L, WHICH ARE NOT NEITHER NI NOR NJ DO179L=1,N IF (W(L)) GO TO 179 IF (L .EQ. NI) GO TO 179 C BRANCH ACCORDING TO METHOD, AND FIND D(NI,L) AND D(L,NI) 160 GOTO(200,300),K C *****MINIMUM METHOD***** C D(NI,L)= MIN(D(NI,L),D(NJ,L)) 200 IF((D(NI,L)-D(NJ,L))*S .LE. 0.0) GO TO 170 D(NI,L)=D(NJ,L) D(L,NI)=D(L,NJ) GOTO170 C *****MAXIMUM METHOD***** C D(NI,L)= MAX(D(NI,L),D(NJ,L)) 300 IF((D(NI,L)-D(NJ,L))*S .GE. 0.0) GO TO 170 D(NI,L)=D(NJ,L) D(L,NI)=D(L,NJ) 170 CONTINUE C GO GET MORE L'S 179 CONTINUE C NOW GO BACK AND DO AGAIN USING NEW D ARRAY --(ITERATE) 181 CONTINUE C ITERATION COMPLETE -- BEG IN PRINTOUT ROUTINE C WE NOW HAVE C 1. LKI ARRAY C 2. LKJ ARRAY C THESE CONTANN SUCCESSIVE LINKS MERGED AT EACH C ITERATI ON C LKI(I) IS THE LARGER OF THE TWO C LKJ(I) IS THE SMALLER OF THE TWO, THE ONE THAT IS C MERGED (ELIMINATED) AT STEP I C 3. XX(I) = THE SIZE (VALUE) OF THE LINK MERGED AT STEP I C ARRANGE THE LINKS IN ORDER FOR PRINTING OUT C FIRST PUT THE LAST TWO POINTS CONNECTED INTO THE LK ARRAY LK(1)=LKJ(N1) LK(2)=LKI(N1) C THEN WORK BACKWARDS -- PICK A LINKAGE. THE LARGER MEMBER OF C THIS GROUP MUST BE ALREADY PLACED IN THE LK LIST. C MOVE THE ENTRIES IN THE LK LIST OVER ONE, AND 'MAKE C ROOM' FOR THE NEW ENTRY, WHICH GOES IMMEDIATELY TO C THE LEFT OF THE LARGER ELEMENT. C IF THESE COMMENTS ARE NOT CLEAR, IT IS IN THE COMBIN- C ATORIAL NATURE OF THE SUBJECT THAT THIS BE SO, C AND WORKING THROUGH A SIMPLE EXAMPLE SHOULD BE C VERY EDIFYING. IM DOING THE BEST I CAN. DO550II=2,N1 JJ=N-II C FIND THE LKI ENTRY IN THE LK LIST DO510KK=1,II 510 IF(LKI(JJ) .EQ. LK(KK)) GO TO 515 C IT MUST BE SOMEWHERE STOP C SHOVE THINGS OVER AND PUT THE LKJ ENTRY IN THE LK LIST 515 DO530IK=KK,II KI=II+KK-IK 530 LK(KI+1)=LK(KI) 550 LK(KK)=LKJ(JJ) C OVER AND OVER IN THE DO LOOP UNTIL ALL IS DONE C THE LK ORDER IS THE ORDER IN WHICH THE FINAL OUTPUT WILL C BE ARRANGED C INDEX THROUGH THE LKI AND LKJ ARRAYS, USING I DO585I=1,N1 C FIND LKI(I) IN THE LK ARRAY-- THE INDEX IS JI DO565JI=1,N 565 IF(LK(JI) .EQ. LKI(I)) GO TO 570 C IT MUST BE SOMEWHERE STOP C FIND LKJ(I) IN THE LK ARRAY-- THE INDEX IS JJ 570 DO575JJ=1,N 575 IF(LK(JJ) .EQ. LKJ(I)) GO TO 580 C IT MUST BE SOMEWHERE STOP 580 LKI(I)=JI LKJ(I)=JJ 585 CONTINUE C LKI(II)AND LKJ(II)NOW CONTAIN THE ORDINAL POSITIONS OF THE II LINK C THE FOLLOWING CODE MAY BE DONE MORE THAN ONCE, DEPENDING ON C HOW LARGE N IS -- IF N IS MORE THAN 50, THE CODE IS DONE C TWICE. THE ONLY DIFFERENCE BETWEEN THE TIMES IS WHICH C PROTION OF THE IP AND P ARRAYS IS PRINTED OUT -- OTHERWISE C THE SITUATION IS IDENTICAL. C M1 = STARTING I VALUE C M2 = FINAL I VALUE C IP1 = STARTING P VALUE C IP2 = FINAL P VALUE C THE FIRST TIME, C M1 = 1, IP1=1, M2=MIN(50,N), IP2=100 OR 2*N-1, DEPENDING C AS N GREATER THAN 50, OR NOT C THE SECOND TIME(WHEN N GREATER THAN 50), C M1=51, M2=N, IP1=100, IP2=2*N-1 C SET UP FIRST TIME 700 M1=1 IP1=1 IF (N .GT. 50) GO TO 710 M2 = N ISAIL=1 IP2=2*N-1 GOTO800 710 M2=50 ISAIL=2 IP2=100 GOTO800 C SET UP SECOND TIME 750 PRINT751 ISAIL=1 M1=51 M2=N IP1=100 IP2=2*N-1 C PRINT FIRST DIGITS 800 DO551I=M1,M2 551 IP(I)=LK(I)/10 PRINT553,(IP(I),I=M1,M2) C PRINT SECOND DIGITS DO552I=M1,M2 552 IP(I)=LK(I)-10*IP(I) PRINT553,(IP(I),I=M1,M2) PRINT554 C INITIALIZE THE P ARRAY BY FILLING WITH BLANKS AND PERIODS C THE PERIODS REPRESENT VARIABLES AS YET UNCLUSTERED -- THE C BLANKS ARE FOR SPACING DO560I=1,199,2 P(I+1)=BLANK 560 P(I)=PERIOD C INDEX THROUGH THE MERGES, USIGG I DO600I=1,N1 C COMPUTE THE APPROPRIATE PLACES IN THE P ARRAY JI=2*LKI(I)-1 JJ=2*LKJ(I)-1 C F+LL IN XS BETWEEN JI AND JJ IN THE P ARRAY DO590KK=JJ,JI 590 P(KK)=XXXX C IF NEXT ONE IS CLUSTERED AT SAME LEVEL, DONT PRINT YET IF ((XX(I) .EQ. XX(I+1)) .AND. (I .NE. N1)) GO TO 600 C PRINT OUT THE P ARRAY 595 PRINT596,XX(I),(P(III),III=IP1,IP2) 600 CONTINUE G OTO(900,750),ISAIL 900 PRINT901 C **** NOTE **** C THIS IS EXPERIMENTAL AND MAY BE OMMITTED C EXPAND BY CLUSTERS AND COMPUTE THE Z SCORES C CLEAR IP C IP HAS THE CLUSTER NUMBER DO910I=1,N 910 IP(I)=0 C LOOK AT THE LINKS, INDEXED BY I DO950I=1,N2 LI=LKI(I) LJ=LKJ(I) IP1=IP(LJ) IP2=IP(LI) C LI AND LJ LINK CLUSTERS NUMBERED IP1 AND IP2 C SET ALL IP1 AND IP2 ELEMENTS, AND THOSE BETWEEN LJ AND LI, TO I. DO912II=LJ,LI 912 IP(II)=I IF(IP1.EQ.0)GOTO920 915 IF(LJ.EQ.1)GOTO920 LJ=LJ-1 IF(IP(LJ).NE.IP1)GOTO919 IP(LJ)=I GOTO915 919 LJ=LJ+1 920 LKJ(I)=LJ IF(IP2.EQ.0)GOTO930 925 IF(LI.EQ.N)GOTO930 LI=LI+1 IF(IP(LI).NE.IP2)GOTO929 IP(LI)=I GOTO925 929 LI=LI-1 930 LKI(I)=LI C LI AND LJ NOW HAVE THE TOTAL SPAN OF THE CLUSTER C COMPUTE THE STATISTIC AND DIVIDE BY THE CORRECT SIGMA SS=0. ST=0. DO960II=LJ,LI LLI=LK(II) DO961JJ=1,N 961 SS=SS+E(JJ,LLI) DO962JJ=LJ,LI LLJ=LK(JJ) 962 ST=ST+E(LLI,LLJ) 960 CONTINUE SS=SS-ST KXK=LI-LJ+1 XK=KXK XD1=(XN-XK)*XK XD2=XK*(XK-1.) VAL=(SS/XD1-ST/XD2)/SIG(KXK) PRINT970,KXK,VAL,(LK(II),II=LJ,LI) 970 FORMAT(//1X,I5,5H PTS.,F25.10,/,(/,1X,25I4)) 950 CONTINUE C **** NOTE **** C THIS STATISTIC HAS NO VALIDITY IF THE DATA HAS ONLY RANK ORDER C STRUCTURE. IT SHOULD BE DISREGARDED IN THIS CASE C IF YOU TRANSFORM YOUR DATA INTO RANKS, HOWEVER, THESE VALUES C MAKE SENSE. C **** END OF EXPERIMENTAL SECTION **** C GOTO NEXT METHOD -- IF DONE, GO READ ANOTHER N K=K+1 IF (K .EQ. 3) GO TO 10 GO TO 50 11 FORMAT(I3,I2) 12 FORMAT(11A6) 61 FORMAT(21H1CONNECTEDNESS METHOD) 62 FORMAT(16H1DIAMETER METHOD) 751 FORMAT(10H1CONTINUED) 553 FORMAT(20X,50(1X,I1)) 554 FORMAT(//) 596 FORMAT(1X,E16.8,4X,100A1) 901 FORMAT(//,14H END OF METHOD/1H1) END