#!/usr/local/bin/perl
#
# Extracts the title of each document. The title is the text contained
# within the HTML tags <TITLE>...</TITLE>.
#
# Usage: extractTitles <infile> <outfile>
#
# The input file holds several documents separated by a blank line.
# One line is written to the output file per document: the document's
# title with any nested HTML tags removed, or an empty line when the
# document has no title.
#
# NOTE(review): the angle-bracket tokens of this script (the filehandle
# read and the tag names in the title pattern) were missing from the copy
# under review; they are reconstructed here from the header comment --
# confirm against the original script.
#

use strict;
use warnings;

die("Usage: extractTitles <infile> <outfile>\n") if @ARGV < 2;

my $inFile  = $ARGV[0];
my $outFile = $ARGV[1];

#
# Read in one document at a time (documents are separated by a blank line)
#
local $/ = "\n\n";

#
# Open the files. Three-argument open keeps special characters in a file
# name from being interpreted as an open mode.
#
open(my $in, '<', $inFile)
    || die("Couldn't open $inFile for reading: $!\n");
open(my $out, '>', $outFile)
    || die("Couldn't open $outFile for writing: $!\n");

while (defined(my $doc = <$in>)) {
    #
    # Remove newlines so that a title spanning several lines is matched
    #
    $doc =~ s#\n+# #g;

    #
    # Extract title. It is reset for every document so that a document
    # without a title does not repeat the previous document's title, and
    # the match is non-greedy so it stops at the first closing tag.
    #
    my $title = '';
    if ($doc =~ m#<TITLE>(.*?)</TITLE>#i) {
        $title = $1;
    }

    #
    # Remove HTML tags from the title
    #
    $title =~ s#<[^>]*>##g;

    #
    # Write the title
    #
    print {$out} "$title\n"
        or die("Couldn't write to $outFile: $!\n");
}

close($in);

# Buffered write errors only surface when the handle is closed.
close($out) || die("Couldn't close $outFile: $!\n");