#!/usr/local/bin/perl
#
# Extracts the title of each document. The title is contained within
# the HTML tags <TITLE> ... </TITLE> (matched case-insensitively).
#
# Usage: extractTitles <inFile> <outFile>
#
# Documents in <inFile> are separated by a blank line. Exactly one line is
# written to <outFile> per document: the document's title with any nested
# HTML tags removed, or an empty line when the document has no title.
#
use strict;
use warnings;

#
# Validate the command line
#
die("Usage: $0 <inFile> <outFile>\n") unless @ARGV >= 2;
my ($inFile, $outFile) = @ARGV;

#
# Read in one document at a time (records end at a blank line)
#
local $/ = "\n\n";

#
# Open the files. Three-argument open keeps characters such as '>' or '|'
# in a file name from being interpreted as an open mode.
#
open(my $in, '<', $inFile)
    or die("Couldn't open $inFile for reading: $!\n");
open(my $out, '>', $outFile)
    or die("Couldn't open $outFile for writing: $!\n");

while (my $doc = <$in>) {
    #
    # Remove newlines so a title that wraps across lines is matched
    #
    $doc =~ s#\n+# #g;

    #
    # Extract title. The title is reset for every document so a document
    # without one does not repeat the previous document's title, and the
    # match is non-greedy so it stops at the first closing tag.
    #
    my $title = '';
    if ($doc =~ m#<TITLE>(.*?)</TITLE>#i) {
        $title = $1;
    }

    #
    # Remove HTML tags from the title
    #
    $title =~ s#<[^>]*>##g;

    #
    # Write the title
    #
    print {$out} "$title\n"
        or die("Couldn't write to $outFile: $!\n");
}

close($in);
# Buffered write errors only surface when the output handle is closed.
close($out) or die("Couldn't close $outFile: $!\n");