#!/usr/local/bin/perl
#
# Removes HTML markup commands from a textfile. Reads from a file
# and writes to stdout.
#
# Usage: filterHTML <filename>
#
#
#
# Catch typos and undeclared variables at compile time
#
use strict;
use warnings;
#
# Force a flush after every write or print
#
$| = 1;
#
# Read in one document at a time: a record ends at the first blank line
# (two consecutive newlines), so each read returns one whole document.
#
$/ = "\n\n";
#
# The file to filter is the first command-line argument
#
my $filename = $ARGV[0];
die("Usage: $0 filename\n")
    unless defined($filename) && length($filename);
#
# Open the file for reading. The three-argument form keeps characters
# in the filename (such as a leading '>' or a trailing '|') from being
# interpreted as an open mode.
#
open(my $infile, '<', $filename)
    || die("Couldn't open $filename for reading: $!\n");
while (my $doc = <$infile>)
{
    #
    # Remove newlines so that markup spanning several lines of the
    # same document is matched by the substitution below
    #
    $doc =~ s#\n+# #g;
    #
    # Remove HTML markups: everything from '<' up to the next '>' is
    # replaced by a single space
    #
    $doc =~ s#<[^>]*># #g;
    #
    # Print the document, followed by two newlines (to separate documents).
    # Lines are wrapped approximately: a newline is emitted before the
    # word that takes the running count of word characters past 60.
    # Separating spaces, and the word that starts the new line, are not
    # counted, so output lines may be somewhat longer than 60 columns.
    #
    my @words = split(/\s+/, $doc);
    my $len = 0;
    foreach my $word (@words)
    {
        $len += length($word);
        if ($len > 60)
        {
            print("\n");
            $len = 0;
        }
        print("$word ");
    }
    print("\n\n");
}
close($infile);