#!/usr/local/bin/perl

#
# Removes HTML markup commands from a textfile.  Reads from a file
# and writes to stdout.
#
# Usage: filterHTML <file>
#

use strict;
use warnings;

#
# Force a flush after every write or print
#
$| = 1;

#
# Read in one document at a time: a document ends at the first pair of
# consecutive newlines (i.e. a blank line)
#
$/ = "\n\n";

#
# Maximum number of word characters printed on one output line.  The
# single space printed after each word is not counted toward this limit.
#
my $max_line_chars = 60;

#
# The file to filter is the first command-line argument
#
my $filename = $ARGV[0];
die("Usage: filterHTML <file>\n")
  unless defined($filename) && length($filename);

#
# Open the file for reading.  The three-argument form keeps characters
# such as '>' or '|' in the filename from being interpreted as an open
# mode.
#
open(my $infile, '<', $filename)
  || die("Couldn't open $filename for reading: $!\n");

while ( my $doc = <$infile> )
{
  #
  # Remove newlines, so that markup spanning several lines of the same
  # document is matched by the substitution below
  #
  $doc =~ s#\n+# #g;

  #
  # Remove HTML markups
  #
  $doc =~ s#<[^>]*># #g;

  #
  # Break the document into words.  Splitting on ' ' (rather than /\s+/)
  # skips leading whitespace, so a document that starts with a markup
  # command does not produce an empty first word.
  #
  my @words = split(' ', $doc);

  #
  # Print the document, wrapping lines at word boundaries, followed by
  # two newlines (to separate documents)
  #
  my $line_len = 0;
  foreach my $word (@words)
  {
    my $word_len = length($word);

    #
    # Start a new line when this word would push the current line over
    # the limit.  Never break on an empty line: a single over-long word
    # is printed on a line of its own instead of after a blank line.
    #
    if ($line_len > 0 && $line_len + $word_len > $max_line_chars)
    {
      print("\n");
      $line_len = 0;
    }

    print("$word ");

    #
    # Count the word toward the line it was actually printed on
    #
    $line_len += $word_len;
  }

  print("\n\n");
}

close($infile);