#!/usr/local/bin/perl
#
# Removes HTML markup commands from a text file.  Reads from a file
# and writes to stdout.
#
# Usage: filterHTML <filename>
#
# The input is consumed one "document" at a time, where documents are
# separated by a blank line ("\n\n").  For each document:
#   1. runs of newlines are collapsed to a single space,
#   2. every <...> markup sequence is replaced by a single space,
#   3. the remaining words are re-flowed onto output lines and the
#      document is terminated by a blank line.
#

use strict;
use warnings;

#
# Force a flush after every write or print
#
$| = 1;

my $filename = $ARGV[0];
die("Usage: filterHTML <filename>\n")
    unless defined $filename && length $filename;

#
# Open the file (three-argument form so that characters such as '>' or
# '|' in the file name can never be interpreted as an open mode)
#
open(my $in, '<', $filename)
    or die("Couldn't open $filename for reading: $!\n");

{
    #
    # Read in one document at a time; localized so the record separator
    # is restored once the read loop is finished
    #
    local $/ = "\n\n";

    while (defined(my $doc = <$in>)) {
        #
        # Remove newlines
        #
        $doc =~ s#\n+# #g;

        #
        # Remove HTML markups
        #
        $doc =~ s#<[^>]*># #g;

        #
        # Print the document, followed by two newlines (to separate
        # documents).  split ' ' skips leading whitespace, so a document
        # that begins with a removed tag does not yield an empty first word.
        #
        my @words = split(' ', $doc);
        my $len   = 0;
        foreach my $word (@words) {
            # Only word characters are counted (not the separating
            # spaces); once the running total passes 60 a line break is
            # emitted before the current word and the count restarts at 0.
            $len += length($word);
            if ($len > 60) {
                print("\n");
                $len = 0;
            }
            print("$word ");
        }
        print("\n\n");
    }
}

close($in);