#!/bin/sh
# Remove HTML markup from stdin, or a list of files, sending the
# output to stdout with line breaks preserved. This is useful before
# processing .html files with spelling checkers, doubled-word finders,
# grammar checkers, etc.
#
# Usage:
# dehtml <infile >outfile
# dehtml file1 file2 ... >outfile
#
# Only simple pattern matching is used on the input; in particular,
# angle brackets that do not delimit HTML tags may be eliminated.
# [02-Sep-1997] -- update to handle a few SGML &xyz; entities
# [07-Apr-1995] -- original version
# dehtml [file ...]
#
# Strip HTML tags and decode a few character entities, one input line at
# a time, so line breaks are preserved.
#
# Arguments: zero or more input files; with none, stdin is read.
# Outputs:   the filtered text on stdout.
# Returns:   the exit status of sed.
#
# Limitations visible in the patterns below:
#   - a tag must open and close on the same line and start with a letter
#     (optionally preceded by "/"), so <!-- comments --> and <!DOCTYPE ...>
#     pass through untouched;
#   - only all-lowercase named entities are recognized; numeric ones such
#     as &#160; and mixed-case ones such as &Eacute; pass through untouched.
dehtml() {
  # sed is looked up via PATH rather than hard-coded as /bin/sed, because
  # not every system installs it in /bin.
  #
  # "&amp;" is first parked as a newline, a character that cannot otherwise
  # occur inside sed's one-line pattern space, and is turned into "&" only
  # after every other entity rule has run.  Decoding it up front would let
  # the "&" it produces be decoded a second time ("&amp;quot;" would come
  # out as '"' instead of "&quot;").
  #
  # The replacement newline is spelled backslash + literal newline, which
  # is the portable form; that is why "@g" starts in column 0 below.
  #
  # "--" keeps file names that begin with "-" from being taken as options.
  sed -e '
    s@<[/]*[A-Za-z][A-Za-z0-9]*[^>]*>@@g
    s@[&]amp;@\
@g
    s@[&]nbsp;@ @g
    s@[&]quot;@"@g
    s@[&][a-z][a-z]*;@ @g
    s@\n@\&@g
  ' -- "$@"
}

dehtml "$@"