### /u/sy/beebe/public_html/html-pretty.awk, Fri Apr 7 12:55:26 1995
### Edit by Nelson H. F. Beebe
### ====================================================================
### @Awk-file{
### author = "Nelson H. F. Beebe",
### version = "0.05",
### date = "17 April 1995",
### time = "09:34:07 MDT",
### filename = "html-pretty.awk",
### address = "Center for Scientific Computing
### Department of Mathematics
### University of Utah
### Salt Lake City, UT 84112
### USA",
### telephone = "+1 801 581 5254",
### FAX = "+1 801 581 4148",
### checksum = "18136 847 2880 24100",
### email = "beebe@math.utah.edu (Internet)",
### codetable = "ISO/ASCII",
### keywords = "HTML, prettyprinter, SGML, WWW, World-Wide Web",
### supported = "yes",
### docstring = "This is a simple HTML file prettyprinter.
###
### Usage:
### gawk -f html-pretty.awk [-v FILE=name]
### [-v INDENT=nnn] [-v MAXLINE=72]
### or
### nawk -f html-pretty.awk [-v FILE=name]
### [-v INDENT=nnn] [-v MAXLINE=72]
###
### The optional FILE value sets the input file
### name, to override the actual name, or in
### the event that input is from standard
### input.
###
### The optional INDENT value sets the default
### indentation (default: 4 spaces per level).
###
### The optional MAXLINE value sets the maximum
### line length (default: 72).
###
### The idea is that each <TAG> ... </TAG> pair
### should contain material that is indented
### according to the nesting depth of the tags.
### However, certain common tags, such as those
### to change fonts, are excluded from this
### indenting.
###
### As with any simple-minded prettyprinter,
### especially ones for natural language text,
### which rarely conforms to a strict grammar,
### you should check the output of the
### prettyprinter before replacing the input by
### it. In particular, if your input file
### follows the standards imposed by this
### prettyprinter, a file difference listing may
### be reassuring. Otherwise, try viewing the
### output with one or more World-Wide Web client
### programs (arena, hotjava, netscape, xmosaic,
### ...). If you have an SGML parser or
### translator, you can use that for checking
### too.
###
### TO DO:
###
### (1) More extensive testing, and perhaps a
### validation suite.
###
### The checksum field above contains a CRC-16
### checksum as the first value, followed by the
### equivalent of the standard UNIX wc (word
### count) utility output of lines, words, and
### characters. This is produced by Robert
### Solovay's checksum utility.",
### }
###
### ====================================================================
# Main program.  All processing is driven from the BEGIN rule: the
# input is consumed by do_input() rather than by pattern-action rules.
BEGIN {
    VERSION = "0.05";             # these MUST match version
    DATE    = "[17-Apr-1995]";    # and date above

    initialize();
    out_banner();
    do_input();
}

# Final cleanup, run when the program terminates.
END {
    wrapup()
}
# adjust_level(n): add n (which may be negative) to the global nesting
# level.  If the resulting level is below zero, report the problem
# through error().
function adjust_level(n)
{
    # global : level
    level = level + n
    if (level < 0)
    {
        error("negative indentation requested")
    }
}
# base_tag(t): reduce a tag to its canonical form: the opening "<" (and
# "/" for an end tag) plus the tag name in upper case, followed by ">".
# Any attributes following the tag name are dropped.  Tokens that
# iscomment() accepts are returned unchanged.
function base_tag(t)
{
    if (iscomment(t))
        return (t)              # HTML comment

    # The pattern is anchored and needs at least two characters, so a
    # successful match always starts at position 1; match() returns
    # that start position, or 0 on failure.
    if (match(t, /^<\/?[A-Za-z][A-Za-z0-9]*/) != 1)
        error("unmatched HTML tag")

    return (toupper(substr(t, 1, RLENGTH)) ">")
}
# check_for_omitted_tag(s): called with each tag token before it is
# output, to supply end-paragraph tags that the input omitted.
#
#   s    the tag token as read (possibly with key=value arguments)
#   tag  local: canonical form of s, as returned by base_tag()
function check_for_omitted_tag(s, tag)
{
    # SGML allows ending tags to be omitted if it can be determined
    # from context that they should be implicitly supplied.  This
    # unfortunate practice complicates construction of processing
    # tools like this prettyprinter.  The only one that we handle for
    # now is the common case of an omitted end-paragraph tag, </P>.

    # static : paragraph_level

    # print "DEBUG: line_buffer = [" line_buffer "]"

    tag = base_tag(s)   # discard optional key=value args in SGML tag

    if (tag in exclude)
        return
    # NOTE(review): the heading pattern below was reconstructed from the
    # "new section" comment; confirm <H1>..<H6> against the original.
    else if ((tag == "<P>") || (tag ~ /^<H[1-6]>$/))
    {                   # then at new paragraph or new section
        for ( ; paragraph_level > 0; paragraph_level--)
            out_tag("</P>")     # supply missing end-paragraph tags
        if (tag == "<P>")
            paragraph_level++
    }
    else if (tag == "</P>")
    {
        # An explicit end-paragraph tag must close an open paragraph.
        if (paragraph_level <= 0)
            error("negative paragraph level")
        paragraph_level--
    }
}
# do_input(): main processing loop.  Reads tokens until end of input
# and dispatches on the token kind:
#   - a token starting with "<" is a tag: check for omitted end tags,
#     then output it with out_tag();
#   - a "\n" token is a paragraph break: finish the current line, emit
#     a blank line, and start a freshly indented one;
#   - anything else is an ordinary word for out_word().
#
#   token  local: the current token
function do_input(    token)
{
    # global : EOF, input_line, last_k, last_token_was_tag
    for (;;)
    {
        token = get_token()

        # EOF is the number -1, but a text word that happens to be
        # exactly "-1" also compares equal to it, because awk compares
        # a string with a number as strings.  get_char() reports end
        # of input only when the line buffer is exhausted, so require
        # that state as well before stopping.
        # NOTE(review): this assumes unget_char() steps last_k back
        # after a word, so that a word never leaves the buffer
        # exhausted -- confirm against unget_char().
        if ((token == EOF) && (last_k >= length(input_line)))
            break

        # print "TOKEN: [" token "]" >"/dev/tty"
        if (substr(token,1,1) == "<")
        {
            check_for_omitted_tag(token)
            out_tag(token)
        }
        else if (token == "\n")
        {
            trim_line()
            if (line_length() > 0)
                newline()
            newline()
            out_indentation()
            last_token_was_tag = 0
        }
        else
            out_word(token)
    }
}
# error(msg): report a fatal problem on the terminal, in the form
#     FILE:FNR:msg: [consumed-part-of-current-line...]
# then call wrapup() and terminate with exit status 1.
function error(msg)
{
    # global : FILE, input_line, last_k

    # Show the part of the current input line consumed so far, to
    # help locate the trouble spot.
    print (FILE ":" FNR ":" msg ": [" substr(input_line, 1, last_k) "...]") \
        > "/dev/tty"

    # NOTE(review): exit normally triggers the END rule as well, which
    # also calls wrapup() -- confirm that wrapup() may run twice.
    wrapup()
    exit (1)
}
# get_char(): return the next input character, or EOF when no more
# input can be read.  Input is buffered one line at a time; a newline
# is appended to every line read, so line ends are seen as "\n".
function get_char()
{
    # global : EOF, input_line, last_k
    if (last_k >= length(input_line))
    {                           # buffer exhausted: fetch the next line
        if ((getline input_line) <= 0)
            return (EOF)        # end of file, or read error
        input_line = input_line "\n"
        last_k = 0
    }
    last_k++
    return (substr(input_line, last_k, 1))
}
# get_token(): return the next token from the input, which is one of
#   "\n"   a paragraph break (more than one newline was skipped),
#   EOF    end of input,
#   a tag  starting with "<", with its whitespace normalized, or
#   a word of ordinary text.
#
#   c, token  locals: current character, and the token to return
function get_token(    c, token)
{
    # adjacent newlines are unmarked-up parbreak
    if (skip_whitespace() > 1)
        return ("\n")

    c = get_char()
    if (c == EOF)
        token = EOF
    else if (c == "<")
        token = normalize_whitespace(get_tag(c))
    else
        token = get_word(c)
    return (token)
}
# get_tag(c): collect a complete tag.  c is the opening character,
# already read by the caller; further characters are appended up to and
# including the first ">".  If input ends first, the missing ">" is
# supplied.
#
#   token  local: the tag text accumulated so far
function get_tag(c,    token)
{
    token = c
    do
    {
        c = get_char()
        if (c == EOF)
            token = token ">"   # supply missing delimiter
        else
            token = token c
    } while ((c != ">") && (c != EOF))

    # Both outcomes return the tag unchanged.  The iscomment() call is
    # kept so that this function behaves exactly as before.
    if (iscomment(token))
        return (token)          # then possibly multiline comment
    return (token)
}
# get_word(c): collect a word of ordinary text.  c is its first
# character, already read by the caller.  Characters are appended until
# end of input, whitespace, or the "<" that opens a tag; a terminating
# whitespace or "<" character is pushed back with unget_char() for the
# next reader.
#
#   token  local: the word accumulated so far.  It is declared here so
#          that it no longer leaks into the global namespace.
function get_word(c,    token)
{
    token = c
    for (;;)
    {
        c = get_char()
        if (c == EOF)
            break
        if (iswhite(c) || (c == "<"))
        {
            unget_char()        # terminator is not part of the word
            break
        }
        token = token c
    }
    return (token)
}
# indentation_size(): return the indentation width for the current
# nesting depth, namely INDENT multiplied by level.
function indentation_size()
{
    # global : INDENT, level
    return (level * INDENT)
}
function initialize()
{
# Set defaults for command-line variables.
if (FILE == "")
FILE = FILENAME
if (FILE == "-")
FILE = ""
if (INDENT == "") INDENT = 4
if (MAXLINE == "") MAXLINE = 72
EOF = -1 # global end-of-file flag (opaque value)
# Tags in exclude[] do not cause indentation actions.
exclude[""] = 1
exclude[""] = 1
exclude[""] = 1
exclude[""] = 1
exclude[""] = 1
exclude[""] = 1
exclude[""] = 1
exclude[""] = 1
exclude[""] = 1
exclude[""] = 1
exclude[""] = 1
exclude[" "] = 1
exclude[""] = 1
exclude["
"] = 1
exclude[""] = 1
exclude[""] = 1
exclude[""] = 1
exclude[""] = 1
# Tags in list_tags[] get an extra level of indentation, so that
# embedded list item tags have their own level.
list_tags[""] = 2 # %HTML.Deprecated
list_tags[""] = 2 # %HTML.Deprecated
list_tags["