### /u/sy/beebe/public_html/html-pretty.awk, Fri Apr 7 12:55:26 1995
### Edit by Nelson H. F. Beebe
### ====================================================================
###  @Awk-file{
###     author          = "Nelson H. F. Beebe",
###     version         = "0.05",
###     date            = "17 April 1995",
###     time            = "09:34:07 MDT",
###     filename        = "html-pretty.awk",
###     address         = "Center for Scientific Computing
###                        Department of Mathematics
###                        University of Utah
###                        Salt Lake City, UT 84112
###                        USA",
###     telephone       = "+1 801 581 5254",
###     FAX             = "+1 801 581 4148",
###     checksum        = "18136 847 2880 24100",
###     email           = "beebe@math.utah.edu (Internet)",
###     codetable       = "ISO/ASCII",
###     keywords        = "HTML, prettyprinter, SGML, WWW, World-Wide Web",
###     supported       = "yes",
###     docstring       = "This is a simple HTML file prettyprinter.
###
###                        Usage:
###                             gawk -f html-pretty.awk [-v FILE=name]
###                                     [-v INDENT=nnn] [-v MAXLINE=72]
###                        or
###                             nawk -f html-pretty.awk [-v FILE=name]
###                                     [-v INDENT=nnn] [-v MAXLINE=72]
###
###                        The optional FILE value sets the input file
###                        name, to override the actual name, or in
###                        the event that input is from standard
###                        input.
###
###                        The optional INDENT value sets the default
###                        indentation (default: 4 spaces per level).
###
###                        The optional MAXLINE value sets the maximum
###                        line length (default: 72).
###
###                        The idea is that each <TAG> ... </TAG> pair
###                        should contain material that is indented
###                        according to the nesting depth of the tags.
###                        However, certain common tags, such as those
###                        to change fonts, are excluded from this
###                        indenting.
###
###                        As with any simple-minded prettyprinter,
###                        especially ones for natural language text,
###                        which rarely conforms to a strict grammar,
###                        you should check the output of the
###                        prettyprinter before replacing the input by
###                        it.  In particular, if your input file
###                        follows the standards imposed by this
###                        prettyprinter, a file difference listing may
###                        be reassuring.
###                        Otherwise, try viewing the
###                        output with one or more World-Wide Web client
###                        programs (arena, hotjava, netscape, xmosaic,
###                        ...).  If you have an SGML parser or
###                        translator, you can use that for checking
###                        too.
###
###                        TO DO:
###
###                        (1) More extensive testing, and perhaps a
###                        validation suite.
###
###                        The checksum field above contains a CRC-16
###                        checksum as the first value, followed by the
###                        equivalent of the standard UNIX wc (word
###                        count) utility output of lines, words, and
###                        characters.  This is produced by Robert
###                        Solovay's checksum utility.",
###  }
###
### ====================================================================

# Program entry point: all processing is driven from BEGIN, which reads
# the input itself through get_char() rather than through pattern rules.
BEGIN {
    VERSION = "0.05"                    # these MUST match version
    DATE = "[17-Apr-1995]"              # and date above

    initialize()
    out_banner()
    do_input()
}

END {
    wrapup()
}


# Change the current nesting level by n (which may be negative), and
# report an error if the level would drop below zero.
#
# global : level
function adjust_level(n)
{
    level += n
    if (level < 0)
        error("negative indentation requested")
}


# Reduce tag text t to its canonical form: the tag name (with any leading
# "/") in upper case, followed by ">", with attributes dropped.  Text that
# iscomment() accepts is returned unchanged.
function base_tag(t)
{
    if (iscomment(t))
        return (t)                      # HTML comment

    match(t,/^<\/?[A-Za-z][A-Za-z0-9]*/)
    # A failed match() leaves RSTART == 0, so the RSTART test is the one
    # that actually detects failure here.
    if ((RLENGTH == 0) || (RSTART != 1))
        error("unmatched HTML tag")

    return (toupper(substr(t,RSTART,RLENGTH)) ">")
}


# Emit end-paragraph tags that the input omitted, and keep count of the
# paragraphs currently open.  s is the raw tag text as read from input.
#
# NOTE(review): the tag literals "<P>" and "</P>" and the heading pattern
# /^<H[1-6]>$/ below were missing from the text this was restored from
# (markup had been stripped); they are reconstructed from the comments in
# this function -- confirm against a pristine copy of the program.
function check_for_omitted_tag(s,    tag)
{
    # SGML allows ending tags to be omitted if it can be determined
    # from context that they should be implicitly supplied.  This
    # unfortunate practice complicates construction of processing
    # tools like this prettyprinter.  The only one that we handle for
    # now is the common case of an omitted end-paragraph tag, </P>.

    # static : paragraph_level

    # print "DEBUG: line_buffer = [" line_buffer "]"

    tag = base_tag(s)   # discard optional key=value args in SGML tag

    if (tag in exclude)
        return
    else if ((tag == "<P>") || (tag ~ /^<H[1-6]>$/))
    {                   # then at new paragraph or new section
        for ( ; paragraph_level > 0; paragraph_level--)
            out_tag("</P>")             # supply missing end-paragraph tags
        if (tag == "<P>")
            paragraph_level++
    }
    else if (tag == "</P>")
    {
        if (paragraph_level <= 0)
            error("negative paragraph level")
        paragraph_level--
    }
}


# Main loop: read tokens until end of input and dispatch each one as a
# tag, an unmarked paragraph break ("\n"), or an ordinary word.
#
# global : token_stream_at_eof, last_token_was_tag
function do_input(    token)
{
    # Tokens are strings and EOF is the number -1; awk compares a string
    # with a number as strings, so an input word consisting of "-1" would
    # compare equal to EOF.  The flag set by get_token() tells the two
    # cases apart, so that such a word no longer ends processing early.
    while (((token = get_token()) != EOF) || !token_stream_at_eof)
    {
        # print "TOKEN: [" token "]" >"/dev/tty"
        if (substr(token,1,1) == "<")
        {
            check_for_omitted_tag(token)
            out_tag(token)
        }
        else if (token == "\n")
        {
            trim_line()
            if (line_length() > 0)
                newline()
            newline()
            out_indentation()
            last_token_was_tag = 0
        }
        else
            out_word(token)
    }
}


# Report msg, with the file name, line number, and the part of the
# current input line consumed so far, then finish up and exit with
# status 1.
#
# NOTE(review): exit also runs the END rule, which calls wrapup() again,
# so wrapup() is invoked twice on this path -- confirm that it tolerates
# that before changing anything here.
function error(msg)
{
    # global : input_line, last_k
    print FILE ":" FNR ":" msg \
        ": [" substr(input_line,1,last_k) "...]" > "/dev/tty"
    wrapup()
    exit (1)
}


# Return the next input character, reading a new line when the current
# one is used up, or EOF when no more input can be read.  Each line read
# has "\n" appended, so line ends are delivered as characters.
function get_char()
{
    # global : input_line, last_k
    if (last_k >= length(input_line))
    {
        # parenthesized so that it cannot be misread as a redirection
        if ((getline input_line) <= 0)  # then EOF or error
            return (EOF)
        last_k = 0
        input_line = input_line "\n"
    }
    return (substr(input_line,++last_k,1))
}


# Return the next token: "\n" for an unmarked paragraph break, a complete
# tag with its whitespace normalized, a word, or EOF at end of input.
#
# global : token_stream_at_eof (set to 1 when EOF is returned)
function get_token(    c)
{
    if (skip_whitespace() > 1)  # adjacent newlines are unmarked-up parbreak
        return ("\n")

    c = get_char()
    if (c == EOF)
    {
        token_stream_at_eof = 1         # lets do_input() tell EOF from "-1"
        return (EOF)
    }
    else if (c == "<")
        return (normalize_whitespace(get_tag(c)))
    else
        return (get_word(c))
}


# Collect a tag whose first character c has already been read, up to and
# including the closing ">".  If input ends first, the ">" is supplied.
function get_tag(c,    token)
{
    token = c
    for (;;)
    {
        c = get_char()
        if (c == ">")
        {
            token = token c
            break
        }
        else if (c == EOF)
        {
            token = token ">"           # supply missing delimiter
            break
        }
        token = token c
    }

    # Both branches return the same value; the test is kept as it was.
    if (iscomment(token))
        return token                    # then possibly multiline comment
    else
        return (token)
}


# Collect a word whose first character c has already been read.  The word
# ends at whitespace or "<" (which is pushed back) or at end of input.
# token is a local here, so no global variable of that name is touched.
function get_word(c,    token)
{
    token = c
    for (;;)
    {
        c = get_char()
        if (c == EOF)
            return (token)
        else if (iswhite(c) || (c == "<"))
        {
            unget_char()
            return (token)
        }
        token = token c
    }
}


# Return the number of columns of indentation for the current level.
#
# global : INDENT, level
function indentation_size()
{
    return (INDENT * level)
}


function initialize()
{
    # Set defaults for command-line variables.
    if (FILE == "")
        FILE = FILENAME
    if (FILE == "-")
        FILE = ""
    if (INDENT == "")
        INDENT = 4
    if (MAXLINE == "")
        MAXLINE = 72

    EOF = -1            # global end-of-file flag (opaque value)

    # Tags in exclude[] do not cause indentation actions.
exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude["
"] = 1 exclude[""] = 1 exclude["

"] = 1 exclude["

"] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude["

"] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude["

"] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 # %HTML.Deprecated exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude["~~"] = 1 exclude["~~"] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude["_{"] = 1
exclude["}"] = 1 exclude["^{"] = 1
exclude["}"] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 exclude[""] = 1 # Tags in list_tags[] get an extra level of indentation, so that # embedded list item tags have their own level. list_tags[""] = 2 # %HTML.Deprecated list_tags[""] = 2 # %HTML.Deprecated list_tags["

"] = 2 list_tags[""] = 2 list_tags[""] = 2 # %HTML.Deprecated list_tags[""] = 2 # %HTML.Deprecated list_tags["

"] = 2 list_tags[""] = 2 list_tags["

"] = 2 list_tags[""] = 2 # Tags in list_item[] get outdented one level inside lists. list_item["

"] = 1 list_item["

"] = 1 # Tags in newline_around[] get a newline inserted before and after # them, unless they are at end-of-line already. newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around["

"] = 1 newline_around["

"] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around["

"] = 1 newline_around["

"] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 # %HTML.Deprecated newline_around[""] = 1 # %HTML.Deprecated newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around["

"] = 1 newline_around[""] = 1 # %HTML.Deprecated newline_around[""] = 1 # %HTML.Deprecated newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around["

"] = 1 newline_around[""] = 1 newline_around["

"] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around["

"] = 1 newline_around["

"] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around["

"] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around["

"] = 1
    newline_around["

"] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around[""] = 1 newline_around["