:
##########################################################################
# Title      :  dumphtmltbl - extract ASCII data from HTML table
# Author     :  Heiner Steven
# Date       :  2000-02-25
# Requires   :  [gnm]awk
# Category   :  HTML, File Conversion
# SCCS-Id.   :  @(#) dumphtmltbl 1.6 04/01/02
##########################################################################
# Description
#       Reads the named files (or standard input), searches for HTML
#       tables (starting with "<TABLE>"), and extracts all information
#       contained in "<TD>" and "<TH>" elements. The table data is then
#       printed with columns separated by "TAB" (ASCII 9) characters.
#
#       o  HTML tags are stripped from the input
#       o  Multiple HTML tables are printed as one
#       o  The table data is printed without any formatting. No
#          attempt is made to process tag attributes like
#          "ALIGN=CENTER", "WIDTH=..." etc.
##########################################################################

PN=`basename "$0"`                      # Program name
VER='1.6'

# Usage - print a short help text to standard error and terminate
# with exit code 1.
Usage () {
    echo >&2 "$PN - extract ASCII data from HTML table, $VER
usage: $PN [-fp] [file ...]
    -f:  force recognition of table data elements outside of tables
    -p:  preserve whitespace within table data

The -f option is useful if parts of a HTML page are given as input."
    exit 1
}

# Msg - write each argument as one line to standard error, prefixed
# with the program name.
Msg () {
    for MsgLine
    do
        echo "$PN: $MsgLine" >&2
    done
}

# Fatal - print the arguments using Msg, then terminate with exit code 1.
Fatal () { Msg "$@"; exit 1; }

# Option processing.
#
# The "getopts" built-in is used instead of the external "getopt"
# command: the result of "getopt" had to be re-split by the shell, which
# broke file names containing whitespace or wildcard characters, and its
# exit status was hidden behind the "set" command.
# Options must precede the file names, as shown in the usage message.
Force=no                                # Recognize elements outside of tables
PreserveWS=no                           # Preserve whitespace
while getopts fhp Option
do
    case "$Option" in
        f)  Force=yes;;
        p)  PreserveWS=yes;;
        *)  Usage;;                     # -h, or an option getopts rejected
    esac
done
shift `expr $OPTIND - 1`                # "$@" now holds the file names only

# We need a "new" AWK implementation with functions, "getline()",
# "match()". GNU awk is better for long input lines, but "mawk" is faster.
# Locate the AWK interpreter unless the caller already set NAWK in the
# environment: take the first of mawk, gawk, nawk found in a PATH
# directory (a leading or trailing empty PATH entry means the current
# directory), and fall back to plain "awk".
if [ -z "$NAWK" ]
then
    for awk in mawk gawk nawk
    do
        for path in `echo "$PATH" | sed 's/^:/.:/;s/:$/:./;s/:/ /g'`
        do
            [ -x "$path/$awk" ] || continue
            NAWK=$path/$awk
            break 2
        done
    done
    : ${NAWK:=awk}
fi

# NOTE: the AWK program below is enclosed in single quotes, so it must
# not contain a single quote character (not even within comments).
$NAWK '
    #####################################################################
    # getchar - read one character, return -1 on EOF
    #
    # Input is buffered line by line. A blank is appended to each line
    # read, so that the end of a line acts as whitespace between words.
    #
    # Global variables:
    #   getchar_line    current input line (plus trailing blank)
    #   getchar_len     length of getchar_line
    #   getchar_pos     index of the next character to return [1...]
    #####################################################################

    function getchar () {
        # Refill only after ALL characters were handed out. The test
        # has to be ">" (not ">="), or the last character of each line,
        # the appended separator blank, would never be returned.
        if ( getchar_pos > getchar_len || !getchar_line ) {
            if ( getline > 0 ) {
                getchar_line = $0 " "
                getchar_len = length (getchar_line)
                getchar_pos = 1
            } else {
                return -1
            }
        }
        return substr (getchar_line, getchar_pos++, 1)
    }

    #####################################################################
    # ungets - "unget" a string
    #
    # The characters of the string will be returned at subsequent
    # calls of getchar()
    #####################################################################

    function ungets (s) {
        # Insert the string in front of the unread rest of the line.
        # The complete rest is kept, and the length is recalculated
        # from the new buffer, so both always agree.
        getchar_line = substr (getchar_line, 1, getchar_pos - 1) s substr (getchar_line, getchar_pos)
        getchar_len = length (getchar_line)
    }

    #####################################################################
    # readtag - read a tag until closing ">"
    #
    # The leading "<" was already read. Ignores all tag
    # attributes.
    #
    # Returns the tag without "<" ">"
    #####################################################################

    function readtag () {
        readtag_result = ""
        while ( (readtag_c = getchar()) != ">" && readtag_c != -1 ) {
            if ( readtag_c ~ /[ \t]/ ) {
                # End of tag name, start of attributes: skip everything
                # up to the closing ">" (or EOF)
                while ( (readtag_c = getchar()) != ">" && readtag_c != -1 ) {
                    # ignore character
                }
                return readtag_result
            }
            readtag_result = readtag_result readtag_c
        }
        return readtag_result
    }

    #####################################################################
    # readword - read one (whitespace delimited) word of input
    #
    # A word is delimited by whitespace or tags. If PreserveWS is set,
    # whitespace does not end a word, only a tag (or EOF) does.
    #
    # Returns
    #   the word read
    #####################################################################

    function readword () {
        readword_result = ""
        while ( (readword_c = getchar()) != -1 \
                && readword_c != "<" \
                && (PreserveWS || readword_c !~ /[ \t]/) ) {
            readword_result = readword_result readword_c
        }
        # The delimiter is not part of the word, give it back
        if ( readword_c != -1 ) ungets(readword_c)
        return readword_result
    }

    #####################################################################
    # nexttoken - reads the next token of the input stream
    #
    # Global variables:
    #   token_type      TOK_WORD|TOK_TAG
    #   token_value     text or tag name
    #
    # Returns
    #   TOK_WORD, TOK_TAG       token type
    #   -1                      EOF
    #####################################################################

    function nexttoken () {
        while ( (nexttoken_c = getchar()) != -1 ) {
            if ( nexttoken_c == "<" ) {
                token_value = readtag()
                return token_type = TOK_TAG
            } else if ( nexttoken_c ~ /[ \t]/ && !PreserveWS ) {
                continue                # skip whitespace between tokens
            } else {
                # First character of a word: push it back and let
                # readword() collect the complete word
                ungets(nexttoken_c)
                token_value = readword()
                return token_type = TOK_WORD
            }
        }
        return -1
    }

    BEGIN {
        # Settings handed over from the shell script options
        ForceTable = ("'"$Force"'" == "yes")
        PreserveWS = ("'"$PreserveWS"'" == "yes")

        # Return values for the "nexttoken()" function:
        TOK_TAG  = "TAG"        # got a HTML tag (i.e. "TABLE")
        TOK_WORD = "WORD"       # got a (whitespace delimited) word

        COLSEP  = "\t"          # output column separator (TAB)
        WORDSEP = " "           # output word separator (BLANK)

        #rowno                  # current row number [1...]
        #row[]                  # all table data for this row
        #ncells[]               # number of cells stored for this row
        #td                     # text of the current data element
        #readdata {0 | 1}       # currently reading <TD> or <TH> data element?

        while ( (tok = nexttoken()) != -1 ) {
            # Skip everything up to the first <TABLE> tag, unless table
            # elements shall be recognized everywhere (option -f)
            if ( ForceTable || (tok==TOK_TAG && token_value ~ /^[tT][aA][bB][lL][eE]$/) ){
                # From here on all remaining input is processed as table
                # data; this is why multiple tables are printed as one.
                do {
                    if ( tok == TOK_TAG ) {
                        if ( readdata ) {
                            # The following tags terminate a <TD> element:
                            if ( token_value ~ /^\/[tT][dDhH]/ \
                                    || token_value ~ /^\/[tT][rR]$/ \
                                    || token_value ~ /^[tT][dDhHrR]$/ \
                                    || token_value ~ /^\/[tT][aA][bB][lL][eE]$/) {
                                # Separate the cell from its predecessor.
                                # The cells are counted, because testing
                                # the row text would drop empty leading
                                # cells and shift the columns.
                                if ( ncells [rowno]++ ) {
                                    row [rowno] = row [rowno] COLSEP
                                }
                                row [rowno] = row [rowno] td
                                readdata = 0
                            }
                        }

                        if ( token_value ~ /^[tT][rR]$/ ) {
                            row [++rowno] = ""  # Start a new table row
                        } else if ( token_value ~ /^[tT][dDhH]$/ ) {
                            td = ""             # New <TD>/<TH> column
                            readdata = 1
                        }
                    } else if ( tok == TOK_WORD ) {
                        if ( readdata ) {
                            # Retain word boundaries
                            if ( td && td !~ /[ \t]$/ ) td = td WORDSEP
                            td = td token_value
                        }
                    }
                } while ( (tok = nexttoken()) != -1 )
            }
        }
    }

    END {
        # Print the collected rows, one output line per table row
        for ( i=1; i<=rowno; ++i ) {
            print row [i]
        }
    }
' "$@"