: ########################################################################## # Title : geturl - get WWW page specified by URL # Author : Heiner Steven # Date : 1997-02-20 # Requires : netcat # Category : WWW # SCCS-Id. : @(#) geturl 2.5 04/03/01 ########################################################################## # Note # o Needs the non-standard program # netcat ("net cat", sometimes named "nc") # to do the actual work ########################################################################## PN=`basename "$0"` # Program name VER='2.5' # Program to establish TCP connection, print to standard output (usage: $NETCAT # host port) : ${NETCAT:=netcat} : ${NETCATFLAGS=} MaxRedirs=5 Header=${TMPDIR:=/tmp}/gu$$ # Temp. file for HTTP response header Verbose=false # true/false PrintHeader=false Followredirections=true JustCheck=false proxy=${HTTP_PROXY:-$http_proxy} if [ -n "$proxy" ] then UseProxy=true else UseProxy=false fi default_contenttype="application/x-www-form-urlencoded" # Set default headers. "generalheader" and "requestheader" contains the # names of all variables that may contain valid header settings. head_connection="Connection: close" head_useragent="User-Agent: $PN/$VER" generalheader="head_connection head_useragent" requestheader="head_host head_contenttype head_contentlength" usage () { echo >&2 "$PN - get WWW page specified by URL, $VER usage: $PN [-chgpsv] [-R num] url [...] -c: only check the url, do not get the contents (implies -h) -C: content-type header field (default $default_contenttype) -h: HEAD request (sets exit code; add -s to see header) -g: GET request (default) -p: POST request (reads post data from standard input) -R: number of redirections to follow (default is $MaxRedirs) -s: print server response header -v: verbose mode An url has the form http:[//HOST][:PORT]PATH i.e. http://www.shelldorado.com:80/index.html" exit 1 } msg () { echo >&2 "$PN: $*" } fatal () { msg "$@"; exit 1; } isint () { for _i do case "$_i" in *[!0-9]*) return 1;; esac done return 0 } # Return the parsed URL in the variables # Protocol, Host, Port, Path ParseUrl () { [ $# -ge 1 ] || return 2 U=$1 # URL-Format: http://HOST/DIR/.../FILE Protocol=`expr "$U" : '\([a-zA-Z0-9][a-zA-Z0-9]*\):.*'` [ -n "$Protocol" ] || return 1 # Protocol must be specified U=`expr "$U" : "$Protocol:\(.*\)"` # Remove protocol H=`expr "$U" : '//\([^/][^/]*\).*'` # hostname:portnumber if [ -n "$H" ] then Host=`expr "$H" : '\([^:][^:]*\).*'` Port=`expr "$H" : '[^:][^:]*:\([0-9][0-9]*\).*'` U=`expr "$U" : "//[^/][^/]*\(.*\)"` # Remove hostname:portnumber fi Path="$U" return 0 } # Strip header from HTTP response, and write it to a file. # A header is terminated by an empty line StripHeader () { # Line terminator may be CR LF (instead of LF) cat "$@" | { OIFS="$IFS"; IFS="" while read line do echo "$line" case "$line" in ""|" ") break; esac done > "$Header" IFS="$OIFS" [ $PrintHeader = true ] && cat "$Header" cat } return 0 } ########################################################################## # Start of main program ########################################################################## set -- `getopt :cC:ghpR:sv "$@"` || usage while [ $# -gt 0 ] do case "$1" in -c) Cmd=HEAD; JustCheck=true;; # only check the file -C) ContentType=$2; shift;; -g) Cmd=GET;; -h) Cmd=HEAD;; -p) Cmd=POST;; -R) isint "$2" || fatal "invalid number: $2" MaxRedirs=$2; shift;; -s) PrintHeader=true;; -v) Verbose=true;; --) shift; break;; -*) usage;; *) break;; # First file name esac shift done [ $# -lt 1 ] && usage : ${Cmd:=GET} if [ X"$Cmd" = X"POST" ] then contenttype=${ContentType:-$default_contenttype} elif [ X"$ContentType" != X"" ] then fatal "a content type may only be specified with POST requests" fi # Remove temporary files at signal or exit trap 'rm -f "$Header" >/dev/null 2>&1' 0 trap "exit 2" 1 2 3 15 redircnt=0 # number of redirections followed exitcode=0 while [ $# -gt 0 ] do Url=$1; shift if [ $UseProxy = true ] then ParseUrl "$Url"; DstHost=$Host ParseUrl "$proxy" || { msg "malformed proxy URL: $proxy"; continue; } Path=$Url else ParseUrl "$Url" || { msg "malformed URL: $Url"; continue; } DstHost=$Host fi [ "${Protocol:-INVALID}" = http ] || { msg "unknown protocol \"$Protocol\" (only \"http\" allowed)"; continue; } # Set default values : ${method:=${Cmd:-GET}} : ${Host:=127.0.0.1} : ${Port:=80} : ${Path:=/} [ $Verbose = true ] && msg "$method http://$Host:$Port$Path" # Build request from three parts: req_line, req_header, [req_body] request= req_line= req_header= req_body= # Clear request-specific header fields for header in $requestheader do eval "$header=" done req_line="$method $Path HTTP/1.0" head_host="Host: $DstHost" case "$method" in POST) # Read post data from standard input. Note that we store all # data in an environment variable. This is not strictly # necessary, but simplifies our script. [ -t 0 ] && msg "[Enter POST data; EOF to continue]" postdata=`cat || exit 1` || exit 1 req_body="$postdata" head_contenttype="Content-Type: $contenttype" length=`wc -c <<-EOT $req_body EOT ` head_contentlength="Content-Length: `echo $length`" ;; esac # Set all headers from $generalheader and $requestheader which # currently have a value req_header= for header in $generalheader $requestheader do eval "[ -n \"\$$header\" ]" || continue [ -n "$req_header" ] && req_header="$req_header " eval req_header="\$req_header\$$header" done #echo >&2 "DEBUG: header=<$requestheader>" # All header lines need to be terminated by CR LF (not just LF) request=`sed 's/$/ /' <<-EOT ${req_line} ${req_header} EOT ` [ -n "$req_body" ] && request="$request $req_body" #echo >&2 "DEBUG: request=<$request>" ###################################################################### # Now issue the request ###################################################################### "$NETCAT" $NETCATFLAGS ${Host:=127.0.0.1} ${Port:=80} <&2 "DEBUG: statuscode=<$statuscode>" case "$statuscode" in 1*|2*) exitcode=0 ;; 300|301|302) # Temporary/permanent redirection exitcode=0 if [ $Followredirections = true ] then redircnt=`expr ${redircnt:-0} + 1` [ $redircnt -gt $MaxRedirs ] && fatal "too many redirections (more than $MaxRedirs)" newlocation=`grep -i '^Location:' "$Header" | head -1 | sed 's/^[^:][^:]*:[ ]*//'` if [ -n "$newlocation" ] then set -- "$newlocation" "$@" #echo >&2 "DEBUG: newlocation=<$newlocation>" [ $method = POST ] && method=GET [ $Verbose = true ] && msg "redirected ($statuscode)" else msg "NOTE: could not get new location of redirected" \ " resource" fi fi ;; 4*|5*) # Client/server errors exitcode=1 ;; *) exitcode=1 msg "NOTE: unexpected status code: $statuscode - ignored" ;; esac else exitcode=1 fi if [ $JustCheck = true ] then [ $exitcode -eq 0 ] && echo "OK $Url" || echo "ERROR $Url" fi done exit $exitcode