#!/usr/bin/perl -w
# Basic webpage GET tool. Much simpler than LWP GET, but not as powerful.
# It only needs modules installed by default, however. Also emulating the
# headers of other browsers is well supported; setting Cookie: and Referer:
# headers is made simpler. AND this is much better at spying on headers
# since it dumps them literally and unmodified (particularly request
# headers), but does not follow redirects.
#
# 22 November 1999	B. Elijah Griffin / Eli the Bearded
use strict;
BEGIN { $ENV{PATH} = '/usr/ucb:/bin' }
use vars qw($EOL $url $tcpproto $nosignal $id $bv %headers $post $forcehost
	$refer $cookie $print_request $print_body $print_heads $user $long
 	$follow $waittime $benchmark $debug $autoname $lang $dirdefault
	$VERSION $LONG_VERSION_INFO);
use Socket;
use Carp;

$VERSION = '1.0';
$LONG_VERSION_INFO = 'initial: 22-Nov-1999; this: 15 Dec 2000';

$EOL = "\cm\cj";
$tcpproto = getprotobyname('tcp');
$print_request = 0;
$print_body    = 1;
$print_heads   = 0;
$follow        = 0;
$lang = '';
$refer = '';
$cookie = '';
$bv = 'lwp-request-1.38';
$dirdefault = 'dir-default';


sub base64 ($);
sub err444 ($$$);
sub monster ($$);
sub usage ($);
sub saferead ();
sub grab ($$$$$$$$);


# Header sets for browser masquerading
%headers = (
# text mode browser for Unix
# http://artax.karlin.mff.cuni.cz/~mikulas/links
# Version 0.84 does not do cookies or referer headers, so we might
# misemulate it that way.
	'links-0.84' => <<'links084Heads',
GET ${URI} HTTP/1.1
Host: ${HOST}
User-Agent: Links (0.84; Linux 2.2.5-15 i686)
${REFERER}
${COOKIE}
links084Heads

# text mode browser for Unix
# http://ei5nazha.yz.yamagata-u.ac.jp/~aito/w3m/
	'w3m-beta99' => <<'w3mb991027Heads',
GET ${URI} HTTP/1.0
User-Agent: w3m/beta-991027
Accept: text/*, image/*, audio/*, application/*
Accept-Language: ja; q=1.0, en; q=0.5
Host: ${HOST}
${REFERER}
${COOKIE}
w3mb991027Heads

# Popular alternative browser for Windows
	'Opera-3.60' => <<'Opera360Heads',
GET ${URI} HTTP/1.0
User-Agent: Mozilla/4.0 (Windows NT 4.0;US) Opera 3.60  [en]
Accept: image/gif, image/x-xbitmap, image/jpeg, image/png, */*
Host: ${HOST}
${REFERER}
${COOKIE}
Opera360Heads

# ab, the apache benchmark tool.
	'ApacheBench-1.3' => <<'AB13Heads',
GET ${URI} HTTP/1.0
User-Agent: ApacheBench/1.3
Host: ${HOST}
Accept: */*
${REFERER}
${COOKIE}
AB13Heads

# Lib WWW Perl module
	'lwp-request-1.38' => <<'LWP138Heads',
GET ${URI} HTTP/1.0
Host: ${HOST}
User-Agent: lwp-request/1.38
${REFERER}
${COOKIE}
LWP138Heads

# Popular alternative browser for Macs
	'iCab-pre1.7' => <<'iCabP17Heads',
GET ${URI} HTTP/1.0
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/xbm, image/png, */*
Accept-Language: iw
Host: ${HOST}
User-Agent: iCab/Pre1.7 (Macintosh; I; PPC)
${REFERER}
${COOKIE}
iCabP17Heads

# Popular text mode browser, predominately unix
	'Lynx-2.8.1' => <<'Lynx281Heads',
GET ${URI} HTTP/1.0
Host: ${HOST}
Accept: text/html, text/plain, application/applefile, application/x-metamail-patch, sun-deskset-message, mail-file, default, postscript-file, audio-file, x-sun-attachment, text/enriched, text/richtext, application/andrew-inset, x-be2
Accept: application/postscript, message/external-body, message/partial, application/pgp, application/pgp, video/mpeg, video/*, image/*, audio/mod, text/sgml, video/mpeg, image/jpeg, image/tiff, image/x-rgb, image/png, image/x-xbitmap, image/x-xbm
Accept: image/gif, application/postscript, video/mpeg, image/jpeg, image/x-tiff, image/x-rgb, image/x-xbm, image/gif, application/postscript, */*;q=0.01
Accept-Encoding: gzip, compress
Accept-Language: en
Negotiate: trans
User-Agent: Lynx/2.8.1rel.2 libwww-FM/2.14
${REFERER}
${COOKIE}
Lynx281Heads

# Explorer 5.0 can be installed with a compatibility mode that emulates
# (or claims to emaulate) Explorer 4.0.
	'WindowsNT-Explorer-5.0-as-4.0' => <<'WinNTExp50-40Heads',
GET ${URI} HTTP/1.0
Accept: */*
Accept-Language: en-us
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 4.01; Windows NT; compat; DigExt)
Host: ${HOST}
${REFERER}
${COOKIE}
WinNTExp50-40Heads

	'Windows98-Explorer-5.5' => <<'Win98Exp55Heads',
GET ${URI} HTTP/1.0
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*
Accept-Language: en-us
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 5.5; Windows 98)
Host: ${HOST}
${REFERER}
${COOKIE}
Win98Exp55Heads

# This is on a system with IE5.5 installed, note the reference to
# IE4.01. This one is hard to do right, since in my tests I saw
# two requests for the test file. The first came with this UA,
# the second had this instead:
# User-Agent: Mozilla/4.0 (compatible; MSIE 4.01; MSIECrawler; Windows NT)
# The crawler version had an 'Accept-Language: us-en' as well as a
# different order to the headers (Accept: User-Agent:, Accept-Language:
# Accept-Encoding, Host:).
	'WindowsNT-ActiveDesktop' => <<'WinActDeskHeads',
GET ${URI} HTTP/1.0
Accept: */*
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 4.01; Windows NT)
Host: ${HOST}
${REFERER}
${COOKIE}
WinActDeskHeads

	'WindowsNT-Netscape6' => <<'WinNTNS6Heads',
GET ${URI} HTTP/1.0
Host: ${HOST}
User-Agent: Mozilla/5.0 (Windows; U; WinNT4.0; en-US; m18) Gecko/20001108 Netscape6/6.0
Accept: */*
Accept-Language: en
Accept-Encoding: gzip,deflate,compress,identity
${REFERER}
${COOKIE}
WinNTNS6Heads

	'WindowsNT-Explorer-5.5' => <<'WinNTExp55Heads',
GET ${URI} HTTP/1.0
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*
Accept-Language: en-us
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 4.0)
Host: ${HOST}
${REFERER}
${COOKIE}
WinNTExp55Heads

	'Windows98-Explorer-4.0' => <<'Win98Exp40Heads',
GET ${URI} HTTP/1.0
Accept: */*
Accept-Language: en-us
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 4.01; Windows 98)
Host: ${HOST}
${REFERER}
${COOKIE}
Win98Exp40Heads

# Normal mode Windows NT IE 5.0
	'WindowsNT-Explorer-5.0' => <<'WinNTExp50Heads',
GET ${URI} HTTP/1.0
Accept: */*
Accept-Language: en-us
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)
Host: ${HOST}
Pragma: no-cache
${REFERER}
${COOKIE}
WinNTExp50Heads

# IE can optional crawl pages to cache them for offline browsing.
# This is Windows NT IE 5.01 in crawl mode.
	'WindowsNT-ExplorerOffline-5.0' => <<'WinNTExpOff50Heads',
GET ${URI} HTTP/1.0
Accept: */*
Accept-Language: en-us
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 5.01; Windows NT; MSIECrawler)
Host: ${HOST}
Pragma: no-cache
${REFERER}
${COOKIE}
WinNTExpOff50Heads

	'WindowsNT-Netscape-4.6' => <<'WinNTNS46Heads',
GET ${URI} HTTP/1.0
User-Agent: Mozilla/4.6 [en] (WinNT; I)
Pragma: no-cache
Host: ${HOST}
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*
Accept-Encoding: gzip
Accept-Language: en
Accept-Charset: iso-8859-1,*,utf-8
${REFERER}
${COOKIE}
WinNTNS46Heads

	'MacPPC-Explorer-4.0' => <<'MacPPCExp40Heads',
GET ${URI} HTTP/1.0
Host: ${HOST}
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/xbm, image/x-jg, */*
Accept-Language: en
If-Modified-Since: Fri, 01 Oct 1999 00:25:43 GMT
User-Agent: Mozilla/4.0 (compatible; MSIE 4.01; Mac_PowerPC)
UA-OS: MacOS
UA-CPU: PPC
Extension: Security/Remote-Passphrase
${REFERER}
${COOKIE}
MacPPCExp40Heads

	'MacPPC-Netscape-4.0' => <<'MacPPCNS40Heads',
GET ${URI} HTTP/1.0
Proxy-Connection: Keep-Alive
User-Agent: Mozilla/4.05 (Macintosh; I; PPC, Nav)
Pragma: no-cache
Host: ${HOST}
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*
Accept-Language: en
Accept-Charset: iso-8859-1,*,utf-8
${REFERER}
${COOKIE}
MacPPCNS40Heads

	'MacPPC-Netscape-4.6' => <<'MacPPCNS46Heads',
GET ${URI} HTTP/1.0
User-Agent: Mozilla/4.6 (Macintosh; I; PPC)
Pragma: no-cache
Host: ${HOST}
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*
Accept-Encoding: gzip
Accept-Language: en
Accept-Charset: iso-8859-1,*,utf-8
${REFERER}
${COOKIE}
MacPPCNS46Heads

	'Linux-Netscape-3.0' => <<'LinNS30Heads',
GET ${URI} HTTP/1.0
User-Agent: Mozilla/3.0 (X11; I; Linux 2.2.5-15 i686)
Pragma: no-cache
Host: ${HOST}
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*
${REFERER}
${COOKIE}
LinNS30Heads

	'Linux-Netscape-4.51' => <<'LinNS451Heads',
GET ${URI} HTTP/1.0
User-Agent: Mozilla/4.51 [en] (X11; I; Linux 2.2.5-15 i686)
Pragma: no-cache
Host: ${HOST}
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*
Accept-Encoding: gzip
Accept-Language: en
Accept-Charset: iso-8859-1,*,utf-8
${REFERER}
${COOKIE}
LinNS451Heads

);

sub THEEND {
  my $signame = (shift or '(unknown)');
  die "Got SIG$signame ... exiting\n";
} # &THEEND

sub BUMP {
  my $signame = (shift or '(unknown)');
  $nosignal = 0;
} # end &BUMP 

$SIG{INT}  = 'main::THEEND';
$SIG{TERM} = 'main::THEEND';
$SIG{PIPE} = 'main::BUMP';

while(defined($ARGV[0]) and substr($ARGV[0], 0, 1) eq '-') {
    if (($ARGV[0] eq '-a') or ($ARGV[0] eq '--autoname'))  {
      $autoname = 1;
      shift;
    } elsif (($ARGV[0] eq '-B') or ($ARGV[0] eq '--no-body'))  {
      $print_body = 0;
      shift;
    } elsif (($ARGV[0] eq '-h') or ($ARGV[0] eq '--heads'))  {
      $print_heads = 1;
      shift;
    } elsif (($ARGV[0] eq '-f') or ($ARGV[0] eq '--follow'))  {
      $follow = 1;
      shift;
    } elsif (($ARGV[0] eq '-l') or ($ARGV[0] eq '--long'))  {
      $long = 1;
      shift;
    } elsif (($ARGV[0] eq '-w') or ($ARGV[0] eq '--wait'))  {
      shift;
      $waittime = shift;
      if (!defined($waittime) or $waittime !~ /^\d+$/) {
	print STDERR "$id: -w (--wait) requires an integer argument\n";
	usage(2);
      }
    } elsif (($ARGV[0] eq '-t') or ($ARGV[0] eq '--time'))  {
      eval 'use Benchmark;';
      shift;
      if ($@) { 
        warn "Can't use Benchmark module: $@\n";
      } else {
        $benchmark = shift;
	if (!defined($benchmark) or $benchmark !~ /^\d+$/) {
	  print STDERR "$id: -t (--time) requires an integer argument\n";
	  usage(2);
	}
      }
    } elsif (($ARGV[0] eq '-r') or ($ARGV[0] eq '--request'))  {
      $print_request = 1;
      shift;
    } elsif (($ARGV[0] eq '-L') or ($ARGV[0] eq '--language'))  {
      shift;
      if ($#ARGV >= 1 and substr($ARGV[0], 0, 1) ne '-') {
	$lang = shift;
      } else {
	print STDERR "$id: -L (--language) requires an argument\n";
	usage(2);
      }
    } elsif (($ARGV[0] eq '-H') or ($ARGV[0] eq '--host'))  {
      shift;
      if ($#ARGV >= 1 and substr($ARGV[0], 0, 1) ne '-') {
	$forcehost = shift;
      } else {
	print STDERR "$id: -H (--host) requires an argument\n";
	usage(2);
      }
    } elsif (($ARGV[0] eq '-u') or ($ARGV[0] eq '--user'))  {
      shift;
      if ($#ARGV >= 1 and substr($ARGV[0], 0, 1) ne '-') {
	$user = &base64(shift);
      } else {
	print STDERR "$id: -u (--user) requires an argument\n";
	usage(2);
      }
    } elsif (($ARGV[0] eq '-p') or ($ARGV[0] eq '--post'))  {
      shift;
      if ($#ARGV >= 1 and substr($ARGV[0], 0, 1) ne '-') {
	$post = shift;
      } else {
	print STDERR "$id: -p (--post) requires an argument\n";
	usage(2);
      }
    } elsif (($ARGV[0] eq '-R') or ($ARGV[0] eq '--refer'))  {
      shift;
      if ($#ARGV >= 1 and substr($ARGV[0], 0, 1) ne '-') {
	$refer = shift;
      } else {
	print STDERR "$id: -r (--refer) requires an argument\n";
	usage(2);
      }
    } elsif (($ARGV[0] eq '-c') or ($ARGV[0] eq '--cookie'))  {
      shift;
      if ($#ARGV >= 1 and substr($ARGV[0], 0, 1) ne '-') {
	$cookie = shift;
      } else {
	print STDERR "$id: -c (--cookie) requires an argument\n";
	usage(2);
      }
    } elsif (($ARGV[0] eq '-b') or ($ARGV[0] eq '--browser'))  {
      shift;
      if ($#ARGV >= 1 and substr($ARGV[0], 0, 1) ne '-') {
	$ARGV[0] =~ /([\w.\d-]+)/; shift;
	$bv = $1;
	if (!defined($headers{$bv})) {
	  print STDERR "$id: $bv is not a recognized browser\n";
	  usage(2);
	}
      } else {
	print STDERR "$id: -b (--browser) requires an argument\n";
	usage(2);
      }
  } elsif ($ARGV[0] eq '--version') {
    print "$0 version $VERSION $LONG_VERSION_INFO\n";
    exit(0);
  } elsif ($ARGV[0] eq '--emulations') {
    &usage_emulations();
    exit(0);
  } elsif ($ARGV[0] eq '--languages') {
    &usage_languages();
    exit(0);
  } elsif ($ARGV[0] eq '--help') {
    &usage(0);
  } else {
    print STDERR "$0: $ARGV[0] not a recognized option\n";
    &usage(2);
  }
}


if (!defined($ARGV[0])) {
  print STDERR "No URL found\n";
  usage(2);
}

if ($benchmark) {
  timethis($benchmark, 
    sub {
      for $url (@ARGV) {
	&do_one($url, 1);
      }
    }
  );
} else {
  my $sleep;

  # Normal loop through them.
  while(defined($url = shift)) {
    sleep $sleep if $sleep;
    &do_one($url, 0);
    $sleep = $waittime;
  }

}
exit(0);

#####################################################
# Process one URL from the command line. If $timing is set,
# don't optimize away the actual request.
sub do_one ($$) {
  my $url = shift;
  my $timing = shift;
  my $nport = 80;
  my $host;
  my $connecthost;
  my $proto;
  my $lpart = '/';
  my $header = $headers{$bv} . $EOL;
  my $ans;	# holds response from web server
  my $newreq;

  # Simple-mindedly parse the request

  if ($url !~ m%(https?):/+([^/]+)(/.+)?%) {
    warn("Can't get host for $url; skipping\n");
    return undef;
  } else {
    $proto = $1;
    $host = $2;
    $lpart = $3 if defined($3);
  }

  if ($autoname) {
    my $out = $lpart;

    $out =~ s:.*/::;
    if (length($out) < 1) {
      $out = $dirdefault;
    }

    if (open(STDOUT,">$out")) {
      print STDERR "Sending output going to $out\n";
    } else {
      warn "Can't open $out for output.\n";
    }
  }

  if (defined($forcehost)) {
    $connecthost = $forcehost;
  } else {
    $connecthost = $host;
  }

  # Do referer headers, etc.
  if ($long) { 
    $header =~ s#\${URI}#$proto://${host}$lpart#g;
  } else {
    $header =~ s/\${URI}/$lpart/g;
  }
  $header =~ s/\${HOST}/$host/g;
  $header =~ s/\${REFERER}/Referer: $refer/g;
  $header =~ s/\${COOKIE}/Cookie: $cookie/g;

  if ($lang) {
    $header =~ s/Accept-Language:[^\cm\cj]*\cm?\cj/Accept-Language: $lang$EOL/i;
  }

  if ($user) {
    $header =~ s/\cm?\cj\cm?\cj/${EOL}Authorization: Basic $user$EOL/;
  }

  if ($post) {
    my $size = length($post);
    # may someday support multipart/form-data, too
    my $formtype = 'application/x-www-form-urlencoded';

    $header =~ s/^GET/POST/;
    $header =~ s/\cm?\cj\cm?\cj/${EOL}Content-Type: $formtype${EOL}Content-Length: $size$EOL$EOL/;
    $header .= $post;
  }

  $header =~ s/\cm?\cj/$EOL/g;

  # Grab first line for &grab
  $header =~ s/^([^\cm\cj]+$EOL)//;
  $newreq = $1;

  # Delete empty headers
  $header =~ s/\cM?\cJ([^\s:]+):\s(?=\cM?\cJ)//g;

  # Log the request
  print "$newreq$header" if $print_request;
  print "\n"             if($print_request and $post);

  if (!($print_heads or $print_body) and !$timing) {
    return "$newreq$header";
  }

  # Strip :port off of host before the grab. (It needs to be left in above
  # for the Host: header to work right.)
  if ($connecthost =~ s/:(\d+)//) {
    $nport = $1;
  }

  # Fetch the page
  $ans = &grab($connecthost, $nport, 
	       \$newreq, \$header, 
	       $print_heads, $print_body, $timing, $follow);
} # end &do_one  


#####################################################
# Grab an html page. Needs a remote hostname, a port number
# a first line request (eg "GET / HTTP/1.0"), and the remainder
# of the request (empty string if HTTP/0.9).
sub grab ($$$$$$$$) {
  my ($remote, $port, $request, $heads, $printhead, $printbody, $no_optimize,
      $doredir) = @_;
  my ($iaddr, $paddr, $line);
  my $out = '';
  my $len;
  my $rc;

  if (!($iaddr = inet_aton($remote))) { 
    return &err444("no host: $remote", $printhead, $printbody);
  }

  $paddr   = sockaddr_in($port, $iaddr);

  print 'Peer is ' .  inet_ntoa($iaddr) . ":$port\n" if $debug;

  if (!socket(SOCK, PF_INET, SOCK_STREAM, $tcpproto)) {
    return &err444("socket: $!", $printhead, $printbody);
  }
  if (!connect(SOCK, $paddr)) {
    return &err444("connect: $!", $printhead, $printbody);
  }

  $len = length($$request);
  $rc = syswrite(SOCK, $$request, $len);

  if ($rc != $len) {
    warn("request write to $remote was short ($rc != $len)\n");

  } else {
    $len = length($$heads);
    $rc = syswrite(SOCK, $$heads, $len);

    warn("heads write to $remote was short ($rc != $len)\n")
    	if ($rc != length($$heads));
  }

  $nosignal = 1;

  while ($line = &saferead() and $nosignal) {
    $out .= $line;
    last if ($line =~ /^\015?\012?$/);
  }

  print $out if $printhead;

  if (!$printbody and !$no_optimize) {
    close (SOCK)            || die "close: $!";
    if ($doredir) {
      if ($out =~ /(?:\015?\012|015\012?)Location:[ \t]*([^\015\012]+)/i) {
        my $newurl = $1;
	print STDERR "Following redirection to $newurl\n";
	$out = &do_one($newurl, 0);
      }
    }
    return $out;
  }

  if ($out =~ /\nContent-Length:\s+(\d+)/) {
    # OLD store every way : read(SOCK,$out,$1,length($out));
    my $tograb = $1;
    my $chunk  = 512;	# not too large, since it is off the network
    my $buf;
    my $rc;

    while($tograb >= $chunk) {
      $buf = '';
      $rc = read(SOCK,$buf,$chunk,0);
      print $buf if $printbody;
      if ($rc != $chunk) {
        warn "Return from $remote read was short (got $rc of $chunk)\n";
	return $out;
      }

      $tograb -= $chunk;
    }

    if ($tograb > 0) {
      $buf = '';
      $rc = read(SOCK,$buf,$tograb,0);
      print $buf if $printbody;
      if ($rc != $tograb) {
        warn "Return from $remote read was short (got $rc of $tograb)\n";
	return $out;
      }
    }

  } else {

    $nosignal = 1;
    # Back to line by line mode.
    while (defined($line = <SOCK>) and $nosignal) {
      # OLD store every way : $out .= $line;
      print $line if $printbody;
    }
  }

  close (SOCK)            || die "close: $!";

  if ($doredir) {
    if ($out =~ /(?:\015?\012|015\012?)Location:[ \t]*([^\015\012]+)/i) {
      my $newurl = $1;
      print STDERR "Following redirection to $newurl\n";
      $out = &do_one($newurl, 0);
    }
  }
  return $out;
} # end &grab

#####################################################
# Attempt to read a line safely from SOCK filehandle.
sub saferead () {
  my $line;
  eval {
  	local$SIG{ALRM} = sub { die 'timeout!' };
	alarm 15;
	$line = <SOCK>;
	alarm 0;
       };
  if ($@ and $@ !~ /timeout!/) {warn("during socket read: $@\n")}
  return $line;
} # end &saferead 

#####################################################
# Print a usage message. Exits with the number passed in.
sub usage ($) {
  my $exit = shift;

  print <<"EndUsage";
$0 usage:
  bget [options] URL [URL...]

Basic tool to make HTTP GET requests and monitor the results.
Unlike LWP GET, it does not require special Perl modules, and
by virtue of being cruder makes HTTP headers easier to spy on.
Only URLs of the forms 

     http://hostname/[localpart]
     http://hostname:port/[localpart]

are supported.

Options:
  	-a --autoname		save output automatically based on URI
  	-B --no-body		don't print the body of the response
	-f --follow		follow redirects
	-h --heads		print the response headers
	-l --long		use long address on GET line (using the
				full http://... should work in HTTP/1.1)
	-r --request		print the request headers
	-H --host     HOST[:P] 	connect to HOST for request (useful for
				testing virtual hosts before a DNS change)
	-L --language LANG	use LANG for Accept-Language:
	-R --refer    VALUE	set the referer header with VALUE
	-c --cookie   VALUE	set the cookie header with VALUE
	-b --browser  NAME	what browser to emulate
	-u --user     USER:PW	basic authentification as USER:PW
	-p --post     STRING	use STRING as a post form contents (forms of
				type application/x-www-form-urlencoded only)
	-t --time     N		use Benchmark module to time making
				request(s) N times
	-w --wait     N		wait N seconds between fetching each URL

	--help                  show this help and exit
	--version               print version and exit
	--emulations            print list of available emulations
	--languages             print a sample of language codes

Note: If -H (--host) is used with multiple URLs, all connections are
      made to the specified HOST (and port) even if different hosts
      are used in the URLs. This can be used to fetch files through
      a HTTP proxy if -l (--long) is also used.

      With -L (--langauge) the Accept-Language: header will not be
      added if the browser has not been observed to use it.
EndUsage

  exit($exit);
} # end &usage 

sub usage_languages() {
  print <<'LanguageRef';
In HTTP standard languages have a two letter code, with an optional
two letter country code qualifier. English is 'en', but American
English is 'en-us', Irish English is 'en-ie', Australian English is
'en-au'.

Some other lanuages:
  af	Afrikaans
  sq	Albanian
  eu	Basque
  bg	Bulgarian
  be	Byelorussian
  ca	Catalan
  zh	Chinese
  zh-cn	Chinese/China
  zh-tw	Chinese/Taiwan
  hr	Croatian
  cs	Czech
  da	Danish
  nl	Dutch
  nl-be	Dutch/Belgium
  fo	Faeroese
  fi	Finnish
  fr	French
  fr-be	French/Belgium
  fr-ca	French/Canada
  fr-fr	French/France
  fr-ch	French/Switzerland
  gl	Galician
  de	German
  de-at	German/Austria
  de-de	German/Germany
  de-ch	German/Switzerland
  el	Greek
  hu	Hungarian
  is	Icelandic
  id	Indonesian
  ga	Irish
  it	Italian
  ja	Japanese
  ko	Korean
  mk	Macedonian
  no	Norwegian
  pl	Polish
  pt	Portuguese
  pt-br	Portuguese/Brazil
  ro	Romanian
  ru	Russian
  gd	Scots Gaelic
  sr	Serbian
  sk	Slovak
  sl	Slovenian
  es	Spanish
  es-ar	Spanish/Argentina
  es-co	Spanish/Colombia
  ex-mx	Spanish/Mexico
  es-es	Spanish/Spain
  sv	Swedish
  tr	Turkish
  uk	Ukrainian

This list is from the default set of lanuages in Netscape 4.5.
IE has a different set, including more country variations.

LanguageRef
}

sub usage_emulations() {
  my $key;
  my @keys = sort {$a cmp $b} keys %headers;
  my $k = scalar @keys;

  print "The following $k browsers are recognized for header emulation:\n";
  foreach $key (@keys) {
    print "\t$key\n" if length($headers{$key});
  }

}

#####################################################
# For managing cookies, a monster.
sub monster ($$) {
  my $host = shift;
  my $reqref = shift;

  return unless defined($$reqref) and length($$reqref);
  if ($host =~ /\.doubleclick\./) {
    $$reqref =~ s/\cjCookie:[^\cm\cj]*/\cjX-Monster: doubleclick cookie eaten/gi;
  } elsif ($host =~ /^(ads|adforce|adserv[er]*)\./i) {
    $$reqref =~ s/\cjCookie:[^\cm\cj]*/\cjX-Monster: $1.* host cookie eaten/gi;
  }

} # end &monster 

sub err444 ($$$) {
  my $why = shift;
  my ($phead, $pbody) = @_;

  my $return;
($return = <<"444ErrorHead") =~ s/\cj/\cm\cj/g;
HTTP/1.0 444 Not Found
X-Declined: $why
Content-Type: text/html
Content-Length: 28

444ErrorHead

  my $body;
$body = <<"444ErrorBody";
<html><head><title>Error 444</title></head><body>
<h1>Error 444 Not Found</h1>
<p>$why</p>
</body></html>
444ErrorBody

  print $return if $phead;
  print $body   if $pbody;

  return($return);
} # end &err444

# This code stolen from MIME::Base64's perl-only backup. The XS
# version is much faster, but I don't want to assume it is installed.
sub base64 ($) {
    my $res = "";
    my $eol = "\n";
    pos($_[0]) = 0;                          # ensure start at the beginning
    while ($_[0] =~ /(.{1,45})/gs) {
        $res .= substr(pack('u', $1), 1);
        chop($res);
    }
    $res =~ tr|` -_|AA-Za-z0-9+/|;               # `# help emacs
    # fix padding at the end
    my $padding = (3 - length($_[0]) % 3) % 3;
    $res =~ s/.{$padding}$/'=' x $padding/e if $padding;
    # break encoded string into lines of no more than 76 characters each
    if (length $eol) {
        $res =~ s/(.{1,76})/$1$eol/g;
    }
    $res;
} # end &base64 

__END__
Benchmarking:
$ bget -t 1000 -B http://localhost/
timethis 1000: 53 wallclock secs (17.52 usr +  1.44 sys = 18.96 CPU) @ 52.74/s (n=1000)
$ /tmp/lwp-bm http://localhost/
timethis 1000: 52 wallclock secs (14.66 usr +  1.77 sys = 16.43 CPU) @ 60.86/s (n=1000)
$ cat /tmp/lwp-bm
#!/usr/bin/perl -w

use LWP::UserAgent; 
use HTTP::Request; 
use HTTP::Response; 
use Benchmark;

my $raw_url = shift or die "usage: $0 url\n"; 
my $url = $raw_url; #URI::Heuristic::uf_urlstr($raw_url);
$| = 1;                                  
my $ua = LWP::UserAgent->new(); 

timethis(1000, sub {
my $req = HTTP::Request->new(GET => $url); 

my $response = $ua->request($req);
});
$


From: helgi@NOSPAMdecode.is (Helgi Briem)
Newsgroups: comp.lang.perl.misc
Subject: Re: Faster than LWP
Date: Wed, 13 Dec 2000 16:50:39 GMT
Reply-To: helgi@NOSPAMdecode.is
Message-ID: <3a37a450.537807185@news.itn.is>
References: <m2aea1ygmh.fsf@rt158.private.realtime.co.uk>
	<28424-3A36E39B-22@storefull-247.iap.bryant.webtv.net>

[...]
Using LWP is easy and lightning fast.  Your problem almost 
[...]
This code, slightly modified from the Perl Cookbook works 
for me every day and is lightning fast for either ftp or 
http, substitute your own proxy server and port number.:

Regards,
Helgi Briem

#!/usr/bin/perl -w 

use LWP::UserAgent; 
use HTTP::Request; 
use HTTP::Response; 
use URI::Heuristic;

my $raw_url = shift or die "usage: $0 url\n"; 
my $url = URI::Heuristic::uf_urlstr($raw_url);
$| = 1;                                  
printf "%s =>\n\t", $url;
my $ua = LWP::UserAgent->new(); 
$ua->proxy(['http', 'ftp'] =>
'http://MYPROXY.DOMAIN.COM:80');

my $req = HTTP::Request->new(GET => $url); 

my $response = $ua->request($req);
if ($response->is_error()) {
     printf " %s\n", $response->status_line;
 } else {
     my $count;
     my $bytes;
     my $content = $response->content();
     $bytes = length $content;
     $count = ($content =~ tr/\n/\n/);
     printf "%s (%d lines, %d bytes)\n", $response->content;
} 





=head1 NAME

bget - basic HTTP get tool

=head1 DESCRIPTION

Basic tool to make HTTP GET requests and monitor the results.
Unlike LWP GET, it does not require special Perl modules, and
by virtue of being cruder makes HTTP headers easier to spy on.

Only URLs of the forms 

     http://hostname/[localpart]
     http://hostname:port/[localpart]

are supported.

Options:

=over 4

=item *

-a --autoname	

Save output automatically based on URI.

=item *

-B --no-body	

Don't print the body of the response.
 
=item *

-f --follow

Follow redirects. If printing headers, the redirecting headers and
the destination headers will be printed. (No loop detection is
attempted.) If printing bodies and not saving via autoname, the
redirecting body and the destination body will be printed. If
saving via autoname, a new file will be opened for each request
made.  Some redirects (eg loops) may cause the autonaming to pick
the same filename as a previous request, which will cause the
earlier file to be clobbered.

=item *

-h --heads	

Print the response headers.

=item *

-l --long	

Use long address on GET line (using the
full http://... format, a MUST for HTTP/1.1
server compliance but handy with I<-H> for
proxies).

=item *

-r --request	

Print the request headers.

=item *

-H --host     HOST[:P] 

Connect to HOST for request (useful for
testing virtual hosts before a DNS change or
use with I<-l> for proxies).

=item *

-L --language LANG

Use LANG for Accept-Language: header. See
I<--languages> for a small list.

=item *

-R --refer    VALUE

Set the referer header with VALUE.

=item *

-c --cookie   VALUE

Set the cookie header with VALUE.

=item *

-b --browser  NAME

What browser to emulate. Use I<--emulations> to list
available browser headers.

=item *

-u --user     USER:PW

Basic authentification in the form {username}:{password}.

=item *

-p --post     STRING

Use STRING as a post form contents (forms of
type application/x-www-form-urlencoded only).

=item *

-t --time     N	

Use Benchmark module to time making the command line
request(s) N times.

=item *

-w --wait     N	

Wait N seconds between fetching each URL.

=item *

--help                 

Show a help message and exit.

=item *

--version              

Print version and exit.

=item *

--emulations           

Print list of available browser emulations.

=item *

--languages            

Print a sample of language codes.

=back

=head2 Note

If I<-H> (I<--host>) is used with multiple URLs, all connections are
made to the specified HOST (and port) even if different hosts
are used in the URLs. This can be used to fetch files through
a HTTP proxy if I<-l> (I<--long>) is also used.

With I<-L> (I<--langauge>) the Accept-Language: header will not be
added if the browser has not been observed to use it.

=head1 EMULATIONS

The following browsers are recognized for header emulation. This
might not be the definitive list. Check I<--emulations> for that.
Some have comments to help identify them.

=over 4

=item *

links-0.84

Text mode browser for Unix.
E<lt>http://artax.karlin.mff.cuni.cz/~mikulas/linksE<gt>
Version 0.84 does not do cookies or referer headers, so we might
misemulate it that way.

=item *

w3m-beta99

Text mode browser for Unix.
E<lt>http://ei5nazha.yz.yamagata-u.ac.jp/~aito/w3m/E<gt>
 
=item *

ApacheBench-1.3

ab, the benchmark tool that comes with the Apache httpd package.

=item *

Opera-3.60

Popular alternative browser for Windows.

=item *

lwp-request-1.38

Lib WWW Perl module (these are the default headers).

=item *

iCab-pre1.7

Popular alternative browser for Macs.

=item *

Lynx-2.8.1

Popular text mode browser, predominately unix.

=item *

WindowsNT-Explorer-5.0-as-4.0

Explorer 5.0 can be installed with a compatibility mode that emulates
(or claims to emaulate) Explorer 4.0.

=item *

Windows98-Explorer-5.5

=item *

WindowsNT-ActiveDesktop

This is on a system with IE5.5 installed, but this identifies
itself as IE4.01. This one is hard to do right, since in my
tests I saw two requests for the test file. The first came
with this UA, the second had this instead:

User-Agent: Mozilla/4.0 (compatible; MSIE 4.01; MSIECrawler; Windows NT)

The crawler version had an 'Accept-Language: us-en' as well as a
different order to the headers (Accept: User-Agent:, Accept-Language:
Accept-Encoding, Host:).

=item *

WindowsNT-Netscape6

=item *

WindowsNT-Explorer-5.5

=item *

Windows98-Explorer-4.0

=item *

WindowsNT-Explorer-5.0

Normal mode Windows NT IE 5.0.

=item *

WindowsNT-ExplorerOffline-5.0

IE can optional crawl pages to cache them for offline browsing.
This is Windows NT IE 5.01 in crawl mode.

=item *

WindowsNT-Netscape-4.6

=item *

MacPPC-Explorer-4.0

=item *

MacPPC-Netscape-4.0

=item *

MacPPC-Netscape-4.6

=item *

Linux-Netscape-3.0

=item *

Linux-Netscape-4.51

=back

=head1 LANGUAGES

In HTTP standard languages have a two letter code, with an optional
two letter country code qualifier. English is 'en', but American
English is 'en-us', Irish English is 'en-ie', Australian English is
'en-au'.

Some other lanuages:

  af	Afrikaans
  sq	Albanian
  eu	Basque
  bg	Bulgarian
  be	Byelorussian
  ca	Catalan
  zh	Chinese
  zh-cn	Chinese/China
  zh-tw	Chinese/Taiwan
  hr	Croatian
  cs	Czech
  da	Danish
  nl	Dutch
  nl-be	Dutch/Belgium
  fo	Faeroese
  fi	Finnish
  fr	French
  fr-be	French/Belgium
  fr-ca	French/Canada
  fr-fr	French/France
  fr-ch	French/Switzerland
  gl	Galician
  de	German
  de-at	German/Austria
  de-de	German/Germany
  de-ch	German/Switzerland
  el	Greek
  hu	Hungarian
  is	Icelandic
  id	Indonesian
  ga	Irish
  it	Italian
  ja	Japanese
  ko	Korean
  mk	Macedonian
  no	Norwegian
  pl	Polish
  pt	Portuguese
  pt-br	Portuguese/Brazil
  ro	Romanian
  ru	Russian
  gd	Scots Gaelic
  sr	Serbian
  sk	Slovak
  sl	Slovenian
  es	Spanish
  es-ar	Spanish/Argentina
  es-co	Spanish/Colombia
  ex-mx	Spanish/Mexico
  es-es	Spanish/Spain
  sv	Swedish
  tr	Turkish
  uk	Ukrainian

This list is from the default set of lanuages in Netscape 4.5.
IE has a different set, including more country variations.

=head1 OSNAMES

A unix-like directory structure is assumed.

=head1 COPYRIGHT

Copyright 1999 by Eli the Bearded / Benjamin Elijah Griffin.
Released under the same license(s) as Perl.

=head1 AUTHOR

Eli the Bearded originally wrote this to spy on headers and have a
low cpu impact way to fetch files over http. It evolved from there.

=head1 CPAN INFO

=head1 SCRIPT CATEGORIES

Web

=head1 README

bget - basic HTTP get tool

=head1 PREREQUISITES

This uses the C<strict>, C<vars>, C<Socket>, and C<Carp> modules.

=head1 COREQUISITES

This will try to use the C<Benchmark> module when run with certain
options.

=head1 OSNAMES

Should not be OS dependent. The autoname feature (-a / --autoname)
assumes that C</> seperates directories, however this should have
minimal impact since it always tries to save in the currrent directory. 
Problems will only ensue if the automatically chosen name contains
a directory seperator for the current OS.

=cut