Diffstat (limited to 'win32/bin/webget.pl')
-rw-r--r--  win32/bin/webget.pl  1091
1 files changed, 1091 insertions, 0 deletions
diff --git a/win32/bin/webget.pl b/win32/bin/webget.pl
new file mode 100644
index 0000000000..3d72208cb2
--- /dev/null
+++ b/win32/bin/webget.pl
@@ -0,0 +1,1091 @@
+#!/usr/local/bin/perl -w
+
+#-
+#!/usr/local/bin/perl -w
+$version = "951219.19";
+$comments = 'jfriedl@omron.co.jp';
+
+##
+## This is "webget"
+##
+## Jeffrey Friedl (jfriedl@omron.co.jp), July 1994.
+## Copyright 19.... ah hell, just take it.
+## Should work with either perl4 or perl5
+##
+## BLURB:
+## Given a URL on the command line (HTTP and FTP supported at the moment),
+## webget fetches the named object (HTML text, images, audio, whatever the
+## object happens to be). Will automatically use a proxy if one is defined
+## in the environment, follow "this URL has moved" responses, and retry
+## "can't find host" responses from a proxy in case host lookup was slow).
+## Supports users & passwords (FTP), Basic Authorization (HTTP), update-if-
+## modified (HTTP), and much more. Works with perl4 or perl5.
+
+##
+## More-detailed instructions in the comment block below the history list.
+##
+
+##
+## To-do:
+## Add gopher support.
+## Fix up how error messages are passed among this and the libraries.
+##
+
+## 951219.19
+## Lost ftp connections now die with a bit more grace.
+##
+## 951121.18
+## Add -nnab.
+## Brought the "usage" string in line with reality.
+##
+## 951114.17
+## Added -head.
+## Added -update/-refresh/-IfNewerThan. If any URL was not pulled
+## because it was not out of date, an exit value of 2 is returned.
+##
+## 951031.16
+## Added -timeout. Cleaned up (a bit) the exit value. Now exits
+## with 1 if all URLs had some error (timeout exits immediately with
+## code 3, though. This is subject to change). Exits with 0 if any
+## URL was brought over safely.
+##
+## 951017.15
+## Neat -pf, -postfile idea from Lorrie Cranor
+## (http://www.ccrc.wustl.edu/~lorracks/)
+##
+## 950912.14
+## Sigh, fixed a typo.
+##
+## 950911.13
+## Added Basic Authorization support for http. See "PASSWORDS AND STUFF"
+## in the documentation.
+##
+## 950911.12
+## Implemented a most-excellent suggestion by Anthony D'Atri
+## (aad@nwnet.net), to be able to automatically grab to a local file of
+## the same name as the URL. See the '-nab' flag.
+##
+## 950706.11
+## Quelled small -w warning (thanks: Lars Rasmussen <gnort@daimi.aau.dk>)
+##
+## 950630.10
+## Steve Campbell to the rescue again. FTP now works when supplied
+## with a userid & password (eg ftp://user:pass@foo.bar.com/index.txt).
+##
+## 950623.9
+## Incorporated changes from Steve Campbell (steven_campbell@uk.ibm.com)
+## so that the ftp will work when no password is required of a user.
+##
+## 950530.8
+## Minor changes:
+## Eliminate read-size warning message when size unknown.
+## Pseudo-debug/warning messages at the end of debug_read now go to
+## stderr. Some better error handling when trying to contact systems
+## that aren't really set up for ftp. Fixed a bug concerning FTP access
+## to a root directory. Added proxy documentation at head of file.
+##
+## 950426.6,7
+## Complete Overhaul:
+## Renamed from httpget. Added ftp support (very sketchy at the moment).
+## Redid to work with new 'www.pl' library; chucked 'Www.pl' library.
+## More or less new and/or improved in many ways, but probably introduced
+## a few bugs along the way.
+##
+## 941227.5
+## Added follow stuff (with -nofollow, etc.)
+## Added -updateme. Cool!
+## Some general tidying up.
+##
+## 941107.4
+## Allowed for ^M ending a header line... PCs give those kinds of headers.
+##
+## 940820.3
+## First sorta'clean net release.
+##
+##
+
+##
+##>
+##
+## Fetch http and/or ftp URL(s) given on the command line and spit to
+## STDOUT.
+##
+## Options include:
+## -V, -version
+## Print version information; exit.
+##
+## -p, -post
+## If the URL looks like a reply to a form (i.e. has a '?' in it),
+## the request is POST'ed instead of GET'ed.
+##
+## -head
+## Gets the header only (for HTTP). This might include such useful
+## things as 'Last-modified' and 'Content-length' fields
+## (a lack of a 'Last-modified' might be a good indication that it's
+## a CGI).
+##
+## The "-head" option implies "-nostrip", but does *not* imply,
+## for example "-nofollow".
+##
+##
+## -pf, -postfile
+## The item after the '?' is taken as a local filename, and the contents
+## are POST'ed as with -post
+##
+## -nab, -f, -file
+## Rather than spit the URL(s) to standard output, unconditionally
+## dump to a file (or files) whose name is the one used in the URL,
+## sans path. I like '-nab', but supply '-file' as well since that's
+## what was originally suggested. Also see '-update' below for the
+## only-if-changed version.
+##
+## -nnab
+## Like -nab, but in addition to dumping to a file, dump to stdout as well.
+## Sort of like the 'tee' command.
+##
+## -update, -refresh
+## Do the same thing as -nab, etc., but does not bother pulling the
+## URL if it is no newer than the local file. Only applies to HTTP.
+## Uses the HTTP "If-Modified-Since" field. If the URL was not modified
+## (and hence not fetched), the exit value is '2'.
+##
+## -IfNewerThan FILE
+## -int FILE
+## Only pulls URLs if they are newer than the date the local FILE was
+## last written.
+##
+## -q, -quiet
+## Suppresses all non-essential informational messages.
+##
+## -nf, -nofollow
+## Normally, a "this URL has moved" HTTP response is automatically
+## followed. Not done with -nofollow.
+##
+## -nr, -noretry
+## Normally, an HTTP proxy response of "can't find host" is retried
+## up to three times, to give the remote hostname lookup time to
+## come back with an answer. This suppresses the retries. This is the
+## same as '-retry 0'.
+##
+## -r#, -retry#, -r #, -retry #
+## Sets the number of times to retry. Default 3.
+##
+## -ns, -nostrip
+## For HTTP items (including other items going through an HTTP proxy),
+## the HTTP response header is printed rather than stripped as default.
+##
+## -np, -noproxy
+## A proxy is not used, even if defined for the protocol.
+##
+## -h, -help
+## Show a usage message and exit.
+##
+## -d, -debug
+## Show some debugging messages.
+##
+## -updateme
+## The special and rather cool flag "-updateme" will see if webget has
+## been updated since you got your version, and prepare a local
+## copy of the new version for you to use. Keep updated! (although
+## you can always ask to be put on the ping list to be notified when
+## there's a new version -- see the author's perl web page).
+##
+## -timeout TIMESPAN
+## -to TIMESPAN
+## Time out if a connection can not be made within the specified time
+## period. TIMESPAN is normally in seconds, although an 'm' or 'h' may
+## be appended to indicate minutes or hours. "-to 1.5m" would time out
+## after 90 seconds.
+##
+## (At least for now), a timeout causes immediate program death (with
+## exit value 3). For some reason, the alarm doesn't always cause a
+## waiting read or connect to abort, so I just die immediately.. /-:
+##
+## I might consider adding an "entire fetch" timeout, if someone
+## wants it.
+##
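+## EXAMPLES
+##
+## A few representative invocations (the hostnames and filenames here
+## are made up, but the flags are all described above):
+##
+##     webget http://www.somesite.com/page.html > page.html
+##     webget -nab 'ftp://ftp.somesite.com/pub/pkg.tar.gz'
+##     webget -update http://www.somesite.com/page.html
+##     webget -to 1.5m -q http://www.somesite.com/big/archive.tar.gz
+##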
+## PASSWORDS AND SUCH
+##
+## You can use webget to do FTP fetches from non-Anonymous systems and
+## accounts. Just put the required username and password into the URL,
+## as with
+## webget 'ftp://user:password@ftp.somesite.com/pub/pix/babe.gif'
+##               ^^^^^^^^^^^^^
+## Note the user:password is separated from the hostname by a '@'.
+##
+## You can use the same kind of thing with HTTP, and if so it will provide
+## what's known as Basic Authorization. This is >weak< authorization. It
+## also provides >zero< security -- I wouldn't be sending any credit-card
+## numbers this way (unless you send them 'round my way :-). It seems to
+## be used most by providers of free stuff where they want to make some
+## attempt to limit access to "known users".
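+##
+## For example (hypothetical host and account):
+##
+##     webget 'http://joe:secret@www.somesite.com/members/page.html'
+##
+## The 'joe:secret' pair is sent in an "Authorization: Basic ..." request
+## header, base64-encoded -- encoded, mind you, not encrypted.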
+##
+## PROXY STUFF
+##
+## If you need to go through a gateway to get out to the whole internet,
+## you can use a proxy if one's been set up on the gateway. This is done
+## by setting the "http_proxy" environment variable to point to the
+## proxy server. Other variables are used for other target protocols....
+## "gopher_proxy", "ftp_proxy", "wais_proxy", etc.
+##
+## For example, I have the following in my ".login" file (for use with csh):
+##
+## setenv http_proxy http://local.gateway.machine:8080/
+##
+## This is to indicate that any http URL should go to local.gateway.machine
+## (port 8080) via HTTP. Additionally, I have
+##
+## setenv gopher_proxy "$http_proxy"
+## setenv wais_proxy "$http_proxy"
+## setenv ftp_proxy "$http_proxy"
+##
+## This means that any gopher, wais, or ftp URL should also go to the
+## same place, also via HTTP. This allows webget to get, for example,
+## GOPHER URLs even though it doesn't support GOPHER itself. It uses HTTP
+## to talk to the proxy, which then uses GOPHER to talk to the destination.
+##
+## Finally, if there are sites inside your gateway that you would like to
+## connect to, you can list them in the "no_proxy" variable. This will allow
+## you to connect to them directly and skip going through the proxy:
+##
+## setenv no_proxy "www.this,www.that,www.other"
+##
+## I (jfriedl@omron.co.jp) have little personal experience with proxies
+## except what I deal with here at Omron, so if this is not representative
+## of your situation, please let me know.
+##
+## RETURN VALUE
+## The value returned to the system by webget is rather screwed up because
+## I didn't think about dealing with it until things were already
+## complicated. Since there can be more than one URL on the command line,
+## it's hard to decide what to return when one times out, another is fetched,
+## another doesn't need to be fetched, and a fourth isn't found.
+##
+## So, here's the current status:
+##
+## Upon any timeout (via the -timeout arg), webget immediately
+## returns 3. End of story. Otherwise....
+##
+## If any URL was fetched with a date limit (i.e. via
+## '-update/-refresh/-IfNewerThan') and was found not to have changed,
+## 2 is returned. Otherwise....
+##
+## If any URL was successfully fetched, 0 is returned. Otherwise...
+##
+## If there were any errors, 1 is returned. Otherwise...
+##
+## Must have been an info-only or do-nothing instance. 0 is returned.
+##
+## Phew. Hopefully useful to someone.
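+##
+## For example, a perl wrapper could sort out the cases like so
+## (a sketch; the URL is hypothetical):
+##
+##     system('webget', '-update', 'http://www.somesite.com/page.html');
+##     $status = $? >> 8;
+##     if    ($status == 2) { print "not modified\n"; }
+##     elsif ($status == 3) { print "timed out\n"; }
+##     elsif ($status == 0) { print "fetched\n"; }
+##     else                 { print "some error\n"; }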
+##<
+##
+
+## Where latest version should be.
+$WEB_normal = 'http://www.wg.omron.co.jp/~jfriedl/perl/webget';
+$WEB_inlined = 'http://www.wg.omron.co.jp/~jfriedl/perl/inlined/webget';
+
+
+require 'network.pl'; ## inline if possible (directive to a tool of mine)
+require 'www.pl'; ## inline if possible (directive to a tool of mine)
+$inlined=0; ## this might be changed by the inline thing.
+
+##
+## Exit values. All screwed up.
+##
+$EXIT_ok = 0;
+$EXIT_error = 1;
+$EXIT_notmodified = 2;
+$EXIT_timeout = 3;
+
+##
+##
+
+warn qq/WARNING:\n$0: need a newer version of "network.pl"\n/ if
+ !defined($network'version) || $network'version < "950311.5";
+warn qq/WARNING:\n$0: need a newer version of "www.pl"\n/ if
+ !defined($www'version) || $www'version < "951114.8";
+
+$WEB = $inlined ? $WEB_inlined : $WEB_normal;
+
+$debug = 0;
+$strip = 1; ## default is to strip
+$quiet = 0; ## also normally off.
+$follow = 1; ## normally, we follow "Found (302)" links
+$retry = 3; ## normally, retry proxy hostname lookups up to 3 times.
+$nab = 0; ## If true, grab to a local file of the same name.
+$refresh = 0; ## If true, use 'If-Modified-Since' with -nab get.
+$postfile = 0; ## If true, filename is given after the '?'
+$defaultdelta2print = 2048;
+$TimeoutSpan = 0; ## seconds after which we should time out.
+
+while (@ARGV && $ARGV[0] =~ m/^-/)
+{
+ $arg = shift(@ARGV);
+
+ $nab = 1, next if $arg =~ m/^-f(ile)?$/;
+ $nab = 1, next if $arg =~ m/^-nab$/;
+ $nab = 2, next if $arg =~ m/^-nnab$/;
+ $post = 1, next if $arg =~ m/^-p(ost)?$/i;
+ $post = $postfile = 1, next if $arg =~ m/^-p(ost)?f(ile)?$/i;
+ $quiet=1, next if $arg =~ m/^-q(uiet)?$/;
+ $follow = 0, next if $arg =~ m/^-no?f(ollow)?$/;
+ $strip = 0, next if $arg =~ m/^-no?s(trip)?$/;
+ $debug=1, next if $arg =~ m/^-d(ebug)?$/;
+ $noproxy=1, next if $arg =~ m/^-no?p(roxy)?$/;
+ $retry=0, next if $arg =~ m/^-no?r(etry)?$/;
+ $retry=$2, next if $arg =~ m/^-r(etry)?(\d+)$/;
+ &updateme if $arg eq '-updateme';
+ $strip = 0, $head = 1, next if $arg =~ m/^-head(er)?$/;
+ $nab = $refresh = 1, next if $arg =~ m/^-(refresh|update)$/;
+
+ &usage($EXIT_ok) if $arg =~ m/^-h(elp)?$/;
+ &show_version, exit($EXIT_ok) if $arg eq '-version' || $arg eq '-V';
+
+ if ($arg =~ m/^-t(ime)?o(ut)?$/i) {
+ local($num) = shift(@ARGV);
+ &usage($EXIT_error, "expecting timespan argument to $arg\n") unless
+ $num =~ m/^\d+(\.\d*)?[hms]?$/;
+ &timeout_arg($num);
+ next;
+ }
+
+ if ($arg =~ m/^-if?n(ewer)?t(han)?$/i) {
+ $reference_file = shift(@ARGV);
+ &usage($EXIT_error, "expecting filename arg to $arg")
+ if !defined $reference_file;
+ if (!-f $reference_file) {
+ warn qq/$0: ${arg}'s "$reference_file" not found.\n/;
+ exit($EXIT_error);
+ }
+ next;
+ }
+
+ if ($arg eq '-r' || $arg eq '-retry') {
+ local($num) = shift(@ARGV);
+ &usage($EXIT_error, "expecting numerical arg to $arg\n") unless
+ defined($num) && $num =~ m/^\d+$/;
+ $retry = $num;
+ next;
+ }
+ &usage($EXIT_error, qq/$0: unknown option "$arg"\n/);
+}
+
+if ($head && $post) {
+ warn "$0: combining -head and -post makes no sense, ignoring -post.\n";
+ $post = 0;
+ undef $postfile;
+}
+
+if ($refresh && defined($reference_file)) {
+ warn "$0: combining -update and -IfNewerThan make no sense, ignoring -IfNewerThan.\n";
+ undef $reference_file;
+}
+
+if (@ARGV == 0) {
+ warn "$0: nothing to do. Use -help for info.\n";
+ exit($EXIT_ok);
+}
+
+
+##
+## Now run through the remaining arguments (mostly URLs) and do a quick
+## check to see if they look well-formed. We won't *do* anything -- just
+## want to catch quick errors before really starting the work.
+##
+@tmp = @ARGV;
+$errors = 0;
+while (@tmp) {
+ $arg = shift(@tmp);
+ if ($arg =~ m/^-t(ime)?o(ut)?$/) {
+ local($num) = shift(@tmp);
+ if ($num !~ m/^\d+(\.\d*)?[hms]?$/) {
+ &warn("expecting timespan argument to $arg\n");
+ $errors++;
+ }
+ } else {
+ local($protocol) = &www'grok_URL($arg, $noproxy);
+
+ if (!defined $protocol) {
+ warn qq/can't grok "$arg"/;
+ $errors++;
+ } elsif (!$quiet && ($protocol eq 'ftp')) {
+ warn qq/warning: -head ignored for ftp URLs\n/ if $head;
+ warn qq/warning: -refresh ignored for ftp URLs\n/ if $refresh;
+ warn qq/warning: -IfNewerThan ignored for ftp URLs\n/ if defined($reference_file);
+
+ }
+ }
+}
+
+exit($EXIT_error) if $errors;
+
+
+$SuccessfulCount = 0;
+$NotModifiedCount = 0;
+
+##
+## Now do the real thing.
+##
+while (@ARGV) {
+ $arg = shift(@ARGV);
+ if ($arg =~ m/^-t(ime)?o(ut)?$/) {
+ &timeout_arg(shift(@ARGV));
+ } else {
+ &fetch_url($arg);
+ }
+}
+
+if ($NotModifiedCount) {
+ exit($EXIT_notmodified);
+} elsif ($SuccessfulCount) {
+ exit($EXIT_ok);
+} else {
+ exit($EXIT_error);
+}
+
+###########################################################################
+###########################################################################
+
+sub timeout_arg
+{
+ ($TimeoutSpan) = @_;
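+ ## TIMESPAN looks like "90", "90s", "1.5m", or "2h". Only an 's' is
+ ## actually stripped; for 'm' and 'h' the multiply below relies on
+ ## perl numification ("1.5m" * 60 is 90), so the letter can stay.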
+ $TimeoutSpan =~ s/s//;
+ $TimeoutSpan *= 60 if $TimeoutSpan =~ m/m/;
+ $TimeoutSpan *= 3600 if $TimeoutSpan =~ m/h/;
+
+}
+
+##
+## As a byproduct, returns the basename of $0.
+##
+sub show_version
+{
+ local($base) = $0;
+ $base =~ s,.*/,,;
+ print STDERR "This is $base version $version\n";
+ $base;
+}
+
+##
+## &usage(exitval, message);
+##
+## Prints a usage message to STDERR.
+## If MESSAGE is defined, prints that first.
+## If exitval is defined, exits with that value. Otherwise, returns.
+##
+sub usage
+{
+ local($exit, $message) = @_;
+
+ print STDERR $message if defined $message;
+ local($base) = &show_version;
+ print STDERR <<INLINE_LITERAL_TEXT;
+usage: $0 [options] URL ...
+ Fetches and displays the named URL(s). Supports http and ftp.
+ (if no protocol is given, a leading "http://" is normally used).
+
+Options are from among:
+ -V, -version Print version information; exit.
+ -p, -post If URL looks like a form reply, does POST instead of GET.
+ -pf, -postfile Like -post, but takes everything after ? to be a filename.
+ -q, -quiet All non-essential informational messages are suppressed.
+ -nf, -nofollow Don't follow "this document has moved" replies.
+ -nr, -noretry Doesn't retry a failed hostname lookup (same as -retry 0)
+ -r #, -retry # Sets failed-hostname-lookup-retry to # (default $retry)
+ -np, -noproxy Uses no proxy, even if one defined for the protocol.
+ -ns, -nostrip The HTTP header, normally elided, is printed.
+ -head gets item header only (implies -ns)
+ -nab, -file Dumps output to file whose name is taken from URL, minus path
+ -nnab Like -nab, but *also* dumps to stdout.
+ -update HTTP only. Like -nab, but only if the page has been modified.
+ -h, -help Prints this message.
+ -IfNewerThan F HTTP only. Only brings page if it is newer than named file.
+ -timeout T Fail if a connection can't be made in the specified time.
+
+ -updateme Pulls the latest version of $base from
+ $WEB
+ and reports if it is newer than your current version.
+
+Comments to $comments.
+INLINE_LITERAL_TEXT
+
+ exit($exit) if defined $exit;
+}
+
+##
+## Pull the latest version of this program to a local file.
+## Clip the first couple lines from this executing file so that we
+## preserve the local invocation style.
+##
+sub updateme
+{
+ ##
+ ## Open a temp file to hold the new version,
+ ## redirecting STDOUT to it.
+ ##
+ open(STDOUT, '>'.($tempFile="/tmp/webget.new")) ||
+ open(STDOUT, '>'.($tempFile="/usr/tmp/webget.new")) ||
+ open(STDOUT, '>'.($tempFile="/webget.new")) ||
+ open(STDOUT, '>'.($tempFile="webget.new")) ||
+ die "$0: can't open a temp file.\n";
+
+ ##
+ ## See if we can figure out how we were called.
+ ## The seek will rewind not to the start of the data, but to the
+ ## start of the whole program script.
+ ##
+ ## Keep the first line if it begins with #!, and the next two if they
+ ## look like the trick mentioned in the perl man page for getting
+ ## around the lack of #!-support.
+ ##
+ if (seek(DATA, 0, 0)) { ##
+ $_ = <DATA>; if (m/^#!/) { print STDOUT;
+ $_ = <DATA>; if (m/^\s*eval/) { print STDOUT;
+ $_ = <DATA>; if (m/^\s*if/) { print STDOUT; }
+ }
+ }
+ print STDOUT "\n#-\n";
+ }
+
+ ## Go get the latest one...
+ local(@options);
+ push(@options, 'head') if $head;
+ push(@options, 'nofollow') unless $follow;
+ push(@options, ('retry') x $retry) if $retry;
+ push(@options, 'quiet') if $quiet;
+ push(@options, 'debug') if $debug;
+ local($status, $memo, %info) = &www'open_http_url(*IN, $WEB, @options);
+ die "fetching $WEB:\n $memo\n" unless $status eq 'ok';
+
+ $size = $info{'content-length'};
+ while (<IN>)
+ {
+ $size -= length;
+ print STDOUT;
+ if (!defined $fetched_version && m/version\s*=\s*"([^"]+)"/) {
+ $fetched_version = $1;
+ &general_read(*IN, $size);
+ last;
+ }
+ }
+
+ $fetched_version = "<unknown>" unless defined $fetched_version;
+
+ ##
+ ## Try to update the mode of the temp file with the mode of this file.
+ ## Don't worry if it fails.
+ ##
+ chmod($mode, $tempFile) if $mode = (stat($0))[2];
+
+ $as_well = '';
+ if ($fetched_version eq $version)
+ {
+ print STDERR "You already have the most-recent version ($version).\n",
+ qq/FWIW, the newly fetched one has been left in "$tempFile".\n/;
+ }
+ elsif ($fetched_version <= $version)
+ {
+ print STDERR
+ "Mmm, your current version seems newer (?!):\n",
+ qq/ your version: "$version"\n/,
+ qq/ new version: "$fetched_version"\n/,
+ qq/FWIW, fetched one left in "$tempFile".\n/;
+ }
+ else
+ {
+ print STDERR
+ "Indeed, your current version was old:\n",
+ qq/ your version: "$version"\n/,
+ qq/ new version: "$fetched_version"\n/,
+ qq/The file "$tempFile" is ready to replace the old one.\n/;
+ print STDERR qq/Just do:\n % mv $tempFile $0\n/ if -f $0;
+ $as_well = ' as well';
+ }
+ print STDERR "Note that the libraries it uses may (or may not) need updating$as_well.\n"
+ unless $inlined;
+ exit($EXIT_ok);
+}
+
+##
+## Given a list of URLs, fetch'em.
+## Parses the URL and calls the routine for the appropriate protocol
+##
+sub fetch_url
+{
+ local(@todo) = @_;
+ local(%circref, %hold_circref);
+
+ URL_LOOP: while (@todo)
+ {
+ $URL = shift(@todo);
+ %hold_circref = %circref; undef %circref;
+
+ local($protocol, @args) = &www'grok_URL($URL, $noproxy);
+
+ if (!defined $protocol) {
+ &www'message(1, qq/can't grok "$URL"/);
+ next URL_LOOP;
+ }
+
+ ## call protocol-specific handler
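+ ## ($func is a symbolic call: protocol "http" dispatches to
+ ## &fetch_via_http, "ftp" to &fetch_via_ftp)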
+ $func = "fetch_via_" . $protocol;
+ $error = &$func(@args, $TimeoutSpan);
+ if (defined $error) {
+ &www'message(1, "$URL: $error");
+ } else {
+ $SuccessfulCount++;
+ }
+ }
+}
+
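+##
+## Return FILENAME's last-modified time as an HTTP-style date string
+## (quotes included), e.g.
+##     "Sunday, 06-Nov-94 08:49:37 GMT"
+## -- the format "If-Modified-Since" wants. Returns 0 if the file
+## can't be stat'ed.
+##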
+sub filedate
+{
+ local($filename) = @_;
+ local($filetime) = (stat($filename))[9];
+ return 0 if !defined $filetime;
+ local($sec, $min, $hour, $mday, $mon, $year, $wday) = gmtime($filetime);
+ return 0 if !defined $wday;
+ sprintf(qq/"%s, %02d-%s-%02d %02d:%02d:%02d GMT"/,
+ ("Sunday", "Monday", "Tuesdsy", "Wednesday",
+ "Thursday", "Friday", "Saturday")[$wday],
+ $mday,
+ ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[$mon],
+ $year,
+ $hour,
+ $min,
+ $sec);
+}
+
+sub local_filename
+{
+ local($filename) = @_;
+ $filename =~ s,/+$,,; ## remove any trailing slashes
+ $filename =~ s,.*/,,; ## remove any leading path
+ if ($filename eq '') {
+ ## empty -- pick a random name
+ $filename = "file0000";
+ ## look for a free random name.
+ $filename++ while -f $filename;
+ }
+ $filename;
+}
+
+sub set_output_file
+{
+ local($filename) = @_;
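+ ## On success, dup the real STDOUT to SAVEOUT (so -nnab can still tee
+ ## to it, and so close_output_file can restore it), then point STDOUT
+ ## at the named file.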
+ if (!open(OUT, ">$filename")) {
+ &www'message(1, "$0: can't open [$filename] for output");
+ } else {
+ open(SAVEOUT, ">>&STDOUT") || die "$!";
+ open(STDOUT, ">>&OUT");
+ }
+}
+
+sub close_output_file
+{
+ local($filename) = @_;
+ unless ($quiet)
+ {
+ local($note) = qq/"$filename" written/;
+ if (defined $error) {
+ $note .= " (possibly corrupt due to error above)";
+ }
+ &www'message(1, "$note.");
+ }
+ close(STDOUT);
+ open(STDOUT, ">&SAVEOUT");
+}
+
+sub http_alarm
+{
+ &www'message(1, "ERROR: $AlarmNote.");
+ exit($EXIT_timeout); ## the alarm doesn't seem to cause a waiting syscall to break?
+# $HaveAlarm = 1;
+}
+
+##
+## Given the host, port, and path, and (for info only) real target,
+## fetch via HTTP.
+##
+## If there is a user and/or password, use that for Basic Authorization.
+##
+## If $timeout is nonzero, time out after that many seconds.
+##
+sub fetch_via_http
+{
+ local($host, $port, $path, $target, $user, $password, $timeout) = @_;
+ local(@options);
+ local($local_filename);
+
+ ##
+ ## If we're posting, but -postfile was given, we need to interpret
+ ## the item in $path after '?' as a filename, and replace it with
+ ## the contents of the file.
+ ##
+ if ($postfile && $path =~ s/\?([\d\D]*)//) {
+ local($filename) = $1;
+ return("can't open [$filename] to POST") if !open(IN, "<$filename");
+ local($/) = undef; ## want to suck up the whole file.
+ $path .= '?' . <IN>;
+ close(IN);
+ }
+
+ $local_filename = &local_filename($path)
+ if $refresh || $nab || defined($reference_file);
+ $refresh = &filedate($local_filename) if $refresh;
+ $refresh = &filedate($reference_file) if defined($reference_file);
+
+ push(@options, 'head') if $head;
+ push(@options, 'post') if $post;
+ push(@options, 'nofollow') unless $follow;
+ push(@options, ('retry') x $retry) if $retry;
+ push(@options, 'quiet') if $quiet;
+ push(@options, 'debug') if $debug;
+ push(@options, "ifmodifiedsince=$refresh") if $refresh;
+
+ if (defined $password || defined $user) {
+ local($auth) = join(':', ($user || ''), ($password || ''));
+ push(@options, "authorization=$auth");
+ }
+
+ local($old_alarm);
+ if ($timeout) {
+ $old_alarm = $SIG{'ALRM'} || 'DEFAULT';
+ $SIG{'ALRM'} = "main'http_alarm";
+# $HaveAlarm = 0;
+ $AlarmNote = "host $host";
+ $AlarmNote .= ":$port" if $port != $www'default_port{'http'};
+ $AlarmNote .= " timed out after $timeout second";
+ $AlarmNote .= 's' if $timeout > 1;
+ alarm($timeout);
+ }
+ local($result, $memo, %info) =
+ &www'open_http_connection(*HTTP, $host,$port,$path,$target,@options);
+
+ if ($timeout) {
+ alarm(0);
+ $SIG{'ALRM'} = $old_alarm;
+ }
+
+# if ($HaveAlarm) {
+# close(HTTP);
+# $error = "timeout after $timeout second";
+# $error .= "s" if $timeout > 1;
+# return $error;
+# }
+
+ if ($follow && ($result eq 'follow')) {
+ %circref = %hold_circref;
+ $circref{$memo} = 1;
+ unshift(@todo, $memo);
+ return undef;
+ }
+
+
+ return $memo if $result eq 'error';
+ if (!$quiet && $result eq 'status' && ! -t STDOUT) {
+ #&www'message(1, "Warning: $memo");
+ $error = "Warning: $memo";
+ }
+
+ if ($info{'CODE'} == 304) { ## 304 is magic for "Not Modified"
+ close(HTTP);
+ &www'message(1, "$URL: Not Modified") unless $quiet;
+ $NotModifiedCount++;
+ return undef; ## no error
+ }
+
+
+ &set_output_file($local_filename) if $nab;
+
+ unless($strip) {
+ print $info{'STATUS'}, "\n", $info{'HEADER'}, "\n";
+
+ print SAVEOUT $info{'STATUS'}, "\n", $info{'HEADER'}, "\n" if $nab==2;
+ }
+
+ if (defined $info{'BODY'}) {
+ print $info{'BODY'};
+ print SAVEOUT $info{'BODY'} if $nab==2;
+ }
+
+ if (!$head) {
+ &general_read(*HTTP, $info{'content-length'});
+ }
+ close(HTTP);
+ &close_output_file($local_filename) if $nab;
+
+ $error; ## will be 'undef' if no error;
+}
+
+sub fetch_via_ftp
+{
+ local($host, $port, $path, $target, $user, $password, $timeout) = @_;
+ local($local_filename) = &local_filename($path);
+ local($ftp_debug) = $debug;
+ local(@password) = ($password);
+ $path =~ s,^/,,; ## remove a leading / from the path.
+ $path = '.' if $path eq ''; ## make sure we have something
+
+ if (!defined $user) {
+ $user = 'anonymous';
+ $password = $ENV{'USER'} || 'WWWuser';
+ @password = ($password.'@'. &network'addr_to_ascii(&network'my_addr),
+ $password.'@');
+ } elsif (!defined $password) {
+ @password = ("");
+ }
+
+ local($_last_ftp_reply, $_passive_host, $_passive_port);
+ local($size);
+
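+ ## Read one reply from the ftp control connection. A reply begins
+ ## with a three-digit code; a '-' after the code ("226-") marks a
+ ## multi-line reply, which runs until a line with the same code
+ ## followed by a space ("226 ").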
+ sub _ftp_get_reply
+ {
+ local($text) = scalar(<FTP_CONTROL>);
+ die "lost connection to $host\n" if !defined $text;
+ local($_, $tmp);
+ print STDERR "READ: $text" if $ftp_debug;
+ die "internal error: expected reply code in response from ".
+ "ftp server [$text]" unless $text =~ s/^(\d+)([- ])//;
+ local($code) = $1;
+ if ($2 eq '-') {
+ while (<FTP_CONTROL>) {
+ ($tmp = $_) =~ s/^\d+[- ]//;
+ $text .= $tmp;
+ last if m/^$code /;
+ }
+ }
+ $text =~ s/^\d+ ?/<foo>/g;
+ ($code, $text);
+ }
+
+ sub _ftp_expect
+ {
+ local($code, $text) = &_ftp_get_reply;
+ $_last_ftp_reply = $text;
+ foreach $expect (@_) {
+ return ($code, $text) if $code == $expect;
+ }
+ die "internal error: expected return code ".
+ join('|',@_).", got [$text]";
+ }
+
+ sub _ftp_send
+ {
+ print STDERR "SEND: ", @_ if $ftp_debug;
+ print FTP_CONTROL @_;
+ }
+
+ sub _ftp_do_passive
+ {
+ local(@commands) = @_;
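+ ## A successful PASV reply looks like
+ ##     227 Entering Passive Mode (h1,h2,h3,h4,p1,p2)
+ ## where h1-h4 are the octets of the host to contact for the data
+ ## connection and the port is p1*256 + p2 (parsed just below).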
+
+ &_ftp_send("PASV\r\n");
+ local($code) = &_ftp_expect(227, 125);
+
+ if ($code == 227)
+ {
+ die "internal error: can't grok passive reply [$_last_ftp_reply]"
+ unless $_last_ftp_reply =~ m/\(([\d,]+)\)/;
+ local($a,$b,$c,$d, $p1, $p2) = split(/,/, $1);
+ ($_passive_host, $_passive_port) =
+ ("$a.$b.$c.$d", $p1*256 + $p2);
+ }
+
+ foreach(@commands) {
+ &_ftp_send($_);
+ }
+
+ local($error)=
+ &network'connect_to(*PASSIVE, $_passive_host, $_passive_port);
+ die "internal error: passive ftp connect [$error]" if $error;
+ }
+
+ ## make the connection to the host
+ &www'message($debug, "connecting to $host...") unless $quiet;
+
+ local($old_alarm);
+ if ($timeout) {
+ $old_alarm = $SIG{'ALRM'} || 'DEFAULT';
+ $SIG{'ALRM'} = "main'http_alarm"; ## can use this for now
+# $HaveAlarm = 0;
+ $AlarmNote = "host $host";
+ $AlarmNote .= ":$port" if $port != $www'default_port{'ftp'};
+ $AlarmNote .= " timed out after $timeout second";
+ $AlarmNote .= 's' if $timeout > 1;
+ alarm($timeout);
+ }
+
+ local($error) = &network'connect_to(*FTP_CONTROL, $host, $port);
+
+ if ($timeout) {
+ alarm(0);
+ $SIG{'ALRM'} = $old_alarm;
+ }
+
+ return $error if $error;
+
+ local ($code, $text) = &_ftp_get_reply(*FTP_CONTROL);
+ close(FTP_CONTROL), return "internal ftp error: [$text]" unless $code==220;
+
+ ## log in
+ &www'message($debug, "logging in as $user...") unless $quiet;
+ foreach $password (@password)
+ {
+ &_ftp_send("USER $user\r\n");
+ ($code, $text) = &_ftp_expect(230,331,530);
+ close(FTP_CONTROL), return $text if ($code == 530);
+ last if $code == 230; ## hey, already logged in, cool.
+
+ &_ftp_send("PASS $password\r\n");
+ ($code, $text) = &_ftp_expect(220,230,530,550,332);
+ last if $code != 550;
+ last if $text =~ m/can't change directory/;
+ }
+
+ if ($code == 550)
+ {
+ $text =~ s/\n+$//;
+ &www'message(1, "Can't log in $host: $text") unless $quiet;
+ exit($EXIT_error);
+ }
+
+ if ($code == 332)
+ {
+ &_ftp_send("ACCT noaccount\r\n");
+ ($code, $text) = &_ftp_expect(230, 202, 530, 500,501,503, 421)
+ }
+ close(FTP_CONTROL), return $text if $code >= 300;
+
+ &_ftp_send("TYPE I\r\n");
+ &_ftp_expect(200);
+
+ unless ($quiet) {
+ local($name) = $path;
+ $name =~ s,.*/([^/]),$1,;
+ &www'message($debug, "requesting $name...");
+ }
+ ## get file
+ &_ftp_do_passive("RETR $path\r\n");
+ ($code,$text) = &_ftp_expect(125, 150, 550, 530);
+ close(FTP_CONTROL), return $text if $code == 530;
+
+ if ($code == 550)
+ {
+ close(PASSIVE);
+ if ($text =~ /directory/i) {
+ ## probably from "no such file or directory", so just return now.
+ close(FTP_CONTROL);
+ return $text;
+ }
+
+ ## do like Mosaic and try getting a directory listing.
+ &_ftp_send("CWD $path\r\n");
+ ($code) = &_ftp_expect(250,550);
+ if ($code == 550) {
+ close(FTP_CONTROL);
+ return $text;
+ }
+ &_ftp_do_passive("LIST\r\n");
+ &_ftp_expect(125, 150);
+ }
+
+ $size = $1 if $text =~ m/(\d+)\s+bytes/;
+ binmode(PASSIVE); ## just in case.
+ &www'message($debug, "waiting for data...") unless $quiet;
+ &set_output_file($local_filename) if $nab;
+ &general_read(*PASSIVE, $size);
+ &close_output_file($local_filename) if $nab;
+
+ close(PASSIVE);
+ close(FTP_CONTROL);
+ undef;
+}
+
+sub general_read
+{
+ local(*INPUT, $size) = @_;
+ local($lastcount, $bytes) = (0,0);
+ local($need_to_clear) = 0;
+ local($start_time) = time;
+ local($last_time, $time) = $start_time;
+ ## Figure out how often to print the "bytes read" message
+ local($delta2print) =
+ (defined $size) ? int($size/50) : $defaultdelta2print;
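+ ## (delta2print is retuned on the fly below: bumped up by half when
+ ## two progress messages would land in the same second, and divided
+ ## by the elapsed time otherwise.)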
+
+ &www'message(0, "read 0 bytes") unless $quiet;
+
+ ## so $! below is set only if a real error happens from now
+ eval 'local($^W) = 0; undef $!';
+
+
+ while (defined($_ = <INPUT>))
+ {
+ ## shove it out.
+ &www'clear_message if $need_to_clear;
+ print;
+ print SAVEOUT if $nab==2;
+
+ ## if we know the content-size, keep track of what we're reading.
+ $bytes += length;
+
+ last if eof || (defined $size && $bytes >= $size);
+
+ if (!$quiet && $bytes > ($lastcount + $delta2print))
+ {
+ if ($time = time, $last_time == $time) {
+ $delta2print *= 1.5;
+ } else {
+ $last_time = $time;
+ $lastcount = $bytes;
+ local($time_delta) = $time - $start_time;
+ local($text);
+
+ $delta2print /= $time_delta;
+ if (defined $size) {
+ $text = sprintf("read $bytes bytes (%.0f%%)",
+ $bytes*100/$size);
+ } else {
+ $text = "read $bytes bytes";
+ }
+
+ if ($time_delta > 5 || ($time_delta && $bytes > 10240))
+ {
+ local($rate) = int($bytes / $time_delta);
+ if ($rate < 5000) {
+ $text .= " ($rate bytes/sec)";
+ } elsif ($rate < 1024 * 10) {
+ $text .= sprintf(" (%.1f k/sec)", $rate/1024);
+ } else {
+ $text .= sprintf(" (%.0f k/sec)", $rate/1024);
+ }
+ }
+ &www'message(0, "$text...");
+ $need_to_clear = -t STDOUT;
+ }
+ }
+ }
+
+ if (!$quiet)
+ {
+ if ($size && ($size != $bytes)) {
+ &www'message("WARNING: Expected $size bytes, read $bytes bytes.\n");
+ }
+# if ($!) {
+# print STDERR "\$! is [$!]\n";
+# }
+# if ($@) {
+# print STDERR "\$\@ is [$@]\n";
+# }
+ }
+ &www'clear_message($text) unless $quiet;
+}
+
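+## Touch names that are otherwise "used only once" so perl -w keeps
+## quiet; this sub is never actually called.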
+sub dummy {
+ 1 || &dummy || &fetch_via_ftp || &fetch_via_http || &http_alarm;
+ 1 || close(OUT);
+ 1 || close(SAVEOUT);
+}
+
+__END__