diff options
Diffstat (limited to 'win32/bin')
-rw-r--r-- | win32/bin/PL2BAT.BAT | 40 | ||||
-rw-r--r-- | win32/bin/network.pl | 211 | ||||
-rw-r--r-- | win32/bin/search.bat | 1873 | ||||
-rw-r--r-- | win32/bin/test.bat | 143 | ||||
-rw-r--r-- | win32/bin/webget.bat | 1099 | ||||
-rw-r--r-- | win32/bin/www.pl | 901 |
6 files changed, 4267 insertions, 0 deletions
diff --git a/win32/bin/PL2BAT.BAT b/win32/bin/PL2BAT.BAT new file mode 100644 index 0000000000..462affa7d0 --- /dev/null +++ b/win32/bin/PL2BAT.BAT @@ -0,0 +1,40 @@ +@rem = ' +@echo off +perl -S %0.bat %1 %2 %3 %4 %5 %6 %7 %8 %9 +goto endofperl +@rem '; + +$head = <<'--end--'; +@rem = '--*-Perl-*--'; +@rem = ' +@echo off +perl -S %0.bat %1 %2 %3 %4 %5 %6 %7 %8 %9 +goto endofperl +@rem '; +--end-- + +$tail = "__END__\n:endofperl\n"; + +if ( @ARGV ) { + LOOP: + foreach ( @ARGV ) { + open( FILE, $_ ); + @file = <FILE>; + if ( grep( /:endofperl/, @file ) ) { + warn "$_ has already been converted to a batch file!!\n"; + next LOOP; + } + close( FILE, $_ ); + s/\.pl//; + s/\.bat//; + open( FILE, ">$_.bat" ); + print FILE $head, @file, $tail; + close( FILE ); + } +} else { + @file = <STDIN>; + print $head, @file, $tail; +} + +__END__ +:endofperl diff --git a/win32/bin/network.pl b/win32/bin/network.pl new file mode 100644 index 0000000000..f49045333d --- /dev/null +++ b/win32/bin/network.pl @@ -0,0 +1,211 @@ +## +## Jeffrey Friedl (jfriedl@omron.co.jp) +## Copyri.... ah hell, just take it. +## +## July 1994 +## +package network; +$version = "950311.5"; + +## version 950311.5 -- turned off warnings when requiring 'socket.ph'; +## version 941028.4 -- some changes to quiet perl5 warnings. +## version 940826.3 -- added check for "socket.ph", and alternate use of +## socket STREAM value for SunOS5.x +## + +## BLURB: +## A few simple and easy-to-use routines to make internet connections. +## Similar to "chat2.pl" (but actually commented, and a bit more portable). +## Should work even on SunOS5.x. +## + +##> +## +## connect_to() -- make an internet connection to a server. +## +## Two uses: +## $error = &network'connect_to(*FILEHANDLE, $fromsockaddr, $tosockaddr) +## $error = &network'connect_to(*FILEHANDLE, $hostname, $portnum) +## +## Makes the given connection and returns an error string, or undef if +## no error. +## +## In the first form, FROMSOCKADDR and TOSOCKADDR are of the form returned +## by SOCKET'GET_ADDR and SOCKET'MY_ADDR. +## +##< +sub connect_to +{ + local(*FD, $arg1, $arg2) = @_; + local($from, $to) = ($arg1, $arg2); ## for one interpretation. + local($host, $port) = ($arg1, $arg2); ## for the other + + if (defined($to) && length($from)==16 && length($to)==16) { + ## ok just as is + } elsif (defined($host)) { + $to = &get_addr($host, $port); + return qq/unknown address "$host"/ unless defined $to; + $from = &my_addr; + } else { + return "unknown arguments to network'connect_to"; + } + + return "connect_to failed (socket: $!)" unless &my_inet_socket(*FD); + return "connect_to failed (bind: $!)" unless bind(FD, $from); + return "connect_to failed (connect: $!)" unless connect(FD, $to); + local($old) = select(FD); $| = 1; select($old); + undef; +} + + + +##> +## +## listen_at() - used by a server to indicate that it will accept requests +## at the port number given. +## +## Used as +## $error = &network'listen_at(*LISTEN, $portnumber); +## (returns undef upon success) +## +## You can then do something like +## $addr = accept(REMOTE, LISTEN); +## print "contact from ", &network'addr_to_ascii($addr), ".\n"; +## while (<REMOTE>) { +## .... process request.... +## } +## close(REMOTE); +## +##< +sub listen_at +{ + local(*FD, $port) = @_; + local($empty) = pack('S n a4 x8', 2 ,$port, "\0\0\0\0"); + return "listen_for failed (socket: $!)" unless &my_inet_socket(*FD); + return "listen_for failed (bind: $!)" unless bind(FD, $empty); + return "listen_for failed (listen: $!)" unless listen(FD, 5); + local($old) = select(FD); $| = 1; select($old); + undef; +} + + +##> +## +## Given an internal packed internet address (as returned by &connect_to +## or &get_addr), return a printable ``1.2.3.4'' version. +## +##< +sub addr_to_ascii +{ + local($addr) = @_; + return "bad arg" if length $addr != 16; + return join('.', unpack("CCCC", (unpack('S n a4 x8', $addr))[2])); +} + +## +## +## Given a host and a port name, returns the packed socket addresss. +## Mostly for internal use. +## +## +sub get_addr +{ + local($host, $port) = @_; + return $addr{$host,$port} if defined $addr{$host,$port}; + local($addr); + + if ($host =~ m/^\d+\.\d+\.\d+\.\d+$/) + { + $addr = pack("C4", split(/\./, $host)); + } + elsif ($addr = (gethostbyname($host))[4], !defined $addr) + { + local(@lookup) = `nslookup $host 2>&1`; + if (@lookup) + { + local($lookup) = join('', @lookup[2 .. $#lookup]); + if ($lookup =~ m/^Address:\s*(\d+\.\d+\.\d+\.\d+)/) { + $addr = pack("C4", split(/\./, $1)); + } + } + if (!defined $addr) { + ## warn "$host: SOL, dude\n"; + return undef; + } + } + $addr{$host,$port} = pack('S n a4 x8', 2 ,$port, $addr); +} + + +## +## my_addr() +## Returns the packed socket address of the local host (port 0) +## Mostly for internal use. +## +## +sub my_addr +{ + local(@x) = gethostbyname('localhost'); + local(@y) = gethostbyname($x[0]); +# local($name,$aliases,$addrtype,$length,@addrs) = gethostbyname($x[0]); +# local(@bytes) = unpack("C4",$addrs[0]); +# return pack('S n a4 x8', 2 ,0, $addr); + return pack('S n a4 x8', 2 ,0, $y[4]); +} + + +## +## my_inet_socket(*FD); +## +## Local routine to do socket(PF_INET, SOCK_STREAM, AF_NS). +## Takes care of figuring out the proper values for the args. Hopefully. +## +## Returns the same value as 'socket'. +## +sub my_inet_socket +{ + local(*FD) = @_; + local($socket); + + if (!defined $socket_values_queried) + { + ## try to load some "socket.ph" + if (!defined &main'_SYS_SOCKET_H_) { + eval 'package main; + local($^W) = 0; + require("sys/socket.ph")||require("socket.ph");'; + } + + ## we'll use "the regular defaults" if for PF_INET and AF_NS if unknown + $PF_INET = defined &main'PF_INET ? &main'PF_INET : 2; + $AF_NS = defined &main'AF_NS ? &main'AF_NS : 6; + $SOCK_STREAM = &main'SOCK_STREAM if defined &main'SOCK_STREAM; + + $socket_values_queried = 1; + } + + if (defined $SOCK_STREAM) { + $socket = socket(FD, $PF_INET, $SOCK_STREAM, $AF_NS); + } else { + ## + ## We'll try the "regular default" of 1. If that returns a + ## "not supported" error, we'll try 2, which SunOS5.x uses. + ## + $socket = socket(FD, $PF_INET, 1, $AF_NS); + if ($socket) { + $SOCK_STREAM = 1; ## got it. + } elsif ($! =~ m/not supported/i) { + ## we'll just assume from now on that it's 2. + $socket = socket(FD, $PF_INET, $SOCK_STREAM = 2, $AF_NS); + } + } + $socket; +} + +## This here just to quiet -w warnings. +sub dummy { + 1 || $version || &dummy; +} + +1; +__END__ diff --git a/win32/bin/search.bat b/win32/bin/search.bat new file mode 100644 index 0000000000..0bb123759f --- /dev/null +++ b/win32/bin/search.bat @@ -0,0 +1,1873 @@ +@rem = '--*-Perl-*--'; +@rem = ' +@echo off +perl -S %0.bat %1 %2 %3 %4 %5 %6 %7 %8 %9 +goto endofperl +@rem '; +#!/usr/local/bin/perl -w +'di'; +'ig00'; +############################################################################## +## +## search +## +## Jeffrey Friedl (jfriedl@omron.co.jp), Dec 1994. +## Copyright 19.... ah hell, just take it. +## +## BLURB: +## A combo of find and grep -- more or less do a 'grep' on a whole +## directory tree. Fast, with lots of options. Much more powerful than +## the simple "find ... | xargs grep ....". Has a full man page. +## Powerfully customizable. +## +## This file is big, but mostly comments and man page. +## +## See man page for usage info. +## Return value: 2=error, 1=nothing found, 0=something found. +## + +$version = "950918.5"; +## +## "950918.5"; +## Changed all 'sysread' to 'read' because Linux perl's don't seem +## to like sysread() +## +## "941227.4"; +## Added -n, -u +## +## "941222.3" +## Added -nice (due to Lionel Cons <Lionel.Cons@cern.ch>) +## Removed any leading "./" from name. +## Added default flags for ~/.search, including TTY, -nice, -list, etc. +## Program name now has path removed when printed in diagnostics. +## Added simple tilde-expansion to -dir arg. +## Added -dskip, etc. Fixed -iregex bug. +## Changed -dir to be additive, adding -ddir. +## Now screen out devices, pipes, and sockets. +## More tidying and lots of expanding of the man page +## +## +## "941217.2"; +## initial release. + +$stripped=0; + +&init; +$rc_file = join('/', $ENV{'HOME'}, ".search"); + +&check_args; + +## Make sure we've got a regex. +## Don't need one if -find or -showrc was specified. +$!=2, die "expecting regex arguments.\n" + if $FIND_ONLY == 0 && $showrc == 0 && @ARGV == 0; + +&prepare_to_search($rc_file); + +&import_program if !defined &dodir; ## BIG key to speed. + +## do search while there are directories to be done. +&dodir(shift(@todo)) while @todo; + +&clear_message if $VERBOSE && $STDERR_IS_TTY; +exit($retval); +############################################################################### + +sub init +{ + ## initialize variables that might be reset by command-line args + $DOREP=0; ## set true by -dorep (redo multi-hardlink files) + $DO_SORT=0; ## set by -sort (sort files in a dir before checking) + $FIND_ONLY=0; ## set by -find (don't search files) + $LIST_ONLY=0; ## set true by -l (list filenames only) + $NEWER=0; ## set by -newer, "-mtime -###" + $NICE=0; ## set by -nice (print human-readable output) + $NOLINKS=0; ## set true by -nolinks (don't follow symlinks) + $OLDER=0; ## set by -older, "-mtime ###" + $PREPEND_FILENAME=1; ## set false by -h (don't prefix lines with filename) + $REPORT_LINENUM=0; ## set true by -n (show line numbers) + $VERBOSE=0; ## set to a value by -v, -vv, etc. (verbose messages) + $WHY=0; ## set true by -why, -vvv+ (report why skipped) + $XDEV=0; ## set true by -xdev (stay on one filesystem) + $all=0; ## set true by -all (don't skip many kinds of files) + $iflag = ''; ## set to 'i' by -i (ignore case); + $norc=0; ## set by -norc (don't load rc file) + $showrc=0; ## set by -showrc (show what happens with rc file) + $underlineOK=0; ## set true by -u (watch for underline stuff) + $words=0; ## set true by -w (match whole-words only) + $DELAY=0; ## inter-file delay (seconds) + $retval=1; ## will set to 0 if we find anything. + + ## various elements of stat() that we might access + $STAT_DEV = 1; + $STAT_INODE = 2; + $STAT_MTIME = 9; + + $VV_PRINT_COUNT = 50; ## with -vv, print every VV_PRINT_COUNT files, or... + $VV_SIZE = 1024*1024; ## ...every VV_SIZE bytes searched + $vv_print = $vv_size = 0; ## running totals. + + ## set default options, in case the rc file wants them + $opt{'TTY'}= 1 if -t STDOUT; + + ## want to know this for debugging message stuff + $STDERR_IS_TTY = -t STDERR ? 1 : 0; + $STDERR_SCREWS_STDOUT = ($STDERR_IS_TTY && -t STDOUT) ? 1 : 0; + + $0 =~ s,.*/,,; ## clean up $0 for any diagnostics we'll be printing. +} + +## +## Check arguments. +## +sub check_args +{ + while (@ARGV && $ARGV[0] =~ m/^-/) + { + $arg = shift(@ARGV); + + if ($arg eq '-version' || ($VERBOSE && $arg eq '-help')) { + print qq/Jeffrey's file search, version "$version".\n/; + exit(0) unless $arg eq '-help'; + } + if ($arg eq '-help') { + print <<INLINE_LITERAL_TEXT; +usage: $0 [options] [-e] [PerlRegex ....] +OPTIONS TELLING *WHERE* TO SEARCH: + -dir DIR start search at the named directory (default is current dir). + -xdev stay on starting file system. + -sort sort the files in each directory before processing. + -nolinks don't follow symbolic links. +OPTIONS TELLING WHICH FILES TO EVEN CONSIDER: + -mtime # consider files modified > # days ago (-# for < # days old) + -newer FILE consider files modified more recently than FILE (also -older) + -name GLOB consider files whose name matches pattern (also -regex). + -skip GLOB opposite of -name: identifies files to not consider. + -path GLOB like -name, but for files whose whole path is described. + -dpath/-dregex/-dskip versions for selecting or pruning directories. + -all don't skip any files marked to be skipped by the startup file. + -x<SPECIAL> (see manual, and/or try -showrc). + -why report why a file isn't checked (also implied by -vvvv). +OPTIONS TELLING WHAT TO DO WITH FILES THAT WILL BE CONSIDERED: + -f | -find just list files (PerlRegex ignored). Default is to grep them. + -ff | -ffind Does a faster -find (implies -find -all -dorep) +OPTIONS CONTROLLING HOW THE SEARCH IS DONE (AND WHAT IS PRINTED): + -l | -list only list files with matches, not the lines themselves. + -nice | -nnice print more "human readable" output. + -n prefix each output line with its line number in the file. + -h don't prefix output lines with file name. + -u also look "inside" manpage-style underlined text + -i do case-insensitive searching. + -w match words only (as defined by perl's \\b). +OTHER OPTIONS: + -v, -vv, -vvv various levels of message verbosity. + -e end of options (in case a regex looks like an option). + -showrc show what the rc file sets, then exit. + -norc don't load the rc file. + -dorep check files with multiple hard links multiple times. +INLINE_LITERAL_TEXT + print "Use -v -help for more verbose help.\n" unless $VERBOSE; + print "This script file is also a man page.\n" unless $stripped; + print <<INLINE_LITERAL_TEXT if $VERBOSE; + +If -f (or -find) given, PerlRegex is optional and ignored. +Otherwise, will search for files with lines matching any of the given regexes. + +Combining things like -name and -mtime implies boolean AND. +However, duplicating things (such as -name '*.c' -name '*.txt') implies OR. + +-mtime may be given floating point (i.e. 1.5 is a day and a half). +-iskip/-idskip/-ipath/... etc are case-insensitive versions. + +If any letter in -newer/-older is upper case, "or equal" is +inserted into the test. + +You can always find the latest version on the World Wide Web in + http://www.wg.omron.co.jp/~jfriedl/perl/ +INLINE_LITERAL_TEXT + exit(0); + } + $DOREP=1, next if $arg eq '-dorep'; ## do repeats + $DO_SORT=1, next if $arg eq '-sort'; ## sort files + $NOLINKS=1, next if $arg eq '-nolinks'; ## no sym. links + $PREPEND_FILENAME=0, next if $arg eq '-h'; ## no filename prefix + $REPORT_LINENUM=1, next if $arg eq '-n'; ## show line numbers + $WHY=1, next if $arg eq '-why'; ## tell why skipped + $XDEV=1, next if $arg eq '-xdev'; ## don't leave F.S. + $all=1,$opt{'-all'}=1,next if $arg eq '-all'; ## don't skip *.Z, etc + $iflag='i', next if $arg eq '-i'; ## ignore case + $norc=1, next if $arg eq '-norc'; ## don't load rc file + $showrc=1, next if $arg eq '-showrc'; ## show rc file + $underlineOK=1, next if $arg eq '-u'; ## look throuh underln. + $words=1, next if $arg eq '-w'; ## match "words" only + &strip if $arg eq '-strip'; ## dump this program + last if $arg eq '-e'; + $DELAY=$1, next if $arg =~ m/-delay(\d+)/; + + $FIND_ONLY=1, next if $arg =~/^-f(ind)?$/;## do "find" only + + $FIND_ONLY=1, $DOREP=1, $all=1, + next if $arg =~/^-ff(ind)?$/;## fast -find + $LIST_ONLY=1,$opt{'-list'}=1, + next if $arg =~/^-l(ist)?$/;## only list files + + if ($arg =~ m/^-(v+)$/) { ## verbosity + $VERBOSE =length($1); + foreach $len (1..$VERBOSE) { $opt{'-'.('v' x $len)}=1 } + next; + } + if ($arg =~ m/^-(n+)ice$/) { ## "nice" output + $NICE =length($1); + foreach $len (1..$NICE) { $opt{'-'.('n' x $len).'ice'}=1 } + next; + } + + if ($arg =~ m/^-(i?)(d?)skip$/) { + local($i) = $1 eq 'i'; + local($d) = $2 eq 'd'; + $! = 2, die qq/$0: expecting glob arg to -$arg\n/ unless @ARGV; + foreach (split(/\s+/, shift @ARGV)) { + if ($d) { + $idskip{$_}=1 if $i; + $dskip{$_}=1; + } else { + $iskip{$_}=1 if $i; + $skip{$_}=1; + } + } + next; + } + + + if ($arg =~ m/^-(i?)(d?)(regex|path|name)$/) { + local($i) = $1 eq 'i'; + $! = 2, die qq/$0: expecting arg to -$arg\n/ unless @ARGV; + foreach (split(/\s+/, shift @ARGV)) { + $iname{join(',', $arg, $_)}=1 if $i; + $name{join(',', $arg, $_)}=1; + } + next; + } + + if ($arg =~ m/^-d?dir$/) { + $opt{'-dir'}=1; + $! = 2, die qq/$0: expecting filename arg to -$arg\n/ unless @ARGV; + $start = shift(@ARGV); + $start =~ s#^~(/+|$)#$ENV{'HOME'}$1# if defined $ENV{'HOME'}; + $! = 2, die qq/$0: can't find ${arg}'s "$start"\n/ unless -e $start; + $! = 2, die qq/$0: ${arg}'s "$start" not a directory.\n/ unless -d _; + undef(@todo), $opt{'-ddir'}=1 if $arg eq '-ddir'; + push(@todo, $start); + next; + } + + if ($arg =~ m/^-(new|old)er$/i) { + $! = 2, die "$0: expecting filename arg to -$arg\n" unless @ARGV; + local($file, $time) = shift(@ARGV); + $! = 2, die qq/$0: can't stat -${arg}'s "$file"./ + unless $time = (stat($file))[$STAT_MTIME]; + local($upper) = $arg =~ tr/A-Z//; + if ($arg =~ m/new/i) { + $time++ unless $upper; + $NEWER = $time if $NEWER < $time; + } else { + $time-- unless $upper; + $OLDER = $time if $OLDER == 0 || $OLDER > $time; + } + next; + } + + if ($arg =~ m/-mtime/) { + $! = 2, die "$0: expecting numerical arg to -$arg\n" unless @ARGV; + local($days) = shift(@ARGV); + $! = 2, die qq/$0: inappropriate arg ($days) to $arg\n/ if $days==0; + $days *= 3600 * 24; + if ($days < 0) { + local($time) = $^T + $days; + $NEWER = $time if $NEWER < $time; + } else { + local($time) = $^T - $days; + $OLDER = $time if $OLDER == 0 || $OLDER > $time; + } + next; + } + + ## special user options + if ($arg =~ m/^-x(.+)/) { + foreach (split(/[\s,]+/, $1)) { $user_opt{$_} = $opt{$_}= 1; } + next; + } + + $! = 2, die "$0: unknown arg [$arg]\n"; + } +} + +## +## Given a filename glob, return a regex. +## If the glob has no globbing chars (no * ? or [..]), then +## prepend an effective '*' to it. +## +sub glob_to_regex +{ + local($glob) = @_; + local(@parts) = $glob =~ m/\\.|[*?]|\[]?[^]]*]|[^[\\*?]+/g; + local($trueglob)=0; + foreach (@parts) { + if ($_ eq '*' || $_ eq '?') { + $_ = ".$_"; + $trueglob=1; ## * and ? are a real glob + } elsif (substr($_, 0, 1) eq '[') { + $trueglob=1; ## [..] is a real glob + } else { + s/^\\//; ## remove any leading backslash; + s/\W/\\$&/g; ## now quote anything dangerous; + } + } + unshift(@parts, '.*') unless $trueglob; + join('', '^', @parts, '$'); +} + +sub prepare_to_search +{ + local($rc_file) = @_; + + $HEADER_BYTES=0; ## Might be set nonzero in &read_rc; + $last_message_length = 0; ## For &message and &clear_message. + + &read_rc($rc_file, $showrc) unless $norc; + exit(0) if $showrc; + + $NEXT_DIR_ENTRY = $DO_SORT ? 'shift @files' : 'readdir(DIR)'; + $WHY = 1 if $VERBOSE > 3; ## Arg -vvvv or above implies -why. + @todo = ('.') if @todo == 0; ## Where we'll start looking + + ## see if any user options were specified that weren't accounted for + foreach $opt (keys %user_opt) { + next if defined $seen_opt{$opt}; + warn "warning: -x$opt never considered.\n"; + } + + die "$0: multiple time constraints exclude all possible files.\n" + if ($NEWER && $OLDER) && ($NEWER > $OLDER); + + ## + ## Process any -skip/-iskip args that had been given + ## + local(@skip_test); + foreach $glob (keys %skip) { + $i = defined($iskip{$glob}) ? 'i': ''; + push(@skip_test, '$name =~ m/'. &glob_to_regex($glob). "/$i"); + } + if (@skip_test) { + $SKIP_TEST = join('||',@skip_test); + $DO_SKIP_TEST = 1; + } else { + $DO_SKIP_TEST = $SKIP_TEST = 0; + } + + ## + ## Process any -dskip/-idskip args that had been given + ## + local(@dskip_test); + foreach $glob (keys %dskip) { + $i = defined($idskip{$glob}) ? 'i': ''; + push(@dskip_test, '$name =~ m/'. &glob_to_regex($glob). "/$i"); + } + if (@dskip_test) { + $DSKIP_TEST = join('||',@dskip_test); + $DO_DSKIP_TEST = 1; + } else { + $DO_DSKIP_TEST = $DSKIP_TEST = 0; + } + + + ## + ## Process any -name, -path, -regex, etc. args that had been given. + ## + undef @name_test; + undef @dname_test; + foreach $key (keys %name) { + local($type, $pat) = split(/,/, $key, 2); + local($i) = defined($iname{$key}) ? 'i' : ''; + if ($type =~ /regex/) { + $pat =~ s/!/\\!/g; + $test = "\$name =~ m!^$pat\$!$i"; + } else { + local($var) = $type eq 'name' ? '$name' : '$file'; + $test = "$var =~ m/". &glob_to_regex($pat). "/$i"; + } + if ($type =~ m/^-i?d/) { + push(@dname_test, $test); + } else { + push(@name_test, $test); + } + } + if (@name_test) { + $GLOB_TESTS = join('||', @name_test); + + $DO_GLOB_TESTS = 1; + } else { + $GLOB_TESTS = $DO_GLOB_TESTS = 0; + } + if (@dname_test) { + $DGLOB_TESTS = join('||', @dname_test); + $DO_DGLOB_TESTS = 1; + } else { + $DGLOB_TESTS = $DO_DGLOB_TESTS = 0; + } + + + ## + ## Process any 'magic' things from the startup file. + ## + if (@magic_tests && $HEADER_BYTES) { + ## the $magic' one is for when &dodir is not inlined + $tests = join('||',@magic_tests); + $MAGIC_TESTS = " { package magic; \$val = ($tests) }"; + $DO_MAGIC_TESTS = 1; + } else { + $MAGIC_TESTS = 1; + $DO_MAGIC_TESTS = 0; + } + + ## + ## Prepare regular expressions. + ## + { + local(@regex_tests); + + if ($LIST_ONLY) { + $mflag = ''; + ## need to have $* set, but perl5 just won''t shut up about it. + if ($] >= 5) { + $mflag = 'm'; + } else { + eval ' $* = 1 '; + } + } + + ## + ## Until I figure out a better way to deal with it, + ## We have to worry about a regex like [^xyz] when doing $LIST_ONLY. + ## Such a regex *will* match \n, and if I'm pulling in multiple + ## lines, it can allow lines to match that would otherwise not match. + ## + ## Therefore, if there is a '[^' in a regex, we can NOT take a chance + ## an use the fast listonly. + ## + $CAN_USE_FAST_LISTONLY = $LIST_ONLY; + + local(@extra); + local($underline_glue) = ($] >= 5) ? '(:?_\cH)?' : '(_\cH)?'; + while (@ARGV) { + $regex = shift(@ARGV); + ## + ## If watching for underlined things too, add another regex. + ## + if ($underlineOK) { + if ($regex =~ m/[?*+{}()\\.|^\$[]/) { + warn "$0: warning, can't underline-safe ``$regex''.\n"; + } else { + $regex = join($underline_glue, split(//, $regex)); + } + } + + ## If nothing special in the regex, just use index... + ## is quite a bit faster. + if (($iflag eq '') && ($words == 0) && + $regex !~ m/[?*+{}()\\.|^\$[]/) + { + push(@regex_tests, "(index(\$_, q+$regex+)>=0)"); + + } else { + $regex =~ s#[\$\@\/]\w#\\$&#; + if ($words) { + if ($regex =~ m/\|/) { + ## could be dangerous -- see if we can wrap in parens. + if ($regex =~ m/\\\d/) { + warn "warning: -w and a | in a regex is dangerous.\n" + } else { + $regex = join($regex, '(', ')'); + } + } + $regex = join($regex, '\b', '\b'); + } + $CAN_USE_FAST_LISTONLY = 0 if substr($regex, "[^") >= 0; + push(@regex_tests, "m/$regex/$iflag$mflag"); + } + + ## If we're done, but still have @extra to do, get set for that. + if (@ARGV == 0 && @extra) { + @ARGV = @extra; ## now deal with the extra stuff. + $underlineOK = 0; ## but no more of this. + undef @extra; ## or this. + } + } + if (@regex_tests) { + $REGEX_TEST = join('||', @regex_tests); + ## print STDERR $REGEX_TEST, "\n"; exit; + } else { + ## must be doing -find -- just give something syntactically correct. + $REGEX_TEST = 1; + } + } + + ## + ## Make sure we can read the first item(s). + ## + foreach $start (@todo) { + $! = 2, die qq/$0: can't stat "$start"\n/ + unless ($dev,$inode) = (stat($start))[$STAT_DEV,$STAT_INODE]; + + if (defined $dir_done{"$dev,$inode"}) { + ## ignore the repeat. + warn(qq/ignoring "$start" (same as "$dir_done{"$dev,$inode"}").\n/) + if $VERBOSE; + next; + } + + ## if -xdev was given, remember the device. + $xdev{$dev} = 1 if $XDEV; + + ## Note that we won't want to do it again + $dir_done{"$dev,$inode"} = $start; + } +} + + +## +## See the comment above the __END__ above the 'sub dodir' below. +## +sub import_program +{ + sub bad { + print STDERR "$0: internal error (@_)\n"; + exit 2; + } + + ## Read from data, up to next __END__. This will be &dodir. + local($/) = "\n__END__"; + $prog = <DATA>; + close(DATA); + + $prog =~ s/\beval\b//g; ## remove any 'eval' + + ## Inline uppercase $-variables by their current values. + if ($] >= 5) { + $prog =~ s/\$([A-Z][A-Z0-9_]{2,}\b)/ + &bad($1) if !defined ${$main::{$1}}; ${$main::{$1}};/eg; + } else { + $prog =~ s/\$([A-Z][A-Z0-9_]{2,}\b)/local(*VAR) = $_main{$1}; + &bad($1) if !defined $VAR; $VAR;/eg; + } + + eval $prog; ## now do it. This will define &dodir; + $!=2, die "$0 internal error: $@\n" if $@; +} + +########################################################################### + +## +## Read the .search file: +## Blank lines and lines that are only #-comments ignored. +## Newlines may be escaped to create long lines +## Other lines are directives. +## +## A directive may begin with an optional tag in the form <...> +## Things inside the <...> are evaluated as with: +## <(this || that) && must> +## will be true if +## -xmust -xthis or -xmust -xthat +## were specified on the command line (order doesn't matter, though) +## A directive is not done if there is a tag and it's false. +## Any characters but whitespace and &|()>,! may appear after an -x +## (although "-xdev" is special). -xmust,this is the same as -xmust -xthis. +## Something like -x~ would make <~> true, and <!~> false. +## +## Directives are in the form: +## option: STRING +## magic : NUMBYTES : EXPR +## +## With option: +## The STRING is parsed like a Bourne shell command line, and the +## options are used as if given on the command line. +## No comments are allowed on 'option' lines. +## Examples: +## # skip objects and libraries +## option: -skip '.o .a' +## # skip emacs *~ and *# files, unless -x~ given: +## <!~> option: -skip '~ #' +## +## With magic: +## EXPR can be pretty much any perl (comments allowed!). +## If it evaluates to true for any particular file, it is skipped. +## The only info you'll have about a file is the variable $H, which +## will have at least the first NUMBYTES of the file (less if the file +## is shorter than that, of course, and maybe more). You'll also have +## any variables you set in previous 'magic' lines. +## Examples: +## magic: 6 : ($x6 = substr($H, 0, 6)) eq 'GIF87a' +## magic: 6 : $x6 eq 'GIF89a' +## +## magic: 6 : (($x6 = substr($H, 0, 6)) eq 'GIF87a' ## old gif \ +## || $x6 eq 'GIF89a' ## new gif +## (the above two sets are the same) +## ## Check the first 32 bytes for "binarish" looking bytes. +## ## Don't blindly dump on any high-bit set, as non-ASCII text +## ## often has them set. \x80 and \xff seem to be special, though. +## ## Require two in a row to not get things like perl's $^T. +## ## This is known to get *.Z, *.gz, pkzip, *.elc and about any +## ## executable you'll find. +## magic: 32 : $H =~ m/[\x00-\x06\x10-\x1a\x1c-\x1f\x80\xff]{2}/ +## +sub read_rc +{ + local($file, $show) = @_; + local($line_num, $ln, $tag) = 0; + local($use_default, @default) = 0; + + { package magic; $ = 0; } ## turn off warnings for when we run EXPR's + + unless (open(RC, "$file")) { + $use_default=1; + $file = "<internal default startup file>"; + ## no RC file -- use this default. + @default = split(/\n/,<<'--------INLINE_LITERAL_TEXT'); + magic: 32 : $H =~ m/[\x00-\x06\x10-\x1a\x1c-\x1f\x80\xff]{2}/ + option: -skip '.a .COM .elc .EXE .gz .o .pbm .xbm .dvi' + option: -iskip '.tarz .zip .z .lzh .jpg .jpeg .gif .uu' + <!~> option: -skip '~ #' +--------INLINE_LITERAL_TEXT + } + + ## + ## Make an eval error pretty. + ## + sub clean_eval_error { + local($_) = @_; + s/ in file \(eval\) at line \d+,//g; ## perl4-style error + s/ at \(eval \d+\) line \d+,//g; ## perl5-style error + $_ = $` if m/\n/; ## remove all but first line + "$_\n"; + } + + print "reading RC file: $file\n" if $show; + + while ($_ = ($use_default ? shift(@default) : <RC>)) { + $ln = ++$line_num; ## note starting line num. + $_ .= <RC>, $line_num++ while s/\\\n?$/\n/; ## allow continuations + next if /^\s*(#.*)?$/; ## skip blank or comment-only lines. + $do = ''; + + ## look for an initial <...> tag. + if (s/^\s*<([^>]*)>//) { + ## This simple s// will make the tag ready to eval. + ($tag = $msg = $1) =~ + s/[^\s&|(!)]+/ + $seen_opt{$&}=1; ## note seen option + "defined(\$opt{q>$&>})" ## (q>> is safe quoting here) + /eg; + + ## see if the tag is true or not, abort this line if not. + $dothis = (eval $tag); + $!=2, die "$file $ln <$msg>: $_".&clean_eval_error($@) if $@; + + if ($show) { + $msg =~ s/[^\s&|(!)]+/-x$&/; + $msg =~ s/\s*!\s*/ no /g; + $msg =~ s/\s*&&\s*/ and /g; + $msg =~ s/\s*\|\|\s*/ or /g; + $msg =~ s/^\s+//; $msg =~ s/\s+$//; + $do = $dothis ? "(doing because $msg)" : + "(do if $msg)"; + } elsif (!$dothis) { + next; + } + } + + if (m/^\s*option\s*:\s*/) { + next if $all && !$show; ## -all turns off these checks; + local($_) = $'; + s/\n$//; + local($orig) = $_; + print " $do option: $_\n" if $show; + local($0) = "$0 ($file)"; ## for any error message. + local(@ARGV); + local($this); + ## + ## Parse $_ as a Bourne shell line -- fill @ARGV + ## + while (length) { + if (s/^\s+//) { + push(@ARGV, $this) if defined $this; + undef $this; + next; + } + $this = '' if !defined $this; + $this .= $1 while s/^'([^']*)'// || + s/^"([^"]*)"// || + s/^([^'"\s\\]+)//|| + s/^(\\[\D\d])//; + die "$file $ln: error parsing $orig at $_\n" if m/^\S/; + } + push(@ARGV, $this) if defined $this; + &check_args; + die qq/$file $ln: unused arg "@ARGV".\n/ if @ARGV; + next; + } + + if (m/^\s*magic\s*:\s*(\d+)\s*:\s*/) { + next if $all && !$show; ## -all turns off these checks; + local($bytes, $check) = ($1, $'); + + if ($show) { + $check =~ s/\n?$/\n/; + print " $do contents: $check"; + } + ## Check to make sure the thing at least compiles. + eval "package magic; (\$H = '1'x \$main'bytes) && (\n$check\n)\n"; + $! = 2, die "$file $ln: ".&clean_eval_error($@) if $@; + + $HEADER_BYTES = $bytes if $bytes > $HEADER_BYTES; + push(@magic_tests, "(\n$check\n)"); + next; + } + $! = 2, die "$file $ln: unknown command\n"; + } + close(RC); +} + +sub message +{ + if (!$STDERR_IS_TTY) { + print STDERR $_[0], "\n"; + } else { + local($text) = @_; + $thislength = length($text); + if ($thislength >= $last_message_length) { + print STDERR $text, "\r"; + } else { + print STDERR $text, ' 'x ($last_message_length-$thislength),"\r"; + } + $last_message_length = $thislength; + } +} + +sub clear_message +{ + print STDERR ' ' x $last_message_length, "\r" if $last_message_length; + $vv_print = $vv_size = $last_message_length = 0; +} + +## +## Output a copy of this program with comments, extra whitespace, and +## the trailing man page removed. On an ultra slow machine, such a copy +## might load faster (but I can't tell any difference on my machine). +## +sub strip { + seek(DATA, 0, 0) || die "$0: can't reset internal pointer.\n"; + while(<DATA>) { + print, next if /INLINE_LITERAL_TEXT/.../INLINE_LITERAL_TEXT/; + ## must mention INLINE_LITERAL_TEXT on this line! + s/\#\#.*|^\s+|\s+$//; ## remove cruft + last if $_ eq '.00;'; + next if ($_ eq '') || ($_ eq "'di'") || ($_ eq "'ig00'"); + s/\$stripped=0;/\$stripped=1;/; + s/\s\s+/ /; ## squish multiple whitespaces down to one. + print $_, "\n"; + } + exit(0); +} + +## +## Just to shut up -w. Never executed. +## +sub dummy { + + 1 || &dummy || &dir_done || &bad || &message || $NEXT_DIR_ENTRY || + $DELAY || $VV_SIZE || $VV_PRINT_COUNT || $STDERR_SCREWS_STDOUT || + @files || @files || $magic'H || $magic'H || $xdev{''} || &clear_message; + +} + +## +## If the following __END__ is in place, what follows will be +## inlined when the program first starts up. Any $ variable name +## all in upper case, specifically, any string matching +## \$([A-Z][A-Z0-9_]{2,}\b +## will have the true value for that variable inlined. Also, any 'eval' is +## removed +## +## The idea is that when the whole thing is then eval'ed to define &dodir, +## the perl optimizer will make all the decisions that are based upon +## command-line options (such as $VERBOSE), since they'll be inlined as +## constants +## +## Also, and here's the big win, the tests for matching the regex, and a +## few others, are all inlined. Should be blinding speed here. +## +## See the read from <DATA> above for where all this takes place. +## But all-in-all, you *want* the __END__ here. Comment it out only for +## debugging.... +## + +__END__ + +## +## Given a directory, check all "appropriate" files in it. +## Shove any subdirectories into the global @todo, so they'll be done +## later. +## +## Be careful about adding any upper-case variables, as they are subject +## to being inlined. See comments above the __END__ above. +## +sub dodir +{ + local($dir) = @_; + $dir =~ s,/+$,,; ## remove any trailing slash. + unless (opendir(DIR, "$dir/.")) { + &clear_message if $VERBOSE && $STDERR_SCREWS_STDOUT; + warn qq($0: can't opendir "$dir/".\n); + return; + } + + if ($VERBOSE) { + &message($dir); + $vv_print = $vv_size = 0; + } + + @files = sort readdir(DIR) if $DO_SORT; + + while (defined($name = eval $NEXT_DIR_ENTRY)) + { + next if $name eq '.' || $name eq '..'; ## never follow these. + + ## create full relative pathname. + $file = $dir eq '.' ? $name : "$dir/$name"; + + ## if link and skipping them, do so. + if ($NOLINKS && -l $file) { + warn qq/skip (symlink): $file\n/ if $WHY; + next; + } + + ## skip things unless files or directories + unless (-f $file || -d _) { + if ($WHY) { + $why = (-S _ && "socket") || + (-p _ && "pipe") || + (-b _ && "block special")|| + (-c _ && "char special") || "somekinda special"; + warn qq/skip ($why): $file\n/; + } + next; + } + + ## skip things we can't read + unless (-r _) { + if ($WHY) { + $why = (-l $file) ? "follow" : "read"; + warn qq/skip (can't $why): $file\n/; + } + next; + } + + ## skip things that are empty + unless (-s _) { + warn qq/skip (empty): $file\n/ if $WHY; + next; + } + + ## Note file device & inode. If -xdev, skip if appropriate. + ($dev, $inode) = (stat(_))[$STAT_DEV, $STAT_INODE]; + if ($XDEV && defined $xdev{$dev}) { + warn qq/skip (other device): $file\n/ if $WHY; + next; + } + $id = "$dev,$inode"; + + ## special work for a directory + if (-d _) { + ## Do checks for directory file endings. + if ($DO_DSKIP_TEST && (eval $DSKIP_TEST)) { + warn qq/skip (-dskip): $file\n/ if $WHY; + next; + } + ## do checks for -name/-regex/-path tests + if ($DO_DGLOB_TESTS && !(eval $DGLOB_TESTS)) { + warn qq/skip (dirname): $file\n/ if $WHY; + next; + } + + ## _never_ redo a directory + if (defined $dir_done{$id}) { + warn qq/skip (did as "$dir_done{$id}"): $file\n/ if $WHY; + next; + } + $dir_done{$id} = $file; ## mark it done. + unshift(@todo, $file); ## add to the list to do. + next; + } + if ($WHY == 0 && $VERBOSE > 1) { + if ($VERBOSE>2||$vv_print++>$VV_PRINT_COUNT||($vv_size+=-s _)>$VV_SIZE){ + &message($file); + $vv_print = $vv_size = 0; + } + } + + ## do time-related tests + if ($NEWER || $OLDER) { + $_ = (stat(_))[$STAT_MTIME]; + if ($NEWER && $_ < $NEWER) { + warn qq/skip (too old): $file\n/ if $WHY; + next; + } + if ($OLDER && $_ > $OLDER) { + warn qq/skip (too new): $file\n/ if $WHY; + next; + } + } + + ## do checks for file endings + if ($DO_SKIP_TEST && (eval $SKIP_TEST)) { + warn qq/skip (-skip): $file\n/ if $WHY; + next; + } + + ## do checks for -name/-regex/-path tests + if ($DO_GLOB_TESTS && !(eval $GLOB_TESTS)) { + warn qq/skip (filename): $file\n/ if $WHY; + next; + } + + + ## If we're not repeating files, + ## skip this one if we've done it, or note we're doing it. + unless ($DOREP) { + if (defined $file_done{$id}) { + warn qq/skip (did as "$file_done{$id}"): $file\n/ if $WHY; + next; + } + $file_done{$id} = $file; + } + + if ($DO_MAGIC_TESTS) { + if (!open(FILE_IN, $file)) { + &clear_message if $VERBOSE && $STDERR_SCREWS_STDOUT; + warn qq/$0: can't open: $file\n/; + next; + } + unless (read(FILE_IN, $magic'H, $HEADER_BYTES)) { + &clear_message if $VERBOSE && $STDERR_SCREWS_STDOUT; + warn qq/$0: can't read from "$file"\n"/; + close(FILE_IN); + next; + } + + eval $MAGIC_TESTS; + if ($magic'val) { + close(FILE_IN); + warn qq/skip (magic): $file\n/ if $WHY; + next; + } + seek(FILE_IN, 0, 0); ## reset for later <FILE_IN> + } + + if ($WHY != 0 && $VERBOSE > 1) { + if ($VERBOSE>2||$vv_print++>$VV_PRINT_COUNT||($vv_size+=-s _)>$VV_SIZE){ + &message($file); + $vv_print = $vv_size = 0; + } + } + + if ($DELAY) { + sleep($DELAY); + } + + if ($FIND_ONLY) { + &clear_message if $VERBOSE && $STDERR_SCREWS_STDOUT; + print $file, "\n"; + $retval=0; ## we've found something + close(FILE_IN) if $DO_MAGIC_TESTS; + next; + } else { + ## if we weren't doing magic tests, file won't be open yet... + if (!$DO_MAGIC_TESTS && !open(FILE_IN, $file)) { + &clear_message if $VERBOSE && $STDERR_SCREWS_STDOUT; + warn qq/$0: can't open: $file\n/; + next; + } + if ($LIST_ONLY && $CAN_USE_FAST_LISTONLY) { + ## + ## This is rather complex, but buys us a LOT when we're just + ## listing files and not the individual internal lines. + ## + local($size) = 4096; ## block-size in which to do reads + local($nl); ## will point to $_'s ending newline. + local($read); ## will be how many bytes read. + local($_) = ''; ## Starts out empty + local($hold); ## (see below) + + while (($read = read(FILE_IN,$_,$size,length($_)))||length($_)) + { + undef @parts; + ## if read a full block, but no newline, need to read more. + while ($read == $size && ($nl = rindex($_, "\n")) < 0) { + push(@parts, $_); ## save that part + $read = read(FILE_IN, $_, $size); ## keep trying + } + + ## + ## If we had to save parts, must now combine them together. + ## adjusting $nl to reflect the now-larger $_. This should + ## be a lot more efficient than using any kind of .= in the + ## loop above. + ## + if (@parts) { + local($lastlen) = length($_); #only need if $nl >= 0 + $_ = join('', @parts, $_); + $nl = length($_) - ($lastlen - $nl) if $nl >= 0; + } + + ## + ## If we're at the end of the file, then we can use $_ as + ## is. Otherwise, we need to remove the final partial-line + ## and save it so that it'll be at the beginning of the + ## next read (where the rest of the line will be layed in + ## right after it). $hold will be what we should save + ## until next time. + ## + if ($read != $size || $nl < 0) { + $hold = ''; + } else { + $hold = substr($_, $nl + 1); + substr($_, $nl + 1) = ''; + } + + ## + ## Now have a bunch of full lines in $_. Use it. + ## + if (eval $REGEX_TEST) { + &clear_message if $VERBOSE && $STDERR_SCREWS_STDOUT; + print $file, "\n"; + $retval=0; ## we've found something + + last; + } + + ## Prepare for next read.... + $_ = $hold; + } + + } else { ## else not using faster block scanning..... + + $lines_printed = 0 if $NICE; + while (<FILE_IN>) { + study; + next unless (eval $REGEX_TEST); + + ## + ## We found a matching line. + ## + $retval=0; + &clear_message if $VERBOSE && $STDERR_SCREWS_STDOUT; + if ($LIST_ONLY) { + print $file, "\n"; + last; + } else { + ## prepare to print line. + if ($NICE && $lines_printed++ == 0) { + print '-' x 70, "\n" if $NICE > 1; + print $file, ":\n"; + } + + ## + ## Print all the prelim stuff. This looks less efficient + ## than it needs to be, but that's so that when the eval + ## is compiled (and the tests are optimized away), the + ## result will be less actual PRINTs than the more natural + ## way of doing these tests.... + ## + if ($NICE) { + if ($REPORT_LINENUM) { + print " line $.: "; + } else { + print " "; + } + } elsif ($REPORT_LINENUM && $PREPEND_FILENAME) { + print "$file,:$.: "; + } elsif ($PREPEND_FILENAME) { + print "$file: "; + } elsif ($REPORT_LINENUM) { + print "$.: "; + } + print $_; + print "\n" unless m/\n$/; + } + } + print "\n" if ($NICE > 1) && $lines_printed; + } + close(FILE_IN); + } + } + closedir(DIR); +} + +__END__ +.00; ## finish .ig + +'di \" finish diversion--previous line must be blank +.nr nl 0-1 \" fake up transition to first page again +.nr % 0 \" start at page 1 +.\"__________________NORMAL_MAN_PAGE_BELOW_________________ +.ll+10n +.TH search 1 "Dec 17, 1994" +.SH SEARCH +search \- search files (a'la grep) in a whole directory tree. +.SH SYNOPSIS +search [ grep-like and find-like options] [regex ....] +.SH DESCRIPTION +.I Search +is more or less a combo of 'find' and 'grep' (although the regular +expression flavor is that of the perl being used, which is closer to +egrep's than grep's). + +.I Search +does generally the same kind of thing that +.nf + find <blah blah> | xargs egrep <blah blah> +.fi +does, but is +.I much +more powerful and efficient (and intuitive, I think). + +This manual describes +.I search +as of version "941227.4". You can always find the latest version at +.nf + http://www.wg.omron.co.jp/~jfriedl/perl/index.html +.fi + +.SH "QUICK EXAMPLE" +Basic use is simple: +.nf + % search jeff +.fi +will search files in the current directory, and all sub directories, for +files that have "jeff" in them. The lines will be listed with the +containing file's name prepended. +.PP +If you list more than one regex, such as with +.nf + % search jeff Larry Randal+ 'Stoc?k' 'C.*son' +.fi +then a line containing any of the regexes will be listed. +This makes it effectively the same as +.nf + % search 'jeff|Larry|Randal+|Stoc?k|C.*son' +.fi +However, listing them separately is much more efficient (and is easier +to type). +.PP +Note that in the case of these examples, the +.B \-w +(list whole-words only) option would be useful. +.PP +Normally, various kinds of files are automatically removed from consideration. +If it has has a certain ending (such as ".tar", ".Z", ".o", .etc), or if +the beginning of the file looks like a binary, it'll be excluded. +You can control exactly how this works -- see below. One quick way to +override this is to use the +.B \-all +option, which means to consider all the files that would normally be +automatically excluded. +Or, if you're curious, you can use +.B \-why +to have notes about what files are skipped (and why) printed to stderr. + +.SH "BASIC OVERVIEW" +Normally, the search starts in the current directory, considering files in +all subdirectories. + +You can use the +.I ~/.search +file to control ways to automatically exclude files. +If you don't have this file, a default one will kick in, which automatically +add +.nf + -skip .o .Z .gif +.fi +(among others) to exclude those kinds of files (which you probably want to +skip when searching for text, as is normal). +Files that look to be be binary will also be excluded. + +Files ending with "#" and "~" will also be excluded unless the +.B -x~ +option is given. + +You can use +.B -showrc +to show what kinds of files will normally be skipped. +See the section on the startup file +for more info. + +You can use the +.B -all +option to indicate you want to consider all files that would otherwise be +skipped by the startup file. + +Based upon various other flags (see "WHICH FILES TO CONSIDER" below), +more files might be removed from consideration. For example +.nf + -mtime 3 +.fi +will exclude files that aren't at least three days old (change the 3 to -3 +to exclude files that are more than three days old), while +.nf + -skip .* +.fi +would exclude any file beginning with a dot (of course, '.' and '..' are +special and always excluded). + +If you'd like to see what files are being excluded, and why, you can get the +list via the +.B \-why +option. + +If a file makes it past all the checks, it is then "considered". +This usually means it is greped for the regular expressions you gave +on the command line. + +If any of the regexes match a line, the line is printed. +However, if +.B -list +is given, just the filename is printed. Or, if +.B -nice +is given, a somewhat more (human-)readable output is generated. + +If you're searching a huge tree and want to keep informed about how +the search is progressing, +.B -v +will print (to stderr) the current directory being searched. +Using +.B -vv +will also print the current file "every so often", which could be useful +if a directory is huge. Using +.B -vvv +will print the update with every file. + +Below is the full listing of options. + +.SH "OPTIONS TELLING *WHERE* TO SEARCH" +.TP +.BI -dir " DIR" +Start searching at the named directory instead of the current directory. +If multiple +.B -dir +arguments are given, multiple trees will be searched. +.TP +.BI -ddir " DIR" +Like +.B -dir +except it flushes any previous +.B -dir +directories (i.e. "-dir A -dir B -dir C" will search A, B, and C, while +"-dir A -ddir B -dir C" will search only B and C. This might be of use +in the startup file (see that section below). +.TP +.B -xdev +Stay on the same filesystem as the starting directory/directories. +.TP +.B -sort +Sort the items in a directory before processing them. +Normally they are processed in whatever order they happen to be read from +the directory. +.TP +.B -nolinks +Don't follow symbolic links. Normally they're followed. + +.SH "OPTIONS CONTROLLING WHICH FILES TO CONSIDER AND EXCLUDE" +.TP +.BI -mtime " NUM" +Only consider files that were last changed more than +.I NUM +days ago +(less than +.I NUM +days if +.I NUM +has '-' prepended, i.e. "-mtime -2.5" means to consider files that +have been changed in the last two and a half days). +.TP +.B -older FILE +Only consider files that have not changed since +.I FILE +was last changed. +If there is any upper case in the "-older", "or equal" is added to the sense +of the test. Therefore, "search -older ./file regex" will never consider +"./file", while "search -Older ./file regex" will. + +If a file is a symbolic link, the time used is that of the file and not the +link. +.TP +.BI -newer " FILE" +Opposite of +.BR -older . +.TP +.BI -name " GLOB" +Only consider files that match the shell filename pattern +.IR GLOB . +The check is only done on a file's name (use +.B -path +to check the whole path, and use +.B -dname +to check directory names). + +Multiple specifications can be given by separating them with spaces, a'la +.nf + -name '*.c *.h' +.fi +to consider C source and header files. +If +.I GLOB +doesn't contain any special pattern characters, a '*' is prepended. +This last example could have been given as +.nf + -name '.c .h' +.fi +It could also be given as +.nf + -name .c -name .h +.fi +or +.nf + -name '*.c' -name '*.h' +.fi +or +.nf + -name '*.[ch]' +.fi +(among others) +but in this last case, you have to be sure to supply the leading '*'. +.TP +.BI -path " GLOB" +Like +.B -name +except the entire path is checked against the pattern. +.TP +.B -regex " REGEX" +Considers files whose names (not paths) match the given perl regex +exactly. +.TP +.BI -iname " GLOB" +Case-insensitive version of +.BR -name . +.TP +.BI -ipath " GLOB" +Case-insensitive version of +.BR -path . +.TP +.BI -iregex " REGEX" +Case-insensitive version of +.BR -regex . + +.TP +.BI -dpath " GLOB" +Only search down directories whose path matches the given pattern (this +doesn't apply to the initial directory given by +.BI -dir , +of course). +Something like +.nf + -dir /usr/man -dpath /usr/man/man* +.fi +would completely skip +"/usr/man/cat1", "/usr/man/cat2", etc. +.TP +.BI -dskip " GLOB" +Skips directories whose name (not path) matches the given pattern. +Something like +.nf + -dir /usr/man -dskip cat* +.fi +would completely skip any directory in the tree whose name begins with "cat" +(including "/usr/man/cat1", "/usr/man/cat2", etc.). +.TP +.BI -dregex " REGEX" +Like +.BI -dpath , +but the pattern is a full perl regex. Note that this quite different +from +.B -regex +which considers only file names (not paths). This option considers +full directory paths (not just names). It's much more useful this way. +Sorry if it's confusing. +.TP +.BI -dpath " GLOB" +This option exists, but is probably not very useful. It probably wants to +be like the '-below' or something I mention in the "TODO" section. +.TP +.BI -idpath " GLOB" +Case-insensitive version of +.BR -dpath . +.TP +.BI -idskip " GLOB" +Case-insensitive version of +.BR -dskip . +.TP +.BI -idregex " REGEX" +Case-insensitive version of +.BR -dregex . +.TP +.B -all +Ignore any 'magic' or 'option' lines in the startup file. +The effect is that all files that would otherwise be automatically +excluded are considered. +.TP +.BI -x SPECIAL +Arguments starting with +.B -x +(except +.BR -xdev , +explained elsewhere) do special interaction with the +.I ~/.search +startup file. Something like +.nf + -xflag1 -xflag2 +.fi +will turn on "flag1" and "flag2" in the startup file (and is +the same as "-xflag1,flag2"). You can use this to write your own +rules for what kinds of files are to be considered. + +For example, the internal-default startup file contains the line +.nf + <!~> option: -skip '~ #' +.fi +This means that if the +.B -x~ +flag is +.I not +seen, the option +.nf + -skip '~ #' +.fi +should be done. +The effect is that emacs temp and backup files are not normally +considered, but you can included them with the -x~ flag. + +You can write your own rules to customize +.I search +in powerful ways. See the STARTUP FILE section below. +.TP +.B -why +Print a message (to stderr) when and why a file is not considered. + +.SH "OPTIONS TELLING WHAT TO DO WITH FILES THAT WILL BE CONSIDERED" +.TP +.B -find +(you can use +.B -f +as well). +This option changes the basic action of +.IR search . + +Normally, if a file is considered, it is searched +for the regular expressions as described earlier. However, if this option +is given, the filename is printed and no searching takes place. This turns +.I search +into a 'find' of some sorts. + +In this case, no regular expressions are needed on the command line +(any that are there are silently ignored). + +This is not intended to be a replacement for the 'find' program, +but to aid +you in understanding just what files are getting past the exclusion checks. +If you really want to use it as a sort of replacement for the 'find' program, +you might want to use +.B -all +so that it doesn't waste time checking to see if the file is binary, etc +(unless you really want that, of course). + +If you use +.BR -find , +none of the "GREP-LIKE OPTIONS" (below) matter. + +As a replacement for 'find', +.I search +is probably a bit slower (or in the case of GNU find, a lot slower -- +GNU find is +.I unbelievably +fast). +However, "search -ffind" +might be more useful than 'find' when options such as +.B -skip +are used (at least until 'find' gets such functionality). +.TP +.B -ffind +(or +.BR -ff ) +A faster more 'find'-like find. Does +.nf + -find -all -dorep +.fi +.SH "GREP-LIKE OPTIONS" +These options control how a searched file is accessed, +and how things are printed. +.TP +.B -i +Ignore letter case when matching. +.TP +.B -w +Consider only whole-word matches ("whole word" as defined by perl's "\\b" +regex). +.TP +.B -u +If the regex(es) is/are simple, try to modify them so that they'll work +in manpage-like underlined text (i.e. like _^Ht_^Hh_^Hi_^Hs). +This is very rudimentary at the moment. +.TP +.B -list +(you can use +.B -l +too). +Don't print matching lines, but the names of files that contain matching +lines. This will likely be *much* faster, as special optimizations are +made -- particularly with large files. +.TP +.B -n +Pepfix each line by its line number. +.TP +.B -nice +Not a grep-like option, but similar to +.BR -list , +so included here. +.B -nice +will have the output be a bit more human-readable, with matching lines printed +slightly indented after the filename, a'la +.nf + + % search foo + somedir/somefile: line with foo in it + somedir/somefile: some food for thought + anotherdir/x: don't be a buffoon! + % + +.fi +will become +.nf + + % search -nice foo + somedir/somefile: + line with foo in it + some food for thought + anotherdir/x: + don't be a buffoon! + % + +.fi +This option due to Lionel Cons. +.TP +.B -nnice +Be a bit nicer than +.BR -nice . +Prefix each file's output by a rule line, and follow with an extra blank line. +.TP +.B -h +Don't prepend each output line with the name of the file +(meaningless when +.B -find +or +.B -l +are given). + +.SH "OTHER OPTIONS" +.TP +.B -help +Print the usage information. +.TP +.B -version +Print the version information and quit. +.TP +.B -v +Set the level of message verbosity. +.B -v +will print a note whenever a new directory is entered. +.B -vv +will also print a note "every so often". This can be useful to see +what's happening when searching huge directories. +.B -vvv +will print a new with every file. +.B -vvvv +is +-vvv +plus +.BR -why . +.TP +.B -e +This ends the options, and can be useful if the regex begins with '-'. +.TP +.B -showrc +Shows what is being considered in the startup file, then exits. +.TP +.B -dorep +Normally, an identical file won't be checked twice (even with multiple +hard or symbolic links). If you're just trying to do a fast +.BR -find , +the bookkeeping to remember which files have been seen is not desirable, +so you can eliminate the bookkeeping with this flag. + +.SH "STARTUP FILE" +When +.I search +starts up, it processes the directives in +.IR ~/.search . +If no such file exists, a default +internal version is used. + +The internal version looks like: +.nf + + magic: 32 : $H =~ m/[\ex00-\ex06\ex10-\ex1a\ex1c-\ex1f\ex80\exff]{2}/ + option: -skip '.a .COM .elc .EXE .gz .o .pbm .xbm .dvi' + option: -iskip '.tarz .zip .z .lzh .jpg .jpeg .gif .uu' + <!~> option: -skip '~ #' + +.fi +If you wish to create your own "~/.search", +you might consider copying the above, and then working from there. + +There are two kinds of directives in a startup file: "magic" and "option". +.RS 0n +.TP +OPTION +Option lines will automatically do the command-line options given. +For example, the line +.nf + option: -v +.fi +in you startup file will turn on -v every time, without needing to type it +on the command line. + +The text on the line after the "option:" directive is processed +like the Bourne shell, so make sure to pay attention to quoting. +.nf + option: -skip .exe .com +.fi +will give an error (".com" by itself isn't a valid option), while +.nf + option: -skip ".exe .com" +.fi +will properly include it as part of -skip's argument. + +.TP +MAGIC +Magic lines are used to determine if a file should be considered a binary +or not (the term "magic" refers to checking a file's magic number). These +are described in more detail below. +.RE + +Blank lines and comments (lines beginning with '#') are allowed. + +If a line begins with <...>, then it's a check to see if the +directive on the line should be done or not. The stuff inside the <...> +can contain perl's && (and), || (or), ! (not), and parens for grouping, +along with "flags" that might be indicated by the user with +.BI -x flag +options. + +For example, using "-xfoo" will cause "foo" to be true inside the <...> +blocks. Therefore, a line beginning with "<foo>" would be done only when +"-xfoo" had been specified, while a line beginning with "<!foo>" would be +done only when "-xfoo" is not specified (of course, a line without any <...> +is done in either case). + +A realistic example might be +.nf + <!v> -vv +.fi +This will cause -vv messages to be the default, but allow "-xv" to override. + +There are a few flags that are set automatically: +.RS +.TP +.B TTY +true if the output is to the screen (as opposed to being redirected to a file). +You can force this (as with all the other automatic flags) with -xTTY. +.TP +.B -v +True if -v was specified. If -vv was specified, both +.B -v +and +.B -vv +flags are true (and so on). +.TP +.B -nice +True if -nice was specified. Same thing about -nnice as for -vv. +.PP +.TP +.B -list +true if -list (or -l) was given. +.TP +.B -dir +true if -dir was given. +.RE + +Using this info, you might change the last example to +.nf + + <!v && !-v> option: -vv + +.fi +The added "&& !-v" means "and if the '-v' option not given". +This will allow you to use "-v" alone on the command line, and not +have this directive add the more verbose "-vv" automatically. + +.RS 0 +Some other examples: +.TP +<!-dir && !here> option: -dir ~/ +Effectively make the default directory your home directory (instead of the +current directory). Using -dir or -xhere will undo this. +.TP +<tex> option: -name .tex -dir ~/pub +Create '-xtex' to search only "*.tex" files in your ~/pub directory tree. +Actually, this could be made a bit better. If you combine '-xtex' and '-dir' +on the command line, this directive will add ~/pub to the list, when you +probably want to use the -dir directory only. You could do +.nf + + <tex> option: -name .tex + <tex && !-dir> option: -dir ~/pub +.fi + +to will allow '-xtex' to work as before, but allow a command-line "-dir" +to take precedence with respect to ~/pub. +.TP +<fluff> option: -nnice -sort -i -vvv +Combine a few user-friendly options into one '-xfluff' option. +.TP +<man> option: -ddir /usr/man -v -w +When the '-xman' option is given, search "/usr/man" for whole-words +(of whatever regex or regexes are given on the command line), with -v. +.RE + +The lines in the startup file are executed from top to bottom, so something +like +.nf + + <both> option: -xflag1 -xflag2 + <flag1> option: ...whatever... + <flag2> option: ...whatever... + +.fi +will allow '-xboth' to be the same as '-xflag1 -xflag2' (or '-xflag1,flag2' +for that matter). However, if you put the "<both>" line below the others, +they will not be true when encountered, so the result would be different +(and probably undesired). + +The "magic" directives are used to determine if a file looks to be binary +or not. The form of a magic line is +.nf + magic: \fISIZE\fP : \fIPERLCODE\fP +.fi +where +.I SIZE +is the number of bytes of the file you need to check, and +.I PERLCODE +is the code to do the check. Within +.IR PERLCODE , +the variable $H will hold at least the first +.I SIZE +bytes of the file (unless the file is shorter than that, of course). +It might hold more bytes. The perl should evaluate to true if the file +should be considered a binary. + +An example might be +.nf + magic: 6 : substr($H, 0, 6) eq 'GIF87a' +.fi +to test for a GIF ("-iskip .gif" is better, but this might be useful +if you have images in files without the ".gif" extension). + +Since the startup file is checked from top to bottom, you can be a bit +efficient: +.nf + magic: 6 : ($x6 = substr($H, 0, 6)) eq 'GIF87a' + magic: 6 : $x6 eq 'GIF89a' +.fi +You could also write the same thing as +.nf + magic: 6 : (($x6 = substr($H, 0, 6)) eq 'GIF87a') || ## an old gif, or.. \e + $x6 eq 'GIF89a' ## .. a new one. +.fi +since newlines may be escaped. + +The default internal startup file includes +.nf + magic: 32 : $H =~ m/[\ex00-\ex06\ex10-\ex1a\ex1c-\ex1f\ex80\exff]{2}/ +.fi +which checks for certain non-printable characters, and catches a large +number of binary files, including most system's executables, linkable +objects, compressed, tarred, and otherwise folded, spindled, and mutilated +files. + +Another example might be +.nf + ## an archive library + magic: 17 : substr($H, 0, 17) eq "!<arch>\en__.SYMDEF" +.fi + +.SH "RETURN VALUE" +.I Search +returns zero if lines (or files, if appropriate) were found, +or if no work was requested (such as with +.BR -help ). +Returns 1 if no lines (or files) were found. +Returns 2 on error. + +.SH TODO +Things I'd like to add some day: +.nf + + show surrounding lines (context). + + highlight matched portions of lines. + + add '-and', which can go between regexes to override + the default logical or of the regexes. + + add something like + -below GLOB + which will examine a tree and only consider files that + lie in a directory deeper than one named by the pattern. + + add 'warning' and 'error' directives. + + add 'help' directive. +.fi +.SH BUGS +If -xdev and multiple -dir arguments are given, any file in any of the +target filesystems are allowed. It would be better to allow each filesystem +for each separate tree. + +Multiple -dir args might also cause some confusing effects. Doing +.nf + -dir some/dir -dir other +.fi +will search "some/dir" completely, then search "other" completely. This +is good. However, something like +.nf + -dir some/dir -dir some/dir/more/specific +.fi +will search "some/dir" completely *except for* "some/dir/more/specific", +after which it will return and be searched. Not really a bug, but just sort +of odd. + +File times (for -newer, etc.) of symbolic links are for the file, not the +link. This could cause some misunderstandings. + +Probably more. Please let me know. +.SH AUTHOR +Jeffrey Friedl, Omron Corp (jfriedl@omron.co.jp) +.br +http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html + +.SH "LATEST SOURCE" +See http://www.wg.omron.co.jp/~jfriedl/perl/index.html +__END__ +:endofperl diff --git a/win32/bin/test.bat b/win32/bin/test.bat new file mode 100644 index 0000000000..e6b7b38160 --- /dev/null +++ b/win32/bin/test.bat @@ -0,0 +1,143 @@ +@rem = ' +@echo off +if exist perl.exe goto perlhere +echo Cannot run without perl.exe in current directory!! Did you build it? +pause +goto endofperl +:perlhere +if exist perlglob.exe goto perlglobhere +echo Cannot run without perlglob.exe in current directory!! Did you build it? +pause +goto endofperl +:perlglobhere +perl %0.bat %1 %2 %3 %4 %5 %6 %7 %8 %9 +goto endofperl +@rem '; + +#Portions (C) 1995 Microsoft Corporation. All rights reserved. +# Developed by hip communications inc., http://info.hip.com/info/ + + +# This is written in a peculiar style, since we're trying to avoid +# most of the constructs we'll be testing for. + +$| = 1; + +if ($ARGV[0] eq '-v') { + $verbose = 1; + shift; +} + + +# WYT 1995-05-02 +chdir 't' if -f 't/TESTNT'; + + +if ($ARGV[0] eq '') { +# @ARGV = split(/[ \n]/, +# `echo base/*.t comp/*.t cmd/*.t io/*.t; echo op/*.t lib/*.t`); +# `ls base/*.t comp/*.t cmd/*.t io/*.t op/*.t lib/*.t`); + +# WYT 1995-05-02 wildcard expansion, +# `perl -e "print( join( ' ', \@ARGV ) )" base/*.t comp/*.t cmd/*.t io/*.t op/*.t lib/*.t nt/*.t`); + +# WYT 1995-06-01 removed all dependency on perlglob +# WYT 1995-11-28 hacked up to cope with braindead Win95 console. + push( @ARGV, `dir/s/b base` ); + push( @ARGV, `dir/s/b comp` ); + push( @ARGV, `dir/s/b cmd` ); + push( @ARGV, `dir/s/b io` ); + push( @ARGV, `dir/s/b op` ); + push( @ARGV, `dir/s/b lib` ); + push( @ARGV, `dir/s/b nt` ); + + grep( chomp, @ARGV ); + @ARGV = grep( /\.t$/, @ARGV ); + grep( s/.*t\\//, @ARGV ); +} + +$sharpbang = 0; + +$bad = 0; +$good = 0; +$total = @ARGV; +while ($test = shift) { + if ($test =~ /^$/) { + next; + } + $te = $test; +# chop off 't' extension + chop($te); + print "$te" . '.' x (15 - length($te)); + if ($sharpbang) { + open(results,"./$test |") || (print "can't run.\n"); + } else { + $switch = ''; +# open(results,"./perl$switch $test |") || (print "can't run.\n"); + open(results,"perl$switch $test |") || (print "can't run.\n"); + } + $ok = 0; + $next = 0; + while (<results>) { + if ($verbose) { + print $_; + } + unless (/^#/||/^$/) { + if (/^1\.\.([0-9]+)/) { + $max = $1; + $totmax += $max; + $files += 1; + $next = 1; + $ok = 1; + } else { + $next = $1, $ok = 0, last if /^not ok ([0-9]*)/; + if (/^ok (.*)/ && $1 == $next) { + $next = $next + 1; + } else { + $ok = 0; + } + } + } + } + $next = $next - 1; + if ($ok && $next == $max) { + print "ok\n"; + $good = $good + 1; + } else { + $next += 1; + print "FAILED on test $next\n"; + $bad = $bad + 1; + $_ = $test; + if (/^base/) { + die "Failed a basic test--cannot continue.\n"; + } + } +} + +if ($bad == 0) { + if ($ok) { + print "All tests successful.\n"; + } else { + die "FAILED--no tests were run for some reason.\n"; + } +} else { + $pct = sprintf("%.2f", $good / $total * 100); + if ($bad == 1) { + warn "Failed 1 test, $pct% okay.\n"; + } else { + die "Failed $bad/$total tests, $pct% okay.\n"; + } +} + + +# WYT 1995-05-03 times not implemented. +#($user,$sys,$cuser,$csys) = times; +#print sprintf("u=%g s=%g cu=%g cs=%g files=%d tests=%d\n", +# $user,$sys,$cuser,$csys,$files,$totmax); + +#`del /f Cmd_while.tmp Comp.try null 2>NULL`; + +unlink 'Cmd_while.tmp', 'Comp.try', 'null'; + +__END__ +:endofperl diff --git a/win32/bin/webget.bat b/win32/bin/webget.bat new file mode 100644 index 0000000000..e77bb88ced --- /dev/null +++ b/win32/bin/webget.bat @@ -0,0 +1,1099 @@ +@rem = '--*-Perl-*--'; +@rem = ' +@echo off +perl -S %0.bat %1 %2 %3 %4 %5 %6 %7 %8 %9 +goto endofperl +@rem '; +#!/usr/local/bin/perl -w + +#- +#!/usr/local/bin/perl -w +$version = "951121.18"; +$comments = 'jfriedl@omron.co.jp'; + +## +## This is "webget" +## +## Jeffrey Friedl (jfriedl@omron.co.jp), July 1994. +## Copyright 19.... ah hell, just take it. +## Should work with either perl4 or perl5 +## +## BLURB: +## Given a URL on the command line (HTTP and FTP supported at the moment), +## webget fetches the named object (HTML text, images, audio, whatever the +## object happens to be). Will automatically use a proxy if one is defined +## in the environment, follow "this URL has moved" responses, and retry +## "can't find host" responses from a proxy in case host lookup was slow). +## Supports users & passwords (FTP), Basic Authorization (HTTP), update-if- +## modified (HTTP), and much more. Works with perl4 or perl5. + +## +## More-detailed instructions in the comment block below the history list. +## + +## +## To-do: +## Add gopher support. +## Fix up how error messages are passed among this and the libraries. +## + +## 951219.19 +## Lost ftp connections now die with a bit more grace. +## +## 951121.18 +## Add -nnab. +## Brought the "usage" string in line with reality. +## +## 951114.17 +## Added -head. +## Added -update/-refresh/-IfNewerThan. If any URL was not pulled +## because it was not out of date, an exit value of 2 is returned. +## +## 951031.16 +## Added -timeout. Cleaned up (a bit) the exit value. Now exits +## with 1 if all URLs had some error (timeout exits immediately with +## code 3, though. This is subject to change). Exits with 0 if any +## URL was brought over safely. +## +## 951017.15 +## Neat -pf, -postfile idea from Lorrie Cranor +## (http://www.ccrc.wustl.edu/~lorracks/) +## +## 950912.14 +## Sigh, fixed a typo. +## +## 950911.13 +## Added Basic Authorization support for http. See "PASSWORDS AND STUFF" +## in the documentation. +## +## 950911.12 +## Implemented a most-excellent suggestion by Anthony D'Atri +## (aad@nwnet.net), to be able to automatically grab to a local file of +## the same name as the URL. See the '-nab' flag. +## +## 950706.11 +## Quelled small -w warning (thanks: Lars Rasmussen <gnort@daimi.aau.dk>) +## +## 950630.10 +## Steve Campbell to the rescue again. FTP now works when supplied +## with a userid & password (eg ftp://user:pass@foo.bar.com/index.txt). +## +## 950623.9 +## Incorporated changes from Steve Campbell (steven_campbell@uk.ibm.com) +## so that the ftp will work when no password is required of a user. +## +## 950530.8 +## Minor changes: +## Eliminate read-size warning message when size unknown. +## Pseudo-debug/warning messages at the end of debug_read now go to +## stderr. Some better error handling when trying to contact systems +## that aren't really set up for ftp. Fixed a bug concerning FTP access +## to a root directory. Added proxy documentation at head of file. +## +## 950426.6,7 +## Complete Overhaul: +## Renamed from httpget. Added ftp support (very sketchy at the moment). +## Redid to work with new 'www.pl' library; chucked 'Www.pl' library. +## More or less new and/or improved in many ways, but probably introduced +## a few bugs along the way. +## +## 941227.5 +## Added follow stuff (with -nofollow, etc.) +## Added -updateme. Cool! +## Some general tidying up. +## +## 941107.4 +## Allowed for ^M ending a header line... PCs give those kind of headers. +## +## 940820.3 +## First sorta'clean net release. +## +## + +## +##> +## +## Fetch http and/or ftp URL(s) given on the command line and spit to +## STDOUT. +## +## Options include: +## -V, -version +## Print version information; exit. +## +## -p, -post +## If the URL looks like a reply to a form (i.e. has a '?' in it), +## the request is POST'ed instead of GET'ed. +## +## -head +## Gets the header only (for HTTP). This might include such useful +## things as 'Last-modified' and 'Content-length' fields +## (a lack of a 'Last-modified' might be a good indication that it's +## a CGI). +## +## The "-head" option implies "-nostrip", but does *not* imply, +## for example "-nofollow". +## +## +## -pf, -postfile +## The item after the '?' is taken as a local filename, and the contents +## are POST'ed as with -post +## +## -nab, -f, -file +## Rather than spit the URL(s) to standard output, unconditionally +## dump to a file (or files) whose name is that as used in the URL, +## sans path. I like '-nab', but supply '-file' as well since that's +## what was originally suggested. Also see '-update' below for the +## only-if-changed version. +## +## -nnab +## Like -nab, but in addtion to dumping to a file, dump to stdout as well. +## Sort of like the 'tee' command. +## +## -update, -refresh +## Do the same thing as -nab, etc., but does not bother pulling the +## URL if it older than the localfile. Only applies to HTTP. +## Uses the HTTP "If-Modified-Since" field. If the URL was not modified +## (and hence not changed), the return value is '2'. +## +## -IfNewerThan FILE +## -int FILE +## Only pulls URLs if they are newer than the date the local FILE was +## last written. +## +## -q, -quiet +## Suppresses all non-essential informational messages. +## +## -nf, -nofollow +## Normally, a "this URL has moved" HTTP response is automatically +## followed. Not done with -nofollow. +## +## -nr, -noretry +## Normally, an HTTP proxy response of "can't find host" is retried +## up to three times, to give the remote hostname lookup time to +## come back with an answer. This suppresses the retries. This is the +## same as '-retry 0'. +## +## -r#, -retry#, -r #, -retry # +## Sets the number of times to retry. Default 3. +## +## -ns, -nostrip +## For HTTP items (including other items going through an HTTP proxy), +## the HTTP response header is printed rather than stripped as default. +## +## -np, -noproxy +## A proxy is not used, even if defined for the protocol. +## +## -h, -help +## Show a usage message and exit. +## +## -d, -debug +## Show some debugging messages. +## +## -updateme +## The special and rather cool flag "-updateme" will see if webget has +## been updated since you got your version, and prepare a local +## version of the new version for you to use. Keep updated! (although +## you can always ask to be put on the ping list to be notified when +## there's a new version -- see the author's perl web page). +## +## -timeout TIMESPAN +## -to TIMESPAN +## Time out if a connection can not be made within the specified time +## period. TIMESPAN is normally in seconds, although a 'm' or 'h' may +## be appended to indicate minutes and hours. "-to 1.5m" would timeout +## after 90 seconds. +## +## (At least for now), a timeout causes immediate program death (with +## exit value 3). For some reason, the alarm doesn't always cause a +## waiting read or connect to abort, so I just die immediately.. /-: +## +## I might consider adding an "entire fetch" timeout, if someone +## wants it. +## +## PASSWORDS AND SUCH +## +## You can use webget to do FTP fetches from non-Anonymous systems and +## accounts. Just put the required username and password into the URL, +## as with +## webget 'ftp:/user:password@ftp.somesite.com/pub/pix/babe.gif +## ^^^^^^^^^^^^^ +## Note the user:password is separated from the hostname by a '@'. +## +## You can use the same kind of thing with HTTP, and if so it will provide +## what's know as Basic Authorization. This is >weak< authorization. It +## also provides >zero< security -- I wouldn't be sending any credit-card +## numbers this way (unless you send them 'round my way :-). It seems to +## be used most by providers of free stuff where they want to make some +## attempt to limit access to "known users". +## +## PROXY STUFF +## +## If you need to go through a gateway to get out to the whole internet, +## you can use a proxy if one's been set up on the gateway. This is done +## by setting the "http_proxy" environmental variable to point to the +## proxy server. Other variables are used for other target protocols.... +## "gopher_proxy", "ftp_proxy", "wais_proxy", etc. +## +## For example, I have the following in my ".login" file (for use with csh): +## +## setenv http_proxy http://local.gateway.machine:8080/ +## +## This is to indicate that any http URL should go to local.gateway.machine +## (port 8080) via HTTP. Additionally, I have +## +## setenv gopher_proxy "$http_proxy" +## setenv wais_proxy "$http_proxy" +## setenv ftp_proxy "$http_proxy" +## +## This means that any gopher, wais, or ftp URL should also go to the +## same place, also via HTTP. This allows webget to get, for example, +## GOPHER URLs even though it doesn't support GOPHER itself. It uses HTTP +## to talk to the proxy, which then uses GOPHER to talk to the destination. +## +## Finally, if there are sites inside your gateway that you would like to +## connect to, you can list them in the "no_proxy" variable. This will allow +## you to connect to them directly and skip going through the proxy: +## +## setenv no_proxy "www.this,www.that,www.other" +## +## I (jfriedl@omron.co.jp) have little personal experience with proxies +## except what I deal with here at Omron, so if this is not representative +## of your situation, please let me know. +## +## RETURN VALUE +## The value returned to the system by webget is rather screwed up because +## I didn't think about dealing with it until things were already +## complicated. Since there can be more than one URL on the command line, +## it's hard to decide what to return when one times out, another is fetched, +## another doesn't need to be fetched, and a fourth isn't found. +## +## So, here's the current status: +## +## Upon any timeout (via the -timeout arg), webget immediately +## returns 3. End of story. Otherwise.... +## +## If any URL was fetched with a date limit (i.e. via +## '-update/-refresh/-IfNewerThan' and was found to not have changed, +## 2 is returned. Otherwise.... +## +## If any URL was successfully fetched, 0 is returned. Otherwise... +## +## If there were any errors, 1 is returned. Otherwise... +## +## Must have been an info-only or do-nothing instance. 0 is returned. +## +## Phew. Hopefully useful to someone. +##< +## + +## Where latest version should be. +$WEB_normal = 'http://www.wg.omron.co.jp/~jfriedl/perl/webget'; +$WEB_inlined = 'http://www.wg.omron.co.jp/~jfriedl/perl/inlined/webget'; + + +require 'network.pl'; ## inline if possible (directive to a tool of mine) +require 'www.pl'; ## inline if possible (directive to a tool of mine) +$inlined=0; ## this might be changed by a the inline thing. + +## +## Exit values. All screwed up. +## +$EXIT_ok = 0; +$EXIT_error = 1; +$EXIT_notmodified = 2; +$EXIT_timeout = 3; + +## +## + +warn qq/WARNING:\n$0: need a newer version of "network.pl"\n/ if + !defined($network'version) || $network'version < "950311.5"; +warn qq/WARNING:\n$0: need a newer version of "www.pl"\n/ if + !defined($www'version) || $www'version < "951114.8"; + +$WEB = $inlined ? $WEB_inlined : $WEB_normal; + +$debug = 0; +$strip = 1; ## default is to strip +$quiet = 0; ## also normally off. +$follow = 1; ## normally, we follow "Found (302)" links +$retry = 3; ## normally, retry proxy hostname lookups up to 3 times. +$nab = 0; ## If true, grab to a local file of the same name. +$refresh = 0; ## If true, use 'If-Modified-Since' with -nab get. +$postfile = 0; ## If true, filename is given after the '?' +$defaultdelta2print = 2048; +$TimeoutSpan = 0; ## seconds after which we should time out. + +while (@ARGV && $ARGV[0] =~ m/^-/) +{ + $arg = shift(@ARGV); + + $nab = 1, next if $arg =~ m/^-f(ile)?$/; + $nab = 1, next if $arg =~ m/^-nab$/; + $nab = 2, next if $arg =~ m/^-nnab$/; + $post = 1, next if $arg =~ m/^-p(ost)?$/i; + $post = $postfile = 1, next if $arg =~ m/^-p(ost)?f(ile)?$/i; + $quiet=1, next if $arg =~ m/^-q(uiet)?$/; + $follow = 0, next if $arg =~ m/^-no?f(ollow)?$/; + $strip = 0, next if $arg =~ m/^-no?s(trip)?$/; + $debug=1, next if $arg =~ m/^-d(ebug)?$/; + $noproxy=1, next if $arg =~ m/^-no?p(roxy)?$/; + $retry=0, next if $arg =~ m/^-no?r(etry)?$/; + $retry=$2, next if $arg =~ m/^-r(etry)?(\d+)$/; + &updateme if $arg eq '-updateme'; + $strip = 0, $head = 1, next if $arg =~ m/^-head(er)?/; + $nab = $refresh = 1, next if $arg =~ m/^-(refresh|update)/; + + &usage($EXIT_ok) if $arg =~ m/^-h(elp)?$/; + &show_version, exit($EXIT_ok) if $arg eq '-version' || $arg eq '-V'; + + if ($arg =~ m/^-t(ime)?o(ut)?$/i) { + local($num) = shift(@ARGV); + &usage($EXIT_error, "expecting timespan argument to $arg\n") unless + $num =~ m/^\d+(\d*)?[hms]?$/; + &timeout_arg($num); + next; + } + + if ($arg =~ m/^-if?n(ewer)?t(han)?$/i) { + $reference_file = shift(@ARGV); + &usage($EXIT_error, "expecting filename arg to $arg") + if !defined $reference_file; + if (!-f $reference_file) { + warn qq/$0: ${arg}'s "$reference_file" not found.\n/; + exit($EXIT_error); + } + next; + } + + if ($arg eq '-r' || $arg eq '-retry') { + local($num) = shift(@ARGV); + &usage($EXIT_error, "expecting numerical arg to $arg\n") unless + defined($num) && $num =~ m/^\d+$/; + $retry = $num; + next; + } + &usage($EXIT_error, qq/$0: unknown option "$arg"\n/); +} + +if ($head && $post) { + warn "$0: combining -head and -post makes no sense, ignoring -post.\n"; + $post = 0; + undef $postfile; +} + +if ($refresh && defined($reference_file)) { + warn "$0: combining -update and -IfNewerThan make no sense, ignoring -IfNewerThan.\n"; + undef $reference_file; +} + +if (@ARGV == 0) { + warn "$0: nothing to do. Use -help for info.\n"; + exit($EXIT_ok); +} + + +## +## Now run through the remaining arguments (mostly URLs) and do a quick +## check to see if they look well-formed. We won't *do* anything -- just +## want to catch quick errors before really starting the work. +## +@tmp = @ARGV; +$errors = 0; +while (@tmp) { + $arg = shift(@tmp); + if ($arg =~ m/^-t(ime)?o(ut)?$/) { + local($num) = shift(@tmp); + if ($num !~ m/^\d+(\d*)?[hms]?$/) { + &warn("expecting timespan argument to $arg\n"); + $errors++; + } + } else { + local($protocol) = &www'grok_URL($arg, $noproxy); + + if (!defined $protocol) { + warn qq/can't grok "$arg"/; + $errors++; + } elsif (!$quiet && ($protocol eq 'ftp')) { + warn qq/warning: -head ignored for ftp URLs\n/ if $head; + warn qq/warning: -refresh ignored for ftp URLs\n/if $refresh; + warn qq/warning: -IfNewerThan ignored for ftp URLs\n/if defined($reference_file); + + } + } +} + +exit($EXIT_error) if $errors; + + +$SuccessfulCount = 0; +$NotModifiedCount = 0; + +## +## Now do the real thing. +## +while (@ARGV) { + $arg = shift(@ARGV); + if ($arg =~ m/^-t(ime)?o(ut)?$/) { + &timeout_arg(shift(@ARGV)); + } else { + &fetch_url($arg); + } +} + +if ($NotModifiedCount) { + exit($EXIT_notmodified); +} elsif ($SuccessfulCount) { + exit($EXIT_ok); +} else { + exit($EXIT_error); +} + +########################################################################### +########################################################################### + +sub timeout_arg +{ + ($TimeoutSpan) = @_; + $TimeoutSpan =~ s/s//; + $TimeoutSpan *= 60 if $TimeoutSpan =~ m/m/; + $TimeoutSpan *= 3600 if $TimeoutSpan =~ m/h/; + +} + +## +## As a byproduct, returns the basename of $0. +## +sub show_version +{ + local($base) = $0; + $base =~ s,.*/,,; + print STDERR "This is $base version $version\n"; + $base; +} + +## +## &usage(exitval, message); +## +## Prints a usage message to STDERR. +## If MESSAGE is defined, prints that first. +## If exitval is defined, exits with that value. Otherwise, returns. +## +sub usage +{ + local($exit, $message) = @_; + + print STDERR $message if defined $message; + local($base) = &show_version; + print STDERR <<INLINE_LITERAL_TEXT; +usage: $0 [options] URL ... + Fetches and displays the named URL(s). Supports http and ftp. + (if no protocol is given, a leading "http://" is normally used). + +Options are from among: + -V, -version Print version information; exit. + -p, -post If URL looks like a form reply, does POST instead of GET. + -pf, -postfile Like -post, but takes everything after ? to be a filename. + -q, -quiet All non-essential informational messages are suppressed. + -nf, -nofollow Don't follow "this document has moved" replies. + -nr, -noretry Doesn't retry a failed hostname lookup (same as -retry 0) + -r #, -retry # Sets failed-hostname-lookup-retry to # (default $retry) + -np, -noproxy Uses no proxy, even if one defined for the protocol. + -ns, -nostrip The HTTP header, normally elided, is printed. + -head gets item header only (implies -ns) + -nab, -file Dumps output to file whose name taken from URL, minus path + -nnab Like -nab, but *also* dumps to stdout. + -update HTTP only. Like -nab, but only if the page has been modified. + -h, -help Prints this message. + -IfNewerThan F HTTP only. Only brings page if it is newer than named file. + -timeout T Fail if a connection can't be made in the specified time. + + -updateme Pull the latest version of $base from + $WEB + and reports if it is newer than your current version. + +Comments to $comments. +INLINE_LITERAL_TEXT + + exit($exit) if defined $exit; +} + +## +## Pull the latest version of this program to a local file. +## Clip the first couple lines from this executing file so that we +## preserve the local invocation style. +## +sub updateme +{ + ## + ## Open a temp file to hold the new version, + ## redirecting STDOUT to it. + ## + open(STDOUT, '>'.($tempFile="/tmp/webget.new")) || + open(STDOUT, '>'.($tempFile="/usr/tmp/webget.new")) || + open(STDOUT, '>'.($tempFile="/webget.new")) || + open(STDOUT, '>'.($tempFile="webget.new")) || + die "$0: can't open a temp file.\n"; + + ## + ## See if we can figure out how we were called. + ## The seek will rewind not to the start of the data, but to the + ## start of the whole program script. + ## + ## Keep the first line if it begins with #!, and the next two if they + ## look like the trick mentioned in the perl man page for getting + ## around the lack of #!-support. + ## + if (seek(DATA, 0, 0)) { ## + $_ = <DATA>; if (m/^#!/) { print STDOUT; + $_ = <DATA>; if (m/^\s*eval/) { print STDOUT; + $_ = <DATA>; if (m/^\s*if/) { print STDOUT; } + } + } + print STDOUT "\n#-\n"; + } + + ## Go get the latest one... + local(@options); + push(@options, 'head') if $head; + push(@options, 'nofollow') unless $follow; + push(@options, ('retry') x $retry) if $retry; + push(@options, 'quiet') if $quiet; + push(@options, 'debug') if $debug; + local($status, $memo, %info) = &www'open_http_url(*IN, $WEB, @options); + die "fetching $WEB:\n $memo\n" unless $status eq 'ok'; + + $size = $info{'content-length'}; + while (<IN>) + { + $size -= length; + print STDOUT; + if (!defined $fetched_version && m/version\s*=\s*"([^"]+)"/) { + $fetched_version = $1; + &general_read(*IN, $size); + last; + } + } + + $fetched_version = "<unknown>" unless defined $fetched_version; + + ## + ## Try to update the mode of the temp file with the mode of this file. + ## Don't worry if it fails. + ## + chmod($mode, $tempFile) if $mode = (stat($0))[2]; + + $as_well = ''; + if ($fetched_version eq $version) + { + print STDERR "You already have the most-recent version ($version).\n", + qq/FWIW, the newly fetched one has been left in "$tempFile".\n/; + } + elsif ($fetched_version <= $version) + { + print STDERR + "Mmm, your current version seems newer (?!):\n", + qq/ your version: "$version"\n/, + qq/ new version: "$fetched_version"\n/, + qq/FWIW, fetched one left in "$tempFile".\n/; + } + else + { + print STDERR + "Indeed, your current version was old:\n", + qq/ your version: "$version"\n/, + qq/ new version: "$fetched_version"\n/, + qq/The file "$tempFile" is ready to replace the old one.\n/; + print STDERR qq/Just do:\n % mv $tempFile $0\n/ if -f $0; + $as_well = ' as well'; + } + print STDERR "Note that the libraries it uses may (or may not) need updating$as_well.\n" + unless $inlined; + exit($EXIT_ok); +} + +## +## Given a list of URLs, fetch'em. +## Parses the URL and calls the routine for the appropriate protocol +## +sub fetch_url +{ + local(@todo) = @_; + local(%circref, %hold_circref); + + URL_LOOP: while (@todo) + { + $URL = shift(@todo); + %hold_circref = %circref; undef %circref; + + local($protocol, @args) = &www'grok_URL($URL, $noproxy); + + if (!defined $protocol) { + &www'message(1, qq/can't grok "$URL"/); + next URL_LOOP; + } + + ## call protocol-specific handler + $func = "fetch_via_" . $protocol; + $error = &$func(@args, $TimeoutSpan); + if (defined $error) { + &www'message(1, "$URL: $error"); + } else { + $SuccessfulCount++; + } + } +} + +sub filedate +{ + local($filename) = @_; + local($filetime) = (stat($filename))[9]; + return 0 if !defined $filetime; + local($sec, $min, $hour, $mday, $mon, $year, $wday) = gmtime($filetime); + return 0 if !defined $wday; + sprintf(qq/"%s, %02d-%s-%02d %02d:%02d:%02d GMT"/, + ("Sunday", "Monday", "Tuesdsy", "Wednesday", + "Thursday", "Friday", "Saturday")[$wday], + $mday, + ("Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[$mon], + $year, + $hour, + $min, + $sec); +} + +sub local_filename +{ + local($filename) = @_; + $filename =~ s,/+$,,; ## remove any trailing slashes + $filename =~ s,.*/,,; ## remove any leading path + if ($filename eq '') { + ## empty -- pick a random name + $filename = "file0000"; + ## look for a free random name. + $filename++ while -f $filename; + } + $filename; +} + +sub set_output_file +{ + local($filename) = @_; + if (!open(OUT, ">$filename")) { + &www'message(1, "$0: can't open [$filename] for output"); + } else { + open(SAVEOUT, ">>&STDOUT") || die "$!";; + open(STDOUT, ">>&OUT"); + } +} + +sub close_output_file +{ + local($filename) = @_; + unless ($quiet) + { + local($note) = qq/"$filename" written/; + if (defined $error) { + $note .= " (possibly corrupt due to error above)"; + } + &www'message(1, "$note."); + } + close(STDOUT); + open(STDOUT, ">&SAVEOUT"); +} + +sub http_alarm +{ + &www'message(1, "ERROR: $AlarmNote."); + exit($EXIT_timeout); ## the alarm doesn't seem to cause a waiting syscall to break? +# $HaveAlarm = 1; +} + +## +## Given the host, port, and path, and (for info only) real target, +## fetch via HTTP. +## +## If there is a user and/or password, use that for Basic Authorization. +## +## If $timeout is nonzero, time out after that many seconds. +## +sub fetch_via_http +{ + local($host, $port, $path, $target, $user, $password, $timeout) = @_; + local(@options); + local($local_filename); + + ## + ## If we're posting, but -postfile was given, we need to interpret + ## the item in $path after '?' as a filename, and replace it with + ## the contents of the file. + ## + if ($postfile && $path =~ s/\?([\d\D]*)//) { + local($filename) = $1; + return("can't open [$filename] to POST") if !open(IN, "<$filename"); + local($/) = ''; ## want to suck up the whole file. + $path .= '?' . <IN>; + close(IN); + } + + $local_filename = &local_filename($path) + if $refresh || $nab || defined($reference_file); + $refresh = &filedate($local_filename) if $refresh; + $refresh = &filedate($reference_file) if defined($reference_file); + + push(@options, 'head') if $head; + push(@options, 'post') if $post; + push(@options, 'nofollow') unless $follow; + push(@options, ('retry') x 3); + push(@options, 'quiet') if $quiet; + push(@options, 'debug') if $debug; + push(@options, "ifmodifiedsince=$refresh") if $refresh; + + if (defined $password || defined $user) { + local($auth) = join(':', ($user || ''), ($password || '')); + push(@options, "authorization=$auth"); + } + + local($old_alarm); + if ($timeout) { + $old_alarm = $SIG{'ALRM'} || 'DEFAULT'; + $SIG{'ALRM'} = "main'http_alarm"; +# $HaveAlarm = 0; + $AlarmNote = "host $host"; + $AlarmNote .= ":$port" if $port != $www'default_port{'http'}; + $AlarmNote .= " timed out after $timeout second"; + $AlarmNote .= 's' if $timeout > 1; + alarm($timeout); + } + local($result, $memo, %info) = + &www'open_http_connection(*HTTP, $host,$port,$path,$target,@options); + + if ($timeout) { + alarm(0); + $SIG{'ALRM'} = $old_alarm; + } + +# if ($HaveAlarm) { +# close(HTTP); +# $error = "timeout after $timeout second"; +# $error .= "s" if $timeout > 1; +# return $error; +# } + + if ($follow && ($result eq 'follow')) { + %circref = %hold_circref; + $circref{$memo} = 1; + unshift(@todo, $memo); + return undef; + } + + + return $memo if $result eq 'error'; + if (!$quiet && $result eq 'status' && ! -t STDOUT) { + #&www'message(1, "Warning: $memo"); + $error = "Warning: $memo"; + } + + if ($info{'CODE'} == 304) { ## 304 is magic for "Not Modified" + close(HTTP); + &www'message(1, "$URL: Not Modified") unless $quiet; + $NotModifiedCount++; + return undef; ## no error + } + + + &set_output_file($local_filename) if $nab; + + unless($strip) { + print $info{'STATUS'}, "\n", $info{'HEADER'}, "\n"; + + print SAVEOUT $info{'STATUS'}, "\n", $info{'HEADER'}, "\n" if $nab==2; + } + + if (defined $info{'BODY'}) { + print $info{'BODY'}; + print SAVEOUT $info{'BODY'} if $nab==2; + } + + if (!$head) { + &general_read(*HTTP, $info{'content-length'}); + } + close(HTTP); + &close_output_file($local_filename) if $nab; + + $error; ## will be 'undef' if no error; +} + +sub fetch_via_ftp +{ + local($host, $port, $path, $target, $user, $password, $timeout) = @_; + local($local_filename) = &local_filename($path); + local($ftp_debug) = $debug; + local(@password) = ($password); + $path =~ s,^/,,; ## remove a leading / from the path. + $path = '.' if $path eq ''; ## make sure we have something + + if (!defined $user) { + $user = 'anonymous'; + $password = $ENV{'USER'} || 'WWWuser'; + @password = ($password.'@'. &network'addr_to_ascii(&network'my_addr), + $password.'@'); + } elsif (!defined $password) { + @password = (""); + } + + local($_last_ftp_reply, $_passive_host, $_passive_port); + local($size); + + sub _ftp_get_reply + { + local($text) = scalar(<FTP_CONTROL>); + die "lost connection to $host\n" if !defined $text; + local($_, $tmp); + print STDERR "READ: $text" if $ftp_debug; + die "internal error: expected reply code in response from ". + "ftp server [$text]" unless $text =~ s/^(\d+)([- ])//; + local($code) = $1; + if ($2 eq '-') { + while (<FTP_CONTROL>) { + ($tmp = $_) =~ s/^\d+[- ]//; + $text .= $tmp; + last if m/^$code /; + } + } + $text =~ s/^\d+ ?/<foo>/g; + ($code, $text); + } + + sub _ftp_expect + { + local($code, $text) = &_ftp_get_reply; + $_last_ftp_reply = $text; + foreach $expect (@_) { + return ($code, $text) if $code == $expect; + } + die "internal error: expected return code ". + join('|',@_).", got [$text]"; + } + + sub _ftp_send + { + print STDERR "SEND: ", @_ if $ftp_debug; + print FTP_CONTROL @_; + } + + sub _ftp_do_passive + { + local(@commands) = @_; + + &_ftp_send("PASV\r\n"); + local($code) = &_ftp_expect(227, 125); + + if ($code == 227) + { + die "internal error: can't grok passive reply [$_last_ftp_reply]" + unless $_last_ftp_reply =~ m/\(([\d,]+)\)/; + local($a,$b,$c,$d, $p1, $p2) = split(/,/, $1); + ($_passive_host, $_passive_port) = + ("$a.$b.$c.$d", $p1*256 + $p2); + } + + foreach(@commands) { + &_ftp_send($_); + } + + local($error)= + &network'connect_to(*PASSIVE, $_passive_host, $_passive_port); + die "internal error: passive ftp connect [$error]" if $error; + } + + ## make the connection to the host + &www'message($debug, "connecting to $host...") unless $quiet; + + local($old_alarm); + if ($timeout) { + $old_alarm = $SIG{'ALRM'} || 'DEFAULT'; + $SIG{'ALRM'} = "main'http_alarm"; ## can use this for now +# $HaveAlarm = 0; + $AlarmNote = "host $host"; + $AlarmNote .= ":$port" if $port != $www'default_port{'ftp'}; + $AlarmNote .= " timed out after $timeout second"; + $AlarmNote .= 's' if $timeout > 1; + alarm($timeout); + } + + local($error) = &network'connect_to(*FTP_CONTROL, $host, $port); + + if ($timeout) { + alarm(0); + $SIG{'ALRM'} = $old_alarm; + } + + return $error if $error; + + local ($code, $text) = &_ftp_get_reply(*FTP_CONTROL); + close(FTP_CONTROL), return "internal ftp error: [$text]" unless $code==220; + + ## log in + &www'message($debug, "logging in as $user...") unless $quiet; + foreach $password (@password) + { + &_ftp_send("USER $user\r\n"); + ($code, $text) = &_ftp_expect(230,331,530); + close(FTP_CONTROL), return $text if ($code == 530); + last if $code == 230; ## hey, already logged in, cool. + + &_ftp_send("PASS $password\r\n"); + ($code, $text) = &_ftp_expect(220,230,530,550,332); + last if $code != 550; + last if $text =~ m/can't change directory/; + } + + if ($code == 550) + { + $text =~ s/\n+$//; + &www'message(1, "Can't log in $host: $text") unless $quiet; + exit($EXIT_error); + } + + if ($code == 332) + { + &_ftp_send("ACCT noaccount\r\n"); + ($code, $text) = &_ftp_expect(230, 202, 530, 500,501,503, 421) + } + close(FTP_CONTROL), return $text if $code >= 300; + + &_ftp_send("TYPE I\r\n"); + &_ftp_expect(200); + + unless ($quiet) { + local($name) = $path; + $name =~ s,.*/([^/]),$1,; + &www'message($debug, "requesting $name..."); + } + ## get file + &_ftp_do_passive("RETR $path\r\n"); + ($code,$text) = &_ftp_expect(125, 150, 550, 530); + close(FTP_CONTROL), return $text if $code == 530; + + if ($code == 550) + { + close(PASSIVE); + if ($text =~ /directory/i) { + ## probably from "no such file or directory", so just return now. + close(FTP_CONTROL); + return $text; + } + + ## do like Mosaic and try getting a directory listing. + &_ftp_send("CWD $path\r\n"); + ($code) = &_ftp_expect(250,550); + if ($code == 550) { + close(FTP_CONTROL); + return $text; + } + &_ftp_do_passive("LIST\r\n"); + &_ftp_expect(125, 150); + } + + $size = $1 if $text =~ m/(\d+)\s+bytes/; + binmode(PASSIVE); ## just in case. + &www'message($debug, "waiting for data...") unless $quiet; + &set_output_file($local_filename) if $nab; + &general_read(*PASSIVE, $size); + &close_output_file($local_filename) if $nab; + + close(PASSIVE); + close(FTP_CONTROL); + undef; +} + +sub general_read +{ + local(*INPUT, $size) = @_; + local($lastcount, $bytes) = (0,0); + local($need_to_clear) = 0; + local($start_time) = time; + local($last_time, $time) = $start_time; + ## Figure out how often to print the "bytes read" message + local($delta2print) = + (defined $size) ? int($size/50) : $defaultdelta2print; + + &www'message(0, "read 0 bytes") unless $quiet; + + ## so $! below is set only if a real error happens from now + eval 'local($^W) = 0; undef $!'; + + + while (defined($_ = <INPUT>)) + { + ## shove it out. + &www'clear_message if $need_to_clear; + print; + print SAVEOUT if $nab==2; + + ## if we know the content-size, keep track of what we're reading. + $bytes += length; + + last if eof || (defined $size && $bytes >= $size); + + if (!$quiet && $bytes > ($lastcount + $delta2print)) + { + if ($time = time, $last_time == $time) { + $delta2print *= 1.5; + } else { + $last_time = $time; + $lastcount = $bytes; + local($time_delta) = $time - $start_time; + local($text); + + $delta2print /= $time_delta; + if (defined $size) { + $text = sprintf("read $bytes bytes (%.0f%%)", + $bytes*100/$size); + } else { + $text = "read $bytes bytes"; + } + + if ($time_delta > 5 || ($time_delta && $bytes > 10240)) + { + local($rate) = int($bytes / $time_delta); + if ($rate < 5000) { + $text .= " ($rate bytes/sec)"; + } elsif ($rate < 1024 * 10) { + $text .= sprintf(" (%.1f k/sec)", $rate/1024); + } else { + $text .= sprintf(" (%.0f k/sec)", $rate/1024); + } + } + &www'message(0, "$text..."); + $need_to_clear = -t STDOUT; + } + } + } + + if (!$quiet) + { + if ($size && ($size != $bytes)) { + &www'message("WARNING: Expected $size bytes, read $bytes bytes.\n"); + } +# if ($!) { +# print STDERR "\$! is [$!]\n"; +# } +# if ($@) { +# print STDERR "\$\@ is [$@]\n"; +# } + } + &www'clear_message($text) unless $quiet; +} + +sub dummy { + 1 || &dummy || &fetch_via_ftp || &fetch_via_http || &http_alarm; + 1 || close(OUT); + 1 || close(SAVEOUT); +} + +__END__ +__END__ +:endofperl diff --git a/win32/bin/www.pl b/win32/bin/www.pl new file mode 100644 index 0000000000..8022597454 --- /dev/null +++ b/win32/bin/www.pl @@ -0,0 +1,901 @@ +## +## Jeffrey Friedl (jfriedl@omron.co.jp) +## Copyri.... ah hell, just take it. +## +## This is "www.pl". +## Include (require) to use, execute ("perl www.pl") to print a man page. +## Requires my 'network.pl' library. +package www; +$version = "951219.9"; + +## +## 951219.9 +## -- oops, stopped sending garbage Authorization line when no +## authorization was requested. +## +## 951114.8 +## -- added support for HEAD, If-Modified-Since +## +## 951017.7 +## -- Change to allow a POST'ed HTTP text to have newlines in it. +## Added 'NewURL to the open_http_connection %info. Idea courtesy +## of Bryan Schmersal (http://www.transarc.com/~bryans/Home.html). +## +## +## 950921.6 +## -- added more robust HTTP error reporting +## (due to steven_campbell@uk.ibm.com) +## +## 950911.5 +## -- added Authorization support +## + +## +## HTTP return status codes. +## +%http_return_code = + (200,"OK", + 201,"Created", + 202,"Accepted", + 203,"Partial Information", + 204,"No Response", + 301,"Moved", + 302,"Found", + 303,"Method", + 304,"Not modified", + 400,"Bad request", + 401,"Unauthorized", + 402,"Payment required", + 403,"Forbidden", + 404,"Not found", + 500,"Internal error", + 501,"Not implemented", + 502,"Service temporarily overloaded", + 503,"Gateway timeout"); + +## +## If executed directly as a program, print as a man page. +## +if (length($0) >= 6 && substr($0, -6) eq 'www.pl') +{ + seek(DATA, 0, 0) || die "$0: can't reset internal pointer.\n"; + print "www.pl version $version\n", '=' x 60, "\n"; + while (<DATA>) { + next unless /^##>/../^##</; ## select lines to print + s/^##[<> ]?//; ## clean up + print; + } + exit(0); +} + +## +## History: +## version 950425.4 +## added require for "network.pl" +## +## version 950425.3 +## re-did from "Www.pl" which was a POS. +## +## +## BLURB: +## A group of routines for dealing with URLs, HTTP sessions, proxies, etc. +## Requires my 'network.pl' package. The library file can be executed +## directly to produce a man page. + +##> +## A motley group of routines for dealing with URLs, HTTP sessions, proxies, +## etc. Requires my 'network.pl' package. +## +## Latest version, as well as other stuff (including network.pl) available +## at http://www.wg.omron.co.jp/~jfriedl/perl/ +## +## Simpleton complete program to dump a URL given on the command-line: +## +## require 'network.pl'; ## required for www.pl +## require 'www.pl'; ## main routines +## $URL = shift; ## get URL +## ($status, $memo) = &www'open_http_url(*IN, $URL); ## connect +## die "$memo\n" if $status ne 'ok'; ## report any error +## print while <IN>; ## dump contents +## +## There are various options available for open_http_url. +## For example, adding 'quiet' to the call, i.e. vvvvvvv-----added +## ($status, $memo) = &www'open_http_url(*IN, $URL, 'quiet'); +## suppresses the normal informational messages such as "waiting for data...". +## +## The options, as well as the various other public routines in the package, +## are discussed below. +## +##< + +## +## Default port for the protocols whose URL we'll at least try to recognize. +## +%default_port = ('http', 80, + 'ftp', 21, + 'gopher', 70, + 'telnet', 23, + 'wais', 210, + ); + +## +## A "URL" to "ftp.blah.com" without a protocol specified is probably +## best reached via ftp. If the hostname begins with a protocol name, it's +## easy. But something like "www." maps to "http", so that mapping is below: +## +%name2protocol = ( + 'www', 'http', + 'wwwcgi','http', +); + +$last_message_length = 0; +$useragent = "www.pl/$version"; + +## +##> +############################################################################## +## routine: open_http_url +## +## Used as +## ($status, $memo, %info) = &www'open_http_url(*FILEHANDLE, $URL, options..) +## +## Given an unused filehandle, a URL, and a list of options, opens a socket +## to the URL and returns with the filehandle ready to read the data of the +## URL. The HTTP header, as well as other information, is returned in %info. +## +## OPTIONS are from among: +## +## "post" +## If PATH appears to be a query (i.e. has a ? in it), contact +## via a POST rather than a GET. +## +## "nofollow" +## Normally, if the initial contact indicates that the URL has moved +## to a different location, the new location is automatically contacted. +## "nofollow" inhibits this. +## +## "noproxy" +## Normally, a proxy will be used if 'http_proxy' is defined in the +## environment. This option inhibits the use of a proxy. +## +## "retry" +## If a host's address can't be found, it may well be because the +## nslookup just didn't return in time and that retrying the lookup +## after a few seconds will succeed. If this option is given, will +## wait five seconds and try again. May be given multiple times to +## retry multiple times. +## +## "quiet" +## Informational messages will be suppressed. +## +## "debug" +## Additional messages will be printed. +## +## "head" +## Requests only the file header to be sent +## +## +## +## +## The return array is ($STATUS, $MEMO, %INFO). +## +## STATUS is 'ok', 'error', 'status', or 'follow' +## +## If 'error', the MEMO will indicate why (URL was not http, can't +## connect, etc.). INFO is probably empty, but may have some data. +## See below. +## +## If 'status', the connnection was made but the reply was not a normal +## "OK" successful reply (i.e. "Not found", etc.). MEMO is a note. +## INFO is filled as noted below. Filehandle is ready to read (unless +## $info{'BODY'} is filled -- see below), but probably most useful +## to treat this as an 'error' response. +## +## If 'follow', MEMO is the new URL (for when 'nofollow' was used to +## turn off automatic following) and INFO is filled as described +## below. Unless you wish to give special treatment to these types of +## responses, you can just treat 'follow' responses like 'ok' +## responses. +## +## If 'ok', the connection went well and the filehandle is ready to +## read. +## +## INFO contains data as described at the read_http_header() function (in +## short, the HTTP response header) and additional informational fields. +## In addition, the following fields are filled in which describe the raw +## connection made or attempted: +## +## PROTOCOL, HOST, PORT, PATH +## +## Note that if a proxy is being used, these will describe the proxy. +## The field TARGET will describe the host or host:port ultimately being +## contacted. When no proxy is being used, this will be the same info as +## in the raw connection fields above. However, if a proxy is being used, +## it will refer to the final target. +## +## In some cases, the additional entry $info{'BODY'} exists as well. If +## the result-code indicates an error, the body of the message may be +## parsed for internal reasons (i.e. to support 'repeat'), and if so, it +## will be saved in $info{'BODY}. +## +## If the URL has moved, $info{'NewURL'} will exist and contain the new +## URL. This will be true even if the 'nofollow' option is specified. +## +##< +## +sub open_http_url +{ + local(*HTTP, $URL, @options) = @_; + return &open_http_connection(*HTTP, $URL, undef, undef, undef, @options); +} + + +## +##> +############################################################################## +## routine: read_http_header +## +## Given a filehandle to a just-opened HTTP socket connection (such as one +## created via &network'connect_to which has had the HTTP request sent), +## reads the HTTP header and and returns the parsed info. +## +## ($replycode, %info) = &read_http_header(*FILEHANDLE); +## +## $replycode will be the HTTP reply code as described below, or +## zero on header-read error. +## +## %info contains two types of fields: +## +## Upper-case fields are informational from the function. +## Lower-case fields are the header field/value pairs. +## +## Upper-case fields: +## +## $info{'STATUS'} will be the first line read (HTTP status line) +## +## $info{'CODE'} will be the numeric HTTP reply code from that line. +## This is also returned as $replycode. +## +## $info{'TYPE'} is the text from the status line that follows CODE. +## +## $info{'HEADER'} will be the raw text of the header (sans status line), +## newlines and all. +## +## $info{'UNKNOWN'}, if defined, will be any header lines not in the +## field/value format used to fill the lower-case fields of %info. +## +## Lower-case fields are reply-dependent, but in general are described +## in http://info.cern.ch/hypertext/WWW/Protocols/HTTP/Object_Headers.html +## +## A header line such as +## Content-type: Text/Plain +## will appear as $info{'content-type'} = 'Text/Plain'; +## +## (*) Note that while the field names are are lower-cased, the field +## values are left as-is. +## +## +## When $replycode is zero, there are two possibilities: +## $info{'TYPE'} is 'empty' +## No response was received from the filehandle before it was closed. +## No other %info fields present. +## $info{'TYPE'} is 'unknown' +## First line of the response doesn't seem to be proper HTTP. +## $info{'STATUS'} holds that line. No other %info fields present. +## +## The $replycode, when not zero, is as described at +## http://info.cern.ch/hypertext/WWW/Protocols/HTTP/HTRESP.html +## +## Some of the codes: +## +## success 2xx +## ok 200 +## created 201 +## accepted 202 +## partial information 203 +## no response 204 +## redirection 3xx +## moved 301 +## found 302 +## method 303 +## not modified 304 +## error 4xx, 5xx +## bad request 400 +## unauthorized 401 +## paymentrequired 402 +## forbidden 403 +## not found 404 +## internal error 500 +## not implemented 501 +## service temporarily overloaded 502 +## gateway timeout 503 +## +##< +## +sub read_http_header +{ + local(*HTTP) = @_; + local(%info, $_); + + ## + ## The first line of the response will be the status (OK, error, etc.) + ## + unless (defined($info{'STATUS'} = <HTTP>)) { + $info{'TYPE'} = "empty"; + return (0, %info); + } + chop $info{'STATUS'}; + + ## + ## Check the status line. If it doesn't match and we don't know the + ## format, we'll just let it pass and hope for the best. + ## + unless ($info{'STATUS'} =~ m/^HTTP\S+\s+(\d\d\d)\s+(.*\S)/i) { + $info{'TYPE'} = 'unknown'; + return (0, %info); + } + + $info{'CODE'} = $1; + $info{'TYPE'} = $2; + $info{'HEADER'} = ''; + + ## read the rest of the header. + while (<HTTP>) { + last if m/^\s*$/; + $info{'HEADER'} .= $_; ## save whole text of header. + + if (m/^([^\n:]+):[ \t]*(.*\S)/) { + local($field, $value) = ("\L$1", $2); + if (defined $info{$field}) { + $info{$field} .= "\n" . $value; + } else { + $info{$field} = $value; + } + } elsif (defined $info{'UNKNOWN'}) { + $info{'UNKNOWN'} .= $_; + } else { + $info{'UNKNOWN'} = $_; + } + } + + return ($info{'CODE'}, %info); +} + +## +##> +## +############################################################################## +## routine: grok_URL(URL, noproxy, defaultprotocol) +## +## Given a URL, returns access information. Deals with +## http, wais, gopher, ftp, and telnet +## URLs. +## +## Information returned is +## (PROTOCOL, HOST, PORT, PATH, TARGET, USER, PASSWORD) +## +## If noproxy is not given (or false) and there is a proxy defined +## for the given protocol (via the "*_proxy" environmental variable), +## the returned access information will be for the proxy and will +## reference the given URL. In this case, 'TARGET' will be the +## HOST:PORT of the original URL (PORT elided if it's the default port). +## +## Access information returned: +## PROTOCOL: "http", "ftp", etc. (guaranteed to be lowercase). +## HOST: hostname or address as given. +## PORT: port to access +## PATH: path of resource on HOST:PORT. +## TARGET: (see above) +## USER and PASSWORD: for 'ftp' and 'telnet' URLs, if supplied by the +## URL these will be defined, undefined otherwise. +## +## If no protocol is defined via the URL, the defaultprotocol will be used +## if given. Otherwise, the URL's address will be checked for a leading +## protocol name (as with a leading "www.") and if found will be used. +## Otherwise, the protocol defaults to http. +## +## Fills in the appropriate default port for the protocol if need be. +## +## A proxy is defined by a per-protocol environmental variable such +## as http_proxy. For example, you might have +## setenv http_proxy http://firewall:8080/ +## setenv ftp_proxy $http_proxy +## to set it up. +## +## A URL seems to be officially described at +## http://www.w3.org/hypertext/WWW/Addressing/URL/5_BNF.html +## although that document is a joke of errors. +## +##< +## +sub grok_URL +{ + local($_, $noproxy, $defaultprotocol) = @_; + $noproxy = defined($noproxy) && $noproxy; + + ## Items to be filled in and returned. + local($protocol, $address, $port, $path, $target, $user, $password); + + return undef unless m%^(([a-zA-Z]+)://|/*)([^/]+)(/.*)?$%; + + ## + ## Due to a bug in some versions of perl5, $2 might not be empty + ## even if $1 is. Therefore, we must check $1 for a : to see if the + ## protocol stuff matched or not. If not, the protocol is undefined. + ## + ($protocol, $address, $path) = ((index($1,":") >= 0 ? $2 : undef), $3, $4); + + if (!defined $protocol) + { + ## + ## Choose a default protocol if none given. If address begins with + ## a protocol name (one that we know via %name2protocol or + ## %default_port), choose it. Otherwise, choose http. + ## + if (defined $defaultprotocol) { + $protocol = $defaultprotocol; + } + else + { + $address =~ m/^[a-zA-Z]+/; + if (defined($name2protocol{"\L$&"})) { + $protocol = $name2protocol{"\L$&"}; + } else { + $protocol = defined($default_port{"\L$&"}) ? $& : 'http'; + } + } + } + $protocol =~ tr/A-Z/a-z/; ## ensure lower-case. + + ## + ## Http support here probably not kosher, but fits in nice for basic + ## authorization. + ## + if ($protocol eq 'ftp' || $protocol eq 'telnet' || $protocol eq 'http') + { + ## Glean a username and password from address, if there. + ## There if address starts with USER[:PASSWORD]@ + if ($address =~ s/^(([^\@:]+)(:([^@]+))?\@)//) { + ($user, $password) = ($2, $4); + } + } + + ## + ## address left is (HOSTNAME|HOSTNUM)[:PORTNUM] + ## + if ($address =~ s/:(\d+)$//) { + $port = $1; + } else { + $port = $default_port{$protocol}; + } + + ## default path is '/'; + $path = '/' if !defined $path; + + ## + ## If there's a proxy and we're to proxy this request, do so. + ## + local($proxy) = $ENV{$protocol."_proxy"}; + if (!$noproxy && defined($proxy) && !&no_proxy($protocol,$address)) + { + local($dummy); + local($old_pass, $old_user); + + ## + ## Since we're going through a proxy, we want to send the + ## proxy the entire URL that we want. However, when we're + ## doing Authenticated HTTP, we need to take out the user:password + ## that webget has encoded in the URL (this is a bit sleazy on + ## the part of webget, but the alternative is to have flags, and + ## having them part of the URL like with FTP, etc., seems a bit + ## cleaner to me in the context of how webget is used). + ## + ## So, if we're doing this slezy thing, we need to construct + ## the new URL from the compnents we have now (leaving out password + ## and user), decode the proxy URL, then return the info for + ## that host, a "filename" of the entire URL we really want, and + ## the user/password from the original URL. + ## + ## For all other things, we can just take the original URL, + ## ensure it has a protocol on it, and pass it as the "filename" + ## we want to the proxy host. The difference between reconstructing + ## the URL (as for HTTP Authentication) and just ensuring the + ## protocol is there is, except for the user/password stuff, + ## nothing. In theory, at least. + ## + if ($protocol eq 'http' && (defined($password) || defined($user))) + { + $path = "http://$address$path"; + $old_pass = $password; + $old_user = $user; + } else { + ## Re-get original URL and ensure protocol// actually there. + ## This will become our new path. + ($path = $_) =~ s,^($protocol:)?/*,$protocol://,i; + } + + ## note what the target will be + $target = ($port==$default_port{$protocol})?$address:"$address:$port"; + + ## get proxy info, discarding + ($protocol, $address, $port, $dummy, $dummy, $user, $password) + = &grok_URL($proxy, 1); + $password = $old_pass if defined $old_pass; + $user = $old_user if defined $old_user; + } + ($protocol, $address, $port, $path, $target, $user, $password); +} + + + +## +## &no_proxy($protocol, $host) +## +## Returns true if the specified host is identified in the no_proxy +## environmental variable, or identify the proxy server itself. +## +sub no_proxy +{ + local($protocol, $targethost) = @_; + local(@dests, $dest, $host, @hosts, $aliases); + local($proxy) = $ENV{$protocol."_proxy"}; + return 0 if !defined $proxy; + $targethost =~ tr/A-Z/a-z/; ## ensure all lowercase; + + @dests = ($proxy); + push(@dests,split(/\s*,\s*/,$ENV{'no_proxy'})) if defined $ENV{'no_proxy'}; + + foreach $dest (@dests) + { + ## just get the hostname + $host = (&grok_URL($dest, 1), 'http')[1]; + + if (!defined $host) { + warn "can't grok [$dest] from no_proxy env.var.\n"; + next; + } + @hosts = ($host); ## throw in original name just to make sure + ($host, $aliases) = (gethostbyname($host))[0, 1]; + + if (defined $aliases) { + push(@hosts, ($host, split(/\s+/, $aliases))); + } else { + push(@hosts, $host); + } + foreach $host (@hosts) { + next if !defined $host; + return 1 if "\L$host" eq $targethost; + } + } + return 0; +} + +sub ensure_proper_network_library +{ + require 'network.pl' if !defined $network'version; + warn "WARNING:\n". __FILE__ . + qq/ needs a newer version of "network.pl"\n/ if + !defined($network'version) || $network'version < "950311.5"; +} + + + +## +##> +############################################################################## +## open_http_connection(*FILEHANDLE, HOST, PORT, PATH, TARGET, OPTIONS...) +## +## Opens an HTTP connection to HOST:PORT and requests PATH. +## TARGET is used only for informational messages to the user. +## +## If PORT and PATH are undefined, HOST is taken as an http URL and TARGET +## is filled in as needed. +## +## Otherwise, it's the same as open_http_url (including return value, etc.). +##< +## +sub open_http_connection +{ + local(*HTTP, $host, $port, $path, $target, @options) = @_; + local($post_text, @error, %seen); + local(%info); + + &ensure_proper_network_library; + + ## options allowed: + local($post, $retry, $authorization, $nofollow, $noproxy, + $head, $debug, $ifmodifiedsince, $quiet, ) = (0) x 10; + ## parse options: + foreach $opt (@options) + { + next unless defined($opt) && $opt ne ''; + local($var, $val); + if ($opt =~ m/^(\w+)=(.*)/) { + ($var, $val) = ($1, $2); + } else { + $var = $opt; + $val = 1; + } + $var =~ tr/A-Z/a-z/; ## ensure variable is lowercase. + local(@error); + + eval "if (defined \$$var) { \$$var = \$val; } else { \@error = + ('error', 'bad open_http_connection option [$opt]'); }"; + return ('error', "open_http_connection eval: $@") if $@; + return @error if defined @error; + } + $quiet = 0 if $debug; ## debug overrides quiet + + local($protocol, $error, $code, $URL, %info, $tmp, $aite); + + ## + ## if both PORT and PATH are undefined, treat HOST as a URL. + ## + unless (defined($port) && defined($path)) + { + ($protocol,$host,$port,$path,$target)=&grok_URL($host,$noproxy,'http'); + if ($protocol ne "http") { + return ('error',"open_http_connection doesn't grok [$protocol]"); + } + unless (defined($host)) { + return ('error', "can't grok [$URL]"); + } + } + + return ('error', "no port in URL [$URL]") unless defined $port; + return ('error', "no path in URL [$URL]") unless defined $path; + + RETRY: while(1) + { + ## we'll want $URL around for error messages and such. + if ($port == $default_port{'http'}) { + $URL = "http://$host"; + } else { + $URL = "http://$host:$default_port{'http'}"; + } + $URL .= ord($path) eq ord('/') ? $path : "/$path"; + + $aite = defined($target) ? "$target via $host" : $host; + + &message($debug, "connecting to $aite ...") unless $quiet; + + ## + ## note some info that might be of use to the caller. + ## + local(%preinfo) = ( + 'PROTOCOL', 'http', + 'HOST', $host, + 'PORT', $port, + 'PATH', $path, + ); + if (defined $target) { + $preinfo{'TARGET'} = $target; + } elsif ($default_port{'http'} == $port) { + $preinfo{'TARGET'} = $host; + } else { + $preinfo{'TARGET'} = "$host:$port"; + } + + ## connect to the site + $error = &network'connect_to(*HTTP, $host, $port); + if (defined $error) { + return('error', "can't connect to $aite: $error", %preinfo); + } + + ## If we're asked to POST and it looks like a POST, note post text. + if ($post && $path =~ m/\?/) { + $post_text = $'; ## everything after the '?' + $path = $`; ## everything before the '?' + } + + ## send the POST or GET request + $tmp = $head ? 'HEAD' : (defined $post_text ? 'POST' : 'GET'); + + &message($debug, "sending request to $aite ...") if !$quiet; + print HTTP $tmp, " $path HTTP/1.0\n"; + + ## send the If-Modified-Since field if needed. + if ($ifmodifiedsince) { + print HTTP "If-Modified-Since: $ifmodifiedsince\n"; + } + + ## oh, let's sputter a few platitudes..... + print HTTP "Accept: */*\n"; + print HTTP "User-Agent: $useragent\n" if defined $useragent; + + ## If doing Authorization, do so now. + if ($authorization) { + print HTTP "Authorization: Basic ", + &htuu_encode($authorization), "\n"; + } + + ## If it's a post, send it. + if (defined $post_text) + { + print HTTP "Content-type: application/x-www-form-urlencoded\n"; + print HTTP "Content-length: ", length $post_text, "\n\n"; + print HTTP $post_text, "\n"; + } + print HTTP "\n"; + &message($debug, "waiting for data from $aite ...") unless $quiet; + + ## we can now read the response (header, then body) via HTTP. + binmode(HTTP); ## just in case. + + ($code, %info) = &read_http_header(*HTTP); + &message(1, "header returns code $code ($info{'TYPE'})") if $debug; + + ## fill in info from %preinfo + local($val, $key); + while (($val, $key) = each %preinfo) { + $info{$val} = $key; + } + + if ($code == 0) + { + return('error',"empty response for $URL") + if $info{'TYPE'} eq 'empty'; + return('error', "non-HTTP response for $URL", %info) + if $info{'TYPE'} eq 'unknown'; + return('error', "unknown zero-code for $URL", %info); + } + + if ($code == 302) ## 302 is magic for "Found" + { + if (!defined $info{'location'}) { + return('error', "No location info for Found URL $URL", %info); + } + local($newURL) = $info{'location'}; + + ## Remove :80 from hostname, if there. Looks ugly. + $newURL =~ s,^(http:/+[^/:]+):80/,$1/,i; + $info{"NewURL"} = $newURL; + + ## if we're not following links or if it's not to HTTP, return. + return('follow', $newURL, %info) if + $nofollow || $newURL!~m/^http:/i; + + ## note that we've seen this current URL. + $seen{$host, $port, $path} = 1; + + &message(1, qq/[note: now moved to "$newURL"]/) unless $quiet; + + + ## get the new one and return an error if it's been seen. + ($protocol, $host, $port, $path, $target) = + &www'grok_URL($newURL, $noproxy); + &message(1, "[$protocol][$host][$port][$path]") if $debug; + + if (defined $seen{$host, $port, $path}) + { + return('error', "circular reference among:\n ". + join("\n ", sort grep(/^http/i, keys %seen)), %seen); + } + next RETRY; + } + elsif ($code == 500) ## 500 is magic for "internal error" + { + ## + ## A proxy will often return this with text saying "can't find + ## host" when in reality it's just because the nslookup returned + ## null at the time. Such a thing should be retied again after a + ## few seconds. + ## + if ($retry) + { + local($_) = $info{'BODY'} = join('', <HTTP>); + if (/Can't locate remote host:\s*(\S+)/i) { + local($times) = ($retry == 1) ? + "once more" : "up to $retry more times"; + &message(0, "can't locate $1, will try $times ...") + unless $quiet; + sleep(5); + $retry--; + next RETRY; + } + } + } + + if ($code != 200) ## 200 is magic for "OK"; + { + ## I'll deal with these as I see them..... + &clear_message; + if ($info{'TYPE'} eq '') + { + if (defined $http_return_code{$code}) { + $info{'TYPE'} = $http_return_code{$code}; + } else { + $info{'TYPE'} = "(unknown status code $code)"; + } + } + return ('status', $info{'TYPE'}, %info); + } + + &clear_message; + return ('ok', 'ok', %info); + } +} + + +## +## Hyper Text UUencode. Somewhat different from regular uuencode. +## +## Logic taken from Mosaic for X code by Mark Riordan and Ari Luotonen. +## +sub htuu_encode +{ + local(@in) = unpack("C*", $_[0]); + local(@out); + + push(@in, 0, 0); ## in case we need to round off an odd byte or two + while (@in >= 3) { + ## + ## From the next three input bytes, + ## construct four encoded output bytes. + ## + push(@out, $in[0] >> 2); + push(@out, (($in[0] << 4) & 060) | (($in[1] >> 4) & 017)); + push(@out, (($in[1] << 2) & 074) | (($in[2] >> 6) & 003)); + push(@out, $in[2] & 077); + splice(@in, 0, 3); ## remove these three + } + + ## + ## @out elements are now indices to the string below. Convert to + ## the appropriate actual text. + ## + foreach $new (@out) { + $new = substr( + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/", + $new, 1); + } + + if (@in == 2) { + ## the two left over are the two extra nulls, so we encoded the proper + ## amount as-is. + } elsif (@in == 1) { + ## We encoded one extra null too many. Undo it. + $out[$#out] = '='; + } else { + ## We must have encoded two nulls... Undo both. + $out[$#out ] = '='; + $out[$#out -1] = '='; + } + + join('', @out); +} + +## +## This message stuff really shouldn't be here, but in some seperate library. +## Sorry. +## +## Called as &message(SAVE, TEXT ....), it shoves the text to the screen. +## If SAVE is true, bumps the text out as a printed line. Otherwise, +## will shove out without a newline so that the next message overwrites it, +## or it is clearded via &clear_message(). +## +sub message +{ + local($nl) = shift; + die "oops $nl." unless $nl =~ m/^\d+$/; + local($text) = join('', @_); + local($NL) = $nl ? "\n" : "\r"; + $thislength = length($text); + if ($thislength >= $last_message_length) { + print STDERR $text, $NL; + } else { + print STDERR $text, ' 'x ($last_message_length-$thislength), $NL; + } + $last_message_length = $nl ? 0 : $thislength; +} + +sub clear_message +{ + if ($last_message_length) { + print STDERR ' ' x $last_message_length, "\r"; + $last_message_length = 0; + } +} + +1; +__END__ |