#!/usr/bin/perl -w # # wwwoffle-ls2 # Alternative implementation of wwwoffle-ls with wildcard matching. # # Author: Marc Boucher # Copyright: # Copyright (C) 2003 Marc Boucher # This program can be redistributed and/or modified under the terms # of the GNU General Public License; version 2, or any later version. # See . # # Revision: # # v1.0 sometime... # # # v0.5 # 2003-06-20: Marc Code modified to work with perl's warnings ON (-w) # Still having problems with "use strict;" # 2003-06-07: Marc Remark: I've noticed that wwwoffle-rm doesn't change the date of the directory. # wwwoffle-ls2 will not see that something has changed, and will continue to use the old listing. # Patch wwwoffle-tools (-mv) too? # 2003-06-07: Marc now reading Ufiles based uppon 'ls D*' (previously 'ls U*') # 2003-06-06: Marc Added Fast mode and case-sensitive search. # 2003-06-06: Marc Some help text corrections, and wrapped to fit in 80 cols. # # v0.4 # 2003-05-13: Marc Remark: Only one pass decoding. Problem if a %XX has been re-encoded (through a redirect). # Do we need 2 pass? # 2003-05-13: Marc Handling of several options: config file and decoding. # 2003-05-11: Marc Some minor modifications. # 2003-05-10: Marc Remark: when WWWOFFLE purges the cache it removes all listing files. # A patch is available to prevent this. # v0.3 # 2003-05-09: Marc Remark: Only root can create wwwoffle_ls files. # 2003-05-09: Marc Added: restoring mdate of host-dir after creating a wwwoffle_ls file. # 2003-05-08: Marc Problems with sh expanding '*' in parameters before calling a program. # Error message: "bash: /bin/ls: Argument list too long" # -> rewriting everything in perl. # v0.2 # 2003-05-07: Marc Reading all U* files is too slow. Implementing cached listings. # Using sed because it's the only way to get a CR after each URL. # Call to sh to test date of "listing" file. # v0.1 # 2003-05-07: Marc Initial code. The script is only a wrapper calling ls awk & grep. # Using perl for RE usability. # # ######################################################################## #use strict; my $Spool = '/var/spool/wwwoffle/'; my ($progname) = $0 =~ /^.*?([^\/]+)$/; my $version = 'v0.4.3'; my $url=''; my $prot; my $host; my $path; my $usage="Usage: $progname [OPTIONS] :/// : is either 'ftp', 'http', 'finger' or '*' : wildcard matching hostname (*google* , *.freshmeat.net) : wildcard matching path (image/*.gif) Options: -c Use Spool directory from option in config . -d Decode URL (%20 = space). Codes lower than %20 are not decoded. -D Decode before matching the expression. -f Fast: don't update outdated 'listing' files. Missing ones are created. -F Force: 'listing' files for the matched hosts are erased. -i Case-insensitive search (default behavior). -I Case-sensitive search (for the path). -h Here we are... -V Print version number. Remarks: - '*' '.' '?' are escaped for the RE: '.*' '\\.' '\\?'. If you want to use the real character you can escape it with '\\' if the URL is enclosed between \"\" (or ''), or with '\\\\' (double) if the URL is not enclosed. The other metacharacters are passed unchanged to the RE : +(){}[]\$ ... - '?' is only escaped in . - doesn't need a trailing '*' to match. If is empty $progname assumes that you want '/'. - If there is no and no '/' after , $progname will display a list of matching hostnames (eg: http:/*ads* ). IMPORTANT: 'listing' files are not created in this particular situation. - If you need the '|' metacharacter, you must enclose the URL in \" (or '). - Matching is case insensitive by default.\n\n"; # Getopts function based on getopts.pl. # Modified to allow unordered options mixed with other args (eg. url) # Usage: Getopts('a:bc'); # -a takes arg. -b & -c not. Sets $opt_* as a side effect. sub Getopts { my ($argumentative) = @_; #local(@args,$_,$first,$rest); my (@args,$first,$rest); my $errs = 0; @args = split( / */, $argumentative ); while(@ARGV) { #if (($_ = $ARGV[0]) =~ /^-(.)(.*)/) { if ( $ARGV[0] =~ /^-(.)(.*)/) { ($first,$rest) = ($1,$2); my $pos = index($argumentative,$first); if ($pos >= 0) { if ($pos < $#args && $args[$pos+1] eq ':') { shift(@ARGV); if ($rest eq '') { ++$errs unless @ARGV; $rest = shift(@ARGV); } ${"opt_$first"} = $rest; } else { ${"opt_$first"} = 1; if ($rest eq '') { shift(@ARGV); } else { $ARGV[0] = "-$rest"; } } } else { ++$errs; return $errs; # If we don't understand an option stop here. } } else { if ($url eq '') { $url=$ARGV[0]; } # Copy the shifted arg (not an option) to $url (if empty); shift(@ARGV); #if ($url eq '') { $url=$_; } # Copy the shifted arg (not an option) to $url (if empty); } } #$errs == 0; return $errs; } if (@ARGV==0) {@ARGV=('-h');} # If no args then display help screen local ($opt_h,$opt_d,$opt_D,$opt_f,$opt_F,$opt_i,$opt_I,$opt_V,$opt_c)=(0,0,0,0,0,0,0,0,0); if (Getopts('hdDfFiIVc:') != 0) { die("Check your options... (-h for help)\n"); } if ($opt_V) { print "$progname $version\n"; exit; } if ($opt_h) { print $usage; exit; } my $fast=0; if ($opt_f) { $fast=1; } my $force=0; if ($opt_F) { $force=1; } if ($opt_i) { # nothing. Just for compatibility. } my $case=0; if ($opt_I) { $case=1; } if ($opt_c) { my ($newspool) = `grep -E -i "^ *spool-dir" $opt_c` =~ /^\s*spool\-dir\s+\=\s*([^\s]*)/i; #print "DEBUG: reading from file ($opt_c): $newspool\n"; if ($newspool =~ /[^\/]$/) { $newspool .= '/'; } if (-d $newspool) { $Spool=$newspool; #print "DEBUG: using spool directory: $Spool\n"; } } #print "DEBUG: args: $url\n"; $url =~ s/\\/\255/g; # using char 255 to preserve \ #print "DEBUG: args: $url\n"; my $hostonly=0; if ($url =~ /^(http|ftp|finger|\*)\:\/\/([^\/]+)(?:(\/)(.*))?$/i) { $prot=$1; $host=lc($2); # ensures that $host is lower case $path=$4; if (!$3) {$hostonly=1;} # default value is 0 #print "DEBUG: protocol: $prot\n"; #print "DEBUG: hostname: $host\n"; #print "DEBUG: path: $path\n"; if ($prot eq '*') {$prot='(http|ftp|finger)';} $host =~ s/(?0 ? # mDate of wwwoffle_ls is newer than directory's mDate? if ( !(-f "$Spool$protdir/$hostdir/wwwoffle_ls") || (-z "$Spool$protdir/$hostdir/wwwoffle_ls") || ( (-M "$Spool$protdir/$hostdir/wwwoffle_ls" > -M "$Spool$protdir/$hostdir/") && !$fast ) || $force ) { # Reading directory status (atime and mtime) my (undef,undef,undef,undef,$uid,$gid,undef,undef, $atime,$mtime,undef,undef,undef) = stat("$Spool$protdir/$hostdir/"); opendir DIRpath, "$Spool$protdir/$hostdir" || die "Can't open dir: $Spool$protdir/$hostdir"; open LS,">$Spool$protdir/$hostdir/wwwoffle_ls.new"; # Using a alternate name. Halting the process won't leave an incomplete file. # Restoring directory atime and mtime # In case the user stops the process while writing the file utime $atime, $mtime, "$Spool$protdir/$hostdir/"; my $URL; while (my $pathdir = readdir DIRpath) { #if ($pathdir =~ /^U.*/) { if ($pathdir =~ /^D(.*)/) { my $Ufile = "U$1"; if (open Ufile,"$Spool$protdir/$hostdir/$Ufile") { if (defined($URL = )) { print LS "$URL\n"; } close Ufile; } } } close LS; close DIRpath; # renaming the file rename "$Spool$protdir/$hostdir/wwwoffle_ls.new", "$Spool$protdir/$hostdir/wwwoffle_ls"; # Setting the uid and gid of the spool directory chmod 0644 ,"$Spool$protdir/$hostdir/wwwoffle_ls"; chown $uid, $gid, "$Spool$protdir/$hostdir/wwwoffle_ls"; # Restoring directory atime and mtime utime $atime, $mtime, "$Spool$protdir/$hostdir/"; } # At this point wwwoffle_ls should exist. if (-f "$Spool$protdir/$hostdir/wwwoffle_ls") { open LS,"$Spool$protdir/$hostdir/wwwoffle_ls"; while (my $ls = ) { if ($opt_D) { $ls =~ s/%([2-9A-F][0-9A-F])/chr(hex($1))/gei; } if ( ( !$case && ($ls =~ /^$prot\:\/\/$host\/$path/i)) || ( $case && ($ls =~ /^$prot\:\/\/$host\/$path/)) ) { if ($opt_d) { $ls =~ s/%([2-9A-F][0-9A-F])/chr(hex($1))/gei; } print $ls; } } close LS; } else { print "Can't find $Spool$protdir/$hostdir/wwwoffle_ls (write permission denied ?)\n";} } } } close DIRhost; } } closedir DIRspool; # Version 0.2 Can't make it work. # # $host2 = $host; # $host2 =~ s/\*/[^\/]*/g; # #system (" # ls -f -d -1 $Spool$prot/$host/ | # awk '{if (system(\"if [ \"\$1\"wwwoffle_ls -nt \$1/ ] && [ -s \"\$1\"wwwoffle_ls ]; then exit 1; fi\") == 0) { # system(\"ls -1 \"\$1\" \"); # } # print(\"cat \"\$1\"wwwoffle_ls\");print \"\"; # }' # | # grep -E -i \"^[^:]+://$host2/$path\" # "); # Version 0.2 # #system (" # ls -f -d -1 $Spool$prot/$host/ | # awk '{if (system(\"if [ \"\$1\"wwwoffle_ls -nt \$1/ ] && [ -s \"\$1\"wwwoffle_ls ]; then exit 1; fi\") == 0) { # system (\"sed -e \\\"s/^1/1/\\\" \"\$1\"U\* > \"\$1\"wwwoffle_ls \"); # } # system(\"cat \"\$1\"wwwoffle_ls\");print \"\"; # }' | # grep -E -i \"^[^:]+://$host2/$path\" # "); # Version 0.1 # #system (" #ls -f -d -1 $Spool$prot/$host/ | # awk '{system(\"ls -f -1 \"\$1\"\/U\*\")}' | # awk '{system(\"cat \"\$1);print \"\"}' | # grep -E -i \"^[^:]+://$host2/$path\" ");