#!/usr/bin/ruby
# eregister
# usage: read program and adjust global constants to your liking.
# can be safely interrupted by SIGINT.
# This script will index a wwwoffle cache and handle incremental
# updates examining only changed directories and files.
#
# (C) Richard Zidlicky, GPL
#
# Content type is determined by looking at cache file headers, this
# is more exact and faster (avoids some disk access) than guessing
# from URL
# This method also allows excluding http redirects or http/wwwoffle
# errors from index.
#
# Tested on ruby 1.6.8 which has issues with threading, thus some stuff
# commented out.
#
# TODO:
# - remove from index documents that vanished from cache
# - ensure all estcmd errors are detected and acted upon
# - write commandline parsing etc
# - easier configurable mime type handling
# - maybe there are more HTTP response types that could
#   be ignored etc

# casket + xdata go into EDIR, by default /var/spool/wwwoffle/hyperestraier
# The EDIR/<casket name>-xdata dir (DDIR) is used to store timestamps of
# domain dirs and data files.
# mime-type is additionally stored for data files, atm this is not needed
# for this program but might be used for other tools and will be useful
# for removing stale documents

## which domains are to be indexed (shell glob(s), relative to SPD/http)
#WHITELIST="*linux* *kernel* ajp*.org *journals* *endotext* *physiology* *.aps:org *psychiatry* *annals* *oxfordjourn* *.ama-assn.org *.asm.org *jnls.org *publications.org *reviews.org *billstclair* *wiki* *heise* *sourceforge* *q40* *q60* *.gov *.edu *ruby* *ocaml* *gnu.org *eff.org *bbc* *nature* *zmed.org *medicine* *squeak* *mozilla* *firefox* *freshmeat* *mozdev.org"
WHITELIST="*"

# ignore/recompute time stamps if set to true
$force=nil

$Casketname="casket-wwwoffle"
SPD="/var/spool/wwwoffle"
EDIR=File.join(SPD,"hyperestraier")
$Csfullname=File.join(EDIR,$Casketname)
DDIR=File.join(EDIR,$Casketname+"-xdata")

# estcmd reads "path<TAB>uri" records from stdin (the trailing "-").
ECMD="estcmd gather -sd -cl -cm -fm -cs 25 -px @uri #{$Csfullname} -"

### end of user serviceable part ###

Dir.mkdir(EDIR) unless FileTest.directory?(EDIR)
Dir.mkdir(DDIR) unless FileTest.directory?(DDIR)

# fork/exec/background *cmd, connect pipes for communication.
#
# The command is exec'ed from a grandchild (double fork + setsid) with HUP and
# ABRT ignored. Before exec the grandchild writes its own pid as the first
# line of its stdout; that line is consumed here and never reaches the caller.
#
# Returns [fdin, fdout, fderr, childpid]:
#   fdin     - write end connected to the command's stdin (sync enabled)
#   fdout    - read end of the command's stdout
#   fderr    - read end of the command's stderr
#   childpid - pid reported by the grandchild
# When a block is given, the same four values are yielded, the block's value
# is returned and the three pipe ends are closed afterwards.
def popen4(*cmd)
  pw = IO.pipe   # parent -> command stdin; [0] is for read, [1] for write
  pr = IO.pipe   # command stdout -> parent
  pe = IO.pipe   # command stderr -> parent

  middle = fork {
    fork {
      # grandchild
      trap("HUP","SIG_IGN")
      trap("ABRT","SIG_IGN")
      Process.setsid
      #print "forked process, ppid=#{Process.ppid}\n"
      #print " pid=#{Process.pid}\n"
      pw[1].close
      STDIN.reopen(pw[0])
      pw[0].close

      pr[0].close
      STDOUT.reopen(pr[1])
      pr[1].close

      pe[0].close
      STDERR.reopen(pe[1])
      pe[1].close

      STDOUT.print "#{Process.pid}\n"
      # Push the pid line out explicitly so it cannot be left behind in a
      # buffer when exec replaces this process.
      STDOUT.flush
      exec(*cmd)
    }
    exit!
  }

  pw[0].close
  pr[1].close
  pe[1].close

  # The intermediate child exits right after forking the grandchild; reap it
  # so it does not stay around as a zombie while this script runs.
  Process.waitpid(middle)

  pid = pr[0].gets.to_i
  #print "pid=#{pid}\n"
  pw[1].sync = true

  ios = [pw[1], pr[0], pe[0]]
  result = ios + [pid]
  return result unless block_given?

  begin
    yield(*result)
  ensure
    # Only the pipe ends are closed here; the pid is an Integer and has no
    # closed?/close methods.
    ios.each { |io| io.close unless io.closed? }
  end
end

# pid of the running estcmd (set once it has been started)
$pid=0
# set to true by SIGINT; checked by the main loop between domains
$finish=nil
trap("INT"){
  #Process.kill "INT",$pid
  $finish=true
}

# Matching Shell Globs as Regular Expressions
## UNUSED
#
# Translate the shell glob +globstr+ into an anchored regexp source string:
# '*' becomes '.*', '?' becomes '.', '[' and ']' are kept, everything else is
# regexp-escaped. The argument itself is left unmodified.
def glob2pat(globstr)
  patmap = {
    '*' => '.*',
    '?' => '.',
    '[' => '[',
    ']' => ']'
  }
  pattern = globstr.gsub(/./) { |c| patmap[c] || Regexp.escape(c) }
  '^' + pattern + '$'
end

# find out http errors, http content type
# look at headers for filetype, do not look into body (no meta etc..)
# Classify the cache file +file+ from its status line and headers only.
# Returns one of:
#   "redirect" - status line is a 30x "Moved Permanently"
#   "error"    - status line contains 404
#   "werror"   - status line is a 50x generated by WWWOFFLE
#   "<type>/<subtype>" - word characters of the first Content-Type header
#   ""         - the headers ended (blank line) or the file ended without one
def get_type(file)
  #print "get_type #{file}\n"
  File.open(file) { |fl|
    status = fl.gets
    case status
    when /^HTTP\/\d.\d.*30\d.*Moved Permanently/i then return "redirect"
    when /^HTTP\/\d.\d.*404/i then return "error"
    when /^HTTP\/\d.\d 50\d WWWOFFLE/i then return "werror"
    end
    fl.each { |line|
      #print "get_type: #{line}\n"
      case line
      when /^Content-Type: (\w+\/\w+)/i
        return $1
      # End of headers. The optional \r is needed because a blank line that
      # is terminated by CRLF is not matched by /^$/.
      when /^\r?$/ then return ""
      end
    }
  }
  ""
end

# return true if file should not be indexed
# +type+ is a string as returned by get_type. text/html and text/plain are
# always indexed; audio, image, video, redirect, error (this also covers
# "werror"), javascript and text/css are ignored; anything else is indexed.
def ignore(type)
  case type
  when /text.(html|plain)/
    nil
  when /(audio|image|video|redirect|error|javascript|text.css)/
    true
  else
    nil
  end
  ##when /plain\/text/ # some weirdo sites return this ;)
end

# Load the Marshal'ed timestamp table stored as DDIR/+name+.
# Returns the loaded object, or an empty Hash when the file cannot be opened
# or loaded (e.g. on the very first run).
def get_times(name)
  begin
    File.open(File.join(DDIR, name), "rb") { |df| Marshal.load(df) }
  rescue
    #print "get_times #{name}: rescue #{$!}\n"
    Hash.new
  end
end

# Marshal +times+ into DDIR/+name+. Failures are deliberately ignored
# (best effort); the data is simply recomputed on a later run.
def save_times(name,times)
  begin
    File.open(File.join(DDIR, name), "wb") { |df| Marshal.dump(times, df) }
  rescue
    #print "save_times #{name}: rescue #{$!}\n"
  end
end

# ---- main script ----

# domain dir name => mtime seen at the last successful run
dirtimes = get_times "dirtimes"
newdirtimes = dirtimes.dup
# Never filled at the moment: the push in the stdout reader below is
# commented out, so the "unclean" branch at the end updates nothing.
domains_done = Array.new
$unclean = nil
# list of [ftimes, domain] pairs to be saved once estcmd finished cleanly
mftimes = []

Dir.chdir SPD

estin, estout, esterr, estpid = popen4(ECMD)
$pid = estpid

# Follow estcmd's stdout: report progress per domain and track whether it
# finished successfully or was stopped by a signal.
out_reader = Thread.new {
  lastdomain = nil
  estout.each { |line|
    domain = nil
    #print line
    case line
    when /^estcmd: INFO:.* \(http:\/\/([^\/]*)\/.*: registered$/
      domain = $1
    when /^estcmd: INFO:.*\/http\/([^\/]*)\/D.*: passed$/
      domain = $1
    when /^estcmd: INFO:.*signal.* catched$/
      $unclean = true
    when /^estcmd: INFO: finished successfully/
      $unclean = nil
    else
      print " STDOUT<< #{line}\n"
    end
    if domain != lastdomain
      ## does block with ruby 1.6.8 :(
      #domains_done.push lastdomain if lastdomain
      print "done #{lastdomain}, doing #{domain}\n"
      lastdomain = domain
    end
  }
}

# Echo everything estcmd writes to stderr.
err_reader = Thread.new {
  esterr.each { |line| print " ERR<< #{line}\n" }
}

Dir.chdir "http"
Dir.glob(WHITELIST).each { |domain|
  #print "domain #{domain}, dtime=#{File.stat(domain).mtime}, last_update=#{dirtimes[domain]}\n"
  break if $finish
  # unchanged directory mtime => nothing was added or removed in this domain
  next if !$force && File.stat(domain).mtime == dirtimes[domain]
  newdirtimes[domain] = File.stat(domain).mtime
  Dir.chdir domain
  # data file name => [mtime, type] as recorded by the previous run
  ftimes = get_times "domain_" + domain
  Dir.glob("D*") { |fle|
    t = ftimes[fle]
    next if !$force && t && t[0] == File.stat(fle).mtime
    # something changed, recheck type
    type = get_type fle
    #print "type #{type}\n"
    ftimes[fle] = [File.stat(fle).mtime, type]
    next if ignore(type)
    # rescue necessary if some 3rd party program left cache inconsistent..
    begin
      # The matching "U..." file holds the URL on its first line; the block
      # form closes the file again instead of leaking one handle per document.
      url = File.open("U" + fle[1..-1]) { |uf| uf.gets }
      estin.puts SPD + "/http/" + domain + "/" + fle + "\t" + url + "\n"
    rescue
    end
  }
  mftimes << [ftimes, domain]
  Dir.chdir ".."
}
Dir.chdir ".."

# that's all, send eof to estcmd
estin.close

# wait for the reader threads to finish (they end when estcmd closes its
# stdout/stderr)
[out_reader, err_reader].each { |thr| thr.join }
estout.close
esterr.close

# /usr/include/bits/waitstatus.h
#define __W_EXITCODE(ret, sig) ((ret) << 8 | (sig))

#err=Process.waitpid2($pid)
# err[1] is error code - doesnt work atm

if !$unclean
  print "returned ok, updating timestamps\n"
  mftimes.each { |ftimes, domain| save_times "domain_" + domain, ftimes }
  save_times "dirtimes", newdirtimes
else
  print "unclean return? not updating tstamps\n"
  # selectively update where sure..
  domains_done.each { |dom| dirtimes[dom] = newdirtimes[dom] }
  save_times "dirtimes", dirtimes
end