#!/usr/local/bin/perl
#
# aub:  assemble usenet binaries
#
# Mark Stantz, stantz@sgi.com, stantz@sierra.stanford.edu
#
# aub v1.0  1/92  Basic functionality only.  Buggy.
#     v1.1  3/92  Cleaned up, documented, released, added functionality.
#     v2.0  8/92  Major rewrite, numerous enhancements, speed hacks.
#                 Now requires a formal configuration file.
#
# This code is offered as-is.  Anyone is welcome to make improvements,
# provided that my notice of authorship is retained.  I accept no
# responsibility for loss or damage caused by this program, nor do I accept
# responsibility for supporting it.
#
# Most of the documentation for this program is self-contained.
#
#     aub -m    Prints the short form of the documentation
#     aub -M    Prints the long form
#     aub -C    Lists changes made since previous version
#
#
# DEPENDENCIES:
#
#     /bin/echo   -- &find_pg will look for these for you if
#     /bin/sum       they aren't in the usual place.  Though
#     /bin/cat       I don't know of any *NIX so bizarre...
#     /bin/mv
#     which       -- These had better be in your path;
#     whereis        I'm not looking for them.  But they
#                    only help you if your other stuff isn't
#                    in the standard place.

# Locate the external helper programs; &find_pg is asked to search for any
# that are not executable at the conventional path.
$ECHO = "/bin/echo";  &find_pg("echo", 0) unless (-x $ECHO);
$SUM  = "/bin/sum";   &find_pg("sum", 0)  unless (-x $SUM);
$CAT  = "/bin/cat";   &find_pg("cat", 0)  unless (-x $CAT);
$MV   = "/bin/mv";    &find_pg("mv", 0)   unless (-x $MV);

# CONSTANTS: most likely you want to leave these alone.
#
#
# $aub_rcfile         Keeps track of what groups we assemble
#                     binaries in, and what articles there we've
#                     not yet resolved.
#
# $aub_lock           A lockfile, used to ensure that only one
#                     instance of aub runs at a time.
#
# $aub_tmp            A temporary file, used in the construction
#                     of $aub_rcfile
#
# $general            Magic cookie for accessing general aub
#                     parameters.  Must be an invalid newsgroup
#                     name.
#
# $aub_dir            If the AUBDIR environment variable is set, it
#                     will override the general (but not specific)
#                     aub directory specified in the configuration
#                     file.
#
# $aub_desc           Specifies the general description file.
#
# $aub_hook           Specifies the general hook program.
#
# $mini_aub_config    Unqualified name of our configuration file.
#
# $aub_config         Qualified name of our configuration file.
#
# $timeout_interval   How long we'll wait for a response from an
#                     NNTP server before deciding it's died on us.
#
# $version            Version number of this program.
#
# $last_version       Version number of previous release.
#
# $temp_decode_file   Another temporary file; binaries are actually
#                     assembled here before being moved into an
#                     aub directory.
#
# $nntpserver_file    A place aub can find the name of the NNTP
#                     server it's supposed to use.  This is ignored
#                     if the user's NNTPSERVER environment variable
#                     is set, or if an NNTP server is specified in
#                     the configuration file.
#
# $obsolete_init      Name of obsolete v1.1 pre-warp-drive proto-
#                     configuration file.
#
# @sigs               List of signals that might interrupt our
#                     progress.
#
# @extn_hints         Extensions which help us recognize binary
#                     images in subject lines.
$aub_rcfile         = join("/", $ENV{"HOME"}, ".aubrc");
$aub_lock           = join("/", $ENV{"HOME"}, ".aub_lock");
$aub_tmp            = join("/", $ENV{"HOME"}, ".aubtmp");
$general            = ".";
$aub_dir{$general}  = $ENV{"AUBDIR"};
$aub_desc{$general} = $ENV{"AUBDESC"};
$aub_hook{$general} = $ENV{"AUBHOOK"};
$mini_aub_config    = ".aubconf";
$aub_config         = join("/", $ENV{"HOME"}, $mini_aub_config);
$timeout_interval   = 60;
$version            = "2.0.5";
$last_version       = "1.1";
$temp_decode_file   = "/usr/tmp/aub.decode$$";
$nntpserver_file    = "/usr/local/lib/rn/nntpserver";
$obsolete_init      = join("/", $ENV{"HOME"}, ".aubinit");

@sigs = ('HUP', 'INT', 'QUIT', 'ILL', 'TRAP', 'IOT', 'EMT', 'FPE',
         'BUS', 'SEGV', 'SYS', 'TERM', 'USR1', 'USR2', 'XCPU', 'XFSZ',
         'PIPE');

@extn_hints = (".gif", ".jpg", ".jpeg", ".gl", ".zip", ".au", ".zoo",
               ".exe", ".dl", ".snd", ".mpg", ".mpeg", ".tiff", ".lzh",
               ".wav");

#
# GLOBAL variables
#
# There are zillions of 'em, and I don't have the energy to document them all
# just now.  If you see something that's not explicitly defined as local(),
# assume it's global.
#

#
# Main program -- Setup code.
#
# Parse arguments.  If only documentation is requested, deal with it ASAP
# and bail out.
#
# I probably should have an argument that lets you specify an alternate
# configuration file, but if I do that, there'll be some bozo who won't
# realize I've broken compatibility with the v1.1 configuration file and
# will just use the argument to point v2.0 at his obsolete .aubinit file.
# Then I'll get mail when it doesn't work.  I don't want to deal with that,
# so I'm delaying the command line option until v2.2.  (Almost certainly
# v2.1 will be bug fixes for v2.0.)
#

(&Getopts("cd:nCMm")) || (exit(1));

&long_manual  if ($opt_M);      # None of these subroutines
&short_manual if ($opt_m);      # will return...
&changes      if ($opt_C);

# Make sure we clean up when we catch an unexpected signal

foreach (@sigs) {
    $SIG{$_} = 'handler';
}

# Make a first guess at what file we want to use as our NNTP server.
If the # user's NNTPSERVER environment variable is defined, we'll use what's in it. # Otherwise, if $nntpserver_file exists, we'll take a hint from it. if ($ENV{"NNTPSERVER"}) { $server = $ENV{"NNTPSERVER"}; } elsif (-r $nntpserver_file) { chop($defserver = `$CAT $nntpserver_file 2>/dev/null`); $defserver = $1 if ($defserver =~ m/^([^\n]+)\n/); } # Load the configuration file. &load_config_file; # # At this point, we should know for certain whether or not we're using # disk-based news access or NNTP access. If we're going to use NNTP, # we have to have figured out what our server is. # # If our news access is NNTP-based, we need the &AF_INET and &SOCK_STREAM # subroutines defined. These must come from sys/socket.ph, a perl include # file. The user's system may not have this file on it if no-one's run # h2ph there. We'll explain all this to them if we have to. # # We don't want to depend on &AF_INET and &SOCK_STREAM being defined if # we don't have to, so we'll only try to load them if we're running NNTP- # based. if ($spooldir) { print "Using disk-based news access; spool directory is $spooldir\n" if ($opt_d); } else { &abort("Can't find an NNTP server; please define your NNTPSERVER " . "environment variable\nor specify an NNTP server or disk based " . "news access in your configuration file.") unless ($server); if ($opt_d > 1) { print "Searching library directories:"; foreach $libr (@INC) { print " $libr"; } print " for sys/socket.ph...\n"; } &need_to_run_h2ph unless (eval("require ")); print "Using NNTP-based news access; server is $server\n" if ($opt_d); # For NNTP access, we'll need to figure out the remote NNTP service port # number (119 is standard) ($name, $junk, $port, $junk) = getservbyname("nntp", "tcp"); &abort("Can't find port number for NNTP service") unless ($name); # Open up a connection to the remote nntp server. 
&connect_tcp(SOCKET, $server, $port, 1); # Connect or bust $connected_to_server++; # For cleanup routines &setup_socket_io(SOCKET); # Necessary I/O setup stuff &get_nntp_header(SOCKET); # Discard server greeting banner } # # Make sure we're the only instance of aub running for this user (actually, # for the set of all users having the same home directory as this user.) # &get_lock; # # All the skanky code that used to be here to pre-validate groups has been # removed for speed purposes. We can validate on the fly. # # Load in all the old group information. &load_aub_rcfile; # # Main Loop -- iterate over groups we're interested in decoding binaries from # foreach (@Groups) { $main_loop_group = $_; # Isn't this ugly? $aubdir = ($aub_dir{$_}) ? $aub_dir{$_} : $aub_dir{$general}; if (!chdir($aubdir)) { &warn("Could not cd to directory $aubdir...skipping group $_"); next; } print "Assembling binaries from $_\n" if ($opt_d); print " Unpacking into directory $aubdir\n" if ($opt_d > 1); $aub_desc = ($aub_desc{$_}) ? $aub_desc{$_} : $aub_desc{$general}; close(DESC); # Can't hurt if not open if (($aub_desc) && (!open(DESC, ">> $aub_desc"))) { &warn("Could not open (to append) description file $aub_desc...\n" . "...Skipping group $_"); next; } $aub_hook = ($aub_hook{$_}) ? $aub_hook{$_} : $aub_hook{$general}; @aub_pp = split(" ", ($postprocessor{$_} ? $postprocessor{$_} : $postprocessor{$general})); # Get the first and last article numbers in the group we're interested in. ($first, $last) = &set_group($_); if ((!$first) && (!$last)) { # This group is bogus $_ = ""; # Don't checkpoint this next; } # We'll need to known next time what the last article we saw this time around # was. # Look at the list of unresolved articles for this group which was loaded # from $aub_rcfile. Any unresolved articles that are no longer stored on # the nntp server must be discarded. We load all other articles into memory. 
print "Previously unresolved articles: $unresolved_list{$_}\n" if ($opt_d); foreach (split(" ", $unresolved_list{$_})) { &load_article($_) unless ($_ < $first); } $unresolved_list{$_} = ""; # We've also loaded from $aub_rcfile the article number of the last article # we saw in this group the last time we ran. If for some reason this has # wound up undefined (it shouldn't be), or if the first article in the # newsgroup is now bigger than the last article we've seen (newsgroup has # been reset), we initialize this value. # # Note that when a new group is loaded for the first time, this is set to # zero, which is most likely smaller than the first article in the group. # $last_article_seen{$_} = $first - 1 if ((!$last_article_seen{$_}) || ($last_article_seen{$_} < $first)); print "Last seen $last_article_seen{$_}, first $first, last $last\n" if ($opt_d); # Load all of the new articles in the group into image queues &load_article(++$last_article_seen{$_}) while ($last_article_seen{$_} < $last); # We use the concept of 'pictures' (ie, binary images) and pieces of # pictures while naming our variables, but really we're just putting together # the pieces of binary files... # # Each image queue (image queues are built by &load_article; see the # discussion there) potentially contains a picture. # # For each potential picture, extract the name of the picture and the number # of pieces that are known to be in it. Build an array, @pairs, containing # N items, where N is the number of pieces of the picture that we actually # have, and where each item contains (encoded) the article number of the ith # piece. foreach $picture (keys %image_queues) { print "Image $picture..." if ($opt_d); ($picture_name, $picture_pieces) = split(":", $picture); @pairs = split("%", $image_queues{$picture}); # If we don't yet have all of the pieces of the picture, then this picture # is unresolved. 
We want to keep track of the pieces we do have in # a list, which we'll use later in creating our new $aub_rcfile. We write # out the information about unused pieces of images to our $aub_tmp file. if ($#pairs+1 < $picture_pieces) { # Don't have all pieces yet print "incomplete\nNoting unresolved article(s) " if ($opt_d); foreach $pair (@pairs) { ($piece, $article_no) = split(";", $pair); $unresolved_list{$_} = join(" ", $unresolved_list{$_}, $article_no); print "$article_no " if ($opt_d); } print "\n" if ($opt_d); next; } # We've found a new, complete binary image. Build a list of all of the # article numbers in it, then pass this list to the &assemble_picture code, # which creates the binary. undef @articles_this_picture; print "Complete\n" if ($opt_d); foreach $pair (@pairs) { ($piece, @articles_this_picture[++$#articles_this_picture]) = split(";", $pair); } # If the user has stuck a hook into aub, now's the time to ask it whether or # not it really wants us to do the decoding. if ($aub_hook) { $sl = &get_subj_line(@articles_this_picture[0]); `$ECHO "$sl" 2>/dev/null | $aub_hook >/dev/null 2>/dev/null`; $hook_result = $?; print (($hook_result) ? "Hook $aub_hook returned FALSE; not decoding\n" : "Hook $aub_hook returned TRUE; decoding\n") if ($opt_d); } else { $hook_result = 0; } if (!$hook_result) { (($newsgroupdir = $_) =~ y/\./\//) if ($spooldir); &assemble_picture(@articles_this_picture); # Invoke a postprocessor, if necessary. @pp = @aub_pp; while ($#pp > -1) { if ($real_name =~ m/$pp[0]$/i) { &post_process($real_name, $postprocessor_def{$pp[1]}); last; } else { shift @pp; shift @pp; } } } } # Checkpoint the $aub_rcfile file. Blow away our current %image_queues # array, which could well mess us up the next time around the loop. 
&checkpoint; undef %image_queues; } # # Cleanup code -- leave things in an orderly state # close(DESC) if ($aub_desc); &cleanup; exit(0); # # Subroutines -- the first bunch handle decoding images (recognition code) # # Things have gotten pretty ugly in here lately, with global variables # in use all over the place... # # If you are hacking around in here improving aub's ability to recognize # binary images, and you come up with something clever, let me know. # Except, I'm not interested in hacks that modify &get_subj_line to scan # the _body_ of the article to look for lines of the form 'begin X NNN' -- # you have to assume that people have some amount of common sense. # sub load_article { # # Given an article number, we grab the subject line of the article and # look to see if it might be something we're interested in. If it's not, # we return. # # We are certainly not interested in: # # -- Articles with no subject line # -- Articles beginning with "Re:" # # We are interested in: # # -- Articles containing a string of the form: # (all white space is optional, case not significant) # - N of N (N is any number) # - N fo N (bad typist not using posting software) # - N / N # - N \ N (dyslexic poster?) # - N | N (dyslexic poster?) # # Given something sufficiently interesting, we guess that it's a piece of a # binary image. # # Given such a beast, we note how many pieces there are all together and # which piece of the image this is, then try to deduce the name of the # entire image. # # If we have no better guess, we use all the stuff preceding the # interesting part (described above) as the name of the image. However, # if we find a string containing ".gif", ".jpg" or something like that -- # a string containing any one of the extensions in the list of hints -- # we use that as the name of the image instead. 
# # It really doesn't matter what we use as the name, so long as it's the # same for all of the pieces of the image, and so long as it's not the # same as any pieces of other images. But people reposting articles # with the same names will screw us up if we're too naive. See below. # # We map colons in the name of the image to "X's", since they can really # mess us up later in life (we use ":" as one of our delimiters, internally.) # # Each identified image piece is inserted into an image queue. Image # queues are named "PICTURE_NAME:NUMBER_OF_PIECES_IN_PICTURE". They are # priority queues; the priority associated with each piece of an image is # the piece number, and the data is the image piece article number. # # Note that later while scanning all of the pieces in a given image queue, # this use of priority values will cause us to see the pieces (article # numbers) in the correct order. So we're doing some implicit sorting here. # # All numerical values derived from article subject lines are padded with # a (rather large) number of zeros, so that we never get confused and think # that piece "02 of 3" should follow piece "1 of 3". # # To keep from mixing pieces of reposted binaries up with the original # postings, we stick the string "reposted." in front of the name. # # There are some more smarts buried in the &insert_q routine. # # Our decoding ability is substantially greater than that of v1.1 now. 
# local($article) = @_[0]; local($sl, $hint, $piece, $pieces, $name); $sl = &get_subj_line($article); print " Got article $sl\n" if ($opt_d > 1); return if (($sl eq "") || ($sl =~ m/^\s*Re:/i)); $sl =~ y/A-Z/a-z/; return unless (($sl =~ m/^(.*\D)(\d+)\s*o\s*f\s*(\d+)/i) || ($sl =~ m/^(.*\D)(\d+)\s*f\s*o\s*(\d+)/i) || ($sl =~ m/^(.*\D)(\d+)\s*\/\s*(\d+)/) || ($sl =~ m/^(.*\D)(\d+)\s*\|\s*(\d+)/) || ($sl =~ m/^(.*\D)(\d+)\s*\\\s*(\d+)/)); $piece = &pad($2); $pieces = &pad($3); $name = $1; foreach $hint (@extn_hints) { if ($sl =~ m/\S+$hint/) { $name = $&; last; } } $name = "reposted.$name" if ($sl =~ m/repost/i); $name =~ s/:/X/g; # Avoid nasty, subtle bug print " Recognized piece $piece of $pieces, binary $name\n" if ($opt_d > 1); &insert_q(join(":", $name, $pieces), $piece, $article); } sub assemble_picture { # # Given a list of article numbers which make up a given image, we want to # produce the complete, uudecoded file. # # For each article, we use a state machine to ensure that we only uudecode # binary data, not news envelope or text or .sig garbage that the poster # may have included. # # The state machine is discussed a little in the &process_line subroutine. # How it works is neither immediately obvious nor completely documented. # You can mess things up really good by tinkering with it, too. Unless you # can rewrite uumerge or uucat from memory, better not tinker with this stuff. # # This could be made more readable, but it would be a performance trade-off, # and good performance is highly desirable, since we're doing a lot of # interaction with the NNTP server / news files in here. # # We take care not to generate images which exactly match previously generated # images with similar names. local($state, $last_piece, $rawfile); # # If we're in catch up mode, we don't want to do much. We will go as far # as to list the article numbers we would have assembled had we been really # decoding articles. This information can be very useful during debugging. 
# if ($opt_c) { return unless ($opt_d); print "Catch-up mode...skipping assembly of articles "; foreach (@_) { print "$_ "; } print "\n"; return; } # Yes, all of these are global... $state = 0; # Initialize state machine $last_piece = 0; # Set for last piece of image $desc_text = ""; # Text describing this image $desc_lines = 0; # Running count $desc_text lines $real_name = ""; # Name of binary actually made $possible_duplicate = ""; # This is a global print " Assembling articles " if ($opt_d > 1); if (!$spooldir) { # ***Using NNTP*** foreach (@_) { # For each piece... print "$_ " if ($opt_d > 1); $state = 2 if ($state); # This is black magic $last_piece++ if ($_ eq $_[$#_]); # Set on last piece of image &putline(SOCKET, "body $_"); # Ask for article text $data = &getline(SOCKET); # Get header response if ($data !~ m/^222\s/) { &warn("Could not get body of article $_"); close(DECODE); # Abandon ship unlink($temp_decode_file); return; # Perhaps it just got expired } while (1) { # Not infinite, just fast $data = &getline(SOCKET); # Get a line of the article last if ($last_nntp_line); # Last line this article next if ($state == 3); # Dump remainder of article $state = &process_line($data, $state); # One step thru state machine } } } else { # ***Using raw disk files*** foreach (@_) { # For each piece print "$_ " if ($opt_d > 1); $state = 2 if ($state); # Black magic again... $last_piece++ if ($_ eq $_[$#_]); # Set on last piece of image $rawfile = "$spooldir/$newsgroupdir/$_"; # Filename of the article if (!open(RAWNEWS, "< $rawfile")) { # Can't read the article? 
&warn("Could not open $rawfile for reading"); close(DECODE); close(RAWNEWS); unlink($temp_decode_file); return; } do { # Discard the news envelope chop($data = ); print " Discard: $data\n" if ($opt_d > 2); } while ($data !~ m/^\s*$/); foreach $data () { last if ($state == 3); chop $data; print " Read: $data\n" if ($opt_d > 2); $state = &process_line($data, $state); } close(RAWNEWS); } } close(DECODE); # Temporary file if ($possible_duplicate) { while ($possible_duplicate ne $real_name) { if (&identical($temp_decode_file, $possible_duplicate)) { print "\n" if ($opt_d > 1); print "Suppressing $real_name; would duplicate $possible_duplicate\n" if ($opt_d); unlink($temp_decode_file); $suppressed++; return; } $possible_duplicate .= "+"; } } `$MV '$temp_decode_file' '$real_name' >/dev/null 2>&1`; print DESC $desc_text if ($aub_desc); # Print only if we decode print "\nActual image name is $real_name\n" if ($opt_d > 1); } sub process_line { # # State machine for processing data from articles: # # This isn't the best explanation possible, I realize. # # We process states in order of frequency of their occurrence, to make things # go fast... # # STATE MEANING # # 0 Processing first article, "begin" line not yet seen. # 1 Processing a not-last article, data lines should # begin with "M" # 2 Processing non-first article, scanning for # continuation of binary data (M-lines) # 3 Through with an article; further data from this # article should be discarded. # 4 Processing last article, data lines should be copied # until 'end' seen. # # # There is now a terrible hack in here to make things work for the group # comp.binaries.os2, where people like to post stuff that contains lines # beginning with 'Minimum-OS2-Version:' before the uuencoded data continues # in pieces 2+ of the binary. aub was seeing the leading 'M' and expecting # uuencoded data to immediately follow, then dropping into state 3 when it # didn't appear. Instant lobotomy. 
# # Too bad we can't dump anything beginning with /M\S+: /, but alas, that's # perfectly legal... # local($data, $state) = @_; local($begin, $mode, $rest); local($dashes) = "-------"; if ($state == 1) { # State 1 == most common state return 3 unless ($data =~ m/^M/); # End of binary data this piece print DECODE unpack("u", "$data\n");# This is uuencoded data; decode it return 1; # Expect to copy more binary data } if ($state == 4) { # State 4 == 2nd most common state return 3 if ($data =~ m/^end/); # Discontinue decoding when 'end' seen print DECODE unpack("u", "$data\n");# Decode this data, except for 'end' return 4; # Continue this state until 'end' } if (!$state) { # State 0 == 3rd most common state if ($data !~ m/^begin [0-9]/) { # Looking for "begin"; not seen yet? return 0 if ($data =~ m/\-\-\-/); # HACK -- keep ugliness out of $aub_desc return 0 if ($desc_lines++ > 60); # HACK -- some morons post sh archives chop $desc_text if ($spooldir); # HACK -- dump trailing \n when non-NNTP $desc_text = join("\n", $desc_text, $data) if ($aub_desc); return 0; # Copy to desc file if $aub_desc defn. } ($begin, $mode, $real_name, $rest) = split(/\s+/, $data, 4); # Sanity check binary names...we don't like things posted with pathname # components in the name, weird characters, names beginning with ".", or # binaries names "." or ".." $real_name = $1 if ($real_name =~ m/\/([^\/]+)$/); $real_name = $1 if ($real_name =~ m/^\.(.*)$/); $real_name =~ tr/[A-Z][a-z][0-9]+-=_:;.,//dc; $real_name = "Mangled" if (($real_name eq ".") || ($real_name eq "..") || ($real_name eq "")); $mode = 644 if (!$mode); # We don't like mode zero binaries if (-e $real_name) { $possible_duplicate = $real_name; # Would collide with this... 
$real_name .= "+" while (-e $real_name); } $desc_text = # Stuff group, name into desc info "$dashes <$main_loop_group:$real_name> $dashes\n$desc_text\n\n" if ($aub_desc); unlink($temp_decode_file); (open(DECODE, "> $temp_decode_file")) || &abort("Could not open temporary file $temp_decode_file for writing"); chmod(oct($mode), $temp_decode_file); return 1 if (!$last_piece); # If 1-N-1, -> state 1, else -> state 4 return 4; } if ($state == 2) { # Looking for beginning of >1st piece return 2 unless ($data =~ m/^M/); # Haven't found it yet. return 2 if (length($data) < 61); # Want uuencoded data, not just anything return 2 if ($data =~ m/^Minimum-OS2-Version:/); # What next? print DECODE unpack("u", "$data\n");# Found it, need to decode it return 1 if (!$last_piece); # Not last piece -> state 1 return 4; # Copy last piece data from state 4. } # We should not be called when in state 3; all other states are undefined. &abort("Logic error in state machine"); } sub insert_q { # # Insert item $item with priority $priority into an image queue $queue. # # Both the priority and the item are actually stored in the queue as # the string "$priority;$item". # # A queue is just a string; values in the queue are separated by # "%" characters. # # Since our items are made entirely of numbers, we don't need to worry # about getting confused by queue data that happens to coincide with the # characters we use as delimiters. # # Sometimes we find ourselves in the position of trying to add an item # of priority P to a queue which already contains an item with priority P. # We know that if we go ahead and do this, it will likely hose us, since # there should be one and only one piece P of Q in any given binary. Since # we are called by code that processes articles in sequential order of # subject lines, as a heuristic, we'll assume that the later article should # supersede the previous article. 
This is reasonable; although it is # clearly not _defined_ to be the case that article X is more recent than # article Y when the article number of X is larger than that of Y, such is # often the case in practice. Besides, there's no perfect way to decide # which of the two articles is to be preferred. It turns out that this helps # us a lot more often than it hurts us. # # Things here should have been sped up slightly since v1.1. ($queue, $priority, $item) = @_; local(@a_queue) = split("%", $image_queues{$queue}); for ($i=0; $i <= $#a_queue; $i++) { last if $a_queue[$i] >= $priority; } if ($a_queue[$i] =~ m/^${priority};/) { $a_queue[$i] = "$priority;$item"; print " Replacing piece $priority of $queue with (presumably) newer data\n" if ($opt_d> 1); } else { splice(@a_queue, $i, 0, "$priority;$item"); } $image_queues{$queue} = join("%", @a_queue); } # # More subroutines -- These deal with accessing news articles. # sub get_subj_line { # # Gets the header information of the specified article, and returns the # subject line, removing the "Subject:" portion. # # If we are using NNTP and the server understands the XHDR NNTP enhancement, # we use it, unless the NOHXDR keyword appeared in the configuration file. # local($article_no) = pop(@_); local($sub_line) = ""; local($resp); # If $spooldir is defined, we're accessing articles directly instead of # using the NNTP protocol. if ($spooldir) { return "" unless (open(SUBJECT, "$spooldir/$newsgroupdir/$article_no")); while (!eof(SUBJECT)) { $sub_line = ; next unless ($sub_line =~ m/Subject:\s*(.*)$/); close(SUBJECT); return $1; } close(SUBJECT); return ""; } # At this point we know we're using NNTP. See whether or not we've already # decided if XHDR works or not. If we haven't decided yet, we decide now. if (!$have_gotten_subj_line_before) { # Another global variable $have_gotten_subj_line_before++; &putline(SOCKET, "xhdr subject $article_no"); $resp = &getline(SOCKET); if ($resp =~ m/^221\s/) { # XHDR is supported! 
$xhdr_supported++; # Yup, this is global too do { $resp = &getline(SOCKET); # Data's pending, so we use it $sub_line = $1 if ($resp =~ m/^\d+\s+(.*)$/); } until ($resp eq "\."); return $sub_line; # There's the answer } } # How we make the NNTP query for the subject line depends on whether or not # we can use XHDR. &putline(SOCKET, ($xhdr_supported) ? "xhdr subject $article_no" : "head $article_no"); $resp = &getline(SOCKET); return "" unless ($resp =~ m/^221\s/); # Not the expected response do { $resp = &getline(SOCKET); if ($xhdr_supported) { $sub_line = $1 if ($resp =~ m/^\d+\s+(.*)$/); } else { $sub_line = $1 if ($resp =~ m/^Subject:\s*(.*)$/); } } until ($last_nntp_line); return $sub_line; } sub set_group { # # Return the first and last article numbers of the group we're interested in. # If we're using NNTP, we'll use the 'GROUP' command to set the group we're # talking about and return the data the remote nntpd gives to us. If we're # using raw disk files, we'll have to do a little more work than this... # # When processing raw spool directories, $newsgroupdir is set as a side-effect. # # We return a list (f,l) where f is the first available article in the # group and l is the last available article. (0,0) indicates we've run # into a problem. # local($group_to_examine) = @_[0]; local($data, $file); local($min) = "first_time"; local($max) = 0; if ($spooldir) { # Disk based spool data ($newsgroupdir = $group_to_examine) =~ y/\./\/\//; if (!opendir(DIRECTORY, "$spooldir/$newsgroupdir")) { &warn((-d "$spooldir/$newsgroupdir") ? "Cannot open $spooldir/$newsgroupdir; skipping it." : "Invalid group: $group_to_examine...ignoring"); return (0,0); } foreach $file (readdir(DIRECTORY)) { next if (($file eq ".") || ($file eq "..")); next unless ($file =~ m/^\d+$/); $min = $file if (($min eq "first_time") || ($file < $min)); $max = $file if ($file > $max); } close(DIRECTORY); return ($min eq "first_time" ? 
0 : $min, $max); } &putline(SOCKET,"group $group_to_examine"); $data = &getline(SOCKET); if ($data !~ m/^211\s/) { # No such group &warn("Invalid group: $group_to_examine...ignoring"); return (0,0); } $data =~ m/^211\s+\d+\s+(\d+)\s+(\d+)\s/; return ($1,$2); } sub get_nntp_header { # # Grab the nntp banner line from the server, sanity check it, and return. # This code is only useful if we're doing NNTP. # local($sock) = pop(@_); local($line) = &getline($sock); return if (($line =~ m/^200\s/) || ($line =~ m/^201\s/)); &abort("Remote nntp service doesn't look like nntp service to me."); } sub setup_socket_io { # # Make the passed socket unbuffered, and invoke a timeout routine if we # don't hear something within finite time. # # I don't really have an 'initialization' section, so this is the next # best place for this... # local($sockname) = pop(@_); local($old_def_fh) = select($sockname); $SIG{"ALRM"} = 'getline_timeout'; $|=1; select($old_def_fh); } sub getline { # # Get a line of data from a socket. This code is used only if our news # access is NNTP-based. # # Note that $socket_stuff is a global variable # local($sockname) = pop(@_); local($data) = ""; local($inp); $timeout_flag = 0; # Not a local variable alarm($timeout_interval); # Can't wait forever. while ($socket_stuff !~ m/\n/) { # No complete line in buffer recv($sockname, $inp, 256, 0); # Need more data $socket_stuff .= $inp; # Append to our buffer. } alarm(0); # Got data. Shut off alarm. &abort("Line from server was too long") if # Still not end of line? ($socket_stuff !~ m/\r\n/); ($data, $socket_stuff) = split("\r\n", $socket_stuff, 2); $last_nntp_line = ($data eq "\."); # Can't confuse with ".." $data =~ s?\.?? if ($data =~ m/^\.\./); # NNTP doubles leading "." print " Received: $data\n" if ($opt_d > 2); return $data; } sub getline_timeout { # # What can we do if our server isn't talking? Not much. 
# &abort("NNTP server not responding after $timeout_interval seconds."); } sub putline { # # Send a line of data over a socket. # local($line) = pop(@_); local($sockname) = pop(@_); print $sockname "$line\r\n"; print " Sent: $line\n" if ($opt_d > 2); } sub connect_tcp { # # Connect to a tcp port on some host. This code is useful in more places # than just in aub. local($e) = pop(@_); # 0=return on err, >0 = print error, abort on err local($port) = pop(@_); # port to connect to local($server) = pop(@_); # name of server to connect to local($sockname) = pop(@_); # socket to use local($packing_template) = "S n a4 x8"; local($protocol) = "tcp"; local($thishost, $problem, $junk); $thishost = `hostname`; chop $thishost; # Figure out our address... ($name, $junk, $junk, $junk, $ouraddr) = gethostbyname($thishost); if ($name eq "") { $problem="Can't get address of this host (\"$thishost\")"; &abort($problem) if $e; return $e; } # And the address of the host we want to connect to ($name, $junk, $junk, $junk, $theiraddr) = gethostbyname($server); if ($name eq "") { $problem = "Can't find address of server $server"; &abort($problem) if $e; return $e; } # Get the number of the protocol we're to use ($name, $junk, $proto) = getprotobyname($protocol); if ($name eq "") { $problem="Unrecognized protocol: $protocol"; &abort($problem) if $e; return $e; } $us = pack($packing_template, &AF_INET, 0, $ouraddr); $them = pack($packing_template, &AF_INET, $port, $theiraddr); # Get a socket filehandle if (!(socket($sockname, &AF_INET, &SOCK_STREAM, $proto))) { $problem="Could not create socket"; &abort($problem) if $e; return $e; } if (!connect($sockname, $them)) { $problem="Could not connect to server"; &abort($problem) if $e; return $e; } } # # Subroutines -- dedicated to supporting postprocessors # sub post_process { # # Invoke postprocessor command $cmd on $file. # # The error checking here is pretty minimal... 
#
#    Arguments: name of the decoded binary, postprocessor command template.
#    In the template, unescaped "$h", "$t" and "$f" are replaced by the
#    head, tail and full name of the file; "\$h" etc. yield a literal "$h".
#    The head is everything before the LAST "." and the tail everything
#    after it, as the manual promises (the old code split at the first).
#
#    NOTE(review): $file is interpolated into a shell command.  If binary
#    names are not sanitized before we get here, a hostile name could inject
#    shell syntax -- confirm against the decoding code.
#
    local($file, $cmd) = @_;
    local($head, $tail, %part);

    if ($file =~ m/^(.*)\.([^.]*)$/) {          # Greedy: split at last "."
        $head = $1;
        $tail = $2;
    } else {
        $head = $file;                          # If no "." in image name,
        $tail = "";                             # there is no tail
    }
    %part = ('h', $head, 't', $tail, 'f', $file);

    # One pass handles everything: a placeholder at the very start of the
    # command, adjacent placeholders, and escaped ones.  Substituted text
    # is never rescanned.
    $cmd =~ s/(\\?)\$([htf])/($1 eq "") ? $part{$2} : "\$$2"/ge;

    print "Postprocessing: $cmd\n" if ($opt_d);
    `$cmd`;
}

#
#    Subroutines -- dedicated to loading and parsing configuration files and
#    files maintained by aub.
#

sub load_config_file {
#
#    Load the configuration file, which is now line-oriented.
#
#    We provide the user with some helpful hints if we run into problems
#    opening or processing his configuration file.
#
#    $current_group is a global variable used to keep track of which group
#    current keywords are to be applied to.
#
#    &parse_line does most of the work; we also use it to verify that anything
#    from the environment we'll be using is legitimate.
#
    local($extn, $cmd, $rest, $grp);

    if (!open(CONFIG, $aub_config)) {
        if (-f $obsolete_init) {
            print <<"EOF";
Hi there.

It looks like you're running aub version $version for the first time.
The format of aub's configuration file has changed since the version of
aub you last ran.  You need to create a new one.  It's very easy, and you
can get all the details from this program.  You'll want to call your new
configuration file \$HOME/$mini_aub_config.

You can print the short form of the documentation by executing 'aub -m',
and the long form by executing 'aub -M'.  Also, you can read about changes
made since the last version of aub by executing 'aub -C'.

If you have not yet read the new documentation for aub v$version, it would
really be a good idea to go over the whole thing again.  Many things have
changed.

Mark Stantz
EOF
            exit(1);
        }
        &abort("Couldn't open your configuration file \"$aub_config\".\n" .
               "Execute 'aub -m' (short form) or 'aub -M' (long form) if you " .
               "need help\nfiguring out how to create a configuration file");
    }

    # Validate any information which might be given to us via the environment.
    # The raw value is passed through; parse_line does its own splitting, so
    # a value containing white space is reported properly instead of shifting
    # its second word into the error-source argument.

    $current_group = $general;
    &parse_line("directory", $aub_dir{$general},
                "environment variable \$AUBDIR") if ($aub_dir{$general});
    &parse_line("desc", $aub_desc{$general},
                "environment variable \$AUBDESC") if ($aub_desc{$general});
    &parse_line("hook", $aub_hook{$general},
                "environment variable \$AUBHOOK") if ($aub_hook{$general});

    # Load and validate the configuration file

    foreach (<CONFIG>) {
        s/\r?\n$//;                     # Trim (safe if last line lacks \n)
        s/^#.*$//;                      # Remove comments
        s/([^\\])#.*$/$1/;              # ...keeping the char before the '#'
        s/\\#/#/g;                      # Unescape escaped comments
        next if m/^\s*$/;               # Skip blank lines
        s/\s+/ /g;                      # Fold every run of white space into 1
        s/^ //;                         # Drop leading and trailing
        s/ $//;                         # white space
        ($cmd, $rest) = split(/\s+/, $_, 2);
        &parse_line($cmd, $rest, "configuration file $aub_config");
    }
    close(CONFIG);

    # Make sure that certain things we need to know have been defined

    &conf_err("configuration file $aub_config",
              "Configuration file doesn't specify any groups to access")
        if ($#Groups < 0);

    if (!$aub_dir{$general}) {
        foreach $grp (@Groups) {
            next if ($aub_dir{$grp});
            &conf_err("configuration file $aub_config",
                "No directory to put decoded binaries in specified.\n Please " .
                "set the AUBDIR environment variable or modify your " .
                "configuration file");
        }
    }

    # Use configuration file NNTP server if specified.  Otherwise use NNTP
    # environment variable.  Otherwise use deduced NNTP server, if any.
    # All of this is ignored if $spooldir is ever set, in which case disk-based
    # access will be used.

    $server = $nntpserver if ($nntpserver);
    $server = $defserver unless ($server);

    if (($server eq "") && ($spooldir eq "")) {
        &conf_err("configuration file $aub_config",
            "No news spool directory or NNTP server specified or " .
            "locatable.\n Please specify where to access news in the " .
            "configuration file, or by\n setting the NNTPSERVER " .
            "environment variable");
    }

    foreach (@extn_hints) {             # dispel regexp magic (Tom
        s/(\W)/\\$1/g;                  # Christiansen's term)
    }
}

sub parse_line {
#
#    Handle a given line from the configuration file.
#
#    Arguments: keyword, the rest of the line (a single string), and a
#    description of where the line came from, used in error messages.
#
#    Position sensitive keywords are stored under $current_group and never
#    overwrite a value already recorded for that group.  Every error goes
#    through &conf_err, which does not return.
#
#    This is pretty straightforward, but tedious.  Almost certainly there's a
#    bug or two in here, but I don't see anything wrong with it at the moment.
#
    local($cmd, $args, $err_type) = @_;
    local(@args) = split(/\s+/, $args);
    local($extn, $group, $pp_name, $pp_cmd);

    if ($cmd =~ m/^dir/i) {                     # Directory keyword
        &conf_err($err_type, "No directory specified with DIRectory keyword")
            if ($#args < 0);
        &conf_err($err_type,
                  "Too many arguments specified with DIRectory keyword")
            if ($#args > 0);
        &conf_err($err_type, "Supposed directory $args is not a directory")
            unless (-d $args);
        &conf_err($err_type, "Directory $args is not writable")
            unless (-w $args);
        &conf_err($err_type,
                  "Directory $args is not searchable (executable)")
            unless (-x $args);
        $aub_dir{$current_group} = $args unless ($aub_dir{$current_group});
        return;
    }

    if ($cmd =~ m/^desc/i) {                    # Description file argument
        &conf_err($err_type,
                  "No description file specified with DESCription keyword")
            if ($#args < 0);
        &conf_err($err_type,
                  "Too many arguments specified with DESCription keyword")
            if ($#args > 0);
        # Opening for append creates the file if it is not there yet
        &conf_err($err_type, "Could not append to description file $args")
            unless (open(TEST, ">> $args"));
        close(TEST);
        $aub_desc{$current_group} = $args unless ($aub_desc{$current_group});
        return;
    }

    if ($cmd =~ m/^hook/i) {                    # Hook program
        &conf_err($err_type, "No hook program specified with HOOK keyword")
            if ($#args < 0);
        &conf_err($err_type, "Too many arguments specified with HOOK keyword")
            if ($#args > 0);
        $args = &find_pg($args, 1);
        &conf_err($err_type,
                  "Supposed hook program $args nonexistent or not a plain file")
            unless (-f $args);
        &conf_err($err_type, "Supposed hook program $args is not executable")
            unless (-x $args);
        $aub_hook{$current_group} = $args unless ($aub_hook{$current_group});
        return;
    }

    if ($cmd =~ m/^def/i) {                     # Postprocessor definition
        &conf_err($err_type, "No postprocessor named with DEFine keyword")
            if ($#args < 0);
        $args[0] =~ tr/A-Z/a-z/;
        &conf_err($err_type,
                  "No command associated with postprocessor $args[0]")
            if ($#args == 0);
        # Split the raw string so the command keeps all of its words
        ($pp_name, $pp_cmd) = split(/\s+/, $args, 2);
        $pp_name =~ tr/A-Z/a-z/;
        $postprocessor_def{$pp_name} = $pp_cmd;
        return;
    }

    if ($cmd =~ m/^post/i) {                    # Postprocessor usage
        &conf_err($err_type,
                  "No postprocessor named with POSTprocess keyword")
            if ($#args < 0);
        $args[0] =~ tr/A-Z/a-z/;
        &conf_err($err_type, "Postprocessor $args[0] used while undefined")
            unless ($postprocessor_def{$args[0]});
        &conf_err($err_type,
                  "Postprocessor $args[0] applied to no extensions")
            if ($#args == 0);
        $pp_name = shift(@args);
        foreach $extn (@args) {
            $extn =~ s/(\W)/\\$1/g;             # Dispel regexp magic
            $postprocessor{$current_group} .= "$extn $pp_name ";
        }
        return;
    }

    if ($cmd =~ m/^rec/i) {                     # Recognize an extension
        &conf_err($err_type, "No extensions named with RECognize keyword")
            if ($#args < 0);
        foreach $extn (@args) {                 # We'll dispel regexps later
            push(@extn_hints, $extn)
                unless (grep(($_ eq $extn), @extn_hints));
        }
        return;
    }

    if ($cmd =~ m/^noxhdr/i) {                  # NOXHDR keyword
        &conf_err($err_type, "Argument specified with NOXHDR keyword")
            unless ($#args == -1);
        $have_gotten_subj_line_before++;        # Don't bother trying XHDR
        return;
    }

    if ($cmd =~ m/^spool/i) {                   # Spool directory
        &conf_err($err_type, "No argument specified with SPOOL keyword")
            if ($#args < 0);
        &conf_err($err_type, "Too many arguments specified with SPOOL keyword")
            if ($#args > 0);
        &conf_err($err_type,
                  "Cannot use both NNTP and spooled news files as input")
            if ($nntpserver);
        &conf_err($err_type,
                  "Supposed spool directory $args is not a directory")
            unless (-d $args);
        &conf_err($err_type,
                  "Spool directory $args is not searchable (executable)")
            unless (-x $args);
        $spooldir = $args unless ($spooldir);
        return;
    }

    if ($cmd =~ m/^nntp/i) {                    # NNTP server
        &conf_err($err_type, "No argument specified with NNTPserver keyword")
            if ($#args < 0);
        &conf_err($err_type,
                  "Too many arguments specified with NNTPserver keyword")
            if ($#args > 0);
        $nntpserver = $args unless ($nntpserver);  # We'll validate this later
        return;
    }

    if ($cmd =~ m/^debug/i) {                   # Turn on debugging
        &conf_err($err_type, "No argument specified with DEBUG keyword")
            if ($#args < 0);
        &conf_err($err_type, "Too many arguments specified with DEBUG keyword")
            if ($#args > 0);
        &conf_err($err_type, "Debug level is not a number")
            unless ($args =~ m/^\d+$/);
        $opt_d = scalar($args) unless ($opt_d);
        return;
    }

    if ($cmd =~ m/^group/i) {                   # A group, or group list
        &conf_err($err_type, "No argument specified with GROUP keyword")
            if ($#args < 0);
        foreach $group (@args) {
            $group =~ tr/A-Z/a-z/;
            push(@Groups, $group) unless (grep(($_ eq $group), @Groups));
        }
        # Later position sensitive keywords apply to the last group listed
        $current_group = $args[$#args];
        return;
    }

    &conf_err($err_type, "Unrecognized keyword: $cmd");
}

sub conf_err {
#
#    Pretty-print configuration file errors.
#
#    Arguments: where the offending line came from, and what is wrong with
#    it.  Does not return.
#
    local($where, $what) = @_;

    print "Error in $where:\n $what.\n";
    exit(1);
}

sub load_aub_rcfile {
#
#    Load the aub_rcfile.  This file contains the last article seen in all
#    groups we were reading the last time we were invoked, as well as a list
#    of articles which were unresolved in that group.
#
#    We build the %last_article_seen and %unresolved_list associative arrays
#    from this information.

    local($group) = "";

    if (! -e $aub_rcfile) {                     # No $aub_rc file?
foreach (@Groups) {                             # Initialize stuff
            next if ($_ eq "");
            $last_article_seen{$_} = 0;         # Nothing seen yet
            $unresolved_list{$_} = "";          # No unresolved articles
        }
        return;
    }

    &abort("Can't open $aub_rcfile") if (!open(RCFILE, $aub_rcfile));
    chop(@Rc = <RCFILE>);
    close(RCFILE);

    # The file is a sequence of "group:last_article" header lines, each
    # followed by zero or more lines holding one unresolved article number.
    foreach (@Rc) {
        if (m/^([^:]+):(.+)$/) {                # Info for a new group
            $group = $1;                        # Name of group
            $last_article_seen{$group} = $2;    # Last seen in group
            $Groups_known[++$#Groups_known] = $1;
            next;
        }
        &abort("$aub_rcfile mangled...please fix or discard")
            if (($group eq "") || (!m/^\d+$/)); # Else must be a number
        $unresolved_list{$group} = join(" ", $unresolved_list{$group}, $_);
    }

    # Groups added to the configuration file since the rcfile was written
    # have no entry in it.  Give them the same starting state they would get
    # if there were no rcfile at all, rather than leaving them undefined.
    foreach (@Groups) {
        next if ($_ eq "");
        $last_article_seen{$_} = 0
            unless (defined($last_article_seen{$_}) &&
                    ($last_article_seen{$_} ne ""));
        $unresolved_list{$_} = "" unless (defined($unresolved_list{$_}));
    }
}

sub checkpoint {
#
#    We've just processed all of the new articles in a given group.  We want
#    to create a new $aub_rcfile in case someone decides to interrupt us, so
#    that all our pointers won't be lost.
#
#    When aub is run with the -n option, all checkpointing is suppressed and
#    the $aub_rcfile is never modified.  This is useful if you want to 'peek
#    ahead' and see what you'll be getting later -- not something you probably
#    want to do very often, but it's useful in debugging...especially in
#    conjunction with -c.
#
#    Anything in @Groups_known -- that is, anything that was in the $aubrc_file
#    when it was first read in -- will be copied to all subsequent versions
#    of the $aubrc_file.  However, any invalid groups introduced by new
#    data in the configuration file will have been discovered by the time this
#    code is run and the entry in @Groups removed.  So while it's true that we'll
#    forever be copying old pointers for groups we used to follow but no longer
#    follow, and for groups which were once valid but no longer are valid, we'll
#    never introduce any currently invalid group in this code.
#
#    We really don't want to get interrupted while running this code.  If we
#    catch a signal we can't block in here, it could mangle the configuration
#    file.  Buffered I/O might save us from that in a pinch, but don't count
#    on it.
#    Nobody should be sending us uncatchable signals anyways.
#
    local(%seen, $known);

    return if ($opt_n);                 # No checkpointing

    foreach (@sigs) {                   # Don't allow interruptions
        $SIG{$_} = 'IGNORE';
    }

    (open(CHECKPOINT, "> $aub_tmp")) || # This is just temporary...
        &abort("Could not open temporary file $aub_tmp for writing");

    foreach $known (@Groups_known, @Groups) {
        next if ($known eq "");         # Possible, if group invalidated
        next if ($seen{$known});        # Did this one already
        $seen{$known} = 1;              # Don't do this again

        # Now, theoretically, it's supposed to be impossible for
        # $last_article_seen{} to have something undefined in it.  But I want
        # to be really sure, because this happened to me once.  I may be
        # imagining things, or I may just be fixing a bug with a band-aid
        # instead of getting at the real cause.  Not sure which...

        $last_article_seen{$known} = "0" if ($last_article_seen{$known} eq "");

        print CHECKPOINT "$known:$last_article_seen{$known}\n";
        foreach (split(" ", $unresolved_list{$known})) {
            print CHECKPOINT "$_\n";
        }
        print " Checkpointed: $known ($unresolved_list{$known} )\n"
            if ($opt_d > 1);
    }

    # Buffered write errors (a full disk, say) only surface at close time.
    # Never replace a good rcfile with one we failed to write completely.
    close(CHECKPOINT) ||
        &abort("Could not finish writing temporary file $aub_tmp");

    &abort("Could not replace $aub_rcfile with updated data")
        unless (rename($aub_tmp, $aub_rcfile)); # This should be atomic...

    foreach (@sigs) {
        $SIG{$_} = 'handler';           # Restore normal signal handling
    }
}

#
#    Subroutines -- Miscellaneous other stuff...
#

sub get_lock {
#
#    Primitive (but effective) locking mechanism, used to guarantee that only
#    one instance of aub is running at a given time for a given user.
#
#    We write out PID to a file using echo/append.  If the first line of the
#    file is our PID, we have the lock.
#
#    If the first line of the file is someone else's PID, we don't have the
#    lock.  We check to see if that process is still around; if it isn't,
#    we'll seize the lock.
#
#    A lock file whose first line is not a process id at all (empty or
#    corrupt) is treated as stale.  Handing such a value to kill() would
#    probe process 0 -- our own process group -- which always succeeds and
#    would lock the user out for good.
#
    `$ECHO "$$" >> $aub_lock 2>/dev/null`;
    &abort("Can't write $aub_lock") unless (-e $aub_lock);
    if (!open(LOCK, $aub_lock)) {
        unlink($aub_lock);
        &abort("Couldn't open $aub_lock for reading");
    }
    chop($first_line = <LOCK>);
    close(LOCK);

    if (($first_line =~ m/^\d+$/) && ($first_line > 0)) {
        if ($first_line == $$) {
            $have_lock = 1;
            return;
        }
        print "Process $first_line seems to be already running aub.\n";
        &abort("You may not have two instances of aub running at once. Sorry")
            if (kill(0, $first_line));
        print "That process does not seem to exist any more...\n";
    } else {
        print "Lock file $aub_lock does not name a process; discarding it.\n";
    }
    unlink($aub_lock);
    &get_lock();
}

sub identical {
#
#    Return true if two files are identical to one another; false otherwise
#    I'm lazy and count on sum being around to do this for me.
#
#    If sum produces no output for either file we answer false: two failed
#    checksums are not evidence that the files match.
#
    local($f1, $f2) = @_;
    local($s1, $s2, $q1, $q2);

    ($q1 = $f1) =~ s/'/'\\''/g;         # Make names safe inside '...'
    ($q2 = $f2) =~ s/'/'\\''/g;
    chop($s1 = `$SUM '$q1'`);           # Compute checksums
    chop($s2 = `$SUM '$q2'`);
    return 0 if (($s1 eq "") || ($s2 eq ""));
    $f1 =~ s/(\W)/\\$1/g;               # Dispel regexp magic
    $f2 =~ s/(\W)/\\$1/g;
    $s1 =~ s/$f1//;                     # Remove filenames from strs
    $s2 =~ s/$f2//;
    return ($s1 eq $s2);
}

sub find_pg {
#
#    find_pg: find the specified executable on this machine, if possible.
#
#    We try using which first, assuming that if the desired executable is in
#    our path, it's the one we want.
#
#    If it's not in our path, we try whereis, returning the first program
#    whereis names for us which is executable.
#
#    If we can't find what we need, we just return our argument back if our
#    second argument is true.  If it's false, we assume that it's crucial that
#    we find the program specified, and blow up.
#
#    (The test on the second argument used to be inverted relative to the
#    paragraph above.  The callers visible in this file follow the paragraph:
#    0 is passed for echo/sum/cat/mv, and 1 for the hook program, which
#    parse_line then checks for itself.)
#
    local($pg, $optional) = @_;
    local($ex) = 1;                     # $found[0] is whereis's "name:" label
    local($try, @found, $pat);

    return $pg if ($pg =~ m/^\//);      # Absolute paths know best

    chop($try = `which $pg`);
    return $try if ($try =~ m/^\//);

    chop($try = `whereis $pg`);
    ($pat = $pg) =~ s/(\W)/\\$1/g;      # Dispel regexp magic
    if ($try =~ m/^$pat:\s+\//) {
        @found = split(/\s/, $try);
        # Stop at the end of the list; the unbounded scan never terminated
        # when nothing whereis named was executable.
        $ex++ while (($ex <= $#found) && (! -x $found[$ex]));
        return $found[$ex] if ($ex <= $#found);
    }
    return $pg if ($optional);
    &abort("Could not locate executable \"$pg\"");
}

sub abort {
#
#    Print an error message and exit.  This is for problems we can't recover
#    from.
#
#    Argument: the message, without trailing period or newline.
#
    &cleanup;
    print STDERR "$_[0].\n";
    exit(1);
}

sub warn {
#
#    Print an error message and return.  This is for things which really
#    perplex us but which we can recover from.  This might get invoked if
#    articles are expired between the time we find out they're around and
#    the time we try to access them, for instance, or if we can't open a
#    file we need to be able to open.
#
    print "$_[0].\n";
}

sub cleanup {
#
#    Cleanup code.  This could probably be improved/added-to.
#
#    Says goodbye to the server if we're connected, closes our filehandles,
#    and removes the lock (only if we hold it) and our temporary files.
#
    &putline(SOCKET, "quit") if ($connected_to_server);
    close(DESC);
    close(DECODE);
    close(SUBJECT);
    unlink($aub_lock) if ($have_lock);
    unlink($temp_decode_file);
    unlink($aub_tmp);
    close(SOCKET);
}

sub handler {
#
#    Die gracefully if interrupted by a signal.
#
    print STDERR "Dying on signal $_[0]\n";
    &cleanup;
    exit(1);
}

sub pad {
#
#    Pad a number to six characters (rather excessive, for our needs) with zeros.
#
#    sprintf does the zero fill itself, which also keeps the sign of a
#    negative number in front of the zeros.
#
    local($n) = pop(@_);

    return sprintf("%06d", $n);
}

sub Getopts {
#
#    This code was adapted (read that, stolen) from perl's getopt.pl library.
#
#    Argument: a string of option letters; a letter followed by ':' takes a
#    value.  Sets $opt_<letter> for each option found, removing it from
#    @ARGV.  Returns true if no unknown options were seen.
#
    local($argumentative) = @_;
    local(@args,$_,$first,$rest,$errs,$pos);
    local($[) = 0;

    @args = split( / */, $argumentative );
    while(($_ = $ARGV[0]) =~ /^-(.)(.*)/) {
        ($first,$rest) = ($1,$2);
        $pos = index($argumentative,$first);
        if($pos >= $[) {
            if($args[$pos+1] eq ':') {          # Option takes a value
                shift(@ARGV);
                if($rest eq '') {
                    $rest = shift(@ARGV);       # Value is the next word
                }
                eval "\$opt_$first = \$rest;";
            }
            else {                              # Plain flag
                eval "\$opt_$first = 1";
                if($rest eq '') {
                    shift(@ARGV);
                }
                else {
                    $ARGV[0] = "-$rest";        # More flags bundled behind it
                }
            }
        }
        else {
            print STDERR "Unknown option: $first\n";
            ++$errs;
            if($rest ne '') {
                $ARGV[0] = "-$rest";
            }
            else {
                shift(@ARGV);
            }
        }
    }
    $errs == 0;
}

#
#    Subroutines -- long, boring subroutines that print out lots of text.
#
#    You can't lose the documentation any more.  Beware: variable values get
#    interpolated in here...  It may be occasionally necessary to escape things.
#

sub need_to_run_h2ph {
#
#    I can't tell you how many mail messages I got about the perl "Can't locate
#    sys/socket.ph in @INC" message when aub v1.1 was released.  I don't want
#    to deal with that any more.
#
#    Prints instructions for generating sys/socket.ph and exits with status 1.
#
    print <<"EOF";

Your system is missing the library file sys/socket.ph, which is required
by aub.  This file can't be distributed with aub because its contents are
system-dependent.

sys/socket.ph is generated by h2ph, which is distributed with perl.  h2ph
takes your system's /usr/include/sys/socket.h file and generates an
analogous header file, probably /usr/local/lib/perl/sys/socket.ph.  aub
needs this file because it works with sockets.

h2ph is very straightforward -- read the man page.  Supposing that your
perl library directory is /usr/local/lib/perl, all you need to do is run

    'h2ph < /usr/include/sys/socket.h > /usr/local/lib/perl/sys/socket.ph'.

You may need to create the directory /usr/local/lib/perl/sys before you
do this.  Also, it's not uncommon for the perl library to be /usr/lib/perl
instead of /usr/local/lib/perl.

aub doesn't need this file if you set up your configuration file to use
disk-based spool files instead of the NNTP protocol.  But that's really
not recommended if NNTP is available to you; NNTP-based access to news is
much faster, and it really shouldn't be hard for you to figure this out.

Mark Stantz

EOF
    exit(1);
}

sub changes {
#
#    Print changes to aub since last version.  We hit only the high points.
#
    print <<"EOF";

Changes to aub since v$last_version:

    o Added support for disk-based (non-NNTP) access to news
    o Made aub configuration file-oriented
    o Added code to suppress generation of identical binaries
    o Added .aubrc checkpointing code
    o Added postprocessor functionality
    o Added hook functionality (allows selective decoding)
    o Added support for NNTP/XHDR
    o Improved binary recognition abilities significantly
    o Removed unnecessary newsgroup pre-validation (speed hack)
    o Folded uudecode functionality into aub itself (speed hack)
    o Folded documentation into program (convenience hack)
    o Added 'catchup' and 'no-checkpoint' command line options
    o Fixed bugs in recognition code, NNTP protocol implementation,
      signal handling

Known bugs in this version:

    o -d0 on command line does not override 'debug N' in configuration
      file.

Unknown bugs in this version:

    o Rather likely.  But you still really want to run this instead of
      v$last_version.  Trust me.

EOF
    exit(0);
}

sub short_manual {
#
#    Print out the short form of the manual...memory refresh mode...
#
#    The <...> placeholders after each keyword are reconstructed from the
#    descriptions beside them; they stand for the keyword's arguments.
#
    print <<"EOF";

Command line options:

    aub -c      Catch up on all groups, but do not assemble binaries
    aub -n      No checkpointing; don't update .aubrc
    aub -dN     Set debugging level to N
    aub -M      Print the aub manual
    aub -m      Print this summary of the manual
    aub -C      List changes since previous release of aub

Configuration keywords (for \$HOME/.aubconf):

    GROUP <group> ...
                Causes aub to process the newsgroup(s) listed
    SPOOL <directory>
                Use directory as the root of the news spool tree (4)
    NNTP <server>
                Use <server> as an NNTP news server (4)
    DIRectory <directory>
                Assemble binaries into the specified directory (1)
    DESCription <file>
                Store descriptions of assembled binaries in the specified
                file (1)
    HOOK <program>
                Decode binaries only when the specified program exits
                returning status true (zero) (1,2)
    DEFine <name> <command>
                Define a postprocessor called <name>, which will invoke
                the command <command> (3)
    POSTprocess <name> <suffix> ...
                Apply postprocessor <name> to binaries whose filenames end
                in any listed suffix (1,3)
    RECognize <suffix> ...
                Add the suffixes listed to aub's internal table of common
                suffixes
    NOXHDR      Do not use the NNTP XHDR command, even if it is understood
                by the NNTP server
    DEBUG <N>   Set the default debugging level to N

Notes:

    (1) -- If this keyword appears before any GROUP keyword, it applies to
           all groups binaries are assembled from by default.  If the
           keyword appears after a given GROUP keyword, it applies to that
           group only, and overrides any default which may earlier have
           been established with the same keyword.

    (2) -- The hook program will be passed the subject line of the lowest-
           numbered piece of the binary image on standard input.  It cannot
           be invoked with arguments (yet).

    (3) -- If the string "\$h" appears unescaped in the command, it will be
           replaced before the command is invoked with the "head" of the
           filename of the binary.  Similarly, "\$t" will be replaced with
           the "tail", and "\$f" will be replaced with the entire filename.
           The head and tail of "foo.gif" are "foo" and "gif", respectively.

    (4) -- The SPOOL and NNTP keywords may not both appear in the same
           configuration file.

EOF
    exit(0);
}

sub long_manual {
#
#
#
    print <<"EOF";
The Introducing AUB Document 1. What is aub? More and more people are posting binary files to usenet these days. Some of these binaries are executables and audio data; a majority seem to be pictures of various things, typically landscapes, movie stars and naked people. Because of limitations in the type data that usenet can accommodate, binaries must be encoded into text, and because binary files are commonly very large relative to text files usenet was designed to handle, they frequently must be broken up into pieces. Programs have been developed which take a given binary, encode it, and automatically post it in pieces with descriptive subject lines.
When this data arrives at a remote site, users see subject lines that look something like this: 12011 roadkill03.gif, part 1/4 12012 roadkill03.gif, part 3/4 12013 More pictures of tatooed children, please... 12014 Re: roadkill02.gif -- I love the way the eyes bulge out 12015 roadkill03.gif, part 4/4 12016 roseanne_nude.jpg, part 02 of 02 12017 Only BINARIES should be posted here, GOD DAMMIT 12018 roadkill03.gif, part 2/4 12019 HI, I'M BIFF!!!! THESE PIX ARE WAY COOL!!!! 12020 roseanne_nude.jpg, part 01 of 02 While the process of encoding and splitting up binaries for posting to usenet is relatively straightforward, the process of retrieving, sorting, and decoding the pieces (which do not necessarily arrive in order) at receiving sites is less straightforward, tedious, time consuming, and very prone to human error. aub, which stands for "assemble usenet binaries", automates this reassembly process for you. aub is intended for use in newsgroups to which binaries are posted exclusively. When run, it accesses news articles via either a disk-based news spool directory, or via an NNTP news server, determines whether or not any new binaries have appeared in selected newsgroups since the last time it was run, and if so, retrieves, organizes and decodes them, depositing them in a configurable location. This process requires no human intervention once aub has been configured. aub also keeps track of binaries which it has seen some, but not all, of the pieces of. It remembers how to find these old pieces, so that when new, previously missing pieces arrive at your site, it will build the entire binary the next time it is run. It also remembers which binaries it has already seen all of the pieces of already, so that it does not waste time rebuilding the same binaries over and over again. aub was created as a time saver; too many people at too many sites were spending way too much time manually unpacking binary files. 
Its ability to identify and assemble binary images depends on people posting images with subject lines that observe (loosely) established conventions. aub's recognition capabilities have been significantly improved since the earliest release. 2. How does aub work? aub looks for subject lines containing strings like: N of N N / N N \ N N | N where N is any number composed of one or more digits, and white space is optional. Once it sees such a line, it tries to figure out a name for the binary by looking at the rest of the subject line. These names are relevant only to aub's internal functioning; when unpacked, binaries are named according to the information they were encoded with. However, it's important that, whatever internal name aub decides on for the binary, that name be recognizable in the subject lines of all pieces. aub ignores all news articles with null subject lines and subject lines that begin with "Re:" regardless of other content. aub uses two files which are maintained in each user's home directory. One is \$HOME/.aubconf, which is a configuration file that allows you to customize aub's behavior. See section 5 for a detailed explanation of the structure of configuration files. The other file is \$HOME/.aubrc. You should never need to modify this file; aub creates it and maintains it. It's used to keep track of what articles in which groups aub has resolved already, and what articles aub believes to be pieces of binaries that it hasn't seen all of the pieces of yet. 3. What do I need on my system to run aub? You will need Larry Wall's perl interpreter. Older versions of aub also required David Mack's uumerge program; this functionality has since been folded into aub for the sake of speed. perl is available via anonymous FTP from uunet.uu.net, tut.cis.ohio-state.edu, and jpl-decvax.jpl.nasa.gov. Your machine must also have access to news, either via the NNTP NNTP protocol, or by being able to open raw news files on a disk somewhere. 
Previous versions of aub required that your news access be NNTP-based; this restriction has since been lifted. 4. How do I install aub? There's really only one thing that you might need to configure. aub is a perl script. The first line of the program looks like this: #!/usr/local/bin/perl This appears to tell your shell where to find the perl interpreter. If the path of perl on your system is something else, you'll need to change this line, or create a link called /usr/local/bin/perl which points to where your perl executable actually resides. If you need to change this, you'll probably see a message like: 'aub: Bad address.' when you try to run aub. 5. How do I configure aub? Older versions of aub made use of a configuration file which was normally called \$HOME/.aubinit. But few interesting customizations could be accomplished with .aubinit files, because the configuration language was so primitive. The configuration language has been redesigned to allow much greater flexibility. Old .aubinit files will no longer work, or be recognized by aub (except inasmuch as aub will notice them and point out to you that you need to create a new configuration file if you don't already have one.) The new configuration file for aub should be called \$HOME/.aubconf. Configuration files are line-oriented; each line is processed separately. If any line contains the '#' character, aub concludes that the character begins a comment, and discards the comment character and everything on the line that follows it. If for some reason you need to put a '#' character in your configuration file and do not want it to be interpreted as beginning a comment, you'll have to escape it by preceding it with a backslash character, e.g. '\\#'. Each non-blank line in a configuration file must begin with a keyword recognized by aub. The case of keywords is not significant. As far as aub is concerned, "keyword", "KEYWORD", "Keyword" and "KeYWorD" all mean the same thing. 
Some keywords require arguments; some require no arguments appear, and some permit variable numbers of arguments. If aub sees keywords it doesn't understand in your .aubconf file, it will complain to you about them. One of the keywords aub understands is the GROUP keyword. It's used to tell aub that you want to decode binaries from the newsgroup(s) which appear as argument(s) to the keyword. For example: GROUP alt.binaries.pictures.misc GROUP alt.binaries.pictures.misc alt.binaries.pictures.fractals Every configuration file must contain at least one GROUP keyword to be correct. In general, aub understands two types of keywords. One type is called 'position insensitive', which means that the keyword will have the same effect no matter where in the configuration file it appears. The other type is called 'position sensitive', which means that the keyword means something different when it appears before any GROUP keywords than it does when it appears after any given GROUP keyword. One such position sensitive keyword is the DIRectory keyword. This keyword is used to tell aub what directory to put binaries it decodes in. ("DIRectory" is spelled the way it is because only the 'DIR' part needs to appear in a configuration file for aub to recognize it. In fact, aub will interpret any keyword beginning with the letters 'DIR' as being an instance of the DIRectory keyword.) When a position sensitive keyword appears _before_ any GROUP keyword, the keyword is interpreted as being the default for all groups that appear later. When a position sensitive keyword appears _after_ any GROUP keyword, it is interpreted as applying *only* to that group, overriding any previous default which may have been established via use of the same keyword, or by the value of environment variables (see section 8.) Position sensitive keywords appearing after a GROUP keyword which lists multiple groups are applied only to the last group listed, not to all groups appearing on the group line. 
For example, the following three configuration files are equivalent: # Sample .aubconf file no. 1 -- basic example # dir /tmp/aub # Default directory group alt.binaries.pictures.misc # Process these group alt.binaries.pictures.fractals # two groups # Sample .aubconf file no. 2 -- multiple group usage, mixed case # DiR /tmp/aub # Default directory gRoUp alt.binaries.pictures.misc alt.binaries.pictures.fractals # Sample .aubconf file no. 3 -- does not use defaults # group alt.binaries.pictures.misc directory /tmp/aub group alt.binaries.pictures.fractals direct-to /tmp/aub # 'dir' is all you need The following three configuration files are also equivalent, though not equivalent to the previous three: # Sample .aubconf file no. 4 -- explicit placement of binaries # group alt.binaries.pictures.misc dir /tmp/aub/misc group alt.binaries.pictures.fractals dir /tmp/aub/fractals # Sample .aubconf file no. 5 -- explicit and default placement # dir /tmp/aub/misc # Default directory group alt.binaries.pictures.misc # Use default directory group alt.binaries.pictures.fractals dir /tmp/aub/fractals # Override default # Sample .aubconf file no. 6 -- explicit and default placement revisited # dir /tmp/aub/fractals # Default directory group alt.binaries.pictures.misc dir /tmp/aub/misc # Override default group alt.binaries.pictures.fractals # Use default directory The configuration file: # Sample .aubconf file no. 7 -- invalid # group alt.binaries.pictures.misc dir /tmp/aub group alt.binaries.pictures.fractals # No good is invalid, because no directory for aub to place binaries decoded from the newsgroup alt.binaries.pictures.fractals is specified. The DIRectory keyword is unique in this regard; there must be some use of the keyword that enables aub to figure out where to put binaries for every group specified, or it will refuse to run. The easiest way to deal with this is to always establish a default directory by using the DIRectory keyword somewhere before any groups appear. 
Other position sensitive keywords are available. DESCription <file> This keyword causes aub to extract text from what it thinks is the text portion of posted articles, and append it to the file you specify. This is useful if you're interested in reading the text that describes what all the binaries aub is unpacking are about. A maximum of 60 lines per binary extracted will be put into the file you indicate. Each description is prepended with the name of the decoded binary it refers to, and the group that binary was decoded from. HOOK <program> This keyword enables you to select which binaries aub decodes using your own software. If the HOOK keyword is specified, aub will invoke the argument program and supply it, via standard input, with the subject line of the first piece of a binary that it can potentially decode. If the program returns true (zero), aub will decode the binary. If the program returns false (non-zero), aub will skip decoding the binary, and continue processing. It is not (yet) possible to specify arguments to the user program. For example, the following sample program returns true if standard input contains the string ".gif" (case insignificant), and false otherwise. #!/usr/local/bin/perl # # /tmp/sample_aub_hook: a simple, sample hook program # \$sl = <STDIN>; # Get standard input exit(0) if (\$sl =~ m/\.gif/i); # Contains ".gif" exit(1); # Didn't see ".gif" Suppose this program were attached to aub via the configuration line: hook /tmp/sample_aub_hook Then aub would only decode binaries containing the string '.gif'. You can write hook programs in any language you choose. POSTprocess <name> <suffix> ... This keyword enables you to postprocess binaries whose names end in the string <suffix> (you can list any number of these suffixes on a single line in the configuration file.) Case is not significant in <suffix>. Before a POSTprocess keyword can appear, <name> must first be defined using the DEFine keyword, which is position insensitive. The format of the DEFine keyword is DEFine <name> <command> <name> may be any string.
It's recommended that you stick to alphanumerics. <command> is any UNIX command, with arguments. Simple substitutions are performed on <command> before it's executed in conjunction with the existence of a POSTprocess keyword and the appearance of a binary whose filename ends in one of the suffixes listed as arguments to the POSTprocess keyword. This all makes perfect sense but is a little difficult to explain. The following example should make things much clearer. Consider the following configuration file: # Sample aub configuration file demonstrating use of a postprocessor # dir /tmp/aubdir define jpg2gif djpeg -G \$f > \$h_.gif postprocess jpg2gif .jpg .jpeg group alt.binaries.pictures.misc The first line tells aub that it should decode binaries into the directory /tmp/aubdir. The second line defines a postprocessor for aub. The name of the postprocessor is specified as "jpg2gif". The third line says that the postprocessor will be invoked whenever a binary with a name ending in '.jpg' or '.jpeg' is decoded. The fourth line specifies the group that binaries are to be decoded from. Suppose the binary full_moon.jpeg is decoded from alt.binaries.pictures.misc. The binary name "full_moon.jpeg" can be thought of as consisting of three parts; the head part -- everything before the last '.' character -- the '.' character itself, and the tail part -- everything after the last '.' character. aub uses the abbreviations '\$h', '\$t', and '\$f' to refer to the head part, tail part, and entire filename, respectively. (If no '.' character appears in the name of a decoded binary, \$h equals \$f, the entire name of the binary, and \$t is empty.) Because the binary name "full_moon.jpeg" ends in ".jpeg", one of the arguments specified on line three of the sample configuration file, aub invokes the postprocessor "jpg2gif".
aub substitutes the appropriate values for '\$f' and '\$h', in this case, "full_moon.jpeg" and "full_moon" into the postprocessor definition, and executes the resulting UNIX command, which in this case is 'djpeg -G full_moon.jpeg > full_moon_.gif' Assuming that you have the djpeg program on your machine (this software is available via anonymous FTP from ftp.uu.net under the graphics/jpeg directory), this command will cause the .jpeg file to be automatically converted into a similarly named .gif file when it is decoded. A few more examples, again, based on the configuration file above: Filename of decoded binary \$h \$t \$f ------------------------------------------------------------------------------ crescent_moon.jpg crescent_moon jpg crescent_moon.jpg big.dog.gif big.dog gif big.dog.gif Filename of decoded binary Postprocessed Reason ------------------------------------------------------------------------------ crescent_moon.jpg yes \$f ends in '.jpg' big.dog.gif no \$f doesn't end in '.jpg' or in '.jpeg' Filename of decoded binary UNIX command executed ------------------------------------------------------------------------------ crescent_moon.jpg djpeg -G crescent_moon.jpg > crescent_moon_.gif big.dog.gif (none executed) We could easily have written: define jpg2gif djpeg -G \$f > \$h_.gif ; rm -f \$f to cause aub to remove the old .jpeg version of the binary after converting it to .gif format. I've added the extra underscore character in this example to decrease the chance that djpeg, when it runs, will clobber another binary which aub already unpacked with the name "full_moon.gif" or "crescent_moon.gif". Postprocessor definitions that can't be executed for some reason may cause you (and aub) some problems at run time. The following keywords are, like DEFine, position independent: NNTP <host> This tells aub that your news access is NNTP-based, and that it should use the specified host as an NNTP server.
SPOOL <directory> This tells aub that your news access is based on access to raw news files, and that <directory> is the root of the news spool tree. A single configuration file may not contain both the NNTP and SPOOL keywords. If neither the NNTP keyword nor the SPOOL keyword appears in your configuration file, aub will assume your news access is via NNTP and use your NNTPSERVER environment variable, if it is defined, to decide what server to connect to. If your NNTPSERVER environment variable is not defined, aub will try to figure out where you normally read news from. If it can't do that, it will ask you to supply the information. If you ever change the mechanism by which you access news, or the server you read news on, you'll need to remove the .aubrc file that aub maintains to keep track of what groups you have and have not read. Otherwise, because articles are numbered differently on different servers, aub will get hopelessly confused. (It's possible, though not recommended, to switch seamlessly back and forth between NNTP and SPOOL access to news on the same host.) This is probably the only time you'll ever want to tamper with a .aubrc file. DEBUG <N> Sets the default debugging level aub runs at to N. N must be a non-negative integer. Debugging level 0 is the default; when run at debugging level zero, aub produces no output unless it runs into serious problems. Setting the debugging level to 1 will tell you about what aub is doing. Setting the debugging level to 2 will tell you even more about what aub is doing. Setting the debugging level to 3 or higher will show you more than you ever wanted to know. RECognize <suffix> ... The recognition code (the part of aub that identifies binaries) maintains a list of common suffixes that it uses to recognize binaries while it scans subject lines. For example, many binaries have names ending in ".gif", so ".gif" is on aub's internal list of hints. The RECognize keyword allows you to add suffixes to this internal list of hints.
Use this capability sparingly. You can really give aub a coronary by saying something like 'rec a b c d e f g ...'. Doing something foolish like that will cause your aub to lose the ability to assemble things that it would otherwise have been able to. The current list of common suffixes aub maintains is: ".gif", ".jpg", ".jpeg", ".gl", ".zip", ".au", ".zoo", ".exe", ".dl", ".snd", ".mpg", ".mpeg", ".tiff", ".lzh", ".wav" NOXHDR This keyword is meaningful only if your news access is NNTP-based. It will cause aub to not use the XHDR command to access the subject lines of news articles, even if the NNTP server you're using has XHDR capability. If the same keyword appears multiple times, and the second appearance is not a position sensitive override of some established default, then aub ignores the second instance of the keyword. 7. How do I use aub? After you've built your configuration file, just run 'aub'. If this is the first time you've run aub since v$last_version, you may want to undefine any AUB-related environment variables you had set. These variables are interpreted differently now. See section 8. You will not need to remove your .aubrc file, but your .aubinit file is no longer useful and you'll probably want to get rid of it once you've created .aubconf. If this is the first time you've run any version of aub, ever, you may want to use the '-c' command line option. Or you may not...see section 9. 8. Environment variables used by aub. \$AUBDIR Sets the default directory binaries are unpacked into. Equivalent to specifying a DIRectory keyword before any GROUP keywords. Will override any DIRectory keyword appearing before any GROUP keyword, but not those appearing after a GROUP keyword. \$AUBDESC Analogous to \$AUBDIR \$AUBHOOK Analogous to \$AUBDIR \$NNTPSERVER Specifies an NNTP server to use for news access if no NNTP keyword appears in the configuration file. If an NNTP keyword does appear, \$NNTPSERVER is ignored.
Note that \$AUBGROUPS is no longer used as of version $version. If aub doesn't seem to be doing what you'd expect it to do based on your .aubconf file, it could be because your environment variables are causing defaults you've established there to be ignored. 9. Command line options supported by aub: -c 'Catch-up' mode; aub will bring its internal pointers (and your .aubrc file) up to date, but will not actually generate any binaries. This is useful when you run aub for the first time; it keeps it from generating megabytes and megabytes, as it scans old news articles. -n 'No-checkpoint' mode; prohibits aub from updating its internal pointers (your .aubrc file). This option is primarily useful only during debugging. -dn 'Debug' mode; sets the debugging level to N. This overrides the debugging level set in the configuration file, except that 'aub -d0' does not work...this is a bug. -M Causes aub to print the long form of the documentation (this document.) -m Causes aub to print a summary of the documentation. -C Lists significant changes since the last major release of aub. 10. What do I do if I have problems installing or configuring aub? See if you can figure out what the problem is. I've only set aub up on my local system, so it's possible you could have problems I haven't foreseen. If you really can't get it to work, try talking to a friend who knows systems programming and administration type stuff. Offer your friend food -- systems people especially like dim sum and Heineken. You could also send me mail. Whether or not I answer your mail will depend a lot on how busy I am. Sorry, but I have an obligation to get work done promptly for my client, who's paying me for my time. I can't really deal with supporting aub on the side for the entire net. Also, if your problem has to do with peculiarities of your local site, there may not be a lot I can do about it. 11. What else do I need to know?
In order to guarantee proper administration of the .aubrc file, you can only run one instance of aub at a time. In this respect aub is similar to most newsreaders. The first time you run aub over a given group, if you choose not to use the -c option, it may take a long time to run. This is because it's looking at all of the articles in the group, and building lots of binaries. After you run it for the first time, it only needs to look at new stuff in the group. Things will go much faster after that. If aub assembles two binaries with the same name, and wants to store them in the same place, it will compare them to see whether or not they're identical. If they are identical, it will discard the newer copy. If they're not identical, it will append '+' characters as necessary to the name of the second binary until the name is unique. aub checkpoints its progress in the .aubrc file after processing each group. This keeps it from having to start all over again if it dies of a signal, expired CPU time limit, etc... aub takes liberties with changing around the names of binaries that it doesn't particularly like. It may rename binaries to be called "Mangled" if people post things that are supposed to be unpacked to "." or "..", or something equally obnoxious, for instance. It will drop the leading "." off of binaries called ".something", and relativize pathnames so that your binaries always wind up in the directories you want them in. It's unfriendly to run aub so often that you occupy too much of your news server's time. It's pronounced "oww-buh", as in "S(au)di", not "awe-buh", as in "sl(aw)". This software is offered as-is, with no guarantees or promises made by me whatsoever. I disclaim all responsibility for loss or damage caused by the program. Mark Stantz stantz@sierra.stanford.edu stantz@sgi.com 8/92 EOF exit(0); } # # Subroutines -- Unused code (not invoked by anything in aub, but still here) # sub tribute { # # This has some (not much) value as a debugging aid. 
#
# NOTE(review): the argument list below does not balance its parentheses as
# written, so part of the string literals was presumably lost -- confirm
# against a pristine copy of aub before relying on this routine.
#
	&abort("", &process_line(join('&',"82G5S=\"!A;F]T:","5R('!E&STDOUT"));
}

sub debug_parser
{
#
# Debugging aid: dumps, with no formatting to speak of, the tables and
# settings that the configuration parser fills in.  Handy for checking
# that the parser works; a release build should never need to call it.
#
# Takes no arguments.  Everything is written to the currently selected
# output handle.
#

	# Per-group (or default) settings, one "key -> value" line apiece.
	for $name (keys %aub_dir) {
		print "directory $name -> $aub_dir{$name}\n";
	}
	for $name (keys %aub_desc) {
		print "desc $name -> $aub_desc{$name}\n";
	}
	for $name (keys %aub_hook) {
		print "hook $name -> $aub_hook{$name}\n";
	}

	# Postprocessor tables.
	for $name (keys %postprocessor_def) {
		print "definition $name -> $postprocessor_def{$name}\n";
	}
	for $name (keys %postprocessor) {
		print "postprocessor $name -> $postprocessor{$name}\n";
	}

	# Plain lists: everything on one line, a blank after every element.
	print "hints ";
	for $hint (@extn_hints) {
		print "$hint ";
	}
	print "\n";

	print "groups ";
	for $group_name (@Groups) {
		print "$group_name ";
	}
	print "\n";

	# Scalar settings.
	print "spool $spooldir\nnntp $server\n";
	print "debug $opt_d xhdr $have_gotten_subj_line_before\n";
}