Article 2366 of alt.sources: Xref: feenix.metronet.com alt.sources:2366 Newsgroups: alt.sources Path: feenix.metronet.com!news.utdallas.edu!hermes.chpc.utexas.edu!cs.utexas.edu!uunet!decwrl!decwrl!netcomsv!netcom.com!jolomo From: jolomo@netcom.com (Joe Morris) Subject: newser - acc. news browser Message-ID: Summary: Browses through saved News and Mail Keywords: news mail perl Organization: NETCOM On-line Communication Services (408 241-9760 guest) X-Newsreader: TIN [version 1.1 PL8] Date: Thu, 2 Dec 1993 23:31:07 GMT Lines: 448 Perl program that treats those accumulated News and Mail files as a sort of database. Please mail any modifications to jolomo@netcom.com #!/usr/bin/perl # # newser -- a News and Mail query browser. Joe Morris jolomo@netcom.com # # $Id: newser,v 1.16 1993/12/02 20:20:10 joe Exp $ # # SYNTAX sub Usage { print "Usage: $0 [-f from_string] [-s subject_string] [search_string]\n"; print " Wrap double quotes around regex strings\n"; exit(1); } # # DESCRIPTION # The raw materials for this program are files where you've appended all # your online goodies. All those articles that you thought were worth # saving but now are taking 100MB of disk space and all you can do # is grep around in them. BLAH :( # NEWSER treats each of these files as a series of discreet messages, # you will be much better off if you just appended these things one after # the other, never editing the area which really defines a message --- # The Header. (Formal description of a "message" is in Notes section). # The USENET digests I've tried have worked fine. # With NEWSER, you can search on specific subjects, authors (From lines), # and things within the body of the message. A search for the word "From" # in the body will not match the "From"'s in headers, searches for # a specific subject will only match Subject lines in the header. # This program was prompted by a recent discussion on # alt.folklore.computers where it seemed that this sort of thing is # not readily available today -- so here's my 300 lines worth. # # This is alpha, it's been run successfully on Linux(4MB ram :), SUN and AIX. # # ENVIRONMENT # NDIRS (news-dirs) a PATH-like variable to searched for your accumulated # articles and e-mail. Defaults to "$HOME/News:$HOME/Mail". # PAGER if unset or empty, defaults to "less". # # NOTES, Questions and Bugs # 0. It runs about as quickly as you would expect. Perl experts please mail # me useful optimizations -- especially in got_a_match(). # 1. Probably am overusing global variables # 2. Does anyone have a regex to handle most every "Date:" format that # shows up in News and Mail -- To do date ranges. I haven't really needed # this feature, is it useful? # 3. What's the best way to implement ignore case matches -- with evals or # more conditional statements or something else? I work around this now # with regex's like: $ newser -s "[Hh][Ee][Ll][Pp]" # 4. Right now all search criteria are ANDed together. # 5. Quitting the PAGER before search is done will abort the program. # 6. How can I position the PAGER at the most recently viewed message? # 7. If you don't already use "less", this would be a good time to learn it. # 8. What is a News (or Mail) message: # a. Top of file or blank line # b. header -- at least three lines (can't have a blank line in between) # must be matched by patterns in handle_head_line(). # You can have any number of these in the header # and any order is ok # c. A blank line # d. body of message -- optional # any number of lines with one possible fatal error: # a blank line followed by 3 lines matched by handle_head_line(). # Any blank line before the magic "3" will recover from error. # 9. Please mail me any rewrites or additions -- jolomo@netcom.com # require "getopts.pl"; require "assert.ph"; require "flush.pl"; #$DEBUG="yes"; # default environment stuff $home=$ENV{'HOME'}; $DEFDIRS = "$home/News:$home/Mail"; if (length($ENV{'NDIRS'}) == 0) { $ENV{'NDIRS'} = $DEFDIRS; } $DEFPAGER = "less"; if (length($ENV{'PAGER'}) == 0) { $ENV{'PAGER'} = $DEFPAGER; } $PAGER = $ENV{'PAGER'}; # main() # # Define scope of From && Subject lines (META support date ranges??) # &Getopts('f:s:'); # From, Subject # Must have at least one author, subject or string if (!$opt_f && !$opt_s) { $norm_scope = shift || &Usage(); } $from_scope = $opt_f; $subj_scope = $opt_s; # Arbitrary number of possible search paths, probably way too big @dbdirs = split(/:/, $ENV{'NDIRS'}, 999); open (SHOWU, "|$PAGER"); print SHOWU "Looking for: \n"; $from_scope && print SHOWU " From lines matching $from_scope\n"; $subj_scope && print SHOWU " Subject lines matching $subj_scope\n"; $norm_scope && print SHOWU " Message lines matching $norm_scope\n"; print SHOWU "\nSearching....\n\n"; &flush(SHOWU); # Number of matching messages $match_count = 0; foreach $dbdir (@dbdirs) { opendir(DIR,$dbdir) || ((warn "Can't process $dbdir: $!\n"), next); @files = sort grep(!/^\./, readdir(DIR)); closedir(DIR); foreach $dbfile (@files) { $infile = "$dbdir/$dbfile"; &get_the_message(); } } print SHOWU "Done Searching...Quit PAGER to continue\n"; if ($match_count < 1) { print SHOWU "\n\nNo matches found\n\n"; } close (SHOWU); # Lovely user interface while (1) { # Grab ze response print "Which Message:"; $worthy_input = 0; $choice = (); for ($i = 1; $i <= $match_count; $i++) { if ($choice == $i) { $worthy_input = 1; &activate_message($i); open (SHOWU, "|$PAGER"); print SHOWU "MESSAGE $i\n\n"; &display_message(SHOWU); close (SHOWU); } } # Did I read garbage??? if ($worthy_input < 1) { print "Take it easy\n"; exit(0); } open (SHOWU, "|$PAGER"); print SHOWU "Quit PAGER. Enter Message# or enter garbage to exit\n\n"; for ($i = 1; $i <= $match_count; $i++) { &activate_message($i); printf (SHOWU "$i - %-30.30s %-40.40s\n",$From_Line,$Date_Line); printf (SHOWU " %.70s\n",$Subj_Line); printf (SHOWU " in file %.60s\n\n",$Filename); } close (SHOWU); } # End Main sub get_the_message { # Make sure it's a text file before searching for matches if ( -T $infile ) { open(INF,$infile) || ((warn "Can't process $infile: $!\n"), return()); &got_a_match(); close(INF); } } sub got_a_match { # # The evals used here I think are necessary for the optional case # insensitivity -- would it be cheaper to have additional # conditionals instead? # # # Many times a file even starts with a header # $likely_header = 1; $in_header = 0; $in_message = 0; $blank_line = 1; $real_top = 0; $tmp_top = 0; $bottom = 1; while($cur_line = ) { chop $cur_line; $DEBUG && print SHOWU "$likely_header.$in_header.$in_message.$blank_line.$tmp_top.$real_top.$bottom.$infile.",substr($cur_line,0,20),"\n"; if ($cur_line =~ /\S+/) { if ($in_header < 1) { if ($likely_header > 1) { # # We've already got one match, this could be a header # $blank_line = 0; $key_word = &handle_head_line($cur_line); if ($key_word ne "0") { ++$likely_header; if ($likely_header > 2) { # # assume this is a header and we're not # in a message -- not likely anymore, it's fact # # Pop off any header lines at tail of message while (pop(@this_message) =~ /\S+/) { ; } $likely_header = 0; $in_header = 1; $in_message = 0; } } } elsif ($blank_line > 0 || $likely_header > 0) { # # had a blank line -- header?? # $blank_line = 0; $key_word = &handle_head_line($cur_line); if ($key_word ne "0") { # # got 1 match so far, save this line as top of # header. Bump likeliness to stage two and clear # out header holder # while ($temp_head[0] =~ /\S+/) { shift(@temp_head); } ++$likely_header; if ($likely_header == 1) { $tmp_top = tell - length($cur_line) - 1; $bottom = tell; } } else { # # Where are we? try message # #if ($likely_header < 1) { $in_message = 1; } } } else { ++$in_message; $blank_line = 0; } } else { ++$in_header; $blank_line = 0; } } else { # # each blank line not following header is a harbinger of # another header -- set likely header to stage 1 # if ($in_header > 0) { $in_header = 0; # Was previous message a keeper? if (&this_matches() > 0) { &light_match(); } undef @this_message; # New message header @this_message = @temp_head; undef @temp_head; # store a separator push(@this_message,"MESSAGE STARTS HERE"); $in_message = 1; $blank_line = 0; $real_top = $tmp_top; } else { $blank_line = 1; } } # any header stuff this might push will get undef'd if ($in_message > 0 ) { push(@this_message,"$cur_line"); } # Save for header push(@temp_head,"$cur_line"); $DEBUG && print SHOWU "$likely_header.$in_header.$in_message.$blank_line.$tmp_top.$real_top.$bottom.$infile.",substr($cur_line,0,20),"\n"; } # Catch the last message of the file, if it's a keeper $bottom = tell; if (&this_matches() > 0) { &light_match(); } undef @this_message; } sub handle_head_line { local($head_line) = @_; $ret_val = "0"; if ($head_line =~ /^Crossposted-To:/) { $ret_val = "c"; } elsif ($head_line =~ /^Date:/) { $ret_val = "d"; } elsif ($head_line =~ /^From[: ]/) { $ret_val = "f"; } elsif ($head_line =~ /^Message-ID:/) { $ret_val = "m"; } elsif ($head_line =~ /^Received:/) { $ret_val = "r"; } elsif ($head_line =~ /^Reply-To:/) { $ret_val = "a"; } elsif ($head_line =~ /^Subject:/) { $ret_val = "s"; } elsif ($head_line =~ /^To:/) { $ret_val = "t"; } elsif ($head_line =~ /^X-Mailer:/) { $ret_val = "x"; } return ($ret_val); } sub this_matches { # Assume we got a full match until we know we didn't $met_my_match = 1; $now_in_message = 0; $from_match = 0; $subj_match = 0; $norm_match = 0; for (@this_message) { /^MESSAGE STARTS HERE$/ && ($now_in_message = 1); $try_line = $_; if ($now_in_message < 1) { $m_type = &handle_head_line($try_line); if ($m_type eq "s" && ($this_subj = $try_line)) { if ($subj_scope && $try_line =~ /^Subject:.*$subj_scope/) { $subj_match = 1; } } elsif ($m_type eq "f" && ($this_from = $try_line)) { $this_from =~ /^From[: ]*(\S*)/; $this_from = $1; if ($from_scope && $try_line =~ /^From[: ].*$from_scope/) { $from_match = 1; } } else { ($m_type eq "d") && ($this_date = $try_line); } } elsif ($norm_scope && $try_line =~ /$norm_scope/) { $norm_match = 1; } $DEBUG && print SHOWU "MM.$now_in_message.$m_type.$from_scope.$from_match.$subj_scope.$subj_match.$norm_scope.$norm_match.$try_line\n"; } # Make sure we got every match we needed if ($from_scope && $from_match < 1) { $met_my_match = 0; } if ($subj_scope && $subj_match < 1) { $met_my_match = 0; } if ($norm_scope && $norm_match < 1) { $met_my_match = 0; } $DEBUG && print SHOWU "MM. $met_my_match - non-Zero if I met my match\n"; return ($met_my_match); } sub light_match { # # got a matching pattern -- must find limits of this messgage and load # the "pseudononymous" :) # ++$match_count; &activate_message($match_count); $Filename = $infile; $Date_Line = $this_date; $From_Line = $this_from; $Subj_Line = $this_subj; $TOM = $real_top; $BOM = $bottom; printf (SHOWU "$match_count - %-30.30s %-40.40s\n",$From_Line,$Date_Line); printf (SHOWU " %.70s\n",$Subj_Line); printf (SHOWU " in file %.60s\n\n",$Filename); &flush(SHOWU); } sub display_message { open(MFILE,$Filename) || ((warn "Can't process $Filename: $!\n"), return()); seek(MFILE,$TOM,0); while() { last if (tell(MFILE) > $BOM); push(@mess_ary,$_); } close(MFILE); # Pop off any header lines at tail of message while (pop(@mess_ary) =~ /\S+/) { ; } # Display to the PAGER pipe for (@mess_ary) { print SHOWU "$_"; } undef @mess_ary; } sub gensym { 'gensym_' . ++$gensym'symbol; } sub activate_message { # # This was smilingly stolen from tchrist -- faq # local($message) = @_; &assert('$message',$message); $Last_Seq = $Current_Seq; if (! defined $Active_Messages{$message}) { $Active_Messages{$message} = &gensym; push(@Active_Messages, $message); } local($package) = $Active_Messages{$message}; local($code)=<<"EOF"; { package $package; *'Message_ID = *Message_ID; *'Filename = *Filename; *'Date_Line = *Date_Line; *'From_Line = *From_Line; *'Subj_Line = *Subj_Line; *'TOM = *TOM; *'BOM = *BOM; } EOF eval $code; $Current_Seq = $message; &panic("bad eval: $@\n$code\n") if $@; } -- -joe | "We're all Bozos on this Bus" --firesigns | "I'm suffering from software bloat!" Joe Morris | "New things must be allowed to come up instead jolomo@netcom.com | of being slapped down" --Jane Jacobs .