#!/usr/local/bin/perl
#######################################################################
# $Author: jgoerzen $
# $Revision: 1.1.1.1 $
# $Date: 2000/08/19 00:28:56 $
# $Source: /var/cvs/gopher/gopherd/scripts/autokeywords,v $
# $State: Exp $
#
# Paul Lindner, University of Minnesota.
#
# Copyright 1994 by the Regents of the University of Minnesota
# see the file "Copyright" in the distribution for conditions of use.
#######################################################################
# MODULE: autokeywords
#
# A script to automatically generate a gopher+ KEYWORDS block for an item
# It can be set up to build .keywords files too..
#
# Usage: autokeywords <filename>
#
# The keywords are printed on standard output.  Nothing is printed when
# the argument is a directory, carries an ignored suffix, or yields no
# keywords.
#
#######################################################################
# Revision History:
# $Log: autokeywords,v $
# Revision 1.1.1.1  2000/08/19 00:28:56  jgoerzen
# Import from UMN Gopher 2.3.1 after GPLization
#
# Revision 1.4  1995/02/06  22:27:56  lindner
# Use dynamic space for Data_Dir, remove RunLS
#
# Revision 1.3  1994/12/15  17:34:02  lindner
# Fix cached filename, make caching default
#
# Revision 1.2  1994/12/14  22:28:02  lindner
# Add stoplist, better file support
#
#
#######################################################################

use strict;
use warnings;

#######################################################################
# Parameters you might want to set..
#
# @ignore      lists filename suffixes for which no keywords are
#              generated at all.
#
# %txtconvert  specifies programs that take non-text output and
#              output a text version.  Each value is a shell command
#              template; its single %s is replaced by the filename,
#              already quoted for the shell, so do NOT put quotes
#              around %s yourself.
#
# $cacheit     specifies that this script should automatically generate
#              .keywords files for us.  Beware: this eats disk space
#              like a pig.
#
# $MaxKeywords is the upper bound on the number of keywords emitted.
#
# @DictFiles   lists candidate locations of the system word list; the
#              first one that can be opened is used.
#######################################################################
#
my @ignore = ('.dct', '.inv', '.doc');

my %txtconvert = (
    '.ps'    => 'ps2ascii %s',
    '.html'  => 'lynx -dump %s',
    '.tar.Z' => 'zcat %s | tar tvf -',
    '.zip'   => 'unzip -l %s',
    '.tex'   => 'detex %s',
);

my $cacheit     = 1;
my $MaxKeywords = 20;

my @DictFiles = ('/usr/dict/words', '/usr/share/dict/words');
#
#######################################################################

# State shared between the main flow and the subroutines below.
my $sl;          # newline-separated stop words, set by SnarfStopList
my %StopList;    # stop word          => 1
my %Dict;        # dictionary entry   => 1
my %Words;       # lower-cased word   => accumulated weight

my $filenamebase = shift(@ARGV);
die "Need a filename..."
    unless defined($filenamebase) && length($filenamebase);

# Extend the search path for the converter programs.  Empty components
# are dropped so that an unset PATH does not put the current directory
# on the search path.
$ENV{'PATH'} = join(':',
    grep { defined($_) && length($_) }
        $ENV{'PATH'},
        '/usr/local/bin', '/usr/gnu/bin', '/usr/local/harvest/bin');

exit if (-d $filenamebase);

# Serve a previously cached keyword list when there is one.
my $cachefile = "$filenamebase.keywords";
if (-f $cachefile) {
    if (open(my $cache, '<', $cachefile)) {
        while (my $line = <$cache>) {
            print $line;
        }
        close($cache);
        exit;
    }
    # An unreadable cache is not fatal: fall through and regenerate.
    warn "autokeywords: cannot read $cachefile: $!\n";
}

# Suffixes are literal strings, not patterns, hence the \Q...\E.
foreach my $suffix (@ignore) {
    exit if ($filenamebase =~ /\Q$suffix\E$/);
}

my $fname = FindTXTfile($filenamebase);
exit unless defined($fname);

# Read through a converter when the suffix has one, else read the file
# directly.
my ($convsuffix) = grep { $fname =~ /\Q$_\E$/i } sort keys(%txtconvert);
my $in;
if (defined($convsuffix)) {
    my $cmd = sprintf($txtconvert{$convsuffix}, ShellQuote($fname));
    unless (open($in, '-|', $cmd)) {
        warn "autokeywords: cannot run '$cmd': $!\n";
        exit;
    }
}
else {
    unless (open($in, '<', $fname)) {
        warn "autokeywords: cannot read $fname: $!\n";
        exit;
    }
}

InitStopList();
InitDict();

#
# Summarize a file for keywords
#
while (my $line = <$in>) {
    foreach my $word (split(' ', $line)) {
        next if (length($word) < 3);
        next if ($word !~ /^[A-Z\-a-z]+$/);    # skip numbers and weird stuff
        next if ($word =~ /^-+$/);             # nothing but dashes

        my $lower = lc($word);
        next if ($StopList{$lower});

        my $weight = 1;
        $weight += 3 if ($word =~ /^[A-Z]/);      # Start with Cap
        $weight++    if ($word =~ /^[A-Z]+$/);    # All CAPS
        $weight += 3 if (!isinDict($lower));      # Not in Dictionary

        $Words{$lower} += $weight;
    }
}
close($in);

# Only words scoring strictly above $highend are kept.  $highend is the
# score three quarters of the way up the sorted list, raised when
# necessary so that no more than $MaxKeywords words can exceed it.
my @scores = sort { $a <=> $b } values(%Words);
exit unless @scores;

my $highend = $scores[ int(3 * ($#scores / 4)) ];
if (($#scores / 4) > $MaxKeywords) {
    $highend = $scores[ $#scores - $MaxKeywords ];
}

# Emit the surviving words alphabetically, wrapped at about 70 columns.
my $keywords = '';
my $linelen  = 0;
foreach my $word (sort(keys(%Words))) {
    next if ($Words{$word} < ($highend + 1));

    if ($linelen + length($word) > 70) {
        $keywords .= "\n";
        $linelen = 0;
    }
    $keywords .= "$word ";
    $linelen += length($word) + 1;
}

if ($keywords =~ /[A-Za-z]/) {
    print "$keywords\n";

    if ($cacheit) {
        if (open(my $out, '>', $cachefile)) {
            print {$out} "$keywords\n";
            close($out)
                or warn "autokeywords: cannot write $cachefile: $!\n";
        }
        else {
            warn "autokeywords: cannot create $cachefile: $!\n";
        }
    }
}

#
# Quote a string so the shell sees it as one literal word.
#
sub ShellQuote {
    my ($arg) = @_;

    $arg =~ s/'/'\\''/g;
    return "'$arg'";
}

#
# Load /usr/dict/words
#
# Fills %Dict from the first readable file in @DictFiles.  Entries
# shorter than three characters are skipped because such words are
# never scored.  %Dict stays empty when no word list can be opened.
#
sub InitDict {
    foreach my $path (@DictFiles) {
        open(my $dict, '<', $path) or next;
        while (my $entry = <$dict>) {
            chomp($entry);
            next if (length($entry) < 3);
            $Dict{$entry} = 1;
        }
        close($dict);
        return;
    }
    return;
}

#
# Map the requested name onto the file that should be scanned.
#
# Returns the argument itself when it is a plain file.  Otherwise the
# directory part is searched for plain files whose name starts with the
# last path component; the first one (in sorted order) that has a
# %txtconvert suffix or ends in "txt" is returned, failing that the
# first candidate.  Returns undef when nothing matches.
#
sub FindTXTfile {
    my ($fname) = @_;

    return($fname) if (-f $fname);

    #
    # Probably not a text file...
    #
    my ($base, $f);
    if ($fname =~ m{^(.*)/([^/]+)$}) {
        ($base, $f) = ($1, $2);
        $base = '/' if ($base eq '');
    }
    elsif ($fname !~ m{/}) {
        ($base, $f) = ('.', $fname);
    }
    else {
        return;    # trailing slash on something that is not a directory
    }

    opendir(my $dir, $base) or return;
    my @candidates = sort grep { /^\Q$f\E/ && -f "$base/$_" } readdir($dir);
    closedir($dir);

    foreach my $name (@candidates) {
        foreach my $suffix (keys(%txtconvert)) {
            return("$base/$name") if ($name =~ /\Q$suffix\E$/i);
        }
        return("$base/$name") if ($name =~ /txt$/i);
    }

    return("$base/$candidates[0]") if (@candidates);
    return;
}

#
# Return 1 when the (lower-cased) word, or a crude stem of it, is in
# %Dict, else 0.  The stem is the word without a trailing "s", or with
# a trailing "ed" turned into "e" ("used" -> "use").
#
sub isinDict {
    my ($word) = @_;

    return(1) if ($Dict{$word});

    my $stem;
    if ($word =~ /^(.*)s$/) {
        $stem = $1;
    }
    elsif ($word =~ /^(.*)ed$/) {
        $stem = $1 . "e";
    }

    return(1) if (defined($stem) && $Dict{$stem});
    return(0);
}

#
# Fill %StopList from the newline-separated list in $sl.
#
sub InitStopList {
    SnarfStopList();
    foreach my $stopword (split(/\n/, $sl)) {
        $StopList{$stopword} = 1;
    }
    return;
}

#
# Store the built-in stop words in $sl, one per line, and return them.
#
sub SnarfStopList {
    $sl = join("\n", qw(
        above according across actually adj after afterwards again against
        all almost alone along already also although always among amongst
        and another any anyhow anyone anything anywhere are around became
        because become becomes becoming been before beforehand begin
        beginning behind being below beside besides between beyond billion
        both but can cannot caption could did does down during each eight
        eighty either else elsewhere end ending enough etc even ever every
        everyone everything everywhere except few fifty first five for
        former formerly forty found four from fri further had has hasn't
        have haven't hence here here's hereafter hereby herein hereupon
        hers herself him himself his how however hundred inc. indeed
        instead into its itself last later latter latterly least less let
        like likely ltd made make makefile makes many maybe meantime
        meanwhile might million miss mon more moreover most mostly mrs much
        must myself namely neither never nevertheless new nine ninety
        nobody none nonetheless noone nor not nothing now nowhere off often
        once one one's only onto other others otherwise our ours ourselves
        out over overall own per perhaps rather recent recently same sat
        seem seemed seeming seems seven seventy several she should
        shouldn't since six sixty some somehow someone something sometime
        sometimes somewhere still stop such taking ten than that the their
        them themselves then thence there thereafter thereby therefore
        therein thereupon these they thirty this those though thousand
        three through throughout thru thu thus thur together toward towards
        trillion tue twenty two under unless unlike unlikely until upon use
        used using very via was wed were what whatever when whence whenever
        where whereafter whereas whereby wherein whereupon wherever whether
        which while whither who whoever whole whom whomever whose why will
        with within without would yes yet you your yours yourself
        yourselves
    )) . "\n";
    return($sl);
}