bmf.c - bmf - bmf (Bayesian Mail Filter) 0.9.4 fork + patches
 (HTM) git clone git://git.codemadness.org/bmf
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       bmf.c (9060B)
       ---
            1 /* $Id: bmf.c,v 1.20 2002/10/20 18:19:17 tommy Exp $ */
            2 
            3 /*
            4  * Copyright (c) 2002 Tom Marshall <tommy@tig-grr.com>
            5  *
            6  * This program is free software.  It may be distributed under the terms
            7  * in the file LICENSE, found in the top level of the distribution.
            8  *
            9  * bmf.c: top level Bayesian mail filter app.
           10  */
           11 
           12 #include "config.h"
           13 #include "dbg.h"
           14 #include "str.h"
           15 #include "lex.h"
           16 #include "vec.h"
           17 #include "dbh.h"
           18 #include "filt.h"
           19 
           20 #define PACKAGE "bmf"
           21 
           22 /* modes of operation (mutually exclusive) */
           23 typedef enum {
           24         mode_test,                /* test and produce report */
           25         mode_normal,                /* test and register result */
           26         mode_reg_s,                /* register as spam */
           27         mode_reg_n,                /* register as non-spam */
           28         mode_n_to_s,                /* undo non-spam registration and register as
           29                                  * spam */
           30         mode_s_to_n,                /* undo spam registration and register as
           31                                  * non-spam */
           32         /* test and product report in bulk, read file list from stdin, output TAB-separated */
           33         mode_bulk
           34 } runmode_t;
           35 
           36 static void
           37 usage(void)
           38 {
           39         printf("\n"
           40                "Usage: " PACKAGE " [mode] [options]\n"
           41                "\n"
           42                "Modes of operation (mutually exclusive; the last one specified is used):\n"
           43                "\t\tRegister message using historical data if no mode is specified.\n"
           44                "\t-b\tBulk test mode, read file list from stdin, output file, TAB, spamicity score per line.\n"
           45                "\t-n\tRegister message as non-spam.\n"
           46                "\t-s\tRegister message as spam.\n"
           47                "\t-N\tRegister message as non-spam and undo prior registration as spam.\n"
           48                "\t-S\tRegister message as spam and undo prior registration as non-spam.\n"
           49                "\t-t\tTest mode, print report and do not save results.\n"
           50                "\n"
           51                "Other options:\n"
           52                "\t-d db\tSpecify database or directory name.\n"
           53                "\t-k n\tSpecify count of extrema to use (keepers), default is 15.\n"
           54                "\t-m type\t[DEPRECATED] Specify mail storage format (mbox|maildir)\n"
           55                "\t-p\tPassthrough mode, like SpamAssassin.\n"
           56                "\t-v\tIncrease verbosity level.\n"
           57                "\t-V\tShow version information and exit.\n"
           58                "\t-h\tShow this message and exit.\n"
           59                "\n");
           60         exit(2);
           61 }
           62 
           63 static void
           64 version(void)
           65 {
           66         printf("\n"
           67                PACKAGE " version " VERSION " - a Bayesian mail filter\n"
           68                "Copyright (c) 2002 Tom Marshall\n"
           69                "\n"
           70                PACKAGE " comes with ABSOLUTELY NO WARRANTY.\n"
           71                "This is free software.  You are welcome to redistribute it under the terms\n"
           72                "of the GNU General Public License.  See the file LICENSE in the source\n"
           73                "distribution, or visit http://www.gnu.org/licenses/gpl.html\n"
           74                "\n");
           75         exit(2);
           76 }
           77 
           78 int
           79 main(int argc, char **argv)
           80 {
           81         char *dbname = NULL;
           82         bool_t rdonly;
           83         runmode_t mode = mode_normal;
           84         mbox_t mboxtype = detect;
           85         bool_t do_passthru = false;
           86         dbhtext_t *pdb;
           87         dbt_t *pblist, *pglist, *ptable;
           88         vec_t mlist;
           89         stats_t stats;
           90         lex_t lex;
           91         tok_t tok;
           92         bool_t is_spam = false;
           93         int ch;
           94         char *line = NULL;
           95         size_t linesiz = 0;
           96         ssize_t n;
           97 
           98         int fd = STDIN_FILENO;
           99 
          100         if (pledge("stdio rpath wpath cpath flock unveil", NULL) == -1)
          101                 err(1, "pledge");
          102 
          103         srand(time(NULL));
          104 
          105         stats.keepers = DEF_KEEPERS;
          106         while ((ch = getopt(argc, argv, "NSVd:hk:m:bnpstv")) != EOF) {
          107                 switch (ch) {
          108                 case 'b':
          109                         mode = mode_bulk;
          110                         break;
          111                 case 'N':
          112                         mode = mode_s_to_n;
          113                         break;
          114                 case 'S':
          115                         mode = mode_n_to_s;
          116                         break;
          117                 case 'V':
          118                         version();
          119                         break;        /* NOTREACHED */
          120                 case 'd':
          121                         free(dbname);
          122                         if (!(dbname = strdup(optarg))) {
          123                                 perror("strdup()");
          124                                 exit(2);
          125                         }
          126                         break;
          127                 case 'h':
          128                         usage();
          129                         break;        /* NOTREACHED */
          130                 case 'k':
          131                         stats.keepers = atoi(optarg);
          132                         break;
          133                 case 'm':
          134                         if (strcasecmp(optarg, "mbox") == 0) {
          135                                 mboxtype = mbox;
          136                         } else if (strcasecmp(optarg, "maildir") == 0) {
          137                                 mboxtype = maildir;
          138                         } else {
          139                                 usage();
          140                         }
          141                         break;
          142                 case 'n':
          143                         mode = mode_reg_n;
          144                         break;
          145                 case 'p':
          146                         do_passthru = true;
          147                         break;
          148                 case 's':
          149                         mode = mode_reg_s;
          150                         break;
          151                 case 't':
          152                         mode = mode_test;
          153                         if (pledge("stdio rpath cpath flock unveil", NULL) == -1)
          154                                 err(1, "pledge");
          155                         break;
          156                 case 'v':
          157                         g_verbose++;
          158                         verbose(1, "Verbose level now %u\n", g_verbose);
          159                         break;
          160                 default:
          161                         usage();
          162                 }
          163         }
          164         stats.extrema = (discrim_t *) malloc(stats.keepers * sizeof(discrim_t));
          165 
          166         rdonly = (mode == mode_test) ? 1 : 0;
          167 
          168         /* create directory if it doesn't exist yet, when dbname is NULL or empty use $HOME/.bmf */
          169         pdb = dbtext_db_open(dbname, rdonly);
          170         if (pdb == NULL) {
          171                 fprintf(stderr, "%s: cannot open database\n", argv[0]);
          172                 exit(2);
          173         }
          174 
          175         /* bulk mode */
          176         if (mode == mode_bulk) {
          177                 pblist = pdb->opentable(pdb, "spamlist", rdonly);
          178                 if (pblist == NULL) {
          179                         fprintf(stderr, "%s: cannot open spamlist\n", argv[0]);
          180                         exit(2);
          181                 }
          182                 pglist = pdb->opentable(pdb, "goodlist", rdonly);
          183                 if (pglist == NULL) {
          184                         fprintf(stderr, "%s: cannot open goodlist\n", argv[0]);
          185                         exit(2);
          186                 }
          187 
          188                 while ((n = getline(&line, &linesiz, stdin)) > 0) {
          189                         if (line[n - 1] == '\n')
          190                                 line[--n] = '\0';
          191 
          192                         if ((fd = open(line, O_RDONLY)) == -1)
          193                                 err(1, "open: %s", line);
          194 
          195                         memset(stats.extrema, 0, stats.keepers * sizeof(discrim_t));
          196 
          197                         lex_create(&lex, mboxtype);
          198                         if (!lex_load(&lex, fd)) {
          199                                 fprintf(stderr, "%s: cannot read input\n", argv[0]);
          200                                 exit(2);
          201                         }
          202                         lex_nexttoken(&lex, &tok);
          203                         if (tok.tt == eof) {
          204                                 fprintf(stderr, "%s: no input available\n", argv[0]);
          205                                 exit(2);
          206                         }
          207 
          208                         while (tok.tt != eof) {
          209                                 /* TODO: vec_create at top, vec->nitems = 0, but keep allocated buffers */
          210                                 vec_create(&mlist);
          211 
          212                                 bvec_loadmsg(&mlist, &lex, &tok);
          213                                 bayesfilt(pglist, pblist, &mlist, &stats);
          214 
          215                                 vec_destroy(&mlist);
          216 
          217                                 printf("%s\t%f\n", line, stats.spamicity);
          218                         }
          219 
          220                         lex_destroy(&lex);
          221 
          222                         close(fd);
          223                 }
          224 
          225                 pglist->close(pglist);
          226                 free(pglist);
          227                 pblist->close(pblist);
          228                 free(pblist);
          229 
          230                 pdb->close(pdb);
          231                 free(pdb);
          232 
          233                 free(stats.extrema);
          234 
          235                 return 0;
          236         }
          237 
          238         lex_create(&lex, mboxtype);
          239         if (!lex_load(&lex, fd)) {
          240                 fprintf(stderr, "%s: cannot read input\n", argv[0]);
          241                 exit(2);
          242         }
          243         lex_nexttoken(&lex, &tok);
          244         if (tok.tt == eof) {
          245                 fprintf(stderr, "%s: no input available\n", argv[0]);
          246                 exit(2);
          247         }
          248 
          249         if (mode == mode_test) {
          250                 pblist = pdb->opentable(pdb, "spamlist", rdonly);
          251                 if (pblist == NULL) {
          252                         fprintf(stderr, "%s: cannot open spamlist\n", argv[0]);
          253                         exit(2);
          254                 }
          255                 pglist = pdb->opentable(pdb, "goodlist", rdonly);
          256                 if (pglist == NULL) {
          257                         fprintf(stderr, "%s: cannot open goodlist\n", argv[0]);
          258                         exit(2);
          259                 }
          260                 if (pledge("stdio", NULL) == -1)
          261                         err(1, "pledge");
          262         }
          263         while (tok.tt != eof) {
          264                 if (mboxtype == mbox && tok.tt != from) {
          265                         fprintf(stderr, "%s: input does not look like an mbox message\n", argv[0]);
          266                         exit(2);
          267                 }
          268                 if (mode != mode_test) {
          269                         pblist = pdb->opentable(pdb, "spamlist", rdonly);
          270                         if (pblist == NULL) {
          271                                 fprintf(stderr, "%s: cannot open spamlist\n", argv[0]);
          272                                 exit(2);
          273                         }
          274                         pglist = pdb->opentable(pdb, "goodlist", rdonly);
          275                         if (pglist == NULL) {
          276                                 fprintf(stderr, "%s: cannot open goodlist\n", argv[0]);
          277                                 exit(2);
          278                         }
          279                 }
          280                 vec_create(&mlist);
          281                 bvec_loadmsg(&mlist, &lex, &tok);
          282 
          283                 switch (mode) {
          284                 case mode_test:
          285                         bayesfilt(pglist, pblist, &mlist, &stats);
          286                         is_spam = (stats.spamicity > SPAM_CUTOFF);
          287                         break;
          288                 case mode_normal:
          289                         bayesfilt(pglist, pblist, &mlist, &stats);
          290                         is_spam = (stats.spamicity > SPAM_CUTOFF);
          291                         ptable = (is_spam ? pblist : pglist);
          292                         svec_sort(&mlist);
          293                         if (!ptable->mergeclose(ptable, &mlist)) {
          294                                 fprintf(stderr, "%s: cannot merge/save list\n", argv[0]);
          295                                 exit(2);
          296                         }
          297                         break;
          298                 case mode_reg_s:
          299                         stats.spamicity = 1.0;
          300                         is_spam = true;
          301                         svec_sort(&mlist);
          302                         if (!pblist->mergeclose(pblist, &mlist)) {
          303                                 fprintf(stderr, "%s: cannot merge/save list\n", argv[0]);
          304                                 exit(2);
          305                         }
          306                         break;
          307                 case mode_reg_n:
          308                         stats.spamicity = 0.0;
          309                         is_spam = false;
          310                         svec_sort(&mlist);
          311                         if (!pglist->mergeclose(pglist, &mlist)) {
          312                                 fprintf(stderr, "%s: cannot merge/save list\n", argv[0]);
          313                                 exit(2);
          314                         }
          315                         break;
          316                 case mode_n_to_s:
          317                         stats.spamicity = 1.0;
          318                         is_spam = true;
          319                         svec_sort(&mlist);
          320                         if (!pblist->mergeclose(pblist, &mlist) ||
          321                             !pglist->unmergeclose(pglist, &mlist)) {
          322                                 fprintf(stderr, "%s: cannot merge/save list\n", argv[0]);
          323                                 exit(2);
          324                         }
          325                         break;
          326                 case mode_s_to_n:
          327                         stats.spamicity = 0.0;
          328                         is_spam = false;
          329                         svec_sort(&mlist);
          330                         if (!pblist->unmergeclose(pblist, &mlist) ||
          331                             !pglist->mergeclose(pglist, &mlist)) {
          332                                 fprintf(stderr, "%s: cannot merge/save list\n", argv[0]);
          333                                 exit(2);
          334                         }
          335                         break;
          336                 default:
          337                         usage();
          338                 }
          339 
          340                 if (mode == mode_test) {
          341                         statdump(&stats, stdout);
          342                 }
          343                 if (do_passthru) {
          344                         lex_passthru(&lex, is_spam, stats.spamicity);
          345                 }
          346                 vec_destroy(&mlist);
          347 
          348                 if (mode != mode_test) {
          349                         pglist->close(pglist);
          350                         free(pglist);
          351                         pblist->close(pblist);
          352                         free(pblist);
          353                 }
          354         }
          355 
          356         if (mode == mode_test) {
          357                 pglist->close(pglist);
          358                 free(pglist);
          359                 pblist->close(pblist);
          360                 free(pblist);
          361         }
          362         lex_destroy(&lex);
          363 
          364         pdb->close(pdb);
          365         free(pdb);
          366 
          367         free(stats.extrema);
          368 
          369         return ((do_passthru || is_spam) ? 0 : 1);
          370 }