add a bulk test mode option (-b) - bmf - bmf (Bayesian Mail Filter) 0.9.4 fork + patches
 (HTM) git clone git://git.codemadness.org/bmf
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 8a316864887a48a5fd2867b6bde5d5e3b215e288
 (DIR) parent da5b33ffd35e25649614ac678df293afcffb3f35
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Tue,  4 Feb 2020 22:59:04 +0100
       
       add a bulk test mode option (-b)
       
       Much more efficient for my use-case: bulk testing a directory full of maildir
       files quickly.
       
       find "$d/new" -type f | bmf -b | awk -F '\t' '$2 > 0.9 { print $1; }' | while read -r f; do
               ...move ugly spam here...
       done
       
       Diffstat:
         M bmf.1                               |       5 ++++-
         M bmf.c                               |      78 ++++++++++++++++++++++++++++++-
         M dbh.c                               |       3 +++
       
       3 files changed, 83 insertions(+), 3 deletions(-)
       ---
 (DIR) diff --git a/bmf.1 b/bmf.1
       @@ -23,7 +23,7 @@ bmf \- efficient Bayesian mail filter
        .SH "SYNOPSIS"
        
        .nf
       -\fBbmf\fR [-t] [-n] [-s] [-N] [-S] [-d db] [-k n] [-m type] [-p]
       +\fBbmf\fR [-b] [-t] [-n] [-s] [-N] [-S] [-d db] [-k n] [-m type] [-p]
            [-v] [-V] [-h]
        .fi
        
       @@ -41,6 +41,9 @@ bmf supports both mbox and maildir mail storage formats. It will automatically p
        Without command-line options, bmf processes the input, registers it as either "good" or "spam", and returns the appropriate error code. The wordlist directory and nonexistent wordfiles are created if absent.
        
        .PP
       +\fB-b\fR Bulk test mode, read file list from stdin, output file, TAB, spamicity score per line.
       +
       +.PP
        \fB-t\fR Test to see if the input is spam. The word lists are not updated. A report is written to stdout showing the final score and the tokens with the highest deviation from a mean of 0.5.
        
        .PP
 (DIR) diff --git a/bmf.c b/bmf.c
       @@ -27,8 +27,10 @@ typedef enum {
                mode_reg_n,                /* register as non-spam */
                mode_n_to_s,                /* undo non-spam registration and register as
                                         * spam */
       -        mode_s_to_n                /* undo spam registration and register as
       +        mode_s_to_n,                /* undo spam registration and register as
                                         * non-spam */
       +        /* test and produce report in bulk, read file list from stdin, output TAB-separated */
       +        mode_bulk
        } runmode_t;
        
        static void
       @@ -39,6 +41,7 @@ usage(void)
                       "\n"
                       "Modes of operation (mutually exclusive; the last one specified is used):\n"
                       "\t\tRegister message using historical data if no mode is specified.\n"
       +               "\t-b\tBulk test mode, read file list from stdin, output file, TAB, spamicity score per line.\n"
                       "\t-n\tRegister message as non-spam.\n"
                       "\t-s\tRegister message as spam.\n"
                       "\t-N\tRegister message as non-spam and undo prior registration as spam.\n"
       @@ -88,6 +91,9 @@ main(int argc, char **argv)
                tok_t tok;
                bool_t is_spam = false;
                int ch;
       +        char *line = NULL;
       +        size_t linesiz = 0;
       +        ssize_t n;
        
                int fd = STDIN_FILENO;
        
       @@ -97,8 +103,11 @@ main(int argc, char **argv)
                srand(time(NULL));
        
                stats.keepers = DEF_KEEPERS;
       -        while ((ch = getopt(argc, argv, "NSVd:hk:m:npstv")) != EOF) {
       +        while ((ch = getopt(argc, argv, "NSVd:hk:m:bnpstv")) != EOF) {
                        switch (ch) {
       +                case 'b':
       +                        mode = mode_bulk;
       +                        break;
                        case 'N':
                                mode = mode_s_to_n;
                                break;
       @@ -162,6 +171,70 @@ main(int argc, char **argv)
                        fprintf(stderr, "%s: cannot open database\n", argv[0]);
                        exit(2);
                }
       +
       +        /* bulk mode */
       +        if (mode == mode_bulk) {
       +                pblist = pdb->opentable(pdb, "spamlist", rdonly);
       +                if (pblist == NULL) {
       +                        fprintf(stderr, "%s: cannot open spamlist\n", argv[0]);
       +                        exit(2);
       +                }
       +                pglist = pdb->opentable(pdb, "goodlist", rdonly);
       +                if (pglist == NULL) {
       +                        fprintf(stderr, "%s: cannot open goodlist\n", argv[0]);
       +                        exit(2);
       +                }
       +
       +                while ((n = getline(&line, &linesiz, stdin)) > 0) {
       +                        if (line[n - 1] == '\n')
       +                                line[--n] = '\0';
       +
       +                        if ((fd = open(line, O_RDONLY)) == -1)
       +                                err(1, "open: %s", line);
       +
       +                        memset(stats.extrema, 0, stats.keepers * sizeof(discrim_t));
       +
       +                        lex_create(&lex, mboxtype);
       +                        if (!lex_load(&lex, fd)) {
       +                                fprintf(stderr, "%s: cannot read input\n", argv[0]);
       +                                exit(2);
       +                        }
       +                        lex_nexttoken(&lex, &tok);
       +                        if (tok.tt == eof) {
       +                                fprintf(stderr, "%s: no input available\n", argv[0]);
       +                                exit(2);
       +                        }
       +
       +                        while (tok.tt != eof) {
       +                                /* TODO: vec_create at top, vec->nitems = 0, but keep allocated buffers */
       +                                vec_create(&mlist);
       +
       +                                bvec_loadmsg(&mlist, &lex, &tok);
       +                                bayesfilt(pglist, pblist, &mlist, &stats);
       +
       +                                vec_destroy(&mlist);
       +
       +                                printf("%s\t%f\n", line, stats.spamicity);
       +                        }
       +
       +                        lex_destroy(&lex);
       +
       +                        close(fd);
       +                }
       +
       +                pglist->close(pglist);
       +                free(pglist);
       +                pblist->close(pblist);
       +                free(pblist);
       +
       +                pdb->close(pdb);
       +                free(pdb);
       +
       +                free(stats.extrema);
       +
       +                return 0;
       +        }
       +
                lex_create(&lex, mboxtype);
                if (!lex_load(&lex, fd)) {
                        fprintf(stderr, "%s: cannot read input\n", argv[0]);
       @@ -172,6 +245,7 @@ main(int argc, char **argv)
                        fprintf(stderr, "%s: no input available\n", argv[0]);
                        exit(2);
                }
       +
                if (mode == mode_test) {
                        pblist = pdb->opentable(pdb, "spamlist", rdonly);
                        if (pblist == NULL) {
 (DIR) diff --git a/dbh.c b/dbh.c
       @@ -95,6 +95,8 @@ dbtext_db_open(cpchar dbname, bool_t rdonly)
                                goto bail;
                }
        
       +/* TODO: handle unveil for bulk mode */
       +#if 0
                /* unveil(2), TODO: rework later */
                char listpath[PATH_MAX];
                snprintf(listpath, sizeof(listpath), "%s/%s", pthis->dir, "goodlist.txt");
       @@ -111,6 +113,7 @@ dbtext_db_open(cpchar dbname, bool_t rdonly)
                        perror("unveil()");
                        exit(2);
                }
       +#endif
        
                return pthis;