add a bulk test mode option (-b) - bmf - bmf (Bayesian Mail Filter) 0.9.4 fork + patches
(HTM) git clone git://git.codemadness.org/bmf
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 8a316864887a48a5fd2867b6bde5d5e3b215e288
(DIR) parent da5b33ffd35e25649614ac678df293afcffb3f35
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Tue, 4 Feb 2020 22:59:04 +0100
add a bulk test mode option (-b)
Much more efficient for my use-case: bulk testing a directory full of maildir
files quickly.
find "$d/new" -type f | bmf -b | awk -F '\t' '$2 > 0.9 { print $1; }' | while read -r f; do
...move ugly spam here...
done
Diffstat:
M bmf.1 | 5 ++++-
M bmf.c | 78 ++++++++++++++++++++++++++++++-
M dbh.c | 3 +++
3 files changed, 83 insertions(+), 3 deletions(-)
---
(DIR) diff --git a/bmf.1 b/bmf.1
@@ -23,7 +23,7 @@ bmf \- efficient Bayesian mail filter
.SH "SYNOPSIS"
.nf
-\fBbmf\fR [-t] [-n] [-s] [-N] [-S] [-d db] [-k n] [-m type] [-p]
+\fBbmf\fR [-b] [-t] [-n] [-s] [-N] [-S] [-d db] [-k n] [-m type] [-p]
[-v] [-V] [-h]
.fi
@@ -41,6 +41,9 @@ bmf supports both mbox and maildir mail storage formats. It will automatically p
Without command-line options, bmf processes the input, registers it as either "good" or "spam", and returns the appropriate error code. The wordlist directory and nonexistent wordfiles are created if absent.
.PP
+\fB-b\fR Bulk test mode, read file list from stdin, output file, TAB, spamicity score per line.
+
+.PP
\fB-t\fR Test to see if the input is spam. The word lists are not updated. A report is written to stdout showing the final score and the tokens with the highest deviation from a mean of 0.5.
.PP
(DIR) diff --git a/bmf.c b/bmf.c
@@ -27,8 +27,10 @@ typedef enum {
mode_reg_n, /* register as non-spam */
mode_n_to_s, /* undo non-spam registration and register as
* spam */
- mode_s_to_n /* undo spam registration and register as
+ mode_s_to_n, /* undo spam registration and register as
* non-spam */
+ /* test and produce a report in bulk: read file list from stdin, output TAB-separated */
+ mode_bulk
} runmode_t;
static void
@@ -39,6 +41,7 @@ usage(void)
"\n"
"Modes of operation (mutually exclusive; the last one specified is used):\n"
"\t\tRegister message using historical data if no mode is specified.\n"
+ "\t-b\tBulk test mode, read file list from stdin, output file, TAB, spamicity score per line.\n"
"\t-n\tRegister message as non-spam.\n"
"\t-s\tRegister message as spam.\n"
"\t-N\tRegister message as non-spam and undo prior registration as spam.\n"
@@ -88,6 +91,9 @@ main(int argc, char **argv)
tok_t tok;
bool_t is_spam = false;
int ch;
+ char *line = NULL;
+ size_t linesiz = 0;
+ ssize_t n;
int fd = STDIN_FILENO;
@@ -97,8 +103,11 @@ main(int argc, char **argv)
srand(time(NULL));
stats.keepers = DEF_KEEPERS;
- while ((ch = getopt(argc, argv, "NSVd:hk:m:npstv")) != EOF) {
+ while ((ch = getopt(argc, argv, "NSVd:hk:m:bnpstv")) != EOF) {
switch (ch) {
+ case 'b':
+ mode = mode_bulk;
+ break;
case 'N':
mode = mode_s_to_n;
break;
@@ -162,6 +171,70 @@ main(int argc, char **argv)
fprintf(stderr, "%s: cannot open database\n", argv[0]);
exit(2);
}
+
+ /* bulk mode */
+ if (mode == mode_bulk) {
+ pblist = pdb->opentable(pdb, "spamlist", rdonly);
+ if (pblist == NULL) {
+ fprintf(stderr, "%s: cannot open spamlist\n", argv[0]);
+ exit(2);
+ }
+ pglist = pdb->opentable(pdb, "goodlist", rdonly);
+ if (pglist == NULL) {
+ fprintf(stderr, "%s: cannot open goodlist\n", argv[0]);
+ exit(2);
+ }
+
+ while ((n = getline(&line, &linesiz, stdin)) > 0) {
+ if (line[n - 1] == '\n')
+ line[--n] = '\0';
+
+ if ((fd = open(line, O_RDONLY)) == -1)
+ err(1, "open: %s", line);
+
+ memset(stats.extrema, 0, stats.keepers * sizeof(discrim_t));
+
+ lex_create(&lex, mboxtype);
+ if (!lex_load(&lex, fd)) {
+ fprintf(stderr, "%s: cannot read input\n", argv[0]);
+ exit(2);
+ }
+ lex_nexttoken(&lex, &tok);
+ if (tok.tt == eof) {
+ fprintf(stderr, "%s: no input available\n", argv[0]);
+ exit(2);
+ }
+
+ while (tok.tt != eof) {
+ /* TODO: vec_create at top, vec->nitems = 0, but keep allocated buffers */
+ vec_create(&mlist);
+
+ bvec_loadmsg(&mlist, &lex, &tok);
+ bayesfilt(pglist, pblist, &mlist, &stats);
+
+ vec_destroy(&mlist);
+
+ printf("%s\t%f\n", line, stats.spamicity);
+ }
+
+ lex_destroy(&lex);
+
+ close(fd);
+ }
+
+ pglist->close(pglist);
+ free(pglist);
+ pblist->close(pblist);
+ free(pblist);
+
+ pdb->close(pdb);
+ free(pdb);
+
+ free(stats.extrema);
+
+ return 0;
+ }
+
lex_create(&lex, mboxtype);
if (!lex_load(&lex, fd)) {
fprintf(stderr, "%s: cannot read input\n", argv[0]);
@@ -172,6 +245,7 @@ main(int argc, char **argv)
fprintf(stderr, "%s: no input available\n", argv[0]);
exit(2);
}
+
if (mode == mode_test) {
pblist = pdb->opentable(pdb, "spamlist", rdonly);
if (pblist == NULL) {
(DIR) diff --git a/dbh.c b/dbh.c
@@ -95,6 +95,8 @@ dbtext_db_open(cpchar dbname, bool_t rdonly)
goto bail;
}
+/* TODO: handle unveil for bulk mode */
+#if 0
/* unveil(2), TODO: rework later */
char listpath[PATH_MAX];
snprintf(listpath, sizeof(listpath), "%s/%s", pthis->dir, "goodlist.txt");
@@ -111,6 +113,7 @@ dbtext_db_open(cpchar dbname, bool_t rdonly)
perror("unveil()");
exit(2);
}
+#endif
return pthis;