merge dbh and dbtext (WIP) - bmf - bmf (Bayesian Mail Filter) 0.9.4 fork + patches
(HTM) git clone git://git.codemadness.org/bmf
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 60b437c6d0bc19fc9f67ca8cfaf6cbfc50d47423
(DIR) parent 4c3c79f49125ef555fba1df7f6cbab2c7b26ea00
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sat, 27 Oct 2018 19:31:30 +0200
merge dbh and dbtext (WIP)
Diffstat:
M Makefile | 2 --
M bmf.c | 3 +--
M dbh.c | 474 ++++++++++++++++++++++++++++++-
M dbh.h | 45 +++++++++++++++++++++++++------
D dbtext.c | 490 -------------------------------
D dbtext.h | 49 -------------------------------
M filt.h | 14 +++++++++-----
7 files changed, 511 insertions(+), 566 deletions(-)
---
(DIR) diff --git a/Makefile b/Makefile
@@ -9,7 +9,6 @@ SRC = \
bmf.c \
dbg.c \
dbh.c \
- dbtext.c \
filt.c \
lex.c \
str.c \
@@ -21,7 +20,6 @@ HDR = \
config.h \
dbg.h \
dbh.h \
- dbtext.h \
filt.h \
lex.h \
str.h \
(DIR) diff --git a/bmf.c b/bmf.c
@@ -75,7 +75,6 @@ version(void)
int
main(int argc, char **argv)
{
- dbfmt_t dbfmt = db_text;
char *dbname = NULL;
bool_t rdonly;
runmode_t mode = mode_normal;
@@ -152,7 +151,7 @@ main(int argc, char **argv)
}
stats.extrema = (discrim_t *) malloc(stats.keepers * sizeof(discrim_t));
- pdb = dbh_open(dbfmt, "localhost", dbname, "", "");
+ pdb = dbh_open(dbname);
if (pdb == NULL) {
fprintf(stderr, "%s: cannot open database\n", argv[0]);
exit(2);
(DIR) diff --git a/dbh.c b/dbh.c
@@ -16,7 +16,6 @@
#include "vec.h"
#include "dbh.h"
-#include "dbtext.h"
/*
* get count for new (incoming) word. there may be duplicate entries for the
@@ -50,17 +49,472 @@ db_getnewcount(veciter_t * piter)
}
+/*
+ * Open the flat-text database directory and return a handle for it.
+ *
+ * dbname: directory holding the word lists; one trailing '/' is dropped.
+ *         When NULL or empty, "$HOME/.bmf" is used ("./.bmf" when HOME is
+ *         unset or empty).
+ *
+ * The directory is created with mode 0700 if it does not exist yet.
+ * Returns the handle cast to dbh_t *, or NULL on failure.  A failing
+ * unveil(2) terminates the process with exit status 2.
+ */
dbh_t *
-dbh_open(dbfmt_t dbfmt, cpchar dbhost, cpchar dbname, cpchar dbuser, cpchar dbpass)
+dbh_open(cpchar dbname)
{
- dbh_t *pdb;
+	static const char *const lists[] = { "goodlist", "spamlist" };
+	dbhtext_t *pthis = NULL;
+	char listpath[PATH_MAX];
+	struct stat st;
+	cpchar phome;
+	size_t dirlen, i;
+	int ret;
- switch (dbfmt) {
- case db_text:
- pdb = (dbh_t *) dbtext_db_open(dbhost, dbname, dbuser, dbpass);
- break;
- default:
- break;
+	if ((pthis = malloc(sizeof(dbhtext_t))) == NULL) {
+		perror("malloc()");
+		goto bail;
}
- return pdb;
+	/* keep the bail path well-defined before dir is assigned */
+	pthis->dir = NULL;
+	pthis->close = dbtext_db_close;
+	pthis->opentable = dbtext_db_opentable;
+
+	if (dbname != NULL && *dbname != '\0') {
+		if ((pthis->dir = strdup(dbname)) == NULL) {
+			perror("strdup()");
+			goto bail;
+		}
+		/* drop a single trailing slash */
+		dirlen = strlen(pthis->dir);
+		if (dirlen && pthis->dir[dirlen - 1] == '/')
+			pthis->dir[dirlen - 1] = '\0';
+	} else {
+		phome = getenv("HOME");
+		if (phome == NULL || *phome == '\0')
+			phome = ".";
+
+		/* room for "/.bmf" plus the terminating NUL: no truncation possible */
+		dirlen = strlen(phome) + 5 + 1;
+		if ((pthis->dir = malloc(dirlen)) == NULL) {
+			perror("malloc()");
+			goto bail;
+		}
+		snprintf(pthis->dir, dirlen, "%s/.bmf", phome);
+	}
+
+	/* make sure config directory exists */
+	if (stat(pthis->dir, &st) != 0) {
+		if (errno != ENOENT) {
+			perror("stat()");
+			goto bail;
+		}
+		if (mkdir(pthis->dir, S_IRUSR | S_IWUSR | S_IXUSR) != 0) {
+			perror("mkdir()");
+			goto bail;
+		}
+	} else if (!S_ISDIR(st.st_mode)) {
+		fprintf(stderr, "%s: not a directory\n", pthis->dir);
+		goto bail;
+	}
+
+	/* unveil(2), TODO: rework later */
+	/* TODO: permission depending on mode */
+	/*
+	 * NOTE(review): dbtext_db_opentable() opens "<dir>/<table>.txt" while
+	 * the paths unveiled here carry no ".txt" suffix -- confirm that the
+	 * table names used by the callers match.
+	 */
+	for (i = 0; i < sizeof(lists) / sizeof(lists[0]); i++) {
+		ret = snprintf(listpath, sizeof(listpath), "%s/%s",
+		    pthis->dir, lists[i]);
+		/* never unveil a silently truncated path */
+		if (ret < 0 || (size_t)ret >= sizeof(listpath)) {
+			fprintf(stderr, "path truncation: %s/%s\n",
+			    pthis->dir, lists[i]);
+			goto bail;
+		}
+		if (unveil(listpath, "rw") == -1) {
+			perror("unveil()");
+			exit(2);
+		}
+	}
+	/* lock the unveil list: no further paths may be added */
+	if (unveil(NULL, NULL) == -1) {
+		perror("unveil()");
+		exit(2);
+	}
+
+	return (dbh_t *)pthis;
+
+bail:
+	if (pthis != NULL) {
+		free(pthis->dir);
+		free(pthis);
+	}
+
+	return NULL;
+}
+
+/*
+ * Make sure the record array of a table can hold at least nsize entries.
+ *
+ * Nothing happens when the current allocation is already large enough.
+ * Otherwise the allocation is doubled (or set to nsize when doubling is
+ * not enough) and the slots from nitems up to nsize are initialised.
+ * The process exits with status 2 when realloc() fails.
+ */
+static void
+dbtext_table_setsize(dbttext_t * pthis, uint nsize)
+{
+	rec_t *grown;
+	uint capacity;
+	uint i;
+
+	if (pthis->nalloc >= nsize)
+		return;
+
+	/* geometric growth, but never less than what was asked for */
+	capacity = pthis->nalloc * 2;
+	if (capacity < nsize)
+		capacity = nsize;
+
+	grown = (rec_t *) realloc(pthis->pitems, capacity * sizeof(rec_t));
+	if (grown == NULL) {
+		exit(2);
+	}
+
+	/* give the slots between nitems and nsize a defined starting state */
+	i = pthis->nitems;
+	while (i < nsize) {
+		str_create(&grown[i].w);
+		grown[i].n = 0;
+		i++;
+	}
+
+	pthis->nalloc = capacity;
+	pthis->pitems = grown;
+}
+
+/*
+ * Release the directory path owned by a database handle.
+ * The handle structure itself is not freed here.  Always returns true.
+ */
+bool_t
+dbtext_db_close(dbhtext_t * pthis)
+{
+	char *olddir = pthis->dir;
+
+	/* clear the field so the stale pointer cannot be reused */
+	pthis->dir = NULL;
+	free(olddir);
+
+	return true;
+}
+
+/*
+ * Open the word list "<dir>/<table>.txt" and load it into memory.
+ *
+ * pthis:  database handle; its dir must be set.
+ * table:  base name of the list file (".txt" is appended).
+ * rdonly: open read-only with a shared lock, otherwise read-write with an
+ *         exclusive lock (locking is compiled out when NOLOCK is defined).
+ *
+ * The file is created when missing.  An empty file yields an empty table
+ * whose descriptor stays open.  Otherwise the first line must match
+ * BOGOFILTER_HEADER and every following line has the form
+ * "<word> <count>\n"; empty lines and lines starting with '#' or ';' are
+ * skipped.  Line contents are lower-cased in place and the records point
+ * into the loaded buffer.  After a successful read-only load the
+ * descriptor is unlocked and closed.
+ *
+ * Returns the table cast to dbt_t *, or NULL on any failure.
+ */
+dbt_t *
+dbtext_db_opentable(dbhtext_t * pthis, cpchar table, bool_t rdonly)
+{
+	dbttext_t *ptable = NULL;
+
+#ifndef NOLOCK
+	struct flock lock;
+
+#endif /* ndef NOLOCK */
+	char szpath[PATH_MAX];
+	int flags, ret;
+	struct stat st;
+	ssize_t nread;
+	char *pbegin;
+	char *pend;
+	char *pcur;
+	char *psep;
+	char *plimit;
+	rec_t r;
+	uint pos;
+
+	if (pthis->dir == NULL)
+		goto bail;
+
+	if ((ptable = malloc(sizeof(dbttext_t))) == NULL) {
+		perror("malloc()");
+		goto bail;
+	}
+	ptable->close = dbtext_table_close;
+	ptable->mergeclose = dbtext_table_mergeclose;
+	ptable->unmergeclose = dbtext_table_unmergeclose;
+	ptable->getmsgcount = dbtext_table_getmsgcount;
+	ptable->getcount = dbtext_table_getcount;
+	ptable->fd = -1;
+	ptable->pbuf = NULL;
+	ptable->nmsgs = 0;
+	ptable->nalloc = 0;
+	ptable->nitems = 0;
+	ptable->pitems = NULL;
+
+	ret = snprintf(szpath, sizeof(szpath), "%s/%s.txt", pthis->dir, table);
+	if (ret < 0 || (size_t)ret >= sizeof(szpath)) {
+		fprintf(stderr, "path truncation: %s/%s.txt\n", pthis->dir, table);
+		goto bail;
+	}
+
+	flags = O_CREAT | (rdonly ? O_RDONLY : O_RDWR);
+	if ((ptable->fd = open(szpath, flags, 0644)) == -1) {
+		perror("open()");
+		goto bail;
+	}
+
+#ifndef NOLOCK
+	/* whole-file lock; blocks until it is granted */
+	memset(&lock, 0, sizeof(lock));
+	lock.l_type = rdonly ? F_RDLCK : F_WRLCK;
+	lock.l_start = 0;
+	lock.l_whence = SEEK_SET;
+	lock.l_len = 0;
+	fcntl(ptable->fd, F_SETLKW, &lock);
+#endif /* ndef NOLOCK */
+
+	if (fstat(ptable->fd, &st) != 0) {
+		perror("fstat()");
+		goto bail_uc;
+	}
+	if (st.st_size == 0) {
+		return (dbt_t *) ptable;
+	}
+
+	/* one spare byte: sscanf() and strtoul() need a NUL-terminated buffer */
+	ptable->pbuf = malloc((size_t)st.st_size + 1);
+	if (ptable->pbuf == NULL) {
+		perror("malloc()");
+		goto bail_uc;
+	}
+	nread = read(ptable->fd, ptable->pbuf, st.st_size);
+	if (nread != st.st_size) {
+		if (nread == -1)
+			perror("read()");
+		else
+			fprintf(stderr, "read(): short read\n");
+		goto bail_fuc;
+	}
+	plimit = ptable->pbuf + st.st_size;
+	*plimit = '\0';
+
+	/* XXX: bogofilter compatibility */
+	if (sscanf(ptable->pbuf, BOGOFILTER_HEADER, &ptable->nmsgs) != 1) {
+		fprintf(stderr, "dbh_loadfile: bad file format\n");
+		goto bail_fuc;
+	}
+	/* skip the header line; without a newline there are no records */
+	pbegin = memchr(ptable->pbuf, '\n', (size_t)st.st_size);
+	pbegin = (pbegin != NULL) ? pbegin + 1 : plimit;
+
+	pos = 0;
+	while (pbegin < plimit) {
+		/* bounded search: never look past the end of the file data */
+		pend = memchr(pbegin, '\n', (size_t)(plimit - pbegin));
+		if (pend == NULL) {
+			fprintf(stderr, "dbh_loadfile: bad file format\n");
+			goto bail_fuc;
+		}
+
+		/* lower-case the line and remember its last space */
+		psep = NULL;
+		for (pcur = pbegin; pcur < pend; pcur++) {
+			*pcur = (char) tolower((unsigned char) *pcur);
+			if (*pcur == ' ')
+				psep = pcur;
+		}
+
+		if (pend > pbegin && *pbegin != '#' && *pbegin != ';') {
+			/* a record needs a non-empty word of at most MAXWORDLEN */
+			if (psep == NULL || psep == pbegin ||
+			    (psep - pbegin) > MAXWORDLEN) {
+				fprintf(stderr, "dbh_loadfile: bad file format\n");
+				goto bail_fuc;
+			}
+			r.w.p = pbegin;
+			r.w.len = (psep - pbegin);
+			r.n = (uint) strtoul(psep + 1, NULL, 10);
+
+			dbtext_table_setsize(ptable, pos + 1);
+			ptable->pitems[pos++] = r;
+			ptable->nitems = pos;
+		}
+		pbegin = pend + 1;
+	}
+
+	if (rdonly) {
+#ifndef NOLOCK
+		lock.l_type = F_UNLCK;
+		fcntl(ptable->fd, F_SETLKW, &lock);
+#endif /* ndef NOLOCK */
+		close(ptable->fd);
+		ptable->fd = -1;
+	}
+	return (dbt_t *) ptable;
+
+bail_fuc:
+	/* records parsed before the failure would otherwise leak */
+	free(ptable->pitems);
+	free(ptable->pbuf);
+
+bail_uc:
+#ifndef NOLOCK
+	lock.l_type = F_UNLCK;
+	fcntl(ptable->fd, F_SETLKW, &lock);
+#endif /* ndef NOLOCK */
+
+	close(ptable->fd);
+	ptable->fd = -1;
+
+bail:
+	free(ptable);
+	return NULL;
+}
+
+/*
+ * Release the in-memory data of a table and, when its descriptor is still
+ * open, unlock (unless NOLOCK is defined) and close it.
+ * The table structure itself is not freed here.  Always returns true.
+ */
+bool_t
+dbtext_table_close(dbttext_t * pthis)
+{
+#ifndef NOLOCK
+	/* only needed for unlocking; keeps NOLOCK builds free of an unused variable */
+	struct flock lockall;
+#endif /* ndef NOLOCK */
+
+	free(pthis->pbuf);
+	pthis->pbuf = NULL;
+	free(pthis->pitems);
+	pthis->pitems = NULL;
+
+	if (pthis->fd != -1) {
+#ifndef NOLOCK
+		memset(&lockall, 0, sizeof(lockall));
+		lockall.l_type = F_UNLCK;
+		lockall.l_start = 0;
+		lockall.l_whence = SEEK_SET;
+		lockall.l_len = 0;
+		fcntl(pthis->fd, F_SETLKW, &lockall);
+#endif /* ndef NOLOCK */
+		close(pthis->fd);
+		pthis->fd = -1;
+	}
+	return true;
+}
+
+/*
+ * Merge the words of one message into the table, rewrite the list file
+ * and close the table.
+ *
+ * pthis: table opened read-write (fd must still be open).
+ * pmsg:  words of the message.
+ *
+ * The message counter is incremented, the file is truncated and rewritten
+ * as header plus one "<word> <count>" line per word: counts of words
+ * present on both sides are summed, all other words are copied with their
+ * own count.  Words are written in lower case.
+ *
+ * Returns false when the table has no open descriptor or when rewriting
+ * the file fails; otherwise the result of dbtext_table_close().  Except
+ * for the missing-descriptor case the table is always closed.
+ */
+bool_t
+dbtext_table_mergeclose(dbttext_t * pthis, vec_t * pmsg)
+{
+	/* note that we require both vectors to be sorted */
+
+	veciter_t msgiter;
+	str_t *pmsgstr;
+	str_t *pword;
+	rec_t *prec;
+	char iobuf[IOBUFSIZE];
+	char *p;
+	size_t need;
+	uint pos;
+	uint count;
+	int cmp, hdrlen;
+	bool_t frommsg;
+	bool_t ok = true;
+
+	if (pthis->fd == -1) {
+		return false;
+	}
+	if (ftruncate(pthis->fd, 0) == -1) {
+		perror("ftruncate()");
+		dbtext_table_close(pthis);
+		return false;
+	}
+	if (lseek(pthis->fd, 0, SEEK_SET) == -1) {
+		perror("lseek()");
+		dbtext_table_close(pthis);
+		return false;
+	}
+
+	pthis->nmsgs++;
+
+	hdrlen = snprintf(iobuf, sizeof(iobuf), BOGOFILTER_HEADER, pthis->nmsgs);
+	if (hdrlen < 0 || (size_t)hdrlen >= sizeof(iobuf)) {
+		dbtext_table_close(pthis);
+		return false;
+	}
+	p = iobuf + hdrlen;
+
+	vec_first(pmsg, &msgiter);
+	pmsgstr = veciter_get(&msgiter);
+
+	pos = 0;
+	while (pos < pthis->nitems || pmsgstr != NULL) {
+		/* only form a record pointer while the table still has entries */
+		prec = (pos < pthis->nitems) ? &pthis->pitems[pos] : NULL;
+
+		if (prec != NULL && pmsgstr != NULL) {
+			cmp = str_casecmp(&prec->w, pmsgstr);
+		} else {
+			/* we exhausted one list or the other (but not both) */
+			cmp = (prec != NULL) ? -1 : 1;
+		}
+
+		if (cmp < 0) {
+			/* write existing str */
+			pword = &prec->w;
+			count = prec->n;
+			frommsg = false;
+			pos++;
+		} else if (cmp == 0) {
+			/* same str, merge and write sum */
+			pword = &prec->w;
+			count = db_getnewcount(&msgiter);
+			count += prec->n;
+			frommsg = true;
+			pos++;
+		} else {	/* cmp > 0 */
+			/* write new str */
+			pword = pmsgstr;
+			count = db_getnewcount(&msgiter);
+			frommsg = true;
+		}
+
+		/* word, space, decimal count (<= 3 digits per byte), newline, NUL */
+		need = (size_t)pword->len + 1 + 3 * sizeof(count) + 2;
+		if (need > sizeof(iobuf)) {
+			fprintf(stderr, "dbtext: word too long, skipped\n");
+			ok = false;
+		} else {
+			/* flush only when the next record might not fit */
+			if ((size_t)(p - iobuf) + need > sizeof(iobuf)) {
+				if (write(pthis->fd, iobuf, (size_t)(p - iobuf)) !=
+				    (ssize_t)(p - iobuf)) {
+					perror("write()");
+					ok = false;
+					p = iobuf;
+					break;
+				}
+				p = iobuf;
+			}
+			strncpylwr(p, pword->p, pword->len);
+			p += pword->len;
+			*p++ = ' ';
+			p += sprintf(p, "%u\n", count);
+		}
+
+		/* advance the message side only after its word has been copied */
+		if (frommsg) {
+			veciter_next(&msgiter);
+			pmsgstr = veciter_get(&msgiter);
+		}
+	}
+	if (p != iobuf) {
+		if (write(pthis->fd, iobuf, (size_t)(p - iobuf)) !=
+		    (ssize_t)(p - iobuf)) {
+			perror("write()");
+			ok = false;
+		}
+	}
+	veciter_destroy(&msgiter);
+	return dbtext_table_close(pthis) && ok;
+}
+
+/*
+ * Remove the words of one message from the table, rewrite the list file
+ * and close the table.
+ *
+ * pthis: table opened read-write (fd must still be open).
+ * pmsg:  words of the message.
+ *
+ * The message counter is decremented (never below zero), the file is
+ * truncated and rewritten as header plus one "<word> <count>" line per
+ * word: for words present on both sides the message count is subtracted
+ * (clamped at 0), words only in the table keep their count, words only in
+ * the message are written with count 0.  Words are written in lower case.
+ *
+ * Returns false when the table has no open descriptor or when rewriting
+ * the file fails; otherwise the result of dbtext_table_close().  Except
+ * for the missing-descriptor case the table is always closed.
+ */
+bool_t
+dbtext_table_unmergeclose(dbttext_t * pthis, vec_t * pmsg)
+{
+	/* note that we require both vectors to be sorted */
+
+	veciter_t msgiter;
+	str_t *pmsgstr;
+	str_t *pword;
+	rec_t *prec;
+	char iobuf[IOBUFSIZE];
+	char *p;
+	size_t need;
+	uint pos;
+	uint count;
+	int cmp, hdrlen;
+	bool_t frommsg;
+	bool_t ok = true;
+
+	if (pthis->fd == -1) {
+		return false;
+	}
+	if (ftruncate(pthis->fd, 0) == -1) {
+		perror("ftruncate()");
+		dbtext_table_close(pthis);
+		return false;
+	}
+	if (lseek(pthis->fd, 0, SEEK_SET) == -1) {
+		perror("lseek()");
+		dbtext_table_close(pthis);
+		return false;
+	}
+
+	/* an unsigned counter must not wrap when the list is already empty */
+	if (pthis->nmsgs > 0)
+		pthis->nmsgs--;
+
+	hdrlen = snprintf(iobuf, sizeof(iobuf), BOGOFILTER_HEADER, pthis->nmsgs);
+	if (hdrlen < 0 || (size_t)hdrlen >= sizeof(iobuf)) {
+		dbtext_table_close(pthis);
+		return false;
+	}
+	p = iobuf + hdrlen;
+
+	vec_first(pmsg, &msgiter);
+	pmsgstr = veciter_get(&msgiter);
+
+	pos = 0;
+	while (pos < pthis->nitems || pmsgstr != NULL) {
+		/* only form a record pointer while the table still has entries */
+		prec = (pos < pthis->nitems) ? &pthis->pitems[pos] : NULL;
+
+		if (prec != NULL && pmsgstr != NULL) {
+			cmp = str_casecmp(&prec->w, pmsgstr);
+		} else {
+			/* we exhausted one list or the other (but not both) */
+			cmp = (prec != NULL) ? -1 : 1;
+		}
+
+		if (cmp < 0) {
+			/* write existing str */
+			pword = &prec->w;
+			count = prec->n;
+			frommsg = false;
+			pos++;
+		} else if (cmp == 0) {
+			/* same str, merge and write difference */
+			pword = &prec->w;
+			count = db_getnewcount(&msgiter);
+			count = (prec->n > count) ? (prec->n - count) : 0;
+			frommsg = true;
+			pos++;
+		} else {	/* cmp > 0 */
+			/* this should not happen, so write with count=0 */
+			pword = pmsgstr;
+			db_getnewcount(&msgiter);
+			count = 0;
+			frommsg = true;
+		}
+
+		/* word, space, decimal count (<= 3 digits per byte), newline, NUL */
+		need = (size_t)pword->len + 1 + 3 * sizeof(count) + 2;
+		if (need > sizeof(iobuf)) {
+			fprintf(stderr, "dbtext: word too long, skipped\n");
+			ok = false;
+		} else {
+			/* flush only when the next record might not fit */
+			if ((size_t)(p - iobuf) + need > sizeof(iobuf)) {
+				if (write(pthis->fd, iobuf, (size_t)(p - iobuf)) !=
+				    (ssize_t)(p - iobuf)) {
+					perror("write()");
+					ok = false;
+					p = iobuf;
+					break;
+				}
+				p = iobuf;
+			}
+			strncpylwr(p, pword->p, pword->len);
+			p += pword->len;
+			*p++ = ' ';
+			p += sprintf(p, "%u\n", count);
+		}
+
+		/* advance the message side only after its word has been copied */
+		if (frommsg) {
+			veciter_next(&msgiter);
+			pmsgstr = veciter_get(&msgiter);
+		}
+	}
+	if (p != iobuf) {
+		if (write(pthis->fd, iobuf, (size_t)(p - iobuf)) !=
+		    (ssize_t)(p - iobuf)) {
+			perror("write()");
+			ok = false;
+		}
+	}
+	veciter_destroy(&msgiter);
+	return dbtext_table_close(pthis) && ok;
+}
+
+/*
+ * Return the table's message counter (nmsgs), i.e. the value parsed from
+ * the list header and adjusted by the merge/unmerge operations.
+ */
+uint
+dbtext_table_getmsgcount(dbttext_t * pthis)
+{
+	const dbttext_t *ptable = pthis;
+
+	return ptable->nmsgs;
+}
+
+/*
+ * Look up the count stored for a word.
+ *
+ * Binary search over pitems using str_casecmp(); the records therefore
+ * have to be ordered consistently with that comparison.
+ * Returns the stored count, or 0 when the table is empty or the word is
+ * not present.
+ */
+uint
+dbtext_table_getcount(dbttext_t * pthis, str_t * pword)
+{
+	int below;	/* index known to sort before pword (-1: none yet) */
+	int cand;	/* best candidate position found so far */
+	int probe;
+
+	if (pthis->nitems == 0)
+		return 0;
+
+	below = -1;
+	cand = pthis->nitems - 1;
+	for (; cand - below > 1;) {
+		probe = (cand + below) / 2;
+		if (str_casecmp(pword, &pthis->pitems[probe].w) > 0)
+			below = probe;
+		else
+			cand = probe;
+	}
+
+	/* the candidate only counts when it matches exactly */
+	return (str_casecmp(pword, &pthis->pitems[cand].w) == 0)
+	    ? pthis->pitems[cand].n
+	    : 0;
}
(DIR) diff --git a/dbh.h b/dbh.h
@@ -10,16 +10,14 @@
#ifndef _DBH_H
#define _DBH_H
-/* database formats */
-typedef enum {
- db_text /* flat text */
-} dbfmt_t;
+#define BOGOFILTER_HEADER "# bogofilter wordlist (format version A): %u\n"
+#define TEXTDB_MAXLINELEN (MAXWORDLEN+32)
/* record/field structure */
typedef struct _rec {
str_t w;
uint n;
-} rec_t;
+} rec_t;
/* database table */
typedef struct _dbt dbt_t;
@@ -38,11 +36,42 @@ struct _dbh {
dbt_t *(*opentable) (dbh_t *, cpchar, bool_t);
};
-dbh_t *dbh_open(dbfmt_t dbfmt, cpchar dbhost, cpchar dbname, cpchar dbuser, cpchar dbpass);
+/*
+ * Flat-text table: one word list loaded into memory.
+ * The function pointers are filled in by dbtext_db_opentable(), which
+ * hands the object out cast to dbt_t *.
+ */
+typedef struct _dbttext dbttext_t;
+struct _dbttext {
+	/* operations */
+	bool_t (*close)(dbttext_t *);
+	bool_t (*mergeclose)(dbttext_t *, vec_t *);
+	bool_t (*unmergeclose)(dbttext_t *, vec_t *);
+	uint (*getmsgcount)(dbttext_t *);
+	uint (*getcount)(dbttext_t *, str_t *);
-#define BOGOFILTER_HEADER "# bogofilter wordlist (format version A): %u\n"
-#define TEXTDB_MAXLINELEN (MAXWORDLEN+32)
+	/* state */
+	int fd;		/* file descriptor, -1 when not open */
+	char *pbuf;	/* loaded file contents; record words point into it */
+	uint nmsgs;	/* number of messages represented in list */
+	uint nalloc;	/* slots allocated in pitems */
+	uint nitems;	/* slots of pitems in use */
+	rec_t *pitems;	/* growing vector of records */
+};
+
+/*
+ * Flat-text database handle: a directory that holds the word lists.
+ * dbh_open() fills in the function pointers and hands the object out
+ * cast to dbh_t *.
+ */
+typedef struct _dbhtext dbhtext_t;
+struct _dbhtext {
+	/* operations */
+	bool_t (*close)(dbhtext_t *);
+	dbt_t *(*opentable)(dbhtext_t *, cpchar, bool_t);
+
+	/* state */
+	char *dir;	/* heap-allocated directory path, freed by close */
+};
uint db_getnewcount(veciter_t * piter);
+/* database handle; dbh.c defines and bmf.c calls dbh_open(), so declare that name */
+dbh_t *dbh_open(cpchar dbname);
+bool_t dbtext_db_close(dbhtext_t *pthis);
+dbt_t *dbtext_db_opentable(dbhtext_t *pthis, cpchar table, bool_t rdonly);
+
+/* table operations */
+bool_t dbtext_table_close(dbttext_t *pthis);
+bool_t dbtext_table_mergeclose(dbttext_t *pthis, vec_t *pmsg);
+bool_t dbtext_table_unmergeclose(dbttext_t *pthis, vec_t *pmsg);
+uint dbtext_table_getmsgcount(dbttext_t *pthis);
+uint dbtext_table_getcount(dbttext_t *pthis, str_t *pword);
+
#endif /* ndef _DBH_H */
(DIR) diff --git a/dbtext.c b/dbtext.c
@@ -1,490 +0,0 @@
-/* $Id: dbtext.c,v 1.12 2002/10/19 09:59:35 tommy Exp $ */
-
-/*
- * Copyright (c) 2002 Tom Marshall <tommy@tig-grr.com>
- *
- * This program is free software. It may be distributed under the terms
- * in the file LICENSE, found in the top level of the distribution.
- *
- * dbtext.c: flatfile database handler
- */
-
-#include "config.h"
-#include "dbg.h"
-#include "str.h"
-#include "lex.h"
-#include "vec.h"
-
-#include "dbh.h"
-#include "dbtext.h"
-
-static void
-dbtext_table_setsize(dbttext_t * pthis, uint nsize)
-{
- uint nnewalloc;
- rec_t *pnewitems;
- uint n;
-
- if (nsize <= pthis->nalloc)
- return;
-
- nnewalloc = pthis->nalloc * 2;
- if (nnewalloc < nsize)
- nnewalloc = nsize;
- pnewitems = (rec_t *) realloc(pthis->pitems, nnewalloc * sizeof(rec_t));
- if (pnewitems == NULL) {
- exit(2);
- }
- for (n = pthis->nitems; n < nsize; n++) {
- str_create(&pnewitems[n].w);
- pnewitems[n].n = 0;
- }
- pthis->pitems = pnewitems;
- pthis->nalloc = nnewalloc;
-}
-
-dbh_t *
-dbtext_db_open(cpchar dbhost, cpchar dbname, cpchar dbuser, cpchar dbpass)
-{
- dbhtext_t *pthis = NULL;
- uint dirlen;
- cpchar phome;
- struct stat st;
-
- if ((pthis = malloc(sizeof(dbhtext_t))) == NULL) {
- perror("malloc()");
- goto bail;
- }
-
- pthis->close = dbtext_db_close;
- pthis->opentable = dbtext_db_opentable;
-
- if (dbname != NULL && *dbname != '\0') {
- dirlen = strlen(dbname);
- if ((pthis->dir = strdup(dbname)) == NULL) {
- perror("strdup()");
- goto bail;
- }
- if (dirlen && pthis->dir[dirlen - 1] == '/')
- pthis->dir[--dirlen] = '\0';
- } else {
- phome = getenv("HOME");
- if (phome == NULL || *phome == '\0') {
- phome = ".";
- }
- dirlen = strlen(phome) + 5 + 1;
- if ((pthis->dir = malloc(dirlen)) == NULL)
- goto bail;
-
- /* NOTE: no truncation possible */
- snprintf(pthis->dir, dirlen, "%s/.bmf", phome);
- }
-
- /* make sure config directory exists */
- if (stat(pthis->dir, &st) != 0) {
- if (errno != ENOENT ||
- mkdir(pthis->dir, S_IRUSR | S_IWUSR | S_IXUSR) != 0)
- goto bail;
- } else {
- if (!S_ISDIR(st.st_mode))
- goto bail;
- }
-
- /* unveil(2), TODO: rework later */
- /* TODO: permission depending on mode */
- char listpath[PATH_MAX];
- snprintf(listpath, sizeof(listpath), "%s/%s", pthis->dir, "goodlist");
- if (unveil(listpath, "rw") == -1) {
- perror("unveil()");
- exit(2);
- }
- snprintf(listpath, sizeof(listpath), "%s/%s", pthis->dir, "spamlist");
- if (unveil(listpath, "rw") == -1) {
- perror("unveil()");
- exit(2);
- }
- if (unveil(NULL, NULL) == -1) {
- perror("unveil()");
- exit(2);
- }
-
- return (dbh_t *)pthis;
-
-bail:
- if (pthis) {
- if (pthis->dir)
- free(pthis->dir);
- free(pthis);
- }
-
- return NULL;
-}
-
-bool_t
-dbtext_db_close(dbhtext_t * pthis)
-{
- free(pthis->dir);
- pthis->dir = NULL;
- return true;
-}
-
-dbt_t *
-dbtext_db_opentable(dbhtext_t * pthis, cpchar table, bool_t rdonly)
-{
- dbttext_t *ptable = NULL;
-
-#ifndef NOLOCK
- struct flock lock;
-
-#endif /* ndef NOLOCK */
- char szpath[PATH_MAX];
- int flags, ret;
- struct stat st;
- char *pbegin;
- char *pend;
- rec_t r;
- uint pos;
-
- if (pthis->dir == NULL)
- goto bail;
-
- if ((ptable = malloc(sizeof(dbttext_t))) == NULL) {
- perror("malloc()");
- goto bail;
- }
- ptable->close = dbtext_table_close;
- ptable->mergeclose = dbtext_table_mergeclose;
- ptable->unmergeclose = dbtext_table_unmergeclose;
- ptable->getmsgcount = dbtext_table_getmsgcount;
- ptable->getcount = dbtext_table_getcount;
- ptable->fd = -1;
- ptable->pbuf = NULL;
- ptable->nmsgs = 0;
- ptable->nalloc = 0;
- ptable->nitems = 0;
- ptable->pitems = NULL;
-
- ret = snprintf(szpath, sizeof(szpath), "%s/%s.txt", pthis->dir, table);
- if (ret == -1 || (size_t)ret >= sizeof(szpath)) {
- fprintf(stderr, "path truncation: %s/%s.txt", pthis->dir, table);
- goto bail;
- }
-
- flags = O_CREAT | (rdonly ? O_RDONLY : O_RDWR);
- if ((ptable->fd = open(szpath, flags, 0644)) == -1) {
- perror("open()");
- goto bail;
- }
-
-#ifndef NOLOCK
- memset(&lock, 0, sizeof(lock));
- lock.l_type = rdonly ? F_RDLCK : F_WRLCK;
- lock.l_start = 0;
- lock.l_whence = SEEK_SET;
- lock.l_len = 0;
- fcntl(ptable->fd, F_SETLKW, &lock);
-#endif /* ndef NOLOCK */
-
- if (fstat(ptable->fd, &st) != 0) {
- perror("fstat()");
- goto bail_uc;
- }
- if (st.st_size == 0) {
- return (dbt_t *) ptable;
- }
- ptable->pbuf = (char *) malloc(st.st_size);
- if (ptable->pbuf == NULL) {
- perror("malloc()");
- goto bail_uc;
- }
- if (read(ptable->fd, ptable->pbuf, st.st_size) != st.st_size) {
- perror("read()");
- goto bail_fuc;
- }
- /* XXX: bogofilter compatibility */
- if (sscanf(ptable->pbuf, BOGOFILTER_HEADER, &ptable->nmsgs) != 1) {
- goto bail_fuc;
- }
- pbegin = ptable->pbuf;
- while (*pbegin != '\n')
- pbegin++;
- pbegin++;
-
- pos = 0;
- while (pbegin < ptable->pbuf + st.st_size) {
- pend = pbegin;
- r.w.p = pbegin;
- r.w.len = 0;
- r.n = 0;
-
- while (*pend != '\n') {
- if (pend >= ptable->pbuf + st.st_size) {
- goto bail_fuc;
- }
- *pend = tolower(*pend);
- if (*pend == ' ') {
- r.w.len = (pend - pbegin);
- r.n = strtol(pend + 1, NULL, 10);
- }
- pend++;
- }
- if (pend > pbegin && *pbegin != '#' && *pbegin != ';') {
- if (r.w.len == 0 || r.w.len > MAXWORDLEN) {
- fprintf(stderr, "dbh_loadfile: bad file format\n");
- goto bail_fuc;
- }
- dbtext_table_setsize(ptable, pos + 1);
- ptable->pitems[pos++] = r;
- ptable->nitems = pos;
- }
- pbegin = pend + 1;
- }
-
- if (rdonly) {
-#ifndef NOLOCK
- lock.l_type = F_UNLCK;
- fcntl(ptable->fd, F_SETLKW, &lock);
-#endif /* ndef NOLOCK */
- close(ptable->fd);
- ptable->fd = -1;
- }
- return (dbt_t *) ptable;
-
-bail_fuc:
- free(ptable->pbuf);
-
-bail_uc:
-#ifndef NOLOCK
- lock.l_type = F_UNLCK;
- fcntl(ptable->fd, F_SETLKW, &lock);
-#endif /* ndef NOLOCK */
-
- close(ptable->fd);
- ptable->fd = -1;
-
-bail:
- free(ptable);
- return NULL;
-}
-
-bool_t
-dbtext_table_close(dbttext_t * pthis)
-{
- struct flock lockall;
-
- free(pthis->pbuf);
- pthis->pbuf = NULL;
- free(pthis->pitems);
- pthis->pitems = NULL;
-
- if (pthis->fd != -1) {
-#ifndef NOLOCK
- memset(&lockall, 0, sizeof(lockall));
- lockall.l_type = F_UNLCK;
- lockall.l_start = 0;
- lockall.l_whence = SEEK_SET;
- lockall.l_len = 0;
- fcntl(pthis->fd, F_SETLKW, &lockall);
-#endif /* ndef NOLOCK */
- close(pthis->fd);
- pthis->fd = -1;
- }
- return true;
-}
-
-bool_t
-dbtext_table_mergeclose(dbttext_t * pthis, vec_t * pmsg)
-{
- /* note that we require both vectors to be sorted */
-
- uint pos;
- rec_t *prec;
- veciter_t msgiter;
- str_t *pmsgstr;
- uint count;
- char iobuf[IOBUFSIZE];
- char *p;
-
- if (pthis->fd == -1) {
- return false;
- }
- ftruncate(pthis->fd, 0);
- lseek(pthis->fd, 0, SEEK_SET);
-
- pthis->nmsgs++;
-
- p = iobuf;
- p += sprintf(p, BOGOFILTER_HEADER, pthis->nmsgs);
-
- vec_first(pmsg, &msgiter);
- pmsgstr = veciter_get(&msgiter);
-
- pos = 0;
- while (pos < pthis->nitems || pmsgstr != NULL) {
- int cmp = 0;
-
- prec = &pthis->pitems[pos];
- if (pmsgstr != NULL && pos < pthis->nitems) {
- cmp = str_casecmp(&prec->w, pmsgstr);
- } else {
- /* we exhausted one list or the other (but not both) */
- cmp = (pos < pthis->nitems) ? -1 : 1;
- }
- if (cmp < 0) {
- /* write existing str */
- count = prec->n;
- strncpylwr(p, prec->w.p, prec->w.len);
- p += prec->w.len;
- *p++ = ' ';
- p += sprintf(p, "%u\n", count);
-
- pos++;
- } else if (cmp == 0) {
- /* same str, merge and write sum */
- count = db_getnewcount(&msgiter);
- count += prec->n;
- strncpylwr(p, prec->w.p, prec->w.len);
- p += prec->w.len;
- *p++ = ' ';
- p += sprintf(p, "%u\n", count);
-
- pos++;
- veciter_next(&msgiter);
- pmsgstr = veciter_get(&msgiter);
- } else { /* cmp > 0 */
- /* write new str */
- count = db_getnewcount(&msgiter);
- strncpylwr(p, pmsgstr->p, pmsgstr->len);
- p += pmsgstr->len;
- *p++ = ' ';
- p += sprintf(p, "%u\n", count);
-
- veciter_next(&msgiter);
- pmsgstr = veciter_get(&msgiter);
- }
-
- if (p + TEXTDB_MAXLINELEN > (iobuf + 1)) {
- write(pthis->fd, iobuf, p - iobuf);
- p = iobuf;
- }
- }
- if (p != iobuf) {
- write(pthis->fd, iobuf, p - iobuf);
- }
- veciter_destroy(&msgiter);
- return dbtext_table_close(pthis);
-}
-
-bool_t
-dbtext_table_unmergeclose(dbttext_t * pthis, vec_t * pmsg)
-{
- /* note that we require both vectors to be sorted */
-
- uint pos;
- rec_t *prec;
- veciter_t msgiter;
- str_t *pmsgstr;
- uint count;
- char iobuf[IOBUFSIZE];
- char *p;
-
- if (pthis->fd == -1) {
- return false;
- }
- ftruncate(pthis->fd, 0);
- lseek(pthis->fd, 0, SEEK_SET);
-
- pthis->nmsgs--;
-
- p = iobuf;
- p += sprintf(p, BOGOFILTER_HEADER, pthis->nmsgs);
-
- vec_first(pmsg, &msgiter);
- pmsgstr = veciter_get(&msgiter);
-
- pos = 0;
- while (pos < pthis->nitems || pmsgstr != NULL) {
- int cmp = 0;
-
- prec = &pthis->pitems[pos];
- if (pmsgstr != NULL && pos < pthis->nitems) {
- cmp = str_casecmp(&prec->w, pmsgstr);
- } else {
- /* we exhausted one list or the other (but not both) */
- cmp = (pos < pthis->nitems) ? -1 : 1;
- }
- if (cmp < 0) {
- /* write existing str */
- count = prec->n;
- strncpylwr(p, prec->w.p, prec->w.len);
- p += prec->w.len;
- *p++ = ' ';
- p += sprintf(p, "%u\n", count);
-
- pos++;
- } else if (cmp == 0) {
- /* same str, merge and write difference */
- count = db_getnewcount(&msgiter);
- count = (prec->n > count) ? (prec->n - count) : 0;
- strncpylwr(p, prec->w.p, prec->w.len);
- p += prec->w.len;
- *p++ = ' ';
- p += sprintf(p, "%u\n", count);
-
- pos++;
- veciter_next(&msgiter);
- pmsgstr = veciter_get(&msgiter);
- } else { /* cmp > 0 */
- /* this should not happen, so write with count=0 */
- db_getnewcount(&msgiter);
- count = 0;
- strncpylwr(p, pmsgstr->p, pmsgstr->len);
- p += pmsgstr->len;
- *p++ = ' ';
- p += sprintf(p, "%u\n", count);
-
- veciter_next(&msgiter);
- pmsgstr = veciter_get(&msgiter);
- }
-
- if (p + TEXTDB_MAXLINELEN > (iobuf + 1)) {
- write(pthis->fd, iobuf, p - iobuf);
- p = iobuf;
- }
- }
- if (p != iobuf) {
- write(pthis->fd, iobuf, p - iobuf);
- }
- veciter_destroy(&msgiter);
- return dbtext_table_close(pthis);
-}
-
-uint
-dbtext_table_getmsgcount(dbttext_t * pthis)
-{
- return pthis->nmsgs;
-}
-
-uint
-dbtext_table_getcount(dbttext_t * pthis, str_t * pword)
-{
- int lo, hi, mid;
-
- if (pthis->nitems == 0) {
- return 0;
- }
- hi = pthis->nitems - 1;
- lo = -1;
- while (hi - lo > 1) {
- mid = (hi + lo) / 2;
- if (str_casecmp(pword, &pthis->pitems[mid].w) <= 0)
- hi = mid;
- else
- lo = mid;
- }
-
- if (str_casecmp(pword, &pthis->pitems[hi].w) != 0) {
- return 0;
- }
- return pthis->pitems[hi].n;
-}
(DIR) diff --git a/dbtext.h b/dbtext.h
@@ -1,49 +0,0 @@
-/* $Id: dbtext.h,v 1.3 2002/10/02 04:45:40 tommy Exp $ */
-
-/*
- * Copyright (c) 2002 Tom Marshall <tommy@tig-grr.com>
- *
- * This program is free software. It may be distributed under the terms
- * in the file LICENSE, found in the top level of the distribution.
- */
-
-#ifndef _DBTEXT_H
-#define _DBTEXT_H
-
-typedef struct _dbttext dbttext_t;
-struct _dbttext
-{
- bool_t (*close)(dbttext_t*);
- bool_t (*mergeclose)(dbttext_t*,vec_t*);
- bool_t (*unmergeclose)(dbttext_t*,vec_t*);
- uint (*getmsgcount)(dbttext_t*);
- uint (*getcount)(dbttext_t*,str_t*);
-
- int fd; /* file descriptor, if currently open */
- char* pbuf; /* data buffer, if currently open */
- uint nmsgs; /* number of messages represented in list */
- uint nalloc; /* items alloced in pitems */
- uint nitems; /* items available */
- rec_t* pitems; /* growing vector of items */
-};
-
-typedef struct _dbhtext dbhtext_t;
-struct _dbhtext
-{
- bool_t (*close)(dbhtext_t*);
- dbt_t* (*opentable)(dbhtext_t*,cpchar,bool_t);
-
- char* dir;
-};
-
-dbh_t* dbtext_db_open( cpchar dbhost, cpchar dbname, cpchar dbuser, cpchar dbpass );
-bool_t dbtext_db_close( dbhtext_t* pthis );
-dbt_t* dbtext_db_opentable( dbhtext_t* pthis, cpchar table, bool_t rdonly );
-
-bool_t dbtext_table_close( dbttext_t* pthis );
-bool_t dbtext_table_mergeclose( dbttext_t* pthis, vec_t* pmsg );
-bool_t dbtext_table_unmergeclose( dbttext_t* pthis, vec_t* pmsg );
-uint dbtext_table_getmsgcount( dbttext_t* pthis );
-uint dbtext_table_getcount( dbttext_t* pthis, str_t* pword );
-
-#endif /* ndef _DBTEXT_H */
(DIR) diff --git a/filt.h b/filt.h
@@ -10,17 +10,21 @@
#ifndef _FILT_H
#define _FILT_H
+#include "lex.h"
+#include "str.h"
+#include "vec.h"
+
typedef struct
{
- str_t key;
- double prob;
+ str_t key; /* NOTE(review): presumably the token text -- confirm against filt.c */
+ double prob; /* NOTE(review): presumably the probability assigned to key -- confirm */
} discrim_t;
typedef struct
{
- double spamicity;
- uint keepers;
- discrim_t* extrema;
+ double spamicity; /* NOTE(review): presumably the overall spam score of a message -- confirm */
+ uint keepers; /* number of entries in extrema (main() allocates keepers * sizeof(discrim_t)) */
+ discrim_t* extrema; /* array of keepers entries, allocated in bmf.c main() */
} stats_t;
void statdump( stats_t* pstat, int fd );