/*
 * Context-format patch against five files: dbinc/os.h, os/os_rw.c (two
 * successive diffs), os/os_seek.c, os_win32/os_rw.c and os_win32/os_seek.c.
 *
 * What the hunks below do:
 *
 * dbinc/os.h
 *	Adds three u_int32_t fields to the file-handle structure -- pgno,
 *	pgsize and offset, labelled "Last seek" -- and moves/re-comments
 *	the pagesize field.
 *
 * os/os_rw.c (first diff, 11.24 -> 11.25)
 *	Adds a DB_ASSERT at the top of three I/O routines checking that the
 *	handle has DB_FH_VALID set and that its fd is not -1.
 *
 * os/os_rw.c (second diff) and os_win32/os_rw.c
 *	When HAVE_FILESYSTEM_NOTZERO is defined and __os_fs_notzero()
 *	returns non-zero:
 *	- the DB_IO_WRITE case jumps to the "slow" label instead of doing
 *	  the direct pwrite()/WriteFile() call;
 *	- the write routine calls the new static __os_zerofill() before
 *	  handing off to the new static __os_physwrite(), which now holds
 *	  the write loop.
 *	__os_zerofill() computes the target offset as
 *	pgno * pgsize + offset from the handle, gets the current file size
 *	from __os_ioinfo(), and returns 0 at once if the file already
 *	reaches that offset.  Otherwise it seeks to the current end of file,
 *	writes zero-filled buffers (an 8KB stack buffer, or a 64KB
 *	__os_calloc'd buffer when the gap exceeds ZF_LARGE_WRITE) up to the
 *	target offset, calls __os_fsync() before the final chunk when it is
 *	not the first chunk and again after the loop, and then seeks back
 *	to the target offset.  The heap buffer is released at the err label.
 *	The POSIX version of __os_physwrite() additionally asserts, under
 *	DIAGNOSTIC, that the current lseek() position is not past st_size.
 *
 * os/os_seek.c and os_win32/os_seek.c
 *	On a successful seek, store pgsize, pageno and relative into the
 *	handle's pgsize, pgno and offset fields; the error message path is
 *	unchanged.
 *
 * NOTE(review): the "#include" lines in the second os/os_rw.c diff carry
 * no header names; they look like they were stripped during extraction
 * (the added code uses struct stat and fstat()) -- restore them from the
 * original patch before use.
 * NOTE(review): line breaks and indentation of the patch have been
 * collapsed, so hunk context presumably will not match the target files
 * as-is -- confirm against a pristine copy.
 * NOTE(review): the seek hunks record pgno/pgsize/offset without looking
 * at db_whence, while __os_zerofill() treats them as an absolute offset;
 * presumably callers only reach the write path after DB_OS_SEEK_SET
 * seeks -- verify against the callers.
 */
*** dbinc/os.h.orig 2002/03/27 04:34:55 11.14 --- dbinc/os.h 2002/09/26 18:10:10 *************** *** 22,29 **** int fd; /* POSIX file descriptor. */ char *name; /* File name. */ u_int32_t log_size; /* XXX: Log file size. */ ! u_int32_t pagesize; /* XXX: Page size. */ #define DB_FH_NOSYNC 0x01 /* Handle doesn't need to be sync'd. */ #define DB_FH_UNLINK 0x02 /* Unlink on close */ --- 22,34 ---- int fd; /* POSIX file descriptor. */ char *name; /* File name. */ + u_int32_t pagesize; /* Underlying page size. */ + u_int32_t log_size; /* XXX: Log file size. */ ! ! u_int32_t pgno; /* Last seek. */ ! u_int32_t pgsize; ! u_int32_t offset; #define DB_FH_NOSYNC 0x01 /* Handle doesn't need to be sync'd. */ #define DB_FH_UNLINK 0x02 /* Unlink on close */ *** os/os_rw.c.orig 2002/07/12 18:56:52 11.24 --- os/os_rw.c 2002/09/16 20:46:14 11.25 *************** *** 35,40 **** --- 35,43 ---- { int ret; + /* Check for illegal usage. */ + DB_ASSERT(F_ISSET(db_iop->fhp, DB_FH_VALID) && db_iop->fhp->fd != -1); + #if defined(HAVE_PREAD) && defined(HAVE_PWRITE) switch (op) { case DB_IO_READ: *************** *** 95,100 **** --- 98,106 ---- int ret; u_int8_t *taddr; + /* Check for illegal usage. */ + DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1); + for (taddr = addr, offset = 0; offset < len; taddr += nr, offset += nr) { retry: if ((nr = DB_GLOBAL(j_read) != NULL ? *************** *** 131,136 **** --- 137,145 ---- ssize_t nw; int ret; u_int8_t *taddr; + + /* Check for illegal usage. 
*/ + DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1); for (taddr = addr, offset = 0; offset < len; taddr += nw, offset += nw) *** os/os_rw.c.orig 2002/09/16 20:46:14 11.25 --- os/os_rw.c 2002/09/26 18:10:20 *************** *** 13,18 **** --- 13,19 ---- #ifndef NO_SYSTEM_INCLUDES #include + #include #include #include *************** *** 20,25 **** --- 21,31 ---- #include "db_int.h" + #ifdef HAVE_FILESYSTEM_NOTZERO + static int __os_zerofill __P((DB_ENV *, DB_FH *)); + #endif + static int __os_physwrite __P((DB_ENV *, DB_FH *, void *, size_t, size_t *)); + /* * __os_io -- * Do an I/O. *************** *** 49,54 **** --- 55,64 ---- case DB_IO_WRITE: if (DB_GLOBAL(j_write) != NULL) goto slow; + #ifdef HAVE_FILESYSTEM_NOTZERO + if (__os_fs_notzero()) + goto slow; + #endif *niop = pwrite(db_iop->fhp->fd, db_iop->buf, db_iop->bytes, (off_t)db_iop->pgno * db_iop->pagesize); break; *************** *** 133,145 **** size_t len; size_t *nwp; { size_t offset; ssize_t nw; int ret; u_int8_t *taddr; ! /* Check for illegal usage. */ ! DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1); for (taddr = addr, offset = 0; offset < len; taddr += nw, offset += nw) --- 143,189 ---- size_t len; size_t *nwp; { + /* Check for illegal usage. */ + DB_ASSERT(F_ISSET(fhp, DB_FH_VALID) && fhp->fd != -1); + + #ifdef HAVE_FILESYSTEM_NOTZERO + /* Zero-fill as necessary. */ + if (__os_fs_notzero()) { + int ret; + if ((ret = __os_zerofill(dbenv, fhp)) != 0) + return (ret); + } + #endif + return (__os_physwrite(dbenv, fhp, addr, len, nwp)); + } + + /* + * __os_physwrite -- + * Physical write to a file handle. + */ + static int + __os_physwrite(dbenv, fhp, addr, len, nwp) + DB_ENV *dbenv; + DB_FH *fhp; + void *addr; + size_t len; + size_t *nwp; + { size_t offset; ssize_t nw; int ret; u_int8_t *taddr; ! #if defined(HAVE_FILESYSTEM_NOTZERO) && defined(DIAGNOSTIC) ! if (__os_fs_notzero()) { ! struct stat sb; ! off_t cur_off; ! ! DB_ASSERT(fstat(fhp->fd, &sb) != -1 && ! 
(cur_off = lseek(fhp->fd, (off_t)0, SEEK_CUR)) != -1 && ! cur_off <= sb.st_size); ! } ! #endif for (taddr = addr, offset = 0; offset < len; taddr += nw, offset += nw) *************** *** 155,157 **** --- 199,288 ---- *nwp = len; return (0); } + + #ifdef HAVE_FILESYSTEM_NOTZERO + /* + * __os_zerofill -- + * Zero out bytes in the file. + * + * Pages allocated by writing pages past end-of-file are not zeroed, + * on some systems. Recovery could theoretically be fooled by a page + * showing up that contained garbage. In order to avoid this, we + * have to write the pages out to disk, and flush them. The reason + * for the flush is because if we don't sync, the allocation of another + * page subsequent to this one might reach the disk first, and if we + * crashed at the right moment, leave us with this page as the one + * allocated by writing a page past it in the file. + */ + static int + __os_zerofill(dbenv, fhp) + DB_ENV *dbenv; + DB_FH *fhp; + { + off_t stat_offset, write_offset; + size_t blen, nw; + u_int32_t bytes, mbytes; + int group_sync, need_free, ret; + u_int8_t buf[8 * 1024], *bp; + + /* Calculate the byte offset of the next write. */ + write_offset = (off_t)fhp->pgno * fhp->pgsize + fhp->offset; + + /* Stat the file. */ + if ((ret = __os_ioinfo(dbenv, NULL, fhp, &mbytes, &bytes, NULL)) != 0) + return (ret); + stat_offset = (off_t)mbytes * MEGABYTE + bytes; + + /* Check if the file is large enough. */ + if (stat_offset >= write_offset) + return (0); + + /* Get a large buffer if we're writing lots of data. */ + #undef ZF_LARGE_WRITE + #define ZF_LARGE_WRITE (64 * 1024) + if (write_offset - stat_offset > ZF_LARGE_WRITE) { + if ((ret = __os_calloc(dbenv, 1, ZF_LARGE_WRITE, &bp)) != 0) + return (ret); + blen = ZF_LARGE_WRITE; + need_free = 1; + } else { + bp = buf; + blen = sizeof(buf); + need_free = 0; + memset(buf, 0, sizeof(buf)); + } + + /* Seek to the current end of the file. 
*/ + if ((ret = __os_seek( + dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET)) != 0) + goto err; + + /* + * Hash is the only access method that allocates groups of pages. Hash + * uses the existence of the last page in a group to signify the entire + * group is OK; so, write all the pages but the last one in the group, + * flush them to disk, then write the last one to disk and flush it. + */ + for (group_sync = 0; stat_offset < write_offset; group_sync = 1) { + if (write_offset - stat_offset <= blen) { + blen = (size_t)(write_offset - stat_offset); + if (group_sync && (ret = __os_fsync(dbenv, fhp)) != 0) + goto err; + } + if ((ret = __os_physwrite(dbenv, fhp, bp, blen, &nw)) != 0) + goto err; + stat_offset += blen; + } + if ((ret = __os_fsync(dbenv, fhp)) != 0) + goto err; + + /* Seek back to where we started. */ + mbytes = (u_int32_t)(write_offset / MEGABYTE); + bytes = (u_int32_t)(write_offset % MEGABYTE); + ret = __os_seek(dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET); + + err: if (need_free) + __os_free(dbenv, bp); + return (ret); + } + #endif *** os/os_seek.c.orig Mon Jul 15 22:03:38 2002 --- os/os_seek.c Thu Sep 26 14:13:52 2002 *************** *** 68,74 **** } while (ret == EINTR); } ! if (ret != 0) __db_err(dbenv, "seek: %lu %d %d: %s", (u_long)pgsize * pageno + relative, isrewind, db_whence, strerror(ret)); --- 68,78 ---- } while (ret == EINTR); } ! if (ret == 0) { ! fhp->pgsize = pgsize; ! fhp->pgno = pageno; ! fhp->offset = relative; ! } else __db_err(dbenv, "seek: %lu %d %d: %s", (u_long)pgsize * pageno + relative, isrewind, db_whence, strerror(ret)); *** os_win32/os_rw.c.orig 2002/08/06 04:56:19 11.28 --- os_win32/os_rw.c 2002/09/26 18:10:20 *************** *** 20,25 **** --- 20,30 ---- #include "db_int.h" + #ifdef HAVE_FILESYSTEM_NOTZERO + static int __os_zerofill __P((DB_ENV *, DB_FH *)); + #endif + static int __os_physwrite __P((DB_ENV *, DB_FH *, void *, size_t, size_t *)); + /* * __os_io -- * Do an I/O. 
*************** *** 54,59 **** --- 59,68 ---- case DB_IO_WRITE: if (DB_GLOBAL(j_write) != NULL) goto slow; + #ifdef HAVE_FILESYSTEM_NOTZERO + if (__os_fs_notzero()) + goto slow; + #endif if (!WriteFile(db_iop->fhp->handle, db_iop->buf, (DWORD)db_iop->bytes, &nbytes, &over)) goto slow; *************** *** 149,154 **** --- 158,185 ---- size_t len; size_t *nwp; { + int ret; + + #ifdef HAVE_FILESYSTEM_NOTZERO + /* Zero-fill as necessary. */ + if (__os_fs_notzero() && (ret = __os_zerofill(dbenv, fhp)) != 0) + return (ret); + #endif + return (__os_physwrite(dbenv, fhp, addr, len, nwp)); + } + + /* + * __os_physwrite -- + * Physical write to a file handle. + */ + static int + __os_physwrite(dbenv, fhp, addr, len, nwp) + DB_ENV *dbenv; + DB_FH *fhp; + void *addr; + size_t len; + size_t *nwp; + { size_t offset; DWORD nw; int ret; *************** *** 180,182 **** --- 211,300 ---- *nwp = len; return (0); } + + #ifdef HAVE_FILESYSTEM_NOTZERO + /* + * __os_zerofill -- + * Zero out bytes in the file. + * + * Pages allocated by writing pages past end-of-file are not zeroed, + * on some systems. Recovery could theoretically be fooled by a page + * showing up that contained garbage. In order to avoid this, we + * have to write the pages out to disk, and flush them. The reason + * for the flush is because if we don't sync, the allocation of another + * page subsequent to this one might reach the disk first, and if we + * crashed at the right moment, leave us with this page as the one + * allocated by writing a page past it in the file. + */ + static int + __os_zerofill(dbenv, fhp) + DB_ENV *dbenv; + DB_FH *fhp; + { + unsigned __int64 stat_offset, write_offset; + size_t blen, nw; + u_int32_t bytes, mbytes; + int group_sync, need_free, ret; + u_int8_t buf[8 * 1024], *bp; + + /* Calculate the byte offset of the next write. */ + write_offset = (unsigned __int64)fhp->pgno * fhp->pgsize + fhp->offset; + + /* Stat the file. 
*/ + if ((ret = __os_ioinfo(dbenv, NULL, fhp, &mbytes, &bytes, NULL)) != 0) + return (ret); + stat_offset = (unsigned __int64)mbytes * MEGABYTE + bytes; + + /* Check if the file is large enough. */ + if (stat_offset >= write_offset) + return (0); + + /* Get a large buffer if we're writing lots of data. */ + #undef ZF_LARGE_WRITE + #define ZF_LARGE_WRITE (64 * 1024) + if (write_offset - stat_offset > ZF_LARGE_WRITE) { + if ((ret = __os_calloc(dbenv, 1, ZF_LARGE_WRITE, &bp)) != 0) + return (ret); + blen = ZF_LARGE_WRITE; + need_free = 1; + } else { + bp = buf; + blen = sizeof(buf); + need_free = 0; + memset(buf, 0, sizeof(buf)); + } + + /* Seek to the current end of the file. */ + if ((ret = __os_seek( + dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET)) != 0) + goto err; + + /* + * Hash is the only access method that allocates groups of pages. Hash + * uses the existence of the last page in a group to signify the entire + * group is OK; so, write all the pages but the last one in the group, + * flush them to disk, then write the last one to disk and flush it. + */ + for (group_sync = 0; stat_offset < write_offset; group_sync = 1) { + if (write_offset - stat_offset <= blen) { + blen = (size_t)(write_offset - stat_offset); + if (group_sync && (ret = __os_fsync(dbenv, fhp)) != 0) + goto err; + } + if ((ret = __os_physwrite(dbenv, fhp, bp, blen, &nw)) != 0) + goto err; + stat_offset += blen; + } + if ((ret = __os_fsync(dbenv, fhp)) != 0) + goto err; + + /* Seek back to where we started. */ + mbytes = (u_int32_t)(write_offset / MEGABYTE); + bytes = (u_int32_t)(write_offset % MEGABYTE); + ret = __os_seek(dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET); + + err: if (need_free) + __os_free(dbenv, bp); + return (ret); + } + #endif *** os_win32/os_seek.c.orig 2002/08/06 04:56:20 11.17 --- os_win32/os_seek.c 2002/09/26 18:10:20 *************** *** 79,88 **** __os_win32_errno() : 0; } ! 
if (ret != 0) __db_err(dbenv, "seek: %lu %d %d: %s", (u_long)pgsize * pageno + relative, isrewind, db_whence, strerror(ret)); return (ret); } --- 79,93 ---- __os_win32_errno() : 0; } ! if (ret == 0) { ! fhp->pgsize = pgsize; ! fhp->pgno = pageno; ! fhp->offset = relative; ! } else { __db_err(dbenv, "seek: %lu %d %d: %s", (u_long)pgsize * pageno + relative, isrewind, db_whence, strerror(ret)); + } return (ret); } .