tventi: new icache - plan9port - [fork] Plan 9 from user space
 (HTM) git clone git://src.adamsgaard.dk/plan9port
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 7a400ee957a0815287af806e18ef90dd18b47f82
 (DIR) parent 25a4e89fa907ed5a5f5d84eccfb66180007d9c68
 (HTM) Author: Russ Cox <rsc@swtch.com>
       Date:   Tue, 25 Sep 2007 09:47:31 -0400
       
       venti: new icache
       
       Diffstat:
         M src/cmd/venti/srv/arena.c           |     195 ++++++++++++++++++++++++++++++-
         M src/cmd/venti/srv/checkindex.c      |      11 ++++++-----
         M src/cmd/venti/srv/conv.c            |       8 ++++----
         M src/cmd/venti/srv/dat.h             |      46 +++++++++++++++++++++++--------
         M src/cmd/venti/srv/fns.h             |      11 +++++++----
         M src/cmd/venti/srv/hdisk.c           |      10 +++++-----
         M src/cmd/venti/srv/httpd.c           |       8 +++++++-
         M src/cmd/venti/srv/icache.c          |     708 +++++++++++++++++++------------
         M src/cmd/venti/srv/icachewrite.c     |      92 +++++++++++++++++--------------
         M src/cmd/venti/srv/ifile.c           |      35 ++++++++++++++++++++-----------
         M src/cmd/venti/srv/index.c           |      19 +++++++++++++++++++
         M src/cmd/venti/srv/lump.c            |      50 +++++--------------------------
         M src/cmd/venti/srv/stats.c           |       6 ++++++
         M src/cmd/venti/srv/syncindex.c       |       8 +-------
         M src/cmd/venti/srv/syncindex0.c      |       2 +-
         M src/cmd/venti/srv/venti.c           |      21 ++++++++-------------
         M src/cmd/venti/srv/www/stats.js      |      10 ++++++++++
       
       17 files changed, 813 insertions(+), 427 deletions(-)
       ---
 (DIR) diff --git a/src/cmd/venti/srv/arena.c b/src/cmd/venti/srv/arena.c
       t@@ -16,6 +16,7 @@ static int        loadarena(Arena *arena);
        static CIBlock        *getcib(Arena *arena, int clump, int writing, CIBlock *rock);
        static void        putcib(Arena *arena, CIBlock *cib);
        static void        sumproc(void *);
       +static void loadcig(Arena *arena);
        
        static QLock        sumlock;
        static Rendez        sumwait;
       t@@ -137,14 +138,23 @@ readclumpinfos(Arena *arena, int clump, ClumpInfo *cis, int n)
                CIBlock *cib, r;
                int i;
        
       -        for(i = 0; i < n; i++){
       +        /*
       +         * because the clump blocks are laid out
       +         * in reverse order at the end of the arena,
       +         * it can be a few percent faster to read
       +         * the clumps backwards, which reads the
       +         * disk blocks forwards.
       +         */
       +        for(i = n-1; i >= 0; i--){
                        cib = getcib(arena, clump + i, 0, &r);
       -                if(cib == nil)
       -                        break;
       +                if(cib == nil){
       +                        n = i;
       +                        continue;
       +                }
                        unpackclumpinfo(&cis[i], &cib->data->data[cib->offset]);
                        putcib(arena, cib);
                }
       -        return i;
       +        return n;
        }
        
        /*
       t@@ -349,7 +359,28 @@ writeaclump(Arena *arena, Clump *c, u8int *clbuf, u64int start, u64int *pa)
                if(c->info.size < c->info.uncsize)
                        arena->memstats.cclumps++;
        
       -        clump = arena->memstats.clumps++;
       +        clump = arena->memstats.clumps;
       +        if(clump % ArenaCIGSize == 0){
       +                if(arena->cig == nil){
       +                        loadcig(arena);
       +                        if(arena->cig == nil)
       +                                goto NoCIG;
       +                }
       +                /* add aa as start of next cig */
       +                if(clump/ArenaCIGSize != arena->ncig){
       +                        fprint(2, "bad arena cig computation %s: writing clump %d but %d cigs\n",
       +                                arena->name, clump, arena->ncig);
       +                        arena->ncig = -1;
       +                        vtfree(arena->cig);
       +                        arena->cig = nil;
       +                        goto NoCIG;
       +                }
       +                arena->cig = vtrealloc(arena->cig, (arena->ncig+1)*sizeof arena->cig[0]);
       +                arena->cig[arena->ncig++].offset = aa;
       +        }
       +NoCIG:
       +        arena->memstats.clumps++;
       +
                if(arena->memstats.clumps == 0)
                        sysfatal("clumps wrapped");
                arena->wtime = now();
       t@@ -752,3 +783,157 @@ putcib(Arena *arena, CIBlock *cib)
                putdblock(cib->data);
                cib->data = nil;
        }
       +
       +
       +/*
       + * For index entry readahead purposes, the arenas are 
       + * broken into smaller subpieces, called clump info groups
       + * or cigs.  Each cig has ArenaCIGSize clumps (ArenaCIGSize
       + * is chosen to make the index entries take up about half
       + * a megabyte).  The index entries do not contain enough
       + * information to determine what the clump index is for
       + * a given address in an arena.  That info is needed both for
       + * figuring out which clump group an address belongs to 
       + * and for prefetching a clump group's index entries from
       + * the arena table of contents.  The first time clump groups
       + * are accessed, we scan the entire arena table of contents
       + * (which might be 10s of megabytes), recording the data 
       + * offset of each clump group.
       + */
       +
       +/* 
       + * load clump info group information by scanning entire toc.
       + */
       +static void
       +loadcig(Arena *arena)
       +{
       +        u32int i, j, ncig, nci;
       +        ArenaCIG *cig;
       +        ClumpInfo *ci;
       +        u64int offset;
       +        int ms;
       +
       +        if(arena->cig || arena->ncig < 0)
       +                return;
       +
       +//        fprint(2, "loadcig %s\n", arena->name);
       +        
       +        ncig = (arena->memstats.clumps+ArenaCIGSize-1) / ArenaCIGSize;
       +        if(ncig == 0){
       +                arena->cig = vtmalloc(1);
       +                arena->ncig = 0;
       +                return;
       +        }
       +
       +        ms = msec();
       +        cig = vtmalloc(ncig*sizeof cig[0]);
       +        ci = vtmalloc(ArenaCIGSize*sizeof ci[0]);
       +        offset = 0;
       +        for(i=0; i<ncig; i++){
       +                nci = readclumpinfos(arena, i*ArenaCIGSize, ci, ArenaCIGSize);
       +                cig[i].offset = offset;
       +                for(j=0; j<nci; j++)
       +                        offset += ClumpSize + ci[j].size;
       +                if(nci < ArenaCIGSize){
       +                        if(i != ncig-1){
       +                                vtfree(ci);
       +                                vtfree(cig);
       +                                arena->ncig = -1;
       +                                fprint(2, "loadcig %s: got %ud cigs, expected %ud\n", arena->name, i+1, ncig);
       +                                goto out;
       +                        }
       +                }
       +        }
       +        vtfree(ci);
       +        
       +        arena->ncig = ncig;
       +        arena->cig = cig;
       +
       +out:
       +        ms = msec() - ms;
       +        addstat2(StatCigLoad, 1, StatCigLoadTime, ms);
       +}
       +
       +/*
       + * convert arena address into arena group + data boundaries.
       + */
       +int
       +arenatog(Arena *arena, u64int addr, u64int *gstart, u64int *glimit, int *g)
       +{
       +        int r, l, m;
       +
       +        qlock(&arena->lock);
       +        if(arena->cig == nil)
       +                loadcig(arena);
       +        if(arena->cig == nil || arena->ncig == 0){
       +                qunlock(&arena->lock);
       +                return -1;
       +        }
       +
       +        l = 1;
       +        r = arena->ncig - 1;
       +        while(l <= r){
       +                m = (r + l) / 2;
       +                if(arena->cig[m].offset <= addr)
       +                        l = m + 1;
       +                else
       +                        r = m - 1;
       +        }
       +        l--;
       +
       +        *g = l;
       +        *gstart = arena->cig[l].offset;
       +        if(l+1 < arena->ncig)
       +                *glimit = arena->cig[l+1].offset;
       +        else
       +                *glimit = arena->memstats.used;
       +        qunlock(&arena->lock);
       +        return 0;
       +}
       +
       +/*
       + * load the clump info for group g into the index entries.
       + */
       +int
       +asumload(Arena *arena, int g, IEntry *entries, int nentries)
       +{
       +        int i, base, limit;
       +        u64int addr;
       +        ClumpInfo ci;
       +        IEntry *ie;
       +
       +        if(nentries < ArenaCIGSize){
       +                fprint(2, "asking for too few entries\n");
       +                return -1;
       +        }
       +        
       +        qlock(&arena->lock);
       +        if(arena->cig == nil)
       +                loadcig(arena);
       +        if(arena->cig == nil || arena->ncig == 0 || g >= arena->ncig){
       +                qunlock(&arena->lock);
       +                return -1;
       +        }
       +        
       +        addr = 0;
       +        base = g*ArenaCIGSize;
       +        limit = base + ArenaCIGSize;
       +        if(base > arena->memstats.clumps)
       +                base = arena->memstats.clumps;
       +        ie = entries;
       +        for(i=base; i<limit; i++){
       +                if(readclumpinfo(arena, i, &ci) < 0)
       +                        break;
       +                if(ci.type != VtCorruptType){
       +                        scorecp(ie->score, ci.score);
       +                        ie->ia.type = ci.type;
       +                        ie->ia.size = ci.uncsize;
       +                        ie->ia.blocks = (ci.size + ClumpSize + (1<<ABlockLog) - 1) >> ABlockLog;
       +                        ie->ia.addr = addr;
       +                        ie++;
       +                }
       +                addr += ClumpSize + ci.size;
       +        }
       +        qunlock(&arena->lock);
       +        return ie - entries;
       +}
 (DIR) diff --git a/src/cmd/venti/srv/checkindex.c b/src/cmd/venti/srv/checkindex.c
       t@@ -8,7 +8,7 @@ static void
        phdr(DBlock *eb)
        {
                static int did;
       -        
       +
                if(!did){
                        did = 1;
                        print("# diff actual correct\n");
       t@@ -168,7 +168,7 @@ checkbloom(Bloom *b1, Bloom *b2, int fix)
        {
                u32int *a1, *a2;
                int i, n, extra, missing;
       -        
       +
                if(b1==nil && b2==nil)
                        return 0;
                if(b1==nil || b2==nil){
       t@@ -188,13 +188,14 @@ checkbloom(Bloom *b1, Bloom *b2, int fix)
                missing = 0;
                for(i=BloomHeadSize/4; i<n; i++){
                        if(a1[i] != a2[i]){
       -print("%.8ux/%.8ux.", a1[i], a2[i]);
       -                        extra += countbits(a1[i] & ~a2[i]);
       +// print("%.8ux/%.8ux.", a1[i], a2[i]);
       +                        extra   += countbits(a1[i] & ~a2[i]);
                                missing += countbits(a2[i] & ~a1[i]);
                        }
                }
                if(extra || missing)
       -                fprint(2, "bloom filter: %d spurious bits, %d missing bits\n", extra, missing);
       +                fprint(2, "bloom filter: %d spurious bits, %d missing bits\n",
       +                        extra, missing);
                else
                        fprint(2, "bloom filter: correct\n");
                if(!fix && missing){
 (DIR) diff --git a/src/cmd/venti/srv/conv.c b/src/cmd/venti/srv/conv.c
       t@@ -581,9 +581,9 @@ unpackientry(IEntry *ie, u8int *buf)
        
                scorecp(ie->score, p);
                p += VtScoreSize;
       -        ie->wtime = U32GET(p);
       +        /* ie->wtime = U32GET(p); */
                p += U32Size;
       -        ie->train = U16GET(p);
       +        /* ie->train = U16GET(p); */
                p += U16Size;
                if(p - buf != IEntryAddrOff)
                        sysfatal("unpackentry bad IEntryAddrOff amount");
       t@@ -613,9 +613,9 @@ packientry(IEntry *ie, u8int *buf)
        
                scorecp(p, ie->score);
                p += VtScoreSize;
       -        U32PUT(p, ie->wtime);
       +        U32PUT(p, 0); /* wtime */
                p += U32Size;
       -        U16PUT(p, ie->train);
       +        U16PUT(p, 0); /* train */
                p += U16Size;
                U64PUT(p, ie->ia.addr, t32);
                p += U64Size;
 (DIR) diff --git a/src/cmd/venti/srv/dat.h b/src/cmd/venti/srv/dat.h
       t@@ -3,6 +3,7 @@ typedef struct AMap                AMap;
        typedef struct AMapN                AMapN;
        typedef struct Arena                Arena;
        typedef struct AState        AState;
       +typedef struct ArenaCIG        ArenaCIG;
        typedef struct ArenaHead        ArenaHead;
        typedef struct ArenaPart        ArenaPart;
        typedef struct ArenaTail        ArenaTail;
       t@@ -28,8 +29,10 @@ typedef struct ZBlock                ZBlock;
        typedef struct Round        Round;
        typedef struct Bloom        Bloom;
        
       -#define TWID32        ((u32int)~(u32int)0)
       -#define TWID64        ((u64int)~(u64int)0)
       +#pragma incomplete IEStream
       +
       +#define        TWID32        ((u32int)~(u32int)0)
       +#define        TWID64        ((u64int)~(u64int)0)
        #define        TWID8        ((u8int)~(u8int)0)
        
        enum
       t@@ -44,7 +47,6 @@ enum
                IndexBase                = 1024*1024,        /* initial address to use in an index */
                MaxIo                        = 64*1024,        /* max size of a single read or write operation */
                ICacheBits                = 16,                /* default bits for indexing icache */
       -        ICacheDepth                = 4,                /* default depth of an icache hash chain */
                MaxAMap                        = 2*1024,        /* max. allowed arenas in an address mapping; must be < 32*1024 */
        
                /*
       t@@ -147,6 +149,8 @@ enum
                DirtyArenaCib,
                DirtyArenaTrailer,
                DirtyMax,
       +        
       +        ArenaCIGSize = 10*1024,        // about 0.5 MB worth of IEntry.
        
                VentiZZZZZZZZ
        };
       t@@ -371,6 +375,14 @@ struct Arena
                u32int                ctime;                        /* first time a block was written */
                u32int                wtime;                        /* last time a block was written */
                u32int                clumpmagic;
       +        
       +        ArenaCIG        *cig;
       +        int        ncig;
       +};
       +
       +struct ArenaCIG
       +{
       +        u64int        offset;  // from arena base
        };
        
        /*
       t@@ -505,14 +517,20 @@ struct IAddr
         */
        struct IEntry
        {
       -        u8int                score[VtScoreSize];
       -        IEntry                *next;                        /* next in hash chain */
       -        IEntry                *nextdirty;                 /* next in dirty chain */
       -        u32int                wtime;                        /* last write time */
       -        u16int                train;                        /* relative train containing the most recent ref; 0 if no ref, 1 if in same car */
       -        u8int                rac;                        /* read ahead count */
       -        u8int                dirty;                /* is dirty */
       -        IAddr                ia;
       +        /* on disk data - 32 bytes*/
       +        u8int        score[VtScoreSize];
       +        IAddr        ia;
       +        
       +        IEntry        *nexthash;
       +        IEntry        *nextdirty;
       +        IEntry        *next;
       +        IEntry        *prev;
       +        u8int        state;
       +};
       +enum {
       +        IEClean = 0,
       +        IEDirty = 1,
       +        IESummary = 2,
        };
        
        /*
       t@@ -607,6 +625,9 @@ enum
                StatIcacheFlush,
                StatIcacheStall,
                StatIcacheReadTime,
       +        StatIcacheLookup,
       +        StatScacheHit,
       +        StatScachePrefetch,
        
                StatBloomHit,
                StatBloomMiss,
       t@@ -628,6 +649,9 @@ enum
        
                StatSumRead,
                StatSumReadBytes,
       +        
       +        StatCigLoad,
       +        StatCigLoadTime,
        
                NStat
        };
 (DIR) diff --git a/src/cmd/venti/srv/fns.h b/src/cmd/venti/srv/fns.h
       t@@ -6,8 +6,11 @@ void                addstat(int, int);
        void                addstat2(int, int, int, int);
        ZBlock                *alloczblock(u32int size, int zeroed, uint alignment);
        Arena                *amapitoa(Index *index, u64int a, u64int *aa);
       +Arena                *amapitoag(Index *index, u64int a, u64int *gstart, u64int *glimit, int *g);
        u64int                arenadirsize(Arena *arena, u32int clumps);
       +int                arenatog(Arena *arena, u64int aa, u64int *gstart, u64int *glimit, int *g);
        void                arenaupdate(Arena *arena, u32int size, u8int *score);
       +int                asumload(Arena *arena, int g, IEntry *entries, int maxentries);
        void                backsumarena(Arena *arena);
        void        binstats(long (*fn)(Stats *s0, Stats *s1, void*), void *arg, long t0, long t1, Statbin *bin, int nbin);
        int                bloominit(Bloom*, vlong, uchar*);
       t@@ -64,6 +67,7 @@ int                iaddrcmp(IAddr *ia1, IAddr *ia2);
        IEntry*        icachedirty(u32int, u32int, u64int);
        ulong        icachedirtyfrac(void);
        void                icacheclean(IEntry*);
       +int                icachelookup(u8int *score, int type, IAddr *ia);
        int                ientrycmp(const void *vie1, const void *vie2);
        char                *ifileline(IFile *f);
        int                ifilename(IFile *f, char *dst);
       t@@ -76,7 +80,7 @@ ArenaPart        *initarenapart(Part *part);
        int                initarenasum(void);
        void                initbloomfilter(Index*);
        void                initdcache(u32int mem);
       -void                initicache(int bits, int depth);
       +void                initicache(u32int mem);
        void                initicachewrite(void);
        IEStream        *initiestream(Part *part, u64int off, u64int clumps, u32int size);
        ISect                *initisect(Part *part);
       t@@ -87,7 +91,7 @@ Part*                initpart(char *name, int mode);
        void                initround(Round*, char*, int);
        int                initventi(char *config, Config *conf);
        void                insertlump(Lump *lump, Packet *p);
       -int                insertscore(u8int *score, IAddr *ia, int write);
       +int                insertscore(u8int *score, IAddr *ia, int state);
        void                kickdcache(void);
        void                kickicache(void);
        void                kickround(Round*, int wait);
       t@@ -97,8 +101,7 @@ DBlock        *loadibucket(Index *index, u8int *score, ISect **is, u32int *buck, IBucke
        int                loadientry(Index *index, u8int *score, int type, IEntry *ie);
        void                logerr(int severity, char *fmt, ...);
        Lump                *lookuplump(u8int *score, int type);
       -int                _lookupscore(u8int *score, int type, IAddr *ia, int *rac);
       -int                lookupscore(u8int *score, int type, IAddr *ia, int *rac);
       +int                lookupscore(u8int *score, int type, IAddr *ia);
        int                maparenas(AMap *am, Arena **arenas, int n, char *what);
        void                markbloomfilter(Bloom*, u8int*);
        uint                msec(void);
 (DIR) diff --git a/src/cmd/venti/srv/hdisk.c b/src/cmd/venti/srv/hdisk.c
       t@@ -547,7 +547,7 @@ debugread(HConnect *c, u8int *score)
                Lump *u;
                IAddr ia;
                IEntry ie;
       -        int i, rac;
       +        int i;
                Arena *arena;
                u64int aa;
                ZBlock *zb;
       t@@ -561,7 +561,7 @@ debugread(HConnect *c, u8int *score)
                }
                
                hprint(&c->hout, "<h2>index search %V</h2><pre>\n", score);
       -        if(_lookupscore(score, -1, &ia, nil) < 0)
       +        if(icachelookup(score, -1, &ia) < 0)
                        hprint(&c->hout, "  icache: not found\n");
                else
                        hprint(&c->hout, "  icache: addr=%#llx size=%d type=%d blocks=%d\n",
       t@@ -585,12 +585,12 @@ debugread(HConnect *c, u8int *score)
                                hprint(&c->hout, " -cache");
                        putlump(u);
                        
       -                if(lookupscore(score, type, &ia, &rac) < 0){
       +                if(lookupscore(score, type, &ia) < 0){
                                hprint(&c->hout, " -lookup\n");
                                continue;
                        }
       -                hprint(&c->hout, "\n  lookupscore: addr=%#llx size=%d blocks=%d rac=%d\n",
       -                        ia.addr, ia.size, ia.blocks, rac);
       +                hprint(&c->hout, "\n  lookupscore: addr=%#llx size=%d blocks=%d\n",
       +                        ia.addr, ia.size, ia.blocks);
                        
                        arena = amapitoa(mainindex, ia.addr, &aa);
                        if(arena == nil){
 (DIR) diff --git a/src/cmd/venti/srv/httpd.c b/src/cmd/venti/srv/httpd.c
       t@@ -895,7 +895,7 @@ static char* graphname[] =
        
                "icachehit",
                "icachemiss",
       -        "icachelookup",
       +        "icacheread",
                "icachewrite",
                "icachefill",
                "icacheprefetch",
       t@@ -904,6 +904,9 @@ static char* graphname[] =
                "icacheflush",
                "icachestall",
                "icachelookuptime",
       +        "icachelookup",
       +        "scachehit",
       +        "scacheprefetch",
        
                "bloomhit",
                "bloommiss",
       t@@ -925,6 +928,9 @@ static char* graphname[] =
        
                "sumread",
                "sumreadbyte",
       +        
       +        "cigload",
       +        "cigloadtime",
        };
        
        static int
 (DIR) diff --git a/src/cmd/venti/srv/icache.c b/src/cmd/venti/srv/icache.c
       t@@ -2,236 +2,421 @@
        #include "dat.h"
        #include "fns.h"
        
       +int icacheprefetch = 1;
       +
        typedef struct ICache ICache;
       +typedef struct IHash IHash;
       +typedef struct ISum ISum;
       +
        struct ICache
        {
       -        QLock        lock;                        /* locks hash table & all associated data */
       +        QLock        lock;
                Rendez        full;
       -        IEntry        **heads;                /* heads of all the hash chains */
       -        int        bits;                        /* bits to use for indexing heads */
       -        u32int        size;                        /* number of heads; == 1 << bits, should be < entries */
       -        IEntry        *base;                        /* all allocated hash table entries */
       -        IEntry        *free;
       -        u32int        entries;                /* elements in base */
       -        IEntry        *dirty;                /* chain of dirty elements */
       -        u32int        ndirty;
       +        IHash        *hash;
       +        IEntry        *entries;
       +        int                nentries;
       +        IEntry        free;
       +        IEntry        clean;
       +        IEntry        dirty;
                u32int        maxdirty;
       -        u32int        unused;                        /* index of first unused element in base */
       -        u32int        stolen;                        /* last head from which an element was stolen */
       +        u32int        ndirty;
        
       -        Arena        *last[4];
       -        Arena        *lastload;
       -        int                nlast;
       +        ISum        **sum;
       +        int                nsum;
       +        IHash        *shash;
       +        IEntry        *sentries;
       +        int                nsentries;
        };
        
       -int icacheprefetch = 1;
       -
        static ICache icache;
        
       -static IEntry        *icachealloc(IAddr *ia, u8int *score);
       -
        /*
       - * bits is the number of bits in the icache hash table
       - * depth is the average depth
       - * memory usage is about (1<<bits) * depth * sizeof(IEntry) + (1<<bits) * sizeof(IEntry*)
       + * Hash table of IEntries
         */
       -void
       -initicache(int bits, int depth)
       +
       +struct IHash
        {
       -        icache.bits = bits;
       -        icache.size = 1 << bits;
       -        icache.entries = depth * icache.size;
       -        icache.maxdirty = icache.entries/2;
       -        icache.base = MKNZ(IEntry, icache.entries);
       -        icache.heads = MKNZ(IEntry*, icache.size);
       -        icache.full.l = &icache.lock;
       -        setstat(StatIcacheSize, icache.entries);
       -}
       +        int bits;
       +        u32int size;
       +        IEntry **table;
       +};
        
       -ulong
       -icachedirtyfrac(void)
       +static IHash*
       +mkihash(int size1)
        {
       -        return (vlong)icache.ndirty*IcacheFrac / icache.entries;
       +        u32int size;
       +        int bits;
       +        IHash *ih;
       +        
       +        bits = 0;
       +        size = 1;
       +        while(size < size1){
       +                bits++;
       +                size <<= 1;
       +        }
       +        
       +        ih = vtmallocz(sizeof(IHash)+size*sizeof(ih->table[0]));
       +        ih->table = (IEntry**)(ih+1);
       +        ih->bits = bits;
       +        ih->size = size;
       +        return ih;
        }
        
       -u32int
       -hashbits(u8int *sc, int bits)
       +static IEntry*
       +ihashlookup(IHash *ih, u8int score[VtScoreSize], int type)
        {
       -        u32int v;
       -
       -        v = (sc[0] << 24) | (sc[1] << 16) | (sc[2] << 8) | sc[3];
       -        if(bits < 32)
       -                 v >>= (32 - bits);
       -        return v;
       +        u32int h;
       +        IEntry *ie;
       +        
       +        h = hashbits(score, ih->bits);
       +        for(ie=ih->table[h]; ie; ie=ie->nexthash)
       +                if((type == -1 || type == ie->ia.type) && scorecmp(score, ie->score) == 0)
       +                        return ie;
       +        return nil;
        }
        
        static void
       -loadarenaclumps(Arena *arena, u64int aa)
       +ihashdelete(IHash *ih, IEntry *ie, char *what)
        {
       -        ulong i;
       -        ClumpInfo ci;
       -        IAddr ia;
       -
       -        for(i=0; i<arena->memstats.clumps; i++){
       -                if(readclumpinfo(arena, i, &ci) < 0)
       -                        break;
       -                ia.type = ci.type;
       -                ia.size = ci.uncsize;
       -                ia.blocks = (ci.size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
       -                ia.addr = aa;
       -                aa += ClumpSize + ci.size;
       -                if(ia.type != VtCorruptType)
       -                        insertscore(ci.score, &ia, 0);
       -        }
       +        u32int h;
       +        IEntry **l;
       +        
       +        h = hashbits(ie->score, ih->bits);
       +        for(l=&ih->table[h]; *l; l=&(*l)->nexthash)
       +                if(*l == ie){
       +                        *l = ie->nexthash;
       +                        return;
       +                }
       +        fprint(2, "warning: %s %V not found in ihashdelete\n", what, ie->score);
        }
        
       -int
       -_lookupscore(u8int *score, int type, IAddr *ia, int *rac)
       +static void
       +ihashinsert(IHash *ih, IEntry *ie)
        {
                u32int h;
       -        IEntry *ie, *last;
       -
       -        qlock(&icache.lock);
       -        h = hashbits(score, icache.bits);
       -        last = nil;
       -        for(ie = icache.heads[h]; ie != nil; ie = ie->next){
       -                if((ie->ia.type == type || type == -1) && scorecmp(ie->score, score)==0){
       -                        if(last != nil)
       -                                last->next = ie->next;
       -                        else
       -                                icache.heads[h] = ie->next;
       -                        addstat(StatIcacheHit, 1);
       -                        if(rac)
       -                                ie->rac = 1;
       -                        trace(TraceLump, "lookupscore incache");
       -                        ie->next = icache.heads[h];
       -                        icache.heads[h] = ie;
       -
       -                        *ia = ie->ia;
       -                        if(rac)
       -                                *rac = ie->rac;
       -                        qunlock(&icache.lock);
       -                        return 0;
       -                }
       -                last = ie;
       -        }
       -        addstat(StatIcacheMiss, 1);
       -        qunlock(&icache.lock);
       -        return -1;
       +        
       +        h = hashbits(ie->score, ih->bits);
       +        ie->nexthash = ih->table[h];
       +        ih->table[h] = ie;
        }
        
        
        /*
       -ZZZ need to think about evicting the correct IEntry,
       -and writing back the wtime.
       - * look up data score in the index cache
       - * if this fails, pull it in from the disk index table, if it exists.
       - *
       - * must be called with the lump for this score locked
       + * IEntry lists.
         */
       -int
       -lookupscore(u8int *score, int type, IAddr *ia, int *rac)
       +
       +static IEntry*
       +popout(IEntry *ie)
        {
       -        IEntry d, *ie;
       -        u32int h;
       -        u64int aa;
       -        Arena *load;
       -        int i, ret;
       -        uint ms;
       +        if(ie->prev == nil && ie->next == nil)
       +                return ie;
       +        ie->prev->next = ie->next;
       +        ie->next->prev = ie->prev;
       +        ie->next = nil;
       +        ie->prev = nil;
       +        return ie;
       +}
        
       -        aa = 0;
       -        ms = msec();
       -        
       -        trace(TraceLump, "lookupscore %V.%d", score, type);
       +static IEntry*
       +poplast(IEntry *list)
       +{
       +        if(list->prev == list)
       +                return nil;
       +        return popout(list->prev);
       +}
        
       -        ret = 0;
       -        if(_lookupscore(score, type, ia, rac) < 0){
       -                if(loadientry(mainindex, score, type, &d) < 0){
       -                        ret = -1;
       -                        goto out;
       -                }
       +static IEntry*
       +pushfirst(IEntry *list, IEntry *ie)
       +{
       +        popout(ie);
       +        ie->prev = list;
       +        ie->next = list->next;
       +        ie->prev->next = ie;
       +        ie->next->prev = ie;
       +        return ie;
       +}
        
       -                /* failed in cache but found on disk - fill cache. */
       -                trace(TraceLump, "lookupscore loaded");
       -                addstat(StatIcacheFill, 1);
       +/*
       + * Arena summary cache.
       + */
       +struct ISum
       +{
       +        QLock        lock;
       +        IEntry        *entries;
       +        int        nentries;
       +        int        loaded;
       +        u64int addr;
       +        u64int limit;
       +        Arena *arena;
       +        int g;
       +};
        
       -                /*
       -                 * no one else can load an entry for this score,
       -                 * since we have this score's lump's lock.
       -                 */
       -                qlock(&icache.lock);
       -        
       -                /*
       -                 * If we notice that all the hits are coming from one arena,
       -                 * load the table of contents for that arena into the cache.
       -                 */
       -                load = nil;
       -                h = hashbits(score, icache.bits);
       -                ie = icachealloc(&d.ia, score);
       -                if(icacheprefetch){
       -                        icache.last[icache.nlast++%nelem(icache.last)] = amapitoa(mainindex, ie->ia.addr, &aa);
       -                        aa = ie->ia.addr - aa;        /* compute base addr of arena */
       -                        for(i=0; i<nelem(icache.last); i++)
       -                                if(icache.last[i] != icache.last[0])
       -                                        break;
       -                        if(i==nelem(icache.last) && icache.lastload != icache.last[0]){
       -                                load = icache.last[0];
       -                                icache.lastload = load;
       +static ISum*
       +scachelookup(u64int addr)
       +{
       +        int i;
       +        ISum *s;
       +
       +        for(i=0; i<icache.nsum; i++){
       +                s = icache.sum[i];
       +                if(s->addr <= addr && addr < s->limit){
       +                        if(i > 0){
       +                                memmove(icache.sum+1, icache.sum, i*sizeof icache.sum[0]);
       +                                icache.sum[0] = s;
                                }
       +                        return s;
                        }
       +        }
       +        return nil;
       +}
       +
       +static void
       +sumclear(ISum *s)
       +{
       +        int i;
       +
       +        for(i=0; i<s->nentries; i++)
       +                ihashdelete(icache.shash, &s->entries[i], "scache");
       +        s->nentries = 0;
       +        s->loaded = 0;
       +        s->addr = 0;
       +        s->limit = 0;
       +        s->arena = nil;
       +        s->g = 0;
       +}
       +
       +static ISum*
       +scacheevict(void)
       +{
       +        ISum *s;
       +        int i;
                
       -                ie->next = icache.heads[h];
       -                icache.heads[h] = ie;
       -        
       -                *ia = ie->ia;
       -                *rac = ie->rac;
       -        
       -                qunlock(&icache.lock);
       -                if(load){
       -                        trace(TraceProc, "preload 0x%llux", aa);
       -                        loadarenaclumps(load, aa);
       +        for(i=icache.nsum-1; i>=0; i--){
       +                s = icache.sum[i];
       +                if(canqlock(&s->lock)){
       +                        if(i > 0){
       +                                memmove(icache.sum+1, icache.sum, i*sizeof icache.sum[0]);
       +                                icache.sum[0] = s;
       +                        }
       +                        sumclear(s);
       +                        return s;
                        }
                }
       +        return nil;
       +}
        
       -out:
       -        ms = msec() - ms;
       -        addstat2(StatIcacheRead, 1, StatIcacheReadTime, ms);
       +static void
       +scachehit(u64int addr)
       +{
       +        scachelookup(addr);        /* for move-to-front */
       +}
        
       -        return ret;
       +static void
       +scachesetup(ISum *s, u64int addr)
       +{
       +        u64int addr0, limit;
       +        int g;
       +
       +        s->arena = amapitoag(mainindex, addr, &addr0, &limit, &g);
       +        s->addr = addr0;
       +        s->limit = limit;
       +        s->g = g;
       +}
       +
       +static void
       +scacheload(ISum *s)
       +{
       +        int i, n;
       +
       +        s->loaded = 1;
       +        n = asumload(s->arena, s->g, s->entries, ArenaCIGSize);
       +        /*
       +         * n can be less then ArenaCIGSize, either if the clump group
       +         * is the last in the arena and is only partially filled, or if there
       +         * are corrupt clumps in the group -- those are not returned.
       +         */
       +        for(i=0; i<n; i++){
       +                s->entries[i].ia.addr += s->addr;
       +                ihashinsert(icache.shash, &s->entries[i]);
       +        }
       +//fprint(2, "%T scacheload %s %d - %d entries\n", s->arena->name, s->g, n);
       +        addstat(StatScachePrefetch, n);
       +        s->nentries = n;
       +}
       +
       +static ISum*
       +scachemiss(u64int addr)
       +{
       +        ISum *s;
       +
       +        s = scachelookup(addr);
       +        if(s == nil){
       +                /* first time: make an entry in the cache but don't populate it yet */
       +                s = scacheevict();
       +                if(s == nil)
       +                        return nil;
       +                scachesetup(s, addr);
       +                qunlock(&s->lock);
       +                return nil;
       +        }
       +
       +        /* second time: load from disk */
       +        qlock(&s->lock);
       +        if(s->loaded || !icacheprefetch){
       +                qunlock(&s->lock);
       +                return nil;
       +        }
       +        
       +        return s;        /* locked */
        }
        
        /*
       - * insert a new element in the hash table.
       + * Index cache.
         */
       -int
       -insertscore(u8int *score, IAddr *ia, int write)
       +
       +void
       +initicache(u32int mem0)
        {
       -        IEntry *ie, se;
       -        u32int h;
       +        u32int mem;
       +        int i, entries, scache;
       +        
       +        icache.full.l = &icache.lock;
        
       -        trace(TraceLump, "insertscore enter");
       -        if(write)
       -                addstat(StatIcacheWrite, 1);
       -        else
       -                addstat(StatIcachePrefetch, 1);
       +        mem = mem0;
       +        entries = mem / (sizeof(IEntry)+sizeof(IEntry*));
       +        scache = (entries/8) / ArenaCIGSize;
       +        entries -= entries/8;
       +        if(scache < 4)
       +                scache = 4;
       +        if(scache > 16)
       +                scache = 16;
       +        if(entries < 1000)
       +                entries = 1000;
       +fprint(2, "icache %,d bytes = %,d entries; %d scache\n", mem0, entries, scache);
       +
       +        icache.clean.prev = icache.clean.next = &icache.clean;
       +        icache.dirty.prev = icache.dirty.next = &icache.dirty;
       +        icache.free.prev = icache.free.next = &icache.free;
       +        
       +        icache.hash = mkihash(entries);
       +        icache.nentries = entries;
       +        setstat(StatIcacheSize, entries);
       +        icache.entries = vtmallocz(entries*sizeof icache.entries[0]);
       +        icache.maxdirty = entries / 2;
       +        for(i=0; i<entries; i++)
       +                pushfirst(&icache.free, &icache.entries[i]);
       +
       +        icache.nsum = scache;
       +        icache.sum = vtmallocz(scache*sizeof icache.sum[0]);
       +        icache.sum[0] = vtmallocz(scache*sizeof icache.sum[0][0]);
       +        icache.nsentries = scache * ArenaCIGSize;
       +        icache.sentries = vtmallocz(scache*ArenaCIGSize*sizeof icache.sentries[0]);
       +        icache.shash = mkihash(scache*ArenaCIGSize);
       +        for(i=0; i<scache; i++){
       +                icache.sum[i] = icache.sum[0] + i;
       +                icache.sum[i]->entries = icache.sentries + i*ArenaCIGSize;
       +        }
       +}
        
       -        qlock(&icache.lock);
       -        h = hashbits(score, icache.bits);
        
       -        ie = icachealloc(ia, score);
       -        if(write){
       +static IEntry*
       +evictlru(void)
       +{
       +        IEntry *ie;
       +        
       +        ie = poplast(&icache.clean);
       +        if(ie == nil)
       +                return nil;
       +        ihashdelete(icache.hash, ie, "evictlru");
       +        return ie;
       +}
       +
       +static void
       +icacheinsert(u8int score[VtScoreSize], IAddr *ia, int state)
       +{
       +        IEntry *ie;
       +
       +        if((ie = poplast(&icache.free)) == nil && (ie = evictlru()) == nil){
       +                addstat(StatIcacheStall, 1);
       +                while((ie = poplast(&icache.free)) == nil && (ie = evictlru()) == nil){
       +                        // Could safely return here if state == IEClean.
       +                        // But if state == IEDirty, have to wait to make
       +                        // sure we don't lose an index write.  
       +                        // Let's wait all the time.
       +                        flushdcache();
       +                        kickicache();
       +                        rsleep(&icache.full);
       +                }
       +                addstat(StatIcacheStall, -1);
       +        }
       +
       +        memmove(ie->score, score, VtScoreSize);
       +        ie->state = state;
       +        ie->ia = *ia;
       +        if(state == IEClean){
       +                addstat(StatIcachePrefetch, 1);
       +                pushfirst(&icache.clean, ie);
       +        }else{
       +                addstat(StatIcacheWrite, 1);
       +                assert(state == IEDirty);
                        icache.ndirty++;
                        setstat(StatIcacheDirty, icache.ndirty);
                        delaykickicache();
       -                ie->dirty = 1;
       +                pushfirst(&icache.dirty, ie);
                }
       -        ie->next = icache.heads[h];
       -        icache.heads[h] = ie;
       +        ihashinsert(icache.hash, ie);
       +}
       +
       +int
       +icachelookup(u8int score[VtScoreSize], int type, IAddr *ia)
       +{
       +        IEntry *ie;
        
       -        se = *ie;
       +        qlock(&icache.lock);
       +        addstat(StatIcacheLookup, 1);
       +        if((ie = ihashlookup(icache.hash, score, type)) != nil){
       +                *ia = ie->ia;
       +                if(ie->state == IEClean)
       +                        pushfirst(&icache.clean, ie);
       +                addstat(StatIcacheHit, 1);
       +                qunlock(&icache.lock);
       +                return 0;
       +        }
       +
       +        if((ie = ihashlookup(icache.shash, score, type)) != nil){
       +                *ia = ie->ia;
       +                icacheinsert(score, &ie->ia, IEClean);
       +                scachehit(ie->ia.addr);
       +                addstat(StatScacheHit, 1);
       +                qunlock(&icache.lock);
       +                return 0;
       +        }
       +        addstat(StatIcacheMiss, 1);
                qunlock(&icache.lock);
        
       -        if(write && icache.ndirty >= icache.maxdirty)
       +        return -1;
       +}
       +
       +int
       +insertscore(u8int score[VtScoreSize], IAddr *ia, int state)
       +{
       +        ISum *toload;
       +
       +        qlock(&icache.lock);
       +        icacheinsert(score, ia, state);
       +        if(state == IEClean)
       +                toload = scachemiss(ia->addr);
       +        else{
       +                assert(state == IEDirty);
       +                toload = nil;
       +        }
       +        qunlock(&icache.lock);
       +        if(toload){
       +                scacheload(toload);
       +                qunlock(&toload->lock);
       +        }
       +        
       +        if(icache.ndirty >= icache.maxdirty)
                        kickicache();
        
                /*
       t@@ -240,125 +425,81 @@ insertscore(u8int *score, IAddr *ia, int write)
                 * the lump, meaning any searches for this block
                 * will hit in the lump cache until after we return.
                 */
       -        markbloomfilter(mainindex->bloom, score);
       +        if(state == IEDirty)
       +                markbloomfilter(mainindex->bloom, score);
        
                return 0;
        }
        
       -/*
       - * allocate a index cache entry which hasn't been used in a while.
       - * must be called with icache.lock locked
       - * if the score is already in the table, update the entry.
       - */
       -static IEntry *
       -icachealloc(IAddr *ia, u8int *score)
       +static int
       +lookupscore_untimed(u8int score[VtScoreSize], int type, IAddr *ia)
        {
       -        int i;
       -        IEntry *ie, *last, *clean, *lastclean;
       -        u32int h;
       +        IEntry d;
        
       -        h = hashbits(score, icache.bits);
       -        last = nil;
       -        for(ie = icache.heads[h]; ie != nil; ie = ie->next){
       -                if(ie->ia.type == ia->type && scorecmp(ie->score, score)==0){
       -                        if(last != nil)
       -                                last->next = ie->next;
       -                        else
       -                                icache.heads[h] = ie->next;
       -                        trace(TraceLump, "icachealloc hit");
       -                        ie->rac = 1;
       -                        return ie;
       -                }
       -                last = ie;
       -        }
       +        if(icachelookup(score, type, ia) >= 0)
       +                return 0;
        
       -        h = icache.unused;
       -        if(h < icache.entries){
       -                ie = &icache.base[h++];
       -                icache.unused = h;
       -                trace(TraceLump, "icachealloc unused");
       -                goto Found;
       -        }
       +        addstat(StatIcacheFill, 1);
       +        if(loadientry(mainindex, score, type, &d) < 0)
       +                return -1;
                
       -        if((ie = icache.free) != nil){
       -                icache.free = ie->next;
       -                goto Found;
       -        }
       +        insertscore(score, &d.ia, IEClean);
       +        *ia = d.ia;
       +        return 0;
       +}
        
       -        h = icache.stolen;
       -        for(i=0;; i++){
       -                h++;
       -                if(h >= icache.size)
       -                        h = 0;
       -                if(i == icache.size){
       -                        trace(TraceLump, "icachealloc sleep");
       -                        addstat(StatIcacheStall, 1);
       -                        while(icache.ndirty == icache.entries){
       -                                /*
       -                                 * This is a bit suspect.  Kickicache will wake up the
       -                                 * icachewritecoord, but if all the index entries are for
       -                                 * unflushed disk blocks, icachewritecoord won't be
       -                                 * able to do much.  It always rewakes everyone when
       -                                 * it thinks it is done, though, so at least we'll go around
       -                                 * the while loop again.  Also, if icachewritecoord sees
       -                                 * that the disk state hasn't change at all since the last
       -                                 * time around, it kicks the disk.  This needs to be
       -                                 * rethought, but it shouldn't deadlock anymore.
       -                                 */
       -                                kickicache();
       -                                rsleep(&icache.full);
       -                        }
       -                        addstat(StatIcacheStall, -1);
       -                        i = 0;
       -                }
       -                lastclean = nil;
       -                clean = nil;
       -                last = nil;
       -                for(ie=icache.heads[h]; ie; last=ie, ie=ie->next){
       -                        if(!ie->dirty){
       -                                clean = ie;
       -                                lastclean = last;
       -                        }
       -                }
       -                if(clean){
       -                        if(lastclean)
       -                                lastclean->next = clean->next;
       -                        else
       -                                icache.heads[h] = clean->next;
       -                        clean->next = nil;
       -                        icache.stolen = h;
       -                        ie = clean;
       -                        trace(TraceLump, "icachealloc steal");
       -                        goto Found;
       -                }
       -        }
       +int
       +lookupscore(u8int score[VtScoreSize], int type, IAddr *ia)
       +{
       +        int ms, ret;
       +        
       +        ms = msec();
       +        ret = lookupscore_untimed(score, type, ia);
       +        ms = msec() - ms;
       +        addstat2(StatIcacheRead, 1, StatIcacheReadTime, ms);
       +        return ret;
       +}
       +        
       +u32int
       +hashbits(u8int *sc, int bits)
       +{
       +        u32int v;
        
       -Found:
       -        ie->ia = *ia;
       -        scorecp(ie->score, score);
       -        ie->rac = 0;        
       -        return ie;
       +        v = (sc[0] << 24) | (sc[1] << 16) | (sc[2] << 8) | sc[3];
       +        if(bits < 32)
       +                 v >>= (32 - bits);
       +        return v;
        }
        
       +ulong
       +icachedirtyfrac(void)
       +{
       +        return (vlong)icache.ndirty*IcacheFrac / icache.nentries;
       +}
       +
       +/*
       + * Return a singly-linked list of dirty index entries.
       + * with 32-bit hash numbers between lo and hi
       + * and address < limit.
       + */
        IEntry*
        icachedirty(u32int lo, u32int hi, u64int limit)
        {
       -        int i;
                u32int h;
                IEntry *ie, *dirty;
        
                dirty = nil;
                trace(TraceProc, "icachedirty enter");
                qlock(&icache.lock);
       -        for(i=0; i<icache.size; i++)
       -        for(ie = icache.heads[i]; ie; ie=ie->next)
       -                if(ie->dirty && ie->ia.addr != 0 && ie->ia.addr < limit){
       +        for(ie = icache.dirty.next; ie != &icache.dirty; ie=ie->next){
       +                if(ie->state == IEDirty && ie->ia.addr < limit){
                                h = hashbits(ie->score, 32);
                                if(lo <= h && h <= hi){
                                        ie->nextdirty = dirty;
                                        dirty = ie;
                                }
                        }
       +        }
                qunlock(&icache.lock);
                trace(TraceProc, "icachedirty exit");
                if(dirty == nil)
       t@@ -366,36 +507,49 @@ icachedirty(u32int lo, u32int hi, u64int limit)
                return dirty;
        }
        
       +
       +/*
       + * The singly-linked non-circular list of index entries ie
       + * has been written to disk.  Move them to the clean list.
       + */
        void
        icacheclean(IEntry *ie)
        {
       -        trace(TraceProc, "icachedirty enter");
       +        IEntry *next;
       +        
       +        trace(TraceProc, "icacheclean enter");
                qlock(&icache.lock);
       -        for(; ie; ie=ie->nextdirty){
       +        for(; ie; ie=next){
       +                assert(ie->state == IEDirty);
       +                next = ie->nextdirty;
       +                ie->nextdirty = nil;
       +                popout(ie); /* from icache.dirty */
                        icache.ndirty--;
       -                ie->dirty = 0;
       +                ie->state = IEClean;
       +                pushfirst(&icache.clean, ie);
                }
                setstat(StatIcacheDirty, icache.ndirty);
                rwakeupall(&icache.full);
                qunlock(&icache.lock);
       -        trace(TraceProc, "icachedirty exit");
       +        trace(TraceProc, "icacheclean exit");
        }
        
        void
        emptyicache(void)
        {
                int i;
       -        IEntry *ie, **lie;
       +        IEntry *ie;
       +        ISum *s;
                
                qlock(&icache.lock);
       -        for(i=0; i<icache.size; i++)
       -        for(lie=&icache.heads[i]; (ie=*lie); ){
       -                if(ie->dirty == 0){
       -                        *lie = ie->next;
       -                        ie->next = icache.free;
       -                        icache.free = ie;
       -                }else
       -                        lie = &ie->next;
       +        while((ie = evictlru()) != nil)
       +                pushfirst(&icache.free, ie);
       +        for(i=0; i<icache.nsum; i++){
       +                s = icache.sum[i];
       +                qlock(&s->lock);
       +                sumclear(s);
       +                qunlock(&s->lock);
                }
                qunlock(&icache.lock);
        }
       +
 (DIR) diff --git a/src/cmd/venti/srv/icachewrite.c b/src/cmd/venti/srv/icachewrite.c
       t@@ -45,6 +45,16 @@ initicachewrite(void)
                vtproc(delaykickroundproc, &iwrite.round);
        }
        
       +static u64int
       +ie2diskaddr(Index *ix, ISect *is, IEntry *ie)
       +{
       +        u64int bucket, addr;
       +
       +        bucket = hashbits(ie->score, 32)/ix->div;
       +        addr = is->blockbase + ((bucket - is->start) << is->blocklog);
       +        return addr;
       +}
       +
        static IEntry*
        nextchunk(Index *ix, ISect *is, IEntry **pie, u64int *paddr, uint *pnbuf)
        {
       t@@ -55,13 +65,13 @@ nextchunk(Index *ix, ISect *is, IEntry **pie, u64int *paddr, uint *pnbuf)
        
                bsize = 1<<is->blocklog;
                iefirst = *pie;
       -        addr = is->blockbase + ((u64int)(hashbits(iefirst->score, 32) / ix->div - is->start) << is->blocklog);
       +        addr = ie2diskaddr(ix, is, iefirst);
                nbuf = 0;
       -        for(l=&iefirst->nextdirty; (ie=*l)!=nil; l=&(*l)->nextdirty){
       -                naddr = is->blockbase + ((u64int)(hashbits(ie->score, 32) / ix->div - is->start) << is->blocklog);
       +        for(l = &iefirst->nextdirty; (ie = *l) != nil; l = &(*l)->nextdirty){
       +                naddr = ie2diskaddr(ix, is, ie);
                        if(naddr - addr >= Bufsize)
                                break;
       -                nbuf = naddr-addr;
       +                nbuf = naddr - addr;
                }
                nbuf += bsize;
        
       t@@ -75,7 +85,7 @@ nextchunk(Index *ix, ISect *is, IEntry **pie, u64int *paddr, uint *pnbuf)
        static int
        icachewritesect(Index *ix, ISect *is, u8int *buf)
        {
       -        int err, h, bsize, t;
       +        int err, i, werr, h, bsize, t;
                u32int lo, hi;
                u64int addr, naddr;
                uint nbuf, off;
       t@@ -89,29 +99,32 @@ icachewritesect(Index *ix, ISect *is, u8int *buf)
                else
                        hi = is->stop * ix->div - 1;
        
       -        trace(TraceProc, "icachewritesect enter %ud %ud %llud", lo, hi, iwrite.as.aa);
       +        trace(TraceProc, "icachewritesect enter %ud %ud %llud",
       +                lo, hi, iwrite.as.aa);
        
                iedirty = icachedirty(lo, hi, iwrite.as.aa);
                iedirty = iesort(iedirty);
       -        bsize = 1<<is->blocklog;
       +        bsize = 1 << is->blocklog;
                err = 0;
        
                while(iedirty){
                        disksched();
       -                while((t=icachesleeptime) == SleepForever){
       +                while((t = icachesleeptime) == SleepForever){
                                sleep(1000);
                                disksched();
                        }
                        if(t < minicachesleeptime)
                                t = minicachesleeptime;
       -                sleep(t);
       +                if(t > 0)
       +                        sleep(t);
                        trace(TraceProc, "icachewritesect nextchunk");
                        chunk = nextchunk(ix, is, &iedirty, &addr, &nbuf);
        
       -                trace(TraceProc, "icachewritesect readpart 0x%llux+0x%ux", addr, nbuf);
       +                trace(TraceProc, "icachewritesect readpart 0x%llux+0x%ux",
       +                        addr, nbuf);
                        if(readpart(is->part, addr, buf, nbuf) < 0){
       -                        /* XXX more details here */
       -                        fprint(2, "icachewriteproc readpart: %r\n");
       +                        fprint(2, "%s: part %s addr 0x%llux: icachewritesect "
       +                                "readpart: %r\n", argv0, is->part->name, addr);
                                err  = -1;
                                continue;
                        }
       t@@ -120,31 +133,34 @@ icachewritesect(Index *ix, ISect *is, u8int *buf)
                        addstat(StatIsectRead, 1);
        
                        for(l=&chunk; (ie=*l)!=nil; l=&ie->nextdirty){
       -                again:
       -                        naddr = is->blockbase + ((u64int)(hashbits(ie->score, 32) / ix->div - is->start) << is->blocklog);
       +again:
       +                        naddr = ie2diskaddr(ix, is, ie);
                                off = naddr - addr;
                                if(off+bsize > nbuf){
       -                                fprint(2, "whoops! addr=0x%llux nbuf=%ud addr+nbuf=0x%llux naddr=0x%llux\n",
       -                                        addr, nbuf, addr+nbuf, naddr);
       +                                fprint(2, "%s: whoops! addr=0x%llux nbuf=%ud "
       +                                        "addr+nbuf=0x%llux naddr=0x%llux\n",
       +                                        argv0, addr, nbuf, addr+nbuf, naddr);
                                        assert(off+bsize <= nbuf);
                                }
                                unpackibucket(&ib, buf+off, is->bucketmagic);
                                if(okibucket(&ib, is) < 0){
       -                                fprint(2, "bad bucket XXX\n");
       +                                fprint(2, "%s: bad bucket XXX\n", argv0);
                                        goto skipit;
                                }
       -                        trace(TraceProc, "icachewritesect add %V at 0x%llux", ie->score, naddr);
       +                        trace(TraceProc, "icachewritesect add %V at 0x%llux",
       +                                ie->score, naddr);
                                h = bucklook(ie->score, ie->ia.type, ib.data, ib.n);
                                if(h & 1){
                                        h ^= 1;
                                        packientry(ie, &ib.data[h]);
                                }else if(ib.n < is->buckmax){
       -                                memmove(&ib.data[h+IEntrySize], &ib.data[h], ib.n*IEntrySize - h);
       +                                memmove(&ib.data[h + IEntrySize], &ib.data[h],
       +                                        ib.n*IEntrySize - h);
                                        ib.n++;
                                        packientry(ie, &ib.data[h]);
                                }else{
       -                                fprint(2, "bucket overflow XXX\n");
       -                        skipit:
       +                                fprint(2, "%s: bucket overflow XXX\n", argv0);
       +skipit:
                                        err = -1;
                                        *l = ie->nextdirty;
                                        ie = *l;
       t@@ -154,33 +170,29 @@ icachewritesect(Index *ix, ISect *is, u8int *buf)
                                                break;
                                }
                                packibucket(&ib, buf+off, is->bucketmagic);
       -                        /* XXX
       -                         * This is not quite right - it's good that we 
       -                         * update the cached block (if any) here, but
       -                         * since the block doesn't get written until writepart
       -                         * below, we also need to make sure that the cache 
       -                         * doesn't load the stale block before we write it to
       -                         * disk below.  We could lock the disk cache during
       -                         * the writepart, but that's pretty annoying.
       -                         * Another possibility would be never to cache
       -                         * index partition blocks.  The hit rate on those is
       -                         * miniscule anyway.
       -                         */
       -                        if((b = _getdblock(is->part, naddr, ORDWR, 0)) != nil){
       -                                memmove(b->data, buf+off, bsize);
       -                                putdblock(b);
       -                        }
                        }
        
                        diskaccess(1);
        
                        trace(TraceProc, "icachewritesect writepart", addr, nbuf);
       -                if(writepart(is->part, addr, buf, nbuf) < 0 || flushpart(is->part) < 0){
       -                        /* XXX more details here */
       -                        fprint(2, "icachewriteproc writepart: %r\n");
       +                werr = 0;
       +                if(writepart(is->part, addr, buf, nbuf) < 0 || flushpart(is->part) < 0)
       +                        werr = -1;
       +
       +                for(i=0; i<nbuf; i+=bsize){
       +                        if((b = _getdblock(is->part, addr+i, ORDWR, 0)) != nil){
       +                                memmove(b->data, buf+i, bsize);
       +                                putdblock(b);
       +                        }
       +                }
       +
       +                if(werr < 0){
       +                        fprint(2, "%s: part %s addr 0x%llux: icachewritesect "
       +                                "writepart: %r\n", argv0, is->part->name, addr);
                                err = -1;
                                continue;
                        }
       +                
                        addstat(StatIsectWriteBytes, nbuf);
                        addstat(StatIsectWrite, 1);
                        icacheclean(chunk);
 (DIR) diff --git a/src/cmd/venti/srv/ifile.c b/src/cmd/venti/srv/ifile.c
       t@@ -2,46 +2,57 @@
        #include "dat.h"
        #include "fns.h"
        
       +static char vcmagic[] = "venti config\n";
       +
       +enum {
       +        Maxconfig = 8 * 1024,
       +        Maglen = sizeof vcmagic - 1,
       +};
       +
        int
        readifile(IFile *f, char *name)
        {
       -        int m;
                Part *p;
                ZBlock *b;
                u8int *z;
       -        
       +
                p = initpart(name, OREAD);
                if(p == nil)
                        return -1;
       -        b = alloczblock(8192, 1, 0);
       +        b = alloczblock(Maxconfig+1, 1, 0);
                if(b == nil){
                        seterr(EOk, "can't alloc for %s: %R", name);
                        return -1;
                }
                if(p->size > PartBlank){
                        /*
       -                 * this is likely a real venti partition, in which case
       -                 * we're looking for the config file stored as 8k at end of PartBlank.
       +                 * this is likely a real venti partition, in which case we're
       +                 * looking for the config file stored as 8k at end of PartBlank.
                         */
       -                if(readpart(p, PartBlank-8192, b->data, 8192) < 0){
       +                if(readpart(p, PartBlank-Maxconfig, b->data, Maxconfig) < 0){
                                seterr(EOk, "can't read %s: %r", name);
                                freezblock(b);
                                freepart(p);
                                return -1;
                        }
       -                m = 5+1+6+1;
       -                if(memcmp(b->data, "venti config\n", m) != 0){
       +                b->data[Maxconfig] = '\0';
       +                if(memcmp(b->data, vcmagic, Maglen) != 0){
                                seterr(EOk, "bad venti config magic in %s", name);
                                freezblock(b);
                                freepart(p);
                                return -1;
                        }
       -                b->data += m;
       -                b->len -= m;
       -                z = memchr(b->data, 0, b->len);
       +                /*
       +                 * if we change b->data+b->_size, freezblock
       +                 * will blow an assertion, so don't.
       +                 */
       +                b->data  += Maglen;
       +                b->_size -= Maglen;
       +                b->len   -= Maglen;
       +                z = memchr(b->data, '\0', b->len);
                        if(z)
                                b->len = z - b->data;
       -        }else if(p->size > 8192){
       +        }else if(p->size > Maxconfig){
                        seterr(EOk, "config file is too large");
                        freepart(p);
                        freezblock(b);
 (DIR) diff --git a/src/cmd/venti/srv/index.c b/src/cmd/venti/srv/index.c
       t@@ -596,6 +596,25 @@ print("want arena %d for %llux\n", l, a);
                return ix->arenas[l];
        }
        
       +/*
       + * convert an arena index to the bounds of the containing arena group.
       + */
       +Arena*
       +amapitoag(Index *ix, u64int a, u64int *gstart, u64int *glimit, int *g)
       +{
       +        u64int aa;
       +        Arena *arena;
       +        
       +        arena = amapitoa(ix, a, &aa);
       +        if(arena == nil)
       +                return nil;
       +        if(arenatog(arena, aa, gstart, glimit, g) < 0)
       +                return nil;
       +        *gstart += a - aa;
       +        *glimit += a - aa;
       +        return arena;
       +}
       +
        int
        iaddrcmp(IAddr *ia1, IAddr *ia2)
        {
 (DIR) diff --git a/src/cmd/venti/srv/lump.c b/src/cmd/venti/srv/lump.c
       t@@ -7,7 +7,7 @@ int                        queuewrites = 0;
        int                        writestodevnull = 0;
        int                        verifywrites = 0;
        
       -static Packet                *readilump(Lump *u, IAddr *ia, u8int *score, int rac);
       +static Packet                *readilump(Lump *u, IAddr *ia, u8int *score);
        
        /*
         * Some of this logic is duplicated in hdisk.c
       t@@ -19,7 +19,6 @@ readlump(u8int *score, int type, u32int size, int *cached)
                Packet *p;
                IAddr ia;
                u32int n;
       -        int rac;
        
                trace(TraceLump, "readlump enter");
        /*
       t@@ -49,7 +48,7 @@ readlump(u8int *score, int type, u32int size, int *cached)
                if(cached)
                        *cached = 0;
        
       -        if(lookupscore(score, type, &ia, &rac) < 0){
       +        if(lookupscore(score, type, &ia) < 0){
                        /* ZZZ place to check for someone trying to guess scores */
                        seterr(EOk, "no block with score %V/%d exists", score, type);
        
       t@@ -64,7 +63,7 @@ readlump(u8int *score, int type, u32int size, int *cached)
                }
        
                trace(TraceLump, "readlump readilump");
       -        p = readilump(u, &ia, score, rac);
       +        p = readilump(u, &ia, score);
                putlump(u);
        
                trace(TraceLump, "readlump exit");
       t@@ -134,9 +133,8 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms)
                Packet *old;
                IAddr ia;
                int ok;
       -        int rac;
        
       -        if(lookupscore(u->score, u->type, &ia, &rac) == 0){
       +        if(lookupscore(u->score, u->type, &ia) == 0){
                        if(verifywrites == 0){
                                /* assume the data is here! */
                                packetfree(p);
       t@@ -149,7 +147,7 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms)
                         * if the read fails,
                         * assume it was corrupted data and store the block again
                         */
       -                old = readilump(u, &ia, u->score, rac);
       +                old = readilump(u, &ia, u->score);
                        if(old != nil){
                                ok = 0;
                                if(packetcmp(p, old) != 0){
       t@@ -176,7 +174,7 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms)
                ok = storeclump(mainindex, flat, u->score, u->type, creator, &ia);
                freezblock(flat);
                if(ok == 0)
       -                ok = insertscore(u->score, &ia, 1);
       +                ok = insertscore(u->score, &ia, IEDirty);
                if(ok == 0)
                        insertlump(u, p);
                else
       t@@ -193,39 +191,14 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms)
                return ok;
        }
        
       -static void
       -lreadahead(u64int a, Arena *arena, u64int aa, int n)
       -{        
       -        u8int buf[ClumpSize];
       -        Clump cl;
       -        IAddr ia;
       -
       -        while(n > 0) {
       -                if (aa >= arena->memstats.used)
       -                        break;
       -                if(readarena(arena, aa, buf, ClumpSize) < ClumpSize)
       -                        break;
       -                if(unpackclump(&cl, buf, arena->clumpmagic) < 0)
       -                        break;
       -                ia.addr = a;
       -                ia.type = cl.info.type;
       -                ia.size = cl.info.uncsize;
       -                ia.blocks = (cl.info.size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
       -                insertscore(cl.info.score, &ia, 0);
       -                a += ClumpSize + cl.info.size;
       -                aa += ClumpSize + cl.info.size;
       -                n--;
       -        }
       -}
       -
        static Packet*
       -readilump(Lump *u, IAddr *ia, u8int *score, int rac)
       +readilump(Lump *u, IAddr *ia, u8int *score)
        {
                Arena *arena;
                ZBlock *zb;
                Packet *p, *pp;
                Clump cl;
       -        u64int a, aa;
       +        u64int aa;
                u8int sc[VtScoreSize];
        
                trace(TraceLump, "readilump enter");
       t@@ -258,13 +231,6 @@ readilump(Lump *u, IAddr *ia, u8int *score, int rac)
                        return nil;
                }
        
       -        if(rac == 0) {
       -                trace(TraceLump, "readilump readahead");
       -                a = ia->addr + ClumpSize + cl.info.size;
       -                aa += ClumpSize + cl.info.size;
       -                lreadahead(a, arena, aa, 20);
       -        }
       -
                trace(TraceLump, "readilump success");
                p = zblock2packet(zb, cl.info.uncsize);
                freezblock(zb);
 (DIR) diff --git a/src/cmd/venti/srv/stats.c b/src/cmd/venti/srv/stats.c
       t@@ -60,6 +60,9 @@ Statdesc statdesc[NStat] =
                { "index cache flushes", },
                { "index cache stalls", },
                { "index cache read time", },
       +        { "index cache lookups" },
       +        { "index cache summary hits" },
       +        { "index cache summary prefetches" },
        
                { "bloom filter hits", },
                { "bloom filter misses", },
       t@@ -81,6 +84,9 @@ Statdesc statdesc[NStat] =
        
                { "sum reads", },
                { "sum read bytes", },
       +        
       +        { "cig loads" },
       +        { "cig load time" },
        };
        
        QLock statslock;
 (DIR) diff --git a/src/cmd/venti/srv/syncindex.c b/src/cmd/venti/srv/syncindex.c
       t@@ -56,13 +56,7 @@ threadmain(int argc, char *argv[])
                if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem);
                initdcache(bcmem);
                initlumpcache(1*1024*1024, 1024/8);
       -        icmem = u64log2(icmem / (sizeof(IEntry)+sizeof(IEntry*)) / ICacheDepth);
       -        if(icmem < 4)
       -                icmem = 4;
       -        if(1) fprint(2, "initialize %d bytes of index cache for %d index entries\n",
       -                (sizeof(IEntry)+sizeof(IEntry*)) * (1 << icmem) * ICacheDepth,
       -                (1 << icmem) * ICacheDepth);
       -        initicache(icmem, ICacheDepth);
       +        initicache(icmem);
                initicachewrite();
                if(mainindex->bloom)
                        startbloomproc(mainindex->bloom);
 (DIR) diff --git a/src/cmd/venti/srv/syncindex0.c b/src/cmd/venti/srv/syncindex0.c
       t@@ -101,7 +101,7 @@ syncarenaindex(Index *ix, Arena *arena, u32int clump, u64int a, int fix, int *pf
                                }
                                flush = 1;
                                trace(TraceProc, "syncarenaindex insert %V", ci->score);
       -                        insertscore(ci->score, &ia, 1);
       +                        insertscore(ci->score, &ia, IEDirty);
                        }
        
                        if(0 && clump / 1000 != (clump + n) / 1000)
 (DIR) diff --git a/src/cmd/venti/srv/venti.c b/src/cmd/venti/srv/venti.c
       t@@ -18,7 +18,7 @@ static void        ventiserver(void*);
        void
        usage(void)
        {
       -        fprint(2, "usage: venti [-Ldrs] [-a ventiaddr] [-c config] "
       +        fprint(2, "usage: venti [-Ldrsw] [-a ventiaddr] [-c config] "
        "[-h httpaddr] [-B blockcachesize] [-C cachesize] [-I icachesize] [-W webroot]\n");
                threadexitsall("usage");
        }
       t@@ -73,6 +73,9 @@ threadmain(int argc, char *argv[])
                case 's':
                        nofork = 1;
                        break;
       +        case 'w':                        /* compatibility with old venti */
       +                queuewrites = 1;
       +                break;
                case 'W':
                        webroot = EARGF(usage());
                        break;
       t@@ -103,9 +106,6 @@ threadmain(int argc, char *argv[])
                if(configfile == nil)
                        configfile = "venti.conf";
        
       -        if(initarenasum() < 0)
       -                fprint(2, "warning: can't initialize arena summing process: %r");
       -
                fprint(2, "conf...");
                if(initventi(configfile, &config) < 0)
                        sysfatal("can't init server: %r");
       t@@ -143,13 +143,7 @@ threadmain(int argc, char *argv[])
                        mem, mem / (8 * 1024));
                initlumpcache(mem, mem / (8 * 1024));
        
       -        icmem = u64log2(icmem / (sizeof(IEntry)+sizeof(IEntry*)) / ICacheDepth);
       -        if(icmem < 4)
       -                icmem = 4;
       -        if(0) fprint(2, "initialize %d bytes of index cache for %d index entries\n",
       -                (sizeof(IEntry)+sizeof(IEntry*)) * (1 << icmem) * ICacheDepth,
       -                (1 << icmem) * ICacheDepth);
       -        initicache(icmem, ICacheDepth);
       +        initicache(icmem);
                initicachewrite();
        
                /*
       t@@ -179,6 +173,9 @@ threadmain(int argc, char *argv[])
                        }
                }
        
       +        if(initarenasum() < 0)
       +                fprint(2, "warning: can't initialize arena summing process: %r");
       +
                fprint(2, "announce %s...", vaddr);
                ventisrv = vtlisten(vaddr);
                if(ventisrv == nil)
       t@@ -272,5 +269,3 @@ ventiserver(void *v)
                flushicache();
                threadexitsall(0);
        }
       -
       -
 (DIR) diff --git a/src/cmd/venti/srv/www/stats.js b/src/cmd/venti/srv/www/stats.js
       t@@ -38,6 +38,10 @@ graphname = new Array(
                        "icache dirty %",
                "arg=icachehit&graph=pctdiff&arg2=icachelookup&max=100",
                        "icache hit %",
       +        "arg=scachehit&graph=pctdiff&arg2=icachelookup&max=100",
       +                "scache hit %",
       +        "arg=icachemiss&graph=pctdiff&arg2=icachelookup&max=100",
       +                "icache miss %",
                "arg=icachelookuptime&graph=divdiff&arg2=icachelookup",
                        "icache lookup time",
                "arg=icacheprefetch&graph=diff",
       t@@ -75,6 +79,8 @@ graphname = new Array(
                        "fresh write RPC time",
                "arg=rpcwriteoldtime&graph=divdiff&arg2=rpcwriteold",
                        "dup write RPC time",
       +        "arg=cigloadtime&graph=divdiff&arg2=cigload",
       +                "cig load time",
        
                "arg=sumreadbyte&graph=diff",
                        "checksum bytes/second",
       t@@ -118,8 +124,11 @@ column1 = new Array(
                "!icache",
                "arg=icachedirty&graph=pct&arg2=icachesize&max=100",
                "arg=icachehit&graph=pctdiff&arg2=icachelookup&max=100",
       +        "arg=scachehit&graph=pctdiff&arg2=icachelookup&max=100",
       +        "arg=icachemiss&graph=pctdiff&arg2=icachelookup&max=100",
                "arg=icachewrite&graph=diff",
                "arg=icacheprefetch&graph=diff",
       +        "arg=scacheprefetch&graph=diff",
                
                "!dcache",
                "arg=dcachedirty&graph=pct&arg2=dcachesize&max=100",
       t@@ -154,6 +163,7 @@ column2 = new Array(
                "arg=rpcreaduncachedtime&graph=divdiff&arg2=rpcreaduncached",
                "arg=rpcwritenewtime&graph=divdiff&arg2=rpcwritenew",
                "arg=rpcwriteoldtime&graph=divdiff&arg2=rpcwriteold",
       +        "arg=cigloadtime&graph=divdiff&arg2=cigload",
                
                "END"
        )