From luoqi@chen.ml.org  Mon Jul 27 10:14:38 1998
Received: from chen.ml.org (luoqi.watermarkgroup.com [207.202.73.170])
          by hub.freebsd.org (8.8.8/8.8.8) with ESMTP id KAA23540
          for <FreeBSD-gnats-submit@freebsd.org>; Mon, 27 Jul 1998 10:14:34 -0700 (PDT)
          (envelope-from luoqi@chen.ml.org)
Received: (from luoqi@localhost)
	by chen.ml.org (8.8.8/8.8.8) id NAA07229;
	Mon, 27 Jul 1998 13:13:55 -0400 (EDT)
	(envelope-from luoqi)
Message-Id: <199807271713.NAA07229@chen.ml.org>
Date: Mon, 27 Jul 1998 13:13:55 -0400 (EDT)
From: Luoqi Chen <luoqi@chen.ml.org>
Reply-To: luoqi@chen.ml.org
To: FreeBSD-gnats-submit@freebsd.org
Subject: VMIO bug for FS with sub-page-size blocks (with fix)
X-Send-Pr-Version: 3.2

>Number:         7415
>Category:       kern
>Synopsis:       VMIO bug for FS with sub-page-size blocks
>Confidential:   no
>Severity:       serious
>Priority:       high
>Responsible:    freebsd-bugs
>State:          closed
>Quarter:        
>Keywords:       
>Date-Required:  
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Mon Jul 27 10:20:01 PDT 1998
>Closed-Date:    Tue Dec 22 08:35:46 PST 1998
>Last-Modified:  Tue Dec 22 08:36:41 PST 1998
>Originator:     Luoqi Chen
>Release:        FreeBSD 3.0-CURRENT i386
>Organization:
>Environment:

	3.0-current as of 7/27/98

>Description:

	There are some bugs in VMIO code when dealing with blocks not
	on a page-boundary. They may cause crash for FS with sub-page
	sized blocks (e.g. MSDOSFS), or VMIO enabled block devices
	(which uses 512-byte blocks).

>How-To-Repeat:

	First, create an file of size ~40K,
		dd if=/dev/zero of=/tmp/image bs=4096 count=10240
	config a vn device on top it,
		vnconfig -c /dev/vn0c /dev/zero
	create an msdos fs,
		newfs_msdsos /dev/rvn0c
	Now, try to mount it as an *ffs*,
		mount /dev/vn0c /dos
	the mount would fail, but it has the side effect of enabling
	VMIO on /dev/vn0c.

	Last, mount as an msdosfs,
		mount_msdos /dev/vn0c /dos
	and you will get a panic about "fault on nofault entry".

>Fix:
	
	The following patch should fix the problem. I hope this one
	won't silently rot away as some other patches I submitted,
	this is (IMO) a quite serious problem, and took me quite some
	time to figure it out.

Index: vfs_bio.c
===================================================================
RCS file: /fun/cvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.167
diff -u -r1.167 vfs_bio.c
--- vfs_bio.c	1998/07/13 07:05:55	1.167
+++ vfs_bio.c	1998/07/24 18:24:18
@@ -1325,6 +1325,7 @@
 		if (vm_page_is_valid(m,
 		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
 			return 0;
+		tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
 	}
 	return 1;
 }
@@ -1367,9 +1368,9 @@
 				break;
 			}
 		}
-		boffset = (i << PAGE_SHIFT);
+		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 		if (boffset < bp->b_dirtyoff) {
-			bp->b_dirtyoff = boffset;
+			bp->b_dirtyoff = max(boffset, 0);
 		}
 
 		/*
@@ -1381,11 +1382,14 @@
 			}
 		}
 		boffset = (i + 1);
+#if 0
 		offset = boffset + bp->b_pages[0]->pindex;
 		if (offset >= object->size)
 			boffset = object->size - bp->b_pages[0]->pindex;
-		if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
-			bp->b_dirtyend = (boffset << PAGE_SHIFT);
+#endif
+		boffset = (boffset << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
+		if (bp->b_dirtyend < boffset)
+			bp->b_dirtyend = min(boffset, bp->b_bufsize);
 	}
 }
 
@@ -1398,21 +1402,9 @@
 	struct buf *bp;
 	int i, s;
 	struct bufhashhdr *bh;
-	int maxsize;
 	int generation;
 	int checksize;
 
-	if (vp->v_mount) {
-		maxsize = vp->v_mount->mnt_stat.f_iosize;
-		/*
-		 * This happens on mount points.
-		 */
-		if (maxsize < size)
-			maxsize = size;
-	} else {
-		maxsize = size;
-	}
-
 #if !defined(MAX_PERF)
 	if (size > MAXBSIZE)
 		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
@@ -1503,7 +1495,22 @@
 		splx(s);
 		return (bp);
 	} else {
-		vm_object_t obj;
+		int bsize, maxsize, vmio;
+		off_t offset;
+
+		if (vp->v_type == VBLK)
+			bsize = DEV_BSIZE;
+		else if (vp->v_mountedhere)
+			bsize = vp->v_mountedhere->mnt_stat.f_iosize;
+		else if (vp->v_mount)
+			bsize = vp->v_mount->mnt_stat.f_iosize;
+		else
+			bsize = size;
+
+		offset = (off_t)blkno * bsize;
+		vmio = (vp->v_object != 0) && (vp->v_flag & VOBJBUF);
+		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
+		maxsize = imax(maxsize, bsize);
 
 		if ((bp = getnewbuf(vp, blkno,
 			slpflag, slptimeo, size, maxsize)) == 0) {
@@ -1531,18 +1538,14 @@
 		 * be found by incore.
 		 */
 		bp->b_blkno = bp->b_lblkno = blkno;
-
-		if (vp->v_type != VBLK)
-			bp->b_offset = (off_t) blkno * maxsize;
-		else
-			bp->b_offset = (off_t) blkno * DEV_BSIZE;
+		bp->b_offset = offset;
 
 		bgetvp(vp, bp);
 		LIST_REMOVE(bp, b_hash);
 		bh = BUFHASH(vp, blkno);
 		LIST_INSERT_HEAD(bh, bp, b_hash);
 
-		if ((obj = vp->v_object) && (vp->v_flag & VOBJBUF)) {
+		if (vmio) {
 			bp->b_flags |= (B_VMIO | B_CACHE);
 #if defined(VFS_BIO_DEBUG)
 			if (vp->v_type != VREG && vp->v_type != VBLK)
@@ -1695,7 +1698,8 @@
 		int desiredpages;
 
 		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
-		desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
+		desiredpages = size == 0 ? 0 :
+			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
 
 #if !defined(NO_B_MALLOC)
 		if (bp->b_flags & B_MALLOC)
@@ -1744,8 +1748,6 @@
 			if (bp->b_npages < desiredpages) {
 				obj = vp->v_object;
 				tinc = PAGE_SIZE;
-				if (tinc > bsize)
-					tinc = bsize;
 
 				off = bp->b_offset;
 #ifdef DIAGNOSTIC
@@ -1759,10 +1761,9 @@
 				bp->b_validend = orig_validend;
 				bp->b_flags |= B_CACHE;
 				for (toff = 0; toff < newbsize; toff += tinc) {
-					int bytesinpage;
-
-					pageindex = toff >> PAGE_SHIFT;
 					objoff = OFF_TO_IDX(off + toff);
+					pageindex = objoff - OFF_TO_IDX(off);
+					tinc = PAGE_SIZE - ((off + toff) & PAGE_MASK);
 					if (pageindex < curbpnpages) {
 
 						m = bp->b_pages[pageindex];
@@ -1770,11 +1771,10 @@
 						if (m->pindex != objoff)
 							panic("allocbuf: page changed offset??!!!?");
 #endif
-						bytesinpage = tinc;
 						if (tinc > (newbsize - toff))
-							bytesinpage = newbsize - toff;
+							tinc = newbsize - toff;
 						if (bp->b_flags & B_CACHE)
-							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
+							vfs_buf_set_valid(bp, off, toff, tinc, m);
 						continue;
 					}
 					m = vm_page_lookup(obj, objoff);
@@ -1782,7 +1782,7 @@
 						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
 						if (!m) {
 							VM_WAIT;
-							vm_pageout_deficit += (desiredpages - bp->b_npages);
+							vm_pageout_deficit += (desiredpages - curbpnpages);
 							goto doretry;
 						}
 
@@ -1805,11 +1805,10 @@
 								(cnt.v_free_min + cnt.v_cache_min))) {
 							pagedaemon_wakeup();
 						}
-						bytesinpage = tinc;
 						if (tinc > (newbsize - toff))
-							bytesinpage = newbsize - toff;
+							tinc = newbsize - toff;
 						if (bp->b_flags & B_CACHE)
-							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
+							vfs_buf_set_valid(bp, off, toff, tinc, m);
 						m->flags &= ~PG_ZERO;
 						vm_page_wire(m);
 					}
@@ -2154,7 +2153,7 @@
 		 * This only bothers with the first valid range in the
 		 * page.
 		 */
-		svalid = off;
+		svalid = ((foff + off) & ~PAGE_MASK) - foff;
 		while (validbits && !(validbits & 1)) {
 			svalid += DEV_BSIZE;
 			validbits >>= 1;
@@ -2164,6 +2163,7 @@
 			evalid += DEV_BSIZE;
 			validbits >>= 1;
 		}
+		evalid = min(evalid, off + size);
 		/*
 		 * Make sure this range is contiguous with the range
 		 * built up from previous pages.  If not, then we will
@@ -2192,15 +2192,14 @@
 	vm_ooffset_t soff, eoff;
 
 	soff = off;
-	eoff = off + min(PAGE_SIZE, bp->b_bufsize);
+	eoff = qmin(off + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
 	if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
 		vm_ooffset_t sv, ev;
 		vm_page_set_invalid(m,
 		    (vm_offset_t) (soff & PAGE_MASK),
 		    (vm_offset_t) (eoff - soff));
-		off = off - pageno * PAGE_SIZE;
-		sv = off + ((bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
-		ev = off + ((bp->b_validend + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
+		sv = (bp->b_offset + bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+		ev = (bp->b_offset + bp->b_validend) & ~(DEV_BSIZE - 1);
 		soff = qmax(sv, soff);
 		eoff = qmin(ev, eoff);
 	}
@@ -2285,18 +2284,21 @@
 			panic("vfs_clean_pages: no buffer offset");
 #endif
 
-		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
+		for (i = 0; i < bp->b_npages; i++) {
 			vm_page_t m = bp->b_pages[i];
 			vfs_page_set_valid(bp, foff, i, m);
+			foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
 		}
 	}
 }
 
 void
 vfs_bio_clrbuf(struct buf *bp) {
-	int i;
+	int i, size;
+	caddr_t sa, ea;
 	if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
-		if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
+		if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
+		    (bp->b_offset & PAGE_MASK) == 0) {
 			int mask;
 			mask = 0;
 			for(i=0;i<bp->b_bufsize;i+=DEV_BSIZE)
@@ -2309,19 +2311,23 @@
 			bp->b_resid = 0;
 			return;
 		}
-		for(i=0;i<bp->b_npages;i++) {
+		ea = sa = bp->b_data;
+		for(i=0;i<bp->b_npages;i++,sa=ea) {
+			ea = (caddr_t)trunc_page(sa + PAGE_SIZE);
+			ea = (caddr_t)ulmin((u_long)ea,
+				(u_long)bp->b_data + bp->b_bufsize);
 			if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
 				continue;
 			if( bp->b_pages[i]->valid == 0) {
 				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
-					bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
+					bzero(sa, ea - sa);
 				}
 			} else {
-				int j;
-				for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) {
+				int j = ((u_long)sa & PAGE_MASK) / DEV_BSIZE;
+				for (; sa < ea; sa += DEV_BSIZE, j++) {
 					if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
 						(bp->b_pages[i]->valid & (1<<j)) == 0)
-						bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
+						bzero(sa, DEV_BSIZE);
 				}
 			}
 			bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
>Release-Note:
>Audit-Trail:
State-Changed-From-To: open->feedback 
State-Changed-By: phk 
State-Changed-When: Tue Jul 28 02:15:14 PDT 1998 
State-Changed-Why:  
it sounds to me like the simple solution would be to either: 
A) have ufs clear the vmio flag agan 
B) have msdosfs (and other similar fs) clear the flag before they start. 

Could you explain what your patch does ? 

From: Poul-Henning Kamp <phk@critter.freebsd.dk>
To: freebsd-gnats-submit@freebsd.org
Cc:  Subject: Re: kern/7415
Date: Fri, 07 Aug 1998 08:26:27 +0200

 ------- Forwarded Message
 
 
 >it sounds to me like the simple solution would be to either:
 >A) have ufs clear the vmio flag agan
 >B) have msdosfs (and other similar fs) clear the flag before they start.
 
 Simple and wrong.  File systems know nothing of the vmio flag.  Either
 vm should clear it and be more careful about setting it (it only works
 now for ufs because ufs never requests a misaligned block), or it should
 always be set for block devices and fixes like the ones in the patch
 should be applied.
 
 >Could you explain what your patch does ?
 
 It attempts to make misaligned buffers (ones whose size and file offsets
 are multiples of PAGE_SIZE) actually work.  It isn't clear whether they
 were intended to work, but they will need to work if the buffer cache
 and vm are ever fully merged.
 
 Bruce
 
 
 ------- End of Forwarded Message
 

From: Poul-Henning Kamp <phk@critter.freebsd.dk>
To: freebsd-gnats-submit@freebsd.org
Cc:  Subject: Re: kern/7415
Date: Fri, 07 Aug 1998 08:27:24 +0200

 ------- Forwarded Message
 
 From: Luoqi Chen <luoqi@chen.ml.org>
 Message-Id: <199807281445.KAA19921@chen.ml.org>
 To: bde@zeta.org.au, freebsd-bugs@FreeBSD.ORG, luoqi@chen.ml.org,
         phk@FreeBSD.ORG
 Subject: Re: kern/7415
 In-Reply-To: <199807281056.UAA30494@godzilla.zeta.org.au>
 X-UIDL: 0bb3b238bf5e8a0a5e0814b07eec4131
 
 > >it sounds to me like the simple solution would be to either:
 > >A) have ufs clear the vmio flag agan
 > >B) have msdosfs (and other similar fs) clear the flag before they start.
 >
 > Simple and wrong.  File systems know nothing of the vmio flag.  Either
 > vm should clear it and be more careful about setting it (it only works
 > now for ufs because ufs never requests a misaligned block), or it should
 > always be set for block devices and fixes like the ones in the patch
 > should be applied.
 >
 > >Could you explain what your patch does ?
 >
 > It attempts to make misaligned buffers (ones whose size and file offsets
 > are multiples of PAGE_SIZE) actually work.  It isn't clear whether they
      ^
     not
 > were intended to work, but they will need to work if the buffer cache
 > and vm are ever fully merged.
 >
 > Bruce
 >
 Bruce explained it very well (much better than I could). I'd like make one
 comment, I believe the misaligned buffers are intended to work. The evidence
 is at around line 1830 of vfs_bio.c the statement,
 	((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
 and in numerous other places statements like
 	pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
 the original author were aware of these misaligned buffers and attempted to
 make them work.
 
 - -lq
 
 
 ------- End of Forwarded Message
 
State-Changed-From-To: feedback->closed 
State-Changed-By: luoqi 
State-Changed-When: Tue Dec 22 08:35:46 PST 1998 
State-Changed-Why:  
Fix committed. 
>Unformatted:
