/* net/atm/mmuio.c - MMU-supported high-speed I/O */

/* Written 1995-1997 by Werner Almesberger, EPFL LRC */


#include <linux/config.h>


#ifdef CONFIG_MMU_HACKS

#include <linux/mmuio.h>
#include <asm/atomic.h>


#define invalidate flush_tlb_all /* @@@ improve this */


#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/segment.h>
#include <asm/uaccess.h>
#include <asm/bitops.h>

#include <linux/skbuff.h>

#include <linux/netdevice.h> /* needed to include net/sock.h */
#include <net/sock.h>


/* #define MAX_SC_LOCKS 3 */


/*
 * Debug message switch. As written (condition 0) DPRINTK expands to nothing,
 * so its arguments are not evaluated. Change the condition to 1 to route the
 * messages to printk with the KERN_DEBUG prefix.
 */
#if 0
#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
#else
#define DPRINTK(format,args...)
#endif


#ifdef CONFIG_MMU_HACKS_DEBUG

/*
 * Very extensive activity logging. Greatly improves bug detection speed but
 * costs a few Mbps if enabled.
 *
 * The last EV events are kept in a ring: each event is a printk-style format
 * string plus three arguments. ec is the slot the next event goes into, which
 * is also the oldest slot once the ring has wrapped.
 */

#define EV 64

static const char *ev[EV];

static unsigned long ev_a[EV],ev_b[EV],ev_c[EV];
static int ec = 0;


/* Store one event in slot ec and advance ec, wrapping at EV. */
static void EVENT(const char *s,unsigned long a,unsigned long b,unsigned long c)
{
	int slot = ec;

	ev[slot] = s;
	ev_a[slot] = a;
	ev_b[slot] = b;
	ev_c[slot] = c;
	ec = slot+1 == EV ? 0 : slot+1;
}


/*
 * Print all EV slots via printk, starting at slot ec and wrapping around.
 * Slots that were never written have a NULL format and print as "(null)".
 */
static void event_dump(void)
{
	int left,slot;

	printk(KERN_NOTICE "----- Event dump follows -----\n");
	slot = ec;
	for (left = EV; left; left--) {
		const char *fmt = ev[slot];

		if (!fmt) fmt = "(null)";
		printk(KERN_NOTICE);
		printk(fmt,ev_a[slot],ev_b[slot],ev_c[slot]);
		if (++slot == EV) slot = 0;
	}
	printk(KERN_NOTICE "----- Event dump ends here -----\n");
}

#else

/* Logging disabled: events are passed to DPRINTK, event_dump does nothing. */

#define EVENT(s,a,b,c) DPRINTK(s,a,b,c)

static void event_dump(void) {}

#endif /* CONFIG_MMU_HACKS_DEBUG */

/*
 * Helper functions to walk through page tables. If CREATE is set, they add
 * new entries if needed to reach a given PTE. PTEs are never created. If
 * CREATE is not set, *PMD and *PTE might become NULL while passing unavailable
 * memory regions.
 */


/*
 * mmu_resolve - look up the page table entries for ADDR in current->mm.
 *
 * Stores the PGD, PMD and PTE pointers through pgd, pmd and pte. With CREATE
 * set, pmd_alloc/pte_alloc are used instead of pmd_offset/pte_offset.
 *
 * Returns 0 if a PTE pointer was obtained, -ENOMEM otherwise. On failure at
 * the PMD level *pte is set to NULL; on any failure with CREATE set,
 * invalidate() is called before returning.
 */
static inline int mmu_resolve(unsigned long addr,pgd_t **pgd,pmd_t **pmd,
    pte_t **pte,int create)
{
	*pgd = pgd_offset(current->mm,addr);
	if (create) *pmd = pmd_alloc(*pgd,addr);
	else *pmd = pmd_offset(*pgd,addr);
	if (*pmd) {
		if (create) *pte = pte_alloc(*pmd,addr);
		else *pte = pte_offset(*pmd,addr);
		if (*pte) return 0;
	}
	else *pte = NULL;
	/* The walk stopped at the PMD or at the PTE level. */
	if (create) invalidate();
	return -ENOMEM;
}


/*
 * mmu_step - advance the pointers set up by mmu_resolve to ADDR.
 *
 * The masks test whether ADDR is a multiple of the span of one PTE table
 * (PTRS_PER_PTE pages) or of one PMD table. The caller is expected to pass
 * the previous address plus PAGE_SIZE (that is what the callers in this file
 * do).
 *
 * Returns 0 on success and -ENOMEM if no PMD or PTE could be obtained; with
 * CREATE set, invalidate() is called before an error return.
 */
static inline int mmu_step(unsigned long addr,pgd_t **pgd,pmd_t **pmd,
    pte_t **pte,int create)
{
	/* Not at a PTE table boundary: the next PTE is simply adjacent. */
	if (addr & (PTRS_PER_PTE*PAGE_SIZE-1)) {
		if (*pte) (*pte)++;
		return 0;
	}
	if (addr & (PTRS_PER_PMD*PTRS_PER_PTE*PAGE_SIZE-1)) {
		/* Not at a PMD table boundary: move to the next PMD entry. */
		if (*pmd) (*pmd)++;
	}
	else {
		/* Move to the next PGD entry and look up the PMD again. */
		(*pgd)++;
		if (create) *pmd = pmd_alloc(*pgd,addr);
		else *pmd = pmd_offset(*pgd,addr);
		if (!*pmd) {
			if (create) invalidate();
			*pte = NULL;
			return -ENOMEM;
		}
	}
	/* The PMD entry changed, so the PTE pointer has to be looked up. */
	if (create) *pte = pte_alloc(*pmd,addr);
	else *pte = pte_offset(*pmd,addr);
	if (*pte) return 0;
	if (create) invalidate();
	return -ENOMEM;
}


/*
 * Removes a range of pages belonging to the current process. This helps to
 * avoid undesirable copying when COW or swapped-out pages are overwritten in
 * one sweep.
 */


/*
 * free_range - discard every whole page inside [START,START+SIZE) from the
 * current process.
 *
 * The range is first shrunk to the complete pages it contains (START is
 * rounded up, the end is rounded down). For each of those pages whose PTE is
 * not empty, the PTE is cleared; a non-present entry is handed to swap_free,
 * a present page is released with free_page and current->mm->rss is
 * decremented. invalidate() is called once at the end.
 *
 * If the range does not contain a complete page, nothing is done.
 */
void free_range(unsigned long start,unsigned long size)
{
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;
	unsigned long end;

	end = (start+size) & ~(PAGE_SIZE-1);
	start = (start+PAGE_SIZE-1) & ~(PAGE_SIZE-1);
	/*
	 * After rounding, start >= end means there is no complete page in
	 * the range. Proceeding in that case would discard the page that
	 * follows the range, so bail out.
	 */
	if (start >= end) return;
	/*
	 * Walk without creating page tables: the result is ignored on
	 * purpose, a NULL pte just means "nothing mapped here".
	 */
	(void) mmu_resolve(start,&pgd,&pmd,&pte,0);
	while (1) {
		if (pte && !pte_none(*pte)) {
			pte_t old_page;

			old_page = *pte;
			pte_clear(pte);
			if (!pte_present(old_page))
				swap_free(pte_val(old_page));
			else {
				current->mm->rss--;
				free_page(pte_page(old_page));
			}
		}
		if ((start += PAGE_SIZE) >= end) break;
		(void) mmu_step(start,&pgd,&pmd,&pte,0);
	}
	invalidate();
}


/*
 * Copies data by mapping kernel pages into the current process. If the data is
 * mis-aligned or if no whole pages can be copied, ordinary memory-to-memory
 * copies are done.
 *
 * TODO: Speed improvement: if copying "almost" a page, don't copy from kernel
 * to user, but still map the kernel page and copy the user data instead.
 * This may also reduce the amount of bad COW/swap activity.
 *
 */


/*
 * Header that free_around() expects at the start of the page containing
 * skb->head. Only the order field is read in this file.
 * NOTE(review): presumably a private copy of the allocator's (kmalloc's) own
 * page header, in which case the layout must match it exactly - confirm
 * against the allocator source.
 */
struct page_descriptor {
        struct page_descriptor *next;
        struct block_header *firstfree;
        int order;	/* free_around: order-7 (min. 0) is the page order */
        int nfree;
};


/*
 * Since we always work on "big" buffers (>= one memory page), kmalloc's 
 * page sharing doesn't get in the way.
 */


extern volatile unsigned long net_skbcount;


/*
 * free_around - release the pages of an skb's data buffer except for the
 * "hole" [START,END).
 *
 * The buffer is taken to begin at the page containing skb->head, where a
 * struct page_descriptor is expected; its order field gives the buffer size
 * (PAGE_SIZE << (order-7), or one page if order < 7). If the use count of
 * the first page is not 1, an error is logged and nothing is freed.
 * Otherwise the count of every page in the buffer is set to 1 and each page
 * outside the hole is passed to free_page. net_skbcount is decremented in
 * either case.
 */
static void free_around(struct sk_buff *skb,unsigned long start,
    unsigned long end)
{
	struct page_descriptor *hdr;
	unsigned long pg_order,addr,limit;

	net_skbcount--;
	/* FIXME: should also update kmalloc counters @@@ */
	hdr = (struct page_descriptor *) ((unsigned long) skb->head &
	    PAGE_MASK);
	pg_order = hdr->order;
	if (pg_order < 7) pg_order = 0;
	else pg_order -= 7;
	addr = (unsigned long) hdr;
	limit = addr+(PAGE_SIZE << pg_order);
	if (mem_map[MAP_NR(addr)].count != 1) {
		printk(KERN_CRIT "free_around: mem_map[%ld].count is 0x%x\n",
		    MAP_NR(addr),mem_map[MAP_NR(addr)].count);
		event_dump();
		return;
	}
	for (; addr < limit; addr += PAGE_SIZE) {
		mem_map[MAP_NR(addr)].count = 1;
		/* pages inside the hole are kept */
		if (addr >= start && addr < end) continue;
		free_page(addr);
	}
}


/* fixme: what if reading into shared memory region ? */
/*
 * mmucp_tofs - deliver SIZE bytes of skb data, located at KERNEL, to address
 * USER of the current process.
 *
 * SIZE is limited to skb->len. Whole pages are installed directly in the
 * process' page table where possible; leading and trailing partial pages,
 * and pages whose current user mapping has a use count above 1, are copied
 * with copy_to_user. Everything is copied if KERNEL and USER have different
 * offsets within a page or if less than a page is to be transferred.
 *
 * The skb is passed to kfree_skb on all paths except the two early returns
 * marked below. Nothing is returned; results of copy_to_user are not checked.
 */
void mmucp_tofs(unsigned long user,unsigned long size,struct sk_buff *skb,
    unsigned long kernel)
{
	unsigned long extra;
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;
	int error;
	unsigned long hole_start;

	if (size > skb->len) size = skb->len;
	EVENT("mmucp_tofs 0x%lx to 0x%lx+%ld\n",kernel,user,size);
	/*
	 * skb is on a list, locked, or has several users: only copy.
	 * NOTE(review): this path returns without calling kfree_skb, unlike
	 * the copy-only paths below - confirm that this is intended.
	 */
	if (skb->prev || skb->next || skb->lock || skb->users > 1) {
		copy_to_user((void *) user,(void *) kernel,size);
		return;
	}
	/* Different page offsets, or less than one page: copy everything. */
	if (((kernel^user) & (PAGE_SIZE-1)) || size < PAGE_SIZE) {
		EVENT("memcpy(0x%lx,0x%lx,%ld);\n",user,kernel,size);
		copy_to_user((void *) user,(void *) kernel,size);
		kfree_skb(skb,FREE_READ);
		return;
	}
	/* extra = number of bytes from user up to the next page boundary */
	if ((extra = -user & (PAGE_SIZE-1))) {
		/* No whole page left after the leading fragment: copy all. */
		if ((size -= extra) < PAGE_SIZE) {
			EVENT("memcpy(0x%lx,0x%lx,%ld);\n",user,kernel,
			    size+extra);
			copy_to_user((void *) user,(void *) kernel,size+extra);
			kfree_skb(skb,FREE_READ);
			return;
		}
		/*
		 * Copy the leading fragment. NOTE(review): the event logs
		 * size although extra bytes are copied.
		 */
		EVENT("memcpy(0x%lx,0x%lx,%ld);\n",user,kernel,size);
		copy_to_user((void *) user,(void *) kernel,extra);
		user += extra;
		kernel += extra;
	}
	/* NOTE(review): this error path also returns without kfree_skb. */
	if ((error = mmu_resolve(user,&pgd,&pmd,&pte,1)) < 0) {
		invalidate();
		oom(current);
		return;
	}
	/* [hole_start,kernel) will be the pages excluded by free_around. */
	hole_start = kernel;
	while (1) {
		pte_t old_page;

		/* Use count of the currently mapped user page is > 1: copy. */
		if (mem_map[MAP_NR(pte_page(*pte))].count > 1) {
			EVENT("memcpy(0x%lx,0x%lx,PAGE_SIZE);\n",user,kernel,0);
			copy_to_user((void *) user,(void *) kernel,PAGE_SIZE);
		}
		else {
			/* Drop whatever the PTE referred to so far ... */
			old_page = *pte;
			pte_clear(pte);
			/* (the else below belongs to the inner if) */
			if (!pte_none(old_page))
				if (!pte_present(old_page))
					swap_free(pte_val(old_page));
				else {
					current->mm->rss--;
					free_page(pte_page(old_page));
				}
			/* ... and install the kernel page in its place. */
			mem_map[MAP_NR(kernel)].count = 1;
			/* Page is now owned only by user. */
			*pte = mk_pte(kernel,PAGE_SHARED);
			*pte = pte_mkdirty(*pte);
			EVENT("mapped 0x%lx at 0x%lx\n",kernel,user,0);
			current->mm->rss++;
		}
		user += PAGE_SIZE;
		kernel += PAGE_SIZE;
		/* Less than a page left: leave it to the trailing copy. */
		if ((size -= PAGE_SIZE) < PAGE_SIZE) break;
		if ((error = mmu_step(user,&pgd,&pmd,&pte,1)) < 0) {
			kernel -= PAGE_SIZE; /* back off */
			size = 0; /* suppresses the trailing copy */
			oom(current);
			break;
		}
	}
	/* Trailing partial page. */
	if (size) {
		EVENT("memcpy(0x%lx,0x%lx,%ld);\n",user,kernel,size);
		copy_to_user((void *) user,(void *) kernel,size);
	}
	invalidate();
	/* use skb code for all the administrative overhead */
	/*
	 * NOTE(review): the increment presumably keeps kfree_skb from
	 * releasing the data buffer itself, so that free_around can free
	 * the pages outside the hole - confirm against kfree_skb. skb is
	 * read again after the kfree_skb call.
	 */
	skb->count++;
	kfree_skb(skb,FREE_READ);
	if (skb->count == 1) free_around(skb,hole_start,kernel);
	else {
		printk(KERN_CRIT "mmu_tofs: skb->count == %d\n",skb->count);
		event_dump();
	}
}


/*
 * Fault user pages (current process) in physical memory, lock them there,
 * and set them COW so that they stay around even if the user process tries
 * to scribble over them. The locked physical page ranges are recorded in the
 * scatter-gather vector IOV.
 *
 * FIXME: Should check user's physical memory size limit.
 */


/*
 * lock_user - pin the pages backing [START,START+SIZE) of the current
 * process and describe them in IOV.
 *
 * For every page of the range the use count in mem_map is incremented and
 * the PTE is write-protected. Pages whose addresses (as returned by pte_page)
 * are consecutive are merged into one iovec entry; the first entry starts at
 * START's offset within its page and the last one ends at the end of the
 * range.
 *
 * Returns the number of iovec entries filled in, 0 if START+SIZE does not
 * exceed START (e.g. SIZE is 0), or a negative value:
 *   -EAGAIN  a VMA from START on has VM_SHARED or VM_SHM set
 *   -EINVAL  no VMA for a missing page, pte_page yields 0, or reserved page
 *   -ENOSPC  more than IOV_MAX entries would be needed
 *   other    error from mmu_resolve/mmu_step (-ENOMEM)
 *
 * NOTE(review): the error returns inside the loop neither undo the count
 * increment/write protection applied to earlier pages nor call invalidate().
 */
int lock_user(unsigned long start,unsigned long size,int iov_max,
    struct iovec *iov)
{
	struct vm_area_struct *vma;
	unsigned long end,last,from,page;
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;
	int iovcnt,error;

	EVENT("lock_user: %ld@0x%lx\n",size,start,0);
	end = start+size;
	if (start >= end) return 0;
	/* Refuse shared mappings; stop at the VMA that reaches END. */
	for (vma = find_vma(current,start); vma; vma = vma->vm_next)
		if (vma->vm_flags & (VM_SHARED | VM_SHM)) return -EAGAIN;
		else if (vma->vm_end >= end) break;
	iovcnt = 0;
	if ((error = mmu_resolve(start,&pgd,&pmd,&pte,1)) < 0) return error;
	/* last == 0 means "no page handled yet"; from = start of current run */
	last = from = 0;
	while (1) {
		mem_map_t *map;

		EVENT("<0x%p|0x%p|0x%p>\n",pgd,pmd,pte);
		EVENT("0x%lx,0x%lx,%d\n",start,end,iovcnt);
		/* Page not there: have do_no_page bring it in. */
		if (pte_none(*pte) || !pte_present(*pte)) {
			struct vm_area_struct *vma; /* shadows the outer vma */
 
			EVENT("handling missing page\n",0,0,0);
			if (!(vma = find_vma(current,start))) {
				printk(KERN_CRIT "lock_user: VMA (0x%lx) not "
				    "found",start);
				event_dump();
				return -EINVAL;
			}
			do_no_page(current,vma,start,vma->vm_flags & VM_WRITE);
		}
		page = pte_page(*pte);
		EVENT("got page 0x%lx\n",page,0,0);
		if (!page) {
			printk(KERN_ERR "lock_user: Gnorf, no page\n");
			event_dump();
			return -EINVAL;
		}
		map = mem_map+MAP_NR(page);
		if (PageReserved(map)) {
			printk(KERN_ERR "lock_user: reserved\n");
			event_dump();
			return -EINVAL;
		}
		/* Pin the page and make the mapping read-only. */
		atomic_inc(&map->count);
		*pte = pte_wrprotect(*pte);
		if (!last) {
			/* First page: keep the offset, then page-align start. */
			from = page+(start & (PAGE_SIZE-1));
			start &= ~(PAGE_SIZE-1);
		}
		else if (page != last+PAGE_SIZE) {
				/* Not adjacent to the run: emit it, start anew. */
				if (iovcnt >= iov_max) return -ENOSPC;
				iov->iov_base = (caddr_t) from;
				iov->iov_len = last+PAGE_SIZE-from;
				EVENT("putting ... last=0x%lx,from=0x%lx\n",
				    last,from,0);
				iovcnt++;
				iov++;
                                from = page;
			}
		last = page;
		if ((start += PAGE_SIZE) >= end) break;
		if ((error = mmu_step(start,&pgd,&pmd,&pte,1)) < 0)
			return error;
	}
	invalidate();
	/* Emit the final run, which ends at END's offset in the last page. */
	if (iovcnt >= iov_max) return -ENOSPC;
	iov->iov_base = (caddr_t) from;
	iov->iov_len = last+(end & (PAGE_SIZE-1))-from;
	/* END is page-aligned: the last page is used completely. */
	if (start == end) iov->iov_len += PAGE_SIZE;
/*
for (i = 0; i <= iovcnt; i++)
  printk("iov[%d].iov_base = 0x%p\niov[%d].iov_len = 0x%lx\n",
i,iov[i-iovcnt].iov_base,i,iov[i-iovcnt].iov_len);
*/
	return iovcnt+1;
}


/*
 * Release user pages locked with lock_user. wrprotect isn't cleared, so we'll
 * get a few extra protection faults (COW handling doesn't copy pages that are
 * not shared), but that shouldn't do any harm.
 */


/*
 * unlock_user - release the pages described by the first IOVCNT entries of
 * IOV.
 *
 * For each entry, free_page is called once for every page touched by
 * [iov_base,iov_base+iov_len), starting at the page containing iov_base.
 * Presumably this gives back the reference lock_user took on each page with
 * atomic_inc - confirm against free_page.
 *
 * A zero or negative IOVCNT (e.g. an error code returned by lock_user) is a
 * no-op.
 */
void unlock_user(int iovcnt,struct iovec *iov)
{
	unsigned long walk,end;

	for (; iovcnt > 0; iovcnt--, iov++) {
		end = (unsigned long) iov->iov_base+iov->iov_len;
		for (walk = (unsigned long) iov->iov_base & ~(PAGE_SIZE-1);
		    walk < end; walk += PAGE_SIZE)
			free_page(walk);
	}
}

#endif
