

#ifndef lint
static char rcsid[] =
    "@(#) $Header: rm_class.c,v 1.3 95/08/09 18:58:10 van Locked $ (LBL)";
#endif

#include <sys/param.h>
#include <sys/socket.h>
#include <sys/errno.h>
#include <sys/time.h>

#include <net/if.h>
/* #include <net/if_arp.h>*/

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>

#include <linux/time.h>
#include <linux/sched.h>	/* jiffies */
#include <linux/timer.h>	/* struct timer_list, add_timer() */

#include <net/tc_global.h>
#include <net/tc_types.h>

#include <syslog.h>
#include <stdlib.h>
#include <stdio.h>
#include <linux/skbuff.h>
#include <linux/string.h>

/*
 * Macros for dealing with time values.  We assume all times are
 * 'timevals'.  `microtime' is used to get the best available clock
 * resolution.  If `microtime' *doesn't* return a value that's about
 * ten times smaller than the average packet time on the fastest
 * link that will use these routines, a slightly different clock
 * scheme than this one should be used.
 * (Bias due to truncation error in this scheme will overestimate utilization
 * and discriminate against high bandwidth classes.  To remove this bias an
 * integrator needs to be added.  The simplest integrator uses a history of
 * 10 * avg.packet.time / min.tick.time packet completion entries.  This is
 * straightforward to add but we don't want to pay the extra memory
 * traffic to maintain it if it's not necessary (occasionally a vendor
 * accidentally builds a workstation with a decent clock - e.g., Sun & HP).) 
 */

/*
 * Read the current time into the timeval lvalue 'now'.  The argument is
 * parenthesized so that any lvalue expression can be passed safely.
 */
#define RM_GETTIME(now) do_gettimeofday(&(now))


/*
 * True (non-zero) when timeval 'a' is strictly earlier than timeval 'b':
 * compare the seconds first and use the microseconds only to break a tie.
 */
#define TV_LT(a, b) ((a).tv_sec < (b).tv_sec || \
		     ((a).tv_sec == (b).tv_sec && (a).tv_usec < (b).tv_usec))

/*
 * Store in 'delta' the difference a - b in microseconds, for timevals
 * 'a' and 'b'.  Only differences of 0, 1 or 2 whole seconds are computed
 * exactly; for any other seconds difference 'delta' is pinned to 2000000,
 * and a negative seconds difference is additionally logged as bogus.
 */
#define TV_DELTA(a, b, delta) { \
		register int xxs; \
 \
		delta = (a).tv_usec - (b).tv_usec; \
		xxs = (a).tv_sec - (b).tv_sec; \
		if (xxs == 1 || xxs == 2) { \
			/* small gap: add the whole seconds back in */ \
			delta += xxs * 1000000; \
		} else if (xxs != 0) { \
			/* out of range: complain if negative, then clamp */ \
			if (xxs < 0) \
				printk(KERN_WARNING "tc_class: bogus time values %d ", xxs); \
			delta = 2000000; \
		} \
}

/*
 * Set timeval 'res' to timeval 'a' plus 'delta' microseconds, carrying
 * any overflow of the microsecond field into the seconds field.
 *
 * Every use of 'a' and 'res' is parenthesized so that arbitrary lvalue
 * expressions (e.g. *ptr) can be passed, and the body is wrapped in
 * do { } while (0) so the macro behaves as a single statement, including
 * in an unbraced if/else.  Callers must follow it with a semicolon.
 */
#define TV_ADD_DELTA(a, delta, res) do { \
		register int xxus = (a).tv_usec + (delta); \
 \
		(res).tv_sec = (a).tv_sec; \
		while (xxus >= 1000000) { \
			++((res).tv_sec); \
			xxus -= 1000000; \
		} \
		(res).tv_usec = xxus; \
} while (0)


/*
 * Table for mapping a bit mask (with one bit per priority level)
 * into the number of the highest priority set bit.
 *
 * The table has 256 entries, so it covers masks built from priority
 * bits 0 through 7; entry i holds the index of the most significant
 * set bit of i.  Entry 0 (no bit set) is 0, which is indistinguishable
 * from "only bit 0 set": rmc_dequeue_next() only indexes the table while
 * its mask is non-zero.
 */
u_char rmc_mask2pri[] = {
	0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, /* 00 - 0f */
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 10 - 1f */
	5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 20 - 2f */
	5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 30 - 3f */
	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 40 - 4f */
	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 50 - 5f */
	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 60 - 6f */
	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 70 - 7f */
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 80 - 8f */
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 90 - 9f */
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* a0 - af */
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* b0 - bf */
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* c0 - cf */
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* d0 - df */
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* e0 - ef */
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7  /* f0 - ff */
};


/*
 * Allocate and initialize a resource management class at priority 'pri'
 * on the interface described by 'ifdat', and return it.  The class is
 * appended to the end of the circular peer list for its priority (or
 * becomes the sole member if the list was empty).  Allocation failure
 * panics, so the return value is never NULL.
 *
 * nsecPerByte	the data rate of the interface in nanoseconds/byte.
 *		E.g., 800 for a 10Mb/s ethernet.  If the class gets less
 *		than 100% of the bandwidth, this should be the
 *		'effective' rate for the class:  with f the bandwidth
 *		fraction allocated to the class and nsPerByte the data
 *		rate of the output link in nanoseconds/byte, pass
 *		nsPerByte / f.  E.g., 1600 (= 800 / .5) for a class that
 *		gets 50% of an ethernet's bandwidth.
 *
 * action	the routine to call when the class is over limit.
 *
 * maxq		max allowable queue size for the class (in packets).
 *
 * parent	parent class pointer.
 *
 * borrow	class to borrow from (should be either 'parent' or null).
 *
 * maxidle	max value allowed for the class 'idle' time estimate (this
 *		parameter determines how large an initial burst of packets
 *		can be before the overlimit action is invoked).
 *
 * offtime	how long the 'delay' action will delay when the class goes
 *		over limit (this parameter determines the steady-state
 *		burst size when a class is running over its limit).
 *
 * Maxidle and offtime have to be computed from the following:  If the
 * average packet size is s, the bandwidth fraction allocated to this
 * class is f, we want to allow b packet bursts, and the gain of the
 * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
 *
 *   ptime = s * nsPerByte * (1 - f) / f
 *   maxidle = ptime * (1 - g^b) / g^b
 *   offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1))
 *
 * Operationally, it's convenient to specify maxidle & offtime in units
 * independent of the link bandwidth, so the maxidle & offtime passed to
 * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
 * (The constant factor is a scale factor needed to make the parameters
 * integers.  This scaling also means that the 'unscaled' values of
 * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
 * not nanoseconds.)  Also note that the 'idle' filter computation keeps
 * an estimate scaled upward by 2^RM_FILTER_GAIN, so the passed value of
 * maxidle also must be scaled upward by this value.  Thus, the passed
 * values for maxidle and offtime can be computed as follows:
 *
 * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
 * offtime = offtime * 8 / (1000 * nsecPerByte)
 */
struct tc_class *
rmc_newclass(pri, ifdat, nsecPerByte, action, maxq,
	     parent, borrow, maxidle, offtime)
	register int pri;
	register struct tc_ifdat *ifdat;
	register u_int nsecPerByte;
	void (*action)();
	register int maxq;
	register struct tc_class *parent;
	register struct tc_class *borrow;
	register u_int maxidle;
	register u_int offtime;
{
	struct tc_class *cl;
	struct tc_class *head, *last;
	int nslots, slot, tim;

	cl = (struct tc_class *)new_kmem_zalloc(sizeof(*cl), GFP_ATOMIC);
	if (cl == NULL)
		panic("CBQ: not enough for resource management data structures.");

	/* Link the class into the circular peer list of its priority. */
	head = ifdat->classes[pri];
	if (head == NULL) {
		/* first class at this priority: a ring of one */
		ifdat->classes[pri] = cl;
		ifdat->classlist[pri] = cl;
		cl->peer = cl;
	} else {
		/* walk to the last class of the ring and append after it */
		cl->peer = head;
		for (last = head; last->peer != head; last = last->peer)
			;
		last->peer = cl;
	}

	/* Topology and identity. */
	cl->ifdat = ifdat;
	cl->priority = pri;
	cl->parent = parent;
	cl->borrow = borrow;
	cl->overlimit = action;

	/* Queue state and statistics start out empty. */
	cl->qmax = maxq;
	cl->qcnt = 0;
	cl->tail = NULL;
	cl->last.tv_sec = 0;
	cl->last.tv_usec = 0;
	cl->npackets = 0;
	cl->nbytes = 0;
	cl->drops = 0;

	/* Undo the caller's bandwidth-independent scaling; never store 0. */
	maxidle = (maxidle * nsecPerByte) >> 3;
	if (maxidle == 0)
		maxidle = 1;
	cl->maxidle = maxidle;
	cl->avgidle = maxidle;

	offtime = (offtime * nsecPerByte) >> 3;
	if (offtime == 0)
		offtime = 1;
	cl->offtime = offtime;

	/*
	 * len2time[n] is n * nsecPerByte / 1000 (nanoseconds scaled down
	 * to microseconds), with a floor of 1.
	 */
	nslots = sizeof(cl->len2time) / sizeof(cl->len2time[0]);
	tim = 0;
	slot = 0;
	while (slot < nslots) {
		int usec = tim / 1000;

		if (usec == 0)
			usec = 1;
		cl->len2time[slot++] = usec;
		tim += nsecPerByte;
	}

	printk(KERN_WARNING "CBQ: New Class created maxq=%d\n",maxq);
	return (cl);
}

/*
 * Unlink class 'cl' from the circular peer list of its priority on
 * interface 'ifdat' and free the class structure.
 *
 * The list head (ifdat->classes[pri]) and the round-robin cursor
 * (ifdat->classlist[pri]) are moved on to the next peer if they pointed
 * at 'cl'; they are only cleared when 'cl' was the sole class at its
 * priority.  (Clearing them whenever 'cl' happened to be the head would
 * orphan every other class still linked in the ring.)
 *
 * NOTE(review): packets still queued on 'cl', the interface's
 * activecnt/csum bookkeeping for them, ifdat->class, and any pending
 * delay timer that refers to 'cl' are not touched here -- presumably the
 * caller must make sure the class is idle first; confirm against callers.
 */
void
rmc_delete_class(struct tc_ifdat *ifdat, struct tc_class *cl)
{
	struct tc_class *prev;

	if (ifdat->classes[cl->priority] != NULL) {
		if (cl->peer == cl) {
			/* 'cl' is a ring of one: the priority level empties. */
			if (ifdat->classes[cl->priority] == cl) {
				ifdat->classes[cl->priority] = NULL;
				ifdat->classlist[cl->priority] = NULL;
			}
		} else {
			/*
			 * Find the predecessor of 'cl' by walking the ring
			 * starting from its successor; on a well-formed ring
			 * this always comes back around to 'cl'.
			 */
			prev = cl->peer;
			while (prev->peer != cl)
				prev = prev->peer;
			prev->peer = cl->peer;

			/* Keep the head and the cursor off the freed class. */
			if (ifdat->classes[cl->priority] == cl)
				ifdat->classes[cl->priority] = cl->peer;
			if (ifdat->classlist[cl->priority] == cl)
				ifdat->classlist[cl->priority] = cl->peer;
		}
	}

	/*
	 * Free the class structure.
	 */
	kmem_free((struct tc_class *)cl, sizeof(struct tc_class));
}


/*
 * Forward declaration: rmc_init() below installs this routine as the
 * root class's overlimit action before its definition appears.
 */
void rmc_root_overlimit();

/*
 * Initialize the resource management data structures associated
 * with the output portion of interface 'dev'.  'ifdat' is where
 * the structures will be built (for backwards compatibility, the
 * structures aren't kept in the ifnet struct); it is cleared first.
 * 'nsecPerByte' gives the link speed (inverse of bandwidth) in
 * nanoseconds/byte.  'restart' is the driver-specific routine that the
 * generic 'delay until under limit' action will call to restart output.
 * `maxq' is the queue size of the 'link' (root) & 'default' classes.
 * 'maxqueued' is the maximum number of packets that the resource
 * management code will allow to be queued 'downstream' (this is
 * typically 1 or 2 -- just enough to keep the interface busy during
 * the packet completion interrupt latency).
 *
 * Two classes are created:  the root class at priority 0 with no parent,
 * and the default class at priority 1 whose parent and borrow class are
 * both the root.
 */
void
rmc_init(dev, ifdat, nsecPerByte, restart, maxq, maxqueued)
	register struct device *dev;
	register struct tc_ifdat *ifdat;
	register u_int nsecPerByte;
	void (*restart)();
	int maxq, maxqueued;
{
	register struct tc_class *root;

	/*
	 * A single memset clears everything, including the 'queued'
	 * counter, so no separate bzero or explicit zeroing is needed.
	 */
	memset(ifdat, 0, sizeof(*ifdat));
	ifdat->dev = dev;
	ifdat->restart = restart;
	ifdat->maxqueued = maxqueued;

	/*
	 * allocate space for the root class with ~96% avail to borrow
	 * and 128 packet bursts.  rmc_newclass() has no prototype in
	 * scope, so the null parent/borrow arguments are passed as real
	 * pointers rather than the integer constant 0.
	 */
	root = rmc_newclass(0, ifdat, (nsecPerByte * 100) / 96,
			    rmc_root_overlimit, maxq,
			    (struct tc_class *)0, (struct tc_class *)0,
			    879, 61);
	ifdat->rootclass = root;

	/* default - low priority with 95% allocation */
	ifdat->defaultclass = rmc_newclass(1, ifdat, (nsecPerByte * 100) / 95,
					   rmc_delay_action, maxq, root, root,
					   600, 50);

	/* the logged value is unsigned, so it is printed with %u */
	printk(KERN_WARNING "CBQ: Root  and Default Class  created %u  \n",((nsecPerByte * 100) / 96));
}



/*
 * Append the packet given by sk_buff 'm' to the queue of resource
 * class 'cl'.  This routine is called by a driver's if_output routine.
 *
 * The queue is a circular list linked through ->next, with cl->tail
 * pointing at the newest packet and tail->next at the oldest.  When the
 * class goes from empty to non-empty its priority is accounted for in
 * the interface's activecnt[] and csum mask.  If the queue length then
 * reaches cl->qmax, rmc_drop_action() is called to discard a packet.
 *
 * This routine does not itself call the interface 'restart' routine.
 *
 * This routine must be called with output packet completion
 * interrupts locked out (to avoid racing with rmc_dequeue_next).
 */
void
rmc_queue_packet(cl, m)
	register struct tc_class *cl;
	register struct sk_buff* m;
{
	struct tc_ifdat *ifd = cl->ifdat;
	struct sk_buff *tail = cl->tail;

	if (tail == NULL) {
		int cpri = cl->priority;

		/* first packet: a ring of one, and the class becomes active */
		m->next = m;
		if (++ifd->activecnt[cpri] == 1)
			ifd->csum |= (1 << cpri);
	} else {
		/* splice 'm' in between the old tail and the head */
		m->next = tail->next;
		tail->next = m;
	}
	cl->tail = m;

	if (++cl->qcnt >= cl->qmax)
		rmc_drop_action(cl);
}


/*
 * Return 1 if class 'cl' is under limit or can borrow from a class on
 * its 'borrow' chain, 0 if overlimit.  As a side-effect, this routine
 * will invoke the class overlimit action if the class is overlimit.
 *
 * A class with no parent (the root) is always under limit.  A sleeping
 * class stays overlimit until its 'undertime' has passed, at which point
 * it is woken (sleeping cleared) and reported under limit.
 */
int
rmc_under_limit(cl)
	register struct tc_class *cl;
{
	struct tc_class *origin = cl;
	struct tc_class *lender;
	struct timeval now;

	if (!cl->parent)
		/* root class is always under limit */
		return (1);

	RM_GETTIME(now);

	if (cl->sleeping) {
		int still_off = TV_LT(now, cl->undertime);

		if (!still_off)
			cl->sleeping = 0;
		return (!still_off);
	}

	/*
	 * Climb the borrow chain while each class in turn is still over
	 * its limit; running off the end means nobody can lend.
	 */
	for (lender = cl;
	     lender->undertime.tv_sec && TV_LT(now, lender->undertime); ) {
		++lender->borrows;
		lender = lender->borrow;
		if (lender == 0) {
			++origin->overactions;
			printk(KERN_WARNING "CBQ: lcl=%p overactions=%d",origin, origin->overactions);
			(origin->overlimit)(origin);
			return (0);
		}
	}
	return (1);
}

/*
 * Dequeue & return the next packet from the highest priority class that
 * has a packet to send & has enough allocation to send it.  This
 * routine is called by a driver whenever it needs a new packet to
 * output, typically in the XXstart routine (e.g., lestart, bfstart,
 * etc.).  0 is returned if there is no packet to send.  As a side
 * effect of calling this routine, `overlimit' actions are called
 * for classes that have packets to send but are over their bandwidth
 * limit & can't borrow.
 *
 * Within a priority, classes are scanned round-robin starting at
 * ifd->classlist[pri]; after a successful dequeue that cursor is moved
 * to the peer following the chosen class, and the chosen class is
 * remembered in ifd->class.
 */
struct sk_buff *
rmc_dequeue_next(ifd)
	register struct tc_ifdat *ifd;
{
	struct sk_buff *tail = 0, *head;
	struct tc_class *cl = 0;
	int pending, cpri = 0;
	int found = 0;

	/* Try each active priority, highest first, until a class can send. */
	for (pending = ifd->csum; pending && !found; ) {
		struct tc_class *start;

		cpri = rmc_mask2pri[pending];
		start = cl = ifd->classlist[cpri];
		do {
			tail = cl->tail;
			if (tail &&
			    (cl->undertime.tv_sec == 0 || rmc_under_limit(cl))) {
				found = 1;
				break;
			}
			cl = cl->peer;
		} while (cl != start);

		/* nothing sendable at this priority: look at the next one */
		if (!found)
			pending &= ~(1 << cpri);
	}
	if (!found)
		return (0);

	ifd->classlist[cpri] = cl->peer;
	ifd->class = cl;

	/* Unlink the oldest packet (tail->next) from the circular queue. */
	head = tail->next;
	if (head == tail) {
		/* that was the only packet: the class goes inactive */
		cl->tail = 0;
		if (--ifd->activecnt[cpri] <= 0)
			ifd->csum &= ~(1 << cpri);
	} else {
		tail->next = head->next;
	}

	--cl->qcnt;
	++ifd->queued;

	/* Per-priority counters; everything below 5 is lumped together. */
	if (cl->priority == 7)
		++ifd->queued7;
	else if (cl->priority == 6)
		++ifd->queued6;
	else if (cl->priority == 5)
		++ifd->queued5;
	else
		++ifd->queued0;

	return (head);
}

/*
 * Update the utilization estimate for the packet that just completed.
 * The packet's class (ifd->class) & the parent(s) of that class all get
 * their estimators updated, using the packet length in ifd->curlen.
 * This routine is called by the driver's output-packet-completion
 * interrupt service routine.
 *
 * For each class the idle time since its last completion, less the
 * time the packet should have taken (len2time[]), is folded into the
 * filtered 'avgidle'.  A non-positive result marks the class over limit
 * until now + offtime; otherwise avgidle is capped at maxidle and the
 * over-limit mark (undertime.tv_sec) is cleared.
 */
void
rmc_update_util(ifd)
	register struct tc_ifdat *ifd;
{
	struct tc_class *cl = ifd->class;
	int pktlen = ifd->curlen;
	int idle, avgidle;
	struct timeval now;

	--ifd->queued;
	++cl->npackets;
	cl->nbytes += pktlen;
	RM_GETTIME(now);

	for (; cl != 0; cl = cl->parent) {
		TV_DELTA(now, cl->last, idle);
		idle -= cl->len2time[pktlen];

		avgidle = cl->avgidle;
		avgidle += idle - (avgidle >> RM_FILTER_GAIN);

		if (avgidle > 0) {
			/* under limit: clamp the estimate, clear the mark */
			cl->avgidle = (avgidle > cl->maxidle)?  cl->maxidle :
								avgidle;
			cl->undertime.tv_sec = 0;
		} else {
			/* over limit: stay off until now + offtime */
			cl->avgidle = 0;
			cl->undertime = now;
			TV_ADD_DELTA(now, cl->offtime, cl->undertime);
			++cl->over;
		}
		cl->last = now;
	}
}

/*
 * Generic (not protocol-specific) over-limit action routines.  These
 * get invoked by rmc_under_limit() if a class with packets to send
 * is over its bandwidth limit & can't borrow from a parent class.
 *
 * rmc_drop_action() discards the oldest packet queued on class 'cl'
 * (the one at tail->next), updates the queue length, the class and
 * per-priority drop counters, and frees the sk_buff.  It is also called
 * from rmc_queue_packet() when a queue fills.  Calling it on an empty
 * queue panics.
 */
void
rmc_drop_action(cl)
	register struct tc_class *cl;
{
	struct tc_ifdat *ifd = cl->ifdat;
	struct sk_buff *tail = cl->tail;
	struct sk_buff *victim;

	if (tail == NULL)
		panic("rmc_drop_action: empty queue");

	victim = tail->next;
	if (victim == tail) {
		int cpri = cl->priority;

		/* dropping the only packet: the class goes inactive */
		cl->tail = 0;
		if (--ifd->activecnt[cpri] <= 0)
			ifd->csum &= ~(1 << cpri);
	} else {
		tail->next = victim->next;
	}

	--cl->qcnt;
	++cl->drops;

	/* Per-priority counters; everything below 5 is lumped together. */
	if (cl->priority == 7)
		++ifd->drops7;
	else if (cl->priority == 6)
		++ifd->drops6;
	else if (cl->priority == 5)
		++ifd->drops5;
	else
		++ifd->drops0;

	/*
	 * NOTE(review): these look like transmit-side packets, for which
	 * FREE_WRITE may be the appropriate accounting flag -- confirm
	 * before changing.
	 */
	kfree_skb(victim, FREE_READ);
}

/*
 * rmc_delay_action() implements the 'delay' (i.e., rate-limiting) action
 * for a class that goes over it's bandwidth limit.  It simply schedules
 * a timeout to restart sending for the class at the future time when
 * the class would go under its limit.
 */
void
rmc_delay_action(cl)
     register struct tc_class *cl;
{ 

   /*   register int t = hzto(&cl->undertime);*/
   register int t = 1;
   struct timer_list timer =
        { NULL, NULL, 2, 0L, rmc_restart   };

	/*
	 *changed from cl-> to &cl-> 
	 * since packets are phased randomly with respect to the
	 * clock, 1 tick (the next clock tick) can be an arbitrarily
	 * short time so we have to wait for at least two ticks.
	 * Note that we always start a timer even though in the usual
	 * case packet traffic will cause this class be be rescanned
	 * and restarted rather than the relatively coarse system
	 * timer:  If there's no other traffic, we need the timer as
	 * a 'backstop' to restart this class.
	 */ 
	if (t <= 1)
	   t = 2;
   cl->sleeping = 1;
   printk(KERN_WARNING "CBQ: delay_action\n");
   add_timer(&timer); /* schedule the timer */

   /*   timeout(rmc_restart, (caddr_t)cl, t);*/
}  

/*
 * rmc_restart() is just a helper routine for rmc_delay_action -- it is
 * meant to be called by the system timer code & is responsible for
 * checking if the class is still sleeping (it might have been restarted
 * as a side effect of the queue scan on a packet arrival) and, if so,
 * waking it and restarting output on its interface, provided fewer than
 * 'maxqueued' packets are already queued downstream.
 *
 * Inspecting the class state & restarting output require locking the
 * class structure.  In general the driver is responsible for locking,
 * but this is the only routine that is not called directly or indirectly
 * from the interface driver, so it has to know about system locking
 * conventions.  Under bsd this was done by raising IPL to splimp; here
 * interrupts are disabled with save_flags()/cli() and put back with
 * restore_flags().
 */
void
rmc_restart(cl)
	register struct tc_class *cl;
{
	unsigned long flags;
	struct tc_ifdat *ifd;

	save_flags(flags);
	cli();

	if (cl->sleeping) {
		cl->sleeping = 0;
		ifd = cl->ifdat;
		if (ifd->queued < ifd->maxqueued)
			(ifd->restart)(ifd->dev);
	}

	restore_flags(flags);
}

/*
 * Overlimit action installed on the root class by rmc_init().
 *
 * We should never get here:  rmc_under_limit() returns early for a
 * class with no parent, so it never invokes the root's overlimit action.
 * If we do get here anyway, this routine only logs a warning; it does
 * not panic.  'cl' is unused.
 */
void
rmc_root_overlimit(cl)
	register struct tc_class *cl;
{
	printk(KERN_WARNING "CBQ: rmc_root_overlimit\n");
}

