net/sched/sch_generic.c at v2.6.22 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / sched / sch_generic.c
at v2.6.22 610 lines 14 kB view raw
  1/*
  2 * net/sched/sch_generic.c	Generic packet scheduler routines.
  3 *
  4 *		This program is free software; you can redistribute it and/or
  5 *		modify it under the terms of the GNU General Public License
  6 *		as published by the Free Software Foundation; either version
  7 *		2 of the License, or (at your option) any later version.
  8 *
  9 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 10 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 11 *              - Ingress support
 12 */
 13
 14#include <asm/uaccess.h>
 15#include <asm/system.h>
 16#include <linux/bitops.h>
 17#include <linux/module.h>
 18#include <linux/types.h>
 19#include <linux/kernel.h>
 20#include <linux/sched.h>
 21#include <linux/string.h>
 22#include <linux/mm.h>
 23#include <linux/socket.h>
 24#include <linux/sockios.h>
 25#include <linux/in.h>
 26#include <linux/errno.h>
 27#include <linux/interrupt.h>
 28#include <linux/netdevice.h>
 29#include <linux/skbuff.h>
 30#include <linux/rtnetlink.h>
 31#include <linux/init.h>
 32#include <linux/rcupdate.h>
 33#include <linux/list.h>
 34#include <net/sock.h>
 35#include <net/pkt_sched.h>
 36
 37/* Main transmission queue. */
 38
 39/* Modifications to data participating in scheduling must be protected with
 40 * dev->queue_lock spinlock.
 41 *
 42 * The idea is the following:
 43 * - enqueue, dequeue are serialized via top level device
 44 *   spinlock dev->queue_lock.
 45 * - ingress filtering is serialized via top level device
 46 *   spinlock dev->ingress_lock.
 47 * - updates to tree and tree walking are only done under the rtnl mutex.
 48 */
 49
/*
 * Take both per-device scheduler locks.
 *
 * dev->queue_lock is taken first with bottom halves disabled, then
 * dev->ingress_lock is nested inside it.  qdisc_unlock_tree() releases
 * them in the opposite order.
 */
void qdisc_lock_tree(struct net_device *dev)
{
	spin_lock_bh(&dev->queue_lock);
	spin_lock(&dev->ingress_lock);
}
 55
/*
 * Release the locks taken by qdisc_lock_tree(): the inner
 * dev->ingress_lock first, then dev->queue_lock (re-enabling bottom
 * halves).
 */
void qdisc_unlock_tree(struct net_device *dev)
{
	spin_unlock(&dev->ingress_lock);
	spin_unlock_bh(&dev->queue_lock);
}
 61
 62/*
 63   dev->queue_lock serializes queue accesses for this device
 64   AND dev->qdisc pointer itself.
 65
 66   netif_tx_lock serializes accesses to device driver.
 67
 68   dev->queue_lock and netif_tx_lock are mutually exclusive,
 69   if one is grabbed, another must be free.
 70 */
 71
 72
/* Kick device: try to hand one packet to the driver.

   The packet is dev->gso_skb if one is parked there, otherwise whatever
   q->dequeue() returns.  dev->queue_lock is dropped around the call to
   dev_hard_start_xmit() and re-taken afterwards; dev->qdisc is re-read
   after every re-lock because it may have been replaced meanwhile.

   Returns:  0  - nothing to send, or the packet had to be requeued
		  (the device is then rescheduled with netif_schedule()).
	    >=0 - q->q.qlen after a successful transmit or after a
		  dead-loop drop; i.e. 0 if the queue is now empty,
		  >0 if it is not.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	struct sk_buff *skb;

	/* Dequeue packet: a previously deferred gso_skb takes precedence */
	if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
		/* non-zero when the driver does its own tx locking (LLTX) */
		unsigned nolock = (dev->features & NETIF_F_LLTX);

		dev->gso_skb = NULL;

		/*
		 * When the driver has LLTX set it does its own locking
		 * in start_xmit. No need to add additional overhead by
		 * locking again. These checks are worth it because
		 * even uncongested locks can be quite expensive.
		 * The driver can do trylock like here too, in case
		 * of lock congestion it should return -1 and the packet
		 * will be requeued.
		 */
		if (!nolock) {
			if (!netif_tx_trylock(dev)) {
			collision:
				/* So, someone grabbed the driver. */

				/* It may be transient configuration error,
				   when hard_start_xmit() recurses. We detect
				   it by checking xmit owner and drop the
				   packet when deadloop is detected.

				   This label is also entered from below when
				   an LLTX driver returns NETDEV_TX_LOCKED;
				   queue_lock is held again at that point.
				*/
				if (dev->xmit_lock_owner == smp_processor_id()) {
					kfree_skb(skb);
					if (net_ratelimit())
						printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
					goto out;
				}
				__get_cpu_var(netdev_rx_stat).cpu_collision++;
				goto requeue;
			}
		}

		{
			/* And release queue */
			spin_unlock(&dev->queue_lock);

			if (!netif_queue_stopped(dev)) {
				int ret;

				ret = dev_hard_start_xmit(skb, dev);
				if (ret == NETDEV_TX_OK) {
					/* Sent: drop the tx lock (if we took
					   it) and report the queue length. */
					if (!nolock) {
						netif_tx_unlock(dev);
					}
					spin_lock(&dev->queue_lock);
					q = dev->qdisc;
					goto out;
				}
				if (ret == NETDEV_TX_LOCKED && nolock) {
					/* LLTX driver could not get its own
					   lock: treat like a trylock failure. */
					spin_lock(&dev->queue_lock);
					q = dev->qdisc;
					goto collision;
				}
			}

			/* NETDEV_TX_BUSY - we need to requeue */
			/* Release the driver */
			if (!nolock) {
				netif_tx_unlock(dev);
			}
			spin_lock(&dev->queue_lock);
			q = dev->qdisc;
		}

		/* Device kicked us out :(
		   This is possible in the following cases:

		   0. driver is locked
		   1. fastroute is enabled
		   2. device cannot determine busy state
		      before start of transmission (f.e. dialout)
		   3. device is buggy (ppp)
		 */

requeue:
		/* Put the packet back: free it if the qdisc was swapped for
		   noop_qdisc, park it in gso_skb if it is a chain
		   (skb->next set), otherwise hand it to ->requeue(). */
		if (unlikely(q == &noop_qdisc))
			kfree_skb(skb);
		else if (skb->next)
			dev->gso_skb = skb;
		else
			q->ops->requeue(skb, q);
		netif_schedule(dev);
	}
	return 0;

out:
	/* qlen is checked as a signed value to catch counter underflow */
	BUG_ON((int) q->q.qlen < 0);
	return q->q.qlen;
}
179
180void __qdisc_run(struct net_device *dev)
181{
182	do {
183		if (!qdisc_restart(dev))
184			break;
185	} while (!netif_queue_stopped(dev));
186
187	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
188}
189
190static void dev_watchdog(unsigned long arg)
191{
192	struct net_device *dev = (struct net_device *)arg;
193
194	netif_tx_lock(dev);
195	if (dev->qdisc != &noop_qdisc) {
196		if (netif_device_present(dev) &&
197		    netif_running(dev) &&
198		    netif_carrier_ok(dev)) {
199			if (netif_queue_stopped(dev) &&
200			    time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {
201
202				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
203				       dev->name);
204				dev->tx_timeout(dev);
205			}
206			if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
207				dev_hold(dev);
208		}
209	}
210	netif_tx_unlock(dev);
211
212	dev_put(dev);
213}
214
215static void dev_watchdog_init(struct net_device *dev)
216{
217	init_timer(&dev->watchdog_timer);
218	dev->watchdog_timer.data = (unsigned long)dev;
219	dev->watchdog_timer.function = dev_watchdog;
220}
221
222void __netdev_watchdog_up(struct net_device *dev)
223{
224	if (dev->tx_timeout) {
225		if (dev->watchdog_timeo <= 0)
226			dev->watchdog_timeo = 5*HZ;
227		if (!mod_timer(&dev->watchdog_timer,
228			       round_jiffies(jiffies + dev->watchdog_timeo)))
229			dev_hold(dev);
230	}
231}
232
/* File-local wrapper that simply forwards to __netdev_watchdog_up(). */
static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}
237
238static void dev_watchdog_down(struct net_device *dev)
239{
240	netif_tx_lock_bh(dev);
241	if (del_timer(&dev->watchdog_timer))
242		dev_put(dev);
243	netif_tx_unlock_bh(dev);
244}
245
246void netif_carrier_on(struct net_device *dev)
247{
248	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
249		linkwatch_fire_event(dev);
250	if (netif_running(dev))
251		__netdev_watchdog_up(dev);
252}
253
254void netif_carrier_off(struct net_device *dev)
255{
256	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
257		linkwatch_fire_event(dev);
258}
259
260/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
261   under all circumstances. It is difficult to invent anything faster or
262   cheaper.
263 */
264
/*
 * Enqueue handler that accepts nothing: the packet is freed immediately
 * and NET_XMIT_CN is returned.  @qdisc is unused.
 */
static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}
270
/* Dequeue handler for an always-empty queue: returns NULL every time. */
static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
	return NULL;
}
275
/*
 * Requeue handler for qdiscs that hold no packets: logs a rate-limited
 * debug message naming skb->dev, frees the packet and returns NET_XMIT_CN.
 * @qdisc is unused.
 */
static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	if (net_ratelimit())
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
		       skb->dev->name);
	kfree_skb(skb);
	return NET_XMIT_CN;
}
284
/*
 * Operations table for the "noop" discipline: every packet offered to it
 * is freed, and it never has anything to dequeue.  No private data.
 */
struct Qdisc_ops noop_qdisc_ops = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};
293
/*
 * The single, statically allocated noop qdisc instance.  It is flagged
 * TCQ_F_BUILTIN, which makes qdisc_destroy() return without touching it.
 * Its list head is initialised to point at itself (empty list).
 */
struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
};
301
/*
 * Operations table for the "noqueue" discipline.  It reuses the noop
 * handlers; only the id string differs from noop_qdisc_ops.
 */
static struct Qdisc_ops noqueue_qdisc_ops = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};
310
/*
 * Statically allocated qdisc that dev_activate() attaches to devices whose
 * tx_queue_len is 0.  Unlike noop_qdisc, the instance-level enqueue pointer
 * is deliberately NULL even though the ops table has one.
 * NOTE(review): presumably the transmit path tests q->enqueue for NULL to
 * bypass queueing entirely - confirm against dev_queue_xmit().
 */
static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
};
318
319
/*
 * Lookup table indexed by (skb->priority & TC_PRIO_MAX) that yields the
 * pfifo_fast band (0..2) a packet is queued on.  Band 0 is served first by
 * pfifo_fast_dequeue().  The same table is reported to user space as the
 * priomap by pfifo_fast_dump().
 */
static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
322
323/* 3-band FIFO queue: old style, but should be a bit faster than
324   generic prio+fifo combination.
325 */
326
327#define PFIFO_FAST_BANDS 3
328
329static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
330					     struct Qdisc *qdisc)
331{
332	struct sk_buff_head *list = qdisc_priv(qdisc);
333	return list + prio2band[skb->priority & TC_PRIO_MAX];
334}
335
336static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
337{
338	struct sk_buff_head *list = prio2list(skb, qdisc);
339
340	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
341		qdisc->q.qlen++;
342		return __qdisc_enqueue_tail(skb, qdisc, list);
343	}
344
345	return qdisc_drop(skb, qdisc);
346}
347
348static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
349{
350	int prio;
351	struct sk_buff_head *list = qdisc_priv(qdisc);
352
353	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
354		if (!skb_queue_empty(list + prio)) {
355			qdisc->q.qlen--;
356			return __qdisc_dequeue_head(qdisc, list + prio);
357		}
358	}
359
360	return NULL;
361}
362
363static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
364{
365	qdisc->q.qlen++;
366	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
367}
368
369static void pfifo_fast_reset(struct Qdisc* qdisc)
370{
371	int prio;
372	struct sk_buff_head *list = qdisc_priv(qdisc);
373
374	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
375		__qdisc_reset_queue(qdisc, list + prio);
376
377	qdisc->qstats.backlog = 0;
378	qdisc->q.qlen = 0;
379}
380
381static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
382{
383	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
384
385	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
386	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
387	return skb->len;
388
389rtattr_failure:
390	return -1;
391}
392
393static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
394{
395	int prio;
396	struct sk_buff_head *list = qdisc_priv(qdisc);
397
398	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
399		skb_queue_head_init(list + prio);
400
401	return 0;
402}
403
/*
 * Operations table for the default "pfifo_fast" discipline.  The private
 * area is sized for one sk_buff_head per band.  There is no .destroy
 * handler; qdisc_destroy() calls .reset before freeing the qdisc.
 */
static struct Qdisc_ops pfifo_fast_ops = {
	.id		=	"pfifo_fast",
	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.requeue	=	pfifo_fast_requeue,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};
415
416struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
417{
418	void *p;
419	struct Qdisc *sch;
420	unsigned int size;
421	int err = -ENOBUFS;
422
423	/* ensure that the Qdisc and the private data are 32-byte aligned */
424	size = QDISC_ALIGN(sizeof(*sch));
425	size += ops->priv_size + (QDISC_ALIGNTO - 1);
426
427	p = kzalloc(size, GFP_KERNEL);
428	if (!p)
429		goto errout;
430	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
431	sch->padded = (char *) sch - (char *) p;
432
433	INIT_LIST_HEAD(&sch->list);
434	skb_queue_head_init(&sch->q);
435	sch->ops = ops;
436	sch->enqueue = ops->enqueue;
437	sch->dequeue = ops->dequeue;
438	sch->dev = dev;
439	dev_hold(dev);
440	atomic_set(&sch->refcnt, 1);
441
442	return sch;
443errout:
444	return ERR_PTR(-err);
445}
446
447struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
448				 unsigned int parentid)
449{
450	struct Qdisc *sch;
451
452	sch = qdisc_alloc(dev, ops);
453	if (IS_ERR(sch))
454		goto errout;
455	sch->stats_lock = &dev->queue_lock;
456	sch->parent = parentid;
457
458	if (!ops->init || ops->init(sch, NULL) == 0)
459		return sch;
460
461	qdisc_destroy(sch);
462errout:
463	return NULL;
464}
465
466/* Under dev->queue_lock and BH! */
467
468void qdisc_reset(struct Qdisc *qdisc)
469{
470	struct Qdisc_ops *ops = qdisc->ops;
471
472	if (ops->reset)
473		ops->reset(qdisc);
474}
475
476/* this is the rcu callback function to clean up a qdisc when there
477 * are no further references to it */
478
479static void __qdisc_destroy(struct rcu_head *head)
480{
481	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
482	kfree((char *) qdisc - qdisc->padded);
483}
484
485/* Under dev->queue_lock and BH! */
486
487void qdisc_destroy(struct Qdisc *qdisc)
488{
489	struct Qdisc_ops  *ops = qdisc->ops;
490
491	if (qdisc->flags & TCQ_F_BUILTIN ||
492	    !atomic_dec_and_test(&qdisc->refcnt))
493		return;
494
495	list_del(&qdisc->list);
496#ifdef CONFIG_NET_ESTIMATOR
497	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
498#endif
499	if (ops->reset)
500		ops->reset(qdisc);
501	if (ops->destroy)
502		ops->destroy(qdisc);
503
504	module_put(ops->owner);
505	dev_put(qdisc->dev);
506	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
507}
508
509void dev_activate(struct net_device *dev)
510{
511	/* No queueing discipline is attached to device;
512	   create default one i.e. pfifo_fast for devices,
513	   which need queueing and noqueue_qdisc for
514	   virtual interfaces
515	 */
516
517	if (dev->qdisc_sleeping == &noop_qdisc) {
518		struct Qdisc *qdisc;
519		if (dev->tx_queue_len) {
520			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
521						  TC_H_ROOT);
522			if (qdisc == NULL) {
523				printk(KERN_INFO "%s: activation failed\n", dev->name);
524				return;
525			}
526			list_add_tail(&qdisc->list, &dev->qdisc_list);
527		} else {
528			qdisc =  &noqueue_qdisc;
529		}
530		dev->qdisc_sleeping = qdisc;
531	}
532
533	if (!netif_carrier_ok(dev))
534		/* Delay activation until next carrier-on event */
535		return;
536
537	spin_lock_bh(&dev->queue_lock);
538	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
539	if (dev->qdisc != &noqueue_qdisc) {
540		dev->trans_start = jiffies;
541		dev_watchdog_up(dev);
542	}
543	spin_unlock_bh(&dev->queue_lock);
544}
545
546void dev_deactivate(struct net_device *dev)
547{
548	struct Qdisc *qdisc;
549	struct sk_buff *skb;
550
551	spin_lock_bh(&dev->queue_lock);
552	qdisc = dev->qdisc;
553	dev->qdisc = &noop_qdisc;
554
555	qdisc_reset(qdisc);
556
557	skb = dev->gso_skb;
558	dev->gso_skb = NULL;
559	spin_unlock_bh(&dev->queue_lock);
560
561	kfree_skb(skb);
562
563	dev_watchdog_down(dev);
564
565	/* Wait for outstanding dev_queue_xmit calls. */
566	synchronize_rcu();
567
568	/* Wait for outstanding qdisc_run calls. */
569	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
570		yield();
571}
572
573void dev_init_scheduler(struct net_device *dev)
574{
575	qdisc_lock_tree(dev);
576	dev->qdisc = &noop_qdisc;
577	dev->qdisc_sleeping = &noop_qdisc;
578	INIT_LIST_HEAD(&dev->qdisc_list);
579	qdisc_unlock_tree(dev);
580
581	dev_watchdog_init(dev);
582}
583
584void dev_shutdown(struct net_device *dev)
585{
586	struct Qdisc *qdisc;
587
588	qdisc_lock_tree(dev);
589	qdisc = dev->qdisc_sleeping;
590	dev->qdisc = &noop_qdisc;
591	dev->qdisc_sleeping = &noop_qdisc;
592	qdisc_destroy(qdisc);
593#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
594	if ((qdisc = dev->qdisc_ingress) != NULL) {
595		dev->qdisc_ingress = NULL;
596		qdisc_destroy(qdisc);
597	}
598#endif
599	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
600	qdisc_unlock_tree(dev);
601}
602
/* Symbols from this file that are exported for use by other kernel code. */
EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);