File: /usr/src/linux/net/sched/estimator.c

1     /*
2      * net/sched/estimator.c	Simple rate estimator.
3      *
4      *		This program is free software; you can redistribute it and/or
5      *		modify it under the terms of the GNU General Public License
6      *		as published by the Free Software Foundation; either version
7      *		2 of the License, or (at your option) any later version.
8      *
9      * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10      */
11     
12     #include <asm/uaccess.h>
13     #include <asm/system.h>
14     #include <asm/bitops.h>
15     #include <linux/types.h>
16     #include <linux/kernel.h>
17     #include <linux/sched.h>
18     #include <linux/string.h>
19     #include <linux/mm.h>
20     #include <linux/socket.h>
21     #include <linux/sockios.h>
22     #include <linux/in.h>
23     #include <linux/errno.h>
24     #include <linux/interrupt.h>
25     #include <linux/netdevice.h>
26     #include <linux/skbuff.h>
27     #include <linux/rtnetlink.h>
28     #include <linux/init.h>
29     #include <linux/proc_fs.h>
30     #include <net/sock.h>
31     #include <net/pkt_sched.h>
32     
33     /*
34        This code is NOT intended to be used for statistics collection,
35        its purpose is to provide a base for statistical multiplexing
36        for controlled load service.
37        If you need only statistics, run a user level daemon which
38        periodically reads byte counters.
39     
40        Unfortunately, rate estimation is not a very easy task.
41        F.e. I did not find a simple way to estimate the current peak rate
42        and even failed to formulate the problem 8)8)
43     
44        So I preferred not to built an estimator into the scheduler,
45        but run this task separately.
46        Ideally, it should be kernel thread(s), but for now it runs
47        from timers, which puts apparent top bounds on the number of rated
48        flows, has minimal overhead on small, but is enough
49        to handle controlled load service, sets of aggregates.
50     
51        We measure rate over A=(1<<interval) seconds and evaluate EWMA:
52     
53        avrate = avrate*(1-W) + rate*W
54     
55        where W is chosen as negative power of 2: W = 2^(-ewma_log)
56     
57        The resulting time constant is:
58     
59        T = A/(-ln(1-W))
60     
61     
62        NOTES.
63     
64        * The stored value for avbps is scaled by 2^5, so that maximal
65          rate is ~1Gbit, avpps is scaled by 2^10.
66     
67        * Minimal interval is HZ/4=250msec (it is the greatest common divisor
68          for HZ=100 and HZ=1024 8)), maximal interval
69          is (HZ/4)*2^EST_MAX_INTERVAL = 8sec. Shorter intervals
70          are too expensive, longer ones can be implemented
71          at user level painlessly.
72      */
73     
74     #if (HZ%4) != 0
75     #error Bad HZ value.
76     #endif
77     
78     #define EST_MAX_INTERVAL	5
79     
80     struct qdisc_estimator
81     {
82     	struct qdisc_estimator	*next;
83     	struct tc_stats		*stats;
84     	unsigned		interval;
85     	int			ewma_log;
86     	u64			last_bytes;
87     	u32			last_packets;
88     	u32			avpps;
89     	u32			avbps;
90     };
91     
92     struct qdisc_estimator_head
93     {
94     	struct timer_list	timer;
95     	struct qdisc_estimator	*list;
96     };
97     
98     static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1];
99     
100     /* Estimator array lock */
101     static rwlock_t est_lock = RW_LOCK_UNLOCKED;
102     
103     static void est_timer(unsigned long arg)
104     {
105     	int idx = (int)arg;
106     	struct qdisc_estimator *e;
107     
108     	read_lock(&est_lock);
109     	for (e = elist[idx].list; e; e = e->next) {
110     		struct tc_stats *st = e->stats;
111     		u64 nbytes;
112     		u32 npackets;
113     		u32 rate;
114     
115     		spin_lock(st->lock);
116     		nbytes = st->bytes;
117     		npackets = st->packets;
118     		rate = (nbytes - e->last_bytes)<<(7 - idx);
119     		e->last_bytes = nbytes;
120     		e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
121     		st->bps = (e->avbps+0xF)>>5;
122     
123     		rate = (npackets - e->last_packets)<<(12 - idx);
124     		e->last_packets = npackets;
125     		e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
126     		e->stats->pps = (e->avpps+0x1FF)>>10;
127     		spin_unlock(st->lock);
128     	}
129     
130     	mod_timer(&elist[idx].timer, jiffies + ((HZ/4)<<idx));
131     	read_unlock(&est_lock);
132     }
133     
134     int qdisc_new_estimator(struct tc_stats *stats, struct rtattr *opt)
135     {
136     	struct qdisc_estimator *est;
137     	struct tc_estimator *parm = RTA_DATA(opt);
138     
139     	if (RTA_PAYLOAD(opt) < sizeof(*parm))
140     		return -EINVAL;
141     
142     	if (parm->interval < -2 || parm->interval > 3)
143     		return -EINVAL;
144     
145     	est = kmalloc(sizeof(*est), GFP_KERNEL);
146     	if (est == NULL)
147     		return -ENOBUFS;
148     
149     	memset(est, 0, sizeof(*est));
150     	est->interval = parm->interval + 2;
151     	est->stats = stats;
152     	est->ewma_log = parm->ewma_log;
153     	est->last_bytes = stats->bytes;
154     	est->avbps = stats->bps<<5;
155     	est->last_packets = stats->packets;
156     	est->avpps = stats->pps<<10;
157     
158     	est->next = elist[est->interval].list;
159     	if (est->next == NULL) {
160     		init_timer(&elist[est->interval].timer);
161     		elist[est->interval].timer.data = est->interval;
162     		elist[est->interval].timer.expires = jiffies + ((HZ/4)<<est->interval);
163     		elist[est->interval].timer.function = est_timer;
164     		add_timer(&elist[est->interval].timer);
165     	}
166     	write_lock_bh(&est_lock);
167     	elist[est->interval].list = est;
168     	write_unlock_bh(&est_lock);
169     	return 0;
170     }
171     
172     void qdisc_kill_estimator(struct tc_stats *stats)
173     {
174     	int idx;
175     	struct qdisc_estimator *est, **pest;
176     
177     	for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
178     		int killed = 0;
179     		pest = &elist[idx].list;
180     		while ((est=*pest) != NULL) {
181     			if (est->stats != stats) {
182     				pest = &est->next;
183     				continue;
184     			}
185     
186     			write_lock_bh(&est_lock);
187     			*pest = est->next;
188     			write_unlock_bh(&est_lock);
189     
190     			kfree(est);
191     			killed++;
192     		}
193     		if (killed && elist[idx].list == NULL)
194     			del_timer(&elist[idx].timer);
195     	}
196     }
197     
198