From: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Date: Fri, 17 Sep 2021 13:31:38 +0200
Subject: [PATCH 06/10] net: sched: Protect Qdisc::bstats with u64_stats
The not-per-CPU variant of qdisc tc (traffic control) statistics,
Qdisc::gnet_stats_basic_packed bstats, is protected with Qdisc::running
sequence counter.
This sequence counter is used for reliably protecting bstats reads from
parallel writes. Meanwhile, the seqcount's write section covers a much
wider area than the bstats update: qdisc_run_begin() => qdisc_run_end().

That read/write section asymmetry can lead to needless retries of the
read section. To prepare for removing the Qdisc::running sequence
counter altogether, introduce a u64_stats sync point inside bstats
instead.

Modify _bstats_update() to start/end the bstats u64_stats write
section. Introduce _bstats_set(): raw writes done within the bigger
qdisc_run_begin/end() section now need a helper that starts/ends the
u64_stats write section around them.
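
For illustration only (the helper itself is not part of this patch's
diffstat), such a _bstats_set() would presumably mirror the
_bstats_update() change below and wrap the raw assignments in the same
write section:

/* Sketch only: name and body assumed from the description above;
 * the real helper lands with the follow-up call-site conversions.
 */
static inline void _bstats_set(struct gnet_stats_basic_packed *bstats,
                               __u64 bytes, __u64 packets)
{
        u64_stats_update_begin(&bstats->syncp);
        bstats->bytes = bytes;
        bstats->packets = packets;
        u64_stats_update_end(&bstats->syncp);
}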

For bisectability and finer commit granularity, the bstats read section
is still protected by a Qdisc::running read/retry loop, and
qdisc_run_begin/end() still starts/ends that seqcount write section.
Once all call sites are modified to use _bstats_set/update(), the
Qdisc::running seqcount will be removed and the bstats read/retry loop
will be modified to utilize the internal u64_stats sync point.
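
As a sketch of where this is heading (the actual conversion happens
later in the series, and bstats_read is a placeholder name), the read
side would then retry on the per-bstats sync point instead of on
Qdisc::running, roughly:

/* Illustrative reader, not part of this patch: snapshot bstats via the
 * embedded u64_stats sync point. On 32-bit SMP this loops until a
 * consistent bytes/packets pair has been read.
 */
static void bstats_read(const struct gnet_stats_basic_packed *b,
                        __u64 *bytes, __u64 *packets)
{
        unsigned int start;

        do {
                start = u64_stats_fetch_begin(&b->syncp);
                *bytes = b->bytes;
                *packets = b->packets;
        } while (u64_stats_fetch_retry(&b->syncp, start));
}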

Note: using u64_stats implies no sequence counter protection on 64-bit
architectures. This can lead to the statistics "packets" vs. "bytes"
values getting out of sync on rare occasions. The individual values
will still be valid.
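
In other words, on a 64-bit build the retry loop sketched above
degenerates to two plain loads, which is where the bytes/packets skew
can come from:

/* 64-bit view (illustration only): u64_stats_fetch_begin() returns 0
 * and u64_stats_fetch_retry() is always false, so a snapshot is just
 * two independent loads. A naturally aligned 64-bit load does not
 * tear, so each value stays valid, but the pair may straddle an
 * update.
 */
static void bstats_read_64bit(const struct gnet_stats_basic_packed *b,
                              __u64 *bytes, __u64 *packets)
{
        *bytes = b->bytes;
        *packets = b->packets;
}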

[bigeasy: Minor commit message edits, init all gnet_stats_basic_packed.]
Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/net/gen_stats.h | 14 ++++++++++++++
include/net/sch_generic.h | 2 ++
net/core/gen_estimator.c | 2 +-
net/core/gen_stats.c | 16 ++++++++++++++--
net/netfilter/xt_RATEEST.c | 1 +
net/sched/act_api.c | 2 ++
net/sched/sch_atm.c | 1 +
net/sched/sch_cbq.c | 1 +
net/sched/sch_drr.c | 1 +
net/sched/sch_ets.c | 1 +
net/sched/sch_generic.c | 1 +
net/sched/sch_gred.c | 4 +++-
net/sched/sch_hfsc.c | 1 +
net/sched/sch_htb.c | 7 +++++--
net/sched/sch_mq.c | 2 +-
net/sched/sch_mqprio.c | 5 +++--
net/sched/sch_qfq.c | 1 +
17 files changed, 53 insertions(+), 9 deletions(-)
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -11,6 +11,7 @@
struct gnet_stats_basic_packed {
__u64 bytes;
__u64 packets;
+ struct u64_stats_sync syncp;
};
struct gnet_stats_basic_cpu {
@@ -18,6 +19,19 @@ struct gnet_stats_basic_cpu {
struct u64_stats_sync syncp;
} __aligned(2 * sizeof(u64));
+#ifdef CONFIG_LOCKDEP
+void gnet_stats_basic_packed_init(struct gnet_stats_basic_packed *b);
+
+#else
+
+static inline void gnet_stats_basic_packed_init(struct gnet_stats_basic_packed *b)
+{
+ b->bytes = 0;
+ b->packets = 0;
+ u64_stats_init(&b->syncp);
+}
+#endif
+
struct net_rate_estimator;
struct gnet_dump {
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -848,8 +848,10 @@ static inline int qdisc_enqueue(struct s
static inline void _bstats_update(struct gnet_stats_basic_packed *bstats,
__u64 bytes, __u32 packets)
{
+ u64_stats_update_begin(&bstats->syncp);
bstats->bytes += bytes;
bstats->packets += packets;
+ u64_stats_update_end(&bstats->syncp);
}
static inline void bstats_update(struct gnet_stats_basic_packed *bstats,
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -62,7 +62,7 @@ struct net_rate_estimator {
static void est_fetch_counters(struct net_rate_estimator *e,
struct gnet_stats_basic_packed *b)
{
- memset(b, 0, sizeof(*b));
+ gnet_stats_basic_packed_init(b);
if (e->stats_lock)
spin_lock(e->stats_lock);
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -18,7 +18,7 @@
#include <linux/gen_stats.h>
#include <net/netlink.h>
#include <net/gen_stats.h>
-
+#include <net/sch_generic.h>
static inline int
gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr)
@@ -114,6 +114,17 @@ gnet_stats_start_copy(struct sk_buff *sk
}
EXPORT_SYMBOL(gnet_stats_start_copy);
+#ifdef CONFIG_LOCKDEP
+/* Must not be inlined, due to u64_stats seqcount_t lockdep key */
+void gnet_stats_basic_packed_init(struct gnet_stats_basic_packed *b)
+{
+ b->bytes = 0;
+ b->packets = 0;
+ u64_stats_init(&b->syncp);
+}
+EXPORT_SYMBOL(gnet_stats_basic_packed_init);
+#endif
+
static void
__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu)
@@ -169,8 +180,9 @@ static int
struct gnet_stats_basic_packed *b,
int type)
{
- struct gnet_stats_basic_packed bstats = {0};
+ struct gnet_stats_basic_packed bstats;
+ gnet_stats_basic_packed_init(&bstats);
__gnet_stats_copy_basic(running, &bstats, cpu, b);
if (d->compat_tc_stats && type == TCA_STATS_BASIC) {
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -143,6 +143,7 @@ static int xt_rateest_tg_checkentry(cons
if (!est)
goto err1;
+ gnet_stats_basic_packed_init(&est->bstats);
strlcpy(est->name, info->name, sizeof(est->name));
spin_lock_init(&est->lock);
est->refcnt = 1;
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -490,6 +490,8 @@ int tcf_idr_create(struct tc_action_net
if (!p->cpu_qstats)
goto err3;
}
+ gnet_stats_basic_packed_init(&p->tcfa_bstats);
+ gnet_stats_basic_packed_init(&p->tcfa_bstats_hw);
spin_lock_init(&p->tcfa_lock);
p->tcfa_index = index;
p->tcfa_tm.install = jiffies;
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -548,6 +548,7 @@ static int atm_tc_init(struct Qdisc *sch
pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
INIT_LIST_HEAD(&p->flows);
INIT_LIST_HEAD(&p->link.list);
+ gnet_stats_basic_packed_init(&p->link.bstats);
list_add(&p->link.list, &p->flows);
p->link.q = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops, sch->handle, extack);
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1611,6 +1611,7 @@ cbq_change_class(struct Qdisc *sch, u32
if (cl == NULL)
goto failure;
+ gnet_stats_basic_packed_init(&cl->bstats);
err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
if (err) {
kfree(cl);
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -106,6 +106,7 @@ static int drr_change_class(struct Qdisc
if (cl == NULL)
return -ENOBUFS;
+ gnet_stats_basic_packed_init(&cl->bstats);
cl->common.classid = classid;
cl->quantum = quantum;
cl->qdisc = qdisc_create_dflt(sch->dev_queue,
--- a/net/sched/sch_ets.c
+++ b/net/sched/sch_ets.c
@@ -662,6 +662,7 @@ static int ets_qdisc_change(struct Qdisc
q->nbands = nbands;
for (i = nstrict; i < q->nstrict; i++) {
INIT_LIST_HEAD(&q->classes[i].alist);
+ gnet_stats_basic_packed_init(&q->classes[i].bstats);
if (q->classes[i].qdisc->q.qlen) {
list_add_tail(&q->classes[i].alist, &q->active);
q->classes[i].deficit = quanta[i];
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -892,6 +892,7 @@ struct Qdisc *qdisc_alloc(struct netdev_
__skb_queue_head_init(&sch->gso_skb);
__skb_queue_head_init(&sch->skb_bad_txq);
qdisc_skb_head_init(&sch->q);
+ gnet_stats_basic_packed_init(&sch->bstats);
spin_lock_init(&sch->q.lock);
if (ops->static_flags & TCQ_F_CPUSTATS) {
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -364,9 +364,11 @@ static int gred_offload_dump_stats(struc
hw_stats->handle = sch->handle;
hw_stats->parent = sch->parent;
- for (i = 0; i < MAX_DPs; i++)
+ for (i = 0; i < MAX_DPs; i++) {
+ gnet_stats_basic_packed_init(&hw_stats->stats.bstats[i]);
if (table->tab[i])
hw_stats->stats.xstats[i] = &table->tab[i]->stats;
+ }
ret = qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_GRED, hw_stats);
/* Even if driver returns failure adjust the stats - in case offload
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1406,6 +1406,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struc
if (err)
return err;
+ gnet_stats_basic_packed_init(&q->root.bstats);
q->root.cl_common.classid = sch->handle;
q->root.sched = q;
q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1311,7 +1311,7 @@ static void htb_offload_aggregate_stats(
struct htb_class *c;
unsigned int i;
- memset(&cl->bstats, 0, sizeof(cl->bstats));
+ gnet_stats_basic_packed_init(&cl->bstats);
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) {
@@ -1357,7 +1357,7 @@ htb_dump_class_stats(struct Qdisc *sch,
if (cl->leaf.q)
cl->bstats = cl->leaf.q->bstats;
else
- memset(&cl->bstats, 0, sizeof(cl->bstats));
+ gnet_stats_basic_packed_init(&cl->bstats);
cl->bstats.bytes += cl->bstats_bias.bytes;
cl->bstats.packets += cl->bstats_bias.packets;
} else {
@@ -1849,6 +1849,9 @@ static int htb_change_class(struct Qdisc
if (!cl)
goto failure;
+ gnet_stats_basic_packed_init(&cl->bstats);
+ gnet_stats_basic_packed_init(&cl->bstats_bias);
+
err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
if (err) {
kfree(cl);
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -133,7 +133,7 @@ static int mq_dump(struct Qdisc *sch, st
__u32 qlen = 0;
sch->q.qlen = 0;
- memset(&sch->bstats, 0, sizeof(sch->bstats));
+ gnet_stats_basic_packed_init(&sch->bstats);
memset(&sch->qstats, 0, sizeof(sch->qstats));
/* MQ supports lockless qdiscs. However, statistics accounting needs
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -390,7 +390,7 @@ static int mqprio_dump(struct Qdisc *sch
unsigned int ntx, tc;
sch->q.qlen = 0;
- memset(&sch->bstats, 0, sizeof(sch->bstats));
+ gnet_stats_basic_packed_init(&sch->bstats);
memset(&sch->qstats, 0, sizeof(sch->qstats));
/* MQ supports lockless qdiscs. However, statistics accounting needs
@@ -504,10 +504,11 @@ static int mqprio_dump_class_stats(struc
int i;
__u32 qlen = 0;
struct gnet_stats_queue qstats = {0};
- struct gnet_stats_basic_packed bstats = {0};
+ struct gnet_stats_basic_packed bstats;
struct net_device *dev = qdisc_dev(sch);
struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK];
+ gnet_stats_basic_packed_init(&bstats);
/* Drop lock here it will be reclaimed before touching
* statistics this is required because the d->lock we
* hold here is the look on dev_queue->qdisc_sleeping
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -465,6 +465,7 @@ static int qfq_change_class(struct Qdisc
if (cl == NULL)
return -ENOBUFS;
+ gnet_stats_basic_packed_init(&cl->bstats);
cl->common.classid = classid;
cl->deficit = lmax;