static void throtl_free_tg(struct rcu_head *head)
{
	struct throtl_grp *tg;

	tg = container_of(head, struct throtl_grp, rcu_head);
	free_percpu(tg->blkg.stats_cpu);
	kfree(tg);
}
static void throtl_put_tg(struct throtl_grp *tg)
{
	BUG_ON(atomic_read(&tg->ref) <= 0);
	if (!atomic_dec_and_test(&tg->ref))
		return;

	/*
	 * A group is freed in rcu manner. But having an rcu lock does not
	 * mean that one can access all the fields of blkg and assume these
	 * are valid. For example, don't try to follow throtl_data and
	 * request queue links.
	 *
	 * Having a reference to blkg under an rcu allows access to only
	 * values local to groups like group stats and group rate limits.
	 */
	call_rcu(&tg->rcu_head, throtl_free_tg);
}
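/*
 * Illustrative sketch, not part of blk-throttle.c: what an RCU reader is
 * allowed to do with a group. Only group-local fields (stats, rate limits)
 * may be read; following the group back to throtl_data or the request queue
 * is not safe because those may already be gone.
 */
static u64 throtl_read_bps_sketch(struct blkio_cgroup *blkcg, void *key, int rw)
{
	struct blkio_group *blkg;
	u64 bps = -1;

	rcu_read_lock();
	blkg = blkiocg_lookup_group(blkcg, key);
	if (blkg)
		bps = tg_of_blkg(blkg)->bps[rw];	/* group-local field only */
	rcu_read_unlock();

	return bps;
}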
static void throtl_init_group(struct throtl_grp *tg)
{
	INIT_HLIST_NODE(&tg->tg_node);
	RB_CLEAR_NODE(&tg->rb_node);
	bio_list_init(&tg->bio_lists[0]);
	bio_list_init(&tg->bio_lists[1]);
	tg->limits_changed = false;

	/* Practically unlimited BW */
	tg->bps[0] = tg->bps[1] = -1;
	tg->iops[0] = tg->iops[1] = -1;

	/*
	 * Take the initial reference that will be released on destroy.
	 * This can be thought of as a joint reference by cgroup and
	 * request queue which will be dropped by either request queue
	 * exit or cgroup deletion path depending on who is exiting first.
	 */
	atomic_set(&tg->ref, 1);
}
/* Should be called with rcu read lock held (needed for blkcg) */
static void
throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
{
	hlist_add_head(&tg->tg_node, &td->tg_list);
	td->nr_undestroyed_grps++;
}
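/*
 * Note: td->tg_list is the per-queue list of all active groups and
 * nr_undestroyed_grps counts groups that still have to be cleaned up;
 * the queue exit path is expected to walk this list to release each group.
 */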
static void
__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
{
	struct backing_dev_info *bdi = &td->queue->backing_dev_info;
	unsigned int major, minor;

	if (!tg || tg->blkg.dev)
		return;

	/*
	 * Fill in device details for a group which might not have been
	 * filled at group creation time as queue was being instantiated
	 * and driver had not attached a device yet.
	 */
	if (bdi->dev && dev_name(bdi->dev)) {
		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
		tg->blkg.dev = MKDEV(major, minor);
	}
}
/*
 * Should be called without queue lock held. Here queue lock will be
 * taken rarely. It will be taken only once during the life time of a group.
 */
static void
throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
{
	if (!tg || tg->blkg.dev)
		return;

	spin_lock_irq(td->queue->queue_lock);
	__throtl_tg_fill_dev_details(td, tg);
	spin_unlock_irq(td->queue->queue_lock);
}
static void throtl_init_add_tg_lists(struct throtl_data *td,
			struct throtl_grp *tg, struct blkio_cgroup *blkcg)
{
	__throtl_tg_fill_dev_details(td, tg);

	/* Add group onto cgroup list */
	blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
				tg->blkg.dev, BLKIO_POLICY_THROTL);

	tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
	tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
	tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
	tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);

	throtl_add_group_to_td_list(td, tg);
}
/* Should be called without queue lock and outside of rcu period */
static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
{
	struct throtl_grp *tg = NULL;
	int ret;

	tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
	if (!tg)
		return NULL;

	ret = blkio_alloc_blkg_stats(&tg->blkg);
	if (ret) {
		kfree(tg);
		return NULL;
	}

	throtl_init_group(tg);
	return tg;
}
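/*
 * Note: blkio_alloc_blkg_stats() allocates the per cpu stats for the group
 * and may block, which is why the function above must run without the queue
 * lock and outside of an rcu read side section (see also the comment in
 * throtl_get_tg() about dropping both before calling the allocation).
 */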
static struct throtl_grp *
throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
{
	struct throtl_grp *tg = NULL;
	void *key = td;

	/*
	 * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
	 * tree of blkg (instead of traversing through hash list all
	 * the time).
	 */

	/*
	 * This is the common case when there are no blkio cgroups.
	 * Avoid lookup in this case.
	 */
	if (blkcg == &blkio_root_cgroup)
		tg = td->root_tg;
	else
		tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));

	__throtl_tg_fill_dev_details(td, tg);
	return tg;
}
/*
 * This function returns with queue lock unlocked in case of error, like
 * request queue is no more.
 */
static struct throtl_grp *throtl_get_tg(struct throtl_data *td)
{
	struct throtl_grp *tg = NULL, *__tg = NULL;
	struct blkio_cgroup *blkcg;
	struct request_queue *q = td->queue;

	rcu_read_lock();
	blkcg = task_blkio_cgroup(current);
	tg = throtl_find_tg(td, blkcg);
	if (tg) {
		rcu_read_unlock();
		return tg;
	}

	/*
	 * Need to allocate a group. Allocation of group also needs allocation
	 * of per cpu stats which in-turn takes a mutex() and can block. Hence
	 * we need to drop rcu lock and queue_lock before we call alloc.
	 *
	 * Take the request queue reference to make sure queue does not
	 * go away once we return from allocation.
	 */
	blk_get_queue(q);
	rcu_read_unlock();
	spin_unlock_irq(q->queue_lock);

	tg = throtl_alloc_tg(td);
	/*
	 * We might have slept in group allocation. Make sure queue is not
	 * dead yet.
	 */
	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
		blk_put_queue(q);
		kfree(tg);
		return ERR_PTR(-ENODEV);
	}
	blk_put_queue(q);

	/* Group allocated and queue is still alive. take the lock */
	spin_lock_irq(q->queue_lock);

	/* Initialize the new group. After sleeping, read the blkcg again. */
	rcu_read_lock();
	blkcg = task_blkio_cgroup(current);

	/*
	 * If some other thread already allocated the group while we were
	 * not holding queue lock, free up the group.
	 */
	__tg = throtl_find_tg(td, blkcg);
	if (__tg) {
		kfree(tg);
		rcu_read_unlock();
		return __tg;
	}

	/* Group allocation failed. Account the IO to root group */
	if (!tg) {
		tg = td->root_tg;
		rcu_read_unlock();
		return tg;
	}

	throtl_init_add_tg_lists(td, tg, blkcg);
	rcu_read_unlock();
	return tg;
}
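/*
 * Locking contract worth spelling out: on success throtl_get_tg() returns
 * with the queue lock still held (the caller queues or dispatches the bio
 * under it), while on ERR_PTR(-ENODEV) the lock has already been dropped,
 * which is why blk_throtl_bio() returns immediately without unlocking in
 * that case.
 */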
static void throtl_process_limit_change(struct throtl_data *td)
{
	struct throtl_grp *tg;
	struct hlist_node *pos, *n;

	if (!td->limits_changed)
		return;

	/*
	 * xchg() implies a full memory barrier, so an update to
	 * tg->limits_changed made by the throtl_update_blkio_group_read_bps()
	 * group of functions before td->limits_changed was set is guaranteed
	 * to be visible here once we see td->limits_changed.
	 */
	xchg(&td->limits_changed, false);

	throtl_log(td, "limits changed");

	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
		if (!tg->limits_changed)
			continue;

		if (!xchg(&tg->limits_changed, false))
			continue;

		throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
			" riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
			tg->iops[READ], tg->iops[WRITE]);

		/*
		 * Restart the slices for both READ and WRITES. It
		 * might happen that a group's limits are dropped
		 * suddenly and we don't want to account recently
		 * dispatched IO with the new low rate.
		 */
		throtl_start_new_slice(td, tg, 0);
		throtl_start_new_slice(td, tg, 1);

		if (throtl_tg_on_rr(tg))
			tg_update_disptime(td, tg);
	}
}
/* Dispatch throttled bios. Should be called without queue lock held. */
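/*
 * The body of throtl_dispatch() is not part of this excerpt. What follows is
 * a rough, illustrative sketch of its shape, assuming the helpers
 * throtl_select_dispatch(), throtl_schedule_next_dispatch(), total_nr_queued()
 * and throtl_log() defined elsewhere in blk-throttle.c; it is an
 * approximation, not the verbatim function.
 */
static int throtl_dispatch(struct request_queue *q)
{
	struct throtl_data *td = q->td;
	unsigned int nr_disp = 0;
	struct bio_list bio_list_on_stack;
	struct bio *bio;

	spin_lock_irq(q->queue_lock);

	/* Apply any pending limit changes before selecting bios */
	throtl_process_limit_change(td);

	if (!total_nr_queued(td))
		goto out;

	bio_list_init(&bio_list_on_stack);

	/* Move bios that are within their rate limits onto the local list */
	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
	if (nr_disp)
		throtl_log(td, "bios disp=%u", nr_disp);

	throtl_schedule_next_dispatch(td);
out:
	spin_unlock_irq(q->queue_lock);

	/* Submit the selected bios without holding the queue lock */
	while ((bio = bio_list_pop(&bio_list_on_stack)))
		generic_make_request(bio);

	return nr_disp;
}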
static void throtl_update_blkio_group_read_bps(void *key,
				struct blkio_group *blkg, u64 read_bps)
{
	struct throtl_data *td = key;
	struct throtl_grp *tg = tg_of_blkg(blkg);

	tg->bps[READ] = read_bps;
	throtl_update_blkio_group_common(td, tg);
}
static void throtl_update_blkio_group_write_bps(void *key,
				struct blkio_group *blkg, u64 write_bps)
{
	struct throtl_data *td = key;
	struct throtl_grp *tg = tg_of_blkg(blkg);

	tg->bps[WRITE] = write_bps;
	throtl_update_blkio_group_common(td, tg);
}
static void throtl_update_blkio_group_read_iops(void *key,
				struct blkio_group *blkg, unsigned int read_iops)
{
	struct throtl_data *td = key;
	struct throtl_grp *tg = tg_of_blkg(blkg);

	tg->iops[READ] = read_iops;
	throtl_update_blkio_group_common(td, tg);
}
static void throtl_update_blkio_group_write_iops(void *key,
				struct blkio_group *blkg, unsigned int write_iops)
{
	struct throtl_data *td = key;
	struct throtl_grp *tg = tg_of_blkg(blkg);

	tg->iops[WRITE] = write_iops;
	throtl_update_blkio_group_common(td, tg);
}
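/*
 * throtl_update_blkio_group_common() itself is not part of this excerpt.
 * A minimal sketch of the shared helper the four callbacks above funnel
 * into, inferred from how it is used here: mark the group and then the
 * throtl_data as changed (xchg also provides the ordering the old
 * open-coded versions achieved with explicit barriers), then kick the
 * worker so the new limits are applied soon.
 */
static void throtl_update_blkio_group_common(struct throtl_data *td,
				struct throtl_grp *tg)
{
	xchg(&tg->limits_changed, true);
	xchg(&td->limits_changed, true);
	/* Schedule a work now to process the limit change */
	throtl_schedule_delayed_work(td, 0);
}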
static void throtl_shutdown_wq(struct request_queue *q)
{
	struct throtl_data *td = q->td;

	cancel_delayed_work_sync(&td->throtl_work);
}
int blk_throtl_bio(struct request_queue *q, struct bio **biop)
{
	struct throtl_data *td = q->td;
	struct throtl_grp *tg;
	struct bio *bio = *biop;
	bool rw = bio_data_dir(bio), update_disptime = true;
	struct blkio_cgroup *blkcg;

	if (bio->bi_rw & REQ_THROTTLED) {
		bio->bi_rw &= ~REQ_THROTTLED;
		return 0;
	}

	/*
	 * A throtl_grp pointer retrieved under rcu can be used to access
	 * basic fields like stats and io rates. If a group has no rules,
	 * just update the dispatch stats in lockless manner and return.
	 */
	rcu_read_lock();
	blkcg = task_blkio_cgroup(current);
	tg = throtl_find_tg(td, blkcg);
	if (tg) {
		throtl_tg_fill_dev_details(td, tg);

		if (tg_no_rule_group(tg, rw)) {
			blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
					rw, bio->bi_rw & REQ_SYNC);
			rcu_read_unlock();
			return 0;
		}
	}
	rcu_read_unlock();

	/*
	 * Either group has not been allocated yet or it is not an unlimited
	 * IO group.
	 */
	spin_lock_irq(q->queue_lock);
	tg = throtl_get_tg(td);

	if (PTR_ERR(tg) == -ENODEV) {
		/*
		 * Queue is gone. No queue lock held here.
		 */
		return -ENODEV;
	}

	if (tg->nr_queued[rw]) {
		/*
		 * There is already another bio queued in same dir. No
		 * need to update dispatch time.
		 */
		update_disptime = false;
		goto queue_bio;
	}

	/* Bio is with-in rate limit of group */
	if (tg_may_dispatch(td, tg, bio, NULL)) {
		throtl_charge_bio(tg, bio);

		/*
		 * We need to trim slice even when bios are not being queued
		 * otherwise it might happen that a bio is not queued for
		 * a long time and slice keeps on extending and trim is not
		 * called for a long time. Now if limits are reduced suddenly
		 * we take into account all the IO dispatched so far at new
		 * low rate and newly queued IO gets a really long dispatch
		 * time.
		 *
		 * So keep on trimming slice even if bio is not queued.
		 */
		throtl_trim_slice(td, tg, rw);
		goto out;
	}

queue_bio:
	throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
			" iodisp=%u iops=%u queued=%d/%d",
			rw == READ ? 'R' : 'W',
			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
			tg->io_disp[rw], tg->iops[rw],
			tg->nr_queued[READ], tg->nr_queued[WRITE]);

	throtl_add_bio_tg(q->td, tg, bio);
	*biop = NULL;

	if (update_disptime) {
		tg_update_disptime(td, tg);
		throtl_schedule_next_dispatch(td);
	}

out:
	spin_unlock_irq(q->queue_lock);
	return 0;
}
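/*
 * To summarize the fast path above: a bio either (a) belongs to a group with
 * no rules and only gets its dispatch stats updated locklessly, (b) fits in
 * the group's current slice and is charged and allowed to proceed, or (c) is
 * queued on the group and submitted later by the dispatch worker with
 * REQ_THROTTLED set so it is not throttled twice.
 *
 * The fragment that follows is from the per-queue setup path, where the
 * throtl_data and the root group are allocated and initialized.
 */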
int blk_throtl_init(struct request_queue *q)
{
	struct throtl_data *td;
	struct throtl_grp *tg;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
	if (!td)
		return -ENOMEM;

	INIT_HLIST_HEAD(&td->tg_list);
	td->tg_service_tree = THROTL_RB_ROOT;
	td->limits_changed = false;
	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);

	/* alloc and Init root group. */
	td->queue = q;
	tg = throtl_alloc_tg(td);

	if (!tg) {
		kfree(td);
		return -ENOMEM;
	}

	td->root_tg = tg;

	rcu_read_lock();
	throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
	rcu_read_unlock();

	/* Attach throtl data to request queue */