1
by Tim Gardner
Import upstream version 4.2.0 |
1 |
/*
|
2 |
* This file is part of UBIFS.
|
|
3 |
*
|
|
4 |
* Copyright (C) 2006-2008 Nokia Corporation.
|
|
5 |
*
|
|
6 |
* This program is free software; you can redistribute it and/or modify it
|
|
7 |
* under the terms of the GNU General Public License version 2 as published by
|
|
8 |
* the Free Software Foundation.
|
|
9 |
*
|
|
10 |
* This program is distributed in the hope that it will be useful, but WITHOUT
|
|
11 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
12 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
|
13 |
* more details.
|
|
14 |
*
|
|
15 |
* You should have received a copy of the GNU General Public License along with
|
|
16 |
* this program; if not, write to the Free Software Foundation, Inc., 51
|
|
17 |
* Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
18 |
*
|
|
19 |
* Authors: Adrian Hunter
|
|
20 |
* Artem Bityutskiy (Битюцкий Артём)
|
|
21 |
*/
|
|
22 |
||
23 |
/*
|
|
24 |
* This file implements garbage collection. The procedure for garbage collection
|
|
25 |
* is different depending on whether a LEB as an index LEB (contains index
|
|
26 |
* nodes) or not. For non-index LEBs, garbage collection finds a LEB which
|
|
27 |
* contains a lot of dirty space (obsolete nodes), and copies the non-obsolete
|
|
28 |
* nodes to the journal, at which point the garbage-collected LEB is free to be
|
|
29 |
* reused. For index LEBs, garbage collection marks the non-obsolete index nodes
|
|
30 |
* dirty in the TNC, and after the next commit, the garbage-collected LEB is
|
|
31 |
* to be reused. Garbage collection will cause the number of dirty index nodes
|
|
32 |
* to grow, however sufficient space is reserved for the index to ensure the
|
|
33 |
* commit will never run out of space.
|
|
34 |
*
|
|
35 |
* Notes about dead watermark. At current UBIFS implementation we assume that
|
|
36 |
* LEBs which have less than @c->dead_wm bytes of free + dirty space are full
|
|
37 |
* and not worth garbage-collecting. The dead watermark is one min. I/O unit
|
|
38 |
* size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS
|
|
39 |
* Garbage Collector has to synchronize the GC head's write buffer before
|
|
40 |
* returning, so this is about wasting one min. I/O unit. However, UBIFS GC can
|
|
41 |
* actually reclaim even very small pieces of dirty space by garbage collecting
|
|
42 |
* enough dirty LEBs, but we do not bother doing this at this implementation.
|
|
43 |
*
|
|
44 |
* Notes about dark watermark. The results of GC work depends on how big are
|
|
45 |
* the UBIFS nodes GC deals with. Large nodes make GC waste more space. Indeed,
|
|
46 |
* if GC move data from LEB A to LEB B and nodes in LEB A are large, GC would
|
|
47 |
* have to waste large pieces of free space at the end of LEB B, because nodes
|
|
48 |
* from LEB A would not fit. And the worst situation is when all nodes are of
|
|
49 |
* maximum size. So dark watermark is the amount of free + dirty space in LEB
|
|
50 |
* which are guaranteed to be reclaimable. If LEB has less space, the GC might
|
|
51 |
* be unable to reclaim it. So, LEBs with free + dirty greater than dark
|
|
52 |
* watermark are "good" LEBs from GC's point of few. The other LEBs are not so
|
|
53 |
* good, and GC takes extra care when moving them.
|
|
54 |
*/
|
|
55 |
||
56 |
#include <linux/slab.h> |
|
57 |
#include <linux/pagemap.h> |
|
58 |
#include <linux/list_sort.h> |
|
59 |
#include "ubifs.h" |
|
60 |
||
61 |
/*
|
|
62 |
* GC may need to move more than one LEB to make progress. The below constants
|
|
63 |
* define "soft" and "hard" limits on the number of LEBs the garbage collector
|
|
64 |
* may move.
|
|
65 |
*/
|
|
66 |
#define SOFT_LEBS_LIMIT 4
|
|
67 |
#define HARD_LEBS_LIMIT 32
|
|
68 |
||
69 |
/**
|
|
70 |
* switch_gc_head - switch the garbage collection journal head.
|
|
71 |
* @c: UBIFS file-system description object
|
|
72 |
* @buf: buffer to write
|
|
73 |
* @len: length of the buffer to write
|
|
74 |
* @lnum: LEB number written is returned here
|
|
75 |
* @offs: offset written is returned here
|
|
76 |
*
|
|
77 |
* This function switch the GC head to the next LEB which is reserved in
|
|
78 |
* @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required,
|
|
79 |
* and other negative error code in case of failures.
|
|
80 |
*/
|
|
81 |
static int switch_gc_head(struct ubifs_info *c) |
|
82 |
{
|
|
83 |
int err, gc_lnum = c->gc_lnum; |
|
84 |
struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; |
|
85 |
||
86 |
ubifs_assert(gc_lnum != -1); |
|
87 |
dbg_gc("switch GC head from LEB %d:%d to LEB %d (waste %d bytes)", |
|
88 |
wbuf->lnum, wbuf->offs + wbuf->used, gc_lnum, |
|
89 |
c->leb_size - wbuf->offs - wbuf->used); |
|
90 |
||
91 |
err = ubifs_wbuf_sync_nolock(wbuf); |
|
92 |
if (err) |
|
93 |
return err; |
|
94 |
||
95 |
/*
|
|
96 |
* The GC write-buffer was synchronized, we may safely unmap
|
|
97 |
* 'c->gc_lnum'.
|
|
98 |
*/
|
|
99 |
err = ubifs_leb_unmap(c, gc_lnum); |
|
100 |
if (err) |
|
101 |
return err; |
|
102 |
||
103 |
err = ubifs_wbuf_sync_nolock(wbuf); |
|
104 |
if (err) |
|
105 |
return err; |
|
106 |
||
107 |
err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0); |
|
108 |
if (err) |
|
109 |
return err; |
|
110 |
||
111 |
c->gc_lnum = -1; |
|
112 |
err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0); |
|
113 |
return err; |
|
114 |
}
|
|
115 |
||
116 |
/**
|
|
117 |
* data_nodes_cmp - compare 2 data nodes.
|
|
118 |
* @priv: UBIFS file-system description object
|
|
119 |
* @a: first data node
|
|
120 |
* @a: second data node
|
|
121 |
*
|
|
122 |
* This function compares data nodes @a and @b. Returns %1 if @a has greater
|
|
123 |
* inode or block number, and %-1 otherwise.
|
|
124 |
*/
|
|
125 |
static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) |
|
126 |
{
|
|
127 |
ino_t inuma, inumb; |
|
128 |
struct ubifs_info *c = priv; |
|
129 |
struct ubifs_scan_node *sa, *sb; |
|
130 |
||
131 |
cond_resched(); |
|
132 |
if (a == b) |
|
133 |
return 0; |
|
134 |
||
135 |
sa = list_entry(a, struct ubifs_scan_node, list); |
|
136 |
sb = list_entry(b, struct ubifs_scan_node, list); |
|
137 |
||
138 |
ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY); |
|
139 |
ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY); |
|
140 |
ubifs_assert(sa->type == UBIFS_DATA_NODE); |
|
141 |
ubifs_assert(sb->type == UBIFS_DATA_NODE); |
|
142 |
||
143 |
inuma = key_inum(c, &sa->key); |
|
144 |
inumb = key_inum(c, &sb->key); |
|
145 |
||
146 |
if (inuma == inumb) { |
|
147 |
unsigned int blka = key_block(c, &sa->key); |
|
148 |
unsigned int blkb = key_block(c, &sb->key); |
|
149 |
||
150 |
if (blka <= blkb) |
|
151 |
return -1; |
|
152 |
} else if (inuma <= inumb) |
|
153 |
return -1; |
|
154 |
||
155 |
return 1; |
|
156 |
}
|
|
157 |
||
158 |
/*
|
|
159 |
* nondata_nodes_cmp - compare 2 non-data nodes.
|
|
160 |
* @priv: UBIFS file-system description object
|
|
161 |
* @a: first node
|
|
162 |
* @a: second node
|
|
163 |
*
|
|
164 |
* This function compares nodes @a and @b. It makes sure that inode nodes go
|
|
165 |
* first and sorted by length in descending order. Directory entry nodes go
|
|
166 |
* after inode nodes and are sorted in ascending hash valuer order.
|
|
167 |
*/
|
|
168 |
static int nondata_nodes_cmp(void *priv, struct list_head *a, |
|
169 |
struct list_head *b) |
|
170 |
{
|
|
171 |
ino_t inuma, inumb; |
|
172 |
struct ubifs_info *c = priv; |
|
173 |
struct ubifs_scan_node *sa, *sb; |
|
174 |
||
175 |
cond_resched(); |
|
176 |
if (a == b) |
|
177 |
return 0; |
|
178 |
||
179 |
sa = list_entry(a, struct ubifs_scan_node, list); |
|
180 |
sb = list_entry(b, struct ubifs_scan_node, list); |
|
181 |
||
182 |
ubifs_assert(key_type(c, &sa->key) != UBIFS_DATA_KEY && |
|
183 |
key_type(c, &sb->key) != UBIFS_DATA_KEY); |
|
184 |
ubifs_assert(sa->type != UBIFS_DATA_NODE && |
|
185 |
sb->type != UBIFS_DATA_NODE); |
|
186 |
||
187 |
/* Inodes go before directory entries */
|
|
188 |
if (sa->type == UBIFS_INO_NODE) { |
|
189 |
if (sb->type == UBIFS_INO_NODE) |
|
190 |
return sb->len - sa->len; |
|
191 |
return -1; |
|
192 |
}
|
|
193 |
if (sb->type == UBIFS_INO_NODE) |
|
194 |
return 1; |
|
195 |
||
196 |
ubifs_assert(key_type(c, &sa->key) == UBIFS_DENT_KEY || |
|
197 |
key_type(c, &sa->key) == UBIFS_XENT_KEY); |
|
198 |
ubifs_assert(key_type(c, &sb->key) == UBIFS_DENT_KEY || |
|
199 |
key_type(c, &sb->key) == UBIFS_XENT_KEY); |
|
200 |
ubifs_assert(sa->type == UBIFS_DENT_NODE || |
|
201 |
sa->type == UBIFS_XENT_NODE); |
|
202 |
ubifs_assert(sb->type == UBIFS_DENT_NODE || |
|
203 |
sb->type == UBIFS_XENT_NODE); |
|
204 |
||
205 |
inuma = key_inum(c, &sa->key); |
|
206 |
inumb = key_inum(c, &sb->key); |
|
207 |
||
208 |
if (inuma == inumb) { |
|
209 |
uint32_t hasha = key_hash(c, &sa->key); |
|
210 |
uint32_t hashb = key_hash(c, &sb->key); |
|
211 |
||
212 |
if (hasha <= hashb) |
|
213 |
return -1; |
|
214 |
} else if (inuma <= inumb) |
|
215 |
return -1; |
|
216 |
||
217 |
return 1; |
|
218 |
}
|
|
219 |
||
220 |
/**
|
|
221 |
* sort_nodes - sort nodes for GC.
|
|
222 |
* @c: UBIFS file-system description object
|
|
223 |
* @sleb: describes nodes to sort and contains the result on exit
|
|
224 |
* @nondata: contains non-data nodes on exit
|
|
225 |
* @min: minimum node size is returned here
|
|
226 |
*
|
|
227 |
* This function sorts the list of inodes to garbage collect. First of all, it
|
|
228 |
* kills obsolete nodes and separates data and non-data nodes to the
|
|
229 |
* @sleb->nodes and @nondata lists correspondingly.
|
|
230 |
*
|
|
231 |
* Data nodes are then sorted in block number order - this is important for
|
|
232 |
* bulk-read; data nodes with lower inode number go before data nodes with
|
|
233 |
* higher inode number, and data nodes with lower block number go before data
|
|
234 |
* nodes with higher block number;
|
|
235 |
*
|
|
236 |
* Non-data nodes are sorted as follows.
|
|
237 |
* o First go inode nodes - they are sorted in descending length order.
|
|
238 |
* o Then go directory entry nodes - they are sorted in hash order, which
|
|
239 |
* should supposedly optimize 'readdir()'. Direntry nodes with lower parent
|
|
240 |
* inode number go before direntry nodes with higher parent inode number,
|
|
241 |
* and direntry nodes with lower name hash values go before direntry nodes
|
|
242 |
* with higher name hash values.
|
|
243 |
*
|
|
244 |
* This function returns zero in case of success and a negative error code in
|
|
245 |
* case of failure.
|
|
246 |
*/
|
|
247 |
static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb, |
|
248 |
struct list_head *nondata, int *min) |
|
249 |
{
|
|
250 |
int err; |
|
251 |
struct ubifs_scan_node *snod, *tmp; |
|
252 |
||
253 |
*min = INT_MAX; |
|
254 |
||
255 |
/* Separate data nodes and non-data nodes */
|
|
256 |
list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { |
|
257 |
ubifs_assert(snod->type == UBIFS_INO_NODE || |
|
258 |
snod->type == UBIFS_DATA_NODE || |
|
259 |
snod->type == UBIFS_DENT_NODE || |
|
260 |
snod->type == UBIFS_XENT_NODE || |
|
261 |
snod->type == UBIFS_TRUN_NODE); |
|
262 |
||
263 |
if (snod->type != UBIFS_INO_NODE && |
|
264 |
snod->type != UBIFS_DATA_NODE && |
|
265 |
snod->type != UBIFS_DENT_NODE && |
|
266 |
snod->type != UBIFS_XENT_NODE) { |
|
267 |
/* Probably truncation node, zap it */
|
|
268 |
list_del(&snod->list); |
|
269 |
kfree(snod); |
|
270 |
continue; |
|
271 |
}
|
|
272 |
||
273 |
ubifs_assert(key_type(c, &snod->key) == UBIFS_DATA_KEY || |
|
274 |
key_type(c, &snod->key) == UBIFS_INO_KEY || |
|
275 |
key_type(c, &snod->key) == UBIFS_DENT_KEY || |
|
276 |
key_type(c, &snod->key) == UBIFS_XENT_KEY); |
|
277 |
||
278 |
err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, |
|
279 |
snod->offs, 0); |
|
280 |
if (err < 0) |
|
281 |
return err; |
|
282 |
||
283 |
if (!err) { |
|
284 |
/* The node is obsolete, remove it from the list */
|
|
285 |
list_del(&snod->list); |
|
286 |
kfree(snod); |
|
287 |
continue; |
|
288 |
}
|
|
289 |
||
290 |
if (snod->len < *min) |
|
291 |
*min = snod->len; |
|
292 |
||
293 |
if (key_type(c, &snod->key) != UBIFS_DATA_KEY) |
|
294 |
list_move_tail(&snod->list, nondata); |
|
295 |
}
|
|
296 |
||
297 |
/* Sort data and non-data nodes */
|
|
298 |
list_sort(c, &sleb->nodes, &data_nodes_cmp); |
|
299 |
list_sort(c, nondata, &nondata_nodes_cmp); |
|
300 |
||
301 |
err = dbg_check_data_nodes_order(c, &sleb->nodes); |
|
302 |
if (err) |
|
303 |
return err; |
|
304 |
err = dbg_check_nondata_nodes_order(c, nondata); |
|
305 |
if (err) |
|
306 |
return err; |
|
307 |
return 0; |
|
308 |
}
|
|
309 |
||
310 |
/**
|
|
311 |
* move_node - move a node.
|
|
312 |
* @c: UBIFS file-system description object
|
|
313 |
* @sleb: describes the LEB to move nodes from
|
|
314 |
* @snod: the mode to move
|
|
315 |
* @wbuf: write-buffer to move node to
|
|
316 |
*
|
|
317 |
* This function moves node @snod to @wbuf, changes TNC correspondingly, and
|
|
318 |
* destroys @snod. Returns zero in case of success and a negative error code in
|
|
319 |
* case of failure.
|
|
320 |
*/
|
|
321 |
static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb, |
|
322 |
struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf) |
|
323 |
{
|
|
324 |
int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used; |
|
325 |
||
326 |
cond_resched(); |
|
327 |
err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len); |
|
328 |
if (err) |
|
329 |
return err; |
|
330 |
||
331 |
err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, |
|
332 |
snod->offs, new_lnum, new_offs, |
|
333 |
snod->len); |
|
334 |
list_del(&snod->list); |
|
335 |
kfree(snod); |
|
336 |
return err; |
|
337 |
}
|
|
338 |
||
339 |
/**
|
|
340 |
* move_nodes - move nodes.
|
|
341 |
* @c: UBIFS file-system description object
|
|
342 |
* @sleb: describes the LEB to move nodes from
|
|
343 |
*
|
|
344 |
* This function moves valid nodes from data LEB described by @sleb to the GC
|
|
345 |
* journal head. This function returns zero in case of success, %-EAGAIN if
|
|
346 |
* commit is required, and other negative error codes in case of other
|
|
347 |
* failures.
|
|
348 |
*/
|
|
349 |
static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) |
|
350 |
{
|
|
351 |
int err, min; |
|
352 |
LIST_HEAD(nondata); |
|
353 |
struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; |
|
354 |
||
355 |
if (wbuf->lnum == -1) { |
|
356 |
/*
|
|
357 |
* The GC journal head is not set, because it is the first GC
|
|
358 |
* invocation since mount.
|
|
359 |
*/
|
|
360 |
err = switch_gc_head(c); |
|
361 |
if (err) |
|
362 |
return err; |
|
363 |
}
|
|
364 |
||
365 |
err = sort_nodes(c, sleb, &nondata, &min); |
|
366 |
if (err) |
|
367 |
goto out; |
|
368 |
||
369 |
/* Write nodes to their new location. Use the first-fit strategy */
|
|
370 |
while (1) { |
|
371 |
int avail; |
|
372 |
struct ubifs_scan_node *snod, *tmp; |
|
373 |
||
374 |
/* Move data nodes */
|
|
375 |
list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { |
|
376 |
avail = c->leb_size - wbuf->offs - wbuf->used; |
|
377 |
if (snod->len > avail) |
|
378 |
/*
|
|
379 |
* Do not skip data nodes in order to optimize
|
|
380 |
* bulk-read.
|
|
381 |
*/
|
|
382 |
break; |
|
383 |
||
384 |
err = move_node(c, sleb, snod, wbuf); |
|
385 |
if (err) |
|
386 |
goto out; |
|
387 |
}
|
|
388 |
||
389 |
/* Move non-data nodes */
|
|
390 |
list_for_each_entry_safe(snod, tmp, &nondata, list) { |
|
391 |
avail = c->leb_size - wbuf->offs - wbuf->used; |
|
392 |
if (avail < min) |
|
393 |
break; |
|
394 |
||
395 |
if (snod->len > avail) { |
|
396 |
/*
|
|
397 |
* Keep going only if this is an inode with
|
|
398 |
* some data. Otherwise stop and switch the GC
|
|
399 |
* head. IOW, we assume that data-less inode
|
|
400 |
* nodes and direntry nodes are roughly of the
|
|
401 |
* same size.
|
|
402 |
*/
|
|
403 |
if (key_type(c, &snod->key) == UBIFS_DENT_KEY || |
|
404 |
snod->len == UBIFS_INO_NODE_SZ) |
|
405 |
break; |
|
406 |
continue; |
|
407 |
}
|
|
408 |
||
409 |
err = move_node(c, sleb, snod, wbuf); |
|
410 |
if (err) |
|
411 |
goto out; |
|
412 |
}
|
|
413 |
||
414 |
if (list_empty(&sleb->nodes) && list_empty(&nondata)) |
|
415 |
break; |
|
416 |
||
417 |
/*
|
|
418 |
* Waste the rest of the space in the LEB and switch to the
|
|
419 |
* next LEB.
|
|
420 |
*/
|
|
421 |
err = switch_gc_head(c); |
|
422 |
if (err) |
|
423 |
goto out; |
|
424 |
}
|
|
425 |
||
426 |
return 0; |
|
427 |
||
428 |
out: |
|
429 |
list_splice_tail(&nondata, &sleb->nodes); |
|
430 |
return err; |
|
431 |
}
|
|
432 |
||
433 |
/**
|
|
434 |
* gc_sync_wbufs - sync write-buffers for GC.
|
|
435 |
* @c: UBIFS file-system description object
|
|
436 |
*
|
|
437 |
* We must guarantee that obsoleting nodes are on flash. Unfortunately they may
|
|
438 |
* be in a write-buffer instead. That is, a node could be written to a
|
|
439 |
* write-buffer, obsoleting another node in a LEB that is GC'd. If that LEB is
|
|
440 |
* erased before the write-buffer is sync'd and then there is an unclean
|
|
441 |
* unmount, then an existing node is lost. To avoid this, we sync all
|
|
442 |
* write-buffers.
|
|
443 |
*
|
|
444 |
* This function returns %0 on success or a negative error code on failure.
|
|
445 |
*/
|
|
446 |
static int gc_sync_wbufs(struct ubifs_info *c) |
|
447 |
{
|
|
448 |
int err, i; |
|
449 |
||
450 |
for (i = 0; i < c->jhead_cnt; i++) { |
|
451 |
if (i == GCHD) |
|
452 |
continue; |
|
453 |
err = ubifs_wbuf_sync(&c->jheads[i].wbuf); |
|
454 |
if (err) |
|
455 |
return err; |
|
456 |
}
|
|
457 |
return 0; |
|
458 |
}
|
|
459 |
||
460 |
/**
|
|
461 |
* ubifs_garbage_collect_leb - garbage-collect a logical eraseblock.
|
|
462 |
* @c: UBIFS file-system description object
|
|
463 |
* @lp: describes the LEB to garbage collect
|
|
464 |
*
|
|
465 |
* This function garbage-collects an LEB and returns one of the @LEB_FREED,
|
|
466 |
* @LEB_RETAINED, etc positive codes in case of success, %-EAGAIN if commit is
|
|
467 |
* required, and other negative error codes in case of failures.
|
|
468 |
*/
|
|
469 |
int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) |
|
470 |
{
|
|
471 |
struct ubifs_scan_leb *sleb; |
|
472 |
struct ubifs_scan_node *snod; |
|
473 |
struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; |
|
474 |
int err = 0, lnum = lp->lnum; |
|
475 |
||
476 |
ubifs_assert(c->gc_lnum != -1 || wbuf->offs + wbuf->used == 0 || |
|
477 |
c->need_recovery); |
|
478 |
ubifs_assert(c->gc_lnum != lnum); |
|
479 |
ubifs_assert(wbuf->lnum != lnum); |
|
480 |
||
481 |
if (lp->free + lp->dirty == c->leb_size) { |
|
482 |
/* Special case - a free LEB */
|
|
483 |
dbg_gc("LEB %d is free, return it", lp->lnum); |
|
484 |
ubifs_assert(!(lp->flags & LPROPS_INDEX)); |
|
485 |
||
486 |
if (lp->free != c->leb_size) { |
|
487 |
/*
|
|
488 |
* Write buffers must be sync'd before unmapping
|
|
489 |
* freeable LEBs, because one of them may contain data
|
|
490 |
* which obsoletes something in 'lp->pnum'.
|
|
491 |
*/
|
|
492 |
err = gc_sync_wbufs(c); |
|
493 |
if (err) |
|
494 |
return err; |
|
495 |
err = ubifs_change_one_lp(c, lp->lnum, c->leb_size, |
|
496 |
0, 0, 0, 0); |
|
497 |
if (err) |
|
498 |
return err; |
|
499 |
}
|
|
500 |
err = ubifs_leb_unmap(c, lp->lnum); |
|
501 |
if (err) |
|
502 |
return err; |
|
503 |
||
504 |
if (c->gc_lnum == -1) { |
|
505 |
c->gc_lnum = lnum; |
|
506 |
return LEB_RETAINED; |
|
507 |
}
|
|
508 |
||
509 |
return LEB_FREED; |
|
510 |
}
|
|
511 |
||
512 |
/*
|
|
513 |
* We scan the entire LEB even though we only really need to scan up to
|
|
514 |
* (c->leb_size - lp->free).
|
|
515 |
*/
|
|
516 |
sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0); |
|
517 |
if (IS_ERR(sleb)) |
|
518 |
return PTR_ERR(sleb); |
|
519 |
||
520 |
ubifs_assert(!list_empty(&sleb->nodes)); |
|
521 |
snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); |
|
522 |
||
523 |
if (snod->type == UBIFS_IDX_NODE) { |
|
524 |
struct ubifs_gced_idx_leb *idx_gc; |
|
525 |
||
526 |
dbg_gc("indexing LEB %d (free %d, dirty %d)", |
|
527 |
lnum, lp->free, lp->dirty); |
|
528 |
list_for_each_entry(snod, &sleb->nodes, list) { |
|
529 |
struct ubifs_idx_node *idx = snod->node; |
|
530 |
int level = le16_to_cpu(idx->level); |
|
531 |
||
532 |
ubifs_assert(snod->type == UBIFS_IDX_NODE); |
|
533 |
key_read(c, ubifs_idx_key(c, idx), &snod->key); |
|
534 |
err = ubifs_dirty_idx_node(c, &snod->key, level, lnum, |
|
535 |
snod->offs); |
|
536 |
if (err) |
|
537 |
goto out; |
|
538 |
}
|
|
539 |
||
540 |
idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS); |
|
541 |
if (!idx_gc) { |
|
542 |
err = -ENOMEM; |
|
543 |
goto out; |
|
544 |
}
|
|
545 |
||
546 |
idx_gc->lnum = lnum; |
|
547 |
idx_gc->unmap = 0; |
|
548 |
list_add(&idx_gc->list, &c->idx_gc); |
|
549 |
||
550 |
/*
|
|
551 |
* Don't release the LEB until after the next commit, because
|
|
552 |
* it may contain data which is needed for recovery. So
|
|
553 |
* although we freed this LEB, it will become usable only after
|
|
554 |
* the commit.
|
|
555 |
*/
|
|
556 |
err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, |
|
557 |
LPROPS_INDEX, 1); |
|
558 |
if (err) |
|
559 |
goto out; |
|
560 |
err = LEB_FREED_IDX; |
|
561 |
} else { |
|
562 |
dbg_gc("data LEB %d (free %d, dirty %d)", |
|
563 |
lnum, lp->free, lp->dirty); |
|
564 |
||
565 |
err = move_nodes(c, sleb); |
|
566 |
if (err) |
|
567 |
goto out_inc_seq; |
|
568 |
||
569 |
err = gc_sync_wbufs(c); |
|
570 |
if (err) |
|
571 |
goto out_inc_seq; |
|
572 |
||
573 |
err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0); |
|
574 |
if (err) |
|
575 |
goto out_inc_seq; |
|
576 |
||
577 |
/* Allow for races with TNC */
|
|
578 |
c->gced_lnum = lnum; |
|
579 |
smp_wmb(); |
|
580 |
c->gc_seq += 1; |
|
581 |
smp_wmb(); |
|
582 |
||
583 |
if (c->gc_lnum == -1) { |
|
584 |
c->gc_lnum = lnum; |
|
585 |
err = LEB_RETAINED; |
|
586 |
} else { |
|
587 |
err = ubifs_wbuf_sync_nolock(wbuf); |
|
588 |
if (err) |
|
589 |
goto out; |
|
590 |
||
591 |
err = ubifs_leb_unmap(c, lnum); |
|
592 |
if (err) |
|
593 |
goto out; |
|
594 |
||
595 |
err = LEB_FREED; |
|
596 |
}
|
|
597 |
}
|
|
598 |
||
599 |
out: |
|
600 |
ubifs_scan_destroy(sleb); |
|
601 |
return err; |
|
602 |
||
603 |
out_inc_seq: |
|
604 |
/* We may have moved at least some nodes so allow for races with TNC */
|
|
605 |
c->gced_lnum = lnum; |
|
606 |
smp_wmb(); |
|
607 |
c->gc_seq += 1; |
|
608 |
smp_wmb(); |
|
609 |
goto out; |
|
610 |
}
|
|
611 |
||
612 |
/**
|
|
613 |
* ubifs_garbage_collect - UBIFS garbage collector.
|
|
614 |
* @c: UBIFS file-system description object
|
|
615 |
* @anyway: do GC even if there are free LEBs
|
|
616 |
*
|
|
617 |
* This function does out-of-place garbage collection. The return codes are:
|
|
618 |
* o positive LEB number if the LEB has been freed and may be used;
|
|
619 |
* o %-EAGAIN if the caller has to run commit;
|
|
620 |
* o %-ENOSPC if GC failed to make any progress;
|
|
621 |
* o other negative error codes in case of other errors.
|
|
622 |
*
|
|
623 |
* Garbage collector writes data to the journal when GC'ing data LEBs, and just
|
|
624 |
* marking indexing nodes dirty when GC'ing indexing LEBs. Thus, at some point
|
|
625 |
* commit may be required. But commit cannot be run from inside GC, because the
|
|
626 |
* caller might be holding the commit lock, so %-EAGAIN is returned instead;
|
|
627 |
* And this error code means that the caller has to run commit, and re-run GC
|
|
628 |
* if there is still no free space.
|
|
629 |
*
|
|
630 |
* There are many reasons why this function may return %-EAGAIN:
|
|
631 |
* o the log is full and there is no space to write an LEB reference for
|
|
632 |
* @c->gc_lnum;
|
|
633 |
* o the journal is too large and exceeds size limitations;
|
|
634 |
* o GC moved indexing LEBs, but they can be used only after the commit;
|
|
635 |
* o the shrinker fails to find clean znodes to free and requests the commit;
|
|
636 |
* o etc.
|
|
637 |
*
|
|
638 |
* Note, if the file-system is close to be full, this function may return
|
|
639 |
* %-EAGAIN infinitely, so the caller has to limit amount of re-invocations of
|
|
640 |
* the function. E.g., this happens if the limits on the journal size are too
|
|
641 |
* tough and GC writes too much to the journal before an LEB is freed. This
|
|
642 |
* might also mean that the journal is too large, and the TNC becomes to big,
|
|
643 |
* so that the shrinker is constantly called, finds not clean znodes to free,
|
|
644 |
* and requests commit. Well, this may also happen if the journal is all right,
|
|
645 |
* but another kernel process consumes too much memory. Anyway, infinite
|
|
646 |
* %-EAGAIN may happen, but in some extreme/misconfiguration cases.
|
|
647 |
*/
|
|
648 |
int ubifs_garbage_collect(struct ubifs_info *c, int anyway) |
|
649 |
{
|
|
650 |
int i, err, ret, min_space = c->dead_wm; |
|
651 |
struct ubifs_lprops lp; |
|
652 |
struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; |
|
653 |
||
654 |
ubifs_assert_cmt_locked(c); |
|
655 |
ubifs_assert(!c->ro_media && !c->ro_mount); |
|
656 |
||
657 |
if (ubifs_gc_should_commit(c)) |
|
658 |
return -EAGAIN; |
|
659 |
||
660 |
mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); |
|
661 |
||
662 |
if (c->ro_error) { |
|
663 |
ret = -EROFS; |
|
664 |
goto out_unlock; |
|
665 |
}
|
|
666 |
||
667 |
/* We expect the write-buffer to be empty on entry */
|
|
668 |
ubifs_assert(!wbuf->used); |
|
669 |
||
670 |
for (i = 0; ; i++) { |
|
671 |
int space_before, space_after; |
|
672 |
||
673 |
cond_resched(); |
|
674 |
||
675 |
/* Give the commit an opportunity to run */
|
|
676 |
if (ubifs_gc_should_commit(c)) { |
|
677 |
ret = -EAGAIN; |
|
678 |
break; |
|
679 |
}
|
|
680 |
||
681 |
if (i > SOFT_LEBS_LIMIT && !list_empty(&c->idx_gc)) { |
|
682 |
/*
|
|
683 |
* We've done enough iterations. Indexing LEBs were
|
|
684 |
* moved and will be available after the commit.
|
|
685 |
*/
|
|
686 |
dbg_gc("soft limit, some index LEBs GC'ed, -EAGAIN"); |
|
687 |
ubifs_commit_required(c); |
|
688 |
ret = -EAGAIN; |
|
689 |
break; |
|
690 |
}
|
|
691 |
||
692 |
if (i > HARD_LEBS_LIMIT) { |
|
693 |
/*
|
|
694 |
* We've moved too many LEBs and have not made
|
|
695 |
* progress, give up.
|
|
696 |
*/
|
|
697 |
dbg_gc("hard limit, -ENOSPC"); |
|
698 |
ret = -ENOSPC; |
|
699 |
break; |
|
700 |
}
|
|
701 |
||
702 |
/*
|
|
703 |
* Empty and freeable LEBs can turn up while we waited for
|
|
704 |
* the wbuf lock, or while we have been running GC. In that
|
|
705 |
* case, we should just return one of those instead of
|
|
706 |
* continuing to GC dirty LEBs. Hence we request
|
|
707 |
* 'ubifs_find_dirty_leb()' to return an empty LEB if it can.
|
|
708 |
*/
|
|
709 |
ret = ubifs_find_dirty_leb(c, &lp, min_space, anyway ? 0 : 1); |
|
710 |
if (ret) { |
|
711 |
if (ret == -ENOSPC) |
|
712 |
dbg_gc("no more dirty LEBs"); |
|
713 |
break; |
|
714 |
}
|
|
715 |
||
716 |
dbg_gc("found LEB %d: free %d, dirty %d, sum %d (min. space %d)", |
|
717 |
lp.lnum, lp.free, lp.dirty, lp.free + lp.dirty, |
|
718 |
min_space); |
|
719 |
||
720 |
space_before = c->leb_size - wbuf->offs - wbuf->used; |
|
721 |
if (wbuf->lnum == -1) |
|
722 |
space_before = 0; |
|
723 |
||
724 |
ret = ubifs_garbage_collect_leb(c, &lp); |
|
725 |
if (ret < 0) { |
|
726 |
if (ret == -EAGAIN) { |
|
727 |
/*
|
|
728 |
* This is not error, so we have to return the
|
|
729 |
* LEB to lprops. But if 'ubifs_return_leb()'
|
|
730 |
* fails, its failure code is propagated to the
|
|
731 |
* caller instead of the original '-EAGAIN'.
|
|
732 |
*/
|
|
733 |
err = ubifs_return_leb(c, lp.lnum); |
|
734 |
if (err) |
|
735 |
ret = err; |
|
736 |
break; |
|
737 |
}
|
|
738 |
goto out; |
|
739 |
}
|
|
740 |
||
741 |
if (ret == LEB_FREED) { |
|
742 |
/* An LEB has been freed and is ready for use */
|
|
743 |
dbg_gc("LEB %d freed, return", lp.lnum); |
|
744 |
ret = lp.lnum; |
|
745 |
break; |
|
746 |
}
|
|
747 |
||
748 |
if (ret == LEB_FREED_IDX) { |
|
749 |
/*
|
|
750 |
* This was an indexing LEB and it cannot be
|
|
751 |
* immediately used. And instead of requesting the
|
|
752 |
* commit straight away, we try to garbage collect some
|
|
753 |
* more.
|
|
754 |
*/
|
|
755 |
dbg_gc("indexing LEB %d freed, continue", lp.lnum); |
|
756 |
continue; |
|
757 |
}
|
|
758 |
||
759 |
ubifs_assert(ret == LEB_RETAINED); |
|
760 |
space_after = c->leb_size - wbuf->offs - wbuf->used; |
|
761 |
dbg_gc("LEB %d retained, freed %d bytes", lp.lnum, |
|
762 |
space_after - space_before); |
|
763 |
||
764 |
if (space_after > space_before) { |
|
765 |
/* GC makes progress, keep working */
|
|
766 |
min_space >>= 1; |
|
767 |
if (min_space < c->dead_wm) |
|
768 |
min_space = c->dead_wm; |
|
769 |
continue; |
|
770 |
}
|
|
771 |
||
772 |
dbg_gc("did not make progress"); |
|
773 |
||
774 |
/*
|
|
775 |
* GC moved an LEB bud have not done any progress. This means
|
|
776 |
* that the previous GC head LEB contained too few free space
|
|
777 |
* and the LEB which was GC'ed contained only large nodes which
|
|
778 |
* did not fit that space.
|
|
779 |
*
|
|
780 |
* We can do 2 things:
|
|
781 |
* 1. pick another LEB in a hope it'll contain a small node
|
|
782 |
* which will fit the space we have at the end of current GC
|
|
783 |
* head LEB, but there is no guarantee, so we try this out
|
|
784 |
* unless we have already been working for too long;
|
|
785 |
* 2. request an LEB with more dirty space, which will force
|
|
786 |
* 'ubifs_find_dirty_leb()' to start scanning the lprops
|
|
787 |
* table, instead of just picking one from the heap
|
|
788 |
* (previously it already picked the dirtiest LEB).
|
|
789 |
*/
|
|
790 |
if (i < SOFT_LEBS_LIMIT) { |
|
791 |
dbg_gc("try again"); |
|
792 |
continue; |
|
793 |
}
|
|
794 |
||
795 |
min_space <<= 1; |
|
796 |
if (min_space > c->dark_wm) |
|
797 |
min_space = c->dark_wm; |
|
798 |
dbg_gc("set min. space to %d", min_space); |
|
799 |
}
|
|
800 |
||
801 |
if (ret == -ENOSPC && !list_empty(&c->idx_gc)) { |
|
802 |
dbg_gc("no space, some index LEBs GC'ed, -EAGAIN"); |
|
803 |
ubifs_commit_required(c); |
|
804 |
ret = -EAGAIN; |
|
805 |
}
|
|
806 |
||
807 |
err = ubifs_wbuf_sync_nolock(wbuf); |
|
808 |
if (!err) |
|
809 |
err = ubifs_leb_unmap(c, c->gc_lnum); |
|
810 |
if (err) { |
|
811 |
ret = err; |
|
812 |
goto out; |
|
813 |
}
|
|
814 |
out_unlock: |
|
815 |
mutex_unlock(&wbuf->io_mutex); |
|
816 |
return ret; |
|
817 |
||
818 |
out: |
|
819 |
ubifs_assert(ret < 0); |
|
820 |
ubifs_assert(ret != -ENOSPC && ret != -EAGAIN); |
|
821 |
ubifs_wbuf_sync_nolock(wbuf); |
|
822 |
ubifs_ro_mode(c, ret); |
|
823 |
mutex_unlock(&wbuf->io_mutex); |
|
824 |
ubifs_return_leb(c, lp.lnum); |
|
825 |
return ret; |
|
826 |
}
|
|
827 |
||
828 |
/**
|
|
829 |
* ubifs_gc_start_commit - garbage collection at start of commit.
|
|
830 |
* @c: UBIFS file-system description object
|
|
831 |
*
|
|
832 |
* If a LEB has only dirty and free space, then we may safely unmap it and make
|
|
833 |
* it free. Note, we cannot do this with indexing LEBs because dirty space may
|
|
834 |
* correspond index nodes that are required for recovery. In that case, the
|
|
835 |
* LEB cannot be unmapped until after the next commit.
|
|
836 |
*
|
|
837 |
* This function returns %0 upon success and a negative error code upon failure.
|
|
838 |
*/
|
|
839 |
int ubifs_gc_start_commit(struct ubifs_info *c) |
|
840 |
{
|
|
841 |
struct ubifs_gced_idx_leb *idx_gc; |
|
842 |
const struct ubifs_lprops *lp; |
|
843 |
int err = 0, flags; |
|
844 |
||
845 |
ubifs_get_lprops(c); |
|
846 |
||
847 |
/*
|
|
848 |
* Unmap (non-index) freeable LEBs. Note that recovery requires that all
|
|
849 |
* wbufs are sync'd before this, which is done in 'do_commit()'.
|
|
850 |
*/
|
|
851 |
while (1) { |
|
852 |
lp = ubifs_fast_find_freeable(c); |
|
853 |
if (IS_ERR(lp)) { |
|
854 |
err = PTR_ERR(lp); |
|
855 |
goto out; |
|
856 |
}
|
|
857 |
if (!lp) |
|
858 |
break; |
|
859 |
ubifs_assert(!(lp->flags & LPROPS_TAKEN)); |
|
860 |
ubifs_assert(!(lp->flags & LPROPS_INDEX)); |
|
861 |
err = ubifs_leb_unmap(c, lp->lnum); |
|
862 |
if (err) |
|
863 |
goto out; |
|
864 |
lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0); |
|
865 |
if (IS_ERR(lp)) { |
|
866 |
err = PTR_ERR(lp); |
|
867 |
goto out; |
|
868 |
}
|
|
869 |
ubifs_assert(!(lp->flags & LPROPS_TAKEN)); |
|
870 |
ubifs_assert(!(lp->flags & LPROPS_INDEX)); |
|
871 |
}
|
|
872 |
||
873 |
/* Mark GC'd index LEBs OK to unmap after this commit finishes */
|
|
874 |
list_for_each_entry(idx_gc, &c->idx_gc, list) |
|
875 |
idx_gc->unmap = 1; |
|
876 |
||
877 |
/* Record index freeable LEBs for unmapping after commit */
|
|
878 |
while (1) { |
|
879 |
lp = ubifs_fast_find_frdi_idx(c); |
|
880 |
if (IS_ERR(lp)) { |
|
881 |
err = PTR_ERR(lp); |
|
882 |
goto out; |
|
883 |
}
|
|
884 |
if (!lp) |
|
885 |
break; |
|
886 |
idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS); |
|
887 |
if (!idx_gc) { |
|
888 |
err = -ENOMEM; |
|
889 |
goto out; |
|
890 |
}
|
|
891 |
ubifs_assert(!(lp->flags & LPROPS_TAKEN)); |
|
892 |
ubifs_assert(lp->flags & LPROPS_INDEX); |
|
893 |
/* Don't release the LEB until after the next commit */
|
|
894 |
flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX; |
|
895 |
lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1); |
|
896 |
if (IS_ERR(lp)) { |
|
897 |
err = PTR_ERR(lp); |
|
898 |
kfree(idx_gc); |
|
899 |
goto out; |
|
900 |
}
|
|
901 |
ubifs_assert(lp->flags & LPROPS_TAKEN); |
|
902 |
ubifs_assert(!(lp->flags & LPROPS_INDEX)); |
|
903 |
idx_gc->lnum = lp->lnum; |
|
904 |
idx_gc->unmap = 1; |
|
905 |
list_add(&idx_gc->list, &c->idx_gc); |
|
906 |
}
|
|
907 |
out: |
|
908 |
ubifs_release_lprops(c); |
|
909 |
return err; |
|
910 |
}
|
|
911 |
||
912 |
/**
|
|
913 |
* ubifs_gc_end_commit - garbage collection at end of commit.
|
|
914 |
* @c: UBIFS file-system description object
|
|
915 |
*
|
|
916 |
* This function completes out-of-place garbage collection of index LEBs.
|
|
917 |
*/
|
|
918 |
int ubifs_gc_end_commit(struct ubifs_info *c) |
|
919 |
{
|
|
920 |
struct ubifs_gced_idx_leb *idx_gc, *tmp; |
|
921 |
struct ubifs_wbuf *wbuf; |
|
922 |
int err = 0; |
|
923 |
||
924 |
wbuf = &c->jheads[GCHD].wbuf; |
|
925 |
mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); |
|
926 |
list_for_each_entry_safe(idx_gc, tmp, &c->idx_gc, list) |
|
927 |
if (idx_gc->unmap) { |
|
928 |
dbg_gc("LEB %d", idx_gc->lnum); |
|
929 |
err = ubifs_leb_unmap(c, idx_gc->lnum); |
|
930 |
if (err) |
|
931 |
goto out; |
|
932 |
err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC, |
|
933 |
LPROPS_NC, 0, LPROPS_TAKEN, -1); |
|
934 |
if (err) |
|
935 |
goto out; |
|
936 |
list_del(&idx_gc->list); |
|
937 |
kfree(idx_gc); |
|
938 |
}
|
|
939 |
out: |
|
940 |
mutex_unlock(&wbuf->io_mutex); |
|
941 |
return err; |
|
942 |
}
|
|
943 |
||
944 |
/**
|
|
945 |
* ubifs_destroy_idx_gc - destroy idx_gc list.
|
|
946 |
* @c: UBIFS file-system description object
|
|
947 |
*
|
|
948 |
* This function destroys the @c->idx_gc list. It is called when unmounting
|
|
949 |
* so locks are not needed. Returns zero in case of success and a negative
|
|
950 |
* error code in case of failure.
|
|
951 |
*/
|
|
952 |
void ubifs_destroy_idx_gc(struct ubifs_info *c) |
|
953 |
{
|
|
954 |
while (!list_empty(&c->idx_gc)) { |
|
955 |
struct ubifs_gced_idx_leb *idx_gc; |
|
956 |
||
957 |
idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, |
|
958 |
list); |
|
959 |
c->idx_gc_cnt -= 1; |
|
960 |
list_del(&idx_gc->list); |
|
961 |
kfree(idx_gc); |
|
962 |
}
|
|
963 |
}
|
|
964 |
||
965 |
/**
|
|
966 |
* ubifs_get_idx_gc_leb - get a LEB from GC'd index LEB list.
|
|
967 |
* @c: UBIFS file-system description object
|
|
968 |
*
|
|
969 |
* Called during start commit so locks are not needed.
|
|
970 |
*/
|
|
971 |
int ubifs_get_idx_gc_leb(struct ubifs_info *c) |
|
972 |
{
|
|
973 |
struct ubifs_gced_idx_leb *idx_gc; |
|
974 |
int lnum; |
|
975 |
||
976 |
if (list_empty(&c->idx_gc)) |
|
977 |
return -ENOSPC; |
|
978 |
idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, list); |
|
979 |
lnum = idx_gc->lnum; |
|
980 |
/* c->idx_gc_cnt is updated by the caller when lprops are updated */
|
|
981 |
list_del(&idx_gc->list); |
|
982 |
kfree(idx_gc); |
|
983 |
return lnum; |
|
984 |
}
|