/*
 * Swap block device support for MTDs
 * Turns an MTD device into a swap device with block wear leveling
 *
 * Copyright © 2007,2011 Nokia Corporation. All rights reserved.
 *
 * Authors: Jarkko Lavinen <jarkko.lavinen@nokia.com>
 *
 * Based on Richard Purdie's earlier implementation in 2007. Background
 * support and lock-less operation written by Adrian Hunter.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mtd/mtd.h>
#include <linux/mtd/blktrans.h>
#include <linux/rbtree.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/genhd.h>
#include <linux/swap.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/device.h>
#include <linux/math64.h>
#define MTDSWAP_PREFIX "mtdswap"

/*
 * The number of free eraseblocks at which GC should stop.
 */
#define CLEAN_BLOCK_THRESHOLD	20

/*
 * Number of free eraseblocks below which GC can also collect low-frag
 * blocks.
 */
#define LOW_FRAG_GC_THRESHOLD	5
/*
 * Wear level cost amortization. We want to do wear leveling in the
 * background without disturbing GC too much. This is done by defining
 * a maximum GC frequency: a frequency value of 6 means 1/6 of the GC
 * passes will pick an erase block based on the biggest wear difference
 * rather than the biggest dirtiness.
 *
 * The lower freq2 should be chosen so that it makes sure the maximum
 * erase difference will decrease even if a malicious application is
 * deliberately trying to make erase differences large.
 */
#define MAX_ERASE_DIFF		4000
#define COLLECT_NONDIRTY_BASE	MAX_ERASE_DIFF
#define COLLECT_NONDIRTY_FREQ1	6
#define COLLECT_NONDIRTY_FREQ2	4
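/*
 * Worked example (illustrative, using the values above): with
 * COLLECT_NONDIRTY_FREQ1 == 6, at most one GC pass in six is spent on
 * wear leveling while erase differences stay small; once the maximum
 * difference grows past MAX_ERASE_DIFF, the frequency ramps toward
 * COLLECT_NONDIRTY_FREQ2 == 4, i.e. one pass in four.
 */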
#define PAGE_UNDEF		UINT_MAX
#define BLOCK_UNDEF		UINT_MAX
#define BLOCK_ERROR		(UINT_MAX - 1)
#define BLOCK_MAX		(UINT_MAX - 2)

#define EBLOCK_BAD		(1 << 0)
#define EBLOCK_NOMAGIC		(1 << 1)
#define EBLOCK_BITFLIP		(1 << 2)
#define EBLOCK_FAILED		(1 << 3)
#define EBLOCK_READERR		(1 << 4)
#define EBLOCK_IDX_SHIFT	5
struct swap_eb {
        struct rb_node rb;
        struct rb_root *root;

        unsigned int flags;
        unsigned int active_count;
        unsigned int erase_count;
        unsigned int pad;		/* speeds up pointer decrement */
};

#define MTDSWAP_ECNT_MIN(rbroot) (rb_entry(rb_first(rbroot), struct swap_eb, \
                                rb)->erase_count)
#define MTDSWAP_ECNT_MAX(rbroot) (rb_entry(rb_last(rbroot), struct swap_eb, \
                                rb)->erase_count)
struct mtdswap_tree {
        struct rb_root root;
        unsigned int count;
};

enum {
        MTDSWAP_CLEAN,
        MTDSWAP_USED,
        MTDSWAP_LOWFRAG,
        MTDSWAP_HIFRAG,
        MTDSWAP_DIRTY,
        MTDSWAP_BITFLIP,
        MTDSWAP_FAILING,
        MTDSWAP_TREE_CNT,
};

struct mtdswap_dev {
        struct mtd_blktrans_dev *mbd_dev;
        struct mtd_info *mtd;
        struct device *dev;

        unsigned int *page_data;
        unsigned int *revmap;

        unsigned int eblks;
        unsigned int spare_eblks;
        unsigned int pages_per_eblk;
        unsigned int max_erase_count;
        struct swap_eb *eb_data;

        struct mtdswap_tree trees[MTDSWAP_TREE_CNT];

        unsigned long long sect_read_count;
        unsigned long long sect_write_count;
        unsigned long long mtd_write_count;
        unsigned long long mtd_read_count;
        unsigned long long discard_count;
        unsigned long long discard_page_count;

        unsigned int curr_write_pos;
        struct swap_eb *curr_write;

        char *page_buf;
        char *oob_buf;

        struct dentry *debugfs_root;
};
struct mtdswap_oobdata {
        __le16 magic;
        __le32 count;
} __attribute__((packed));

#define MTDSWAP_MAGIC_CLEAN	0x2095
#define MTDSWAP_MAGIC_DIRTY	(MTDSWAP_MAGIC_CLEAN + 1)
#define MTDSWAP_TYPE_CLEAN	0
#define MTDSWAP_TYPE_DIRTY	1
#define MTDSWAP_OOBSIZE		sizeof(struct mtdswap_oobdata)

#define MTDSWAP_ERASE_RETRIES	3	/* Before marking erase block bad */
#define MTDSWAP_IO_RETRIES	3
enum {
        MTDSWAP_SCANNED_CLEAN,
        MTDSWAP_SCANNED_DIRTY,
        MTDSWAP_SCANNED_BITFLIP,
        MTDSWAP_SCANNED_BAD,
};
/*
 * In the worst case mtdswap_writesect() has allocated the last clean
 * page from the current block and is then pre-empted by the GC
 * thread. The thread can consume a full erase block when moving a
 * block.
 */
#define MIN_SPARE_EBLOCKS	2
#define MIN_ERASE_BLOCKS	(MIN_SPARE_EBLOCKS + 1)
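/*
 * Illustration of the worst case above: one spare erase block covers
 * the block the GC thread may consume while moving data, a second
 * covers the pre-empted writer that already claimed the last clean
 * page, hence MIN_SPARE_EBLOCKS == 2 plus at least one block of
 * payload (MIN_ERASE_BLOCKS == 3).
 */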
#define TREE_ROOT(d, name) (&d->trees[MTDSWAP_ ## name].root)
#define TREE_EMPTY(d, name) (TREE_ROOT(d, name)->rb_node == NULL)
#define TREE_NONEMPTY(d, name) (!TREE_EMPTY(d, name))
#define TREE_COUNT(d, name) (d->trees[MTDSWAP_ ## name].count)
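/*
 * Illustrative use of the helpers above: taking the least worn clean
 * block is
 *
 *	eb = rb_entry(rb_first(TREE_ROOT(d, CLEAN)), struct swap_eb, rb);
 *
 * guarded by TREE_EMPTY(d, CLEAN); mtdswap_map_free_block() below
 * follows exactly this pattern.
 */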
#define MTDSWAP_MBD_TO_MTDSWAP(dev) ((struct mtdswap_dev *)dev->priv)

static char partitions[128] = "";
module_param_string(partitions, partitions, sizeof(partitions), 0444);
MODULE_PARM_DESC(partitions, "MTD partition numbers to use as swap "
                "partitions=\"1,3,5\"");

static unsigned int spare_eblocks = 10;
module_param(spare_eblocks, uint, 0444);
MODULE_PARM_DESC(spare_eblocks, "Percentage of spare erase blocks for "
                "garbage collection (default 10%)");

static bool header; /* false */
module_param(header, bool, 0444);
MODULE_PARM_DESC(header,
                "Include builtin swap header (default 0, without header)");
static int mtdswap_gc(struct mtdswap_dev *d, unsigned int background);

static loff_t mtdswap_eb_offset(struct mtdswap_dev *d, struct swap_eb *eb)
{
        return (loff_t)(eb - d->eb_data) * d->mtd->erasesize;
}
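/*
 * For illustration: with a 128 KiB erase size, the swap_eb at index 3
 * of d->eb_data maps to flash offset 3 * 0x20000 = 0x60000; the array
 * index doubles as the physical erase block number.
 */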
static void mtdswap_eb_detach(struct mtdswap_dev *d, struct swap_eb *eb)
{
        unsigned int oldidx;
        struct mtdswap_tree *tp;

        if (eb->root) {
                tp = container_of(eb->root, struct mtdswap_tree, root);
                oldidx = tp - &d->trees[0];

                d->trees[oldidx].count--;
                rb_erase(&eb->rb, eb->root);
        }
}
static void __mtdswap_rb_add(struct rb_root *root, struct swap_eb *eb)
{
        struct rb_node **p, *parent = NULL;
        struct swap_eb *cur;

        p = &root->rb_node;
        while (*p) {
                parent = *p;
                cur = rb_entry(parent, struct swap_eb, rb);
                if (eb->erase_count > cur->erase_count)
                        p = &(*p)->rb_right;
                else
                        p = &(*p)->rb_left;
        }

        rb_link_node(&eb->rb, parent, p);
        rb_insert_color(&eb->rb, root);
}
static void mtdswap_rb_add(struct mtdswap_dev *d, struct swap_eb *eb, int idx)
{
        struct rb_root *root;

        if (eb->root == &d->trees[idx].root)
                return;

        mtdswap_eb_detach(d, eb);
        root = &d->trees[idx].root;
        __mtdswap_rb_add(root, eb);
        eb->root = root;
        d->trees[idx].count++;
}
static struct rb_node *mtdswap_rb_index(struct rb_root *root, unsigned int idx)
{
        struct rb_node *p;
        unsigned int i;

        p = rb_first(root);
        i = 0;
        while (i < idx && p) {
                p = rb_next(p);
                i++;
        }

        return p;
}
static int mtdswap_handle_badblock(struct mtdswap_dev *d, struct swap_eb *eb)
{
        int ret;
        loff_t offset;

        d->spare_eblks--;
        eb->flags |= EBLOCK_BAD;
        mtdswap_eb_detach(d, eb);
        eb->root = NULL;

        /* badblocks not supported */
        if (!d->mtd->block_markbad)
                return 1;

        offset = mtdswap_eb_offset(d, eb);
        dev_warn(d->dev, "Marking bad block at %08llx\n", offset);
        ret = d->mtd->block_markbad(d->mtd, offset);

        if (ret) {
                dev_warn(d->dev, "Mark block bad failed for block at %08llx "
                        "error %d\n", offset, ret);
                return ret;
        }

        return 1;
}
static int mtdswap_handle_write_error(struct mtdswap_dev *d, struct swap_eb *eb)
{
        unsigned int marked = eb->flags & EBLOCK_FAILED;
        struct swap_eb *curr_write = d->curr_write;

        eb->flags |= EBLOCK_FAILED;
        if (curr_write == eb) {
                d->curr_write = NULL;

                if (!marked && d->curr_write_pos != 0) {
                        mtdswap_rb_add(d, eb, MTDSWAP_FAILING);
                        return 0;
                }
        }

        return mtdswap_handle_badblock(d, eb);
}
static int mtdswap_read_oob(struct mtdswap_dev *d, loff_t from,
                        struct mtd_oob_ops *ops)
{
        int ret = d->mtd->read_oob(d->mtd, from, ops);

        if (ret == -EUCLEAN)
                return ret;

        if (ret) {
                dev_warn(d->dev, "Read OOB failed %d for block at %08llx\n",
                        ret, from);
                return ret;
        }

        if (ops->oobretlen < ops->ooblen) {
                dev_warn(d->dev, "Read OOB return short read (%zd bytes not "
                        "%zd) for block at %08llx\n",
                        ops->oobretlen, ops->ooblen, from);
                return -EIO;
        }

        return 0;
}
static int mtdswap_read_markers(struct mtdswap_dev *d, struct swap_eb *eb)
{
        struct mtdswap_oobdata *data, *data2;
        int ret;
        loff_t offset;
        struct mtd_oob_ops ops;

        offset = mtdswap_eb_offset(d, eb);

        /* Check first if the block is bad. */
        if (d->mtd->block_isbad && d->mtd->block_isbad(d->mtd, offset))
                return MTDSWAP_SCANNED_BAD;

        ops.ooblen = 2 * d->mtd->ecclayout->oobavail;
        ops.oobbuf = d->oob_buf;
        ops.ooboffs = 0;
        ops.datbuf = NULL;
        ops.mode = MTD_OOB_AUTO;

        ret = mtdswap_read_oob(d, offset, &ops);

        if (ret && ret != -EUCLEAN)
                return ret;

        data = (struct mtdswap_oobdata *)d->oob_buf;
        data2 = (struct mtdswap_oobdata *)
                (d->oob_buf + d->mtd->ecclayout->oobavail);

        if (le16_to_cpu(data->magic) == MTDSWAP_MAGIC_CLEAN) {
                eb->erase_count = le32_to_cpu(data->count);
                if (ret == -EUCLEAN)
                        ret = MTDSWAP_SCANNED_BITFLIP;
                else {
                        if (le16_to_cpu(data2->magic) == MTDSWAP_MAGIC_DIRTY)
                                ret = MTDSWAP_SCANNED_DIRTY;
                        else
                                ret = MTDSWAP_SCANNED_CLEAN;
                }
        } else {
                eb->flags |= EBLOCK_NOMAGIC;
                ret = MTDSWAP_SCANNED_DIRTY;
        }

        return ret;
}
static int mtdswap_write_marker(struct mtdswap_dev *d, struct swap_eb *eb,
                        u16 marker)
{
        struct mtdswap_oobdata n;
        int ret;
        loff_t offset;
        struct mtd_oob_ops ops;

        ops.ooboffs = 0;
        ops.oobbuf = (uint8_t *)&n;
        ops.mode = MTD_OOB_AUTO;
        ops.datbuf = NULL;

        if (marker == MTDSWAP_TYPE_CLEAN) {
                n.magic = cpu_to_le16(MTDSWAP_MAGIC_CLEAN);
                n.count = cpu_to_le32(eb->erase_count);
                ops.ooblen = MTDSWAP_OOBSIZE;
                offset = mtdswap_eb_offset(d, eb);
        } else {
                n.magic = cpu_to_le16(MTDSWAP_MAGIC_DIRTY);
                ops.ooblen = sizeof(n.magic);
                offset = mtdswap_eb_offset(d, eb) + d->mtd->writesize;
        }

        ret = d->mtd->write_oob(d->mtd, offset, &ops);

        if (ret) {
                dev_warn(d->dev, "Write OOB failed for block at %08llx "
                        "error %d\n", offset, ret);
                if (ret == -EIO || ret == -EBADMSG)
                        mtdswap_handle_write_error(d, eb);
                return ret;
        }

        if (ops.oobretlen != ops.ooblen) {
                dev_warn(d->dev, "Short OOB write for block at %08llx: "
                        "%zd not %zd\n",
                        offset, ops.oobretlen, ops.ooblen);
                return -EIO;
        }

        return 0;
}
/*
 * Are there any erase blocks without MAGIC_CLEAN header, presumably
 * because power was cut off after erase but before header write? We
 * need to guesstimate the erase count.
 */
static void mtdswap_check_counts(struct mtdswap_dev *d)
{
        struct rb_root hist_root = RB_ROOT;
        struct rb_node *medrb;
        struct swap_eb *eb;
        unsigned int i, cnt, median;

        cnt = 0;
        for (i = 0; i < d->eblks; i++) {
                eb = d->eb_data + i;

                if (eb->flags & (EBLOCK_NOMAGIC | EBLOCK_BAD | EBLOCK_READERR))
                        continue;

                __mtdswap_rb_add(&hist_root, eb);
                cnt++;
        }

        if (cnt == 0)
                return;

        medrb = mtdswap_rb_index(&hist_root, cnt / 2);
        median = rb_entry(medrb, struct swap_eb, rb)->erase_count;

        d->max_erase_count = MTDSWAP_ECNT_MAX(&hist_root);

        for (i = 0; i < d->eblks; i++) {
                eb = d->eb_data + i;

                if (eb->flags & (EBLOCK_NOMAGIC | EBLOCK_READERR))
                        eb->erase_count = median;

                if (eb->flags & (EBLOCK_NOMAGIC | EBLOCK_BAD | EBLOCK_READERR))
                        continue;

                rb_erase(&eb->rb, &hist_root);
        }
}
static void mtdswap_scan_eblks(struct mtdswap_dev *d)
{
        int status;
        unsigned int i, idx;
        struct swap_eb *eb;

        for (i = 0; i < d->eblks; i++) {
                eb = d->eb_data + i;

                status = mtdswap_read_markers(d, eb);
                if (status < 0)
                        eb->flags |= EBLOCK_READERR;
                else if (status == MTDSWAP_SCANNED_BAD) {
                        eb->flags |= EBLOCK_BAD;
                        continue;
                }

                switch (status) {
                case MTDSWAP_SCANNED_CLEAN:
                        idx = MTDSWAP_CLEAN;
                        break;
                case MTDSWAP_SCANNED_DIRTY:
                case MTDSWAP_SCANNED_BITFLIP:
                        idx = MTDSWAP_DIRTY;
                        break;
                default:
                        idx = MTDSWAP_FAILING;
                }

                eb->flags |= (idx << EBLOCK_IDX_SHIFT);
        }

        mtdswap_check_counts(d);

        for (i = 0; i < d->eblks; i++) {
                eb = d->eb_data + i;

                if (eb->flags & EBLOCK_BAD)
                        continue;

                idx = eb->flags >> EBLOCK_IDX_SHIFT;
                mtdswap_rb_add(d, eb, idx);
        }
}
/*
 * Place the eblk into the tree corresponding to the number of active
 * pages it contains.
 */
static void mtdswap_store_eb(struct mtdswap_dev *d, struct swap_eb *eb)
{
        unsigned int weight = eb->active_count;
        unsigned int maxweight = d->pages_per_eblk;

        if (eb == d->curr_write)
                return;

        if (eb->flags & EBLOCK_BITFLIP)
                mtdswap_rb_add(d, eb, MTDSWAP_BITFLIP);
        else if (eb->flags & (EBLOCK_READERR | EBLOCK_FAILED))
                mtdswap_rb_add(d, eb, MTDSWAP_FAILING);
        else if (weight == maxweight)
                mtdswap_rb_add(d, eb, MTDSWAP_USED);
        else if (weight == 0)
                mtdswap_rb_add(d, eb, MTDSWAP_DIRTY);
        else if (weight > (maxweight/2))
                mtdswap_rb_add(d, eb, MTDSWAP_LOWFRAG);
        else
                mtdswap_rb_add(d, eb, MTDSWAP_HIFRAG);
}
static void mtdswap_erase_callback(struct erase_info *done)
{
        wait_queue_head_t *wait_q = (wait_queue_head_t *)done->priv;
        wake_up(wait_q);
}
static int mtdswap_erase_block(struct mtdswap_dev *d, struct swap_eb *eb)
{
        struct mtd_info *mtd = d->mtd;
        struct erase_info erase;
        wait_queue_head_t wq;
        unsigned int retries = 0;
        int ret;

        eb->erase_count++;
        if (eb->erase_count > d->max_erase_count)
                d->max_erase_count = eb->erase_count;

retry:
        init_waitqueue_head(&wq);
        memset(&erase, 0, sizeof(struct erase_info));

        erase.mtd = mtd;
        erase.callback = mtdswap_erase_callback;
        erase.addr = mtdswap_eb_offset(d, eb);
        erase.len = mtd->erasesize;
        erase.priv = (u_long)&wq;

        ret = mtd->erase(mtd, &erase);
        if (ret) {
                if (retries++ < MTDSWAP_ERASE_RETRIES) {
                        dev_warn(d->dev,
                                "erase of erase block %#llx on %s failed",
                                erase.addr, mtd->name);
                        yield();
                        goto retry;
                }

                dev_err(d->dev, "Cannot erase erase block %#llx on %s\n",
                        erase.addr, mtd->name);

                mtdswap_handle_badblock(d, eb);
                return -EIO;
        }

        ret = wait_event_interruptible(wq, erase.state == MTD_ERASE_DONE ||
                                           erase.state == MTD_ERASE_FAILED);
        if (ret) {
                dev_err(d->dev, "Interrupted erase block %#llx erasure on %s\n",
                        erase.addr, mtd->name);
                return -EINTR;
        }

        if (erase.state == MTD_ERASE_FAILED) {
                if (retries++ < MTDSWAP_ERASE_RETRIES) {
                        dev_warn(d->dev,
                                "erase of erase block %#llx on %s failed",
                                erase.addr, mtd->name);
                        yield();
                        goto retry;
                }

                mtdswap_handle_badblock(d, eb);
                return -EIO;
        }

        return 0;
}
static int mtdswap_map_free_block(struct mtdswap_dev *d, unsigned int page,
                                unsigned int *block)
{
        int ret;
        struct swap_eb *old_eb = d->curr_write;
        struct rb_root *clean_root;
        struct swap_eb *eb;

        if (old_eb == NULL || d->curr_write_pos >= d->pages_per_eblk) {
                do {
                        if (TREE_EMPTY(d, CLEAN))
                                return -ENOSPC;

                        clean_root = TREE_ROOT(d, CLEAN);
                        eb = rb_entry(rb_first(clean_root), struct swap_eb, rb);
                        rb_erase(&eb->rb, clean_root);
                        eb->root = NULL;
                        TREE_COUNT(d, CLEAN)--;

                        ret = mtdswap_write_marker(d, eb, MTDSWAP_TYPE_DIRTY);
                } while (ret == -EIO || ret == -EBADMSG);

                if (ret)
                        return ret;

                d->curr_write_pos = 0;
                d->curr_write = eb;
                if (old_eb)
                        mtdswap_store_eb(d, old_eb);
        }

        *block = (d->curr_write - d->eb_data) * d->pages_per_eblk +
                d->curr_write_pos;

        d->curr_write->active_count++;
        d->revmap[*block] = page;
        d->curr_write_pos++;

        return 0;
}
static unsigned int mtdswap_free_page_cnt(struct mtdswap_dev *d)
{
        return TREE_COUNT(d, CLEAN) * d->pages_per_eblk +
                d->pages_per_eblk - d->curr_write_pos;
}

static unsigned int mtdswap_enough_free_pages(struct mtdswap_dev *d)
{
        return mtdswap_free_page_cnt(d) > d->pages_per_eblk;
}
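/*
 * Illustrative numbers: with 4 KiB pages and 128 KiB erase blocks
 * (pages_per_eblk == 32), three clean blocks and curr_write_pos == 10
 * give 3 * 32 + 32 - 10 = 118 free pages. Writers loop into GC until
 * more than one erase block worth of pages is free, so GC always has
 * room to move data.
 */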
static int mtdswap_write_block(struct mtdswap_dev *d, char *buf,
                        unsigned int page, unsigned int *bp, int gc_context)
{
        struct mtd_info *mtd = d->mtd;
        struct swap_eb *eb;
        size_t retlen;
        loff_t writepos;
        int ret;

retry:
        if (!gc_context)
                while (!mtdswap_enough_free_pages(d))
                        if (mtdswap_gc(d, 0) > 0)
                                return -ENOSPC;

        ret = mtdswap_map_free_block(d, page, bp);
        eb = d->eb_data + (*bp / d->pages_per_eblk);

        if (ret == -EIO || ret == -EBADMSG) {
                d->curr_write = NULL;
                eb->active_count--;
                d->revmap[*bp] = PAGE_UNDEF;
                goto retry;
        }

        if (ret < 0)
                return ret;

        writepos = (loff_t)*bp << PAGE_SHIFT;
        ret = mtd->write(mtd, writepos, PAGE_SIZE, &retlen, buf);
        if (ret == -EIO || ret == -EBADMSG) {
                d->curr_write_pos--;
                eb->active_count--;
                d->revmap[*bp] = PAGE_UNDEF;
                mtdswap_handle_write_error(d, eb);
                goto retry;
        }

        if (ret < 0) {
                dev_err(d->dev, "Write to MTD device failed: %d (%zd written)",
                        ret, retlen);
                goto err;
        }

        if (retlen != PAGE_SIZE) {
                dev_err(d->dev, "Short write to MTD device: %zd written",
                        retlen);
                ret = -EIO;
                goto err;
        }

        return ret;

err:
        d->curr_write_pos--;
        eb->active_count--;
        d->revmap[*bp] = PAGE_UNDEF;

        return ret;
}
static int mtdswap_move_block(struct mtdswap_dev *d, unsigned int oldblock,
                        unsigned int *newblock)
{
        loff_t readpos;
        size_t retlen;
        struct mtd_info *mtd = d->mtd;
        struct swap_eb *eb, *oldeb;
        int ret;
        unsigned int page, retries;

        page = d->revmap[oldblock];
        readpos = (loff_t) oldblock << PAGE_SHIFT;
        retries = 0;

retry:
        ret = mtd->read(mtd, readpos, PAGE_SIZE, &retlen, d->page_buf);

        if (ret < 0 && ret != -EUCLEAN) {
                oldeb = d->eb_data + oldblock / d->pages_per_eblk;
                oldeb->flags |= EBLOCK_READERR;

                dev_err(d->dev, "Read Error: %d (block %u)\n", ret,
                        oldblock);
                retries++;
                if (retries < MTDSWAP_IO_RETRIES)
                        goto retry;

                goto read_error;
        }

        if (retlen != PAGE_SIZE) {
                dev_err(d->dev, "Short read: %zd (block %u)\n", retlen,
                        oldblock);
                ret = -EIO;
                goto read_error;
        }

        ret = mtdswap_write_block(d, d->page_buf, page, newblock, 1);
        if (ret < 0) {
                d->page_data[page] = BLOCK_ERROR;
                dev_err(d->dev, "Write error: %d\n", ret);
                return ret;
        }

        eb = d->eb_data + *newblock / d->pages_per_eblk;
        d->page_data[page] = *newblock;
        d->revmap[oldblock] = PAGE_UNDEF;
        eb = d->eb_data + oldblock / d->pages_per_eblk;
        eb->active_count--;

        return 0;

read_error:
        d->page_data[page] = BLOCK_ERROR;
        d->revmap[oldblock] = PAGE_UNDEF;
        return ret;
}
static int mtdswap_gc_eblock(struct mtdswap_dev *d, struct swap_eb *eb)
{
        unsigned int i, block, eblk_base, newblock;
        int ret, errcode;

        errcode = 0;
        eblk_base = (eb - d->eb_data) * d->pages_per_eblk;

        for (i = 0; i < d->pages_per_eblk; i++) {
                if (d->spare_eblks < MIN_SPARE_EBLOCKS)
                        return -ENOSPC;

                block = eblk_base + i;
                if (d->revmap[block] == PAGE_UNDEF)
                        continue;

                ret = mtdswap_move_block(d, block, &newblock);
                if (ret < 0 && !errcode)
                        errcode = ret;
        }

        return errcode;
}
static int __mtdswap_choose_gc_tree(struct mtdswap_dev *d)
{
        int idx, stopat;

        if (TREE_COUNT(d, CLEAN) < LOW_FRAG_GC_THRESHOLD)
                stopat = MTDSWAP_LOWFRAG;
        else
                stopat = MTDSWAP_HIFRAG;

        for (idx = MTDSWAP_BITFLIP; idx >= stopat; idx--)
                if (d->trees[idx].root.rb_node != NULL)
                        return idx;

        return -1;
}
static int mtdswap_wlfreq(unsigned int maxdiff)
{
        unsigned int h, x, y, dist, base;

        /*
         * Calculate a linear ramp down from f1 to f2 as maxdiff goes from
         * MAX_ERASE_DIFF to MAX_ERASE_DIFF + COLLECT_NONDIRTY_BASE. Similar
         * to a triangle with height f1 - f2 and width COLLECT_NONDIRTY_BASE.
         */
        dist = maxdiff - MAX_ERASE_DIFF;
        if (dist > COLLECT_NONDIRTY_BASE)
                dist = COLLECT_NONDIRTY_BASE;

        /*
         * Modelling the slope as a right-angled triangle with base
         * COLLECT_NONDIRTY_BASE and height freq1 - freq2. The ratio y/x is
         * equal to the ratio h/base.
         */
        h = COLLECT_NONDIRTY_FREQ1 - COLLECT_NONDIRTY_FREQ2;
        base = COLLECT_NONDIRTY_BASE;

        x = base - dist;
        y = (x * h + base / 2) / base;

        return COLLECT_NONDIRTY_FREQ2 + y;
}
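/*
 * Worked example (illustrative): maxdiff == 6000 gives dist == 2000,
 * x == base - dist == 2000 and y == (2000 * 2 + 2000) / 4000 == 1, so
 * the function returns COLLECT_NONDIRTY_FREQ2 + 1 == 5: halfway along
 * the ramp, half of the extra wear-leveling frequency remains.
 */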
static int mtdswap_choose_wl_tree(struct mtdswap_dev *d)
{
        static unsigned int pick_cnt;
        unsigned int i, idx = -1, wear, max;
        struct rb_root *root;

        max = 0;
        for (i = 0; i <= MTDSWAP_DIRTY; i++) {
                root = &d->trees[i].root;
                if (root->rb_node == NULL)
                        continue;

                wear = d->max_erase_count - MTDSWAP_ECNT_MIN(root);
                if (wear > max) {
                        max = wear;
                        idx = i;
                }
        }

        if (max > MAX_ERASE_DIFF && pick_cnt >= mtdswap_wlfreq(max) - 1) {
                pick_cnt = 0;
                return idx;
        }

        pick_cnt++;
        return -1;
}
static int mtdswap_choose_gc_tree(struct mtdswap_dev *d,
                                unsigned int background)
{
        int idx;

        if (TREE_NONEMPTY(d, FAILING) &&
                (background || (TREE_EMPTY(d, CLEAN) && TREE_EMPTY(d, DIRTY))))
                return MTDSWAP_FAILING;

        idx = mtdswap_choose_wl_tree(d);
        if (idx >= MTDSWAP_CLEAN)
                return idx;

        return __mtdswap_choose_gc_tree(d);
}
static struct swap_eb *mtdswap_pick_gc_eblk(struct mtdswap_dev *d,
                                        unsigned int background)
{
        struct rb_root *rp = NULL;
        struct swap_eb *eb = NULL;
        int idx;

        if (background && TREE_COUNT(d, CLEAN) > CLEAN_BLOCK_THRESHOLD &&
                TREE_EMPTY(d, DIRTY) && TREE_EMPTY(d, FAILING))
                return NULL;

        idx = mtdswap_choose_gc_tree(d, background);
        if (idx < 0)
                return NULL;

        rp = &d->trees[idx].root;
        eb = rb_entry(rb_first(rp), struct swap_eb, rb);

        rb_erase(&eb->rb, rp);
        eb->root = NULL;
        d->trees[idx].count--;
        return eb;
}
static unsigned int mtdswap_test_patt(unsigned int i)
{
        return i % 2 ? 0x55555555 : 0xAAAAAAAA;
}
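/* Illustration: even i gives 0xAAAAAAAA, odd i gives 0x55555555. */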
static unsigned int mtdswap_eblk_passes(struct mtdswap_dev *d,
                                        struct swap_eb *eb)
{
        struct mtd_info *mtd = d->mtd;
        unsigned int test, i, j, patt, mtd_pages;
        loff_t base, pos;
        unsigned int *p1 = (unsigned int *)d->page_buf;
        unsigned char *p2 = (unsigned char *)d->oob_buf;
        struct mtd_oob_ops ops;
        int ret;

        ops.mode = MTD_OOB_AUTO;
        ops.len = mtd->writesize;
        ops.ooblen = mtd->ecclayout->oobavail;
        ops.ooboffs = 0;
        ops.datbuf = d->page_buf;
        ops.oobbuf = d->oob_buf;
        base = mtdswap_eb_offset(d, eb);
        mtd_pages = d->pages_per_eblk * PAGE_SIZE / mtd->writesize;

        for (test = 0; test < 2; test++) {
                pos = base;
                for (i = 0; i < mtd_pages; i++) {
                        patt = mtdswap_test_patt(test + i);
                        memset(d->page_buf, patt, mtd->writesize);
                        memset(d->oob_buf, patt, mtd->ecclayout->oobavail);
                        ret = mtd->write_oob(mtd, pos, &ops);
                        if (ret)
                                goto error;

                        pos += mtd->writesize;
                }

                pos = base;
                for (i = 0; i < mtd_pages; i++) {
                        ret = mtd->read_oob(mtd, pos, &ops);
                        if (ret)
                                goto error;

                        patt = mtdswap_test_patt(test + i);
                        for (j = 0; j < mtd->writesize/sizeof(int); j++)
                                if (p1[j] != patt)
                                        goto error;

                        for (j = 0; j < mtd->ecclayout->oobavail; j++)
                                if (p2[j] != (unsigned char)patt)
                                        goto error;

                        pos += mtd->writesize;
                }

                ret = mtdswap_erase_block(d, eb);
                if (ret)
                        goto error;
        }

        eb->flags &= ~EBLOCK_READERR;
        return 1;

error:
        mtdswap_handle_badblock(d, eb);
        return 0;
}
static int mtdswap_gc(struct mtdswap_dev *d, unsigned int background)
{
        struct swap_eb *eb;
        int ret;

        if (d->spare_eblks < MIN_SPARE_EBLOCKS)
                return 1;

        eb = mtdswap_pick_gc_eblk(d, background);
        if (!eb)
                return 1;

        ret = mtdswap_gc_eblock(d, eb);
        if (ret == -ENOSPC)
                return 1;

        if (eb->flags & EBLOCK_FAILED) {
                mtdswap_handle_badblock(d, eb);
                return 0;
        }

        eb->flags &= ~EBLOCK_BITFLIP;
        ret = mtdswap_erase_block(d, eb);
        if ((eb->flags & EBLOCK_READERR) &&
                (ret || !mtdswap_eblk_passes(d, eb)))
                return 0;

        if (ret == 0)
                ret = mtdswap_write_marker(d, eb, MTDSWAP_TYPE_CLEAN);

        if (ret == 0)
                mtdswap_rb_add(d, eb, MTDSWAP_CLEAN);
        else if (ret != -EIO && ret != -EBADMSG)
                mtdswap_rb_add(d, eb, MTDSWAP_DIRTY);

        return 0;
}
static void mtdswap_background(struct mtd_blktrans_dev *dev)
{
        struct mtdswap_dev *d = MTDSWAP_MBD_TO_MTDSWAP(dev);
        int ret;

        while (1) {
                ret = mtdswap_gc(d, 1);
                if (ret || mtd_blktrans_cease_background(dev))
                        return;
        }
}
static void mtdswap_cleanup(struct mtdswap_dev *d)
{
        vfree(d->eb_data);
        vfree(d->revmap);
        vfree(d->page_data);
        kfree(d->page_buf);
        kfree(d->oob_buf);
}
static int mtdswap_flush(struct mtd_blktrans_dev *dev)
{
        struct mtdswap_dev *d = MTDSWAP_MBD_TO_MTDSWAP(dev);

        if (d->mtd->sync)
                d->mtd->sync(d->mtd);
        return 0;
}
static unsigned int mtdswap_badblocks(struct mtd_info *mtd, uint64_t size)
{
        loff_t offset;
        unsigned int badcnt;

        badcnt = 0;

        if (mtd->block_isbad)
                for (offset = 0; offset < size; offset += mtd->erasesize)
                        if (mtd->block_isbad(mtd, offset))
                                badcnt++;

        return badcnt;
}
static int mtdswap_writesect(struct mtd_blktrans_dev *dev,
                        unsigned long page, char *buf)
{
        struct mtdswap_dev *d = MTDSWAP_MBD_TO_MTDSWAP(dev);
        unsigned int newblock, mapped;
        struct swap_eb *eb;
        int ret;

        d->sect_write_count++;

        if (d->spare_eblks < MIN_SPARE_EBLOCKS)
                return -ENOSPC;

        if (header) {
                /* Ignore writes to the header page */
                if (unlikely(page == 0))
                        return 0;

                page--;
        }

        mapped = d->page_data[page];
        if (mapped <= BLOCK_MAX) {
                eb = d->eb_data + (mapped / d->pages_per_eblk);
                eb->active_count--;
                mtdswap_store_eb(d, eb);
                d->page_data[page] = BLOCK_UNDEF;
                d->revmap[mapped] = PAGE_UNDEF;
        }

        ret = mtdswap_write_block(d, buf, page, &newblock, 0);
        d->mtd_write_count++;

        if (ret < 0)
                return ret;

        eb = d->eb_data + (newblock / d->pages_per_eblk);
        d->page_data[page] = newblock;

        return 0;
}
/* Provide a dummy swap header for the kernel */
static int mtdswap_auto_header(struct mtdswap_dev *d, char *buf)
{
        union swap_header *hd = (union swap_header *)(buf);

        memset(buf, 0, PAGE_SIZE - 10);

        hd->info.version = 1;
        hd->info.last_page = d->mbd_dev->size - 1;
        hd->info.nr_badpages = 0;

        memcpy(buf + PAGE_SIZE - 10, "SWAPSPACE2", 10);

        return 0;
}
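/*
 * The page built above mimics a minimal version-1 swap header
 * (illustrative summary): the version/last_page/nr_badpages fields at
 * the start of the page and the "SWAPSPACE2" signature in the last 10
 * bytes, so swapon works without running mkswap on the device first.
 */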
static int mtdswap_readsect(struct mtd_blktrans_dev *dev,
                        unsigned long page, char *buf)
{
        struct mtdswap_dev *d = MTDSWAP_MBD_TO_MTDSWAP(dev);
        struct mtd_info *mtd = d->mtd;
        unsigned int realblock, retries;
        loff_t readpos;
        struct swap_eb *eb;
        size_t retlen;
        int ret;

        d->sect_read_count++;

        if (header) {
                if (unlikely(page == 0))
                        return mtdswap_auto_header(d, buf);

                page--;
        }

        realblock = d->page_data[page];
        if (realblock > BLOCK_MAX) {
                memset(buf, 0x0, PAGE_SIZE);
                if (realblock == BLOCK_UNDEF)
                        return 0;
                else
                        return -EIO;
        }

        eb = d->eb_data + (realblock / d->pages_per_eblk);
        BUG_ON(d->revmap[realblock] == PAGE_UNDEF);

        readpos = (loff_t)realblock << PAGE_SHIFT;
        retries = 0;

retry:
        ret = mtd->read(mtd, readpos, PAGE_SIZE, &retlen, buf);

        d->mtd_read_count++;
        if (ret == -EUCLEAN) {
                eb->flags |= EBLOCK_BITFLIP;
                mtdswap_rb_add(d, eb, MTDSWAP_BITFLIP);
                ret = 0;
        }

        if (ret < 0) {
                dev_err(d->dev, "Read error %d\n", ret);
                eb->flags |= EBLOCK_READERR;
                mtdswap_rb_add(d, eb, MTDSWAP_FAILING);
                retries++;
                if (retries < MTDSWAP_IO_RETRIES)
                        goto retry;

                return ret;
        }

        if (retlen != PAGE_SIZE) {
                dev_err(d->dev, "Short read %zd\n", retlen);
                return -EIO;
        }

        return 0;
}
static int mtdswap_discard(struct mtd_blktrans_dev *dev, unsigned long first,
                        unsigned nr_pages)
{
        struct mtdswap_dev *d = MTDSWAP_MBD_TO_MTDSWAP(dev);
        unsigned long page;
        struct swap_eb *eb;
        unsigned int mapped;

        d->discard_count++;

        for (page = first; page < first + nr_pages; page++) {
                mapped = d->page_data[page];
                if (mapped <= BLOCK_MAX) {
                        eb = d->eb_data + (mapped / d->pages_per_eblk);
                        eb->active_count--;
                        mtdswap_store_eb(d, eb);
                        d->page_data[page] = BLOCK_UNDEF;
                        d->revmap[mapped] = PAGE_UNDEF;
                        d->discard_page_count++;
                } else if (mapped == BLOCK_ERROR) {
                        d->page_data[page] = BLOCK_UNDEF;
                        d->discard_page_count++;
                }
        }

        return 0;
}
static int mtdswap_show(struct seq_file *s, void *data)
{
        struct mtdswap_dev *d = (struct mtdswap_dev *) s->private;
        unsigned long sum;
        unsigned int count[MTDSWAP_TREE_CNT];
        unsigned int min[MTDSWAP_TREE_CNT];
        unsigned int max[MTDSWAP_TREE_CNT];
        unsigned int i, cw = 0, cwp = 0, cwecount = 0, bb_cnt, mapped, pages;
        uint64_t use_size;
        char *name[] = {"clean", "used", "low", "high", "dirty", "bitflip",
                        "failing"};

        mutex_lock(&d->mbd_dev->lock);

        for (i = 0; i < MTDSWAP_TREE_CNT; i++) {
                struct rb_root *root = &d->trees[i].root;

                if (root->rb_node) {
                        count[i] = d->trees[i].count;
                        min[i] = rb_entry(rb_first(root), struct swap_eb,
                                        rb)->erase_count;
                        max[i] = rb_entry(rb_last(root), struct swap_eb,
                                        rb)->erase_count;
                } else
                        count[i] = 0;
        }

        if (d->curr_write) {
                cw = 1;
                cwp = d->curr_write_pos;
                cwecount = d->curr_write->erase_count;
        }

        sum = 0;
        for (i = 0; i < d->eblks; i++)
                sum += d->eb_data[i].erase_count;

        use_size = (uint64_t)d->eblks * d->mtd->erasesize;
        bb_cnt = mtdswap_badblocks(d->mtd, use_size);

        mapped = 0;
        pages = d->mbd_dev->size;
        for (i = 0; i < pages; i++)
                if (d->page_data[i] != BLOCK_UNDEF)
                        mapped++;

        mutex_unlock(&d->mbd_dev->lock);

        for (i = 0; i < MTDSWAP_TREE_CNT; i++) {
                if (!count[i])
                        continue;

                if (min[i] != max[i])
                        seq_printf(s, "%s:\t%5d erase blocks, erased min %d, "
                                "max %d times\n",
                                name[i], count[i], min[i], max[i]);
                else
                        seq_printf(s, "%s:\t%5d erase blocks, all erased %d "
                                "times\n", name[i], count[i], min[i]);
        }

        if (bb_cnt)
                seq_printf(s, "bad:\t%5u erase blocks\n", bb_cnt);

        if (cw)
                seq_printf(s, "current erase block: %u pages used, %u free, "
                        "erased %u times\n",
                        cwp, d->pages_per_eblk - cwp, cwecount);

        seq_printf(s, "total erasures: %lu\n", sum);

        seq_printf(s, "\n");

        seq_printf(s, "mtdswap_readsect count: %llu\n", d->sect_read_count);
        seq_printf(s, "mtdswap_writesect count: %llu\n", d->sect_write_count);
        seq_printf(s, "mtdswap_discard count: %llu\n", d->discard_count);
        seq_printf(s, "mtd read count: %llu\n", d->mtd_read_count);
        seq_printf(s, "mtd write count: %llu\n", d->mtd_write_count);
        seq_printf(s, "discarded pages count: %llu\n", d->discard_page_count);

        seq_printf(s, "\n");
        seq_printf(s, "total pages: %u\n", pages);
        seq_printf(s, "pages mapped: %u\n", mapped);

        return 0;
}
static int mtdswap_open(struct inode *inode, struct file *file)
{
        return single_open(file, mtdswap_show, inode->i_private);
}

static const struct file_operations mtdswap_fops = {
        .open		= mtdswap_open,
        .read		= seq_read,
        .llseek		= seq_lseek,
        .release	= single_release,
};
static int mtdswap_add_debugfs(struct mtdswap_dev *d)
{
        struct gendisk *gd = d->mbd_dev->disk;
        struct device *dev = disk_to_dev(gd);

        struct dentry *root;
        struct dentry *dent;

        root = debugfs_create_dir(gd->disk_name, NULL);
        if (!root) {
                dev_err(dev, "failed to initialize debugfs\n");
                return -1;
        }

        d->debugfs_root = root;

        dent = debugfs_create_file("stats", S_IRUSR, root, d,
                                &mtdswap_fops);
        if (!dent) {
                dev_err(d->dev, "debugfs_create_file failed\n");
                debugfs_remove_recursive(root);
                d->debugfs_root = NULL;
                return -1;
        }

        return 0;
}
static int mtdswap_init(struct mtdswap_dev *d, unsigned int eblocks,
                        unsigned int spare_cnt)
{
        struct mtd_info *mtd = d->mbd_dev->mtd;
        unsigned int i, eblk_bytes, pages, blocks;
        int ret = -ENOMEM;

        d->mtd = mtd;
        d->eblks = eblocks;
        d->spare_eblks = spare_cnt;
        d->pages_per_eblk = mtd->erasesize >> PAGE_SHIFT;

        pages = d->mbd_dev->size;
        blocks = eblocks * d->pages_per_eblk;

        for (i = 0; i < MTDSWAP_TREE_CNT; i++)
                d->trees[i].root = RB_ROOT;

        d->page_data = vmalloc(sizeof(int)*pages);
        if (!d->page_data)
                goto page_data_fail;

        d->revmap = vmalloc(sizeof(int)*blocks);
        if (!d->revmap)
                goto revmap_fail;

        eblk_bytes = sizeof(struct swap_eb)*d->eblks;
        d->eb_data = vmalloc(eblk_bytes);
        if (!d->eb_data)
                goto eb_data_fail;

        memset(d->eb_data, 0, eblk_bytes);
        for (i = 0; i < pages; i++)
                d->page_data[i] = BLOCK_UNDEF;

        for (i = 0; i < blocks; i++)
                d->revmap[i] = PAGE_UNDEF;

        d->page_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
        if (!d->page_buf)
                goto page_buf_fail;

        d->oob_buf = kmalloc(2 * mtd->ecclayout->oobavail, GFP_KERNEL);
        if (!d->oob_buf)
                goto oob_buf_fail;

        mtdswap_scan_eblks(d);

        return 0;

oob_buf_fail:
        kfree(d->page_buf);
page_buf_fail:
        vfree(d->eb_data);
eb_data_fail:
        vfree(d->revmap);
revmap_fail:
        vfree(d->page_data);
page_data_fail:
        printk(KERN_ERR "%s: init failed (%d)\n", MTDSWAP_PREFIX, ret);
        return ret;
}
static void mtdswap_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd)
{
        struct mtdswap_dev *d;
        struct mtd_blktrans_dev *mbd_dev;
        char *parts;
        char *this_opt;
        unsigned long part;
        unsigned int eblocks, eavailable, bad_blocks, spare_cnt;
        uint64_t swap_size, use_size, size_limit;
        struct nand_ecclayout *oinfo;
        int ret;

        parts = &partitions[0];
        while ((this_opt = strsep(&parts, ",")) != NULL) {
                if (strict_strtoul(this_opt, 0, &part) < 0)
                        return;

                if (mtd->index == part)
                        break;
        }

        if (mtd->index != part)
                return;

        if (mtd->erasesize < PAGE_SIZE || mtd->erasesize % PAGE_SIZE) {
                printk(KERN_ERR "%s: Erase size %u not multiple of PAGE_SIZE "
                        "%lu\n", MTDSWAP_PREFIX, mtd->erasesize, PAGE_SIZE);
                return;
        }

        if (PAGE_SIZE % mtd->writesize || mtd->writesize > PAGE_SIZE) {
                printk(KERN_ERR "%s: PAGE_SIZE %lu not multiple of write size"
                        " %u\n", MTDSWAP_PREFIX, PAGE_SIZE, mtd->writesize);
                return;
        }

        oinfo = mtd->ecclayout;
        if (!oinfo) {
                printk(KERN_ERR "%s: mtd%d does not have OOB\n",
                        MTDSWAP_PREFIX, mtd->index);
                return;
        }

        if (!mtd->oobsize || oinfo->oobavail < MTDSWAP_OOBSIZE) {
                printk(KERN_ERR "%s: Not enough free bytes in OOB, "
                        "%d available, %zu needed.\n",
                        MTDSWAP_PREFIX, oinfo->oobavail, MTDSWAP_OOBSIZE);
                return;
        }

        if (spare_eblocks > 100)
                spare_eblocks = 100;

        use_size = mtd->size;
        size_limit = (uint64_t) BLOCK_MAX * PAGE_SIZE;

        if (mtd->size > size_limit) {
                printk(KERN_WARNING "%s: Device too large. Limiting size to "
                        "%llu bytes\n", MTDSWAP_PREFIX, size_limit);
                use_size = size_limit;
        }

        eblocks = mtd_div_by_eb(use_size, mtd);
        use_size = eblocks * mtd->erasesize;
        bad_blocks = mtdswap_badblocks(mtd, use_size);
        eavailable = eblocks - bad_blocks;

        if (eavailable < MIN_ERASE_BLOCKS) {
                printk(KERN_ERR "%s: Not enough erase blocks. %u available, "
                        "%d needed\n", MTDSWAP_PREFIX, eavailable,
                        MIN_ERASE_BLOCKS);
                return;
        }

        spare_cnt = div_u64((uint64_t)eavailable * spare_eblocks, 100);

        if (spare_cnt < MIN_SPARE_EBLOCKS)
                spare_cnt = MIN_SPARE_EBLOCKS;

        if (spare_cnt > eavailable - 1)
                spare_cnt = eavailable - 1;

        swap_size = (uint64_t)(eavailable - spare_cnt) * mtd->erasesize +
                (header ? PAGE_SIZE : 0);
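        /*
         * Example (illustrative): 1024 available 128 KiB blocks with the
         * default spare_eblocks of 10% reserve 102 spare blocks, leaving
         * 922 blocks (~115 MiB) of swap, plus one extra page when the
         * builtin header is enabled.
         */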
        printk(KERN_INFO "%s: Enabling MTD swap on device %lu, size %llu KB, "
                "%u spare, %u bad blocks\n",
                MTDSWAP_PREFIX, part, swap_size / 1024, spare_cnt, bad_blocks);

        d = kzalloc(sizeof(struct mtdswap_dev), GFP_KERNEL);
        if (!d)
                return;

        mbd_dev = kzalloc(sizeof(struct mtd_blktrans_dev), GFP_KERNEL);
        if (!mbd_dev) {
                kfree(d);
                return;
        }

        d->mbd_dev = mbd_dev;
        mbd_dev->priv = d;

        mbd_dev->mtd = mtd;
        mbd_dev->devnum = mtd->index;
        mbd_dev->size = swap_size >> PAGE_SHIFT;
        mbd_dev->tr = tr;

        if (!(mtd->flags & MTD_WRITEABLE))
                mbd_dev->readonly = 1;

        if (mtdswap_init(d, eblocks, spare_cnt) < 0)
                goto init_failed;

        if (add_mtd_blktrans_dev(mbd_dev) < 0)
                goto cleanup;

        d->dev = disk_to_dev(mbd_dev->disk);

        ret = mtdswap_add_debugfs(d);
        if (ret < 0)
                goto debugfs_failed;

        return;

debugfs_failed:
        del_mtd_blktrans_dev(mbd_dev);
        return;

cleanup:
        mtdswap_cleanup(d);

init_failed:
        kfree(mbd_dev);
        kfree(d);
}
static void mtdswap_remove_dev(struct mtd_blktrans_dev *dev)
{
        struct mtdswap_dev *d = MTDSWAP_MBD_TO_MTDSWAP(dev);

        debugfs_remove_recursive(d->debugfs_root);
        del_mtd_blktrans_dev(dev);
        mtdswap_cleanup(d);
        kfree(d);
}
static struct mtd_blktrans_ops mtdswap_ops = {
        .name		= "mtdswap",
        .blksize	= PAGE_SIZE,
        .flush		= mtdswap_flush,
        .readsect	= mtdswap_readsect,
        .writesect	= mtdswap_writesect,
        .discard	= mtdswap_discard,
        .background	= mtdswap_background,
        .add_mtd	= mtdswap_add_mtd,
        .remove_dev	= mtdswap_remove_dev,
        .owner		= THIS_MODULE,
};
static int __init mtdswap_modinit(void)
{
        return register_mtd_blktrans(&mtdswap_ops);
}

static void __exit mtdswap_modexit(void)
{
        deregister_mtd_blktrans(&mtdswap_ops);
}

module_init(mtdswap_modinit);
module_exit(mtdswap_modexit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jarkko Lavinen <jarkko.lavinen@nokia.com>");
MODULE_DESCRIPTION("Block device access to an MTD suitable for using as "
                "swap space");