172
174
eversion_t last_complete; // last version pg was complete through.
174
176
eversion_t log_tail; // oldest log entry.
175
bool log_backlog; // do we store a complete log?
178
hobject_t last_backfill; // objects >= this and < last_complete may be missing
177
180
interval_set<snapid_t> purged_snaps;
248
Info() : log_backlog(false) {}
249
Info(pg_t p) : pgid(p), log_backlog(false) { }
252
: last_backfill(hobject_t::get_max())
256
last_backfill(hobject_t::get_max())
251
259
/// True when last_update.version is zero (presumably: nothing has been
/// written to this pg yet -- confirm against callers).
bool is_empty() const {
  return 0 == last_update.version;
}
252
260
/// True when history.epoch_created is zero (name suggests "does not exist";
/// confirm against callers).
bool dne() const {
  return 0 == history.epoch_created;
}
262
/// True when last_backfill is anything other than hobject_t::get_max().
bool is_incomplete() const {
  const bool backfill_pending = last_backfill != hobject_t::get_max();
  return backfill_pending;
}
254
264
void encode(bufferlist &bl) const {
258
268
::encode(pgid, bl);
259
269
::encode(last_update, bl);
260
270
::encode(last_complete, bl);
261
271
::encode(log_tail, bl);
262
::encode(log_backlog, bl);
272
::encode(last_backfill, bl);
263
273
::encode(stats, bl);
264
274
history.encode(bl);
265
275
::encode(purged_snaps, bl);
278
288
::decode(last_update, bl);
279
289
::decode(last_complete, bl);
280
290
::decode(log_tail, bl);
281
::decode(log_backlog, bl);
293
::decode(log_backlog, bl);
296
::decode(last_backfill, bl);
282
297
::decode(stats, bl);
283
298
history.decode(bl);
346
358
* Log - incremental log of recent pg changes.
347
* also, serves as a recovery queue.
349
* when backlog is true,
350
* objects with versions <= bottom are in log.
351
* we do not have any deletion info before that time, however.
352
* log is a "summary" in that it contains all objects in the PG.
359
* serves as a recovery queue for recent changes.
362
BACKLOG = 4, // event invented by generate_backlog
369
BACKLOG = 4, // event invented by generate_backlog [deprecated]
363
370
LOST_REVERT = 5, // lost new version, revert to an older version.
364
371
LOST_DELETE = 6, // lost new version, revert to no object (deleted).
365
372
LOST_MARK = 7, // lost new version, now EIO
470
477
eversion_t head; // newest entry
471
478
eversion_t tail; // version prior to oldest
474
* backlog - true if log is a complete summary of pg contents.
475
* updated will include all items in pg, but deleted will not
476
* include negative entries for items deleted prior to 'tail'.
480
480
list<Entry> log; // the actual log.
482
Log() : backlog(false) {}
496
495
return head.version == 0 && head.epoch == 0;
498
size_t approx_size() const {
499
return head.version - tail.version;
499
502
list<Entry>::iterator find_entry(eversion_t v) {
500
503
int fromhead = head.version - v.version;
501
504
int fromtail = v.version - tail.version;
527
529
::decode(struct_v, bl);
528
530
::decode(head, bl);
529
531
::decode(tail, bl);
530
::decode(backlog, bl);
534
::decode(backlog, bl);
531
536
::decode(log, bl);
534
void copy_after(const Log &other, eversion_t v);
535
bool copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor);
536
void copy_non_backlog(const Log &other);
540
* copy entries from the tail of another Log
542
* @param other Log to copy from
543
* @param from copy entries after this version
545
void copy_after(const Log &other, eversion_t from);
548
* copy a range of entries from another Log
550
* @param other Log to copy from
551
* @param from copy entries after this version
552
* @param to up to and including this version
554
void copy_range(const Log &other, eversion_t from, eversion_t to);
557
* copy up to N entries
559
* @param other source log
560
* @param max max number of entries to copy
562
void copy_up_to(const Log &other, int max);
537
564
ostream& print(ostream& out) const;
539
566
WRITE_CLASS_ENCODER(Log)
761
805
bool deleting; // true while RemoveWQ should be chewing on us
763
void lock(bool no_lockdep=false) {
764
//generic_dout(0) << this << " " << info.pgid << " lock" << dendl;
765
_lock.Lock(no_lockdep);
807
void lock(bool no_lockdep = false);
768
809
//generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
814
/* During handle_osd_map, the osd holds a write lock to the osdmap.
815
* *_with_map_lock_held assume that the map_lock is already held */
816
void lock_with_map_lock_held();
771
818
void assert_locked() {
772
819
assert(_lock.is_locked());
821
bool is_locked() const {
775
822
return _lock.is_locked();
844
891
/* You should not use these items without taking their respective queue locks
845
892
* (if they have one) */
846
xlist<PG*>::item recovery_item, backlog_item, scrub_item, scrub_finalize_item, snap_trim_item, remove_item, stat_queue_item;
893
xlist<PG*>::item recovery_item, scrub_item, scrub_finalize_item, snap_trim_item, remove_item, stat_queue_item;
847
894
int recovery_ops_active;
895
bool waiting_on_backfill;
848
896
#ifdef DEBUG_RECOVERY_OIDS
849
897
set<hobject_t> recovering_oids;
852
epoch_t generate_backlog_epoch; // epoch we decided to build a backlog.
853
900
utime_t replay_until;
869
916
eversion_t pg_trim_to;
871
918
// [primary only] content recovery state
872
bool have_master_log;
874
920
bool prior_set_built;
876
922
struct PriorSet {
877
923
set<int> probe; /// current+prior OSDs we need to probe.
878
set<int> down; /// down osds that would normally be in @probe and might be interesting.
924
set<int> down; /// down osds that would normally be in @a probe and might be interesting.
879
925
map<int,epoch_t> blocked_by; /// current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
881
bool pg_down; /// some down osds are included in @cur; the DOWN pg state bit should be set.
927
bool pg_down; /// some down osds are included in @a cur; the DOWN pg state bit should be set.
882
928
PriorSet(const OSDMap &osdmap,
883
929
const map<epoch_t, Interval> &past_intervals,
884
930
const vector<int> &up,
886
932
const Info &info,
887
933
const PG *debug_pg=NULL);
889
bool affected_by_map(const OSDMap *osdmap, const PG *debug_pg=0) const;
935
bool affected_by_map(const OSDMapRef osdmap, const PG *debug_pg=0) const;
892
938
friend std::ostream& operator<<(std::ostream& oss,
893
939
const struct PriorSet &prior);
895
bool may_need_replay(const OSDMap *osdmap) const;
941
bool may_need_replay(const OSDMapRef osdmap) const;
974
1020
struct AdvMap : boost::statechart::event< AdvMap > {
977
1023
vector<int> newup, newacting;
978
AdvMap(OSDMap *osdmap, OSDMap *lastmap, vector<int>& newup, vector<int>& newacting):
1024
AdvMap(OSDMapRef osdmap, OSDMapRef lastmap, vector<int>& newup, vector<int>& newacting):
979
1025
osdmap(osdmap), lastmap(lastmap), newup(newup), newacting(newacting) {}
982
struct BacklogComplete : boost::statechart::event< BacklogComplete > {
983
BacklogComplete() : boost::statechart::event< BacklogComplete >() {}
1028
struct RecoveryComplete : boost::statechart::event< RecoveryComplete > {
1029
RecoveryComplete() : boost::statechart::event< RecoveryComplete >() {}
985
1031
struct ActMap : boost::statechart::event< ActMap > {
986
1032
ActMap() : boost::statechart::event< ActMap >() {}
1139
1185
struct NeedNewMap : boost::statechart::event< NeedNewMap > {
1140
1186
NeedNewMap() : boost::statechart::event< NeedNewMap >() {}
1189
struct IsIncomplete : boost::statechart::event< IsIncomplete > {
1190
IsIncomplete() : boost::statechart::event< IsIncomplete >() {}
1143
1193
struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
1144
1194
Primary(my_context ctx);
1147
1197
typedef boost::mpl::list <
1148
1198
boost::statechart::custom_reaction< ActMap >,
1149
boost::statechart::custom_reaction< BacklogComplete >,
1150
1199
boost::statechart::custom_reaction< MNotifyRec >,
1151
1200
boost::statechart::custom_reaction< AdvMap >,
1152
boost::statechart::transition< NeedNewMap, WaitActingChange >
1201
boost::statechart::transition< NeedNewMap, WaitActingChange >,
1202
boost::statechart::transition< IsIncomplete, Incomplete >
1154
boost::statechart::result react(const BacklogComplete&);
1155
1204
boost::statechart::result react(const ActMap&);
1156
1205
boost::statechart::result react(const AdvMap&);
1157
1206
boost::statechart::result react(const MNotifyRec&);
1160
1209
struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
1162
1211
typedef boost::mpl::list <
1163
boost::statechart::custom_reaction< MLogRec >
1212
boost::statechart::custom_reaction< MLogRec >,
1213
boost::statechart::custom_reaction< MInfoRec >,
1214
boost::statechart::custom_reaction< MNotifyRec >
1165
1216
WaitActingChange(my_context ctx);
1166
1217
boost::statechart::result react(const MLogRec&);
1218
boost::statechart::result react(const MInfoRec&);
1219
boost::statechart::result react(const MNotifyRec&);
1223
struct Incomplete : boost::statechart::state< Incomplete, Primary>,
1225
Incomplete(my_context ctx);
1192
1251
boost::statechart::custom_reaction< AdvMap >,
1193
1252
boost::statechart::custom_reaction< MInfoRec >,
1194
1253
boost::statechart::custom_reaction< MNotifyRec >,
1195
boost::statechart::custom_reaction< MLogRec >
1254
boost::statechart::custom_reaction< MLogRec >,
1255
boost::statechart::custom_reaction< RecoveryComplete >
1197
1257
boost::statechart::result react(const ActMap&);
1198
1258
boost::statechart::result react(const AdvMap&);
1199
1259
boost::statechart::result react(const MInfoRec& infoevt);
1200
1260
boost::statechart::result react(const MNotifyRec& notevt);
1201
1261
boost::statechart::result react(const MLogRec& logevt);
1262
boost::statechart::result react(const RecoveryComplete&);
1204
1265
struct ReplicaActive : boost::statechart::state< ReplicaActive, Started >, NamedState {
1228
1288
boost::statechart::custom_reaction< MQuery >,
1229
1289
boost::statechart::custom_reaction< MLogRec >,
1230
1290
boost::statechart::custom_reaction< MInfoRec >,
1231
boost::statechart::custom_reaction< BacklogComplete >,
1232
1291
boost::statechart::custom_reaction< ActMap >,
1233
1292
boost::statechart::transition< Activate, ReplicaActive >
1235
1294
boost::statechart::result react(const MQuery& query);
1236
boost::statechart::result react(const BacklogComplete&);
1237
1295
boost::statechart::result react(const MLogRec& logevt);
1238
1296
boost::statechart::result react(const MInfoRec& infoevt);
1239
1297
boost::statechart::result react(const ActMap&);
1273
1329
typedef boost::mpl::list <
1274
1330
boost::statechart::custom_reaction< MLogRec >,
1275
boost::statechart::custom_reaction< BacklogComplete >,
1276
1331
boost::statechart::custom_reaction< GotLog >
1278
1333
boost::statechart::result react(const MLogRec& logevt);
1279
boost::statechart::result react(const BacklogComplete&);
1280
1334
boost::statechart::result react(const GotLog&);
1325
1379
void handle_query(int from, const PG::Query& q,
1326
1380
epoch_t query_epoch,
1327
1381
RecoveryCtx *ctx);
1328
void handle_advance_map(OSDMap *osdmap, OSDMap *lastmap,
1382
void handle_advance_map(OSDMapRef osdmap, OSDMapRef lastmap,
1329
1383
vector<int>& newup, vector<int>& newacting,
1330
1384
RecoveryCtx *ctx);
1331
1385
void handle_activate_map(RecoveryCtx *ctx);
1332
void handle_backlog_generated(RecoveryCtx *ctx);
1386
void handle_recovery_complete(RecoveryCtx *ctx);
1333
1387
void handle_create(RecoveryCtx *ctx);
1334
1388
void handle_loaded(RecoveryCtx *ctx);
1335
1389
} recovery_state;
1394
* peer_info -- projected (updates _before_ replicas ack)
1395
* peer_missing -- committed (updates _after_ replicas ack)
1339
1398
bool need_up_thru;
1340
1399
set<int> stray_set; // non-acting osds that have PG data.
1341
1400
eversion_t oldest_update; // acting: lowest (valid) last_update in active set
1342
1401
map<int,Info> peer_info; // info from peers (stray or prior)
1343
1402
map<int, Missing> peer_missing;
1344
1403
set<int> peer_log_requested; // logs i've requested (and start stamps)
1345
set<int> peer_backlog_requested;
1346
1404
set<int> peer_missing_requested;
1347
1405
set<int> stray_purged; // i deleted these strays; ignore racing PGInfo from them
1348
1406
set<int> peer_activated;
1354
1412
epoch_t last_peering_reset;
1417
* Represents the objects in a range [begin, end)
1420
* 1) begin == end == hobject_t() indicates the interval is unpopulated
1421
* 2) Else, objects contains all objects in [begin, end)
1423
struct BackfillInterval {
1424
// info about a backfill interval on a peer
1425
map<hobject_t,eversion_t> objects;
1432
begin = end = hobject_t();
1435
void reset(hobject_t start) {
1437
begin = end = start;
1440
/// true if there are no objects in this interval
1442
return objects.empty();
1445
/// true if interval extends to the end of the range
1446
bool extends_to_end() {
1447
return end == hobject_t::get_max();
1450
/// Adjusts begin to the first object
1453
begin = objects.begin()->first;
1458
/// drop first entry, and adjust @a begin accordingly
1460
assert(!objects.empty());
1461
objects.erase(objects.begin());
1462
if (objects.empty())
1465
begin = objects.begin()->first;
1469
BackfillInterval backfill_info;
1470
BackfillInterval peer_backfill_info;
1471
int backfill_target;
1356
1473
friend class OSD;
1476
int get_backfill_target() const {
1477
return backfill_target;
1360
1484
list<class Message*> waiting_for_active;
1485
list<class Message*> waiting_for_all_missing;
1361
1486
map<hobject_t, list<class Message*> > waiting_for_missing_object,
1362
1487
waiting_for_degraded_object;
1363
1488
map<eversion_t,list<Message*> > waiting_for_ondisk;
1364
1489
map<eversion_t,class MOSDOp*> replay_queue;
1366
void take_object_waiters(map<hobject_t, list<Message*> >& m);
1491
void requeue_object_waiters(map<hobject_t, list<Message*> >& m);
1368
1493
bool block_if_wrlocked(MOSDOp* op, object_info_t& oi);
1401
1526
void build_prior(std::auto_ptr<PriorSet> &prior_set);
1402
1527
void clear_prior();
1404
bool adjust_need_up_thru(const OSDMap *osdmap);
1529
bool adjust_need_up_thru(const OSDMapRef osdmap);
1406
bool all_unfound_are_queried_or_lost(const OSDMap* osdmap) const;
1531
bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
1407
1532
virtual void mark_all_unfound_lost(int how) = 0;
1409
1534
bool calc_min_last_complete_ondisk() {
1439
1564
void discover_all_missing(std::map< int, map<pg_t,PG::Query> > &query_map);
1441
bool build_backlog_map(map<eversion_t,Log::Entry>& omap);
1442
void assemble_backlog(map<eversion_t,Log::Entry>& omap);
1443
void drop_backlog();
1445
1566
void trim_write_ahead();
1447
bool choose_acting(int newest_update_osd) const;
1448
bool recover_master_log(map< int, map<pg_t,Query> >& query_map,
1449
eversion_t &oldest_update);
1450
eversion_t calc_oldest_known_update() const;
1451
void do_peer(ObjectStore::Transaction& t, list<Context*>& tfin,
1452
map< int, map<pg_t,Query> >& query_map,
1453
map<int, MOSDPGInfo*> *activator_map=0);
1454
bool choose_log_location(const PriorSet &prior_set,
1456
bool &wait_on_backlog,
1458
eversion_t &newest_update,
1459
eversion_t &oldest_update) const;
1568
map<int, Info>::const_iterator find_best_info(const map<int, Info> &infos) const;
1569
bool calc_acting(int& newest_update_osd, vector<int>& want) const;
1570
bool choose_acting(int& newest_update_osd);
1460
1571
void build_might_have_unfound();
1461
1572
void replay_queued_ops();
1462
1573
void activate(ObjectStore::Transaction& t, list<Context*>& tfin,
1488
1599
void clear_recovery_state();
1489
1600
virtual void _clear_recovery_state() = 0;
1490
1601
void defer_recovery();
1491
virtual void check_recovery_op_pulls(const OSDMap *newmap) = 0;
1602
virtual void check_recovery_op_pulls(const OSDMapRef newmap) = 0;
1492
1603
void start_recovery_op(const hobject_t& soid);
1493
1604
void finish_recovery_op(const hobject_t& soid, bool dequeue=false);
1507
1618
int scrub_waiting_on;
1508
1619
epoch_t scrub_epoch_start;
1509
1620
ScrubMap primary_scrubmap;
1621
MOSDRepScrub *active_rep_scrub;
1511
1623
void repair_object(const hobject_t& soid, ScrubMap::object *po, int bad_peer, int ok_peer);
1512
1624
bool _compare_scrub_objects(ScrubMap::object &auth,
1545
1657
_lock("PG::_lock"),
1546
1658
ref(0), deleting(false), dirty_info(false), dirty_log(false),
1547
1659
info(p), coll(p), log_oid(loid), biginfo_oid(ioid),
1548
recovery_item(this), backlog_item(this), scrub_item(this), scrub_finalize_item(this), snap_trim_item(this), remove_item(this), stat_queue_item(this),
1660
recovery_item(this), scrub_item(this), scrub_finalize_item(this), snap_trim_item(this), remove_item(this), stat_queue_item(this),
1549
1661
recovery_ops_active(0),
1550
generate_backlog_epoch(0),
1662
waiting_on_backfill(0),
1553
have_master_log(true),
1554
1665
recovery_state(this),
1555
1666
need_up_thru(false),
1556
1667
last_peering_reset(0),
1668
backfill_target(-1),
1557
1669
pg_stats_lock("PG::pg_stats_lock"),
1558
1670
pg_stats_valid(false),
1559
1671
finish_sync_event(NULL),
1560
1672
finalizing_scrub(false),
1561
1673
scrub_reserved(false), scrub_reserve_failed(false),
1674
scrub_waiting_on(0),
1584
1697
/// True when this pg's role equals PG_ROLE_HEAD.
bool is_primary() const {
  return PG_ROLE_HEAD == role;
}
1585
1698
/// True when this pg's role is strictly greater than zero.
bool is_replica() const {
  return 0 < role;
}
1700
/// Accessor: returns the current value of last_peering_reset.
epoch_t get_last_peering_reset() const {
  return last_peering_reset;
}
1587
1702
//int get_state() const { return state; }
1588
1703
/// True when at least one bit of mask @a m is set in state.
bool state_test(int m) const {
  return (state & m) ? true : false;
}
1634
1749
/// share new pg log entries after a pg is active
1635
1750
void share_pg_log();
1637
void start_peering_interval(const OSDMap *lastmap,
1752
void start_peering_interval(const OSDMapRef lastmap,
1638
1753
const vector<int>& newup,
1639
1754
const vector<int>& newacting);
1640
1755
void set_last_peering_reset();
1662
1777
RecoveryCtx *rctx) {
1663
1778
recovery_state.handle_query(from, q, query_epoch, rctx);
1665
void handle_advance_map(OSDMap *osdmap, OSDMap *lastmap,
1780
void handle_advance_map(OSDMapRef osdmap, OSDMapRef lastmap,
1666
1781
vector<int>& newup, vector<int>& newacting,
1667
1782
RecoveryCtx *rctx) {
1668
1783
recovery_state.handle_advance_map(osdmap, lastmap, newup, newacting, rctx);
1670
1785
void handle_activate_map(RecoveryCtx *rctx) {
1671
1786
recovery_state.handle_activate_map(rctx);
1673
void handle_backlog_generated(RecoveryCtx *rctx) {
1674
recovery_state.handle_backlog_generated(rctx);
1788
void handle_recovery_complete(RecoveryCtx *rctx) {
1789
recovery_state.handle_recovery_complete(rctx);
1676
1791
void handle_create(RecoveryCtx *rctx) {
1677
1792
recovery_state.handle_create(rctx);
1685
1800
virtual void do_op(MOSDOp *op) = 0;
1686
1801
virtual void do_sub_op(MOSDSubOp *op) = 0;
1687
1802
virtual void do_sub_op_reply(MOSDSubOpReply *op) = 0;
1803
virtual void do_scan(MOSDPGScan *op) = 0;
1804
virtual void do_backfill(MOSDPGBackfill *op) = 0;
1688
1805
virtual bool snap_trimmer() = 0;
1690
1807
virtual bool same_for_read_since(epoch_t e) = 0;
1735
1852
out << " v " << pgi.last_update;
1736
1853
if (pgi.last_complete != pgi.last_update)
1737
1854
out << " lc " << pgi.last_complete;
1738
out << " (" << pgi.log_tail << "," << pgi.last_update << "]"
1739
<< (pgi.log_backlog ? "+backlog":"");
1855
out << " (" << pgi.log_tail << "," << pgi.last_update << "]";
1856
if (pgi.is_incomplete())
1857
out << " lb " << pgi.last_backfill;
1741
1859
//out << " c " << pgi.epoch_created;
1742
1860
out << " n=" << pgi.stats.stats.sum.num_objects;