1081
1150
note["repair"] += p->second;
1082
1151
if (p->first & PG_STATE_SPLITTING)
1083
1152
note["splitting"] += p->second;
1153
if (p->first & PG_STATE_RECOVERING)
1154
note["recovering"] += p->second;
1155
if (p->first & PG_STATE_INCOMPLETE)
1156
note["incomplete"] += p->second;
1157
if (p->first & PG_STATE_BACKFILL)
1158
note["backfill"] += p->second;
1161
hash_map<pg_t, pg_stat_t> stuck_pgs;
1162
utime_t now(ceph_clock_now(g_ceph_context));
1163
utime_t cutoff = now - utime_t(g_conf->mon_pg_stuck_threshold, 0);
1165
pg_map.get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs);
1166
if (!stuck_pgs.empty()) {
1167
note["stuck inactive"] = stuck_pgs.size();
1171
pg_map.get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs);
1172
if (!stuck_pgs.empty()) {
1173
note["stuck unclean"] = stuck_pgs.size();
1177
pg_map.get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs);
1178
if (!stuck_pgs.empty()) {
1179
note["stuck stale"] = stuck_pgs.size();
1183
for (hash_map<pg_t,pg_stat_t>::iterator p = stuck_pgs.begin();
1184
p != stuck_pgs.end();
1187
ss << "pg " << p->first << " is stuck " << pg_state_string(p->second.state)
1188
<< ", last acting " << p->second.acting;
1189
detail->push_back(make_pair(HEALTH_WARN, ss.str()));
1085
1193
if (!note.empty()) {
1087
1194
for (map<string,int>::iterator p = note.begin(); p != note.end(); p++) {
1088
if (p != note.begin())
1090
1196
ss << p->second << " pgs " << p->first;
1197
summary.push_back(make_pair(HEALTH_WARN, ss.str()));
1200
for (hash_map<pg_t,pg_stat_t>::const_iterator p = pg_map.pg_stat.begin();
1201
p != pg_map.pg_stat.end();
1203
if (p->second.state & (PG_STATE_STALE |
1206
PG_STATE_INCONSISTENT |
1209
PG_STATE_SPLITTING |
1210
PG_STATE_RECOVERING |
1211
PG_STATE_INCOMPLETE |
1212
PG_STATE_BACKFILL) &&
1213
stuck_pgs.count(p->first) == 0) {
1215
ss << "pg " << p->first << " is " << pg_state_string(p->second.state);
1216
if (p->second.stats.sum.num_objects_unfound)
1217
ss << ", " << p->second.stats.sum.num_objects_unfound << " unfound";
1218
detail->push_back(make_pair(HEALTH_WARN, ss.str()));
1094
1224
stringstream rss;
1095
1225
pg_map.recovery_summary(rss);
1096
1226
if (!rss.str().empty()) {
1097
if (ret != HEALTH_OK)
1227
summary.push_back(make_pair(HEALTH_WARN, "recovery " + rss.str()));
1229
detail->push_back(make_pair(HEALTH_WARN, "recovery " + rss.str()));
1232
check_full_osd_health(summary, detail, pg_map.full_osds, "full", HEALTH_ERR);
1233
check_full_osd_health(summary, detail, pg_map.nearfull_osds, "near full", HEALTH_WARN);
1236
// Report a set of OSDs (e.g. "full" or "near full") as health messages.
//
// summary : list that receives one aggregate entry of the form
//           "<count> <desc> osd(s)" tagged with severity `sev`.
// detail  : list that receives one entry per OSD id in `s`, of the form
//           "osd.<id> is <desc> at <ratio>%"; it is dereferenced below.
// s       : set of OSD ids to report.
// desc    : human-readable label inserted into both message forms.
// sev     : health severity attached to every entry pushed here.
//
// NOTE(review): this extract is not contiguous -- the opening/closing braces,
// the declaration of `ss`, and any guards (such as an emptiness check on `s`
// or a null check on `detail`) are not visible in these lines; confirm
// against the full file before relying on this description.
void PGMonitor::check_full_osd_health(list<pair<health_status_t,string> >& summary,
1237
list<pair<health_status_t,string> > *detail,
1238
const set<int>& s, const char *desc,
1239
health_status_t sev) const
1243
// Aggregate line: number of affected OSDs followed by the label.
ss << s.size() << " " << desc << " osd(s)";
1244
summary.push_back(make_pair(sev, ss.str()));
1246
// One detail line per affected OSD.
for (set<int>::const_iterator p = s.begin(); p != s.end(); ++p) {
1248
// NOTE(review): the find() result is dereferenced without comparing it to
// end(); this presumably relies on every id in `s` having an osd_stat
// entry -- verify that invariant.
const osd_stat_t& os = pg_map.osd_stat.find(*p)->second;
1249
// Used space as a percentage of total, truncated to an int.
// NOTE(review): if os.kb can be 0 the quotient is inf/NaN and the cast to
// int is not well defined -- confirm kb is always non-zero here.
int ratio = (int)(((float)os.kb_used) / (float) os.kb * 100.0);
1250
ss << "osd." << *p << " is " << desc << " at " << ratio << "%";
1251
detail->push_back(make_pair(sev, ss.str()));
1257
int PGMonitor::dump_stuck_pg_stats(ostream& ss,
1259
vector<const char*>& args) const
1261
string format = "plain";
1263
int threshold = g_conf->mon_pg_stuck_threshold;
1267
if (args.size() < 2) {
1268
ss << "Must specify inactive or unclean or stale.";
1272
PGMap::StuckPG stuck_type = PGMap::STUCK_NONE;
1273
string type = args[1];
1274
if (type == "inactive")
1275
stuck_type = PGMap::STUCK_INACTIVE;
1276
if (type == "unclean")
1277
stuck_type = PGMap::STUCK_UNCLEAN;
1278
if (type == "stale")
1279
stuck_type = PGMap::STUCK_STALE;
1280
if (stuck_type == PGMap::STUCK_NONE) {
1281
ss << "Invalid stuck type '" << type
1282
<< "'. Valid types are: inactive, unclean, or stale";
1286
for (std::vector<const char*>::iterator i = args.begin() + 2;
1287
i != args.end(); ) {
1288
if (ceph_argparse_double_dash(args, i)) {
1290
} else if (ceph_argparse_witharg(args, i, &val,
1291
"-f", "--format", (char*)NULL)) {
1292
if (val != "json" && val != "plain") {
1293
ss << "format must be json or plain";
1297
} else if (ceph_argparse_withint(args, i, &seconds, &err,
1298
"-t", "--threshold", (char*)NULL)) {
1299
if (!err.str().empty()) {
1303
threshold = seconds;
1304
} else if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {
1306
ds << "Usage: ceph pg dump_stuck inactive|unclean|stale [options]" << std::endl
1308
<< "Get stats for pgs that have not been active, clean, or refreshed in some number of seconds." << std::endl
1310
<< "Options: " << std::endl
1311
<< " -h, --help display usage info" << std::endl
1312
<< " -f, --format [plain|json] output format (default: plain)" << std::endl
1313
<< " -t, --threshold [seconds] how many seconds 'stuck' is (default: 300)" << std::endl;
1317
ss << "invalid argument '" << *i << "'";
1322
utime_t now(ceph_clock_now(g_ceph_context));
1323
utime_t cutoff = now - utime_t(threshold, 0);
1326
if (format == "json") {
1327
JSONFormatter jsf(true);
1328
pg_map.dump_stuck(&jsf, stuck_type, cutoff);
1331
pg_map.dump_stuck_plain(ds, stuck_type, cutoff);