3
# check_status.pl Nagios Plugin - Version 1.3
4
# Last Updated: 1/9/2003
6
# Report any bugs/questions to Russell Scibetti at russell@quadrix.com
8
# check_status Change Log:
11
# - Better help and documentation (separate doc?)
12
# - Take argument (patterns to match) from a separate spec file
15
# - Added ChangeLog information and updated --help output
16
# - hostdown (hd) argument for how a service check should respond
17
# when its host is Down/Unreachable
18
# (--hostdown="ok|warning|critical|unknown")
19
# - Changed name from check_state to check_status
20
# - Set hostdown to default to OK when the argument isn't specified
21
# - Number of Hosts checked is now output in OK result
23
# Version 1.2 additions:
25
# - Added ability to handle ack'd and downtimed services differently
26
# depending on argument provided
27
# (--ack="ok|warning|critical|unknown|down|unreachable"
28
# --dt="ok|warning|critical|unknown|down|unreachable")
30
# Version 1.1 additions:
32
# - Added --host=<regex>, --servhost=<regex> to allow for specific field
33
# matching (host for matching hostname in host checks, servhost for
34
# matching the hostname in service checks, service for matching the
35
# service name in service checks)
36
# - Output the number of OK services for an OK output
38
# Version 1.0 features:
40
# - Freshness check of status.log (timestamp)
41
# - Match service or host checks
42
# - Can ignore acknowledged or downtimed services/hosts (--ack, --dt)
43
# - Can output different levels of detail dependent on # of problems
44
# - Can check for number of critical, warning, or unknowns
46
#############################################################
51
Getopt::Long::Configure('bundling');
54
("V" => \$version, "version" => \$version,
55
"h" => \$help, "help" => \$help,
56
"v" => \$verbose, "verbose" => \$verbose,
57
"w=s" => \$warning, "warning=s" => \$warning,
58
"c=s" => \$critical, "critical=s" => \$critical,
59
"u=s" => \$unknown, "unknown=s" => \$unknown,
60
"p=s" => \$pattern, "pattern=s" => \$pattern,
61
"S:s" => \$service, "service:s" => \$service,
62
"s=s" => \$status, "status=s" => \$status,
63
"d=s" => \$dir, "dir=s" => \$dir,
64
"D=s" => \$details, "details=s" => \$details,
65
"H:s" => \$host, "host:s" => \$host,
66
"f=s" => \$freshness, "freshness=s" => \$freshness,
67
"servhost=s" => \$servhost,
68
"a:s" => \$ack, "ack:s" => \$ack,
69
"dt:s"=> \$dt, "downtime:s" => \$dt,
70
"hd:s"=> \$hdown, "hostdown:s" => \$hdown,
83
my $unreach="UNREACHABLE";
85
# Print out Help information
92
# Print out version information
98
# Check for status log or directory argument or print usage
101
print "Usage: $0 -s <status file> | -d <Nagios log dir>\n";
102
print "Use the --help option for full list of arguments\n";
105
elsif ($dir =~ m#[^/]/$#) {
106
$status = $dir . "status.log";
109
$status = $dir . "/status.log";
119
if (!$host && !$servhost) {
123
if (!$host && !$service) {
131
elsif (!($ack =~ "ok|critical|warning|unknown|down|unreachable")) {
132
print "Invalid value for ack\n";
141
elsif (!($dt =~ "ok|critical|warning|unknown|down|unreachable")) {
142
print "Invalid value for dt\n";
147
if (defined $hdown) {
151
elsif (!($hdown =~ "ok|critical|warning|unknown|down|unreachable")) {
152
print "Invalid value for hostdown\n";
157
# Largest problem count for which full per-problem detail is printed;
# replaced by the first value of --details when two values are supplied.
my $much_details = 0;
159
# Alternation patterns matched (as regexes) against the state field of
# SERVICE and HOST entries in the status log.
my $ServiceNotOK = "CRITICAL|WARNING|UNKNOWN";
160
my $HostNotOK = "DOWN|UNREACHABLE";
162
# Number of problems per state; read when computing $total and when
# comparing against the thresholds below.
my %numprob = ("WARNING",0,"CRITICAL",0,"UNKNOWN",0,"DOWN",0,"UNREACHABLE",0);
171
# Per-state thresholds for returning WARNING, CRITICAL and UNKNOWN; filled
# from the comma-separated form of -w, -c and -u respectively.
my %warnlevel = ("WARNING",0,"CRITICAL",0,"UNKNOWN",0);
172
my %critlevel = ("WARNING",0,"CRITICAL",0,"UNKNOWN",0);
173
my %unklevel = ("WARNING",0,"CRITICAL",0,"UNKNOWN",0);
174
# NOTE(review): no use of %hostlevel is visible in this excerpt -- confirm
# whether it is still needed.
my %hostlevel = ("DOWN",0,"UNREACHABLE",0);
176
# Store Hosts in downtime
180
# Store Hosts in a Down/Unreachable state
184
# Hash for storing state-change to OK times for hosts:
187
# Number of matches in parsing
191
if ($warning =~ /,/) {
192
@wlev = split /,/,$warning;
193
$warnlevel{"WARNING"} = $wlev[0];
194
$warnlevel{"CRITICAL"} = $wlev[1];
196
$warnlevel{"UNKNOWN"} = $wlev[2];
200
$WarnOnly = $warning;
208
if ($critical =~ /,/) {
209
@clev = split /,/,$critical;
210
$critlevel{"WARNING"} = $clev[0];
211
$critlevel{"CRITICAL"} = $clev[1];
213
$critlevel{"UNKNOWN"} = $clev[2];
217
$CritOnly = $critical;
225
if ($unknown =~ /,/) {
226
@ulev = split /,/,$unknown;
227
$unklevel{"WARNING"} = $ulev[0];
228
$unklevel{"CRITICAL"} = $ulev[1];
230
$unklevel{"UNKNOWN"} = $ulev[2];
243
$freshness = 30 * 60;
246
$freshness = $freshness * 60;
249
# Per-state counters compared against $details / $much_details to cap how
# many entries are appended to the matching output strings below.
my %ct = ("CRITICAL",0,"WARNING",0,"UNKNOWN",0,"DOWN",0,"UNREACHABLE",0);
250
my %much_ct = ("CRITICAL",0,"WARNING",0,"UNKNOWN",0,"DOWN",0,"UNREACHABLE",0);
252
# Per-state text accumulated for the result line: %output holds the short
# "service on host" form, %much_output the fuller form with extra fields.
my %output = ("CRITICAL","","WARNING","","UNKNOWN","","DOWN","","UNREACHABLE","");
253
my %much_output = ("CRITICAL","","WARNING","","UNKNOWN","","DOWN","","UNREACHABLE","");
256
if ($details =~ /,/) {
257
my @tempv = split /,/,$details;
258
$much_details = $tempv[0];
259
$details = $tempv[1];
263
open("sta","$status") || die "Cannot open status file $status!";
266
$file_time = stat($status)->mtime;
268
if ($curr_time - $file_time > $freshness) {
269
printf "State CRITICAL - Status file is stale!!!\n";
270
exitcheck($CRITICAL);
275
if (/^[^\s]+[\s]+HOST;/) {
276
@hdata = split /;/,$_;
278
# If you care about matching hosts (not services):
279
if ($host && $hdata[1] =~ /$host/) {
281
if ( $hdata[2] =~ /$HostNotOK/ ) {
282
addproblem($_,$hdata[2]);
286
# If you are matching services, gather host information:
288
if ( $hdata[2] =~ /$HostNotOK/ ) {
289
$hostdown[$numdown] = $hdata[1];
293
$hostoktimes{$hdata[1]} = $hdata[4];
295
if ( $hdata[17] ne "0" ) {
296
$hostdowntime[$numdowntime] = $hdata[1];
301
elsif (!$host && /^[^\s]+[\s]+SERVICE;/) {
302
@servdata = split /;/,$_;
303
if ( ( $pattern && ($_ =~ /$pattern/)) ||
304
(($servdata[1] =~ /$servhost/) && ($servdata[2] =~ /$service/)) ){
306
if (($servdata[5] eq "HARD") && ($servdata[3] =~ /$ServiceNotOK/)) {
307
addproblem($_,$servdata[3]);
316
print "Nothing Matches your criteria!\n";
320
# Count the number of problems (for reference):
322
$total = $numprob{"DOWN"} + $numprob{"UNREACHABLE"};
325
$total = $numprob{"WARNING"} + $numprob{"CRITICAL"} + $numprob{"UNKNOWN"};
328
my $numok = $nummatch - $total;
330
# If this is a host state check:
332
if ($numprob{"DOWN"}>0 || $numprob{"UNREACHABLE"}>0 ) {
333
if ($details && ($total <= $details)) {
334
print "State CRITICAL - $total Host Problems: $output{$down} $output{$unreach}\n";
335
exitcheck($CRITICAL);
338
print "State CRITICAL - $numprob{$down} Hosts Down, $numprob{$unreach} Hosts Unreachable\n";
339
exitcheck($CRITICAL);
343
print "State OK - $numok Hosts Up, $total Problems\n";
348
#If you only defined a Critical level in terms of # of criticals...
349
elsif ($CritOnly && ($numprob{"CRITICAL"} >= $CritOnly)) {
350
countAndPrint($crit,$numprob{$crit},0);
351
exitcheck($CRITICAL);
354
#Critical in terms of # warnings, # criticals, and # unknowns...
355
elsif (!$CritOnly && ($numprob{"WARNING"} >= $critlevel{"WARNING"} ||
356
$numprob{"CRITICAL"} >= $critlevel{"CRITICAL"} ||
357
$numprob{"UNKNOWN"} >= $critlevel{"UNKNOWN"} )) {
358
countAndPrint($crit,$total,1);
359
exitcheck($CRITICAL);
362
#Warning in terms of # warnings only...
363
elsif ($WarnOnly && ($numprob{"WARNING"} >= $WarnOnly)) {
364
countAndPrint($warn,$numprob{$warn},0);
368
#Warning in terms of # warnings, # criticals, and # unknowns...
369
elsif (!$WarnOnly && ($numprob{"WARNING"} >= $warnlevel{"WARNING"} ||
370
$numprob{"CRITICAL"} >= $warnlevel{"CRITICAL"} ||
371
$numprob{"UNKNOWN"} >= $warnlevel{"UNKNOWN"})) {
372
countAndPrint($warn,$total,1);
376
#Unknown in terms of # unknowns only...
377
elsif ( $UnkOnly && ($numprob{"UNKNOWN"}>=$UnkOnly) ) {
378
countAndPrint($unk,$numprob{$unk},0);
382
#Unknown in terms of # warning, critical, and unknown...
383
elsif (!$UnkOnly && ($numprob{"WARNING"} >= $unklevel{"WARNING"} ||
384
$numprob{"CRITICAL"} >= $unklevel{"CRITICAL"} ||
385
$numprob{"UNKNOWN"} >= $unklevel{"UNKNOWN"})) {
386
countAndPrint($unk,$total,1);
392
print "State OK - $numok OK, $total problems\n";
398
############################
400
############################
402
# Return the proper exit code for Critical, Warning, Unknown, or OK
412
# Decide what to print for services:
416
my $alltypes = $_[2];
417
my $output = "State $state - ";
420
if ($count<=$much_details) {
422
$output .= "$count problems: $much_output{$crit} $much_output{$warn} $much_output{$unk}";
425
$output .= "$count \L$state\E: $much_output{$state}";
428
elsif ($count<=$details) {
430
$output .= "$count problems: $output{$crit} $output{$warn} $output{$unk}";
433
$output .= "$count \L$state\E: $output{$state}";
438
$output .= "$numprob{$crit} critical, $numprob{$warn} warning, $numprob{$unk} unknown";
441
$output .= "$count \L$state\E";
446
$output .= "$count problems";
453
# Add-in the problem found in the status log
460
my @values = split /;/,$_[0];
463
my $namehold = $values[1];
464
if ($ack && ($values[13] eq "1")) {
472
elsif ($hdown && grep /$namehold/, @hostdown) {
473
if ($hdown =~ "ok") {
478
$diffout = "$values[1] is down";
481
elsif ($dt && (($values[27] ne "0") || (grep /$namehold/, @hostdowntime))){
489
elsif (exists $hostoktimes{$namehold}) {
490
# If the state change time of the host is more recent than the last
491
# service check, must wait until the next service check runs!
492
if ($hostoktimes{$namehold} > $values[6]) {
498
if ($ack && $values[5]) {
506
elsif ($dt && ($values[17] ne "0")) {
516
if ($details && $test) {
519
$much_output{$type} .= " $diffout;";
520
$output{$type} .= "$diffout;";
525
if ($much_details && $much_ct{$type}<$much_details) {
526
$much_output{$type} .= " $values[2] on $values[1] $values[31];";
529
if ($ct{$type} < $details) {
530
$output{$type} .= " $values[2] on $values[1];";
536
$much_output{$type} .= " $values[1] $_[1] $values[20],";
538
$output{$type} .= " $values[1] HOST $_[1],";
547
################################
549
# Version and Help Information
551
################################
555
$0 (nagios-plugins) 1.3
556
The nagios plugins come with ABSOLUTELY NO WARRANTY. You may redistribute
557
copies of the plugins under the terms of the GNU General Public License.
558
For more information about these matters, see the file named COPYING.
565
This plugin parses through the Nagios status log and will return a
566
Critical, Warning, or Unknown state depending on the number of
567
Critical, Warning, and/or Unknown services found in the log
568
(or Down/Unreachable hosts when matching against hosts)
570
Usage: $0 -s <Status File> | -d <Nagios Log Directory>
571
[-w #[,#][,#]] [-c #[,#][,#]] [-u #[,#][,#]]
572
[--service=<RegEx> | --servhost=<RegEx> | --pattern=<RegEx> |
573
--host | --host=<RegEx>]
574
[--ack[=string]] [--dt[=string]] [--hostdown[=string]]
575
[-D #[,#]] [--ok] [-f <Log freshness in # minutes>]
578
NOTE: One of -s and -d must be specified
581
-s, --status=FILE_NAME
582
Location and name of status log (e.g. /usr/local/nagios/var/status.log)
583
-d, --dir=DIRECTORY_NAME
584
Directory that contains the nagios logs (e.g. /usr/local/nagios/var/)
585
-w, --warning=INTEGER[,INTEGER][,INTEGER]
586
#: Number of warnings to result in a WARNING state
588
#,#: Warning,Criticals to result in a WARNING state
590
#,#,#: Warning,Critical,Unknown to result in a WARNING state
592
-c, --critical=INTEGER[,INTEGER][,INTEGER]
593
#: Number of criticals to result in a CRITICAL state
595
#,#: Warning,Criticals to result in a CRITICAL state
597
#,#,#: Warning,Critical,Unknown to result in a CRITICAL state
599
-u, --unknown=INTEGER[,INTEGER][,INTEGER]
600
#: Number of unknowns to result in an UNKNOWN state
602
#,#: Warning,Criticals to result in an UNKNOWN state
604
#,#,#: Warning,Critical,Unknown to result in an UNKNOWN state
606
-S, --service[=REGEX]
607
Only match services [that match the RegEx]
608
(--service is default setting if no other matching arguments provided)
610
Only match services whose host match the RegEx
612
Only parse for this regular expression (services only, not hosts)
614
Report on the state of hosts (whose name matches the RegEx if provided)
615
-a, --ack[=ok|warning|critical|unknown|down|unreachable]
616
Handle Acknowledged problems [--ack defaults to ok]
617
--dt, --downtime[=ok|warning|critical|unknown|down|unreachable]
618
Handle problems in scheduled downtime [--dt defaults to ok]
619
--hd, --hostdown[=ok|warning|critical|unknown|down|unreachable]
620
Handle services whose Host is down [--hd defaults to ok]
621
-D, --details=INTEGER[,INTEGER]
622
Amount of verbosity to output
624
<= 1st integer, return full details (each plugin's output)
625
<= 2nd integer, return some details (list each service host pair)
626
> 2nd integer, return the # of problems
627
-f, --freshness=INTEGER
628
Number of minutes old the log can be to make sure Nagios is running
629
(Default = 30 minutes)
631
Return an OK exit code, regardless of number of problems found
633
Print detailed help screen
635
Print version information
637
For service checking (use --service and/or --servhost):
638
1. The values of warning, critical, and unknown default to 1, i.e.
639
$0 will return CRITICAL if there is at least 1 critical service,
640
WARNING if there is at least 1 warning service, and UNKNOWN if there is
641
at least one unknown service.
643
2. If a service's host is DOWN or UNREACHABLE, $0 will use the
644
value of --hostdown to determine how to treat the service. Without that
645
argument, $0 will count the service as OK.
647
3. If a service's host is OK, but the last host-state change occurred more
648
recently than the last service check, $0 will ignore that service
649
(want to wait until the service has been checked after a host has recovered
650
or you may get service alerts for services that still need to be checked)
652
4. If the --dt, --ack, or --hd tags are used, $0 will use the value
653
of the arguments to determine how to handle services in downtime, acknowledged,
654
or with down hosts (default=OK). For service checks, --dt will also check
655
if the service's host is in a downtime.
657
For host checking (use --host):
658
1. Using the --host argument, $0 will look for DOWN and UNREACHABLE
659
hosts. If any are found, $0 will return a CRITICAL. You can provide
660
a REGEX for --host to only check hosts with matching host names.
662
2. If the --dt or --ack tags are used, $0 will use the value of the
663
--dt/--ack arguments to determine the state of the host (default is OK)