1
package HP::Proliant::Component::EventSubsystem;
2
our @ISA = qw(HP::Proliant::Component);
5
use constant { OK => 0, WARNING => 1, CRITICAL => 2, UNKNOWN => 3 };
11
runtime => $params{runtime},
12
rawdata => $params{rawdata},
13
method => $params{method},
14
condition => $params{condition},
15
status => $params{status},
19
extendedinfo => undef,
23
if ($self->{method} eq 'snmp') {
24
$self = HP::Proliant::Component::EventSubsystem::SNMP->new(%params);
25
my $sysUpTime = SNMP::Utils::get_object(
26
$self->{rawdata}, '1.3.6.1.2.1.1.3.0') || 3600*24*100;
27
$self->{boottime} = int(time - $sysUpTime / 100);
28
} elsif ($self->{method} eq 'cli') {
29
$self = HP::Proliant::Component::EventSubsystem::CLI->new(%params);
30
my $uptime = do { local (@ARGV, $/) = "/proc/uptime"; my $x = <>; close ARGV; $x };
31
# also watch 10 minutes of booting before the operating system starts ticking
32
$self->{boottime} = time - int((split(/\s+/, $uptime))[0]) - 600;
38
for my $event (reverse @{$self->{events}}) {
39
if ($event->{cpqHeEventLogUpdateTime} != 0) {
40
$lasttime = $event->{cpqHeEventLogUpdateTime};
42
$event->{cpqHeEventLogUpdateTime} = $lasttime;
45
# maybe the most recent events had zero timestamps.
46
# fill them up with timestamps from the past.
47
for my $event (@{$self->{events}}) {
48
if ($event->{cpqHeEventLogUpdateTime} != 0) {
49
$lasttime = $event->{cpqHeEventLogUpdateTime};
51
$event->{cpqHeEventLogUpdateTime} = $lasttime;
54
# we need the boottime in the event's check method
55
for my $event (@{$self->{events}}) {
56
$event->{boottime} = $self->{boottime};
64
$self->add_info('checking events');
65
if (scalar (@{$self->{events}}) == 0) {
66
#$self->overall_check();
67
$self->add_info('no events found');
69
foreach (sort { $a->{cpqHeEventLogEntryNumber} <=> $b->{cpqHeEventLogEntryNumber}}
71
$_->check($self->{warningtime}, $self->{criticaltime});
78
foreach (@{$self->{events}}) {
84
package HP::Proliant::Component::EventSubsystem::Event;
85
our @ISA = qw(HP::Proliant::Component::EventSubsystem);
88
use constant { OK => 0, WARNING => 1, CRITICAL => 2, UNKNOWN => 3 };
91
our $interesting_events = {
92
# POST Error: 201-Memory Error Multi-bit error occurred during memory initialization, Board 1, Bank B. Bank containing DIMM(s) has been disabled..
93
# POST Error: 201-Memory Error Single-bit error occured during memory initialization, Board 1, DIMM 1. Bank containing DIMM(s) has been disabled..
94
# POST Error: 207-Memory initialization error on Memory Board 5 DIMM 7. The operating system may not have access to all of the memory installed in the system..
95
# POST Error: 207-Invalid Memory Configuration-Mismatched DIMMs within DIMM Bank Memory in Bank A Not Utilized..
96
# POST Error: 210 - Quick Path Interconnect (QPI) Link Degradation. A QPI link is operating in a degraded performace state..
98
'201-Memory', '207-Memory',
99
'210\s*-\s*Quick Path Interconnect.*degraded.*'
102
'Corrected Memory Error threshold exceeded',
103
'Uncorrectable Memory Error',
112
runtime => $params{runtime},
113
cpqHeEventLogEntryNumber => $params{cpqHeEventLogEntryNumber},
114
cpqHeEventLogEntrySeverity => lc $params{cpqHeEventLogEntrySeverity},
115
cpqHeEventLogEntryClass => $params{cpqHeEventLogEntryClass},
116
cpqHeEventLogEntryCount => $params{cpqHeEventLogEntryCount} || 1,
117
cpqHeEventLogInitialTime => $params{cpqHeEventLogInitialTime},
118
cpqHeEventLogUpdateTime => $params{cpqHeEventLogUpdateTime},
119
cpqHeEventLogErrorDesc => $params{cpqHeEventLogErrorDesc},
122
extendedinfo => undef,
124
if (! $self->{cpqHeEventLogInitialTime}) {
125
$self->{cpqHeEventLogInitialTime} = $self->{cpqHeEventLogUpdateTime};
129
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
132
#<----- ignore -------><----- warning ------><---------- critical --------->
134
# If we have --eventrange <warnlookback>/<critlookback>
135
# Very young events are shown as critical
136
# If the event gets older, it is shown as warning
137
# At some time, the event is no longer shown
138
# Without --eventrange the event is shown as critical until you manually repair it
139
if ($params{runtime}->{options}->{eventrange}) {
140
my ($warningrange, $criticalrange) = split(/\//, $params{runtime}->{options}->{eventrange});
141
if (! $criticalrange) {
142
$criticalrange = $warningrange;
144
if ($criticalrange =~ /^(\d+)[s]*$/) {
146
} elsif ($criticalrange =~ /^(\d+)m$/) {
147
$criticalrange = $1 * 60;
148
} elsif ($criticalrange =~ /^(\d+)h$/) {
149
$criticalrange = $1 * 3600;
150
} elsif ($criticalrange =~ /^(\d+)d$/) {
151
$criticalrange = $1 * 3600 * 24;
153
die "range has to be <number>[smhd]";
155
if ($warningrange =~ /^(\d+)[s]*$/) {
157
} elsif ($warningrange =~ /^(\d+)m$/) {
158
$warningrange = $1 * 60;
159
} elsif ($warningrange =~ /^(\d+)h$/) {
160
$warningrange = $1 * 3600;
161
} elsif ($warningrange =~ /^(\d+)d$/) {
162
$warningrange = $1 * 3600 * 24;
164
die "range has to be <number>[smhd]";
166
$self->{warningtime} = time - $warningrange;
167
$self->{criticaltime} = time - $criticalrange;
169
$self->{warningtime} = 0;
170
$self->{criticaltime} = 0;
178
$self->blacklist('evt', $self->{cpqHeEventLogEntryNumber});
179
# only check severity "critical" and "caution"
180
# optional: only check interesting events
181
# POST events only if they were logged no earlier than boottime (on cli: OS start minus 10 minutes)
182
# younger than critical? -> critical
184
$self->add_info(sprintf "Event: %d Added: %s Class: (%s) %s %s",
185
$self->{cpqHeEventLogEntryNumber},
186
$self->{cpqHeEventLogUpdateTime},
187
$self->{cpqHeEventLogEntryClass},
188
$self->{cpqHeEventLogEntrySeverity},
189
$self->{cpqHeEventLogErrorDesc});
190
if ($self->{cpqHeEventLogEntrySeverity} eq "caution" ||
191
$self->{cpqHeEventLogEntrySeverity} eq "critical") {
192
# also watch 10 minutes of booting before the operating system starts ticking
193
if ($self->{cpqHeEventLogUpdateTime} >= $self->{boottime}) {
194
foreach my $class (keys %{$HP::Proliant::Component::EventSubsystem::Event::interesting_events}) {
195
foreach my $pattern (@{$HP::Proliant::Component::EventSubsystem::Event::interesting_events->{$class}}) {
196
if ($self->{cpqHeEventLogErrorDesc} =~ /$pattern/) {
197
if ($self->{cpqHeEventLogUpdateTime} < $self->{warningtime}) {
198
# this event is older than the warning lookback; you ignored the problem for too long.
199
# don't say i didn't warn you.
204
} elsif ($self->{cpqHeEventLogUpdateTime} < $self->{criticaltime}) {
205
$self->add_message(WARNING, $self->{info});
208
$self->add_message(CRITICAL, $self->{info});
222
printf "[EVENT_%s]\n", $self->{cpqHeEventLogEntryNumber};
223
foreach (qw(cpqHeEventLogEntryNumber cpqHeEventLogEntrySeverity
224
cpqHeEventLogEntryCount cpqHeEventLogInitialTime
225
cpqHeEventLogUpdateTime cpqHeEventLogErrorDesc)) {
226
if ($_ =~ /.*Time$/) {
227
printf "%s: %s\n", $_, scalar localtime $self->{$_};
229
printf "%s: %s\n", $_, $self->{$_};
232
printf "info: %s\n\n", $self->{info};