~vcs-imports/mysql-mmm/2.0

« back to all changes in this revision

Viewing changes to lib/Monitor/Monitor.pm

  • Committer: mail at pascalhofmann
  • Date: 2009-05-20 13:06:24 UTC
  • Revision ID: vcs-imports@canonical.com-20090520130624-uzrm6naivl5udkli
mmm2:
- Show unreachable agents in "mmm_control status" output.
- Flag unreachable agents on startup to prevent assignment of roles when monitor is set active.
- Improved logging
- Documentation update

Show diffs side-by-side

added added

removed removed

Lines of Context:
63
63
        $self->roles(MMM::Monitor::Roles->instance());
64
64
        $self->passive_info('');
65
65
 
 
66
        my $checks      = $self->checks_status;
66
67
 
67
68
        #___________________________________________________________________________
68
69
        #
114
115
                        $agent_status->{$host} = { state => $state, roles => \@roles, master => $master };
115
116
                }
116
117
                elsif ($agent->state ne 'ADMIN_OFFLINE') {
117
 
                        ERROR "(startup) Switching to passive mode:  The status of the agent on host '$host' could not be determined (answer was: $res).";
 
118
                        if ($checks->ping($host) && $checks->mysql($host) && !$agent->agent_down()) {
 
119
                                ERROR "Can't reach agent on host '$host'";
 
120
                                $agent->agent_down(1);
 
121
                        }
 
122
                        ERROR "Switching to passive mode: The status of the agent on host '$host' could not be determined (answer was: $res).";
118
123
                        $status                 = 0;
119
124
                        $host_status    = 0;
120
125
                }
142
147
                        };
143
148
                }
144
149
                elsif ($agent->state ne 'ADMIN_OFFLINE') {
145
 
                        ERROR "(startup) Switching to passive mode:  The status of the system '$host' could not be determined (answer was: $res).";
 
150
                        if ($checks->ping($host) && $checks->mysql($host) && !$agent->agent_down()) {
 
151
                                ERROR "Can't reach agent on host '$host'";
 
152
                                $agent->agent_down(1);
 
153
                        }
 
154
                        ERROR "Switching to passive mode: The status of the system '$host' could not be determined (answer was: $res).";
146
155
                        $status                 = 0;
147
156
                        $host_status    = 0;
 
157
 
148
158
                }
149
159
 
150
160
 
152
162
                #
153
163
                # Skip comparison if we could not fetch AGENT/SYSTEM status
154
164
                #_______________________________________________________________________
155
 
 
 
165
                
156
166
                next unless (defined($agent_status->{$host}));
157
167
                next unless (defined($system_status->{$host}));
158
168
 
163
173
                #_______________________________________________________________________
164
174
 
165
175
                if ($agent_status->{$host}->{state} ne 'UNKNOWN' && $agent_status->{$host}->{state} ne $agent->state) {
166
 
                        ERROR "(startup) Switching to passive mode: Agent state '", $agent_status->{$host}->{state}, "' differs from stored one '", $agent->state, "' for host '$host'.";
 
176
                        ERROR "Switching to passive mode: Agent state '", $agent_status->{$host}->{state}, "' differs from stored one '", $agent->state, "' for host '$host'.";
167
177
                        $status                 = 0;
168
178
                        $host_status    = 0;
169
179
                        next;
186
196
                        next if ($diff->Same);
187
197
 
188
198
                        ERROR sprintf(
189
 
                                "(startup) Switching to passive mode: Roles of host '$host' [%s] differ from stored ones [%s]",
 
199
                                "Switching to passive mode: Roles of host '$host' [%s] differ from stored ones [%s]",
190
200
                                join(', ', @{$system_status->{$host}->{roles}}),
191
201
                                join(', ', @{$agent->roles})
192
202
                        );
199
209
                foreach my $role (@{$agent->roles}) {
200
210
                        next if ($self->roles->is_active_master_role($role));
201
211
                        next if ($system_status->{$host}->{writable});
202
 
                        WARN "(startup) Active master $host was not writable at monitor startup. (Don't mind, the host will be made writable soon)"
 
212
                        WARN "Active master $host was not writable at monitor startup. (Don't mind, the host will be made writable soon)"
203
213
                }
204
214
                
205
215
        }
236
246
                }
237
247
                my $status_str = sprintf("\nStored status:\n%s\nAgent status:\n%s\nSystem status:\n%s", $agents->get_status_info(), $agent_status_str, $system_status_str);
238
248
                $self->passive_info("Discrepancies between stored status, agent status and system status during startup.\n" . $status_str);
239
 
                FATAL "(startup) Switching to PASSIVE MODE!!! $status_str";
 
249
                FATAL "Switching to passive mode now."; # TODO verbessern: besser erklären
 
250
                INFO $status_str;
240
251
 
241
252
                foreach my $host (keys(%{$main::config->{host}})) {
242
253
                        my $agent = $agents->get($host);
262
273
                        $agent->roles(\@roles);
263
274
                }
264
275
 
 
276
                WARN "Monitor started in passive mode.";
 
277
 
265
278
                return;
266
279
        }
267
280
 
272
285
 
273
286
                # Set new hosts to AWAITING_RECOVERY
274
287
                if ($agent->state eq 'UNKNOWN') {
275
 
                        WARN "(startup) Detected new host '$host': Setting its initial state to 'AWAITING_RECOVERY'. Use 'mmm_control set_online $host' to switch it online.";
 
288
                        WARN "Detected new host '$host': Setting its initial state to 'AWAITING_RECOVERY'. Use 'mmm_control set_online $host' to switch it online.";
276
289
                        $agent->state('AWAITING_RECOVERY');
277
290
                }
278
291
 
279
292
                # Apply roles loaded from status file
280
293
                foreach my $role (@{$agent->roles}) {
281
294
                        unless ($self->roles->exists_ip($role->name, $role->ip)) {
282
 
                                WARN "(startup) Detected change in role definitions: Role '$role' was removed.";
 
295
                                WARN "Detected change in role definitions: Role '$role' was removed.";
283
296
                                next;
284
297
                        }
285
298
                        unless ($self->roles->can_handle($role->name, $host)) {
286
 
                                WARN "(startup) Detected change in role definitions: Host '$host' can't handle role '$role' anymore.";
 
299
                                WARN "Detected change in role definitions: Host '$host' can't handle role '$role' anymore.";
287
300
                                next;
288
301
                        }
289
302
                        $self->roles->set_role($role->name, $role->ip, $host);
290
303
                }
291
304
        }
292
305
 
 
306
        INFO "Monitor started in active mode."  unless ($self->passive);
 
307
        WARN "Monitor started in passive mode." if ($self->passive);
293
308
}
294
309
 
295
310
 
405
420
 
406
421
                        # ONLINE -> HARD_OFFLINE
407
422
                        unless ($ping && $mysql) {
408
 
                                FATAL "State of host '$host' changed from $state to HARD_OFFLINE";
 
423
                                FATAL sprintf("State of host '%s' changed from %s to HARD_OFFLINE (ping: %s, mysql: %s)", $host, $state, ($ping? 'OK' : 'not OK'), ($mysql? 'OK' : 'not OK'));
409
424
                                $agent->state('HARD_OFFLINE');
410
425
                                $self->roles->clear_host_roles($host);
411
426
                                $self->send_agent_status($host);
479
494
                ########################################################################
480
495
 
481
496
                if ($state eq 'REPLICATION_FAIL') {
482
 
                # REPLICATION_FAIL -> REPLICATION_DELAY
 
497
                        # REPLICATION_FAIL -> REPLICATION_DELAY
483
498
                        if ($ping && $mysql && !$rep_backlog && $rep_threads) {
484
499
                                FATAL "State of host '$host' changed from $state to REPLICATION_DELAY";
485
500
                                $agent->state('REPLICATION_DELAY');
487
502
                        }
488
503
                }
489
504
                if ($state eq 'REPLICATION_DELAY') {
490
 
                # REPLICATION_DELAY -> REPLICATION_FAIL
 
505
                        # REPLICATION_DELAY -> REPLICATION_FAIL
491
506
                        if ($ping && $mysql && !$rep_threads) {
492
507
                                FATAL "State of host '$host' changed from $state to REPLICATION_FAIL";
493
508
                                $agent->state('REPLICATION_FAIL');
498
513
                ########################################################################
499
514
 
500
515
                if ($state eq 'REPLICATION_DELAY' || $state eq 'REPLICATION_FAIL') {
501
 
                        if ($ping && $mysql && (($rep_backlog && $rep_threads) || $peer_state ne 'ONLINE')
502
 
                        ) {
 
516
                        if ($ping && $mysql && (($rep_backlog && $rep_threads) || $peer_state ne 'ONLINE')) {
503
517
 
504
518
                                # REPLICATION_DELAY || REPLICATION_FAIL -> AWAITING_RECOVERY
505
519
                                if ($agent->flapping) {
516
530
                                next;
517
531
                        }
518
532
 
519
 
                # REPLICATION_DELAY || REPLICATION_FAIL -> HARD_OFFLINE
 
533
                        # REPLICATION_DELAY || REPLICATION_FAIL -> HARD_OFFLINE
520
534
                        unless ($ping && $mysql) {
521
 
                                FATAL "State of host '$host' changed from $state to HARD_OFFLINE";
 
535
                                FATAL sprintf("State of host '%s' changed from %s to HARD_OFFLINE (ping: %s, mysql: %s)", $host, $state, ($ping? 'OK' : 'not OK'), ($mysql? 'OK' : 'not OK'));
522
536
                                $agent->state('HARD_OFFLINE');
523
537
                                $self->send_agent_status($host);
524
538
                                # TODO kill host (remove ips, drop connections, iptable connections, ...) if sending state was not ok
635
649
                        $agent->agent_down(1);
636
650
                }
637
651
        }
638
 
        elsif ($agent->agent_down()) {
 
652
        elsif ($agent->agent_down) {
639
653
                FATAL "Agent on host '$host' is reachable again";
640
654
                $agent->agent_down(0);
641
655
        }