3
A brief file description
5
@section license License
7
Licensed to the Apache Software Foundation (ASF) under one
8
or more contributor license agreements. See the NOTICE file
9
distributed with this work for additional information
10
regarding copyright ownership. The ASF licenses this file
11
to you under the Apache License, Version 2.0 (the
12
"License"); you may not use this file except in compliance
13
with the License. You may obtain a copy of the License at
15
http://www.apache.org/licenses/LICENSE-2.0
17
Unless required by applicable law or agreed to in writing, software
18
distributed under the License is distributed on an "AS IS" BASIS,
19
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
See the License for the specific language governing permissions and
21
limitations under the License.
27
* Function defs for the Alarms keeper.
29
* $Date: 2007-10-05 16:56:44 $
40
#include "TSControlMain.h"
45
#include "P_RecCore.h"
47
// Table of human-readable alarm descriptions. Alarms::getAlarmText() indexes
// this array directly with the alarm id, so entry order is significant.
// NOTE(review): this listing shows only some of the entries and does not show
// the closing brace of the initializer — consult the full file before editing.
const char *alarmText[] = {
49
"[TrafficManager] Traffic Server process was reset.",
50
"[TrafficManager] Traffic Server process established.",
53
"Invalid Configuration",
60
"Mgmt Debugging Alarm",
61
"Configuration File Update Failed",
62
"Unable to Establish Manager User-Interface Services",
67
"HTTP Origin Server is Congested",
68
"Congested HTTP Origin Server is now Alleviated",
69
"", /* congested server */
70
"" /* alleviated server */
73
// Number of entries in alarmText, derived from the array's size so it stays
// in step with the initializer; used as the upper bound in getAlarmText().
const int alarmTextNum = sizeof(alarmText) / sizeof(char *);
80
// Constructor body (the signature line is not visible in this listing).
// Creates the three string-keyed hash tables: registered callbacks, alarms
// raised locally, and alarms tracked for remote peers.
cblist = ink_hash_table_create(InkHashTableKeyType_String);
81
local_alarms = ink_hash_table_create(InkHashTableKeyType_String);
82
remote_alarms = ink_hash_table_create(InkHashTableKeyType_String);
83
// The member functions below acquire this mutex around their use of the
// alarm and callback tables.
ink_mutex_init(&mutex, "alarms-mutex");
84
// Name of the external alarm script, read from the records configuration.
alarm_bin = REC_readString("proxy.config.alarm.bin", &found);
86
// Directory of the alarm script: the explicit absolute path is tried first...
alarm_bin_path = REC_readString("proxy.config.alarm.abs_path", &found);
88
// ...and the general binary directory is used when that record yields NULL.
if (!alarm_bin_path) {
89
alarm_bin_path = REC_readString("proxy.config.bin_path", &found);
92
// OEM alarm keys start at minOEMkey; signalAlarm() maps MGMT_ALARM_ADD_ALARM
// into the [minOEMkey, maxOEMkey) range using this counter.
alarmOEMcount = minOEMkey;
95
} /* End Alarms::Alarms */
100
// Teardown of what the constructor set up (the signature line is not visible
// in this listing). The callback table is destroyed without freeing its
// values, which are function pointers (see registerCallback).
ink_hash_table_destroy(cblist);
101
// The alarm tables' values are freed together with the tables.
// NOTE(review): whether each Alarm's description string is also released here
// cannot be determined from this listing — confirm against the helper.
ink_hash_table_destroy_and_xfree_values(local_alarms);
102
ink_hash_table_destroy_and_xfree_values(remote_alarms);
103
ink_mutex_destroy(&mutex);
105
// NOTE(review): the closing label below repeats the constructor's; since this
// block destroys the tables and the mutex it is presumably Alarms::~Alarms —
// confirm and correct the label.
} /* End Alarms::Alarms */
109
// Registers `func` to be invoked by signalAlarm() for every signalled alarm.
// The callback is stored in `cblist` under a key formed from the running
// counter `cur_cb`, so each registration gets its own slot.
Alarms::registerCallback(AlarmCallbackFunc func)
113
ink_mutex_acquire(&mutex);
114
// Key is the decimal text of the counter; the counter is post-incremented so
// the next registration uses a different key.
snprintf(cb_buf, sizeof(cb_buf), "%d", cur_cb++);
115
Debug("alarm", "[Alarms::registerCallback] Registering Alarms callback\n");
116
// The function pointer itself is stored as the table value.
ink_hash_table_insert(cblist, cb_buf, (void *) func);
117
ink_mutex_release(&mutex);
119
} /* End Alarms::registerCallback */
123
// Looks up whether alarm `a` is currently recorded: in `local_alarms` when
// `ip` is NULL, otherwise in `remote_alarms` for the peer `ip`.
// NOTE(review): the return type, the result variable and the branch bodies
// are not visible in this listing.
Alarms::isCurrentAlarm(alarm_t a, char *ip)
127
InkHashTableValue hash_value;
129
ink_mutex_acquire(&mutex);
131
// Two key formats appear: "<alarm>" and "<alarm>-<ip>". Presumably they sit
// in the two arms of an if/else on `ip` whose lines are missing here.
snprintf(buf, sizeof(buf), "%d", a);
133
snprintf(buf, sizeof(buf), "%d-%s", a, ip);
136
// A non-zero lookup result means the key is present in the table.
if (!ip && ink_hash_table_lookup(local_alarms, buf, &hash_value) != 0) {
138
} else if (ip && ink_hash_table_lookup(remote_alarms, buf, &hash_value) != 0) {
141
ink_mutex_release(&mutex);
143
} /* End Alarms::isCurrentAlarm */
147
// Resolves (removes) alarm `a`. With `ip` NULL the alarm is removed from
// `local_alarms`; otherwise an "aresolv" message is sent to the peer `ip` and
// the entry is removed from `remote_alarms`.
// NOTE(review): several lines of this function (return type, braces, the
// freeing of the Alarm object itself) are not visible in this listing.
Alarms::resolveAlarm(alarm_t a, char *ip)
150
InkHashTableValue hash_value;
152
ink_mutex_acquire(&mutex);
154
// Key formats: "<alarm>" and "<alarm>-<ip>"; presumably selected by an
// if/else on `ip` whose lines are missing here.
snprintf(buf, sizeof(buf), "%d", a);
156
snprintf(buf, sizeof(buf), "%d-%s", a, ip);
159
if (!ip && ink_hash_table_lookup(local_alarms, buf, &hash_value) != 0) {
160
// Remove the table entry first; hash_value still points at the Alarm, so
// its description can be released afterwards.
ink_hash_table_delete(local_alarms, buf);
161
if (((Alarm *) hash_value)->description) {
162
xfree(((Alarm *) hash_value)->description);
165
} else if (ip && ink_hash_table_lookup(remote_alarms, buf, &hash_value) != 0) {
168
// Tell the peer that owns the alarm that it has been resolved.
snprintf(buf2, sizeof(buf2), "aresolv: %d\n", a);
169
// If the send fails, the mutex is released before leaving this branch
// (the statement that leaves it is not visible here).
if (!lmgmt->ccom->sendReliableMessage(inet_addr(ip), buf2, strlen(buf2))) {
170
ink_mutex_release(&mutex);
173
ink_hash_table_delete(remote_alarms, buf);
176
ink_mutex_release(&mutex);
179
} /* End Alarms::resolveAlarm */
183
// Signals alarm `a` with optional description `desc`. `ip` is NULL for an
// alarm raised locally and the peer's address string for a remote one.
// Visible steps: assign a priority by alarm type, suppress repeated local
// alarms for 15 minutes, record the alarm in `local_alarms` or
// `remote_alarms`, timestamp the description, optionally queue an event for
// the management API, and invoke every registered callback.
// NOTE(review): many lines of this function are missing from this listing
// (switch header, else arms, braces, early returns); the comments below only
// describe what the visible lines show.
Alarms::signalAlarm(alarm_t a, const char *desc, const char *ip)
185
// State kept across calls for the repeat-suppression logic below.
static time_t last_sent = 0;
186
static char prev_alarm_text[2048] = "";
191
InkHashTableValue hash_value;
192
InkHashTableEntry *entry;
193
InkHashTableIteratorState iterator_state;
195
/* Assign correct priorities */
197
case MGMT_ALARM_PROXY_CACHE_ERROR:
198
priority = 1; // INKqa07595
200
case MGMT_ALARM_PROXY_CACHE_WARNING:
202
case MGMT_ALARM_PROXY_PEER_BORN:
205
case MGMT_ALARM_PROXY_PEER_DIED:
208
case MGMT_ALARM_PING_FAILURE:
211
case MGMT_ALARM_PROXY_PROCESS_DIED:
214
case MGMT_ALARM_PROXY_PROCESS_BORN:
215
mgmt_log(stderr, "[Alarms::signalAlarm] Server Process born\n");
218
case MGMT_ALARM_ADD_ALARM:
221
case MGMT_ALARM_PROXY_HTTP_CONGESTED_SERVER:
222
case MGMT_ALARM_PROXY_HTTP_ALLEVIATED_SERVER:
224
case MGMT_ALARM_WDA_BILLING_CONNECTION_DIED:
225
case MGMT_ALARM_WDA_BILLING_CORRUPTED_DATA:
226
case MGMT_ALARM_WDA_XF_ENGINE_DOWN:
234
/* Quick hack to buffer repeat alarms and only send every 15 min */
235
// Applies only to local (ip == NULL) alarms of priority 1 or 2 that carry a
// description.
if (desc && (priority == 1 || priority == 2) && !ip) {
237
if (strcmp(prev_alarm_text, desc) == 0) { /* a repeated alarm */
239
/* INKqa11884: repeated wireless alarms always signalled */
240
if (a != MGMT_ALARM_WDA_BILLING_CONNECTION_DIED &&
241
a != MGMT_ALARM_WDA_BILLING_CORRUPTED_DATA &&
242
a != MGMT_ALARM_WDA_XF_ENGINE_DOWN) {
244
time_t time_delta = time(0) - last_sent;
245
// 900 seconds == the 15 minute window mentioned above.
if (time_delta < 900) {
246
// NOTE(review): this mgmt_log call has no leading stderr argument, unlike
// the call earlier in this function — confirm both overloads exist.
mgmt_log("[Alarms::signalAlarm] Skipping Alarm: '%s'\n", desc);
253
// Remember this description so the next call can detect a repeat.
ink_strncpy(prev_alarm_text, desc, sizeof(prev_alarm_text));
258
Debug("alarm", "[Alarms::signalAlarm] Sending Alarm: '%s'", desc);
261
// Default description taken from the static alarm text table.
desc = (char *) getAlarmText(a);
264
* Exec alarm bin for priority alarms everytime, regardless if they are
265
* potentially duplicates. However, only exec this for you own alarms,
266
* don't want every node in the cluster reporting the same alarm.
268
if (priority == 1 && alarm_bin && alarm_bin_path && !ip) {
274
ink_mutex_acquire(&mutex);
276
// if an OEM alarm, then must create the unique key alarm type;
277
// this key is used to hash the new OEM alarm description in the hash table
278
if (a == MGMT_ALARM_ADD_ALARM) {
279
// Map the running OEM counter into the range [minOEMkey, maxOEMkey).
a = (alarmOEMcount - minOEMkey) % (maxOEMkey - minOEMkey) + minOEMkey;
282
// Local alarms are keyed by the alarm number alone.
snprintf(buf, sizeof(buf), "%d", a);
283
if (ink_hash_table_lookup(local_alarms, buf, &hash_value) != 0) {
284
// INKqa11884: if wireless alarm already active, just
285
// update desc with new timestamp and skip to actions part
286
if (a == MGMT_ALARM_WDA_BILLING_CONNECTION_DIED ||
287
a == MGMT_ALARM_WDA_BILLING_CORRUPTED_DATA ||
288
a == MGMT_ALARM_WDA_XF_ENGINE_DOWN) {
289
Debug("alarm", "[signalAlarm] wireless alarm already active");
290
atmp = (Alarm *) hash_value;
293
ink_mutex_release(&mutex);
298
// Remote alarms are keyed by "<alarm>-<ip>".
snprintf(buf, sizeof(buf), "%d-%s", a, ip);
299
if (ink_hash_table_lookup(remote_alarms, buf, &hash_value) != 0) {
300
// Reset the seen flag so that we know the remote alarm is
302
atmp = (Alarm *) hash_value;
305
// INKqa11884: if wireless alarm already active, just
306
// update desc with new timestamp and skip to actions part
307
if (a == MGMT_ALARM_WDA_BILLING_CONNECTION_DIED ||
308
a == MGMT_ALARM_WDA_BILLING_CORRUPTED_DATA ||
309
a == MGMT_ALARM_WDA_XF_ENGINE_DOWN) {
310
Debug("alarm", "[Alarms::signalAlarm] wireless alarm already active");
313
ink_mutex_release(&mutex);
319
// NOTE(review): the allocation and assignment happen inside ink_assert; if
// ink_assert can compile to nothing in some builds, atmp would never be set.
// Confirm how ink_assert is defined.
ink_assert((atmp = (Alarm *) xmalloc(sizeof(Alarm))));
323
atmp->priority = priority;
324
atmp->description = NULL;
328
// Local alarm: no peer address.
atmp->inet_address = 0;
329
ink_hash_table_insert(local_alarms, (InkHashTableKey) (buf), (atmp));
332
// Remote alarm: record the peer's address.
atmp->inet_address = inet_addr(ip);
333
ink_hash_table_insert(remote_alarms, (InkHashTableKey) (buf), (atmp));
337
// Swap desc with time-stamped description. Kinda hackish
338
// Temporary until we get a new
339
// alarm system in place. TS 5.0.0, 02/08/2001
342
char my_ctime_str[32];
344
ink_ctime_r(&my_time_t, my_ctime_str);
345
// Scan to the newline or the terminator of the time string (the loop body is
// not visible here).
char *p = my_ctime_str;
346
while (*p != '\n' && *p != '\0')
351
// Room for "[" + time + "] " + desc + NUL: the two lengths plus 4.
const size_t new_desc_size = sizeof(char) * (strlen(desc) + strlen(my_ctime_str) + 4);
352
// NOTE(review): assignment inside ink_assert, as above; alloca memory lives
// only until this function returns.
ink_assert(new_desc = (char *) alloca(new_desc_size));
353
snprintf(new_desc, new_desc_size, "[%s] %s", my_ctime_str, desc);
355
// Replace any previous description held by this alarm with a fresh copy.
// NOTE(review): as visible here the copy is taken from `desc`; a reassignment
// of desc to new_desc may sit on a line missing from this listing — confirm.
if (atmp->description)
356
xfree(atmp->description);
357
const size_t atmp_desc_size = sizeof(char) * (strlen(desc) + 1);
358
ink_assert(atmp->description = (char *) xmalloc(atmp_desc_size));
359
ink_strncpy(atmp->description, desc, atmp_desc_size);
361
ink_mutex_release(&mutex);
363
#if defined(MGMT_API)
364
if (mgmt_alarm_event_q) {
365
// ADDED CODE here is where we Add to the queue of alarms one more
366
EventNoticeForm *new_alarm;
368
new_alarm = (EventNoticeForm *) xmalloc(sizeof(EventNoticeForm));
370
Debug("alarm", "can't xmalloc so can't create new alarm struct.\n");
373
// allocated space copy over values
374
// remember AlarmID start from 0 exactly 1 off but everything else
376
new_alarm->alarm_t = (AlarmID) (atmp->type - 1);
377
new_alarm->priority = atmp->priority;
378
new_alarm->linger = atmp->linger;
379
new_alarm->local = atmp->local;
380
new_alarm->seen = atmp->seen;
382
new_alarm->inet_address = atmp->inet_address;
383
// The queued event gets its own copy of the description, if there is one.
if (!atmp->description)
384
new_alarm->description = NULL;
386
new_alarm->description = (char *) xmalloc(sizeof(char) * (strlen(atmp->description) + 1));
387
if (!new_alarm->description)
388
new_alarm->description = NULL; // rather have alarm without description than drop it completely
390
strcpy(new_alarm->description, atmp->description);
393
// new alarm is complete now add it
394
// The event queue has its own lock, separate from the Alarms mutex.
ink_mutex_acquire(&mgmt_alarm_event_q->mgmt_alarm_lock);
397
enqueue(mgmt_alarm_event_q->mgmt_alarm_q, new_alarm);
399
ink_mutex_release(&mgmt_alarm_event_q->mgmt_alarm_lock);
403
// Invoke every registered callback with the alarm and copies of ip and desc.
for (entry = ink_hash_table_iterator_first(cblist, &iterator_state);
404
entry != NULL; entry = ink_hash_table_iterator_next(cblist, &iterator_state)) {
406
// NOTE(review): the loop iterates cblist, yet the value is fetched with
// remote_alarms as the table argument — confirm the table argument is
// irrelevant to ink_hash_table_entry_value, or pass cblist.
AlarmCallbackFunc func = (AlarmCallbackFunc) ink_hash_table_entry_value(remote_alarms, entry);
408
// Heap copy of the ip string handed to the callback.
const size_t tmp_size = sizeof(char) * (strlen(ip) + 1);
409
ink_assert((tmp = (char *) xmalloc(tmp_size)));
410
ink_strncpy(tmp, ip, tmp_size);
416
// Heap copy of the description handed to the callback.
const size_t tmp2_size = sizeof(char) * (strlen(desc) + 1);
417
ink_assert((tmp2 = (char *) xmalloc(tmp2_size)));
418
ink_strncpy(tmp2, desc, tmp2_size);
422
Debug("alarm", "[Alarms::signalAlarm] invoke callback for %d", a);
423
// NOTE(review): who frees tmp and tmp2 is not visible here — presumably the
// callback takes ownership; confirm.
(*(func)) (a, tmp, tmp2);
425
/* Priority 2 alarms get signalled if they are the first unsolved occurrence. */
426
if (priority == 2 && alarm_bin && alarm_bin_path && !ip) {
431
} /* End Alarms::signalAlarm */
436
* Function resets the "seen" flag for a given peer's alarms. This allows
437
* us to flush alarms that may have expired naturally or were dealt.
440
// Walks every entry of `remote_alarms` under the mutex and selects those whose
// key contains `ip`. The statement that acts on a matching alarm is not
// visible in this listing (presumably it clears tmp->seen — confirm).
Alarms::resetSeenFlag(char *ip)
442
InkHashTableEntry *entry;
443
InkHashTableIteratorState iterator_state;
445
ink_mutex_acquire(&mutex);
446
for (entry = ink_hash_table_iterator_first(remote_alarms, &iterator_state);
447
entry != NULL; entry = ink_hash_table_iterator_next(remote_alarms, &iterator_state)) {
449
char *key = (char *) ink_hash_table_entry_key(remote_alarms, entry);
450
Alarm *tmp = (Alarm *) ink_hash_table_entry_value(remote_alarms, entry);
452
// Remote keys have the form "<alarm>-<ip>" (see signalAlarm).
// NOTE(review): strstr is a substring match, so ip "10.0.0.1" also matches a
// key written for "10.0.0.10" — confirm whether that is acceptable.
if (strstr(key, ip)) {
456
ink_mutex_release(&mutex);
458
} /* End Alarms::resetSeenFlag */
463
* This function is a sweeper function to clean up those alarms that have
464
* been taken care of through other local managers or at the peer itself.
467
// Removes from `remote_alarms` every alarm whose key contains `ip` and whose
// `seen` flag is not set, releasing the alarm's description string.
// NOTE(review): the release of the Alarm object itself is not visible in this
// listing.
Alarms::clearUnSeen(char *ip)
469
InkHashTableEntry *entry;
470
InkHashTableIteratorState iterator_state;
472
ink_mutex_acquire(&mutex);
473
for (entry = ink_hash_table_iterator_first(remote_alarms, &iterator_state);
474
entry != NULL; entry = ink_hash_table_iterator_next(remote_alarms, &iterator_state)) {
476
char *key = (char *) ink_hash_table_entry_key(remote_alarms, entry);
477
Alarm *tmp = (Alarm *) ink_hash_table_entry_value(remote_alarms, entry);
479
// NOTE(review): strstr is a substring match on the "<alarm>-<ip>" key, so a
// shorter ip can match a longer one that contains it.
if (strstr(key, ip)) { /* Make sure alarm is for correct ip */
480
if (!tmp->seen) { /* Make sure we did not see it in peer's report */
481
// NOTE(review): the entry is deleted while the iterator is still live, and
// `key` is used as the delete argument; the original author also questioned
// this. Confirm the hash table tolerates deletion during iteration.
ink_hash_table_delete(remote_alarms, key); /* Safe in iterator? */
482
xfree(tmp->description);
487
ink_mutex_release(&mutex);
489
} /* End Alarms::clearUnSeen */
493
* constructAlarmMessage(...)
494
* This function builds a message buffer for passing to peers. It basically
495
* takes the current list of local alarms and builds an alarm message.
498
// Builds the alarm message for peers into `message` (capacity `max` bytes):
// the shared packet header, a "type: alarm" line, then one "alarm: ..." line
// per entry of `local_alarms`, or "alarm: none" when no alarm line was added.
// `n` tracks the number of bytes written so far.
// NOTE(review): the bodies of the overflow branches and some bookkeeping
// (e.g. where `bsum` is set and where `n` advances in the loop) are not
// visible in this listing.
Alarms::constructAlarmMessage(char *ip, char *message, int max)
502
InkHashTableEntry *entry;
503
InkHashTableIteratorState iterator_state;
508
// Insert the standard mcast packet header
509
n = ClusterCom::constructSharedPacketHeader(message, ip, max);
511
ink_mutex_acquire(&mutex);
512
// Overflow guard: taken when the type line would not fit within max.
if (!((n + (int) strlen("type: alarm\n")) < max)) {
519
ink_strncpy(&message[n], "type: alarm\n", max - n);
520
n += strlen("type: alarm\n");
522
// Append one line per local alarm while there is still room.
for (entry = ink_hash_table_iterator_first(local_alarms, &iterator_state);
523
(entry != NULL && n < max); entry = ink_hash_table_iterator_next(local_alarms, &iterator_state)) {
525
// NOTE(review): the loop iterates local_alarms, yet the value is fetched with
// remote_alarms as the table argument — confirm the table argument is
// irrelevant to ink_hash_table_entry_value, or pass local_alarms.
Alarm *tmp = (Alarm *) ink_hash_table_entry_value(remote_alarms, entry);
527
if (tmp->description) {
528
snprintf(buf, sizeof(buf), "alarm: %d %s\n", tmp->type, tmp->description);
530
// Used when the alarm has no description (the else line is not visible).
snprintf(buf, sizeof(buf), "alarm: %d No details available\n", tmp->type);
533
// Overflow guard for the formatted alarm line.
if (!((n + (int) strlen(buf)) < max)) {
536
ink_strncpy(&message[n], buf, max - n);
540
// n still equal to bsum means the loop added nothing.
if (n == bsum) { /* No alarms */
541
if (!((n + (int) strlen("alarm: none\n")) < max)) {
547
ink_strncpy(&message[n], "alarm: none\n", max - n);
548
n += strlen("alarm: none\n");
550
ink_mutex_release(&mutex);
552
} /* End Alarms::constructAlarmMessage */
556
* checkSystemNAlert(...)
557
* This function should test the system and signal local alarms. Sending
558
* out remote notification commands if necessary.
561
// No statements of this function are visible in this listing.
// NOTE(review): the body may be empty or simply missing here — confirm
// against the full file.
Alarms::checkSystemNAlert()
564
} /* End Alarms::checkSystemNAlert */
567
// Runs the configured alarm script (`alarm_bin` in `alarm_bin_path`) with the
// alarm description `desc`. Two platform variants are visible: a fork/exec
// path in which the parent polls the child up to a timeout, and a Windows
// CreateProcess path. The e-mail related records are read first and freed at
// the end.
// NOTE(review): the preprocessor conditionals, braces and several statements
// of this function are missing from this listing; comments describe only the
// visible lines.
Alarms::execAlarmBin(const char *desc)
570
char *alarm_email_from_name = 0;
571
char *alarm_email_from_addr = 0;
572
char *alarm_email_to_addr = 0;
576
// Each record read is followed by a reset to 0; presumably that reset is
// guarded by a check on `found` whose line is not visible here.
alarm_email_from_name = REC_readString("proxy.config.product_name", &found);
578
alarm_email_from_name = 0;
579
alarm_email_from_addr = REC_readString("proxy.config.admin.admin_user", &found);
581
alarm_email_from_addr = 0;
582
alarm_email_to_addr = REC_readString("proxy.config.alarm_email", &found);
584
alarm_email_to_addr = 0;
591
// Build the full path of the script from its directory and file name.
ink_filepath_make(cmd_line, sizeof(cmd_line), alarm_bin_path, alarm_bin);
594
// Two fork variants are visible (fork and fork1); presumably selected by a
// platform #ifdef that is not visible here.
if ((pid = fork()) < 0)
596
if ((pid = fork1()) < 0)
599
mgmt_elog(stderr, "[Alarms::execAlarmBin] Unable to fork1 process\n");
600
} else if (pid > 0) { /* Parent */
602
bool script_done = false;
603
// Maximum time the script may run, from the records configuration.
time_t timeout = (time_t) REC_readInteger("proxy.config.alarm.script_runtime", &found);
605
timeout = 5; // default time = 5 secs
606
time_t time_delta = 0;
607
time_t first_time = time(0);
608
// Poll the child without blocking until the timeout elapses.
while (time_delta <= timeout) {
609
// waitpid will return child's pid if status is available
610
// or -1 if there is some problem; returns 0 if child status
612
if (waitpid(pid, &status, WNOHANG) != 0) {
613
Debug("alarm", "[Alarms::execAlarmBin] child pid %d has status", pid);
617
time_delta = time(0) - first_time;
619
// need to kill the child script process if it's not complete
621
Debug("alarm", "[Alarms::execAlarmBin] kill child pid %d", pid);
623
waitpid(pid, &status, 0); // blocking wait to reap the child process
627
// Child side: pass the e-mail details to the script only when all three are
// available; otherwise pass just the description.
if (alarm_email_from_name && alarm_email_from_addr && alarm_email_to_addr) {
628
res = execl(cmd_line, alarm_bin, desc, alarm_email_from_name, alarm_email_from_addr, alarm_email_to_addr, (char*)NULL);
630
res = execl(cmd_line, alarm_bin, desc, (char*)NULL);
638
// Windows variant: choose how to launch based on the script's extension.
char *fileExt = NULL;
640
// NOTE(review): strchr finds the FIRST '.', so a name containing several dots
// yields a longer suffix than the real extension.
if ((fileExt = strchr(alarm_bin, '.')) != NULL) {
641
if (ink_strcasecmp(fileExt, ".CMD") == 0 || ink_strcasecmp(fileExt, ".BAT") == 0) {
647
// NOTE(review): this call passes three arguments while the call earlier in
// this function passes four (including a buffer size) — confirm the signature.
ink_filepath_make(cmd_line, alarm_bin_path, alarm_bin);
649
// NOTE(review): sprintf into cmd_line is not bounded by the buffer size.
sprintf(cmd_line, "CMD.EXE /C \"%s\\%s\"", alarm_bin_path, alarm_bin);
652
// The alarm text and admin address are handed to the script through
// environment variables.
SetEnvironmentVariable("TRAFFIC_SERVER_ALARM_MSG", desc);
653
SetEnvironmentVariable("ADMIN_EMAIL", alarm_email_to_addr);
656
PROCESS_INFORMATION procInfo;
657
ZeroMemory((PVOID) & suInfo, sizeof(suInfo));
659
// hide the new console window from the user
660
suInfo.cb = sizeof(STARTUPINFO);
661
suInfo.dwFlags = STARTF_USESHOWWINDOW;
662
suInfo.wShowWindow = SW_HIDE;
664
if (CreateProcess(NULL, cmd_line, NULL, // FIX THIS: process security attributes
665
NULL, // FIX THIS: thread security attributes
666
FALSE, // no need to make handles inheritable
667
0, // FIX THIS: specify a priority
668
NULL, // FIX THIS: specify environment variables
669
ts_base_dir, // make script run from TSBase
670
&suInfo, &procInfo) == FALSE) {
671
mgmt_elog(stderr, "[Alarm::execAlarmBin] CreateProcess error: %s\n", ink_last_err());
673
// Close the process and thread handles from procInfo.
CloseHandle(procInfo.hThread);
674
CloseHandle(procInfo.hProcess);
680
// Release the strings obtained from REC_readString above.
if (alarm_email_from_name)
681
xfree(alarm_email_from_name);
682
if (alarm_email_from_addr)
683
xfree(alarm_email_from_addr);
684
if (alarm_email_to_addr)
685
xfree(alarm_email_to_addr);
692
// returns the corresponding text for the alarm id
695
// Returns the descriptive text for alarm `id`. The three wireless (WDA)
// alarms have dedicated messages defined below; other ids index the
// `alarmText` table, and an id at or beyond alarmTextNum falls back to
// alarmText[0].
// NOTE(review): the return type line and the switch statement's header and
// braces are not visible in this listing.
Alarms::getAlarmText(alarm_t id)
697
const char *wda_conn_died = "The connection to the billing system is broken. Unable to retrieve user profile.";
698
const char *wda_corr_data =
699
"Could not read user profile or URL list from the billing system. The data received doesn't have the expected format.";
700
const char *wda_xf_down = "The XF engine heartbeat could not be properly detected. It appears dead.";
703
case MGMT_ALARM_WDA_BILLING_CONNECTION_DIED:
704
return wda_conn_died;
705
case MGMT_ALARM_WDA_BILLING_CORRUPTED_DATA:
706
return wda_corr_data;
707
case MGMT_ALARM_WDA_XF_ENGINE_DOWN:
710
// NOTE(review): only the upper bound is checked; if alarm_t is a signed type
// a negative id would index before the start of the table — confirm.
if (id < alarmTextNum)
711
return alarmText[id];
713
return alarmText[0]; // "Unknown Alarm";