3
# Licensed to the Apache Software Foundation (ASF) under one
4
# or more contributor license agreements. See the NOTICE file
5
# distributed with this work for additional information
6
# regarding copyright ownership. The ASF licenses this file
7
# to you under the Apache License, Version 2.0 (the
8
# "License"); you may not use this file except in compliance
9
# with the License. You may obtain a copy of the License at
11
# http://www.apache.org/licenses/LICENSE-2.0
13
# Unless required by applicable law or agreed to in writing,
14
# software distributed under the License is distributed on an
15
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16
# KIND, either express or implied. See the License for the
17
# specific language governing permissions and limitations
# under the License.
22
# NAGIOS SERVER Check (status log update)
26
notification_options w,u,c
27
first_notification_delay 0
28
notification_interval 0 ; Send the notification once
32
hostgroup_name nagios-server
34
service_description NAGIOS::Nagios status log freshness
36
check_command check_nagios!10!@STATUS_DAT@!@NAGIOS_BIN@
37
normal_check_interval 5
38
retry_check_interval 0.5
43
hostgroup_name nagios-server
45
service_description HDFS::Percent DataNodes with space available
47
check_command check_aggregate!"DATANODE::DataNode space"!10%!30%
48
normal_check_interval 2
49
retry_check_interval 1
54
hostgroup_name nagios-server
56
service_description HDFS::Percent DataNodes live
58
check_command check_aggregate!"DATANODE::DataNode process"!10%!30%
59
normal_check_interval 0.5
60
retry_check_interval 0.25
66
hostgroup_name namenode
68
service_description NAMENODE::NameNode Web UI
70
check_command check_webui!namenode
71
normal_check_interval 1
72
retry_check_interval 1
77
hostgroup_name namenode
79
service_description NAMENODE::NameNode edit logs directory status
81
check_command check_name_dir_status!50070
82
normal_check_interval 0.5
83
retry_check_interval 0.5
88
hostgroup_name namenode
90
service_description NAMENODE::NameNode Host CPU utilization
92
check_command check_cpu!200%!250%
93
normal_check_interval 5
94
retry_check_interval 2
99
hostgroup_name namenode
101
service_description NAMENODE::NameNode process
103
check_command check_tcp!8020!-w 1 -c 1
104
normal_check_interval 0.5
105
retry_check_interval 0.25
110
hostgroup_name namenode
112
service_description HDFS::Blocks health
114
check_command check_hdfs_blocks!50070!0%!0%
115
normal_check_interval 2
116
retry_check_interval 1
121
hostgroup_name namenode
123
service_description HDFS::HDFS capacity utilization
125
check_command check_hdfs_capacity!50070!80%!90%
126
normal_check_interval 10
127
retry_check_interval 1
132
hostgroup_name namenode
134
service_description HDFS::NameNode RPC Latency
136
check_command check_rpcq_latency!NameNode!50070!3000!5000
137
normal_check_interval 5
138
retry_check_interval 1
143
###########################################################################
145
# GANGLIA SERVER Checks
148
hostgroup_name ganglia-server
150
service_description GANGLIA::GangliaServer process
151
servicegroups GANGLIA
152
check_command check_tcp!8651!-w 1 -c 1
153
normal_check_interval 0.25
154
retry_check_interval 0.25
159
hostgroup_name ganglia-server
161
service_description GANGLIA::Ganglia Monitor process for Slaves
162
servicegroups GANGLIA
163
check_command check_tcp!8660!-w 1 -c 1
164
normal_check_interval 0.25
165
retry_check_interval 0.25
170
hostgroup_name ganglia-server
172
service_description GANGLIA::Ganglia Monitor process for NameNode
173
servicegroups GANGLIA
174
check_command check_tcp!8661!-w 1 -c 1
175
normal_check_interval 0.25
176
retry_check_interval 0.25
181
hostgroup_name ganglia-server
183
service_description GANGLIA::Ganglia Monitor process for HBase Master
184
servicegroups GANGLIA
185
check_command check_tcp!8663!-w 1 -c 1
186
normal_check_interval 0.25
187
retry_check_interval 0.25
192
hostgroup_name ganglia-server
194
service_description GANGLIA::Ganglia Monitor process for ResourceManager
195
servicegroups GANGLIA
196
check_command check_tcp!8664!-w 1 -c 1
197
normal_check_interval 0.25
198
retry_check_interval 0.25
203
hostgroup_name ganglia-server
205
service_description GANGLIA::Ganglia Monitor process for NodeManager
206
servicegroups GANGLIA
207
check_command check_tcp!8660!-w 1 -c 1
208
normal_check_interval 0.25
209
retry_check_interval 0.25
214
hostgroup_name ganglia-server
216
service_description GANGLIA::Ganglia Monitor process for HistoryServer
217
servicegroups GANGLIA
218
check_command check_tcp!8666!-w 1 -c 1
219
normal_check_interval 0.25
220
retry_check_interval 0.25
224
###########################################################################
230
hostgroup_name resourcemanager
232
service_description RESOURCEMANAGER::ResourceManager Web UI
234
check_command check_webui!resourcemanager!8088
235
normal_check_interval 1
236
retry_check_interval 1
241
hostgroup_name resourcemanager
243
service_description RESOURCEMANAGER::ResourceManager CPU utilization
245
check_command check_cpu!200%!250%
246
normal_check_interval 5
247
retry_check_interval 2
252
hostgroup_name resourcemanager
254
service_description RESOURCEMANAGER::ResourceManager RPC latency
256
check_command check_rpcq_latency!ResourceManager!8088!3000!5000
257
normal_check_interval 5
258
retry_check_interval 1
263
hostgroup_name resourcemanager
265
service_description RESOURCEMANAGER::Percent NodeManager live
267
check_command check_resourcemanager_nodes_percentage!8088!lost!10!30
268
normal_check_interval 1
269
retry_check_interval 1
274
hostgroup_name resourcemanager
276
service_description RESOURCEMANAGER::Percent NodeManager healthy
278
check_command check_resourcemanager_nodes_percentage!8088!unhealthy!10!30
279
normal_check_interval 1
280
retry_check_interval 1
285
hostgroup_name resourcemanager
287
service_description RESOURCEMANAGER::ResourceManager process
289
check_command check_tcp!8088!-w 1 -c 1
290
normal_check_interval 1
291
retry_check_interval 0.5
296
# YARN::NODEMANAGER Checks
298
hostgroup_name nodemanagers
300
service_description NODEMANAGER::NodeManager process
302
check_command check_tcp!8042!-w 1 -c 1
303
normal_check_interval 1
304
retry_check_interval 0.5
309
hostgroup_name nodemanagers
311
service_description NODEMANAGER::NodeManager health
313
check_command check_nodemanager_health!8042
314
normal_check_interval 1
315
retry_check_interval 1
320
# MAPREDUCE::JOBHISTORY Checks
322
hostgroup_name historyserver2
324
service_description JOBHISTORY::HistoryServer Web UI
325
servicegroups MAPREDUCE
326
check_command check_webui!historyserver2!19888
327
normal_check_interval 1
328
retry_check_interval 1
333
hostgroup_name historyserver2
335
service_description JOBHISTORY::HistoryServer CPU utilization
336
servicegroups MAPREDUCE
337
check_command check_cpu!200%!250%
338
normal_check_interval 5
339
retry_check_interval 2
344
hostgroup_name historyserver2
346
service_description JOBHISTORY::HistoryServer RPC latency
347
servicegroups MAPREDUCE
348
check_command check_rpcq_latency!JobHistoryServer!19888!3000!5000
349
normal_check_interval 5
350
retry_check_interval 1
354
###########################################################################
359
hostgroup_name nagios-server
361
service_description ZOOKEEPER::Percent ZooKeeper Servers live
362
servicegroups ZOOKEEPER
363
check_command check_aggregate!"ZKSERVERS::ZooKeeper Server live"!35%!70%
364
normal_check_interval 0.5
365
retry_check_interval 0.25
370
hostgroup_name zookeeper-servers
372
service_description ZKSERVERS::ZooKeeper Server live
373
servicegroups ZOOKEEPER
374
check_command check_tcp!2181!-w 1 -c 1
375
normal_check_interval 1
376
retry_check_interval 0.5
380
###########################################################################
385
# HBASE::REGIONSERVER Checks
387
hostgroup_name nagios-server
389
service_description HBASE::Percent RegionServers live
391
check_command check_aggregate!"REGIONSERVER::Process down"!10%!30%
392
normal_check_interval 0.5
393
retry_check_interval 0.25
397
# HBASE::REGIONSERVER Checks
399
hostgroup_name region-servers
401
service_description REGIONSERVER::Process down
403
check_command check_tcp!60020!-w 1 -c 1
404
normal_check_interval 1
405
retry_check_interval 0.5
409
# HBASE::MASTER Checks
411
hostgroup_name hbasemaster
413
service_description HBASEMASTER::HBase Web UI down
415
check_command check_webui!hbase
416
normal_check_interval 1
417
retry_check_interval 1
421
# HBASE::MASTER Checks
423
hostgroup_name hbasemaster
425
service_description HBASEMASTER::HBaseMaster CPU utilization
427
check_command check_cpu!200%!250%
428
normal_check_interval 5
429
retry_check_interval 2
433
# HBASE::MASTER Checks
435
hostgroup_name hbasemaster
437
service_description HBASEMASTER::HBaseMaster Process down
439
check_command check_tcp!60000!-w 1 -c 1
440
normal_check_interval 0.5
441
retry_check_interval 0.25
445
###########################################################################
447
# HIVE + WEBHCAT Checks
450
# HIVE Metastore check
452
hostgroup_name hiveserver
454
service_description HIVE-METASTORE::HIVE-METASTORE status check
455
servicegroups HIVE-METASTORE
456
check_command check_hive_metastore_status!9083!@JAVA_HOME@
457
normal_check_interval 0.5
458
retry_check_interval 0.5
464
hostgroup_name webhcat-server
466
service_description WEBHCAT::WebHcat status check
467
servicegroups WEBHCAT
468
check_command check_webhcat_status!50111!v1
469
normal_check_interval 1
470
retry_check_interval 0.5
474
###########################################################################
481
hostgroup_name oozie-server
483
service_description OOZIE::Oozie status check
485
check_command check_oozie_status!11000!@JAVA_HOME@
486
normal_check_interval 1
487
retry_check_interval 1