~asanjar/charms/trusty/hdp-hadoop/test

« back to all changes in this revision

Viewing changes to hooks/test/hdp_scripts/hdp_manual_install_rpm_helper_files-2.1.1.385/configuration_files/nagios/objects/hadoop-services.cfg

  • Committer: amir sanjar
  • Date: 2014-07-21 19:53:44 UTC
  • Revision ID: amir.sanjar@canonical.com-20140721195344-a23z0lrebqzhl167
namenode & data node initialization

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
#
2
 
#
3
 
# Licensed to the Apache Software Foundation (ASF) under one
4
 
# or more contributor license agreements.  See the NOTICE file
5
 
# distributed with this work for additional information
6
 
# regarding copyright ownership.  The ASF licenses this file
7
 
# to you under the Apache License, Version 2.0 (the
8
 
# "License"); you may not use this file except in compliance
9
 
# with the License.  You may obtain a copy of the License at
10
 
#
11
 
#   http://www.apache.org/licenses/LICENSE-2.0
12
 
#
13
 
# Unless required by applicable law or agreed to in writing,
14
 
# software distributed under the License is distributed on an
15
 
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16
 
# KIND, either express or implied.  See the License for the
17
 
# specific language governing permissions and limitations
18
 
# under the License.
19
 
#
20
 
#
21
 
 
22
 
# NAGIOS SERVER Check (status log update)
23
 
define service {
24
 
        name                            hadoop-service
25
 
        use                             generic-service
26
 
        notification_options            w,u,c
27
 
        first_notification_delay        0
28
 
        notification_interval           0     ; Send the notification once
29
 
}
30
 
 
31
 
define service {        
32
 
        hostgroup_name          nagios-server        
33
 
        use                     hadoop-service
34
 
        service_description     NAGIOS::Nagios status log freshness
35
 
        servicegroups           NAGIOS
36
 
        check_command           check_nagios!10!@STATUS_DAT@!@NAGIOS_BIN@
37
 
        normal_check_interval   5
38
 
        retry_check_interval    0.5
39
 
        max_check_attempts      2
40
 
}
41
 
 
42
 
define service {
43
 
        hostgroup_name          nagios-server
44
 
        use                     hadoop-service
45
 
        service_description     HDFS::Percent DataNodes with space available
46
 
        servicegroups           HDFS
47
 
        check_command           check_aggregate!"DATANODE::DataNode space"!10%!30%
48
 
        normal_check_interval   2
49
 
        retry_check_interval    1 
50
 
        max_check_attempts      1
51
 
}
52
 
 
53
 
define service {
54
 
        hostgroup_name          nagios-server
55
 
        use                     hadoop-service
56
 
        service_description     HDFS::Percent DataNodes live
57
 
        servicegroups           HDFS
58
 
        check_command           check_aggregate!"DATANODE::DataNode process"!10%!30%
59
 
        normal_check_interval   0.5
60
 
        retry_check_interval    0.25
61
 
        max_check_attempts      3
62
 
}
63
 
 
64
 
# HDFS::NAMENODE Checks
65
 
define service {
66
 
        hostgroup_name          namenode
67
 
        use                     hadoop-service
68
 
        service_description     NAMENODE::NameNode Web UI
69
 
        servicegroups           HDFS
70
 
        check_command           check_webui!namenode
71
 
        normal_check_interval   1
72
 
        retry_check_interval    1
73
 
        max_check_attempts      3
74
 
}
75
 
 
76
 
define service {
77
 
        hostgroup_name          namenode
78
 
        use                     hadoop-service
79
 
        service_description     NAMENODE::NameNode edit logs directory status
80
 
        servicegroups           HDFS
81
 
        check_command           check_name_dir_status!50070
82
 
        normal_check_interval   0.5
83
 
        retry_check_interval    0.5
84
 
        max_check_attempts      3
85
 
}
86
 
 
87
 
define service {        
88
 
        hostgroup_name          namenode        
89
 
        use                     hadoop-service
90
 
        service_description     NAMENODE::NameNode Host CPU utilization
91
 
        servicegroups           HDFS
92
 
        check_command           check_cpu!200%!250%
93
 
        normal_check_interval   5
94
 
        retry_check_interval    2 
95
 
        max_check_attempts      5
96
 
}
97
 
 
98
 
define service {
99
 
        hostgroup_name          namenode
100
 
        use                     hadoop-service
101
 
        service_description     NAMENODE::NameNode process
102
 
        servicegroups           HDFS
103
 
        check_command           check_tcp!8020!-w 1 -c 1
104
 
        normal_check_interval   0.5
105
 
        retry_check_interval    0.25
106
 
        max_check_attempts      3
107
 
}
108
 
 
109
 
define service {
110
 
        hostgroup_name          namenode
111
 
        use                     hadoop-service
112
 
        service_description     HDFS::Blocks health
113
 
        servicegroups           HDFS
114
 
        check_command           check_hdfs_blocks!50070!0%!0%
115
 
        normal_check_interval   2
116
 
        retry_check_interval    1 
117
 
        max_check_attempts      1
118
 
}
119
 
 
120
 
define service {
121
 
        hostgroup_name          namenode
122
 
        use                     hadoop-service
123
 
        service_description     HDFS::HDFS capacity utilization
124
 
        servicegroups           HDFS
125
 
        check_command           check_hdfs_capacity!50070!80%!90%
126
 
        normal_check_interval   10
127
 
        retry_check_interval    1 
128
 
        max_check_attempts      1
129
 
}
130
 
 
131
 
define service {
132
 
        hostgroup_name          namenode
133
 
        use                     hadoop-service
134
 
        service_description     HDFS::NameNode RPC Latency
135
 
        servicegroups           HDFS
136
 
        check_command           check_rpcq_latency!NameNode!50070!3000!5000
137
 
        normal_check_interval   5
138
 
        retry_check_interval    1 
139
 
        max_check_attempts      5
140
 
}
141
 
 
142
 
 
143
 
###########################################################################
144
 
#
145
 
# GANGLIA SERVER Checks
146
 
#
147
 
define service {
148
 
        hostgroup_name          ganglia-server
149
 
        use                     hadoop-service
150
 
        service_description     GANGLIA::GangliaServer process
151
 
        servicegroups           GANGLIA
152
 
        check_command           check_tcp!8651!-w 1 -c 1
153
 
        normal_check_interval   0.25
154
 
        retry_check_interval    0.25
155
 
        max_check_attempts      4
156
 
}
157
 
 
158
 
define service {
159
 
        hostgroup_name          ganglia-server
160
 
        use                     hadoop-service
161
 
        service_description     GANGLIA::Ganglia Monitor process for Slaves
162
 
        servicegroups           GANGLIA
163
 
        check_command           check_tcp!8660!-w 1 -c 1
164
 
        normal_check_interval   0.25
165
 
        retry_check_interval    0.25
166
 
        max_check_attempts      4
167
 
}
168
 
 
169
 
define service {
170
 
        hostgroup_name          ganglia-server
171
 
        use                     hadoop-service
172
 
        service_description     GANGLIA::Ganglia Monitor process for NameNode
173
 
        servicegroups           GANGLIA
174
 
        check_command           check_tcp!8661!-w 1 -c 1
175
 
        normal_check_interval   0.25
176
 
        retry_check_interval    0.25
177
 
        max_check_attempts      4
178
 
}
179
 
 
180
 
define service {
181
 
        hostgroup_name          ganglia-server
182
 
        use                     hadoop-service
183
 
        service_description     GANGLIA::Ganglia Monitor process for HBase Master
184
 
        servicegroups           GANGLIA
185
 
        check_command           check_tcp!8663!-w 1 -c 1
186
 
        normal_check_interval   0.25
187
 
        retry_check_interval    0.25
188
 
        max_check_attempts      4
189
 
}
190
 
 
191
 
define service {
192
 
        hostgroup_name          ganglia-server
193
 
        use                     hadoop-service
194
 
        service_description     GANGLIA::Ganglia Monitor process for ResourceManager
195
 
        servicegroups           GANGLIA
196
 
        check_command           check_tcp!8664!-w 1 -c 1
197
 
        normal_check_interval   0.25
198
 
        retry_check_interval    0.25
199
 
        max_check_attempts      4
200
 
}
201
 
 
202
 
define service {
203
 
        hostgroup_name          ganglia-server
204
 
        use                     hadoop-service
205
 
        service_description     GANGLIA::Ganglia Monitor process for NodeManager
206
 
        servicegroups           GANGLIA
207
 
        check_command           check_tcp!8660!-w 1 -c 1
208
 
        normal_check_interval   0.25
209
 
        retry_check_interval    0.25
210
 
        max_check_attempts      4
211
 
}
212
 
 
213
 
define service {
214
 
        hostgroup_name          ganglia-server
215
 
        use                     hadoop-service
216
 
        service_description     GANGLIA::Ganglia Monitor process for HistoryServer
217
 
        servicegroups           GANGLIA
218
 
        check_command           check_tcp!8666!-w 1 -c 1
219
 
        normal_check_interval   0.25
220
 
        retry_check_interval    0.25
221
 
        max_check_attempts      4
222
 
}
223
 
 
224
 
###########################################################################
225
 
#
226
 
# YARN Checks
227
 
#
228
 
 
229
 
define service {
230
 
        hostgroup_name          resourcemanager
231
 
        use                     hadoop-service
232
 
        service_description     RESOURCEMANAGER::ResourceManager Web UI
233
 
        servicegroups           YARN
234
 
        check_command           check_webui!resourcemanager!8088
235
 
        normal_check_interval   1
236
 
        retry_check_interval    1
237
 
        max_check_attempts      3
238
 
}
239
 
 
240
 
define service {
241
 
        hostgroup_name          resourcemanager
242
 
        use                     hadoop-service
243
 
        service_description     RESOURCEMANAGER::ResourceManager CPU utilization
244
 
        servicegroups           YARN
245
 
        check_command           check_cpu!200%!250%
246
 
        normal_check_interval   5
247
 
        retry_check_interval    2 
248
 
        max_check_attempts      5
249
 
}
250
 
 
251
 
define service {
252
 
        hostgroup_name          resourcemanager
253
 
        use                     hadoop-service
254
 
        service_description     RESOURCEMANAGER::ResourceManager RPC latency
255
 
        servicegroups           YARN
256
 
        check_command           check_rpcq_latency!ResourceManager!8088!3000!5000
257
 
        normal_check_interval   5
258
 
        retry_check_interval    1 
259
 
        max_check_attempts      5
260
 
}
261
 
 
262
 
define service {
263
 
        hostgroup_name          resourcemanager
264
 
        use                     hadoop-service
265
 
        service_description     RESOURCEMANAGER::Percent NodeManager live
266
 
        servicegroups           YARN
267
 
        check_command           check_resourcemanager_nodes_percentage!8088!lost!10!30
268
 
        normal_check_interval   1
269
 
        retry_check_interval    1
270
 
        max_check_attempts      3
271
 
}
272
 
 
273
 
define service {
274
 
        hostgroup_name          resourcemanager
275
 
        use                     hadoop-service
276
 
        service_description     RESOURCEMANAGER::Percent NodeManager healthy
277
 
        servicegroups           YARN
278
 
        check_command           check_resourcemanager_nodes_percentage!8088!unhealthy!10!30
279
 
        normal_check_interval   1
280
 
        retry_check_interval    1
281
 
        max_check_attempts      3
282
 
}
283
 
 
284
 
define service {
285
 
        hostgroup_name          resourcemanager
286
 
        use                     hadoop-service
287
 
        service_description     RESOURCEMANAGER::ResourceManager process
288
 
        servicegroups           YARN
289
 
        check_command           check_tcp!8088!-w 1 -c 1
290
 
        normal_check_interval   1
291
 
        retry_check_interval    0.5
292
 
        max_check_attempts      3
293
 
}
294
 
 
295
 
 
296
 
# YARN::NODEMANAGER Checks
297
 
define service {
298
 
        hostgroup_name          nodemanagers
299
 
        use                     hadoop-service
300
 
        service_description     NODEMANAGER::NodeManager process
301
 
        servicegroups           YARN
302
 
        check_command           check_tcp!8042!-w 1 -c 1
303
 
        normal_check_interval   1
304
 
        retry_check_interval    0.5
305
 
        max_check_attempts      3
306
 
}
307
 
 
308
 
define service {
309
 
        hostgroup_name          nodemanagers
310
 
        use                     hadoop-service
311
 
        service_description     NODEMANAGER::NodeManager health
312
 
        servicegroups           YARN
313
 
        check_command           check_nodemanager_health!8042
314
 
        normal_check_interval   1
315
 
        retry_check_interval    1
316
 
        max_check_attempts      3
317
 
}
318
 
 
319
 
 
320
 
# MAPREDUCE::JOBHISTORY Checks
321
 
define service {
322
 
        hostgroup_name          historyserver2
323
 
        use                     hadoop-service
324
 
        service_description     JOBHISTORY::HistoryServer Web UI
325
 
        servicegroups           MAPREDUCE
326
 
        check_command           check_webui!historyserver2!19888
327
 
        normal_check_interval   1
328
 
        retry_check_interval    1
329
 
        max_check_attempts      3
330
 
}
331
 
 
332
 
define service {
333
 
        hostgroup_name          historyserver2
334
 
        use                     hadoop-service
335
 
        service_description     JOBHISTORY::HistoryServer CPU utilization
336
 
        servicegroups           MAPREDUCE
337
 
        check_command           check_cpu!200%!250%
338
 
        normal_check_interval   5
339
 
        retry_check_interval    2 
340
 
        max_check_attempts      5
341
 
}
342
 
 
343
 
define service {
344
 
        hostgroup_name          historyserver2
345
 
        use                     hadoop-service
346
 
        service_description     JOBHISTORY::HistoryServer RPC latency
347
 
        servicegroups           MAPREDUCE
348
 
        check_command           check_rpcq_latency!JobHistoryServer!19888!3000!5000
349
 
        normal_check_interval   5
350
 
        retry_check_interval    1 
351
 
        max_check_attempts      5
352
 
}
353
 
 
354
 
###########################################################################
355
 
#
356
 
# ZOOKEEPER Checks
357
 
#
358
 
define service {
359
 
        hostgroup_name          nagios-server
360
 
        use                     hadoop-service
361
 
        service_description     ZOOKEEPER::Percent ZooKeeper Servers live
362
 
        servicegroups           ZOOKEEPER
363
 
        check_command           check_aggregate!"ZKSERVERS::ZooKeeper Server process"!35%!70%
364
 
        normal_check_interval   0.5
365
 
        retry_check_interval    0.25
366
 
        max_check_attempts      3
367
 
}
368
 
 
369
 
define service {
370
 
        hostgroup_name          zookeeper-servers
371
 
        use                     hadoop-service
372
 
        service_description     ZKSERVERS::ZooKeeper Server live
373
 
        servicegroups           ZOOKEEPER
374
 
        check_command           check_tcp!2181!-w 1 -c 1
375
 
        normal_check_interval   1
376
 
        retry_check_interval    0.5
377
 
        max_check_attempts      3
378
 
}
379
 
 
380
 
###########################################################################
381
 
#
382
 
# HBASE Checks
383
 
#
384
 
 
385
 
# HBASE aggregate check (runs on the nagios-server hostgroup)
386
 
define service {
387
 
        hostgroup_name          nagios-server
388
 
        use                     hadoop-service
389
 
        service_description     HBASE::Percent RegionServers live
390
 
        servicegroups           HBASE
391
 
        check_command           check_aggregate!"REGIONSERVER::RegionServer process"!10%!30%
392
 
        normal_check_interval   0.5
393
 
        retry_check_interval    0.25
394
 
        max_check_attempts      3
395
 
}
396
 
 
397
 
# HBASE::REGIONSERVER Checks
398
 
define service {
399
 
        hostgroup_name          region-servers
400
 
        use                     hadoop-service
401
 
        service_description     REGIONSERVER::Process down
402
 
        servicegroups           HBASE
403
 
        check_command           check_tcp!60020!-w 1 -c 1
404
 
        normal_check_interval   1
405
 
        retry_check_interval    0.5
406
 
        max_check_attempts      3
407
 
}
408
 
 
409
 
# HBASE::MASTER Checks
410
 
define service {
411
 
        hostgroup_name          hbasemaster
412
 
        use                     hadoop-service
413
 
        service_description     HBASEMASTER::HBase Web UI down
414
 
        servicegroups           HBASE
415
 
        check_command           check_webui!hbase
416
 
        normal_check_interval   1
417
 
        retry_check_interval    1
418
 
        max_check_attempts      3
419
 
}
420
 
 
421
 
# HBASE::MASTER Checks
422
 
define service {
423
 
        hostgroup_name          hbasemaster
424
 
        use                     hadoop-service
425
 
        service_description     HBASEMASTER::HBaseMaster CPU utilization
426
 
        servicegroups           HBASE
427
 
        check_command           check_cpu!200%!250%
428
 
        normal_check_interval   5
429
 
        retry_check_interval    2 
430
 
        max_check_attempts      5
431
 
}
432
 
 
433
 
# HBASE::MASTER Checks
434
 
define service {
435
 
        hostgroup_name          hbasemaster
436
 
        use                     hadoop-service
437
 
        service_description     HBASEMASTER::HBaseMaster Process down
438
 
        servicegroups           HBASE
439
 
        check_command           check_tcp!60000!-w 1 -c 1
440
 
        normal_check_interval   0.5
441
 
        retry_check_interval    0.25
442
 
        max_check_attempts      4
443
 
}
444
 
 
445
 
###########################################################################
446
 
#
447
 
# HIVE + WEBHCAT Checks
448
 
#
449
 
 
450
 
# HIVE Metastore check
451
 
define service {
452
 
        hostgroup_name          hiveserver
453
 
        use                     hadoop-service
454
 
        service_description     HIVE-METASTORE::HIVE-METASTORE status check
455
 
        servicegroups           HIVE-METASTORE
456
 
        check_command           check_hive_metastore_status!9083!@JAVA_HOME@
457
 
        normal_check_interval   0.5
458
 
        retry_check_interval    0.5
459
 
        max_check_attempts      3
460
 
}
461
 
 
462
 
# Webhcat check
463
 
define service {
464
 
        hostgroup_name          webhcat-server
465
 
        use                     hadoop-service
466
 
        service_description     WEBHCAT::WebHcat status check
467
 
        servicegroups           WEBHCAT 
468
 
        check_command           check_webhcat_status!50111!v1
469
 
        normal_check_interval   1
470
 
        retry_check_interval    0.5
471
 
        max_check_attempts      3
472
 
}
473
 
 
474
 
###########################################################################
475
 
#
476
 
# Oozie Checks
477
 
#
478
 
 
479
 
# Oozie check
480
 
define service {
481
 
        hostgroup_name          oozie-server
482
 
        use                     hadoop-service
483
 
        service_description     OOZIE::Oozie status check
484
 
        servicegroups           OOZIE
485
 
        check_command           check_oozie_status!11000!@JAVA_HOME@
486
 
        normal_check_interval   1
487
 
        retry_check_interval    1
488
 
        max_check_attempts      3
489
 
}
490