4
# Copyright (c) 2006-2007 Cisco Systems. All rights reserved.
5
# Copyright (c) 2011 Mellanox Technologies LTD. All rights reserved.
7
# This Software is licensed under one of the following licenses:
9
# 1) under the terms of the "Common Public License 1.0" a copy of which is
10
# available from the Open Source Initiative, see
11
# http://www.opensource.org/licenses/cpl.php.
13
# 2) under the terms of the "The BSD License" a copy of which is
14
# available from the Open Source Initiative, see
15
# http://www.opensource.org/licenses/bsd-license.php.
17
# 3) under the terms of the "GNU General Public License (GPL) Version 2" a
18
# copy of which is available from the Open Source Initiative, see
19
# http://www.opensource.org/licenses/gpl-license.php.
21
# Licensee has the right to choose one of the above licenses.
23
# Redistributions of source code must retain the above copyright
24
# notice and one of the license notices.
26
# Redistributions in binary form must reproduce both the above copyright
27
# notice, one of the license notices in the documentation
28
# and/or other materials provided with the distribution.
31
# Description: Test health of HCA
41
# TODO: ask vlad about the path.
44
# Device-name prefixes per driver family. A device index is appended to a
# prefix to form the entry name looked up under $INFINI_CLASS_PATH.
MTHCA_DRIVER_NEEDED=mthca
45
HERMON_DRIVER_NEEDED=mlx4_
46
CONNECTIB_DRIVER_NEEDED=mlx5_
47
# Default prefix; reassigned later when a ConnectX / Connect-IB adapter is found.
DRIVER_NEEDED=$MTHCA_DRIVER_NEEDED
48
# Directory whose entries are the InfiniBand devices known to the kernel.
INFINI_CLASS_PATH=/sys/class/infiniband
49
# File holding the filtered lspci listing; removed at the end of the script.
G_LSPCI_OUTPUT_FILE=/tmp/hca_self_test_lspci.output
51
########################################################################################
52
# Required FW version definitions. These lines below are optionally set by the installer
61
########################################################################################
66
message=${1} # argument 1 - message
67
color=${2} # argument 2 - color
70
tput sgr0 # Reset to normal
77
# Check whether the script is being run as root and exit if otherwise
78
# Root is identified by effective user ID 0. The previous `id -g` tested the
# primary group ID, which is neither necessary nor sufficient for being root.
if [ "$(id -u)" -ne 0 ]; then
79
echo "Error: hca_self_test must be run as root."
88
if [ -f /etc/redhat-release -o -f /etc/fedora-release ]; then
90
elif [ -f /etc/SuSE-release ]; then
92
elif [ -f /etc/debian_version ]; then
94
INSTALL_PREFIX=/opt/topspin
97
echo "Error: hca_self_test does not support this OS."
102
for cmd in lspci cat id $g_pkg_cmd uname grep ls awk egrep modprobe; do
103
cmd_exist=`which $cmd 2> /dev/null`
104
if [ "$cmd_exist" == "" ]; then
105
echo "Error: $cmd tool was not found in the PATH"
111
# lspci device-class names under which a Mellanox adapter may be listed.
HCA_LSPCI_NAME="InfiniBand"
112
VPI_LSPCI_NAME="Network controller"
113
NIC_LSPCI_NAME="Ethernet controller"
114
# Adapters listed under this class make the PCI Device Check report FAIL.
MEM_CON_LSPCI_NAME="Memory controller"
116
# Alternation of the three adapter classes above, for use in a grep -E pattern.
MEL_LSPCI_NAME_EXP="${HCA_LSPCI_NAME}|${VPI_LSPCI_NAME}|${NIC_LSPCI_NAME}"
118
echo "---- Performing Adapter Device Self Test ----"
121
NUM_IB_DEV=`lspci 2> /dev/null | grep -E "(${MEL_LSPCI_NAME_EXP})(\s\[[0-9]+\])?: Mellanox Technolog" | wc -l`
122
# Count Mellanox devices listed under the "Memory controller" class.
# -E is required: the pattern relies on the ERE operators ( ) ? and +, which
# plain grep treats as literal characters, so without it nothing ever matches.
NUM_MEM_CON=`lspci 2> /dev/null | grep -E "${MEM_CON_LSPCI_NAME}(\s\[[0-9]+\])?: Mellanox Technolog" | wc -l`
124
let "NUM_HCAS=$NUM_IB_DEV + $NUM_MEM_CON"
125
echo "Number of CAs Detected ................. $NUM_HCAS";
127
if [ $NUM_HCAS -ne 0 ]; then
128
if [ $NUM_MEM_CON -ne 0 ]; then
129
echo -e "PCI Device Check ....................... ${red}FAIL"
131
echo " REASON: jumper set on CA or CA hardware failure"
134
echo -e "PCI Device Check ....................... ${green}PASS"
138
echo -e "PCI Device Check ....................... ${red}FAIL"
140
echo " REASON: no CAs in the system"
146
mlx4_core_ko=`modinfo mlx4_core | grep filename | awk '{print $NF}'`
147
if [ $OS_TYPE = "DEBIAN" ]; then
151
RPM_USR_VER=`dpkg -s libibverbs1 2> /dev/null | wc -l`
152
RPM_KER_VER=`dpkg -l 2> /dev/null | grep -E "ofed-kernel" | wc -l`
153
RPM_KER_NAME=`echo $mlx4_core_ko | awk -F '/' '{print$4}'`
154
BOOTED_KER=`uname -r`
155
if [ "$BOOTED_KER" != "$RPM_KER_NAME" ]; then
161
RPM_USR_VER=`rpm -q libibverbs 2> /dev/null | wc -l`
162
RPM_KER_VER=`rpm -qa 2> /dev/null | grep -E "kernel-ib|ofa_kernel" | wc -l`
163
KER_RPM=`rpm -qf $mlx4_core_ko 2> /dev/null | grep -E "kernel-ib|ofa_kernel"`
165
# Quoted so that several matching packages (multi-word output) do not make
# the test fail with "too many arguments".
if [ -n "$KER_RPM" ]; then
166
RPM_KER_NAME=`echo $mlx4_core_ko | awk -F '/' '{print$4}'`
169
if (rpm -q ofa_kernel_dkms > /dev/null 2>&1); then
171
RPM_KER_NAME=`rpm -q --queryformat "[%{NAME}-%{VERSION}]\n" ofa_kernel_dkms`
178
OFED_VERSION=$(ofed_info 2> /dev/null | head -1)
179
RPM_KER_ARCH=`uname -m`
180
if [ $OS_TYPE = "RED_HAT" ]; then
181
BOOTED_KER=`uname -r`
182
elif [ $OS_TYPE = "SUSE" ]; then
183
# Have to munge uname output a bit.
184
BOOTED_KER=`uname -r | sed s@-@_@g`
187
if [ $RPM_USR_VER -eq 0 ] && [ $RPM_KER_VER -eq 0 ]; then
188
echo -e "Host Driver RPM Check .................. ${red}FAIL"
190
echo " REASON: no RPMs found"
193
elif [ $RPM_USR_VER -eq 0 ]; then
194
echo -e "Host Driver RPM Check .................. ${red}FAIL"
196
echo " REASON: no user level RPMs found"
199
elif [ $RPM_KER_VER -eq 0 ]; then
200
echo -e "Host Driver RPM Check .................. ${red}FAIL"
202
echo " REASON: no kernel level RPMs found"
207
if [ $RPM_KER_VER -ne 0 ]; then
208
if [ $RPM_CUR_BOOTED_KER -eq 0 ]; then
209
echo -e "Host Driver RPM Check .................. ${red}FAIL"
211
echo " REASON: no RPMs found for currently booted kernel $BOOTED_KER"
217
echo "Kernel Arch ............................ $RPM_KER_ARCH"
218
if [ "$RPM_KER_NAME" != "" ]; then
219
echo "Host Driver Version .................... $OFED_VERSION $RPM_KER_NAME"
221
echo "Host Driver Version .................... NA"
224
if [ $RPM_CHECK_FAIL -eq 0 ]; then
225
echo -e "Host Driver RPM Check .................. ${green}PASS"
230
# Print the lspci description line of one Mellanox adapter.
# The line is picked by position: line ($loop_cnt + 1) of the saved listing,
# so $loop_cnt acts as a 0-based adapter index.
# NOTE(review): callers pass the index as the first argument; the line that
# assigns loop_cnt is not visible in this listing - confirm.
# Reads:  MEL_LSPCI_NAME_EXP, MEM_CON_LSPCI_NAME, G_LSPCI_OUTPUT_FILE,
#         g_lspci_was_ran
# Output: the selected line on stdout (unquoted echo, so whitespace runs
#         collapse to single spaces)
function get_curr_ca_pci_info () {
233
# Guard on g_lspci_was_ran: empty means the listing has not been saved yet.
if [ "${g_lspci_was_ran}" == "" ]; then
234
# Keep only Mellanox lines of the adapter / memory-controller classes,
# optionally followed by a bracketed numeric class code.
lspci 2> /dev/null | grep -E "(${MEL_LSPCI_NAME_EXP}|${MEM_CON_LSPCI_NAME})(\s\[[0-9]+\])?: Mellanox Technolog" > ${G_LSPCI_OUTPUT_FILE}
237
# head keeps the first (loop_cnt + 1) lines, tail -1 keeps the last of those.
current_pci_info=$(cat ${G_LSPCI_OUTPUT_FILE} | head -$(expr $loop_cnt + 1) | tail -1)
238
echo ${current_pci_info}
241
# Classify one adapter by the device class in its lspci description line.
# Branch order: InfiniBand or Memory controller, then Network controller,
# then Ethernet controller.
# NOTE(review): the value produced by each branch is on lines not visible in
# this listing; callers capture this function's stdout as ca_type.
function get_ca_type() {
244
current_pci_info=`get_curr_ca_pci_info $loop_cnt`
246
# Unquoted right-hand sides of =~ are treated as regular expressions.
if [[ ${current_pci_info} =~ ${HCA_LSPCI_NAME} ]] || [[ ${current_pci_info} =~ ${MEM_CON_LSPCI_NAME} ]]; then
248
elif [[ ${current_pci_info} =~ ${VPI_LSPCI_NAME} ]]; then
250
elif [[ ${current_pci_info} =~ ${NIC_LSPCI_NAME} ]]; then
258
# Determine the PCI device ID of one adapter and store it in Device_ID.
# Primary source is the numeric lspci listing for vendor 15b3; mstflint is
# queried as well.
# NOTE(review): the mstflint query looks like the fallback for an empty lspci
# result, but the branching line is not visible here - confirm. How Device_ID
# is handed back to the caller is also not visible; callers use $(...).
function get_device_id {
261
current_pci_info=`get_curr_ca_pci_info $loop_cnt`
262
# First field of the lspci line is the PCI address of the adapter.
PCI_DEV=$(echo $current_pci_info | awk '{print $1}')
264
# In "lspci -n" output the 4th ':'-separated field starts with the hex
# device ID; the second cut drops whatever follows the first space.
HexDevice_ID=$(lspci -n -d "15b3:" 2> /dev/null | grep $PCI_DEV | tail -1 | cut -d ":" -f4 | cut -d " " -f1)
265
if [ "$HexDevice_ID" != "" ]; then
266
# Prefix with 0x so that let evaluates the value as hexadecimal, which
# yields the decimal form.
HexDevice_ID=0x$HexDevice_ID
267
let "tmp=$HexDevice_ID"
268
Device_ID=$(echo $tmp)
270
# Third word of mstflint's "Device ID" line.
Device_ID=$(mstflint -d $PCI_DEV q 2> /dev/null | grep "Device ID" | awk '{print $3}')
276
# Adapter family names. get_hca_name assigns one of these to its result, and
# the driver / firmware selection code compares against them by exact string.
g_connectx="ConnectX"
277
g_connectx3="ConnectX-3"
278
g_connectx3_pro="ConnectX-3_Pro"
279
g_connectib="Connect-IB"
280
g_InfiniHost_III_Ex="InfiniHost_III_Ex"
281
g_InfiniHost_III_Ex_memfree="InfiniHost_III_Ex_m"
282
g_InfiniHost_III_Lx="InfiniHost_III_Lx"
283
g_InfiniHost="InfiniHost"
286
# Map a decimal PCI device ID ($dev_id) to an adapter family name (ret_val).
#   25208          -> $g_InfiniHost_III_Ex
#   25218          -> $g_InfiniHost_III_Ex_memfree
#   24204 or 25204 -> $g_InfiniHost_III_Lx
#   23108          -> $g_InfiniHost
#   4103           -> $g_connectx3_pro
# NOTE(review): the assignments for 4099 and 4113, the line that sets dev_id,
# and the line that emits ret_val are not visible in this listing - confirm.
function get_hca_name {
288
if [ $dev_id -eq 25208 ]; then
289
ret_val=$g_InfiniHost_III_Ex
290
elif [ $dev_id -eq 25218 ]; then
291
ret_val=$g_InfiniHost_III_Ex_memfree
292
elif [ $dev_id -eq 24204 -o $dev_id -eq 25204 ]; then
293
ret_val=$g_InfiniHost_III_Lx
294
elif [ $dev_id -eq 23108 ]; then
295
ret_val=$g_InfiniHost
296
elif [ $dev_id -eq 4099 ]; then
298
elif [ $dev_id -eq 4103 ]; then
299
ret_val=$g_connectx3_pro
300
elif [ $dev_id -eq 4113 ]; then
310
# Choose the driver prefix (driver_need) for the adapter at index LOOP_COUNT.
#   ConnectX / ConnectX-3 / ConnectX-3_Pro -> $HERMON_DRIVER_NEEDED
#   Connect-IB                             -> $CONNECTIB_DRIVER_NEEDED
#   $MTHCA_DRIVER_NEEDED is assigned on a further branch.
# NOTE(review): the branch keyword before the mthca assignment and the line
# that emits driver_need are not visible here; callers capture stdout - confirm.
function get_driver {
313
Device_ID=$(get_device_id $LOOP_COUNT)
314
# Nothing is selected when the device ID or family name cannot be resolved.
if [ "$Device_ID" != "" ]; then
315
hca_name=$(get_hca_name $Device_ID)
316
if [ "$hca_name" != "" ]; then
317
if [ "$hca_name" == "$g_connectx" ] || [ "$hca_name" == "$g_connectx3" ] || [ "$hca_name" == "$g_connectx3_pro" ]; then
318
driver_need=$HERMON_DRIVER_NEEDED
319
elif [ "$hca_name" == "$g_connectib" ]; then
320
driver_need=$CONNECTIB_DRIVER_NEEDED
322
driver_need=$MTHCA_DRIVER_NEEDED
330
# Compare two firmware version strings ($needed and $found) field by field.
# Each string is split on "." into three fields; the first byte of the first
# field is dropped (cut -b 2-) before the integer comparison.
# Fields are compared left to right and the first difference decides.
# NOTE(review): the lines that set needed/found and the result produced by
# each branch are not visible here; a caller tests the output against the
# string "found" - confirm.
function compare_fw {
334
n_1=$(echo $needed | cut -f1 -d"." | cut -b 2-)
335
n_2=$(echo $needed | cut -f2 -d".")
336
n_3=$(echo $needed | cut -f3 -d".")
337
f_1=$(echo $found | cut -f1 -d"." | cut -b 2-)
338
f_2=$(echo $found | cut -f2 -d".")
339
f_3=$(echo $found | cut -f3 -d".")
341
# -gt requires every field to be a plain integer.
if [ $n_1 -gt $f_1 ]; then
343
elif [ $f_1 -gt $n_1 ]; then
345
elif [ $n_2 -gt $f_2 ]; then
347
elif [ $f_2 -gt $n_2 ]; then
349
elif [ $n_3 -gt $f_3 ]; then
351
elif [ $f_3 -gt $n_3 ]; then
364
echo -e "Firmware Check on CA #$LOOP_COUNT (${ca_type}) .......... ${result}"
367
if [ $NUM_HCAS -ne 0 ]; then
373
# To take care of more than one HCA
374
while [ $LOOP_COUNT -lt $NUM_HCAS ]
376
ca_type=$(get_ca_type $LOOP_COUNT)
377
if [ $RPM_USR_VER -ne 0 ] && [ $RPM_CUR_BOOTED_KER -ne 0 ]; then
380
device_num=$mthca_dev_num
383
PCI_DEVICE=$(lspci 2> /dev/null | grep Mellanox | head -$(expr $LOOP_COUNT + 1) | tail -1 | awk '{print $1}')
384
Device_ID=$(get_device_id $LOOP_COUNT)
385
if [ "$Device_ID" != "" ]; then
386
hca_name=$(get_hca_name $Device_ID)
387
if [ "$hca_name" != "" ]; then
389
# get the FW and the Expected FW
390
if [ "$hca_name" == "$g_InfiniHost_III_Ex" ]; then
391
FW_NEEDED=$ARBEL_FW_NEEDED
392
elif [ "$hca_name" == $g_InfiniHost_III_Ex_memfree ]; then
393
FW_NEEDED=$ARBEL_MF_FW_NEEDED
394
elif [ "$hca_name" == "$g_InfiniHost_III_Lx" ]; then
395
FW_NEEDED=$SINAI_FW_NEEDED
396
elif [ "$hca_name" == "$g_connectx" ]; then
397
FW_NEEDED=$HERMON_FW_NEEDED
398
DRIVER_NEEDED=$HERMON_DRIVER_NEEDED
399
device_num=$mlx_dev_num
400
elif [ "$hca_name" == "$g_connectx3" ]; then
401
FW_NEEDED=$CX3_FW_NEEDED
402
DRIVER_NEEDED=$HERMON_DRIVER_NEEDED
403
device_num=$mlx_dev_num
404
elif [ "$hca_name" == "$g_connectx3_pro" ]; then
405
FW_NEEDED=$CX3_PRO_FW_NEEDED
406
DRIVER_NEEDED=$HERMON_DRIVER_NEEDED
407
device_num=$mlx_dev_num
408
elif [ "$hca_name" == "$g_connectib" ]; then
409
FW_NEEDED=$CONNECTIB_FW_NEEDED
410
DRIVER_NEEDED=$CONNECTIB_DRIVER_NEEDED
411
device_num=$mlx5_dev_num
412
elif [ "$hca_name" == "$g_InfiniHost" ]; then
413
FW_NEEDED=$TAVOR_FW_NEEDED
415
# legal is set to $FW_NEEDED only when it is exactly v<num>.<num>.<num>, and
# is left empty otherwise. The pattern is quoted (no filename globbing),
# anchored at both ends, and matches literal dots, so malformed strings such
# as "v1x2y3" or "v1.2.3-rc" no longer reach the integer comparisons of
# compare_fw.
legal=$(echo "$FW_NEEDED" | grep -E '^v[0-9]+\.[0-9]+\.[0-9]+$')
417
# Increment the device counter of the matching driver (mlx4, mlx5 or mthca)
419
"$g_connectx"|"$g_connectx3"|"$g_connectx3_pro")
420
let "mlx_dev_num=$mlx_dev_num + 1"
423
let "mlx5_dev_num=$mlx5_dev_num + 1"
425
let "mthca_dev_num=$mthca_dev_num + 1"
429
FW_FOUND=v$(mstflint -d $PCI_DEVICE q 2> /dev/null | grep "FW Version" | awk '{print $3}')
430
if [ "$FW_FOUND" = "v" ]; then
431
if [ -f "$INFINI_CLASS_PATH/$DRIVER_NEEDED$device_num/fw_ver" ]; then
432
FW_FOUND=v`cat $INFINI_CLASS_PATH/$DRIVER_NEEDED$device_num/fw_ver 2> /dev/null`
434
echo_fw_check ${LOOP_COUNT} ${ca_type} "${red}FAIL"
435
echo " REASON: CA #$LOOP_COUNT: failed to get firmware version"
441
if [ "$no_firmware" != "1" ]; then
442
echo -e "Firmware on CA #$LOOP_COUNT ${ca_type} .................. $FW_FOUND"
443
if [ "$FW_NEEDED" == "$legal" -a "$FW_NEEDED" != "" ]; then
444
if [ "$FW_FOUND" = "$FW_NEEDED" ]; then
445
echo_fw_check ${LOOP_COUNT} ${ca_type} "${green}PASS"
447
newest=$(compare_fw $FW_FOUND $FW_NEEDED)
448
if [ "$newest" = "found" ]; then
449
echo_fw_check ${LOOP_COUNT} ${ca_type} "${green}PASS"
450
echo " NOTE: The found fw version is higher than the fw included in this package ($FW_NEEDED)"
452
echo_fw_check ${LOOP_COUNT} ${ca_type} "${red}FAIL"
453
echo " REASON: mismatch CA #$LOOP_COUNT firmware detected (found $FW_FOUND, required $FW_NEEDED)"
458
echo_fw_check ${LOOP_COUNT} ${ca_type} "NA"
459
if [ "$FW_NEEDED" == "" ]; then
460
echo " REASON: NO required fw version"
462
echo " REASON: Bad required fw version format ($FW_NEEDED)"
467
echo_fw_check ${LOOP_COUNT} ${ca_type} "NA"
470
echo_fw_check ${LOOP_COUNT} ${ca_type} "NA"
473
echo_fw_check ${LOOP_COUNT} ${ca_type} "NA"
475
let "LOOP_COUNT=$LOOP_COUNT + 1"
478
echo_fw_check ${LOOP_COUNT} ${ca_type} "${red}FAIL"
479
echo " REASON: no CAs in the system"
483
# Check host driver initialization
485
if [ $NUM_HCAS -ne 0 ] && [ $RPM_CHECK_FAIL -eq 0 ]; then
486
MODPROBE_OUT_FILE="/tmp/hca_self_test_modprobe.output"
487
# Save the output of modprobe ib_ipoib in a tmp file
488
modprobe ib_ipoib &> $MODPROBE_OUT_FILE
490
if [ $RET_CODE -eq 0 ]; then
491
echo -e "Host Driver Initialization ............. ${green}PASS"
494
# After successful initialization wait for IB SM sweep
497
echo -e "Host Driver Initialization ............. ${red}FAIL"
501
if [ `grep "No such device" $MODPROBE_OUT_FILE 2> /dev/null | wc -l` -ne 0 ]; then
502
echo " REASON: host driver initialization reported: No such device"
504
# "No such file or directory"
505
if [ `grep "No such file or directory" $MODPROBE_OUT_FILE 2> /dev/null | wc -l` -ne 0 ]; then
506
echo " REASON: host driver initialization reported: No such file or directory"
507
echo " It is possible that driver rpm might be missing file(s)"
509
# "kernel-module version mismatch"
510
if [ `grep "kernel-module version mismatch" $MODPROBE_OUT_FILE 2> /dev/null | wc -l` -ne 0 ]; then
511
echo " REASON: host driver initialization reported: kernel-module version mismatch"
513
# "unresolved symbol"
514
# Note: Could not test "unresolved symbol" error
515
if [ `grep "unresolved symbol" $MODPROBE_OUT_FILE 2> /dev/null | wc -l` -ne 0 ]; then
516
echo " REASON: host driver initialization reported: unresolved symbol"
520
echo "Host Driver Initialization ............. NA"
524
# Read the link layer of one port from
# $INFINI_CLASS_PATH/<device_name>/ports/<port_num>/link_layer.
# NOTE(review): callers pass the device name and port number as arguments and
# capture stdout; the lines that assign device_name/port_num and emit
# link_layer are not visible in this listing - confirm.
function get_link_layer() {
527
link_layer_file=$INFINI_CLASS_PATH/${device_name}/ports/${port_num}/link_layer
530
# The file is read only if it exists as a regular file.
if [ -f ${link_layer_file} ]; then
531
link_layer=`cat $link_layer_file 2> /dev/null`
536
# Print one "Port State of Port #N on CA #M" line for a device port.
# Reads <port_dir>/state and <port_dir>/rate, where port_dir is
# $INFINI_CLASS_PATH/<device_name>/ports/<port_num>, plus the link layer
# reported by get_link_layer.
# NOTE(review): callers pass port number, CA index, device name and CA type
# as arguments; the lines that assign them, and the state text used for
# non-ACTIVE ports, are not visible in this listing - confirm.
function report_port_state() {
542
port_dir=$INFINI_CLASS_PATH/${device_name}/ports/${port_num}
543
port_state_file=${port_dir}/state
544
port_rate_file=${port_dir}/rate
546
if [ -f ${port_state_file} ]; then
547
# Take the text after the first ": " of the state file as the state name.
PORT_STATE=`awk -F": " '{print $2}' ${port_state_file} 2> /dev/null`
548
link_layer=`get_link_layer ${device_name} ${port_num}`
550
if [ "x$PORT_STATE" == "xACTIVE" ]; then
551
# Take the text between the parentheses of the rate file.
PORT_SPEED=`awk -F\( '{print $2}' ${port_rate_file} 2> /dev/null | sed 's/)//'`
552
state="${green}UP $PORT_SPEED"
554
if [ "x$PORT_STATE" == "xINIT" ]; then
560
echo -e "Port State of Port #${port_num} on CA #$LOOP_COUNT (${ca_type})..... ${state} (${link_layer})"
567
if [ $HOST_DRIVER_INIT -eq 1 ]; then
568
NUM_HCAS_PROC=`ls $INFINI_CLASS_PATH 2> /dev/null | wc -l`
577
# To take care of multiple HCAs
578
while [ $LOOP_COUNT -lt $NUM_HCAS_PROC ]
580
driver_need=$(get_driver $LOOP_COUNT)
581
if [ "$driver_need" != "" ]; then
582
if [ "$driver_need" == "$HERMON_DRIVER_NEEDED" ]; then
583
device_num=$mlx_dev_num
584
let "mlx_dev_num=$mlx_dev_num + 1"
585
elif [ "$driver_need" == "$CONNECTIB_DRIVER_NEEDED" ]; then
586
device_num=$mlx5_dev_num
587
let "mlx5_dev_num=$mlx5_dev_num + 1"
589
device_num=$mthca_dev_num
590
let "mthca_dev_num=$mthca_dev_num + 1"
593
if [ -f $INFINI_CLASS_PATH/$driver_need$device_num/ports/1/state ]; then
594
let "NUM_PORT_ACTIVE+=`grep ACTIVE $INFINI_CLASS_PATH/$driver_need$device_num/ports/1/state 2> /dev/null | wc -l`"
596
if [ -f $INFINI_CLASS_PATH/$driver_need$device_num/ports/2/state ]; then
597
let "NUM_PORT_ACTIVE+=`grep ACTIVE $INFINI_CLASS_PATH/$driver_need$device_num/ports/2/state 2> /dev/null | wc -l`"
601
let "LOOP_COUNT=$LOOP_COUNT + 1"
603
echo "Number of CA Ports Active .............. $NUM_PORT_ACTIVE"
606
# Get each port state
612
while [ $LOOP_COUNT -lt $NUM_HCAS_PROC ]
615
driver_need=$(get_driver $LOOP_COUNT)
616
ca_type=$(get_ca_type $LOOP_COUNT)
617
if [ "$driver_need" != "" ]; then
618
if [ "$driver_need" == "$HERMON_DRIVER_NEEDED" ]; then
619
device_num=$mlx_dev_num
620
let "mlx_dev_num=$mlx_dev_num + 1"
621
elif [ "$driver_need" == "$CONNECTIB_DRIVER_NEEDED" ]; then
622
device_num=$mlx5_dev_num
623
let "mlx5_dev_num=$mlx5_dev_num + 1"
625
device_num=$mthca_dev_num
626
let "mthca_dev_num=$mthca_dev_num + 1"
628
device_name=${driver_need}${device_num}
629
report_port_state 1 ${LOOP_COUNT} ${device_name} ${ca_type}
630
report_port_state 2 ${LOOP_COUNT} ${device_name} ${ca_type}
632
let "LOOP_COUNT=$LOOP_COUNT + 1"
635
echo "Number of CA Ports Active .............. NA"
640
# Error counters check
646
echo -e "Error Counter Check on CA #$LOOP_COUNT (${ca_type})...... ${result}"
649
if [ $HOST_DRIVER_INIT -eq 1 ]; then
655
while [ $LOOP_COUNT -lt $NUM_HCAS_PROC ]; do
657
driver_need=$(get_driver $LOOP_COUNT)
658
ca_type=$(get_ca_type $LOOP_COUNT)
659
if [ "$driver_need" != "" ]; then
663
if [ "$driver_need" == "$HERMON_DRIVER_NEEDED" ]; then
664
device_num=$mlx_dev_num
665
let "mlx_dev_num=$mlx_dev_num + 1"
666
elif [ "$driver_need" == "$CONNECTIB_DRIVER_NEEDED" ]; then
667
device_num=$mlx5_dev_num
668
let "mlx5_dev_num=$mlx5_dev_num + 1"
670
device_num=$mthca_dev_num
671
let "mthca_dev_num=$mthca_dev_num + 1"
674
for port in `seq 1 2`; do
675
link_layer_file="$INFINI_CLASS_PATH/$driver_need$device_num/ports/$port/link_layer"
676
if [ -f $link_layer_file ]; then
677
proto=`cat $link_layer_file 2> /dev/null`
678
if [ "$proto" == "Ethernet" ]; then
679
let "check_port$port=0"
685
# Error counters check
686
ERROR_COUNTER_PRINT=0
687
if [ -f $INFINI_CLASS_PATH/$driver_need$device_num/ports/1/state ] && [ $check_port1 -eq 1 ]; then
688
ERROR_COUNTER_PORT_1=0
690
for i in $INFINI_CLASS_PATH/$driver_need$device_num/ports/1/counters/*error*; do
691
err_cnt=`cat $i 2> /dev/null`
693
if [ $RET_CODE -eq 0 ]; then
694
if [ $err_cnt -gt 20 ]; then
695
let "ERROR_COUNTER_PORT_1=$ERROR_COUNTER_PORT_1 + 1"
698
echo "-W- Failed to read $i file"
702
if [ -f $INFINI_CLASS_PATH/$driver_need$device_num/ports/2/state ] && [ $check_port2 -eq 1 ]; then
704
ERROR_COUNTER_PORT_2=0
705
for i in $INFINI_CLASS_PATH/$driver_need$device_num/ports/2/counters/*error*; do
706
err_cnt=`cat $i 2> /dev/null`
708
if [ $RET_CODE -eq 0 ]; then
709
if [ $err_cnt -gt 20 ]; then
711
let "ERROR_COUNTER_PORT_2=$ERROR_COUNTER_PORT_2 + 1"
714
echo "-W- Failed to read $i file"
719
let ERROR_COUNTER_PORT_2=0
722
let "ERROR_COUNTER=$ERROR_COUNTER_PORT_1 + $ERROR_COUNTER_PORT_2"
723
# Print FAIL only once
724
if [ $ERROR_COUNTER -ne 0 ] && [ $ERROR_COUNTER_PRINT -ne 1 ]; then
725
echo_error_cnt ${LOOP_COUNT} ${ca_type} "${red}FAIL"
726
echo " REASON: found errors in the following counters"
727
ERROR_COUNTER_PRINT=1
731
# List the counters whose value exceeds the threshold (20)
732
if [ $ERROR_COUNTER -ne 0 ]; then
733
# Print a port's counters only if that port has counters above the threshold
734
if [ $ERROR_COUNTER_PORT_1 -ne 0 ]; then
735
echo " Errors in $INFINI_CLASS_PATH/$driver_need$device_num/ports/1/counters"
737
for i in $INFINI_CLASS_PATH/$driver_need$device_num/ports/1/counters/*error*; do
738
err_cnt=`cat $i 2> /dev/null`
740
if [ $RET_CODE -eq 0 ]; then
741
if [ $err_cnt -gt 20 ]; then
742
echo " $(basename $i): $err_cnt";
745
echo "-W- Failed to read $i file"
751
if [ $ERROR_COUNTER_PORT_2 -ne 0 ]; then
752
echo " Errors in $INFINI_CLASS_PATH/$driver_need$device_num/ports/2/counters"
754
for i in $INFINI_CLASS_PATH/$driver_need$device_num/ports/2/counters/*error*; do
755
err_cnt=`cat $i 2> /dev/null`
757
if [ $RET_CODE -eq 0 ]; then
758
if [ $err_cnt -gt 20 ]; then
759
echo " $(basename $i): $err_cnt";
762
echo "-W- Failed to read $i file"
768
if [ $ERROR_COUNTER -eq 0 ]; then
769
result="${green}PASS"
770
if [ $check_port1 -ne 1 ] && [ $check_port2 -ne 1 ]; then
771
result="NA (Eth ports)"
773
echo_error_cnt "${LOOP_COUNT}" "${ca_type}" "${result}"
776
# Reset these variables for other HCAs
777
let ERROR_COUNTER_PORT_1=0
778
let ERROR_COUNTER_PORT_2=0
782
echo_error_cnt ${LOOP_COUNT} ${ca_type} "NA"
784
let "LOOP_COUNT=$LOOP_COUNT + 1"
788
echo "Error Counter Check .................... NA"
792
# Kernel syslog check
793
# Save the output of dmesg in a tmp file
794
if [ $HOST_DRIVER_INIT -eq 1 ]; then
795
dmesg > /tmp/hca_self_test_dmesg.output
796
VAPI_ERROR_COUNT=`egrep oom-\|"Out of Memory"\|tsIb\|VAPI\|THH_\|THHUL\|KERNEL_IB\|IB_NET\|MOD_LNX_SDP /tmp/hca_self_test_dmesg.output 2> /dev/null | grep -v 'SOCK: GETSOCKOPT unimplemented option <2>' | wc -l`
797
OOPS_COUNT=`grep Oops /tmp/hca_self_test_dmesg.output 2> /dev/null | wc -l`
798
KERNEL_PANIC_COUNT=`grep "Kernel panic" /tmp/hca_self_test_dmesg.output 2> /dev/null | wc -l`
800
if [ $VAPI_ERROR_COUNT -eq 0 ] && [ $OOPS_COUNT -eq 0 ] && [ $KERNEL_PANIC_COUNT -eq 0 ]; then
801
echo -e "Kernel Syslog Check .................... ${green}PASS"
804
echo -e "Kernel Syslog Check .................... ${red}FAIL"
807
if [ $OOPS_COUNT -ne 0 ]; then
808
echo " REASON: Kernel syslog reported: Oops "
809
grep Oops /tmp/hca_self_test_dmesg.output | uniq | awk -F'\n' '{print " " $1 }'
811
if [ $KERNEL_PANIC_COUNT -ne 0 ]; then
812
echo " REASON: Kernel syslog reported: Kernel panic "
813
grep "Kernel panic" /tmp/hca_self_test_dmesg.output | uniq | awk -F'\n' '{print " " $1 }'
815
if [ $VAPI_ERROR_COUNT -ne 0 ]; then
816
echo " REASON: Kernel syslog reported: Driver messages "
817
egrep oom-\|"Out of Memory"\|tsIb\|VAPI\|THH_\|THHUL\|KERNEL_IB\|IB_NET\|MOD_LNX_SDP /tmp/hca_self_test_dmesg.output | grep -v 'SOCK: GETSOCKOPT unimplemented option <2>' | uniq | awk -F'\n' '{print " " $1 }'
821
echo "Kernel Syslog Check .................... NA"
827
if [ $NUM_HCAS -ne 0 ]; then
833
# To take care of more than one HCA
834
while [ $LOOP_COUNT -lt $NUM_HCAS ]
836
driver_need=$(get_driver $LOOP_COUNT)
837
ca_type=$(get_ca_type $LOOP_COUNT)
840
if [ "$driver_need" != "" ]; then
841
if [ "$driver_need" == "$HERMON_DRIVER_NEEDED" ]; then
842
device_num=${mlx_dev_num}
843
let "mlx_dev_num=$mlx_dev_num + 1"
844
elif [ "$driver_need" == "$CONNECTIB_DRIVER_NEEDED" ]; then
845
device_num=$mlx5_dev_num
846
let "mlx5_dev_num=$mlx5_dev_num + 1"
848
device_num=${mthca_dev_num}
849
let "mthca_dev_num=$mthca_dev_num + 1"
852
if [ -f "$INFINI_CLASS_PATH/$driver_need$device_num/node_guid" ]; then
853
NODE_GUID=$(sed 's/\([0-9a-f]\)\([0-9a-f]\)\([0-9a-f]\)\([0-9a-f]\)/\1\2:\3\4/g' < $INFINI_CLASS_PATH/$driver_need$device_num/node_guid)
855
PCI_DEVICE=$(lspci 2> /dev/null | grep Mellanox | head -$(expr $LOOP_COUNT + 1) | tail -1 | awk '{print $1}')
856
NODE_GUID1=$(mstflint -d $PCI_DEVICE q 2> /dev/null | grep "GUIDs:" | awk '{print $2}' | sed 's/\([0-9a-f]\)\([0-9a-f]\)\([0-9a-f]\)\([0-9a-f]\)/\1\2:\3\4:/g' | cut -b -23)
858
if [ "$NODE_GUID1" != "" ]; then
859
# Use the GUID obtained from mstflint (NODE_GUID1). The previous
# self-assignment of NODE_GUID discarded that value.
NODE_GUID=${NODE_GUID1}
863
echo "Node GUID on CA #$LOOP_COUNT (${ca_type}) ............... ${NODE_GUID}"
864
let "LOOP_COUNT=$LOOP_COUNT + 1"
868
echo "------------------ DONE ---------------------"
870
#rm -f /tmp/hca_self_test_modprobe.output
871
rm -f /tmp/hca_self_test_dmesg.output
872
rm -f ${G_LSPCI_OUTPUT_FILE}