~ubuntu-branches/debian/sid/mstflint/sid

« back to all changes in this revision

Viewing changes to small_utils/hca_self_test.ofed

  • Committer: Package Import Robot
  • Author(s): Ana Beatriz Guerrero Lopez
  • Date: 2014-07-04 15:39:23 UTC
  • mfrom: (1.1.1)
  • Revision ID: package-import@ubuntu.com-20140704153923-khknwv3o1jeap3oo
Tags: 3.7.0-1
* New upstream release: 3.7.0-1.10.gdf7ec73
* Add build depends on libibmad-dev and autotools-dev.
* Remove build depends on automake and libtool.
* Switch to dh 9 and source format version 3.0
* Remove placeholder manpages.
* Remove flag DM-Upload-Allowed.
* Remove all current Uploaders, they're welcome back anytime.
  Add myself to Uploaders.
* Bump Standards-Version to 3.9.5 (no changes required).
* Update homepage.
* Add a watch file.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#!/bin/bash
 
2
 
 
3
#
 
4
# Copyright (c) 2006-2007 Cisco Systems. All rights reserved.
 
5
# Copyright (c) 2011      Mellanox Technologies LTD. All rights reserved.
 
6
#
 
7
# This Software is licensed under one of the following licenses:
 
8
#
 
9
# 1) under the terms of the "Common Public License 1.0" a copy of which is
 
10
#    available from the Open Source Initiative, see
 
11
#    http://www.opensource.org/licenses/cpl.php.
 
12
#
 
13
# 2) under the terms of the "The BSD License" a copy of which is
 
14
#    available from the Open Source Initiative, see
 
15
#    http://www.opensource.org/licenses/bsd-license.php.
 
16
#
 
17
# 3) under the terms of the "GNU General Public License (GPL) Version 2" a
 
18
#    copy of which is available from the Open Source Initiative, see
 
19
#    http://www.opensource.org/licenses/gpl-license.php.
 
20
#
 
21
# Licensee has the right to choose one of the above licenses.
 
22
#
 
23
# Redistributions of source code must retain the above copyright
 
24
# notice and one of the license notices.
 
25
#
 
26
# Redistributions in binary form must reproduce both the above copyright
 
27
# notice, one of the license notices in the documentation
 
28
# and/or other materials provided with the distribution.
 
29
#
 
30
#
 
31
# Description: Test health of HCA
 
32
 
 
33
# For colored text
 
34
green='\E[32m'
 
35
red='\E[31m'
 
36
cyan='\E[36m'
 
37
 
 
38
bold_start='\033[1m'
 
39
bold_stop='\033[0m'
 
40
 
 
41
#TODO: ask vlad about the path .
 
42
INSTALL_PREFIX=/usr
 
43
 
 
44
MTHCA_DRIVER_NEEDED=mthca
 
45
HERMON_DRIVER_NEEDED=mlx4_
 
46
CONNECTIB_DRIVER_NEEDED=mlx5_
 
47
DRIVER_NEEDED=$MTHCA_DRIVER_NEEDED
 
48
INFINI_CLASS_PATH=/sys/class/infiniband
 
49
G_LSPCI_OUTPUT_FILE=/tmp/hca_self_test_lspci.output
 
50
 
 
51
########################################################################################
 
52
# Required FW version definitions. These lines below are optionally set by the installer
 
53
ARBEL_FW_NEEDED=
 
54
ARBEL_MF_FW_NEEDED=
 
55
TAVOR_FW_NEEDED=
 
56
SINAI_FW_NEEDED=
 
57
HERMON_FW_NEEDED=
 
58
CX3_FW_NEEDED=
 
59
CX3_PRO_FW_NEEDED=
 
60
CONNECTIB_FW_NEEDED=
 
61
########################################################################################
 
62
 
 
63
 
 
64
# Color echo
 
65
# Print a message preceded by a color escape sequence.
# Arguments:
#   $1 - message text, printed verbatim
#   $2 - color escape sequence (e.g. $green); backslash escapes are interpreted
# Outputs:
#   The color sequence on its own line, then the message, then whatever
#   "tput sgr0" emits to reset the terminal attributes.
# Returns:
#   The exit status of "tput sgr0".
cecho () {
    local message=${1}     # argument 1 - message
    local color=${2}       # argument 2 - color
    echo -e "$color"
    # printf instead of echo so that a message such as "-n" or "-e" is printed
    # literally rather than being consumed as an echo option.
    printf '%s\n' "$message"
    tput sgr0        # Reset to normal
    return
}
 
73
 
 
74
# Exit code
 
75
EXIT_CODE=0
 
76
 
 
77
# Check whether the script is being run as root and exit if otherwise
 
78
# The effective user ID is what decides root privileges; the primary group ID
# (id -g) can be 0 for a non-root account.
if [ "$(id -u)" -ne 0 ]; then
    echo "Error: hca_self_test must be run as root."
    EXIT_CODE=1
    exit $EXIT_CODE
fi
 
83
 
 
84
echo
 
85
g_pkg_cmd="rpm"
 
86
 
 
87
# Get OS type
 
88
if [ -f /etc/redhat-release -o -f /etc/fedora-release ]; then
 
89
    OS_TYPE="RED_HAT"
 
90
elif [ -f /etc/SuSE-release ]; then
 
91
    OS_TYPE="SUSE"
 
92
elif [ -f /etc/debian_version ]; then
 
93
    OS_TYPE="DEBIAN"
 
94
    INSTALL_PREFIX=/opt/topspin
 
95
    g_pkg_cmd="dpkg"
 
96
else
 
97
    echo "Error: hca_self_test does not support this OS."
 
98
    EXIT_CODE=1
 
99
    exit $EXIT_CODE
 
100
fi
 
101
 
 
102
for cmd in  lspci cat id $g_pkg_cmd uname grep ls awk egrep modprobe; do
 
103
    cmd_exist=`which $cmd 2> /dev/null`
 
104
    if [ "$cmd_exist" == "" ]; then
 
105
            echo "Error: $cmd tool was not found in the PATH"
 
106
            exit 1
 
107
    fi
 
108
done
 
109
 
 
110
 
 
111
HCA_LSPCI_NAME="InfiniBand"
 
112
VPI_LSPCI_NAME="Network controller"
 
113
NIC_LSPCI_NAME="Ethernet controller"
 
114
MEM_CON_LSPCI_NAME="Memory controller"
 
115
 
 
116
MEL_LSPCI_NAME_EXP="${HCA_LSPCI_NAME}|${VPI_LSPCI_NAME}|${NIC_LSPCI_NAME}"
 
117
 
 
118
echo "---- Performing Adapter Device Self Test ----"
 
119
 
 
120
# HCA/PCI check
 
121
NUM_IB_DEV=`lspci 2> /dev/null | grep -E "(${MEL_LSPCI_NAME_EXP})(\s\[[0-9]+\])?: Mellanox Technolog" | wc -l`
 
122
# -E is required: the pattern relies on ERE grouping and the ?/+ quantifiers,
# exactly like the NUM_IB_DEV count above. Without it nothing ever matches.
NUM_MEM_CON=$(lspci 2> /dev/null | grep -E "${MEM_CON_LSPCI_NAME}(\s\[[0-9]+\])?: Mellanox Technolog" | wc -l)
 
123
 
 
124
let "NUM_HCAS=$NUM_IB_DEV + $NUM_MEM_CON"
 
125
echo "Number of CAs Detected ................. $NUM_HCAS";
 
126
 
 
127
if [ $NUM_HCAS -ne 0 ]; then
 
128
    if [ $NUM_MEM_CON -ne 0 ]; then
 
129
        echo -e "PCI Device Check ....................... ${red}FAIL"
 
130
        tput sgr0
 
131
        echo "    REASON: jumper set on CA or CA hardware failure"
 
132
        EXIT_CODE=1
 
133
    else
 
134
        echo -e "PCI Device Check ....................... ${green}PASS"
 
135
        tput sgr0
 
136
    fi
 
137
else
 
138
    echo -e "PCI Device Check ....................... ${red}FAIL"
 
139
    tput sgr0
 
140
    echo "    REASON: no CAs in the system"
 
141
    EXIT_CODE=1
 
142
    exit $EXIT_CODE
 
143
fi
 
144
 
 
145
 
 
146
mlx4_core_ko=`modinfo mlx4_core | grep filename | awk '{print $NF}'`
 
147
if [ $OS_TYPE = "DEBIAN" ]; then
 
148
    RPM_CHECK_FAIL=0
 
149
    RPM_CUR_BOOTED_KER=1
 
150
    
 
151
    RPM_USR_VER=`dpkg -s libibverbs1 2> /dev/null | wc -l`
 
152
    RPM_KER_VER=`dpkg -l 2> /dev/null | grep -E "ofed-kernel" | wc -l`
 
153
    RPM_KER_NAME=`echo $mlx4_core_ko | awk -F '/' '{print$4}'`
 
154
    BOOTED_KER=`uname -r`
 
155
    if [ "$BOOTED_KER" != "$RPM_KER_NAME" ]; then
 
156
        RPM_CUR_BOOTED_KER=0
 
157
    fi
 
158
else
 
159
    # RPM check
 
160
    RPM_CHECK_FAIL=0
 
161
    RPM_USR_VER=`rpm -q libibverbs 2> /dev/null | wc -l`
 
162
    RPM_KER_VER=`rpm -qa 2> /dev/null | grep -E "kernel-ib|ofa_kernel" | wc -l`
 
163
    KER_RPM=`rpm -qf $mlx4_core_ko 2> /dev/null | grep -E "kernel-ib|ofa_kernel"`
 
164
 
 
165
    if [ ! -z $KER_RPM ]; then
 
166
        RPM_KER_NAME=`echo $mlx4_core_ko | awk -F '/' '{print$4}'`
 
167
        RPM_CUR_BOOTED_KER=1
 
168
    else
 
169
        if (rpm -q ofa_kernel_dkms > /dev/null 2>&1); then
 
170
            IS_DKMS=1
 
171
            RPM_KER_NAME=`rpm -q --queryformat "[%{NAME}-%{VERSION}]\n" ofa_kernel_dkms`
 
172
            RPM_CUR_BOOTED_KER=1
 
173
        else
 
174
            RPM_CUR_BOOTED_KER=0
 
175
        fi
 
176
    fi
 
177
fi
 
178
OFED_VERSION=$(ofed_info 2> /dev/null | head -1)    
 
179
RPM_KER_ARCH=`uname -m`    
 
180
if [ $OS_TYPE = "RED_HAT" ]; then    
 
181
    BOOTED_KER=`uname -r`    
 
182
elif [ $OS_TYPE = "SUSE" ]; then    
 
183
    # Have to munge uname output a bit.    
 
184
    BOOTED_KER=`uname -r | sed s@-@_@g`    
 
185
fi    
 
186
 
 
187
if [ $RPM_USR_VER -eq 0 ] && [ $RPM_KER_VER -eq 0 ]; then    
 
188
    echo -e "Host Driver RPM Check .................. ${red}FAIL"    
 
189
    tput sgr0    
 
190
    echo "    REASON: no RPMs found"    
 
191
    RPM_CHECK_FAIL=1    
 
192
    EXIT_CODE=1    
 
193
elif [ $RPM_USR_VER -eq 0 ]; then    
 
194
    echo -e "Host Driver RPM Check .................. ${red}FAIL"    
 
195
    tput sgr0    
 
196
    echo "    REASON: no user level RPMs found"    
 
197
    RPM_CHECK_FAIL=1    
 
198
    EXIT_CODE=1    
 
199
elif [ $RPM_KER_VER -eq 0 ]; then    
 
200
    echo -e "Host Driver RPM Check .................. ${red}FAIL"    
 
201
    tput sgr0    
 
202
    echo "    REASON: no kernel level RPMs found"    
 
203
    RPM_CHECK_FAIL=1    
 
204
    EXIT_CODE=1    
 
205
fi    
 
206
 
 
207
if [ $RPM_KER_VER -ne 0 ]; then    
 
208
    if [ $RPM_CUR_BOOTED_KER -eq 0 ]; then    
 
209
        echo -e "Host Driver RPM Check .................. ${red}FAIL"    
 
210
        tput sgr0    
 
211
        echo "    REASON: no RPMs found for currently booted kernel $BOOTED_KER"    
 
212
        RPM_CHECK_FAIL=1    
 
213
        EXIT_CODE=1    
 
214
    fi    
 
215
fi    
 
216
 
 
217
echo "Kernel Arch ............................ $RPM_KER_ARCH"    
 
218
if [ "$RPM_KER_NAME" != "" ]; then    
 
219
    echo "Host Driver Version .................... $OFED_VERSION $RPM_KER_NAME"    
 
220
else    
 
221
    echo "Host Driver Version .................... NA"    
 
222
fi    
 
223
 
 
224
if [ $RPM_CHECK_FAIL -eq 0 ]; then    
 
225
    echo -e "Host Driver RPM Check .................. ${green}PASS"    
 
226
    tput sgr0    
 
227
fi    
 
228
 
 
229
 
 
230
# Print the lspci line that describes the Mellanox adapter at a given index.
# Globals:
#   MEL_LSPCI_NAME_EXP, MEM_CON_LSPCI_NAME - lspci class names to match (read)
#   G_LSPCI_OUTPUT_FILE - file holding the filtered lspci listing (read/written)
#   g_lspci_was_ran     - non-empty once the listing file has been written
# Arguments:
#   $1 - zero-based index into the filtered lspci listing
# Outputs:
#   The selected line on stdout. If the index is past the end of the listing,
#   the last line is printed (head | tail semantics).
function get_curr_ca_pci_info () {
    local loop_cnt=$1
    local current_pci_info

    if [ "${g_lspci_was_ran}" == "" ]; then
        lspci 2> /dev/null | grep -E "(${MEL_LSPCI_NAME_EXP}|${MEM_CON_LSPCI_NAME})(\s\[[0-9]+\])?: Mellanox Technolog" > "${G_LSPCI_OUTPUT_FILE}"
        # This flag only survives when the function runs in the current shell;
        # a caller using command substitution repeats the lspci scan each time.
        g_lspci_was_ran=1
    fi
    current_pci_info=$(head -n "$((loop_cnt + 1))" "${G_LSPCI_OUTPUT_FILE}" | tail -n 1)
    # Quoted on purpose: lspci text may contain glob characters (for example
    # bracketed numbers) that must not be expanded against the current directory.
    echo "${current_pci_info}"
}
 
240
 
 
241
# Classify the adapter at index $1 from its lspci description.
# Prints HCA (InfiniBand or memory-controller entry), VPI, NIC, or N/A when
# none of the known lspci class names appears in the line.
function get_ca_type() {
    loop_cnt=$1
    current_pci_info=$(get_curr_ca_pci_info $loop_cnt)

    # Start from the fallback and test the classes from lowest to highest
    # priority, so a later (stronger) match overrides an earlier one.
    ca_type="N/A"
    [[ ${current_pci_info}  =~ ${NIC_LSPCI_NAME} ]] && ca_type=NIC
    [[ ${current_pci_info}  =~ ${VPI_LSPCI_NAME} ]] && ca_type=VPI
    if [[ ${current_pci_info} =~ ${HCA_LSPCI_NAME} ]] || [[ ${current_pci_info} =~ ${MEM_CON_LSPCI_NAME} ]]; then
        ca_type=HCA
    fi

    echo "${ca_type}"
}
 
257
#get the device ID
 
258
# Print the decimal PCI device ID of the adapter at a given index.
# Arguments:
#   $1 - zero-based adapter index, passed on to get_curr_ca_pci_info
# Outputs:
#   The device ID in decimal on stdout, or an empty line if neither lspci nor
#   mstflint yields one.
function get_device_id {
    local loop_cnt=$1
    local current_pci_info pci_dev hex_id Device_ID

    current_pci_info=$(get_curr_ca_pci_info "$loop_cnt")
    pci_dev=$(echo "$current_pci_info" | awk '{print $1}')

    # "lspci -n" lines look like "<slot> <class>: <vendor>:<device> ...".
    # Compare the slot field exactly (no regex, no substring match) and take
    # the device half of the third field, which also works when lspci prefixes
    # the slot with a PCI domain.
    hex_id=$(lspci -n -d "15b3:" 2> /dev/null \
        | awk -v dev="$pci_dev" '$1 == dev { split($3, id, ":"); print id[2] }' \
        | tail -n 1)

    if [[ "$hex_id" =~ ^[0-9a-fA-F]+$ ]]; then
        # Convert the hexadecimal ID to decimal.
        Device_ID=$((16#$hex_id))
    else
        # Fall back to querying the device itself.
        Device_ID=$(mstflint -d "$pci_dev" q 2> /dev/null | grep "Device ID" | awk '{print $3}')
    fi
    echo "$Device_ID"
}
 
274
#get the HCA NAME
 
275
 
 
276
g_connectx="ConnectX"
 
277
g_connectx3="ConnectX-3"
 
278
g_connectx3_pro="ConnectX-3_Pro"
 
279
g_connectib="Connect-IB"
 
280
g_InfiniHost_III_Ex="InfiniHost_III_Ex"
 
281
g_InfiniHost_III_Ex_memfree="InfiniHost_III_Ex_m"
 
282
g_InfiniHost_III_Lx="InfiniHost_III_Lx"
 
283
g_InfiniHost="InfiniHost"
 
284
 
 
285
 
 
286
# Map a decimal PCI device ID ($1) to the adapter family name and print it.
# IDs that are not in the table are reported as generic ConnectX.
function  get_hca_name {
    dev_id=$1
    ret_val=$g_connectx

    # Flat list of "<device id> <family name>" pairs, walked two at a time.
    set -- \
        25208 "$g_InfiniHost_III_Ex" \
        25218 "$g_InfiniHost_III_Ex_memfree" \
        24204 "$g_InfiniHost_III_Lx" \
        25204 "$g_InfiniHost_III_Lx" \
        23108 "$g_InfiniHost" \
        4099  "$g_connectx3" \
        4103  "$g_connectx3_pro" \
        4113  "$g_connectib"

    while [ $# -ge 2 ]; do
        if [ $dev_id -eq "$1" ]; then
            ret_val=$2
            break
        fi
        shift 2
    done

    echo $ret_val
}
 
308
 
 
309
#get the Driver Name
 
310
# Print the sysfs device-name prefix of the driver that serves an adapter.
# Globals (read):
#   g_connectx, g_connectx3, g_connectx3_pro, g_connectib - family names
#   HERMON_DRIVER_NEEDED, CONNECTIB_DRIVER_NEEDED, MTHCA_DRIVER_NEEDED
# Arguments:
#   $1 - zero-based adapter index
# Outputs:
#   The driver prefix on stdout, or an empty line when the device ID or the
#   family name cannot be determined.
function get_driver {
    local loop_cnt=$1
    local driver_need=""
    local Device_ID hca_name

    # Look up the adapter named by the argument, not whatever the caller's
    # global LOOP_COUNT happens to hold.
    Device_ID=$(get_device_id "$loop_cnt")
    if [ "$Device_ID" != "" ]; then
        # Left unquoted so that only the first ID is passed on if several
        # were reported; get_hca_name reads just its first argument.
        hca_name=$(get_hca_name $Device_ID)
        if [ "$hca_name" != "" ]; then
            if [ "$hca_name" == "$g_connectx" ] || [ "$hca_name" == "$g_connectx3" ] || [ "$hca_name" == "$g_connectx3_pro" ]; then
                driver_need=$HERMON_DRIVER_NEEDED
            elif [ "$hca_name" == "$g_connectib" ]; then
                driver_need=$CONNECTIB_DRIVER_NEEDED
            else
                driver_need=$MTHCA_DRIVER_NEEDED
            fi
        fi
    fi
    echo "$driver_need"
}
 
328
 
 
329
 
 
330
# Compare two firmware versions of the form vA.B.C.
# Arguments:
#   $1 - version found on the adapter
#   $2 - version required
# Outputs:
#   "found" if $1 is newer, "required" if $2 is newer, nothing if all three
#   components are equal.
function compare_fw  {
    local fw_found=$1
    local fw_needed=$2
    local idx want have

    # Walk the components from most to least significant; the first one that
    # differs decides the result.
    for idx in 1 2 3; do
        if [ $idx -eq 1 ]; then
            # The first component carries the leading "v"; drop that byte.
            want=$(echo $fw_needed | cut -f1 -d"." | cut -b 2-)
            have=$(echo $fw_found  | cut -f1 -d"." | cut -b 2-)
        else
            want=$(echo $fw_needed | cut -f$idx -d".")
            have=$(echo $fw_found  | cut -f$idx -d".")
        fi

        if [ $want -gt $have ]; then
            echo "required"
            return
        elif [ $have -gt $want ]; then
            echo "found"
            return
        fi
    done
}
 
356
 
 
357
 
 
358
 
 
359
# HCA firmware check
 
360
# Print one "Firmware Check" result line and reset terminal attributes.
# Arguments:
#   $1 - CA index shown after "#"
#   $2 - CA type shown in parentheses
#   $3 - result text; may start with a color escape sequence
# The arguments are kept in locals so the call cannot overwrite the script's
# global LOOP_COUNT / ca_type / result (for example when an empty, unquoted
# argument at the call site shifts the remaining ones).
echo_fw_check () {
    local loop_count=$1
    local ca_type=$2
    local result=$3
    echo -e "Firmware Check on CA #$loop_count (${ca_type}) .......... ${result}"
    tput sgr0
}
 
367
if [ $NUM_HCAS -ne 0 ]; then
 
368
    mlx_dev_num=0
 
369
    mlx5_dev_num=0
 
370
    mthca_dev_num=0
 
371
    LOOP_COUNT=0
 
372
 
 
373
    # To take care of more than one HCA
 
374
    while [ $LOOP_COUNT -lt $NUM_HCAS ]
 
375
    do
 
376
        ca_type=$(get_ca_type $LOOP_COUNT)
 
377
        if [ $RPM_USR_VER -ne 0 ] && [ $RPM_CUR_BOOTED_KER -ne 0 ]; then
 
378
 
 
379
            #default mthca0
 
380
            device_num=$mthca_dev_num
 
381
 
 
382
            ## get the Device Id
 
383
            PCI_DEVICE=$(lspci 2> /dev/null | grep Mellanox | head -$(expr $LOOP_COUNT + 1) | tail -1 | awk '{print $1}')
 
384
            Device_ID=$(get_device_id $LOOP_COUNT)
 
385
            if [ "$Device_ID" != "" ]; then
 
386
                hca_name=$(get_hca_name $Device_ID)
 
387
                if [ "$hca_name" != "" ]; then
 
388
 
 
389
                   # get the FW and the Expected FW
 
390
                    if [ "$hca_name" == "$g_InfiniHost_III_Ex" ]; then
 
391
                        FW_NEEDED=$ARBEL_FW_NEEDED
 
392
                    elif [ "$hca_name" == $g_InfiniHost_III_Ex_memfree ]; then
 
393
                        FW_NEEDED=$ARBEL_MF_FW_NEEDED
 
394
                    elif [ "$hca_name" == "$g_InfiniHost_III_Lx" ]; then
 
395
                        FW_NEEDED=$SINAI_FW_NEEDED
 
396
                    elif [ "$hca_name" == "$g_connectx" ]; then
 
397
                        FW_NEEDED=$HERMON_FW_NEEDED
 
398
                        DRIVER_NEEDED=$HERMON_DRIVER_NEEDED
 
399
                        device_num=$mlx_dev_num
 
400
                    elif [ "$hca_name" == "$g_connectx3" ]; then
 
401
                        FW_NEEDED=$CX3_FW_NEEDED
 
402
                        DRIVER_NEEDED=$HERMON_DRIVER_NEEDED
 
403
                        device_num=$mlx_dev_num
 
404
                    elif [ "$hca_name" == "$g_connectx3_pro" ]; then
 
405
                        FW_NEEDED=$CX3_PRO_FW_NEEDED
 
406
                        DRIVER_NEEDED=$HERMON_DRIVER_NEEDED
 
407
                        device_num=$mlx_dev_num
 
408
                    elif [ "$hca_name" == "$g_connectib" ]; then
 
409
                        FW_NEEDED=$CONNECTIB_FW_NEEDED
 
410
                        DRIVER_NEEDED=$CONNECTIB_DRIVER_NEEDED
 
411
                        device_num=$mlx5_dev_num
 
412
                    elif [ "$hca_name" == "$g_InfiniHost" ]; then
 
413
                        FW_NEEDED=$TAVOR_FW_NEEDED
 
414
                    fi
 
415
                    legal=$(echo $FW_NEEDED | grep v\[0-9\]\[0-9\]*.\[0-9\]\[0-9\]*.\[0-9\]\[0-9\]*)
 
416
 
 
417
                    # increase the mlx and mthca counter
 
418
                    case "$hca_name" in
 
419
                    "$g_connectx"|"$g_connectx3"|"$g_connectx3_pro")
 
420
                        let "mlx_dev_num=$mlx_dev_num + 1"
 
421
                    ;;
 
422
                    "$g_connectib")
 
423
                        let "mlx5_dev_num=$mlx5_dev_num + 1"
 
424
                    ;;                    *)
 
425
                        let "mthca_dev_num=$mthca_dev_num + 1"
 
426
                    ;;
 
427
                    esac
 
428
 
 
429
                    FW_FOUND=v$(mstflint -d $PCI_DEVICE q 2> /dev/null | grep "FW Version" | awk '{print $3}')
 
430
                    if [ "$FW_FOUND" = "v" ]; then
 
431
                        if [ -f "$INFINI_CLASS_PATH/$DRIVER_NEEDED$device_num/fw_ver" ]; then
 
432
                            FW_FOUND=v`cat $INFINI_CLASS_PATH/$DRIVER_NEEDED$device_num/fw_ver 2> /dev/null`
 
433
                        else
 
434
                            echo_fw_check ${LOOP_COUNT} ${ca_type} "${red}FAIL"
 
435
                            echo "    REASON: CA #$LOOP_COUNT: failed to get firmware version"
 
436
                            EXIT_CODE=1
 
437
                            no_firmware=1
 
438
                        fi
 
439
                    fi
 
440
 
 
441
                    if [ "$no_firmware" != "1" ]; then
 
442
                        echo -e "Firmware on CA #$LOOP_COUNT ${ca_type} .................. $FW_FOUND"
 
443
                        if [ "$FW_NEEDED" == "$legal" -a "$FW_NEEDED" != "" ]; then
 
444
                            if [ "$FW_FOUND" = "$FW_NEEDED" ]; then
 
445
                                echo_fw_check ${LOOP_COUNT} ${ca_type} "${green}PASS"
 
446
                            else
 
447
                                newest=$(compare_fw $FW_FOUND $FW_NEEDED)
 
448
                                if [ "$newest" = "found" ]; then
 
449
                                    echo_fw_check ${LOOP_COUNT} ${ca_type} "${green}PASS"
 
450
                                    echo "    NOTE: The found fw version is higher than the fw included in this package ($FW_NEEDED)"
 
451
                                else
 
452
                                    echo_fw_check ${LOOP_COUNT} ${ca_type} "${red}FAIL"
 
453
                                    echo "    REASON: mismatch CA #$LOOP_COUNT firmware detected (found $FW_FOUND, required $FW_NEEDED)"
 
454
                                    EXIT_CODE=1
 
455
                                fi
 
456
                            fi
 
457
                        else
 
458
                            echo_fw_check ${LOOP_COUNT} ${ca_type} "NA"
 
459
                            if [ "$FW_NEEDED" == "" ]; then
 
460
                                echo "    REASON: NO required fw version"
 
461
                            else
 
462
                                echo "    REASON: Bad required fw version format ($FW_NEEDED)"
 
463
                            fi
 
464
                        fi
 
465
                    fi
 
466
                else
 
467
                    echo_fw_check ${LOOP_COUNT} ${ca_type} "NA"
 
468
                fi
 
469
            else
 
470
                echo_fw_check ${LOOP_COUNT} ${ca_type} "NA"
 
471
            fi
 
472
        else
 
473
            echo_fw_check ${LOOP_COUNT} ${ca_type} "NA"
 
474
        fi
 
475
        let "LOOP_COUNT=$LOOP_COUNT + 1"
 
476
    done
 
477
else
 
478
    echo_fw_check ${LOOP_COUNT} ${ca_type} "${red}FAIL"
 
479
    echo "    REASON: no CAs in the system"
 
480
    EXIT_CODE=1
 
481
fi
 
482
 
 
483
# Check host driver initialization
 
484
HOST_DRIVER_INIT=0
 
485
if [ $NUM_HCAS -ne 0 ] && [ $RPM_CHECK_FAIL -eq 0 ]; then
 
486
    MODPROBE_OUT_FILE="/tmp/hca_self_test_modprobe.output"
 
487
    # Save the output of modprobe ib_ipoib in a tmp file
 
488
    modprobe ib_ipoib &> $MODPROBE_OUT_FILE
 
489
    let RET_CODE=$?
 
490
    if [ $RET_CODE -eq 0 ]; then
 
491
        echo -e "Host Driver Initialization ............. ${green}PASS"
 
492
        tput sgr0
 
493
        HOST_DRIVER_INIT=1
 
494
        # After successful initialization wait for IB SM sweep
 
495
        sleep 5
 
496
    else
 
497
        echo -e "Host Driver Initialization ............. ${red}FAIL"
 
498
        tput sgr0
 
499
        EXIT_CODE=1
 
500
        # "No such device"
 
501
        if [ `grep "No such device" $MODPROBE_OUT_FILE 2> /dev/null | wc -l` -ne 0 ]; then
 
502
            echo "    REASON: host driver initialization reported: No such device"
 
503
        fi
 
504
        # "No such file or directory"
 
505
        if [ `grep "No such file or directory" $MODPROBE_OUT_FILE 2> /dev/null | wc -l` -ne 0 ]; then
 
506
            echo "    REASON: host driver initialization reported: No such file or directory"
 
507
            echo "            It is possible that driver rpm might be missing file(s)"
 
508
        fi
 
509
        # "kernel-module version mismatch"
 
510
        if [ `grep "kernel-module version mismatch" $MODPROBE_OUT_FILE 2> /dev/null | wc -l` -ne 0 ]; then
 
511
            echo "    REASON: host driver initialization reported: kernel-module version mismatch"
 
512
        fi
 
513
        # "unresolved symbol"
 
514
        # Note: Could not test "unresolved symbol" error
 
515
        if [ `grep "unresolved symbol" $MODPROBE_OUT_FILE 2> /dev/null | wc -l` -ne 0 ]; then
 
516
            echo "    REASON: host driver initialization reported: unresolved symbol"
 
517
        fi
 
518
    fi
 
519
else
 
520
    echo "Host Driver Initialization ............. NA"
 
521
    EXIT_CODE=1
 
522
fi
 
523
 
 
524
# Print the link layer of a port as recorded under INFINI_CLASS_PATH.
# Globals:
#   INFINI_CLASS_PATH - root directory of the per-device entries (read)
# Arguments:
#   $1 - device name (directory name below INFINI_CLASS_PATH)
#   $2 - port number
# Outputs:
#   Contents of <device>/ports/<port>/link_layer, or "IB" when that file
#   does not exist.
function get_link_layer() {
    local device_name=$1
    local port_num=$2
    local link_layer_file="$INFINI_CLASS_PATH/${device_name}/ports/${port_num}/link_layer"
    # Default value
    local link_layer=IB

    # Paths are quoted so a directory containing whitespace still works.
    if [ -f "${link_layer_file}" ]; then
        link_layer=$(cat "${link_layer_file}" 2> /dev/null)
    fi
    echo "${link_layer}"
}
 
535
 
 
536
# Print the state line for one port of one adapter.
# Globals (read):
#   INFINI_CLASS_PATH - root directory of the per-device entries
#   green, cyan, red  - color escape sequences
# Arguments:
#   $1 - port number
#   $2 - CA index shown in the message
#   $3 - device name (directory name below INFINI_CLASS_PATH)
#   $4 - CA type shown in the message
# Outputs:
#   One "Port State of Port ..." line, or nothing if the port has no state
#   file. All working variables are local so that the caller's LOOP_COUNT and
#   ca_type are never modified.
function report_port_state() {
    local port_num=$1
    local loop_count=$2
    local device_name=$3
    local ca_type=$4

    local port_dir="$INFINI_CLASS_PATH/${device_name}/ports/${port_num}"
    local port_state_file="${port_dir}/state"
    local port_rate_file="${port_dir}/rate"
    local port_state port_speed link_layer state

    # No state file: there is nothing to report for this port.
    if [ ! -f "${port_state_file}" ]; then
        return 0
    fi

    # The state name is the text that follows ": " in the state file.
    port_state=$(awk -F": " '{print $2}' "${port_state_file}" 2> /dev/null)
    link_layer=$(get_link_layer "${device_name}" "${port_num}")

    case "$port_state" in
        ACTIVE)
            # The speed is the parenthesised part of the rate file.
            port_speed=$(awk -F\( '{print $2}' "${port_rate_file}" 2> /dev/null | sed 's/)//')
            state="${green}UP $port_speed"
            ;;
        INIT)
            state="${cyan}INIT"
            ;;
        *)
            state="${red}DOWN"
            ;;
    esac

    echo -e "Port State of Port #${port_num} on CA #$loop_count (${ca_type})..... ${state} (${link_layer})"
    tput sgr0
}
 
565
 
 
566
# Port info
 
567
if [ $HOST_DRIVER_INIT -eq 1 ]; then
 
568
    NUM_HCAS_PROC=`ls $INFINI_CLASS_PATH 2> /dev/null | wc -l`
 
569
    LOOP_COUNT=0
 
570
    NUM_PORT_ACTIVE=0
 
571
 
 
572
    mlx_dev_num=0
 
573
    mlx5_dev_num=0
 
574
    mthca_dev_num=0
 
575
    LOOP_COUNT=0
 
576
 
 
577
    # To take care of multiple HCAs
 
578
    while [ $LOOP_COUNT -lt $NUM_HCAS_PROC ]
 
579
    do
 
580
        driver_need=$(get_driver $LOOP_COUNT)
 
581
         if [ "$driver_need" != "" ]; then
 
582
             if [ "$driver_need" == "$HERMON_DRIVER_NEEDED" ]; then
 
583
                  device_num=$mlx_dev_num
 
584
                  let "mlx_dev_num=$mlx_dev_num + 1"
 
585
             elif [ "$driver_need" == "$CONNECTIB_DRIVER_NEEDED" ]; then
 
586
                  device_num=$mlx5_dev_num
 
587
                  let "mlx5_dev_num=$mlx5_dev_num + 1"
 
588
              else
 
589
                  device_num=$mthca_dev_num
 
590
                  let "mthca_dev_num=$mthca_dev_num + 1"
 
591
             fi
 
592
 
 
593
             if [ -f $INFINI_CLASS_PATH/$driver_need$device_num/ports/1/state ]; then
 
594
                let "NUM_PORT_ACTIVE+=`grep ACTIVE $INFINI_CLASS_PATH/$driver_need$device_num/ports/1/state 2> /dev/null | wc -l`"
 
595
             fi
 
596
             if [ -f $INFINI_CLASS_PATH/$driver_need$device_num/ports/2/state ]; then
 
597
                let "NUM_PORT_ACTIVE+=`grep ACTIVE $INFINI_CLASS_PATH/$driver_need$device_num/ports/2/state 2> /dev/null | wc -l`"
 
598
             fi
 
599
        fi
 
600
 
 
601
        let "LOOP_COUNT=$LOOP_COUNT + 1"
 
602
    done
 
603
    echo "Number of CA Ports Active .............. $NUM_PORT_ACTIVE"
 
604
 
 
605
 
 
606
    # Get each port state
 
607
 
 
608
    mlx_dev_num=0
 
609
    mthca_dev_num=0
 
610
    LOOP_COUNT=0
 
611
    mlx5_dev_num=0
 
612
    while [ $LOOP_COUNT -lt $NUM_HCAS_PROC ]
 
613
    do
 
614
 
 
615
        driver_need=$(get_driver $LOOP_COUNT)
 
616
        ca_type=$(get_ca_type $LOOP_COUNT)
 
617
        if [ "$driver_need" != "" ]; then
 
618
            if [ "$driver_need" == "$HERMON_DRIVER_NEEDED" ]; then
 
619
                device_num=$mlx_dev_num
 
620
                let "mlx_dev_num=$mlx_dev_num + 1"
 
621
            elif [ "$driver_need" == "$CONNECTIB_DRIVER_NEEDED" ]; then
 
622
                  device_num=$mlx5_dev_num
 
623
                  let "mlx5_dev_num=$mlx5_dev_num + 1"
 
624
            else
 
625
                device_num=$mthca_dev_num
 
626
                let "mthca_dev_num=$mthca_dev_num + 1"
 
627
            fi
 
628
            device_name=${driver_need}${device_num}
 
629
            report_port_state 1 ${LOOP_COUNT} ${device_name} ${ca_type}
 
630
            report_port_state 2 ${LOOP_COUNT} ${device_name} ${ca_type}
 
631
        fi
 
632
        let "LOOP_COUNT=$LOOP_COUNT + 1"
 
633
    done
 
634
else
 
635
    echo "Number of CA Ports Active .............. NA"
 
636
fi
 
637
 
 
638
# -D-
 
639
 
 
640
# Error counters check
 
641
 
 
642
# Print one "Error Counter Check" result line and reset terminal attributes.
# Arguments:
#   $1 - CA index shown after "#"
#   $2 - CA type shown in parentheses
#   $3 - result text; may start with a color escape sequence
# The arguments are kept in locals so the call cannot overwrite the script's
# global LOOP_COUNT / ca_type / result while the caller is still looping.
echo_error_cnt () {
    local loop_count=$1
    local ca_type=$2
    local result=$3
    echo -e "Error Counter Check on CA #$loop_count (${ca_type})...... ${result}"
    tput sgr0
}
 
649
if [ $HOST_DRIVER_INIT -eq 1 ]; then
 
650
 
 
651
    mlx_dev_num=0
 
652
    mthca_dev_num=0
 
653
    LOOP_COUNT=0
 
654
    mlx5_dev_num=0
 
655
    while [ $LOOP_COUNT -lt $NUM_HCAS_PROC ]; do
 
656
 
 
657
        driver_need=$(get_driver $LOOP_COUNT)
 
658
        ca_type=$(get_ca_type $LOOP_COUNT)
 
659
        if [ "$driver_need" != "" ]; then
 
660
            check_port1=1
 
661
            check_port2=1
 
662
 
 
663
            if [ "$driver_need" == "$HERMON_DRIVER_NEEDED" ]; then
 
664
                 device_num=$mlx_dev_num
 
665
                 let "mlx_dev_num=$mlx_dev_num + 1"
 
666
            elif [ "$driver_need" == "$CONNECTIB_DRIVER_NEEDED" ]; then
 
667
                  device_num=$mlx5_dev_num
 
668
                  let "mlx5_dev_num=$mlx5_dev_num + 1"
 
669
            else
 
670
                 device_num=$mthca_dev_num
 
671
                 let "mthca_dev_num=$mthca_dev_num + 1"
 
672
            fi
 
673
 
 
674
            for port in `seq 1 2`; do
 
675
                link_layer_file="$INFINI_CLASS_PATH/$driver_need$device_num/ports/$port/link_layer"
 
676
                if [ -f $link_layer_file ]; then
 
677
                    proto=`cat $link_layer_file 2> /dev/null`
 
678
                    if [ "$proto" == "Ethernet" ]; then
 
679
                        let "check_port$port=0"
 
680
                    fi
 
681
                fi
 
682
            done
 
683
 
 
684
 
 
685
            # Error counters check
 
686
            ERROR_COUNTER_PRINT=0
 
687
            if [ -f $INFINI_CLASS_PATH/$driver_need$device_num/ports/1/state ] && [ $check_port1 -eq 1 ]; then
 
688
                ERROR_COUNTER_PORT_1=0
 
689
 
 
690
                for i in $INFINI_CLASS_PATH/$driver_need$device_num/ports/1/counters/*error*; do
 
691
                    err_cnt=`cat $i 2> /dev/null`
 
692
                    RET_CODE=$?
 
693
                    if [ $RET_CODE -eq 0 ]; then
 
694
                        if [ $err_cnt -gt 20 ]; then
 
695
                            let "ERROR_COUNTER_PORT_1=$ERROR_COUNTER_PORT_1 + 1"
 
696
                        fi;
 
697
                    else
 
698
                        echo "-W- Failed to read $i file"
 
699
                    fi
 
700
                done
 
701
            fi
 
702
            if [ -f $INFINI_CLASS_PATH/$driver_need$device_num/ports/2/state ] && [ $check_port2 -eq 1 ]; then
 
703
 
 
704
                ERROR_COUNTER_PORT_2=0
 
705
                for i in $INFINI_CLASS_PATH/$driver_need$device_num/ports/2/counters/*error*; do
 
706
                    err_cnt=`cat $i 2> /dev/null`
 
707
                    RET_CODE=$?
 
708
                    if [ $RET_CODE -eq 0 ]; then
 
709
                        if [ $err_cnt -gt 20 ]; then
 
710
 
 
711
                            let "ERROR_COUNTER_PORT_2=$ERROR_COUNTER_PORT_2 + 1"
 
712
                        fi;
 
713
                    else
 
714
                        echo "-W- Failed to read $i file"
 
715
                    fi
 
716
                done
 
717
 
 
718
            else
 
719
                let ERROR_COUNTER_PORT_2=0
 
720
            fi
 
721
 
 
722
            let "ERROR_COUNTER=$ERROR_COUNTER_PORT_1 + $ERROR_COUNTER_PORT_2"
 
723
            # Print FAIL only once
 
724
            if [ $ERROR_COUNTER -ne 0 ] && [ $ERROR_COUNTER_PRINT -ne 1 ]; then
 
725
                echo_error_cnt ${LOOP_COUNT} ${ca_type} "${red}FAIL"
 
726
                echo "    REASON: found errors in the following counters"
 
727
                ERROR_COUNTER_PRINT=1
 
728
                EXIT_CODE=1
 
729
            fi
 
730
 
 
731
            # List the counters which are non-zero
 
732
            if [ $ERROR_COUNTER -ne 0 ]; then
 
733
                # Print only if error counters are non-zero of a specific IB port
 
734
                if [ $ERROR_COUNTER_PORT_1 -ne 0 ]; then
 
735
                    echo "      Errors in $INFINI_CLASS_PATH/$driver_need$device_num/ports/1/counters"
 
736
 
 
737
                    for i in $INFINI_CLASS_PATH/$driver_need$device_num/ports/1/counters/*error*; do
 
738
                        err_cnt=`cat $i 2> /dev/null`
 
739
                        RET_CODE=$?
 
740
                        if [ $RET_CODE -eq 0 ]; then
 
741
                            if [ $err_cnt -gt 20 ]; then
 
742
                                echo "         $(basename $i): $err_cnt";
 
743
                            fi;
 
744
                        else
 
745
                            echo "-W- Failed to read $i file"
 
746
                        fi
 
747
                    done
 
748
 
 
749
                fi
 
750
 
 
751
                if [ $ERROR_COUNTER_PORT_2 -ne 0 ]; then
 
752
                    echo "      Errors in $INFINI_CLASS_PATH/$driver_need$device_num/ports/2/counters"
 
753
 
 
754
                    for i in $INFINI_CLASS_PATH/$driver_need$device_num/ports/2/counters/*error*; do
 
755
                        err_cnt=`cat $i 2> /dev/null`
 
756
                        RET_CODE=$?
 
757
                        if [ $RET_CODE -eq 0 ]; then
 
758
                            if [ $err_cnt -gt 20 ]; then
 
759
                                echo "         $(basename $i): $err_cnt";
 
760
                            fi;
 
761
                        else
 
762
                            echo "-W- Failed to read $i file"
 
763
                        fi
 
764
                    done
 
765
                fi
 
766
            fi
 
767
 
 
768
            if [ $ERROR_COUNTER -eq 0 ]; then
 
769
                result="${green}PASS"
 
770
                if [ $check_port1 -ne 1 ] && [ $check_port2 -ne 1 ]; then
 
771
                    result="NA (Eth ports)"
 
772
                fi
 
773
                echo_error_cnt "${LOOP_COUNT}" "${ca_type}" "${result}"
 
774
            fi
 
775
 
 
776
            # Reset these variables for other HCAs
 
777
            let ERROR_COUNTER_PORT_1=0
 
778
            let ERROR_COUNTER_PORT_2=0
 
779
 
 
780
 
 
781
        else
 
782
            echo_error_cnt ${LOOP_COUNT} ${ca_type} "NA"
 
783
        fi
 
784
        let "LOOP_COUNT=$LOOP_COUNT + 1"
 
785
    done
 
786
 
 
787
else
 
788
    echo "Error Counter Check .................... NA"
 
789
fi
 
790
 
 
791
 
 
792
# Kernel syslog check
 
793
# Save the output of dmesg in a tmp file
 
794
if [ $HOST_DRIVER_INIT -eq 1 ]; then
 
795
    dmesg > /tmp/hca_self_test_dmesg.output
 
796
    VAPI_ERROR_COUNT=`egrep oom-\|"Out of Memory"\|tsIb\|VAPI\|THH_\|THHUL\|KERNEL_IB\|IB_NET\|MOD_LNX_SDP /tmp/hca_self_test_dmesg.output 2> /dev/null | grep -v 'SOCK: GETSOCKOPT unimplemented option <2>' | wc -l`
 
797
    OOPS_COUNT=`grep Oops /tmp/hca_self_test_dmesg.output 2> /dev/null | wc -l`
 
798
    KERNEL_PANIC_COUNT=`grep "Kernel panic" /tmp/hca_self_test_dmesg.output 2> /dev/null | wc -l`
 
799
 
 
800
    if [ $VAPI_ERROR_COUNT -eq 0 ] && [ $OOPS_COUNT -eq 0 ] && [ $KERNEL_PANIC_COUNT -eq 0 ]; then
 
801
        echo -e "Kernel Syslog Check .................... ${green}PASS"
 
802
        tput sgr0
 
803
    else
 
804
        echo -e "Kernel Syslog Check .................... ${red}FAIL"
 
805
        tput sgr0
 
806
        EXIT_CODE=1
 
807
        if [ $OOPS_COUNT -ne 0 ]; then
 
808
            echo "    REASON: Kernel syslog reported: Oops "
 
809
            grep Oops /tmp/hca_self_test_dmesg.output | uniq | awk -F'\n' '{print "      " $1 }'
 
810
        fi
 
811
        if [ $KERNEL_PANIC_COUNT -ne 0 ]; then
 
812
            echo "    REASON: Kernel syslog reported: Kernel panic "
 
813
            grep "Kernel panic" /tmp/hca_self_test_dmesg.output | uniq | awk -F'\n' '{print "      " $1 }'
 
814
        fi
 
815
        if [ $VAPI_ERROR_COUNT -ne 0 ]; then
 
816
            echo "    REASON: Kernel syslog reported: Driver messages "
 
817
            egrep oom-\|"Out of Memory"\|tsIb\|VAPI\|THH_\|THHUL\|KERNEL_IB\|IB_NET\|MOD_LNX_SDP /tmp/hca_self_test_dmesg.output | grep -v 'SOCK: GETSOCKOPT unimplemented option <2>' | uniq | awk -F'\n' '{print "      " $1 }'
 
818
        fi
 
819
    fi
 
820
else
 
821
    echo "Kernel Syslog Check .................... NA"
 
822
fi
 
823
 
 
824
 
 
825
# Get the Node GUID of every HCA
#
# print_node_guids
#   Print one "Node GUID on CA #<n> (<type>) ..." line per detected HCA.
#   The GUID is read from $INFINI_CLASS_PATH/<driver><index>/node_guid when
#   that file exists; otherwise it is queried from the device with mstflint.
#   "NA" is printed when neither source yields a GUID.
#   Globals:  NUM_HCAS, INFINI_CLASS_PATH, HERMON_DRIVER_NEEDED,
#             CONNECTIB_DRIVER_NEEDED (all read)
#   Calls:    get_driver, get_ca_type (defined elsewhere in this script)
#   Outputs:  one line per HCA to stdout
print_node_guids() {
    # Inserts ':' between the two bytes of every 4-hex-digit group.
    local split_bytes_re='s/\([0-9a-f]\)\([0-9a-f]\)\([0-9a-f]\)\([0-9a-f]\)/\1\2:\3\4/g'
    local hca_idx=0
    # Per-driver device counters: device directories are numbered per driver.
    local mlx_dev_num=0 mlx5_dev_num=0 mthca_dev_num=0
    local driver_need ca_type device_num guid_file pci_device flash_guid node_guid

    [ "${NUM_HCAS:-0}" -ne 0 ] || return 0

    # To take care of more than one HCA
    while [ "$hca_idx" -lt "$NUM_HCAS" ]; do
        driver_need=$(get_driver "$hca_idx")
        ca_type=$(get_ca_type "$hca_idx")
        node_guid="NA"

        if [ -n "$driver_need" ]; then
            if [ "$driver_need" == "$HERMON_DRIVER_NEEDED" ]; then
                device_num=$mlx_dev_num
                mlx_dev_num=$((mlx_dev_num + 1))
            elif [ "$driver_need" == "$CONNECTIB_DRIVER_NEEDED" ]; then
                device_num=$mlx5_dev_num
                mlx5_dev_num=$((mlx5_dev_num + 1))
            else
                device_num=$mthca_dev_num
                mthca_dev_num=$((mthca_dev_num + 1))
            fi

            guid_file="$INFINI_CLASS_PATH/$driver_need$device_num/node_guid"
            if [ -f "$guid_file" ]; then
                node_guid=$(sed "$split_bytes_re" < "$guid_file")
            else
                # Fallback: pick the (hca_idx+1)-th Mellanox entry from lspci
                # and ask mstflint for the GUIDs of that device.
                pci_device=$(lspci 2> /dev/null | grep Mellanox \
                    | head -n "$((hca_idx + 1))" | tail -n 1 | awk '{print $1}')
                flash_guid=""
                if [ -n "$pci_device" ]; then
                    # The sed appends ':' after every byte pair; cut keeps the
                    # first 23 bytes, i.e. 8 byte pairs without a trailing ':'.
                    flash_guid=$(mstflint -d "$pci_device" q 2> /dev/null \
                        | grep "GUIDs:" | awk '{print $2}' \
                        | sed 's/\([0-9a-f]\)\([0-9a-f]\)\([0-9a-f]\)\([0-9a-f]\)/\1\2:\3\4:/g' \
                        | cut -b -23)
                fi
                # Use the queried GUID (previously the value was self-assigned
                # and the mstflint result was dropped).
                if [ -n "$flash_guid" ]; then
                    node_guid=$flash_guid
                fi
            fi
        fi

        echo "Node GUID on CA #$hca_idx (${ca_type}) ............... ${node_guid}"
        hca_idx=$((hca_idx + 1))
    done
}

print_node_guids
 
868
# End of report: print the closing banner, remove scratch files and exit
# with the accumulated status in EXIT_CODE (0 when it was never set).
echo "------------------ DONE ---------------------"
echo
#rm -f /tmp/hca_self_test_modprobe.output
rm -f -- /tmp/hca_self_test_dmesg.output
# G_LSPCI_OUTPUT_FILE is set elsewhere in this script; quoted so the path is
# not word-split or glob-expanded, and skipped when it is empty.
if [ -n "${G_LSPCI_OUTPUT_FILE}" ]; then
    rm -f -- "${G_LSPCI_OUTPUT_FILE}"
fi
exit "${EXIT_CODE:-0}"