#!/usr/bin/env python3
# IBM_PROLOG_BEGIN_TAG
# This is an automatically generated prolog.
#
# $Source: op-test-framework/testcases/OpTestEEH.py $
#
# OpenPOWER Automated Test Project
#
# Contributors Listed Below - COPYRIGHT 2016
# [+] International Business Machines Corp.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.
#
# IBM_PROLOG_END_TAG
'''
OpTestEEH
---------
This testcase basically tests all OPAL EEH Error injection tests.
- fenced PHB
- frozen PE
'''
import time
import subprocess
import subprocess
import re
import sys
import os
import unittest
import OpTestConfiguration
from common.OpTestSystem import OpSystemState
from common.Exceptions import CommandFailed
import logging
import OpTestLogger
log = OpTestLogger.optest_logger_glob.get_logger(__name__)
EEH_HIT = 0
EEH_MISS = 1
[docs]class EEHRecoveryFailed(Exception):
'''
EEH Recovery failed on thing for reason.
'''
def __init__(self, thing, dev, log=None):
self.thing = thing
self.dev = dev
self.log = log
def __str__(self):
return "%s %s recovery failed: Log=%s" % (self.thing, self.dev, self.log)
[docs]class EEHRemoveFailed(Exception):
'''
EEH Remove failed on thing for reason.
'''
def __init__(self, thing, dev, log=None):
self.thing = thing
self.dev = dev
self.log = log
def __str__(self):
return "%s %s remove failed: Log=%s" % (self.thing, self.dev, self.log)
[docs]class EEHLocCodeFailed(Exception):
'''
Failed to log location code along with failure.
'''
def __init__(self, thing, dev, log=None):
self.thing = thing
self.dev = dev
self.log = log
def __str__(self):
return "%s %s location code failure: Log=%s" % (self.thing, self.dev, self.log)
[docs]class OpTestEEH(unittest.TestCase):
[docs] @classmethod
def setUpClass(cls):
conf = OpTestConfiguration.conf
cls.cv_HOST = conf.host()
cls.cv_IPMI = conf.ipmi()
cls.cv_SYSTEM = conf.system()
# By default test will run on all PHBs/PE's, if one want to skip certain ones mention in this format.
# ['PCI0001', 'PCI0002', 'PCI0003', 'PCI0004', 'PCI0005', 'PCI0030', 'PCI0031', 'PCI0032']
cls.skip_phbs = []
cls.skip_pes = [] # ['0002:00:00.0']
[docs] def setUp(self):
self.cv_SYSTEM.goto_state(OpSystemState.OS)
self.set_con_log_lev_crit()
self.cv_HOST.host_gather_opal_msg_log(console=1)
self.cv_HOST.host_gather_kernel_log(console=1)
self.cv_SYSTEM.console.run_command("dmesg -D")
self.cv_SYSTEM.console.run_command("uname -a")
self.cv_SYSTEM.console.run_command("cat /etc/os-release")
def get_test_pci_domains(self):
root_domain = self.cv_HOST.host_get_root_phb(console=1)
pci_domains = self.cv_HOST.host_get_list_of_pci_domains(console=1)
log.debug(
"Skipping the root phb %s for both fenced/frozen EEH Testcases" % root_domain)
pci_domains.remove(root_domain)
if len(self.skip_phbs) != 0:
log.debug("Skipping the known phbs %s from user" % self.skip_phbs)
pci_domains = [
domain for domain in pci_domains if domain not in self.skip_phbs]
return pci_domains
[docs] def prepare_logs(self):
'''
This function is used to prepare opal and kernel logs to
a reference point, so that we can compare logs for each EEH
iteration easily.
Basically, we throw logs in a temp file and diff them afterwards.
'''
cmd = "cat /sys/firmware/opal/msglog|grep ',[0-4]\]' > /tmp/opal_msglog"
self.cv_SYSTEM.console.run_command_ignore_fail(cmd)
self.cv_SYSTEM.console.run_command("dmesg -C")
[docs] def gather_logs(self):
'''
This function is used to gather opal and kernel logs
for each EEH iteration instead of full logs.
This should make it easier to debug problems as you'll have
the specific log messages that occured for each test.
'''
cmd = "grep ',[0-4]\]' /sys/firmware/opal/msglog | diff - /tmp/opal_msglog"
self.cv_SYSTEM.console.run_command_ignore_fail(cmd)
self.cv_SYSTEM.console.run_command("dmesg")
[docs] def check_eeh_phb_recovery(self, i_domain):
'''
This function is used to actually check the PHB recovery
after an EEH Fenced PHB Error Injection.
We scrape the kernel log for the correct strings.
'''
cmd = "dmesg | grep -i 'EEH: Notify device driver to resume'; echo $?"
tries = 60
for i in range(1, tries+1):
res = self.cv_SYSTEM.console.run_command(cmd)
if int(res[-1]):
log.debug("Waiting for PHB %s EEH Completion: (%d/%d)" %
(i_domain, i, tries))
time.sleep(1)
else:
break
else:
self.gather_logs()
raise EEHRecoveryFailed("EEH recovery failed", i_domain)
tries = 30
domain = i_domain.split("PCI")[1]
for j in range(1, tries+1):
list = self.get_list_of_pci_devices()
for device in list:
if domain in device:
return True
time.sleep(0.2)
else:
return False
[docs] def get_list_of_pci_devices(self):
'''
This function is used to get list of PCI devices
available at that instant of time.
'''
cmd = "ls /sys/bus/pci/devices"
res = self.cv_SYSTEM.console.run_command(cmd)
return res
[docs] def get_dic_of_pe_vs_addr(self):
'''
Get dictionary of pe vs config addr.
e.g.
::
{'0001:0c:00.2': '2', '0001:0b:00.0': 'fb', '0001:0c:00.0': '2'}
'''
pe_dic = {}
# Get list of PE's
console = self.cv_SYSTEM.console
res = console.run_command(
"ls /sys/bus/pci/devices/ | awk {'print $1'}")
if len(self.skip_pes) != 0:
log.debug("Skipping the known PE's %s from user" % self.skip_pes)
for pe in res:
if not pe or pe in self.skip_pes:
continue
cmd = "cat /sys/bus/pci/devices/%s/eeh_pe_config_addr" % pe
addr = console.run_command(cmd)
pe_dic[pe] = ((addr[0]).split("x"))[1]
return pe_dic
[docs] def run_pe_4(self, addr, e, f, phb, pe, con):
'''
Inject error and check for recovery, finally gather logs.
Returns True if PE recovers, False if failed to recover.
'''
self.prepare_logs()
try:
self.inject_error(addr, e, f, phb, pe)
except CommandFailed as cf:
if cf.exitcode != 0:
log.debug("Skipping verification as command failed")
return EEH_MISS
if not self.check_eeh_hit():
return EEH_MISS
else:
log.debug("PE %s EEH hit success" % pe)
return EEH_HIT
[docs] def inject_error(self, addr, e, f, phb, pe):
'''
Inject error:
addr
PE address
e
0
32 bit errors
1
64 bit errors
f (Function)
0
MMIO read
4
CFG read
6
MMIO write
10
CFG write
phb
PHB Index
pe
PE BDF address
'''
cmd = "echo %s:%s:%s:0:0 > /sys/kernel/debug/powerpc/PCI%s/err_injct && lspci -ns %s" % (
addr, e, f, phb, pe)
res = self.cv_SYSTEM.console.run_command(cmd)
[docs] def check_eeh_slot_resets(self):
'''
Check for EEH Slot Reset count. i.e. ::
$ cat /proc/powerpc/eeh | tail -n 1
eeh_slot_resets=10
Returns integer (in above example '10').
'''
cmd = "cat /proc/powerpc/eeh | tail -n 1"
count = self.cv_SYSTEM.console.run_command(cmd)
output = (count[0].split("="))[1]
return output
[docs] def check_eeh_pe_recovery(self, pe):
'''
return True if PE is available in the system
else return False. Which will be useful for
checking the PE after injecting the EEH error
'''
cmd = "dmesg | grep -i 'EEH: Notify device driver to resume'; echo $?"
tries = 60
for i in range(1, tries+1):
res = self.cv_SYSTEM.console.run_command(cmd)
if int(res[-1]):
log.debug("Waiting for PE %s EEH Completion: (%d/%d)" %
(pe, i, tries))
time.sleep(1)
else:
break
else:
self.gather_logs()
raise EEHRecoveryFailed("EEH recovery failed", pe)
tries = 30
for j in range(1, tries+1):
list = self.get_list_of_pci_devices()
for device in list:
if pe in device:
return True
time.sleep(1)
else:
return False
def check_eeh_hit(self):
c = self.cv_SYSTEM.console
tries = 10
for i in range(1, tries+1):
try:
res = c.run_command("dmesg | grep 'EEH: Frozen'")
except CommandFailed as cf:
continue
return True
time.sleep(1)
else:
return False
def check_eeh_removed(self):
tries = 60
c = self.cv_SYSTEM.console
for i in range(1, tries+1):
try:
res = c.run_command("dmesg | grep 'permanently disabled'")
except CommandFailed as cf:
continue
return True
time.sleep(1)
else:
return False
[docs] def set_con_log_lev_crit(self):
'''
EEH verbose is enabled in P9. Until we have a capable of setting console log level in runtime,
or EEH verbose is disabled in P9, we need to disable the console logging to make tests run.
(But at the cost of one additional IPL execution overhead)
TODO: Remove this once we have a way of disabling console log level in runtime without
having an additional IPL.
'''
self.cpu = self.cv_HOST.host_get_proc_gen(console=1)
if self.cpu in ["POWER8", "POWER8E"]:
return
try:
level = "".join(self.cv_SYSTEM.console.run_command(
"nvram -p ibm,skiboot --print-config=log-level-driver"))
except CommandFailed:
level = "5"
if level == "2":
return
self.cv_SYSTEM.console.run_command(
"nvram -p ibm,skiboot --update-config log-level-driver=2")
self.cv_SYSTEM.goto_state(OpSystemState.OFF)
self.cv_SYSTEM.goto_state(OpSystemState.OS)
def verify_location_code_logging(self, pe):
tries = 60
c = self.cv_SYSTEM.console
for i in range(1, tries+1):
try:
res = c.run_command(
"dmesg | grep -i --color=never 'EEH: PE location:'")
found = True
except CommandFailed as cf:
continue
if found:
break
time.sleep(1)
else:
raise EEHLocCodeFailed(
"PE ", pe, "Kernel failed to log the location codes for a PCI EEH error")
matchObj = re.match(
"(.*) EEH: PE location: (.*), PHB.*", res[-1], re.I)
if matchObj:
loc_code = matchObj.group(2)
if loc_code == 'N/A':
log.warning(
"FW/Kernel failed to log the pcie slot/device location code")
[docs]class OpTestEEHbasic_fenced_phb(OpTestEEH):
'''
This testcase has below steps:
1. Get the list of pci PHB domains
2. Get the root PHB domain where the root file system
is installed(We need to skip this as EEH recovery will
fail on root PHB).
3. Start injecting the fenced PHB errors in a for loop
Only one time basic check whether PHB recovered or not
'''
def runTest(self):
pci_domains = self.get_test_pci_domains()
for domain in pci_domains:
self.prepare_logs()
cmd = "echo 0x8000000000000000 > /sys/kernel/debug/powerpc/%s/err_injct_outbound; lspci;" % domain
log.debug(
"=================Injecting the fenced PHB error on PHB: %s=================" % domain)
self.cv_SYSTEM.console.run_command_ignore_fail(cmd)
# Give some time to EEH PCI Error recovery
if self.check_eeh_phb_recovery(domain):
log.debug("PHB %s recovery successful" % domain)
else:
self.gather_logs()
raise EEHRecoveryFailed("PHB domain", domain)
self.gather_logs()
[docs]class OpTestEEHmax_fenced_phb(OpTestEEH):
'''
This testcase has below steps:
1. Get the list of pci PHB domains
2. Get the root PHB domain where the root file system
is installed(We need to skip this as EEH recovery will
fail on root PHB).
3. Set the MAX EEH Freeze count to 1 so that we can
test max EEH Recovery capacity within less for loop executions
By default it is 6,
4. Start injecting the fenced PHB errors two times across all PHB domains
except root PHB.( As we set max EEH Freeze count to 1).
So expectation is first time it should recover and second time
EEH should properly remove the device and OS should not crash.
'''
def runTest(self):
pci_domains = self.get_test_pci_domains()
# Set the max EEH freeze count to 1
cmd = "echo 1 > /sys/kernel/debug/powerpc/eeh_max_freezes"
self.cv_SYSTEM.console.run_command(cmd)
for i in range(0, 2):
for domain in pci_domains:
self.prepare_logs()
cmd = "echo 0x8000000000000000 > /sys/kernel/debug/powerpc/%s/err_injct_outbound; lspci;" % domain
log.debug(
"=================Injecting the fenced PHB error on PHB: %s=================" % domain)
self.cv_SYSTEM.console.run_command_ignore_fail(cmd)
# Give some time to EEH PCI Error recovery
if i == 0:
if self.check_eeh_phb_recovery(domain):
log.debug("PHB %s recovery successful" % domain)
else:
self.gather_logs()
raise EEHRecoveryFailed("PHB domain", domain)
else:
if self.check_eeh_removed():
log.debug(
"PHB domain %s removed successfully" % domain)
else:
self.gather_logs()
raise EEHRemoveFailed("PHB domain", domain)
self.gather_logs()
[docs]class OpTestEEHbasic_frozen_pe(OpTestEEH):
'''
This testcase has below steps
1. Get the list of pci PHB domains
2. Get the root PHB domain where the root file system
is installed(We need to skip this as EEH recovery will
fail on root PHB).
3. get dictionary of pe vs config addr.
Ex: ::
{'0001:0c:00.2': '2', '0001:0b:00.0': 'fb', '0001:0c:00.0': '2'}
4. Prepare below command & Start inject frozenPE errors on all PE's ::
echo "PE_number:<0,1>:<function>:0:0" > /sys/kernel/debug/powerpc/PCIxxxx/err_injct
5. Gather necssary logs(dmesg & OPAL) and check the device(PE) got
recovered or not.
'''
def runTest(self):
pci_domains = self.get_test_pci_domains()
pe_dic = self.get_dic_of_pe_vs_addr()
log.debug(
"==================Testing frozen PE error injection=====================")
# Frequently used function
# 0 : MMIO read
# 4 : CFG read
# 6 : MMIO write
# 10: CFG write
func = [0, 4, 6, 10]
ERROR = [0, 1]
# "pe_no:0:function:address:mask" - 32-bit PCI errors
# "pe_no:1:function:address:mask" - 64-bit PCI errors
# Ex: echo "PE_number:<0,1>:<function>:0:0" > /sys/kernel/debug/powerpc/PCIxxxx/err_injct
# echo 2:0:4:0:0 > /sys/kernel/debug/powerpc/PCI0001/err_injct && lspci -ns 0001:0c:00.0; echo $?
# Inject error on every PE
for pe, addr in list(pe_dic.items()):
count = 0
recover = True
for e in ERROR:
phb = (pe.split(":"))[0]
if count > 5 or not recover:
break
# Skip the PE's under root PHB
if any(phb in s for s in pci_domains):
for f in func:
log.debug(
"===================Running error injection on pe %s func %s===================" % (pe, f))
rc = 1
rc = self.run_pe_4(
addr, e, f, phb, pe, self.cv_SYSTEM.console)
if rc == EEH_MISS:
continue
count += 1
if count < 6: # Upto five times check for recovery
try:
ret = self.check_eeh_pe_recovery(pe)
self.gather_logs()
except EEHRecoveryFailed:
log.error(
"EEH_FAIL: PE %s recovery failed after %d EEH error" % (pe, count))
recover = False
break
if not ret:
log.error(
"EEH_FAIL: PE %s recovery failed after %d EEH error" % (pe, count))
recover = False
break
log.debug(
"PE %s recovered after %d EEH error" % (pe, count))
elif count == 6: # sixth time check for removal
if not self.check_eeh_removed():
log.error(
"EEH_FAIL: PE %s remove failed after 6th EEH hit" % pe)
else:
log.debug(
"PE %s removed successfully after 6th EEH hit" % pe)
self.check_eeh_slot_resets()
self.verify_location_code_logging(pe)
[docs]class OpTestEEHmax_frozen_pe(OpTestEEH):
'''
This testcase has below steps
1. Get the list of pci PHB domains
2. Get the root PHB domain where the root file system
is installed(We need to skip this as EEH recovery will
fail on root PHB).
3. get dictionary of pe vs config addr. Ex: ::
{'0001:0c:00.2': '2', '0001:0b:00.0': 'fb', '0001:0c:00.0': '2'}
4. Prepare below command & Start inject frozenPE errors on all PE's ::
echo "PE_number:<0,1>:<function>:0:0" > /sys/kernel/debug/powerpc/PCIxxxx/err_injct
5. Gather necssary logs(dmesg & OPAL) and check the device(PE) got
recovered or not.
'''
def runTest(self):
pci_domains = self.get_test_pci_domains()
# Set the max EEH freeze count to 1
cmd = "echo 1 > /sys/kernel/debug/powerpc/eeh_max_freezes"
self.cv_SYSTEM.console.run_command(cmd)
pe_dic = self.get_dic_of_pe_vs_addr()
log.debug(
"======================Testing frozen PE error injection===========================")
# Frequently used function
# 0 : MMIO read
# 4 : CFG read
# 6 : MMIO write
# 10: CFG write
func = [0, 4, 6, 10]
ERROR = [0, 1]
# "pe_no:0:function:address:mask" - 32-bit PCI errors
# "pe_no:1:function:address:mask" - 64-bit PCI errors
# Ex: echo "PE_number:<0,1>:<function>:0:0" > /sys/kernel/debug/powerpc/PCIxxxx/err_injct
# echo 2:0:4:0:0 > /sys/kernel/debug/powerpc/PCI0001/err_injct && lspci -ns 0001:0c:00.0; echo $?
# Inject error on every PE
for pe, addr in list(pe_dic.items()):
count = 0
recover = True
for e in ERROR:
if count > 1 or not recover:
break
phb = (pe.split(":"))[0]
# Skip the PE's under root PHB
if any(phb in s for s in pci_domains):
for f in func:
log.debug(
"=====================Running error injection on pe %s func %s====================" % (pe, f))
rc = 1
rc = self.run_pe_4(
addr, e, f, phb, pe, self.cv_SYSTEM.console)
if rc == EEH_MISS:
continue
count += 1
if count == 1: # First time check for recovery
try:
ret = self.check_eeh_pe_recovery(pe)
self.gather_logs()
except EEHRecoveryFailed:
log.error(
"EEH_FAIL: PE %s recovery failed after first EEH error" % pe)
recover = False
break
if not ret:
log.error(
"EEH_FAIL: PE %s recovery failed after first EEH error" % pe)
recover = False
break
else:
log.debug(
"PE % recovered after first EEH error" % pe)
elif count == 2: # Second time check for removal
if not self.check_eeh_removed():
log.error("EEH_FAIL: PE %s remove failed" % pe)
else:
log.debug("PE %s removed successfully" % pe)
self.check_eeh_slot_resets()
self.verify_location_code_logging(pe)
def suite():
s = unittest.TestSuite()
s.addTest(OpTestEEHbasic_fenced_phb())
s.addTest(OpTestEEHmax_fenced_phb())
s.addTest(OpTestEEHbasic_frozen_pe())
s.addTest(OpTestEEHmax_frozen_pe())
return s