Commit 21031482 authored by Olivier Berthier's avatar Olivier Berthier Committed by Dylan Baker

framework: Add aborting option when a monitored error is detected

This adds a policy which advises when the user should reboot the system
to avoid noisy test results due to system becoming unstable, for
instance, and therefore continues testing successfully.

To do this, a new module is proposed. A class Monitoring is used for
managing the monitoring rules. Two types of rules, MonitoringFile and
MonitoringLinuxDmesg, derived from the abstract class MonitoringBase,
have been implemented. The first allow to track a pattern on standard
files or locked files. The second, derived from dmesg.LinuxDmesg, will
track a pattern on the dmesg.

The monitoring rules must be defined in piglit.conf at the section
monitored-errors. If one of the regex is found, Piglit will raise a
PiglitAbort exception, stop the test execution -terminating test thread
pool- and exit with code 3.

Then test execution resume, after rebooting the system or not, is done
like usually with command line parameter "resume".

To call it, use command line parameter: --abort-on-monitored-error

This option implies --no-concurrent

This include also a set of unit tests for this module.
Reviewed-by: default avatarDylan Baker <dylanx.c.baker@intel.com>
parent e43a23e7
......@@ -30,6 +30,7 @@ __all__ = [
'PiglitInternalError',
'PiglitFatalError',
'PiglitException',
'PiglitAbort',
'handler',
]
......@@ -51,6 +52,10 @@ def handler(func):
except PiglitFatalError as e:
print('Fatal Error: {}'.format(str(e)), file=sys.stderr)
sys.exit(1)
except PiglitAbort as e:
print('Aborting Piglit execution: {}'.format(str(e)),
file=sys.stderr)
sys.exit(3)
return _inner
......@@ -87,3 +92,12 @@ class PiglitFatalError(Exception):
to the top of the program where it exits.
"""
class PiglitAbort(Exception):
"""Class for non-errors that require piglit aborting.
When this class (or a subclass) is raised it should be raised all the way
to the top of the program where it exits.
"""
# Copyright (c) 2016 Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
"""Module implementing classes for monitoring
This module provides a set of classes for monitoring the system on multiple
sources. The sources can be standard files, locked files or the dmesg.
The patterns of the errors are defined in piglit.conf, at the section
monitored-errors.
When one of the regex is found in the corresponding source Piglit will abort
with code 3.
"""
from __future__ import (
absolute_import, division, print_function, unicode_literals
)
import abc
import errno
import fcntl
import os
import re
import six
from framework.core import PIGLIT_CONFIG
from framework.dmesg import LinuxDmesg
from framework import exceptions
__all__ = [
'BaseMonitoring',
'Monitoring',
'MonitoringFile',
'MonitoringLinuxDmesg',
]
class Monitoring(object):
"""Class that initialize and update monitoring classes
This reads the rules from the section monitored-errors in piglit.conf
and initializes the monitoring objects. Their type must be derived from
BaseMonitoring and specialized according the field 'type'.
"""
# starting time: user must know current machine/GPU state before
# starting piglit: we can resume test execution based on last stopping
# if we consider that our system is still in good shape (ie not blocking
# run forever)
_abort_error = None
_monitoring_rules = None
def __init__(self, monitoring_enabled):
"""Create a LinuxMonitored instance"""
# Get the monitoring rules from piglit.conf and store them into a dict.
self._monitoring_rules = {}
if monitoring_enabled and PIGLIT_CONFIG.has_section('monitored-errors'):
for key, _ in PIGLIT_CONFIG.items('monitored-errors'):
if PIGLIT_CONFIG.has_section(key):
type = PIGLIT_CONFIG.required_get(key, 'type')
regex = PIGLIT_CONFIG.required_get(key, 'regex')
parameters = PIGLIT_CONFIG.required_get(key, 'parameters')
self.add_rule(key, type, parameters, regex)
@property
def abort_needed(self):
"""Simply return if _abort_error variable is not empty"""
return self._abort_error is not None
@property
def error_message(self):
"""Simply return _abort_error message"""
return self._abort_error
def add_rule(self, key, type, parameters, regex):
"""Add a new monitoring rule
This method adds a new monitoring rule. The type must be file,
locked_file or dmesg.
Arguments:
key -- The key for storing the rule in a dict
type -- The type of monitoring
parameters -- Depending on the type, can be a file path or a command
options
regex -- The rule regex
"""
rule = None
if type == 'file':
rule = MonitoringFile(parameters,
regex)
elif type == 'locked_file':
rule = MonitoringFile(parameters,
regex, True)
elif type == 'dmesg':
rule = MonitoringLinuxDmesg(parameters,
regex)
else:
raise exceptions.PiglitFatalError(
"No available monitoring class for the type {}.".format(type))
self._monitoring_rules[key] = rule
def delete_rule(self, key):
"""Remove a monitoring rule
Arguments:
key -- The rule key
"""
self._monitoring_rules.pop(key, None)
def update_monitoring(self):
"""Update the new messages for each monitoring object"""
if self._monitoring_rules:
for monitoring_rule in six.itervalues(self._monitoring_rules):
monitoring_rule.update_monitoring()
def check_monitoring(self):
"""Check monitoring objects statue
This method checks the state for each monitoring object.
If one of them found the pattern in the new messages,
set itself on abort_needed state.
"""
# Get a new snapshot of the source
self.update_monitoring()
if self._monitoring_rules:
for rule_key, monitoring_rule in six.iteritems(
self._monitoring_rules):
# Get error message
self._abort_error = monitoring_rule.check_monitoring()
# if error message is not empty, abort is requested
if self.abort_needed:
self._abort_error = "From the rule {}:\n{}".format(
rule_key,
self._abort_error)
break
@six.add_metaclass(abc.ABCMeta)
class BaseMonitoring(object):
"""Abstract base class for Monitoring derived objects
This provides the bases of the constructor and most subclasses should call
super() in __init__(). It provides a concrete implementation of the
check_monitoring() method, which should be suitible for all subclasses.
The update_monitoring() method need to be override for all subclasses.
"""
@abc.abstractmethod
def __init__(self, monitoring_source, regex):
"""Abstract constructor for BaseMonitoring subclasses
Arguments;
monitoring_source -- The source to monitor
regex -- The errors pattern
"""
self._monitoring_source = monitoring_source
self._monitoring_regex = re.compile(regex)
self._new_messages = ''
@property
def new_messages(self):
"""Simply return if _abort_error variable is not empty"""
return self._new_messages
@abc.abstractmethod
def update_monitoring(self):
"""Update _new_messages list
This method should read a monitoring source like a file or a program
output, determine new entries since the last read of the source,
and update self._new_messages with those entries
"""
pass
def check_monitoring(self):
"""Check _new_messages
This method checks if the regex is found in the new messages,
then return a string that contain the monitored location and
the matched line
"""
if self._new_messages and self._monitoring_regex:
for line in self._new_messages:
if self._monitoring_regex.search(line):
return line
class MonitoringFile(BaseMonitoring):
"""Monitoring from a file
This class is for monitoring the system from a file that
can be a standard file or a locked file. The locked file
algorithm uses fnctl, so it doesn't work on non Unix systems
Arguments:
is_locked -- True if the target is a locked file
"""
_is_locked = False
def __init__(self, monitoring_source, regex, is_locked=False):
"""Create a MonitoringFile instance"""
self._last_message = None
self._is_locked = is_locked
super(MonitoringFile, self).__init__(monitoring_source, regex)
def update_monitoring(self):
"""Open the file and get the differences
Get the contents of the file, then calculate new messages.
This implements also a specific method for reading locked files.
"""
try:
with open(self._monitoring_source, 'r') as f:
lines = []
if self._is_locked:
# Create a duplicated file descriptor, this avoid lock
fd = os.dup(f.fileno())
# use I/O control for reading the lines
fcntl.fcntl(fd, fcntl.F_SETFL, os.O_NONBLOCK)
while True:
try:
line = os.read(fd, 1024)
if not line:
break;
except OSError as e:
if e.errno == errno.EAGAIN:
break
else:
raise e
lines.append(line.decode("utf-8", "strict"))
os.close(fd)
else:
lines = f.read().splitlines()
f.close()
# Find all new entries, do this by slicing the list of the lines to only
# returns elements after the last element stored. If there are not
# matches a value error is raised, that means all of the lines are new
l = 0
for index, item in enumerate(reversed(lines)):
if item == self._last_message:
l = len(lines) - index # don't include the matched element
break
self._new_messages = lines[l:]
# Attempt to store the last element of lines,
# unless there was no line
self._last_message = lines[-1] if lines else None
except Exception:
# if an error occured, we consider there are no new messages
self._new_messages = []
pass
class MonitoringLinuxDmesg(BaseMonitoring, LinuxDmesg):
"""Monitoring on dmesg
This class is for monitoring on the system dmesg. It's inherited
from LinuxDmesg for the dmesg processing methods.
Work only on Linux operating system.
"""
def __init__(self, monitoring_source, regex):
"""Create a MonitoringLinuxDmesg instance"""
self.DMESG_COMMAND = ['dmesg']+monitoring_source.split()
BaseMonitoring.__init__(self, monitoring_source, regex)
LinuxDmesg.__init__(self)
def update_monitoring(self):
"""Call update_dmesg"""
self.update_dmesg()
......@@ -178,6 +178,7 @@ class _Options(object): # pylint: disable=too-many-instance-attributes
exclude_filter -- list of compiled regex which exclude tests that match
valgrind -- True if valgrind is to be used
dmesg -- True if dmesg checking is desired. This forces concurrency off
monitored -- True if monitoring is desired. This forces concurrency off
env -- environment variables set for each test before run
"""
......@@ -192,6 +193,7 @@ class _Options(object): # pylint: disable=too-many-instance-attributes
self.exclude_tests = set()
self.valgrind = False
self.dmesg = False
self.monitored = False
self.sync = False
# env is used to set some base environment variables that are not going
......
......@@ -42,6 +42,7 @@ import six
from framework import grouptools, exceptions, options
from framework.dmesg import get_dmesg
from framework.log import LogManager
from framework.monitoring import Monitoring
from framework.test.base import Test
__all__ = [
......@@ -208,6 +209,8 @@ class TestProfile(object):
self._dmesg = None
self.dmesg = False
self.results_dir = None
self._monitoring = None
self.monitoring = False
@property
def dmesg(self):
......@@ -225,6 +228,22 @@ class TestProfile(object):
"""
self._dmesg = get_dmesg(not_dummy)
@property
def monitoring(self):
""" Return monitoring """
return self._monitoring
@monitoring.setter
def monitoring(self, monitored):
""" Set monitoring
Arguments:
monitored -- if Truthy Monitoring will enable monitoring according the
defined rules
"""
self._monitoring = Monitoring(monitored)
def _prepare_test_list(self):
""" Prepare tests for running
......@@ -309,16 +328,18 @@ class TestProfile(object):
self._prepare_test_list()
log = LogManager(logger, len(self.test_list))
def test(pair):
def test(pair, this_pool=None):
"""Function to call test.execute from map"""
name, test = pair
with backend.write_test(name) as w:
test.execute(name, log.get(), self.dmesg)
test.execute(name, log.get(), self.dmesg, self.monitoring)
w(test.result)
if self._monitoring.abort_needed:
this_pool.terminate()
def run_threads(pool, testlist):
""" Open a pool, close it, and join it """
pool.imap(test, testlist, chunksize)
pool.imap(lambda pair: test(pair, pool), testlist, chunksize)
pool.close()
pool.join()
......@@ -345,6 +366,9 @@ class TestProfile(object):
self._post_run_hook()
if self._monitoring.abort_needed:
raise exceptions.PiglitAbort(self._monitoring.error_message)
def filter_tests(self, function):
"""Filter out tests that return false from the supplied function
......
......@@ -137,6 +137,13 @@ def _run_parser(input_):
action="store_true",
help="Capture a difference in dmesg before and "
"after each test. Implies -1/--no-concurrency")
parser.add_argument("--abort-on-monitored-error",
action="store_true",
dest="monitored",
help="Enable monitoring according the rules defined "
"in piglit.conf, and stop the execution when a "
"monitored error is detected. Exit code 3. "
"Implies -1/--no-concurrency")
parser.add_argument("-s", "--sync",
action="store_true",
help="Sync results to disk after every test")
......@@ -225,7 +232,7 @@ def run(input_):
# If dmesg is requested we must have serial run, this is because dmesg
# isn't reliable with threaded run
if args.dmesg:
if args.dmesg or args.monitored:
args.concurrency = "none"
# Pass arguments into Options
......@@ -235,6 +242,7 @@ def run(input_):
options.OPTIONS.execute = args.execute
options.OPTIONS.valgrind = args.valgrind
options.OPTIONS.dmesg = args.dmesg
options.OPTIONS.monitored = args.monitored
options.OPTIONS.sync = args.sync
# Set the platform to pass to waffle
......@@ -284,6 +292,10 @@ def run(input_):
# Set the dmesg type
if args.dmesg:
profile.dmesg = args.dmesg
if args.monitored:
profile.monitoring = args.monitored
profile.run(args.log_level, backend)
results.time_elapsed.end = time.time()
......@@ -319,6 +331,7 @@ def resume(input_):
options.OPTIONS.execute = results.options['execute']
options.OPTIONS.valgrind = results.options['valgrind']
options.OPTIONS.dmesg = results.options['dmesg']
options.OPTIONS.monitored = results.options['monitored']
options.OPTIONS.sync = results.options['sync']
core.get_config(args.config_file)
......@@ -345,6 +358,9 @@ def resume(input_):
if options.OPTIONS.dmesg:
profile.dmesg = options.OPTIONS.dmesg
if options.OPTIONS.monitored:
profile.monitoring = options.OPTIONS.monitored
# This is resumed, don't bother with time since it won't be accurate anyway
profile.run(results.options['log_level'], backend)
......
......@@ -185,7 +185,7 @@ class Test(object):
assert isinstance(timeout, int)
self.timeout = timeout
def execute(self, path, log, dmesg):
def execute(self, path, log, dmesg, monitoring):
""" Run a test
Run a test, but with features. This times the test, uses dmesg checking
......@@ -195,6 +195,7 @@ class Test(object):
path -- the name of the test
log -- a log.Log instance
dmesg -- a dmesg.BaseDmesg derived class
monitoring -- a monitoring.Monitoring instance
"""
log.start(path)
......@@ -203,9 +204,11 @@ class Test(object):
try:
self.result.time.start = time.time()
dmesg.update_dmesg()
monitoring.update_monitoring()
self.run()
self.result.time.end = time.time()
self.result = dmesg.update_result(self.result)
monitoring.check_monitoring()
# This is a rare case where a bare exception is okay, since we're
# using it to log exceptions
except:
......
......@@ -162,3 +162,32 @@ run_test=./%(test_name)s
[expected-crashes]
; Like expected-failures, but specifies that a test is expected to
; crash.
[monitored-errors]
; Set the monitoring rules for checking when the system need to be rebooted.
; Piglit must be launched with --abort-on-monitored-error
;
; For each activated monitoring rule a section must be created in this file that
; contains the type of monitoring (dmesg, file or locked_file).
; Depending on the type, the parameter 'parameters' is a filename or a list of
; options. The regex is the pattern that causes Piglit aborting when it's found.
; Examples :
;
;i915_error_state
;kmsg_monitoring
;dmesg_error
;
;[i915_error_state]
;type=file
;parameters=/sys/kernel/debug/dri/0/i915_error_state
;regex=^((?!no error state collected).)*$
;
;[kmsg_monitoring]
;type=locked_file
;parameters=/dev/kmsg
;regex=\*ERROR\* ring create req|\*ERROR\* Failed to reset chip
;
;[dmesg_error]
;type=dmesg
;parameters=--level emerg,alert,crit,err,warn,notice
;regex=\*ERROR\* ring create req|\*ERROR\* Failed to reset chip|BUG:|Oops:|turning off the locking correctness validator
......@@ -51,7 +51,7 @@ from framework.test.base import (
WindowResizeMixin,
)
from framework.options import _Options as Options
from framework import log, dmesg
from framework import log, dmesg, monitoring
# pylint: disable=invalid-name
......@@ -410,7 +410,8 @@ class TestExecuteTraceback(object):
test.execute(mock.Mock(spec=six.text_type),
mock.Mock(spec=log.BaseLog),
mock.Mock(spec=dmesg.BaseDmesg))
mock.Mock(spec=dmesg.BaseDmesg),
mock.Mock(spec=monitoring.Monitoring))
cls.test = test.result
......
# Copyright (c) 2016 Intel Corporation
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""Tests for the monitoring module.
This provides tests for the framework.monitoring modules.
"""
from __future__ import (
absolute_import, division, print_function, unicode_literals
)
try:
from unittest import mock
except ImportError:
import mock
import nose.tools as nt
from . import utils
from framework import monitoring, exceptions
class TestMonitoring(object):
"""Tests for Monitoring methods."""
def __init__(self):
"""Setup for TestMonitoring
This create a monitoring.Monitoring instance with monitoring disabled
to avoid reading the rules in piglit.conf.
"""
self.regex = r'\*ERROR\*|BUG:'
self.init_contents = r'foo bar\n'
self.no_error_contents = r'foo bar\n'
self.error_contents = r'BUG:bar\n'
self.monitoring = monitoring.Monitoring(False)
def test_Monitoring_delete_rule(self):
"""monitorin.Monitoring: add and delete rule."""
with utils.tempfile(self.init_contents)