You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
qemu/scripts/arm_processor_error.py

477 lines
18 KiB
Python

#!/usr/bin/env python3
#
# pylint: disable=C0301,C0114,R0903,R0912,R0913,R0914,R0915,W0511
# SPDX-License-Identifier: GPL-2.0-or-later
#
# Copyright (C) 2024-2025 Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
# TODO: current implementation has dummy defaults.
#
# For a better implementation, a QMP addition/call is needed to
# retrieve some data for ARM Processor Error injection:
#
# - ARM registers: power_state, mpidr.
"""
Generate an ARM processor error CPER, compatible with
UEFI 2.9A Errata.
Injecting such errors can be done using:
$ ./scripts/ghes_inject.py arm
Error injected.
Produces a simple CPER record, as reported by a Linux guest:
[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1
[Hardware Error]: event severity: recoverable
[Hardware Error]: Error 0, type: recoverable
[Hardware Error]: section_type: ARM processor error
[Hardware Error]: MIDR: 0x0000000000000000
[Hardware Error]: running state: 0x0
[Hardware Error]: Power State Coordination Interface state: 0
[Hardware Error]: Error info structure 0:
[Hardware Error]: num errors: 2
[Hardware Error]: error_type: 0x02: cache error
[Hardware Error]: error_info: 0x000000000091000f
[Hardware Error]: transaction type: Data Access
[Hardware Error]: cache error, operation type: Data write
[Hardware Error]: cache level: 2
[Hardware Error]: processor context not corrupted
[Firmware Warn]: GHES: Unhandled processor error type 0x02: cache error
The ARM Processor Error message can be customized via command line
parameters. For instance:
$ ./scripts/ghes_inject.py arm --mpidr 0x444 --running --affinity 1 \
--error-info 12345678 --vendor 0x13,123,4,5,1 --ctx-array 0,1,2,3,4,5 \
-t cache tlb bus micro-arch tlb,micro-arch
Error injected.
Injects this error, as detected on a Linux guest:
[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1
[Hardware Error]: event severity: recoverable
[Hardware Error]: Error 0, type: recoverable
[Hardware Error]: section_type: ARM processor error
[Hardware Error]: MIDR: 0x0000000000000000
[Hardware Error]: Multiprocessor Affinity Register (MPIDR): 0x0000000000000000
[Hardware Error]: error affinity level: 0
[Hardware Error]: running state: 0x1
[Hardware Error]: Power State Coordination Interface state: 0
[Hardware Error]: Error info structure 0:
[Hardware Error]: num errors: 2
[Hardware Error]: error_type: 0x02: cache error
[Hardware Error]: error_info: 0x0000000000bc614e
[Hardware Error]: cache level: 2
[Hardware Error]: processor context not corrupted
[Hardware Error]: Error info structure 1:
[Hardware Error]: num errors: 2
[Hardware Error]: error_type: 0x04: TLB error
[Hardware Error]: error_info: 0x000000000054007f
[Hardware Error]: transaction type: Instruction
[Hardware Error]: TLB error, operation type: Instruction fetch
[Hardware Error]: TLB level: 1
[Hardware Error]: processor context not corrupted
[Hardware Error]: the error has not been corrected
[Hardware Error]: PC is imprecise
[Hardware Error]: Error info structure 2:
[Hardware Error]: num errors: 2
[Hardware Error]: error_type: 0x08: bus error
[Hardware Error]: error_info: 0x00000080d6460fff
[Hardware Error]: transaction type: Generic
[Hardware Error]: bus error, operation type: Generic read (type of instruction or data request cannot be determined)
[Hardware Error]: affinity level at which the bus error occurred: 1
[Hardware Error]: processor context corrupted
[Hardware Error]: the error has been corrected
[Hardware Error]: PC is imprecise
[Hardware Error]: Program execution can be restarted reliably at the PC associated with the error.
[Hardware Error]: participation type: Local processor observed
[Hardware Error]: request timed out
[Hardware Error]: address space: External Memory Access
[Hardware Error]: memory access attributes:0x20
[Hardware Error]: access mode: secure
[Hardware Error]: Error info structure 3:
[Hardware Error]: num errors: 2
[Hardware Error]: error_type: 0x10: micro-architectural error
[Hardware Error]: error_info: 0x0000000078da03ff
[Hardware Error]: Error info structure 4:
[Hardware Error]: num errors: 2
[Hardware Error]: error_type: 0x14: TLB error|micro-architectural error
[Hardware Error]: Context info structure 0:
[Hardware Error]: register context type: AArch64 EL1 context registers
[Hardware Error]: 00000000: 00000000 00000000
[Hardware Error]: Vendor specific error info has 5 bytes:
[Hardware Error]: 00000000: 13 7b 04 05 01 .{...
[Firmware Warn]: GHES: Unhandled processor error type 0x02: cache error
[Firmware Warn]: GHES: Unhandled processor error type 0x04: TLB error
[Firmware Warn]: GHES: Unhandled processor error type 0x08: bus error
[Firmware Warn]: GHES: Unhandled processor error type 0x10: micro-architectural error
[Firmware Warn]: GHES: Unhandled processor error type 0x14: TLB error|micro-architectural error
"""
import argparse
import re
from qmp_helper import qmp, util, cper_guid
class ArmProcessorEinj:
    """
    Implements ARM Processor Error injection via GHES.

    The constructor registers the ``arm`` sub-command on an argparse
    sub-parser collection; ``send_cper()`` is installed as the sub-command
    handler and turns the parsed arguments into the binary payload that is
    sent through QMP.
    """

    DESC = """
    Generates an ARM processor error CPER, compatible with
    UEFI 2.9A Errata.
    """

    # Size in bytes of the fixed header written by send_cper()
    # (4 + 2 + 2 + 4 + 1 + 3 + 8 + 8 + 4 + 4).
    ACPI_GHES_ARM_CPER_LENGTH = 40

    # Size in bytes of each Processor Error Information (PEI) entry
    # (1 + 1 + 2 + 1 + 2 + 1 + 8 + 8 + 8).
    ACPI_GHES_ARM_CPER_PEI_LENGTH = 32

    # Context types
    CONTEXT_AARCH32_EL1 = 1
    CONTEXT_AARCH64_EL1 = 5
    CONTEXT_MISC_REG = 8

    def __init__(self, subparsers):
        """
        Initialize the error injection class and add the "arm" subparser.

        :param subparsers: object returned by
                           ``ArgumentParser.add_subparsers()``; the ``arm``
                           sub-command is registered on it.
        """

        # Valid choice values
        self.arm_valid_bits = {
            "mpidr": util.bit(0),
            "affinity": util.bit(1),
            "running": util.bit(2),
            "vendor": util.bit(3),
        }

        self.pei_flags = {
            "first": util.bit(0),
            "last": util.bit(1),
            "propagated": util.bit(2),
            "overflow": util.bit(3),
        }

        self.pei_error_types = {
            "cache": util.bit(1),
            "tlb": util.bit(2),
            "bus": util.bit(3),
            "micro-arch": util.bit(4),
        }

        self.pei_valid_bits = {
            "multiple-error": util.bit(0),
            "flags": util.bit(1),
            "error-info": util.bit(2),
            "virt-addr": util.bit(3),
            "phy-addr": util.bit(4),
        }

        # Last payload built by send_cper()
        self.data = bytearray()

        parser = subparsers.add_parser("arm", description=self.DESC)

        # Comma-separated lists of the accepted names, used on help messages
        arm_valid_bits = ",".join(self.arm_valid_bits)
        flags = ",".join(self.pei_flags)
        error_types = ",".join(self.pei_error_types)
        pei_valid_bits = ",".join(self.pei_valid_bits)

        # UEFI N.16 ARM Validation bits
        g_arm = parser.add_argument_group("ARM processor")
        g_arm.add_argument("--arm", "--arm-valid",
                           help=f"ARM valid bits: {arm_valid_bits}")
        g_arm.add_argument("-a", "--affinity", "--level", "--affinity-level",
                           type=lambda x: int(x, 0),
                           help="Affinity level (when multiple levels apply)")
        g_arm.add_argument("-l", "--mpidr", type=lambda x: int(x, 0),
                           help="Multiprocessor Affinity Register")
        g_arm.add_argument("-i", "--midr", type=lambda x: int(x, 0),
                           help="Main ID Register")
        g_arm.add_argument("-r", "--running",
                           action=argparse.BooleanOptionalAction,
                           default=None,
                           help="Indicates if the processor is running or not")
        g_arm.add_argument("--psci", "--psci-state",
                           type=lambda x: int(x, 0),
                           help="Power State Coordination Interface - PSCI state")

        # TODO: Add vendor-specific support

        # UEFI N.17 bitmaps (type and flags)
        g_pei = parser.add_argument_group("ARM Processor Error Info (PEI)")
        g_pei.add_argument("-t", "--type", nargs="+",
                           help=f"one or more error types: {error_types}")
        g_pei.add_argument("-f", "--flags", nargs="*",
                           help=f"zero or more error flags: {flags}")
        g_pei.add_argument("-V", "--pei-valid", "--error-valid", nargs="*",
                           help=f"zero or more PEI valid bits: {pei_valid_bits}")

        # UEFI N.17 Integer values
        g_pei.add_argument("-m", "--multiple-error", nargs="+",
                           help="Number of errors: 0: Single error, 1: Multiple errors, 2-65535: Error count if known")
        g_pei.add_argument("-e", "--error-info", nargs="+",
                           help="Error information (UEFI 2.10 tables N.18 to N.20)")
        g_pei.add_argument("-p", "--physical-address", nargs="+",
                           help="Physical address")
        g_pei.add_argument("-v", "--virtual-address", nargs="+",
                           help="Virtual address")

        # UEFI N.21 Context
        g_ctx = parser.add_argument_group("Processor Context")
        g_ctx.add_argument("--ctx-type", "--context-type", nargs="*",
                           help="Type of the context (0=ARM32 GPR, 5=ARM64 EL1, other values supported)")
        g_ctx.add_argument("--ctx-size", "--context-size", nargs="*",
                           help="Minimal size of the context")
        g_ctx.add_argument("--ctx-array", "--context-array", nargs="*",
                           help="Comma-separated arrays for each context")

        # Vendor-specific data
        g_vendor = parser.add_argument_group("Vendor-specific data")
        g_vendor.add_argument("--vendor", "--vendor-specific", nargs="+",
                              help="Vendor-specific byte arrays of data")

        # Add arguments for Generic Error Data
        qmp.argparse(parser)

        parser.set_defaults(func=self.send_cper)

    @staticmethod
    def _context_slots(entry):
        """
        Return the number of 8-byte slots emitted for one context entry.

        :param entry: dictionary of a single context; may carry a
                      "register" list and a "minimal-size" integer.
        :return: the bigger of "minimal-size" and the amount of registers,
                 plus one, modulo 0xFFFE.
        """

        reg_count = len(entry.get("register", []))

        # The minimal size is stored per context entry
        wanted = entry.get("minimal-size", 0)

        # NOTE(review): the "+ 1" and the modulo are kept from the previous
        # implementation; the intent may have been "& 0xFFFE" - confirm.
        return (max(wanted, reg_count) + 1) % 0xFFFE

    def _encode_pei(self, pei):
        """
        Encode all Processor Error Information entries.

        Entries lacking "error-info" get a default when their type is a
        single known error type; entries lacking "valid" get it derived
        from the fields that are present.

        :param pei: dictionary with one dictionary per PEI entry; every
                    entry must have a "type" (KeyError otherwise).
        :return: bytearray with the encoded entries, in iteration order.
        """

        # UEFI 2.10 doesn't define how to encode error information
        # when multiple types are raised. So, provide a default only
        # if a single type is there
        default_error_info = {
            self.pei_error_types["cache"]: 0x0091000F,
            self.pei_error_types["tlb"]: 0x0054007F,
            self.pei_error_types["bus"]: 0x80D6460FFF,
            self.pei_error_types["micro-arch"]: 0x78DA03FF,
        }

        default_flags = self.pei_flags["first"] | self.pei_flags["last"]

        pei_data = bytearray()

        for entry in pei.values():
            if "error-info" not in entry:
                info = default_error_info.get(entry["type"])
                if info is not None:
                    entry["error-info"] = info

            # An explicit valid bitmap is used as-is; otherwise flag every
            # field that has a value (pei_valid_bits shares the field names)
            if "valid" not in entry:
                entry["valid"] = 0
                for field, valid_bit in self.pei_valid_bits.items():
                    if field in entry:
                        entry["valid"] |= valid_bit

            # Version
            util.data_add(pei_data, 0, 1)
            util.data_add(pei_data, self.ACPI_GHES_ARM_CPER_PEI_LENGTH, 1)
            util.data_add(pei_data, entry["valid"], 2)
            util.data_add(pei_data, entry["type"], 1)
            util.data_add(pei_data, entry.get("multiple-error", 1), 2)
            util.data_add(pei_data, entry.get("flags", default_flags), 1)
            util.data_add(pei_data, entry.get("error-info", 0), 8)
            util.data_add(pei_data, entry.get("virt-addr", 0xDEADBEEF), 8)
            util.data_add(pei_data, entry.get("phy-addr", 0xABBA0BAD), 8)

        return pei_data

    def _encode_contexts(self, qmp_cmd, ctx):
        """
        Encode all processor context entries.

        When there is at least one context, the target architecture is
        queried via QMP ("query-target") to pick the default context type.

        :param qmp_cmd: QMP helper object used to send commands.
        :param ctx: dictionary with one dictionary per context entry.
        :return: bytearray with the encoded contexts, sorted by key.
        """

        ctx_data = bytearray()
        if not ctx:
            return ctx_data

        ret = qmp_cmd.send_cmd("query-target", may_open=True)

        # Fall back to the misc register context if the reply has no
        # usable "arch" field
        arch = ret.get("arch") if isinstance(ret, dict) else None
        if arch == "aarch64":
            default_ctx = self.CONTEXT_AARCH64_EL1
        elif arch == "arm":
            default_ctx = self.CONTEXT_AARCH32_EL1
        else:
            default_ctx = self.CONTEXT_MISC_REG

        for key in sorted(ctx):
            entry = ctx[key]
            registers = entry.get("register", [])
            slots = self._context_slots(entry)

            # Version
            util.data_add(ctx_data, 0, 2)
            util.data_add(ctx_data, entry.get("type", default_ctx), 2)

            # Size of the register array, in bytes
            util.data_add(ctx_data, 8 * slots, 4)

            for reg in registers:
                util.data_add(ctx_data, reg, 8)

            # Zero-fill the slots which have no register value
            for _ in range(len(registers), slots):
                util.data_add(ctx_data, 0, 8)

        return ctx_data

    @staticmethod
    def _encode_vendor(vendor):
        """
        Encode vendor-specific data.

        :param vendor: dictionary with one dictionary per vendor array,
                       each one with a "bytes" list.
        :return: bytearray with all bytes concatenated, sorted by key.
        """

        # Vendor-specific bytes are not grouped
        vendor_data = bytearray()
        for key in sorted(vendor):
            for byte in vendor[key]["bytes"]:
                util.data_add(vendor_data, byte, 1)

        return vendor_data

    def send_cper(self, args):
        """
        Parse subcommand arguments and send a CPER via QMP.

        :param args: argparse namespace filled by the "arm" sub-command
                     parser (plus the arguments added by qmp.argparse()).

        The payload is the fixed header followed by the PEI entries, the
        context entries and the vendor-specific bytes. It is also stored
        at ``self.data``.
        """

        qmp_cmd = qmp(args.host, args.port, args.debug)

        # Handle Generic Error Data arguments if any
        qmp_cmd.set_args(args)

        is_cpu_type = re.compile(r"^([\w+]+\-)?arm\-cpu$")
        cpus = qmp_cmd.search_qom("/machine/unattached/device",
                                  "type", is_cpu_type)

        cper = {}
        pei = {}
        ctx = {}
        vendor = {}

        arg = vars(args)

        # Handle global parameters
        cper["running-state"] = util.bit(0) if arg.get("running") else 0

        if args.arm:
            # Explicit valid bits take precedence over the derived ones
            cper["valid"] = util.get_choice(name="valid",
                                            value=args.arm,
                                            choices=self.arm_valid_bits,
                                            suffixes=["-error", "-err"])
        else:
            cper["valid"] = 0

            if args.affinity:
                cper["valid"] |= self.arm_valid_bits["affinity"]

            if args.mpidr:
                cper["valid"] |= self.arm_valid_bits["mpidr"]

            # A running state is always encoded, so it is always flagged
            cper["valid"] |= self.arm_valid_bits["running"]

            # A PSCI state is flagged through the running-state valid bit
            if args.psci:
                cper["valid"] |= self.arm_valid_bits["running"]

        # Handle PEI
        if not args.type:
            args.type = ["cache-error"]

        util.get_mult_choices(
            pei,
            name="valid",
            values=args.pei_valid,
            choices=self.pei_valid_bits,
            suffixes=["-valid", "--addr"],
        )
        util.get_mult_choices(
            pei,
            name="type",
            values=args.type,
            choices=self.pei_error_types,
            suffixes=["-error", "-err"],
        )
        util.get_mult_choices(
            pei,
            name="flags",
            values=args.flags,
            choices=self.pei_flags,
            suffixes=["-error", "-cap"],
        )
        util.get_mult_int(pei, "error-info", args.error_info)
        util.get_mult_int(pei, "multiple-error", args.multiple_error)
        util.get_mult_int(pei, "phy-addr", args.physical_address)
        util.get_mult_int(pei, "virt-addr", args.virtual_address)

        # Handle context
        util.get_mult_int(ctx, "type", args.ctx_type, allow_zero=True)
        util.get_mult_int(ctx, "minimal-size", args.ctx_size, allow_zero=True)
        util.get_mult_array(ctx, "register", args.ctx_array, allow_zero=True)

        util.get_mult_array(vendor, "bytes", args.vendor, max_val=255)

        # Encode the variable-size parts
        pei_data = self._encode_pei(pei)
        ctx_data = self._encode_contexts(qmp_cmd, ctx)
        vendor_data = self._encode_vendor(vendor)

        # Calculate the length of the CPER data
        cper_length = self.ACPI_GHES_ARM_CPER_LENGTH
        cper_length += len(pei_data)
        cper_length += len(vendor_data)
        cper_length += len(ctx_data)

        # argparse stores the values as "affinity", "mpidr", "midr" and
        # "psci" (from the first long option of each argument), with None
        # meaning that the option was not used.
        midr = args.midr
        if midr is None and cpus:
            # Not set at command line: ask the first CPU found
            cmd_arg = {
                'path': cpus[0],
                'property': "midr"
            }
            ret = qmp_cmd.send_cmd("qom-get", cmd_arg, may_open=True)
            if isinstance(ret, int):
                midr = ret

        # Encode ARM Processor Error
        data = bytearray()

        util.data_add(data, cper["valid"], 4)

        # Number of PEI and of context entries
        util.data_add(data, len(pei), 2)
        util.data_add(data, len(ctx), 2)

        util.data_add(data, cper_length, 4)
        util.data_add(data, args.affinity or 0, 1)

        # Reserved
        util.data_add(data, 0, 3)

        util.data_add(data, args.mpidr or 0, 8)
        util.data_add(data, midr or 0, 8)
        util.data_add(data, cper["running-state"], 4)
        util.data_add(data, args.psci or 0, 4)

        # Add PEI
        data.extend(pei_data)
        data.extend(ctx_data)
        data.extend(vendor_data)

        self.data = data

        qmp_cmd.send_cper(cper_guid.CPER_PROC_ARM, self.data)