cxl/mem: Read, trace, and clear events on driver load
CXL devices have multiple event logs which can be queried for CXL event
records. Devices are required to support the storage of at least one
event record in each event log type.
Devices track event log overflow by incrementing a counter and tracking
the time of the first and last overflow event seen.
Software queries events via the Get Event Record mailbox command; CXL
rev 3.0 section 8.2.9.2.2 and clears events via CXL rev 3.0 section
8.2.9.2.3 Clear Event Records mailbox command.
If the result of negotiating CXL Error Reporting Control is OS control,
read and clear all event logs on driver load.
Ensure a clean slate of events by reading and clearing the events on
driver load.
The status register is not used because a device may continue to trigger
events and the only requirement is to empty the log at least once. This
allows for the required transition from empty to non-empty for interrupt
generation. Handling of interrupts is in a follow on patch.
The device can return up to 1MB worth of event records per query.
Allocate a shared large buffer to handle the max number of records based
on the mailbox payload size.
This patch traces a raw event record and leaves specific event record
type tracing to subsequent patches. Macros are created to aid in
tracing the common CXL Event header fields.
Each record is cleared explicitly. A clear all bit is specified but is
only valid when the log overflows.
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20221216-cxl-ev-log-v7-1-2316a5c8f7d8@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index 20ca2fe..1805936 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -6,8 +6,11 @@
#if !defined(_CXL_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _CXL_EVENTS_H
-#include <cxl.h>
#include <linux/tracepoint.h>
+#include <asm-generic/unaligned.h>
+
+#include <cxl.h>
+#include <cxlmem.h>
#define CXL_RAS_UC_CACHE_DATA_PARITY BIT(0)
#define CXL_RAS_UC_CACHE_ADDR_PARITY BIT(1)
@@ -103,6 +106,122 @@ TRACE_EVENT(cxl_aer_correctable_error,
)
);
+#define cxl_event_log_type_str(type) \
+ __print_symbolic(type, \
+ { CXL_EVENT_TYPE_INFO, "Informational" }, \
+ { CXL_EVENT_TYPE_WARN, "Warning" }, \
+ { CXL_EVENT_TYPE_FAIL, "Failure" }, \
+ { CXL_EVENT_TYPE_FATAL, "Fatal" })
+
+TRACE_EVENT(cxl_overflow,
+
+ TP_PROTO(const struct device *dev, enum cxl_event_log_type log,
+ struct cxl_get_event_payload *payload),
+
+ TP_ARGS(dev, log, payload),
+
+ TP_STRUCT__entry(
+ __string(dev_name, dev_name(dev))
+ __field(int, log)
+ __field(u64, first_ts)
+ __field(u64, last_ts)
+ __field(u16, count)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev_name, dev_name(dev));
+ __entry->log = log;
+ __entry->count = le16_to_cpu(payload->overflow_err_count);
+ __entry->first_ts = le64_to_cpu(payload->first_overflow_timestamp);
+ __entry->last_ts = le64_to_cpu(payload->last_overflow_timestamp);
+ ),
+
+ TP_printk("%s: log=%s : %u records from %llu to %llu",
+ __get_str(dev_name), cxl_event_log_type_str(__entry->log),
+ __entry->count, __entry->first_ts, __entry->last_ts)
+
+);
+
+/*
+ * Common Event Record Format
+ * CXL 3.0 section 8.2.9.2.1; Table 8-42
+ */
+#define CXL_EVENT_RECORD_FLAG_PERMANENT BIT(2)
+#define CXL_EVENT_RECORD_FLAG_MAINT_NEEDED BIT(3)
+#define CXL_EVENT_RECORD_FLAG_PERF_DEGRADED BIT(4)
+#define CXL_EVENT_RECORD_FLAG_HW_REPLACE BIT(5)
+#define show_hdr_flags(flags) __print_flags(flags, " | ", \
+ { CXL_EVENT_RECORD_FLAG_PERMANENT, "PERMANENT_CONDITION" }, \
+ { CXL_EVENT_RECORD_FLAG_MAINT_NEEDED, "MAINTENANCE_NEEDED" }, \
+ { CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, "PERFORMANCE_DEGRADED" }, \
+ { CXL_EVENT_RECORD_FLAG_HW_REPLACE, "HARDWARE_REPLACEMENT_NEEDED" } \
+)
+
+/*
+ * Define macros for the common header of each CXL event.
+ *
+ * Tracepoints using these macros must do 3 things:
+ *
+ * 1) Add CXL_EVT_TP_entry to TP_STRUCT__entry
+ * 2) Use CXL_EVT_TP_fast_assign within TP_fast_assign;
+ * pass the dev, log, and CXL event header
+ * 3) Use CXL_EVT_TP_printk() instead of TP_printk()
+ *
+ * See the generic_event tracepoint as an example.
+ */
+#define CXL_EVT_TP_entry \
+ __string(dev_name, dev_name(dev)) \
+ __field(int, log) \
+ __field_struct(uuid_t, hdr_uuid) \
+ __field(u32, hdr_flags) \
+ __field(u16, hdr_handle) \
+ __field(u16, hdr_related_handle) \
+ __field(u64, hdr_timestamp) \
+ __field(u8, hdr_length) \
+ __field(u8, hdr_maint_op_class)
+
+#define CXL_EVT_TP_fast_assign(dev, l, hdr) \
+ __assign_str(dev_name, dev_name(dev)); \
+ __entry->log = (l); \
+ memcpy(&__entry->hdr_uuid, &(hdr).id, sizeof(uuid_t)); \
+ __entry->hdr_length = (hdr).length; \
+ __entry->hdr_flags = get_unaligned_le24((hdr).flags); \
+ __entry->hdr_handle = le16_to_cpu((hdr).handle); \
+ __entry->hdr_related_handle = le16_to_cpu((hdr).related_handle); \
+ __entry->hdr_timestamp = le64_to_cpu((hdr).timestamp); \
+ __entry->hdr_maint_op_class = (hdr).maint_op_class
+
+#define CXL_EVT_TP_printk(fmt, ...) \
+ TP_printk("%s log=%s : time=%llu uuid=%pUb len=%d flags='%s' " \
+ "handle=%x related_handle=%x maint_op_class=%u" \
+ " : " fmt, \
+ __get_str(dev_name), cxl_event_log_type_str(__entry->log), \
+ __entry->hdr_timestamp, &__entry->hdr_uuid, __entry->hdr_length,\
+ show_hdr_flags(__entry->hdr_flags), __entry->hdr_handle, \
+ __entry->hdr_related_handle, __entry->hdr_maint_op_class, \
+ ##__VA_ARGS__)
+
+TRACE_EVENT(cxl_generic_event,
+
+ TP_PROTO(const struct device *dev, enum cxl_event_log_type log,
+ struct cxl_event_record_raw *rec),
+
+ TP_ARGS(dev, log, rec),
+
+ TP_STRUCT__entry(
+ CXL_EVT_TP_entry
+ __array(u8, data, CXL_EVENT_RECORD_DATA_LENGTH)
+ ),
+
+ TP_fast_assign(
+ CXL_EVT_TP_fast_assign(dev, log, rec->hdr);
+ memcpy(__entry->data, &rec->data, CXL_EVENT_RECORD_DATA_LENGTH);
+ ),
+
+ CXL_EVT_TP_printk("%s",
+ __print_hex(__entry->data, CXL_EVENT_RECORD_DATA_LENGTH))
+);
+
#endif /* _CXL_EVENTS_H */
#define TRACE_INCLUDE_FILE trace