Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions shared/source/command_stream/command_stream_receiver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -572,6 +572,12 @@ WaitStatus CommandStreamReceiver::baseWaitFunction(volatile TagAddressType *poll
partitionAddress = ptrOffset(partitionAddress, this->immWritePostSyncWriteOffset);
}

// Final hang check before returning ready - catches GPU faults that occurred
// during fast operations where the periodic hang check was never triggered
if (isGpuHangDetected()) {
return WaitStatus::gpuHang;
}

return WaitStatus::ready;
}

Expand Down
75 changes: 67 additions & 8 deletions shared/source/os_interface/linux/drm_neo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -271,18 +271,28 @@ bool Drm::isGpuHangDetected(OsContext &osContext) {
}

bool Drm::checkResetStatus(OsContext &osContext) {
const auto osContextLinux = static_cast<OsContextLinux *>(&osContext);
const auto &drmContextIds = osContextLinux->getDrmContextIds();
const auto &drmContextIds = osContext.getDrmContextIds();

for (const auto drmContextId : drmContextIds) {
ResetStats resetStats{};
resetStats.contextId = drmContextId;
ResetStatsFault fault{};
uint32_t status = 0;
bool getResetStatsSucceeded = false;
const auto retVal{ioctlHelper->getResetStats(resetStats, &status, &fault)};
UNRECOVERABLE_IF(retVal != 0);
if (retVal != 0) {
// getResetStats may fail if exec queue is destroyed or not supported
// Still check VM faults below
PRINT_STRING(debugManager.flags.PrintDebugMessages.get(), stderr,
"getResetStats failed with error %d for contextId %u, checking VM faults\n",
retVal, drmContextId);
} else {
getResetStatsSucceeded = true;
}
auto debuggingEnabled = rootDeviceEnvironment.executionEnvironment.isDebuggingEnabled();
if (checkToDisableScratchPage() && ioctlHelper->validPageFault(fault.flags)) {

// Check for page fault from prelim extension API (only if getResetStats succeeded)
if (getResetStatsSucceeded && checkToDisableScratchPage() && ioctlHelper->validPageFault(fault.flags)) {
bool banned = ((status & ioctlHelper->getStatusForResetStats(true)) != 0);
if (!banned && debuggingEnabled) {
return false;
Expand All @@ -305,9 +315,57 @@ bool Drm::checkResetStatus(OsContext &osContext) {
banned);
UNRECOVERABLE_IF(true);
}
if (resetStats.batchActive > 0 || resetStats.batchPending > 0) {

// Check for page fault from upstream VM get property API
// Query VM faults independently - they may exist even before exec queue is banned
bool banned = getResetStatsSucceeded ? ((status & ioctlHelper->getStatusForResetStats(true)) != 0) : false;
if (checkToDisableScratchPage()) {
// Use per-context VM IDs if available, otherwise fall back to shared VM IDs
const auto &contextVmIds = osContext.getDrmVmIds();
std::vector<uint32_t> vmIdsToCheck;
if (!contextVmIds.empty()) {
vmIdsToCheck = contextVmIds;
} else {
// For Xe driver without per-context VMs, use the shared virtualMemoryIds
for (size_t i = 0; i < virtualMemoryIds.size(); i++) {
if (virtualMemoryIds[i] != 0) {
vmIdsToCheck.push_back(virtualMemoryIds[i]);
}
}
}

for (const auto vmId : vmIdsToCheck) {
std::vector<ResetStatsFault> vmFaults;
if (ioctlHelper->getVmFaults(vmId, vmFaults) == 0 && !vmFaults.empty()) {
if (!banned && debuggingEnabled) {
return false;
}
for (const auto &vmFault : vmFaults) {
IoFunctions::fprintf(stderr, "Segmentation fault from GPU at 0x%llx, vm_id: %u (%s) type: %d (%s), level: %d (%s), access: %d (%s), banned: %d, aborting.\n",
vmFault.addr,
vmId,
EngineHelpers::engineTypeToString(osContext.getEngineType()).c_str(),
vmFault.type, GpuPageFaultHelpers::faultTypeToString(static_cast<FaultType>(vmFault.type)).c_str(),
vmFault.level, GpuPageFaultHelpers::faultLevelToString(static_cast<FaultLevel>(vmFault.level)).c_str(),
vmFault.access, GpuPageFaultHelpers::faultAccessToString(static_cast<FaultAccess>(vmFault.access)).c_str(),
banned);
IoFunctions::fprintf(stdout, "Segmentation fault from GPU at 0x%llx, vm_id: %u (%s) type: %d (%s), level: %d (%s), access: %d (%s), banned: %d, aborting.\n",
vmFault.addr,
vmId,
EngineHelpers::engineTypeToString(osContext.getEngineType()).c_str(),
vmFault.type, GpuPageFaultHelpers::faultTypeToString(static_cast<FaultType>(vmFault.type)).c_str(),
vmFault.level, GpuPageFaultHelpers::faultLevelToString(static_cast<FaultLevel>(vmFault.level)).c_str(),
vmFault.access, GpuPageFaultHelpers::faultAccessToString(static_cast<FaultAccess>(vmFault.access)).c_str(),
banned);
}
UNRECOVERABLE_IF(true);
}
}
}

if (getResetStatsSucceeded && (resetStats.batchActive > 0 || resetStats.batchPending > 0)) {
PRINT_STRING(debugManager.flags.PrintDebugMessages.get(), stderr, "%s", "ERROR: GPU HANG detected!\n");
osContextLinux->setHangDetected();
osContext.setHangDetected();
return true;
}
}
Expand Down Expand Up @@ -1756,9 +1814,10 @@ int Drm::createDrmVirtualMemory(uint32_t &drmVmId) {
}

bool useVmBind = isVmBindAvailable();
bool enablePageFault = hasPageFaultSupport() && useVmBind;
bool disableScratch = checkToDisableScratchPage();
bool enablePageFault = (hasPageFaultSupport() && useVmBind) || disableScratch;

ctl.flags = ioctlHelper->getFlagsForVmCreate(checkToDisableScratchPage(), enablePageFault, useVmBind);
ctl.flags = ioctlHelper->getFlagsForVmCreate(disableScratch, enablePageFault, useVmBind);

auto ret = ioctlHelper->ioctl(DrmIoctl::gemVmCreate, &ctl);

Expand Down
3 changes: 2 additions & 1 deletion shared/source/os_interface/linux/drm_wrappers.h
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,8 @@ enum class DrmIoctl {
perfDisable,
perfQuery,
primaryContextExport,
primaryContextImport
primaryContextImport,
vmGetProperty
};

enum class DrmParam {
Expand Down
1 change: 1 addition & 0 deletions shared/source/os_interface/linux/ioctl_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ class IoctlHelper {
virtual int vmBind(const VmBindParams &vmBindParams) = 0;
virtual int vmUnbind(const VmBindParams &vmBindParams) = 0;
virtual int getResetStats(ResetStats &resetStats, uint32_t *status, ResetStatsFault *resetStatsFault) = 0;
// Default implementation for helpers without a VM fault query: ignores both
// arguments, leaves `faults` untouched and returns -1.
// IoctlHelperXe declares an override of this method.
virtual int getVmFaults(uint32_t vmId, std::vector<ResetStatsFault> &faults) { return -1; }
virtual bool isEuStallSupported() = 0;
virtual uint32_t getEuStallFdParameter() = 0;
virtual bool perfOpenEuStallStream(uint32_t euStallFdParameter, uint32_t &samplingPeriodNs, uint64_t engineInstance, uint64_t notifyNReports, uint64_t gpuTimeStampfrequency, int32_t *stream) = 0;
Expand Down
8 changes: 4 additions & 4 deletions shared/source/os_interface/linux/os_context_linux.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,18 @@ class OsContextLinux : public OsContext {

// Returns the engine flag currently stored in this context.
unsigned int getEngineFlag() const { return engineFlag; }
// Overwrites the stored engine flag with the given value.
void setEngineFlag(unsigned int engineFlag) { this->engineFlag = engineFlag; }
const std::vector<uint32_t> &getDrmContextIds() const { return drmContextIds; }
const std::vector<uint32_t> &getDrmVmIds() const { return drmVmIds; }
const std::vector<uint32_t> &getDrmContextIds() const override { return drmContextIds; }
const std::vector<uint32_t> &getDrmVmIds() const override { return drmVmIds; }
bool isDirectSubmissionSupported() const override;
Drm &getDrm() const;
virtual std::pair<uint64_t, uint64_t> getFenceAddressAndValToWait(uint32_t vmHandleId, bool isLocked);
virtual void waitForPagingFence();
static OsContext *create(OSInterface *osInterface, uint32_t rootDeviceIndex, uint32_t contextId, const EngineDescriptor &engineDescriptor);
void reInitializeContext() override;
void setHangDetected() {
void setHangDetected() override {
contextHangDetected = true;
}
bool isHangDetected() const {
bool isHangDetected() const override {
return contextHangDetected;
}

Expand Down
98 changes: 97 additions & 1 deletion shared/source/os_interface/linux/xe/ioctl_helper_xe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1217,7 +1217,95 @@ int IoctlHelperXe::vmUnbind(const VmBindParams &vmBindParams) {
}

// Queries the ban state of an exec queue and translates it into the generic
// ResetStats / ResetStatsFault representation.
//
// resetStats      - in: contextId selects the exec queue to query;
//                   out: batchActive is 1 when the queue is banned, otherwise 0;
//                   batchPending and resetCount are always reset to 0.
// status          - optional out: raw property value returned by the kernel.
// resetStatsFault - optional out: fault details copied from the chained
//                   prelim fault extension.
//
// Returns 0 on success, otherwise the non-zero ioctl result; on failure none
// of the output arguments are modified.
int IoctlHelperXe::getResetStats(ResetStats &resetStats, uint32_t *status, ResetStatsFault *resetStatsFault) {
    // Extension chained to the query so fault details can be read back after the call.
    prelim_drm_xe_exec_queue_ban_fault_ext faultExt{};
    faultExt.base.name = 0;
    faultExt.base.next_extension = 0;

    drm_xe_exec_queue_get_property getProperty{};
    getProperty.exec_queue_id = resetStats.contextId;
    getProperty.property = DRM_XE_EXEC_QUEUE_GET_PROPERTY_BAN;
    getProperty.value = 0;
    getProperty.extensions = reinterpret_cast<__u64>(&faultExt);

    // The ioctl expects the get-property structure, not the generic ResetStats.
    const auto retVal = ioctl(DrmIoctl::getResetStats, &getProperty);
    if (retVal != 0) {
        return retVal;
    }

    // A banned queue is reported to the caller as one active batch.
    const auto banned = (getProperty.value & PRELIM_DRM_XE_EXEC_QUEUE_BAN_STATUS_BANNED) != 0;
    resetStats.batchActive = banned ? 1 : 0;
    resetStats.batchPending = 0;
    resetStats.resetCount = 0;

    if (status) {
        *status = static_cast<uint32_t>(getProperty.value);
    }
    if (resetStatsFault) {
        resetStatsFault->addr = faultExt.addr;
        resetStatsFault->type = faultExt.type;
        resetStatsFault->level = faultExt.level;
        resetStatsFault->access = faultExt.access;
        resetStatsFault->flags = faultExt.flags;
    }

    return retVal;
}

// Retrieves the faults recorded for a VM through the vmGetProperty ioctl and
// converts them to ResetStatsFault entries.
//
// vmId   - id of the VM to query.
// faults - out: replaced with the converted fault records; left empty when
//          there are none or when the query fails.
//
// Returns 0 on success (including "no faults"), otherwise the non-zero ioctl
// result.
int IoctlHelperXe::getVmFaults(uint32_t vmId, std::vector<ResetStatsFault> &faults) {
    // Never hand stale entries back to the caller, whichever path is taken below.
    faults.clear();

    drm_xe_vm_get_property getProperty{};
    getProperty.vm_id = vmId;
    getProperty.property = DRM_XE_VM_GET_PROPERTY_FAULTS;
    getProperty.size = 0;
    getProperty.data = 0;

    // First call to get the size
    auto retVal = ioctl(DrmIoctl::vmGetProperty, &getProperty);
    XELOG(" -> IoctlHelperXe::getVmFaults vmId=%u retVal=%d size=%u\n", vmId, retVal, getProperty.size);
    if (retVal != 0) {
        return retVal;
    }

    // Only whole records are usable; a size below one record means there is
    // nothing to fetch, so do not issue a second call with an empty buffer.
    const auto numFaults = getProperty.size / sizeof(xe_vm_fault);
    if (numFaults == 0) {
        return 0;
    }

    // Allocate buffer and get the faults. The size passed to the kernel is
    // the exact byte size of the buffer so it can never be overrun, even if
    // the reported size was not a multiple of the record size.
    std::vector<xe_vm_fault> faultBuffer(numFaults);
    getProperty.size = static_cast<uint32_t>(numFaults * sizeof(xe_vm_fault));
    getProperty.data = reinterpret_cast<uint64_t>(faultBuffer.data());

    retVal = ioctl(DrmIoctl::vmGetProperty, &getProperty);
    if (retVal != 0) {
        return retVal;
    }

    // Convert to ResetStatsFault format
    faults.reserve(numFaults);
    for (const auto &fault : faultBuffer) {
        ResetStatsFault resetFault{};
        resetFault.addr = fault.address;
        resetFault.type = fault.fault_type;
        resetFault.level = fault.fault_level;
        resetFault.access = fault.access_type;
        resetFault.flags = 1; // Mark as valid
        faults.push_back(resetFault);
    }

    return 0;
}

// Reports whether the fault flags carry the "fault valid" bit.
bool IoctlHelperXe::validPageFault(uint16_t flags) {
    const auto validBit = flags & PRELIM_DRM_XE_EXEC_QUEUE_BAN_FAULT_VALID;
    return validBit != 0;
}

// Builds the status mask to compare against the value reported by
// getResetStats: the "banned" bit when requested, otherwise an empty mask.
uint32_t IoctlHelperXe::getStatusForResetStats(bool banned) {
    if (!banned) {
        return 0u;
    }
    return static_cast<uint32_t>(PRELIM_DRM_XE_EXEC_QUEUE_BAN_STATUS_BANNED);
}

UuidRegisterResult IoctlHelperXe::registerUuid(const std::string &uuid, uint32_t uuidClass, uint64_t ptr, uint64_t size) {
Expand Down Expand Up @@ -1503,6 +1591,10 @@ int IoctlHelperXe::ioctl(DrmIoctl request, void *arg) {
case DrmIoctl::perfOpen: {
ret = perfOpenIoctl(request, arg);
} break;
case DrmIoctl::vmGetProperty: {
ret = IoctlHelper::ioctl(request, arg);
XELOG(" -> IoctlHelperXe::ioctl VmGetProperty r=%d\n", ret);
} break;

default:
XELOG("Not handled 0x%x\n", request);
Expand Down Expand Up @@ -2022,6 +2114,8 @@ unsigned int IoctlHelperXe::getIoctlRequestValue(DrmIoctl ioctlRequest) const {
RETURN_ME(DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL);
case DrmIoctl::getResetStats:
RETURN_ME(DRM_IOCTL_XE_EXEC_QUEUE_GET_PROPERTY);
case DrmIoctl::vmGetProperty:
RETURN_ME(DRM_IOCTL_XE_VM_GET_PROPERTY);
case DrmIoctl::debuggerOpen:
case DrmIoctl::metadataCreate:
case DrmIoctl::metadataDestroy:
Expand Down Expand Up @@ -2089,6 +2183,8 @@ std::string IoctlHelperXe::getIoctlString(DrmIoctl ioctlRequest) const {
STRINGIFY_ME(DRM_IOCTL_XE_DEBUG_METADATA_DESTROY);
case DrmIoctl::getResetStats:
STRINGIFY_ME(DRM_IOCTL_XE_EXEC_QUEUE_GET_PROPERTY);
case DrmIoctl::vmGetProperty:
STRINGIFY_ME(DRM_IOCTL_XE_VM_GET_PROPERTY);
default:
return "???";
}
Expand Down
3 changes: 3 additions & 0 deletions shared/source/os_interface/linux/xe/ioctl_helper_xe.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ class IoctlHelperXe : public IoctlHelper {
int vmBind(const VmBindParams &vmBindParams) override;
int vmUnbind(const VmBindParams &vmBindParams) override;
int getResetStats(ResetStats &resetStats, uint32_t *status, ResetStatsFault *resetStatsFault) override;
int getVmFaults(uint32_t vmId, std::vector<ResetStatsFault> &faults) override;
bool validPageFault(uint16_t flags) override;
uint32_t getStatusForResetStats(bool banned) override;
bool isEuStallSupported() override;
uint32_t getEuStallFdParameter() override;
bool perfOpenEuStallStream(uint32_t euStallFdParameter, uint32_t &samplingPeriodNs, uint64_t engineInstance, uint64_t notifyNReports, uint64_t gpuTimeStampfrequency, int32_t *stream) override;
Expand Down
57 changes: 55 additions & 2 deletions shared/source/os_interface/linux/xe/xedrm.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2024 Intel Corporation
* Copyright (C) 2024-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
Expand All @@ -13,7 +13,60 @@
namespace NEO {
namespace XeDrm {
#include "xe_drm.h"
}

// Extension structure chained (through `extensions`) to the exec-queue
// get-property query issued by IoctlHelperXe::getResetStats, which copies the
// fields below into ResetStatsFault after the ioctl returns.
// NOTE(review): field layout presumably mirrors the prelim kernel uAPI - confirm
// against the kernel header before changing any member.
struct prelim_drm_xe_exec_queue_ban_fault_ext { // NOLINT(readability-identifier-naming)
// Common extension header (name / next_extension).
struct drm_xe_user_extension base;
// Faulting address.
__u64 addr;
// Fault type, level and access kind; printed via GpuPageFaultHelpers in Drm::checkResetStatus.
__u16 type;
__u16 level;
__u16 access;
// Bit field; PRELIM_DRM_XE_EXEC_QUEUE_BAN_FAULT_VALID marks the record as valid.
__u16 flags;
};

#define PRELIM_DRM_XE_EXEC_QUEUE_BAN_FAULT_VALID (1 << 0)
#define PRELIM_DRM_XE_EXEC_QUEUE_BAN_STATUS_BANNED (1 << 0)

// VM Get Property ioctl - upstream API for querying VM faults
#define DRM_XE_VM_GET_PROPERTY 0x0f

// One fault record as filled in by the DRM_XE_VM_GET_PROPERTY_FAULTS query.
// IoctlHelperXe::getVmFaults reads address, access_type, fault_type and
// fault_level from each record; the remaining members are not read there.
// NOTE(review): layout must match the kernel's definition - confirm before editing.
struct xe_vm_fault { // NOLINT(readability-identifier-naming)
// Faulting address.
__u64 address;
// NOTE(review): presumably the granularity of `address` - confirm against kernel docs.
__u32 address_precision; // NOLINT(readability-identifier-naming)
// Values for access_type.
#define FAULT_ACCESS_TYPE_READ 0
#define FAULT_ACCESS_TYPE_WRITE 1
#define FAULT_ACCESS_TYPE_ATOMIC 2
__u8 access_type; // NOLINT(readability-identifier-naming)
// Values for fault_type.
#define FAULT_TYPE_NOT_PRESENT 0
#define FAULT_TYPE_WRITE_ACCESS 1
#define FAULT_TYPE_ATOMIC_ACCESS 2
__u8 fault_type; // NOLINT(readability-identifier-naming)
// Values for fault_level.
#define FAULT_LEVEL_PTE 0
#define FAULT_LEVEL_PDE 1
#define FAULT_LEVEL_PDP 2
#define FAULT_LEVEL_PML4 3
#define FAULT_LEVEL_PML5 4
__u8 fault_level; // NOLINT(readability-identifier-naming)
// Padding / reserved space.
__u8 pad;
__u64 reserved[4];
};

// Argument block for DRM_IOCTL_XE_VM_GET_PROPERTY.
// IoctlHelperXe::getVmFaults uses it in two steps: first with size == 0 and
// data == 0 to read back the required size, then with `data` pointing at a
// buffer of xe_vm_fault records to receive the faults.
// NOTE(review): layout must match the kernel's definition - confirm before editing.
struct drm_xe_vm_get_property { // NOLINT(readability-identifier-naming)
// Extension chain pointer; left zero by getVmFaults.
__u64 extensions;
// VM to query.
__u32 vm_id; // NOLINT(readability-identifier-naming)
// Property selector values.
#define DRM_XE_VM_GET_PROPERTY_FAULTS 0
__u32 property;
// Byte size of the data buffer (in/out, see the two-step use above).
__u32 size;
__u32 pad;
// Same storage viewed either as a user buffer address (data) or as a scalar (value).
union {
__u64 data;
__u64 value;
};
__u64 reserved[3];
};

#define DRM_IOCTL_XE_VM_GET_PROPERTY DRM_IOWR(DRM_COMMAND_BASE + DRM_XE_VM_GET_PROPERTY, struct drm_xe_vm_get_property)

} // namespace XeDrm
} // namespace NEO
using namespace NEO::XeDrm;

Expand Down
3 changes: 3 additions & 0 deletions shared/source/os_interface/os_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
#include "shared/source/os_interface/product_helper.h"

namespace NEO {

const std::vector<uint32_t> OsContext::emptyIdVector{};

OsContext::OsContext(uint32_t rootDeviceIndex, uint32_t contextId, const EngineDescriptor &engineDescriptor)
: rootDeviceIndex(rootDeviceIndex),
contextId(contextId),
Expand Down
Loading