diff --git a/shared/source/command_stream/command_stream_receiver.cpp b/shared/source/command_stream/command_stream_receiver.cpp index fc1e199ada0a5..6f8666f51af25 100644 --- a/shared/source/command_stream/command_stream_receiver.cpp +++ b/shared/source/command_stream/command_stream_receiver.cpp @@ -572,6 +572,12 @@ WaitStatus CommandStreamReceiver::baseWaitFunction(volatile TagAddressType *poll partitionAddress = ptrOffset(partitionAddress, this->immWritePostSyncWriteOffset); } + // Final hang check before returning ready - catches GPU faults that occurred + // during fast operations where the periodic hang check was never triggered + if (isGpuHangDetected()) { + return WaitStatus::gpuHang; + } + return WaitStatus::ready; } diff --git a/shared/source/os_interface/linux/drm_neo.cpp b/shared/source/os_interface/linux/drm_neo.cpp index a796c955b5a3d..e592e8f72ab28 100644 --- a/shared/source/os_interface/linux/drm_neo.cpp +++ b/shared/source/os_interface/linux/drm_neo.cpp @@ -271,18 +271,28 @@ bool Drm::isGpuHangDetected(OsContext &osContext) { } bool Drm::checkResetStatus(OsContext &osContext) { - const auto osContextLinux = static_cast(&osContext); - const auto &drmContextIds = osContextLinux->getDrmContextIds(); + const auto &drmContextIds = osContext.getDrmContextIds(); for (const auto drmContextId : drmContextIds) { ResetStats resetStats{}; resetStats.contextId = drmContextId; ResetStatsFault fault{}; uint32_t status = 0; + bool getResetStatsSucceeded = false; const auto retVal{ioctlHelper->getResetStats(resetStats, &status, &fault)}; - UNRECOVERABLE_IF(retVal != 0); + if (retVal != 0) { + // getResetStats may fail if exec queue is destroyed or not supported + // Still check VM faults below + PRINT_STRING(debugManager.flags.PrintDebugMessages.get(), stderr, + "getResetStats failed with error %d for contextId %u, checking VM faults\n", + retVal, drmContextId); + } else { + getResetStatsSucceeded = true; + } auto debuggingEnabled = 
rootDeviceEnvironment.executionEnvironment.isDebuggingEnabled(); - if (checkToDisableScratchPage() && ioctlHelper->validPageFault(fault.flags)) { + + // Check for page fault from prelim extension API (only if getResetStats succeeded) + if (getResetStatsSucceeded && checkToDisableScratchPage() && ioctlHelper->validPageFault(fault.flags)) { bool banned = ((status & ioctlHelper->getStatusForResetStats(true)) != 0); if (!banned && debuggingEnabled) { return false; @@ -305,9 +315,57 @@ bool Drm::checkResetStatus(OsContext &osContext) { banned); UNRECOVERABLE_IF(true); } - if (resetStats.batchActive > 0 || resetStats.batchPending > 0) { + + // Check for page fault from upstream VM get property API + // Query VM faults independently - they may exist even before exec queue is banned + bool banned = getResetStatsSucceeded ? ((status & ioctlHelper->getStatusForResetStats(true)) != 0) : false; + if (checkToDisableScratchPage()) { + // Use per-context VM IDs if available, otherwise fall back to shared VM IDs + const auto &contextVmIds = osContext.getDrmVmIds(); + std::vector vmIdsToCheck; + if (!contextVmIds.empty()) { + vmIdsToCheck = contextVmIds; + } else { + // For Xe driver without per-context VMs, use the shared virtualMemoryIds + for (size_t i = 0; i < virtualMemoryIds.size(); i++) { + if (virtualMemoryIds[i] != 0) { + vmIdsToCheck.push_back(virtualMemoryIds[i]); + } + } + } + + for (const auto vmId : vmIdsToCheck) { + std::vector vmFaults; + if (ioctlHelper->getVmFaults(vmId, vmFaults) == 0 && !vmFaults.empty()) { + if (!banned && debuggingEnabled) { + return false; + } + for (const auto &vmFault : vmFaults) { + IoFunctions::fprintf(stderr, "Segmentation fault from GPU at 0x%llx, vm_id: %u (%s) type: %d (%s), level: %d (%s), access: %d (%s), banned: %d, aborting.\n", + vmFault.addr, + vmId, + EngineHelpers::engineTypeToString(osContext.getEngineType()).c_str(), + vmFault.type, GpuPageFaultHelpers::faultTypeToString(static_cast(vmFault.type)).c_str(), + vmFault.level, 
GpuPageFaultHelpers::faultLevelToString(static_cast(vmFault.level)).c_str(), + vmFault.access, GpuPageFaultHelpers::faultAccessToString(static_cast(vmFault.access)).c_str(), + banned); + IoFunctions::fprintf(stdout, "Segmentation fault from GPU at 0x%llx, vm_id: %u (%s) type: %d (%s), level: %d (%s), access: %d (%s), banned: %d, aborting.\n", + vmFault.addr, + vmId, + EngineHelpers::engineTypeToString(osContext.getEngineType()).c_str(), + vmFault.type, GpuPageFaultHelpers::faultTypeToString(static_cast(vmFault.type)).c_str(), + vmFault.level, GpuPageFaultHelpers::faultLevelToString(static_cast(vmFault.level)).c_str(), + vmFault.access, GpuPageFaultHelpers::faultAccessToString(static_cast(vmFault.access)).c_str(), + banned); + } + UNRECOVERABLE_IF(true); + } + } + } + + if (getResetStatsSucceeded && (resetStats.batchActive > 0 || resetStats.batchPending > 0)) { PRINT_STRING(debugManager.flags.PrintDebugMessages.get(), stderr, "%s", "ERROR: GPU HANG detected!\n"); - osContextLinux->setHangDetected(); + osContext.setHangDetected(); return true; } } @@ -1756,9 +1814,10 @@ int Drm::createDrmVirtualMemory(uint32_t &drmVmId) { } bool useVmBind = isVmBindAvailable(); - bool enablePageFault = hasPageFaultSupport() && useVmBind; + bool disableScratch = checkToDisableScratchPage(); + bool enablePageFault = (hasPageFaultSupport() && useVmBind) || disableScratch; - ctl.flags = ioctlHelper->getFlagsForVmCreate(checkToDisableScratchPage(), enablePageFault, useVmBind); + ctl.flags = ioctlHelper->getFlagsForVmCreate(disableScratch, enablePageFault, useVmBind); auto ret = ioctlHelper->ioctl(DrmIoctl::gemVmCreate, &ctl); diff --git a/shared/source/os_interface/linux/drm_wrappers.h b/shared/source/os_interface/linux/drm_wrappers.h index 8d058f9e3137c..cc6fdfdfac190 100644 --- a/shared/source/os_interface/linux/drm_wrappers.h +++ b/shared/source/os_interface/linux/drm_wrappers.h @@ -341,7 +341,8 @@ enum class DrmIoctl { perfDisable, perfQuery, primaryContextExport, - 
primaryContextImport + primaryContextImport, + vmGetProperty }; enum class DrmParam { diff --git a/shared/source/os_interface/linux/ioctl_helper.h b/shared/source/os_interface/linux/ioctl_helper.h index a040e57b30d3c..6ff5b0d707dee 100644 --- a/shared/source/os_interface/linux/ioctl_helper.h +++ b/shared/source/os_interface/linux/ioctl_helper.h @@ -160,6 +160,7 @@ class IoctlHelper { virtual int vmBind(const VmBindParams &vmBindParams) = 0; virtual int vmUnbind(const VmBindParams &vmBindParams) = 0; virtual int getResetStats(ResetStats &resetStats, uint32_t *status, ResetStatsFault *resetStatsFault) = 0; + virtual int getVmFaults(uint32_t vmId, std::vector &faults) { return -1; } virtual bool isEuStallSupported() = 0; virtual uint32_t getEuStallFdParameter() = 0; virtual bool perfOpenEuStallStream(uint32_t euStallFdParameter, uint32_t &samplingPeriodNs, uint64_t engineInstance, uint64_t notifyNReports, uint64_t gpuTimeStampfrequency, int32_t *stream) = 0; diff --git a/shared/source/os_interface/linux/os_context_linux.h b/shared/source/os_interface/linux/os_context_linux.h index 715268d14a2ae..b06b97b2fbd49 100644 --- a/shared/source/os_interface/linux/os_context_linux.h +++ b/shared/source/os_interface/linux/os_context_linux.h @@ -25,18 +25,18 @@ class OsContextLinux : public OsContext { unsigned int getEngineFlag() const { return engineFlag; } void setEngineFlag(unsigned int engineFlag) { this->engineFlag = engineFlag; } - const std::vector &getDrmContextIds() const { return drmContextIds; } - const std::vector &getDrmVmIds() const { return drmVmIds; } + const std::vector &getDrmContextIds() const override { return drmContextIds; } + const std::vector &getDrmVmIds() const override { return drmVmIds; } bool isDirectSubmissionSupported() const override; Drm &getDrm() const; virtual std::pair getFenceAddressAndValToWait(uint32_t vmHandleId, bool isLocked); virtual void waitForPagingFence(); static OsContext *create(OSInterface *osInterface, uint32_t rootDeviceIndex, 
uint32_t contextId, const EngineDescriptor &engineDescriptor); void reInitializeContext() override; - void setHangDetected() { + void setHangDetected() override { contextHangDetected = true; } - bool isHangDetected() const { + bool isHangDetected() const override { return contextHangDetected; } diff --git a/shared/source/os_interface/linux/xe/ioctl_helper_xe.cpp b/shared/source/os_interface/linux/xe/ioctl_helper_xe.cpp index e98e243b5d53b..da1fc07fcfa2d 100644 --- a/shared/source/os_interface/linux/xe/ioctl_helper_xe.cpp +++ b/shared/source/os_interface/linux/xe/ioctl_helper_xe.cpp @@ -1217,7 +1217,95 @@ int IoctlHelperXe::vmUnbind(const VmBindParams &vmBindParams) { } int IoctlHelperXe::getResetStats(ResetStats &resetStats, uint32_t *status, ResetStatsFault *resetStatsFault) { - return ioctl(DrmIoctl::getResetStats, &resetStats); + prelim_drm_xe_exec_queue_ban_fault_ext faultExt{}; + faultExt.base.name = 0; + faultExt.base.next_extension = 0; + + drm_xe_exec_queue_get_property getProperty{}; + getProperty.exec_queue_id = resetStats.contextId; + getProperty.property = DRM_XE_EXEC_QUEUE_GET_PROPERTY_BAN; + getProperty.value = 0; + getProperty.extensions = reinterpret_cast<__u64>(&faultExt); + + const auto retVal = ioctl(DrmIoctl::getResetStats, &getProperty); + if (retVal != 0) { + return retVal; + } + + const auto banned = (getProperty.value & PRELIM_DRM_XE_EXEC_QUEUE_BAN_STATUS_BANNED) != 0; + resetStats.batchActive = banned ? 
1 : 0; + resetStats.batchPending = 0; + resetStats.resetCount = 0; + + if (status) { + *status = static_cast(getProperty.value); + } + if (resetStatsFault) { + resetStatsFault->addr = faultExt.addr; + resetStatsFault->type = faultExt.type; + resetStatsFault->level = faultExt.level; + resetStatsFault->access = faultExt.access; + resetStatsFault->flags = faultExt.flags; + } + + return retVal; +} + +int IoctlHelperXe::getVmFaults(uint32_t vmId, std::vector &faults) { + drm_xe_vm_get_property getProperty{}; + getProperty.vm_id = vmId; + getProperty.property = DRM_XE_VM_GET_PROPERTY_FAULTS; + getProperty.size = 0; + getProperty.data = 0; + + // First call: size == 0 and data == 0 ask only for the byte size of the VM's fault list + auto retVal = ioctl(DrmIoctl::vmGetProperty, &getProperty); + XELOG(" -> IoctlHelperXe::getVmFaults vmId=%u retVal=%d size=%u\n", vmId, retVal, getProperty.size); + if (retVal != 0) { + return retVal; + } + + if (getProperty.size == 0) { + faults.clear(); + return 0; + } + + // Allocate room for whole fault records only, then fetch them + const auto numFaults = getProperty.size / sizeof(xe_vm_fault); + std::vector faultBuffer(numFaults); + getProperty.data = reinterpret_cast(faultBuffer.data()); + getProperty.size = static_cast<__u32>(numFaults * sizeof(xe_vm_fault)); // never advertise more bytes than faultBuffer holds + retVal = ioctl(DrmIoctl::vmGetProperty, &getProperty); + if (retVal != 0) { + return retVal; + } + + // Convert to ResetStatsFault format + faults.clear(); + faults.reserve(numFaults); + for (const auto &fault : faultBuffer) { + ResetStatsFault resetFault{}; + resetFault.addr = fault.address; + resetFault.type = fault.fault_type; + resetFault.level = fault.fault_level; + resetFault.access = fault.access_type; + resetFault.flags = PRELIM_DRM_XE_EXEC_QUEUE_BAN_FAULT_VALID; // same bit validPageFault() tests + faults.push_back(resetFault); + } + + return 0; +} + +bool IoctlHelperXe::validPageFault(uint16_t flags) { + return (flags & PRELIM_DRM_XE_EXEC_QUEUE_BAN_FAULT_VALID) != 0; +} + +uint32_t IoctlHelperXe::getStatusForResetStats(bool banned) { + uint32_t retVal = 0u; + if (banned) { + retVal |= PRELIM_DRM_XE_EXEC_QUEUE_BAN_STATUS_BANNED; + } + return retVal; } UuidRegisterResult 
IoctlHelperXe::registerUuid(const std::string &uuid, uint32_t uuidClass, uint64_t ptr, uint64_t size) { @@ -1503,6 +1591,10 @@ int IoctlHelperXe::ioctl(DrmIoctl request, void *arg) { case DrmIoctl::perfOpen: { ret = perfOpenIoctl(request, arg); } break; + case DrmIoctl::vmGetProperty: { + ret = IoctlHelper::ioctl(request, arg); + XELOG(" -> IoctlHelperXe::ioctl VmGetProperty r=%d\n", ret); + } break; default: XELOG("Not handled 0x%x\n", request); @@ -2022,6 +2114,8 @@ unsigned int IoctlHelperXe::getIoctlRequestValue(DrmIoctl ioctlRequest) const { RETURN_ME(DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL); case DrmIoctl::getResetStats: RETURN_ME(DRM_IOCTL_XE_EXEC_QUEUE_GET_PROPERTY); + case DrmIoctl::vmGetProperty: + RETURN_ME(DRM_IOCTL_XE_VM_GET_PROPERTY); case DrmIoctl::debuggerOpen: case DrmIoctl::metadataCreate: case DrmIoctl::metadataDestroy: @@ -2089,6 +2183,8 @@ std::string IoctlHelperXe::getIoctlString(DrmIoctl ioctlRequest) const { STRINGIFY_ME(DRM_IOCTL_XE_DEBUG_METADATA_DESTROY); case DrmIoctl::getResetStats: STRINGIFY_ME(DRM_IOCTL_XE_EXEC_QUEUE_GET_PROPERTY); + case DrmIoctl::vmGetProperty: + STRINGIFY_ME(DRM_IOCTL_XE_VM_GET_PROPERTY); default: return "???"; } diff --git a/shared/source/os_interface/linux/xe/ioctl_helper_xe.h b/shared/source/os_interface/linux/xe/ioctl_helper_xe.h index 38d3ab66dcc42..d918ed34156b7 100644 --- a/shared/source/os_interface/linux/xe/ioctl_helper_xe.h +++ b/shared/source/os_interface/linux/xe/ioctl_helper_xe.h @@ -87,6 +87,9 @@ class IoctlHelperXe : public IoctlHelper { int vmBind(const VmBindParams &vmBindParams) override; int vmUnbind(const VmBindParams &vmBindParams) override; int getResetStats(ResetStats &resetStats, uint32_t *status, ResetStatsFault *resetStatsFault) override; + int getVmFaults(uint32_t vmId, std::vector &faults) override; + bool validPageFault(uint16_t flags) override; + uint32_t getStatusForResetStats(bool banned) override; bool isEuStallSupported() override; uint32_t getEuStallFdParameter() override; bool 
perfOpenEuStallStream(uint32_t euStallFdParameter, uint32_t &samplingPeriodNs, uint64_t engineInstance, uint64_t notifyNReports, uint64_t gpuTimeStampfrequency, int32_t *stream) override; diff --git a/shared/source/os_interface/linux/xe/xedrm.h b/shared/source/os_interface/linux/xe/xedrm.h index 6475695014303..c91ba4f1f4f10 100644 --- a/shared/source/os_interface/linux/xe/xedrm.h +++ b/shared/source/os_interface/linux/xe/xedrm.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2024 Intel Corporation + * Copyright (C) 2024-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -13,7 +13,60 @@ namespace NEO { namespace XeDrm { #include "xe_drm.h" -} + +struct prelim_drm_xe_exec_queue_ban_fault_ext { // NOLINT(readability-identifier-naming) + struct drm_xe_user_extension base; + __u64 addr; + __u16 type; + __u16 level; + __u16 access; + __u16 flags; +}; + +#define PRELIM_DRM_XE_EXEC_QUEUE_BAN_FAULT_VALID (1 << 0) +#define PRELIM_DRM_XE_EXEC_QUEUE_BAN_STATUS_BANNED (1 << 0) + +// VM Get Property ioctl - upstream API for querying VM faults +#define DRM_XE_VM_GET_PROPERTY 0x0f + +struct xe_vm_fault { // NOLINT(readability-identifier-naming) + __u64 address; + __u32 address_precision; // NOLINT(readability-identifier-naming) +#define FAULT_ACCESS_TYPE_READ 0 +#define FAULT_ACCESS_TYPE_WRITE 1 +#define FAULT_ACCESS_TYPE_ATOMIC 2 + __u8 access_type; // NOLINT(readability-identifier-naming) +#define FAULT_TYPE_NOT_PRESENT 0 +#define FAULT_TYPE_WRITE_ACCESS 1 +#define FAULT_TYPE_ATOMIC_ACCESS 2 + __u8 fault_type; // NOLINT(readability-identifier-naming) +#define FAULT_LEVEL_PTE 0 +#define FAULT_LEVEL_PDE 1 +#define FAULT_LEVEL_PDP 2 +#define FAULT_LEVEL_PML4 3 +#define FAULT_LEVEL_PML5 4 + __u8 fault_level; // NOLINT(readability-identifier-naming) + __u8 pad; + __u64 reserved[4]; +}; + +struct drm_xe_vm_get_property { // NOLINT(readability-identifier-naming) + __u64 extensions; + __u32 vm_id; // NOLINT(readability-identifier-naming) +#define DRM_XE_VM_GET_PROPERTY_FAULTS 0 + __u32 
property; + __u32 size; + __u32 pad; + union { + __u64 data; + __u64 value; + }; + __u64 reserved[3]; +}; + +#define DRM_IOCTL_XE_VM_GET_PROPERTY DRM_IOWR(DRM_COMMAND_BASE + DRM_XE_VM_GET_PROPERTY, struct drm_xe_vm_get_property) + +} // namespace XeDrm } // namespace NEO using namespace NEO::XeDrm; diff --git a/shared/source/os_interface/os_context.cpp b/shared/source/os_interface/os_context.cpp index cf1706c47c889..d479d05bb835b 100644 --- a/shared/source/os_interface/os_context.cpp +++ b/shared/source/os_interface/os_context.cpp @@ -15,6 +15,9 @@ #include "shared/source/os_interface/product_helper.h" namespace NEO { + +const std::vector OsContext::emptyIdVector{}; + OsContext::OsContext(uint32_t rootDeviceIndex, uint32_t contextId, const EngineDescriptor &engineDescriptor) : rootDeviceIndex(rootDeviceIndex), contextId(contextId), diff --git a/shared/source/os_interface/os_context.h b/shared/source/os_interface/os_context.h index b2d0a0d570118..cbe4e2315ace2 100644 --- a/shared/source/os_interface/os_context.h +++ b/shared/source/os_interface/os_context.h @@ -12,6 +12,7 @@ #include #include +#include namespace NEO { class OSInterface; @@ -68,6 +69,15 @@ class OsContext : public ReferenceTrackedObject { bool &startInContext); virtual void reInitializeContext() {} + virtual const std::vector &getDrmContextIds() const { + return emptyIdVector; + } + virtual const std::vector &getDrmVmIds() const { + return emptyIdVector; + } + virtual void setHangDetected() {} + virtual bool isHangDetected() const { return false; } + static constexpr uint8_t getUmdPowerHintMax() { return NEO::OsContext::powerHintMax; } uint8_t getUmdPowerHintValue() { return powerHintValue; } void setUmdPowerHintValue(uint8_t powerHintValue) { this->powerHintValue = powerHintValue; } @@ -141,6 +151,7 @@ class OsContext : public ReferenceTrackedObject { bool initializeInternalEngineImmediately = true; uint8_t powerHintValue = 0; static constexpr inline uint8_t powerHintMax = 100u; // by definition: 
100% power-saving + static const std::vector emptyIdVector; uint32_t contextGroupCount = 0; const OsContext *primaryContext = nullptr; diff --git a/shared/source/os_interface/windows/os_context_win.cpp b/shared/source/os_interface/windows/os_context_win.cpp index 4152818b763bf..5e8dedcb1fdd8 100644 --- a/shared/source/os_interface/windows/os_context_win.cpp +++ b/shared/source/os_interface/windows/os_context_win.cpp @@ -108,6 +108,10 @@ bool OsContextWin::isDirectSubmissionSupported() const { return !isWSL && productHelper.isDirectSubmissionSupported(); } +bool OsContextWin::isHangDetected() const { + return monitoredFence.cpuAddress && *monitoredFence.cpuAddress == Wddm::gpuHangIndication; +} + OsContextWin::~OsContextWin() { if (contextInitialized && (false == this->wddm.skipResourceCleanup())) { wddm.getWddmInterface()->destroyHwQueue(hardwareQueue.handle); diff --git a/shared/source/os_interface/windows/os_context_win.h b/shared/source/os_interface/windows/os_context_win.h index ad8e83fc6bccc..f6343570d2449 100644 --- a/shared/source/os_interface/windows/os_context_win.h +++ b/shared/source/os_interface/windows/os_context_win.h @@ -44,6 +44,7 @@ class OsContextWin : public OsContext { void getDeviceLuidArray(std::vector &luidData, size_t arraySize); uint32_t getDeviceNodeMask(); uint64_t getOfflineDumpContextId(uint32_t deviceIndex) const override; + bool isHangDetected() const override; protected: bool initializeContext(bool allocateInterrupt) override; diff --git a/shared/source/os_interface/windows/wddm/wddm.cpp b/shared/source/os_interface/windows/wddm/wddm.cpp index 9ba28d91a6d1a..9061a5f793a00 100644 --- a/shared/source/os_interface/windows/wddm/wddm.cpp +++ b/shared/source/os_interface/windows/wddm/wddm.cpp @@ -1252,9 +1252,7 @@ bool Wddm::waitFromCpu(uint64_t lastFenceValue, const MonitoredFence &monitoredF } bool Wddm::isGpuHangDetected(OsContext &osContext) { - const auto osContextWin = static_cast(&osContext); - const auto &monitoredFence = 
osContextWin->getMonitoredFence(); - bool hangDetected = monitoredFence.cpuAddress && *monitoredFence.cpuAddress == gpuHangIndication; + bool hangDetected = osContext.isHangDetected(); PRINT_STRING(hangDetected && debugManager.flags.PrintDebugMessages.get(), stderr, "%s", "ERROR: GPU HANG detected!\n"); diff --git a/shared/test/common/os_interface/linux/drm_memory_manager_fixture.h b/shared/test/common/os_interface/linux/drm_memory_manager_fixture.h index 6e587e897d762..1bfdc2e7fb274 100644 --- a/shared/test/common/os_interface/linux/drm_memory_manager_fixture.h +++ b/shared/test/common/os_interface/linux/drm_memory_manager_fixture.h @@ -59,6 +59,7 @@ class DrmMemoryManagerFixture : public MemoryManagementFixture { executionEnvironment->incRefInternal(); debugManager.flags.DeferOsContextInitialization.set(0); debugManager.flags.SetAmountOfReusableAllocations.set(0); + debugManager.flags.DisableGpuHangDetection.set(1); environmentWrapper.setCsrType>(); allocationData.rootDeviceIndex = rootDeviceIndex; diff --git a/shared/test/common/os_interface/linux/xe/mock_drm_xe.inl b/shared/test/common/os_interface/linux/xe/mock_drm_xe.inl index 13f97ec1cf77c..57801cac6ccd7 100644 --- a/shared/test/common/os_interface/linux/xe/mock_drm_xe.inl +++ b/shared/test/common/os_interface/linux/xe/mock_drm_xe.inl @@ -56,12 +56,25 @@ struct DrmMockXe : public DrmMockCustom { int waitUserFenceReturn = 0; int execQueueBanPropertyReturn = 0; + int getResetStatsReturn = 0; // non-zero: the mocked getResetStats ioctl returns this value without touching its argument uint32_t createParamsFlags = 0u; uint16_t createParamsCpuCaching = 0u; uint32_t createParamsPlacement = 0u; bool ioctlCalled = false; bool forceMmapOffsetFail = false; + // VM faults mock data + struct VmFaultMock { + uint64_t address; + uint32_t addressPrecision; + uint8_t accessType; + uint8_t faultType; + uint8_t faultLevel; + }; + std::vector mockVmFaults; + int vmGetPropertyCallCount = 0; // incremented on every mocked vmGetProperty ioctl + int vmGetPropertyFailOnCall = 0; // 0 = never fail, N = fail on the Nth call and every call after it (handler compares with >=) + protected: // Don't call directly, use the create() 
function DrmMockXe(RootDeviceEnvironment &rootDeviceEnvironment) diff --git a/shared/test/common/os_interface/linux/xe/mock_drm_xe_definitions.inl b/shared/test/common/os_interface/linux/xe/mock_drm_xe_definitions.inl index a9d5b8d3ce15c..b88fc29cff3d5 100644 --- a/shared/test/common/os_interface/linux/xe/mock_drm_xe_definitions.inl +++ b/shared/test/common/os_interface/linux/xe/mock_drm_xe_definitions.inl @@ -135,11 +135,63 @@ int DrmMockXe::ioctl(DrmIoctl request, void *arg) { ret = -2; break; case DrmIoctl::getResetStats: { + if (getResetStatsReturn != 0) { + ret = getResetStatsReturn; + break; + } auto execQueueProperty = static_cast(arg); EXPECT_EQ(execQueueProperty->property, static_cast(DRM_XE_EXEC_QUEUE_GET_PROPERTY_BAN)); execQueueProperty->value = execQueueBanPropertyReturn; ret = 0; } break; + case DrmIoctl::vmGetProperty: { + vmGetPropertyCallCount++; + if (vmGetPropertyFailOnCall > 0 && vmGetPropertyCallCount >= vmGetPropertyFailOnCall) { + ret = -1; + break; + } + // Use local struct definitions to work across all Xe mock variants (XeDrm, XeDrmPrelim, XeJgs) + struct VmFault { + __u64 address; + __u32 address_precision; + __u8 access_type; + __u8 fault_type; + __u8 fault_level; + __u8 pad; + __u64 reserved[4]; + }; + struct VmGetProperty { + __u64 extensions; + __u32 vm_id; + __u32 property; + __u32 size; + __u32 pad; + __u64 data; + __u64 reserved[3]; + }; + constexpr __u32 vmGetPropertyFaults = 0; + auto vmProperty = static_cast(arg); + if (vmProperty->property == vmGetPropertyFaults) { + if (mockVmFaults.empty()) { + vmProperty->size = 0; + } else { + vmProperty->size = static_cast<__u32>(mockVmFaults.size() * sizeof(VmFault)); + if (vmProperty->data != 0) { + auto *faultData = reinterpret_cast(vmProperty->data); + for (size_t i = 0; i < mockVmFaults.size(); i++) { + faultData[i].address = mockVmFaults[i].address; + faultData[i].address_precision = mockVmFaults[i].addressPrecision; + faultData[i].access_type = mockVmFaults[i].accessType; + 
faultData[i].fault_type = mockVmFaults[i].faultType; + faultData[i].fault_level = mockVmFaults[i].faultLevel; + faultData[i].pad = 0; + memset(faultData[i].reserved, 0, sizeof(faultData[i].reserved)); + } + } + } + ret = 0; + } + } break; case DrmIoctl::query: { struct drm_xe_device_query *deviceQuery = static_cast(arg); switch (deviceQuery->query) { diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index e9024e288ee23..9878e5d9f3e31 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -444,6 +444,20 @@ HWTEST_F(CommandStreamReceiverTest, givenCheckingGpuHangWhenGpuHangDetectedThenG EXPECT_EQ(1u, driverModel->getDeviceStateCalledCount); } +HWTEST_F(CommandStreamReceiverTest, givenCheckingGpuHangWhenNoGpuHangDetectedThenFalseIsReturned) { + auto driverModelMock = std::make_unique(); + driverModelMock->isGpuHangDetectedToReturn = false; + auto driverModel = driverModelMock.get(); + auto osInterface = std::make_unique(); + osInterface->setDriverModel(std::move(driverModelMock)); + + auto &csr = pDevice->getUltCommandStreamReceiver(); + csr.executionEnvironment.rootDeviceEnvironments[csr.rootDeviceIndex]->osInterface = std::move(osInterface); + + EXPECT_FALSE(csr.isGpuHangDetected()); + EXPECT_EQ(0u, driverModel->getDeviceStateCalledCount); +} + HWTEST_F(CommandStreamReceiverTest, givenGpuHangWhenWaititingForCompletionWithTimeoutThenGpuHangIsReturned) { auto driverModelMock = std::make_unique(); driverModelMock->isGpuHangDetectedToReturn = true; @@ -495,6 +509,32 @@ HWTEST_F(CommandStreamReceiverTest, givenNoGpuHangWhenWaititingForCompletionWith EXPECT_EQ(WaitStatus::ready, waitStatus); } +HWTEST_F(CommandStreamReceiverTest, givenTaskAlreadyCompletedAndGpuHangDetectedOnFinalCheckWhenWaitingForCompletionThenGpuHangIsReturned) { + auto driverModelMock = 
std::make_unique(); + driverModelMock->isGpuHangDetectedToReturn = true; + + auto osInterface = std::make_unique(); + osInterface->setDriverModel(std::move(driverModelMock)); + + auto &csr = pDevice->getUltCommandStreamReceiver(); + csr.executionEnvironment.rootDeviceEnvironments[csr.rootDeviceIndex]->osInterface = std::move(osInterface); + csr.callBaseWaitForCompletionWithTimeout = true; + csr.activePartitions = 1; + csr.gpuHangCheckPeriod = 1000000us; // Long period so periodic check won't trigger + + // Task is already complete from the start + TagAddressType tasksCount[16] = {5}; + csr.tagAddress = tasksCount; + + constexpr auto enableTimeout = false; + constexpr auto timeoutMicroseconds = std::numeric_limits::max(); + constexpr auto taskCountToWait = 1; // Less than tasksCount[0], so task is already done + + // Even though task is complete, the final hang check should detect the GPU hang + const auto waitStatus = csr.waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, taskCountToWait); + EXPECT_EQ(WaitStatus::gpuHang, waitStatus); +} + HWTEST_F(CommandStreamReceiverTest, givenFailingFlushSubmissionsAndGpuHangWhenWaititingForCompletionWithTimeoutThenGpuHangIsReturned) { auto driverModelMock = std::make_unique(); driverModelMock->isGpuHangDetectedToReturn = true; diff --git a/shared/test/unit_test/os_interface/linux/drm_tests.cpp b/shared/test/unit_test/os_interface/linux/drm_tests.cpp index e126454495b64..f7212a36c909b 100644 --- a/shared/test/unit_test/os_interface/linux/drm_tests.cpp +++ b/shared/test/unit_test/os_interface/linux/drm_tests.cpp @@ -1405,7 +1405,7 @@ TEST(DrmTest, GivenMinusEbusyIoctlErrorWhenCallingExecbufferThenCallIoctlAgain) EXPECT_EQ(0, drm.Drm::ioctl(DrmIoctl::gemExecbuffer2, nullptr)); } -TEST(DrmTest, GivenIoctlErrorWhenIsGpuHangIsCalledThenErrorIsThrown) { +TEST(DrmTest, GivenIoctlErrorWhenIsGpuHangIsCalledThenNoHangDetected) { MockExecutionEnvironment executionEnvironment{}; DrmMock 
drm{*executionEnvironment.rootDeviceEnvironments[0]}; @@ -1416,7 +1416,9 @@ TEST(DrmTest, GivenIoctlErrorWhenIsGpuHangIsCalledThenErrorIsThrown) { mockOsContextLinux.drmContextIds.push_back(0); mockOsContextLinux.drmContextIds.push_back(3); - EXPECT_THROW(drm.isGpuHangDetected(mockOsContextLinux), std::runtime_error); + // getResetStats fails (no resetStatsToReturn entries), but gracefully handled + EXPECT_FALSE(drm.isGpuHangDetected(mockOsContextLinux)); + EXPECT_FALSE(mockOsContextLinux.isHangDetected()); } TEST(DrmTest, GivenZeroBatchActiveAndZeroBatchPendingResetStatsWhenIsGpuHangIsCalledThenNoHangIsReported) { diff --git a/shared/test/unit_test/os_interface/linux/xe/ioctl_helper_xe_tests.cpp b/shared/test/unit_test/os_interface/linux/xe/ioctl_helper_xe_tests.cpp index 50c8716bb1fe9..859cbdd3144d7 100644 --- a/shared/test/unit_test/os_interface/linux/xe/ioctl_helper_xe_tests.cpp +++ b/shared/test/unit_test/os_interface/linux/xe/ioctl_helper_xe_tests.cpp @@ -597,6 +597,7 @@ TEST_F(IoctlHelperXeTest, givenIoctlHelperXeWhenCallingAnyMethodThenDummyValueIs verifyIoctlString(DrmIoctl::syncObjTimelineWait, "DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT"); verifyIoctlString(DrmIoctl::syncObjTimelineSignal, "DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL"); verifyIoctlString(DrmIoctl::getResetStats, "DRM_IOCTL_XE_EXEC_QUEUE_GET_PROPERTY"); + verifyIoctlString(DrmIoctl::vmGetProperty, "DRM_IOCTL_XE_VM_GET_PROPERTY"); EXPECT_TRUE(xeIoctlHelper->completionFenceExtensionSupported(true)); @@ -663,6 +664,7 @@ TEST_F(IoctlHelperXeTest, whenGettingIoctlRequestValueThenPropertValueIsReturned verifyIoctlRequestValue(DRM_IOCTL_SYNCOBJ_SIGNAL, DrmIoctl::syncObjSignal); verifyIoctlRequestValue(DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, DrmIoctl::syncObjTimelineWait); verifyIoctlRequestValue(DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL, DrmIoctl::syncObjTimelineSignal); + verifyIoctlRequestValue(DRM_IOCTL_XE_VM_GET_PROPERTY, DrmIoctl::vmGetProperty); } TEST_F(IoctlHelperXeTest, verifyPublicFunctions) { @@ -2511,15 +2513,185 @@ 
TEST_F(IoctlHelperXeTest, whenCallingGetResetStatsThenSuccessIsReturned) { EXPECT_EQ(0, xeIoctlHelper->getResetStats(resetStats, nullptr, nullptr)); } -TEST_F(IoctlHelperXeTest, whenCallingGetStatusAndFlagsForResetStatsThenZeroIsReturned) { +TEST_F(IoctlHelperXeTest, givenVmIdWhenCallingGetVmFaultsThenFaultsAreReturned) { + auto executionEnvironment = std::make_unique(); + auto drm = DrmMockXe::create(*executionEnvironment->rootDeviceEnvironments[0]); + auto xeIoctlHelper = static_cast(drm->getIoctlHelper()); + + xeIoctlHelper->initialize(); + + uint32_t vmId = 123; + std::vector faults; + + auto ret = xeIoctlHelper->getVmFaults(vmId, faults); + EXPECT_EQ(0, ret); + EXPECT_TRUE(faults.empty()); +} + +TEST_F(IoctlHelperXeTest, givenVmFaultsWhenCallingGetVmFaultsThenFaultDataIsReturned) { + auto executionEnvironment = std::make_unique(); + auto drm = DrmMockXe::create(*executionEnvironment->rootDeviceEnvironments[0]); + auto xeIoctlHelper = static_cast(drm->getIoctlHelper()); + + xeIoctlHelper->initialize(); + + // Add mock faults + drm->mockVmFaults.push_back({0xDEADBEEF000, 0, 1, 2, 3}); + drm->mockVmFaults.push_back({0xCAFEBABE000, 1, 2, 3, 4}); + + uint32_t vmId = 123; + std::vector faults; + + auto ret = xeIoctlHelper->getVmFaults(vmId, faults); + EXPECT_EQ(0, ret); + ASSERT_EQ(2u, faults.size()); + + EXPECT_EQ(0xDEADBEEF000u, faults[0].addr); + EXPECT_EQ(1u, faults[0].access); + EXPECT_EQ(2u, faults[0].type); + EXPECT_EQ(3u, faults[0].level); + EXPECT_EQ(1u, faults[0].flags); // Valid flag + + EXPECT_EQ(0xCAFEBABE000u, faults[1].addr); + EXPECT_EQ(2u, faults[1].access); + EXPECT_EQ(3u, faults[1].type); + EXPECT_EQ(4u, faults[1].level); + EXPECT_EQ(1u, faults[1].flags); // Valid flag +} + +TEST_F(IoctlHelperXeTest, givenSecondIoctlFailureWhenCallingGetVmFaultsThenErrorIsReturned) { + auto executionEnvironment = std::make_unique(); + auto drm = DrmMockXe::create(*executionEnvironment->rootDeviceEnvironments[0]); + auto xeIoctlHelper = 
static_cast(drm->getIoctlHelper()); + + xeIoctlHelper->initialize(); + + // Add mock faults to ensure first ioctl succeeds with non-zero size + drm->mockVmFaults.push_back({0xDEADBEEF000, 0, 1, 2, 3}); + + // Make the second vmGetProperty call fail (first call gets size, second gets data) + drm->vmGetPropertyFailOnCall = 2; + + uint32_t vmId = 123; + std::vector faults; + + auto ret = xeIoctlHelper->getVmFaults(vmId, faults); + EXPECT_EQ(-1, ret); +} + +TEST_F(IoctlHelperXeTest, whenCallingGetStatusAndFlagsForResetStatsThenCorrectValuesReturned) { auto executionEnvironment = std::make_unique(); auto drm = DrmMockXe::create(*executionEnvironment->rootDeviceEnvironments[0]); auto ioctlHelper = static_cast(drm->getIoctlHelper()); - EXPECT_EQ(0u, ioctlHelper->getStatusForResetStats(true)); + EXPECT_EQ(1u, ioctlHelper->getStatusForResetStats(true)); EXPECT_EQ(0u, ioctlHelper->getStatusForResetStats(false)); EXPECT_FALSE(ioctlHelper->validPageFault(0u)); + EXPECT_TRUE(ioctlHelper->validPageFault(0x0001u)); + EXPECT_TRUE(ioctlHelper->validPageFault(0x0003u)); +} + +TEST_F(IoctlHelperXeTest, givenGetResetStatsFailsWhenCheckingResetStatusThenNoHangDetected) { + auto executionEnvironment = std::make_unique(); + auto drm = DrmMockXe::create(*executionEnvironment->rootDeviceEnvironments[0]); + auto xeIoctlHelper = static_cast(drm->getIoctlHelper()); + xeIoctlHelper->initialize(); + + MockOsContextLinux osContext(*drm, 0, 5u, EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_CCS, EngineUsage::regular})); + osContext.drmContextIds.push_back(0); + + // Simulate getResetStats ioctl failure + drm->getResetStatsReturn = -1; + + // No VM faults, getResetStats fails, should not detect hang + EXPECT_FALSE(drm->checkResetStatus(osContext)); + EXPECT_FALSE(osContext.isHangDetected()); +} + +TEST_F(IoctlHelperXeTest, givenVmFaultsWhenCheckingResetStatusWithDisabledScratchThenProcessTerminated) { + DebugManagerStateRestore restorer; + 
debugManager.flags.DisableScratchPages.set(true); + + auto executionEnvironment = std::make_unique(); + auto drm = DrmMockXe::create(*executionEnvironment->rootDeviceEnvironments[0]); + drm->configureScratchPagePolicy(); + auto xeIoctlHelper = static_cast(drm->getIoctlHelper()); + xeIoctlHelper->initialize(); + + // Add mock VM faults + drm->mockVmFaults.push_back({0xDEADBEEF000, 0, 1, 2, 3}); + + MockOsContextLinux osContext(*drm, 0, 5u, EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_CCS, EngineUsage::regular})); + osContext.drmContextIds.push_back(0); + osContext.drmVmIds.push_back(123); // Add VM ID + + // Make getResetStats succeed but return not banned + drm->execQueueBanPropertyReturn = 0; + + // Capture output to avoid SIGPIPE when test runner pipes are closed + StreamCapture capture; + capture.captureStderr(); + capture.captureStdout(); + + // Should terminate due to VM fault (UNRECOVERABLE_IF) + EXPECT_THROW(drm->checkResetStatus(osContext), std::runtime_error); + + // Verify output contains expected fault message + auto stderrOutput = capture.getCapturedStderr(); + auto stdoutOutput = capture.getCapturedStdout(); + EXPECT_TRUE(stderrOutput.find("Segmentation fault from GPU") != std::string::npos); + EXPECT_TRUE(stdoutOutput.find("Segmentation fault from GPU") != std::string::npos); +} + +TEST_F(IoctlHelperXeTest, givenVmFaultsAndDebuggingEnabledWhenCheckingResetStatusThenNoHangDetected) { + DebugManagerStateRestore restorer; + debugManager.flags.DisableScratchPages.set(true); + + auto executionEnvironment = std::make_unique(); + executionEnvironment->setDebuggingMode(DebuggingMode::online); + auto drm = DrmMockXe::create(*executionEnvironment->rootDeviceEnvironments[0]); + drm->configureScratchPagePolicy(); + auto xeIoctlHelper = static_cast(drm->getIoctlHelper()); + xeIoctlHelper->initialize(); + + // Add mock VM faults + drm->mockVmFaults.push_back({0xDEADBEEF000, 0, 1, 2, 3}); + + MockOsContextLinux osContext(*drm, 0, 5u, 
EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_CCS, EngineUsage::regular})); + osContext.drmContextIds.push_back(0); + osContext.drmVmIds.push_back(123); // Add VM ID + + // Make getResetStats succeed but return not banned (debugging case) + drm->execQueueBanPropertyReturn = 0; + + // Should return false (early return) when debugging is enabled and not banned + EXPECT_FALSE(drm->checkResetStatus(osContext)); + EXPECT_FALSE(osContext.isHangDetected()); +} + +TEST_F(IoctlHelperXeTest, givenNoFaultsWhenCheckingResetStatusWithDisabledScratchThenNoHangDetected) { + DebugManagerStateRestore restorer; + debugManager.flags.DisableScratchPages.set(true); + + auto executionEnvironment = std::make_unique(); + auto drm = DrmMockXe::create(*executionEnvironment->rootDeviceEnvironments[0]); + drm->configureScratchPagePolicy(); + auto xeIoctlHelper = static_cast(drm->getIoctlHelper()); + xeIoctlHelper->initialize(); + + // No mock VM faults added + + MockOsContextLinux osContext(*drm, 0, 5u, EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_CCS, EngineUsage::regular})); + osContext.drmContextIds.push_back(0); + osContext.drmVmIds.push_back(123); // Add VM ID + + // Make getResetStats succeed but return not banned + drm->execQueueBanPropertyReturn = 0; + + // Should not detect hang since there are no VM faults and exec queue is not banned + EXPECT_FALSE(drm->checkResetStatus(osContext)); + EXPECT_FALSE(osContext.isHangDetected()); } TEST_F(IoctlHelperXeTest, whenInitializeThenProperHwInfoIsSet) { diff --git a/shared/test/unit_test/os_interface/os_context_tests.cpp b/shared/test/unit_test/os_interface/os_context_tests.cpp index 3fd4d294f4e50..c0292731eac7b 100644 --- a/shared/test/unit_test/os_interface/os_context_tests.cpp +++ b/shared/test/unit_test/os_interface/os_context_tests.cpp @@ -78,6 +78,26 @@ TEST(OSContext, givenReinitializeContextWhenContextIsInitThenContextIsStillIinit delete pOsContext; } +TEST(OSContext, 
givenBaseOsContextWhenCallingDrmMethodsThenDefaultImplementationsAreUsed) { + auto engineDescriptor = EngineDescriptorHelper::getDefaultDescriptor(); + auto pOsContext = OsContext::create(nullptr, 0, 0, engineDescriptor); + + // Test getDrmContextIds - should return empty vector + const auto &contextIds = pOsContext->getDrmContextIds(); + EXPECT_TRUE(contextIds.empty()); + + // Test getDrmVmIds - should return empty vector + const auto &vmIds = pOsContext->getDrmVmIds(); + EXPECT_TRUE(vmIds.empty()); + + // Test setHangDetected and isHangDetected - base implementation does nothing + EXPECT_FALSE(pOsContext->isHangDetected()); + pOsContext->setHangDetected(); + EXPECT_FALSE(pOsContext->isHangDetected()); // Should still be false (base impl is no-op) + + delete pOsContext; +} + TEST(OSContext, givenSetPowerHintThenGetPowerHintShowsTheSameValue) { auto engineDescriptor = EngineDescriptorHelper::getDefaultDescriptor(); auto pOsContext = OsContext::create(nullptr, 0, 0, engineDescriptor);