/* Copyright (C) 2025 Wildfire Games.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "precompiled.h"

#include "Profiler2GPU.h"

#include "lib/debug.h"
#include "lib/types.h"
#include "ps/ConfigDB.h"
#include "ps/Profiler2.h"
#include "ps/VideoMode.h"
#include "renderer/backend/IDevice.h"
#include "renderer/backend/IDeviceCommandContext.h"

// NOTE(review): the standard header names were lost in the reviewed copy of
// this file; the list below is reconstructed from the standard names this
// file uses. Verify against the upstream include list before merging.
#include <cstddef>
#include <cstdint>
#include <deque>
#include <memory>
#include <utility>
#include <vector>

/**
 * At each enter/leave-region event, we do an async GPU timestamp query.
 * When all the queries for a frame have their results available,
 * we convert their GPU timestamps into durations and record the data.
 */
class CProfiler2GPUImpl
{
	NONCOPYABLE(CProfiler2GPUImpl);

	/// One enter/leave marker together with the backend query that timestamps it.
	struct SEvent
	{
		const char* id;
		uint32_t queryHandle;
		bool isEnter; // true if entering region; false if leaving
	};

	/// All events recorded between one FrameStart and the next.
	struct SFrame
	{
		u32 num;
		double syncTimeStart; // CPU time at start of this frame.
		std::vector<SEvent> events;
	};

	// Frames whose query results have not been collected yet, oldest first.
	std::deque<SFrame> m_Frames;

public:
	/**
	 * Creates the "gpu" thread storage, writes its initial sync marker and
	 * "thread start" event, and registers it with @p profiler.
	 */
	CProfiler2GPUImpl(CProfiler2& profiler) :
		m_Profiler(profiler),
		// NOTE(review): allocated with new and never deleted in this file;
		// confirm whether CProfiler2 takes ownership of the storage.
		m_Storage(*new CProfiler2::ThreadStorage(profiler, "gpu"))
	{
		m_Storage.RecordSyncMarker(m_Profiler.GetTime());
		m_Storage.Record(CProfiler2::ITEM_EVENT, m_Profiler.GetTime(), "thread start");

		m_Profiler.AddThreadStorage(&m_Storage);
	}

	/**
	 * Discards all pending frames (their results are not recorded), frees
	 * every backend query and unregisters the thread storage.
	 */
	~CProfiler2GPUImpl()
	{
		// Move the queries of all pending frames into the free list first so
		// that a single loop below releases every allocated query.
		while (!m_Frames.empty())
			PopFrontFrame();

		for (const uint32_t queryHandle : m_FreeQueries)
			g_VideoMode.GetBackendDevice()->FreeQuery(queryHandle);
		m_FreeQueries.clear();

		m_Profiler.RemoveThreadStorage(&m_Storage);
	}

	/**
	 * Collects the results of finished frames, then opens a new frame and
	 * enters its implicit "frame" region.
	 */
	void FrameStart(Renderer::Backend::IDeviceCommandContext* deviceCommandContext)
	{
		ProcessFrames();

		SFrame frame;
		frame.num = m_Profiler.GetFrameNumber();

		// GL backend:
		// On (at least) some NVIDIA Windows drivers, when GPU-bound, or when
		// vsync enabled and not CPU-bound, the first glGet* call at the start
		// of a frame appears to trigger a wait (to stop the GPU getting too
		// far behind, or to wait for the vsync period).
		// That will be this GL_TIMESTAMP get, which potentially distorts the
		// reported results. So we'll only do it fairly rarely, and for most
		// frames we'll just assume the clocks don't drift much.

		// Timestamps might shift and overflow for all backends. So for
		// simplicity we don't synchronize the frame start on CPU and GPU, as
		// we only need durations. We just roughly assume that the first
		// timestamp on GPU matches the CPU frame start. For real timestamps
		// it's better to use GPU Trace instruments.
		frame.syncTimeStart = m_Profiler.GetTime();

		m_Frames.push_back(frame);

		RegionEnter(deviceCommandContext, "frame");
	}

	/// Leaves the implicit "frame" region opened by FrameStart.
	void FrameEnd(Renderer::Backend::IDeviceCommandContext* deviceCommandContext)
	{
		RegionLeave(deviceCommandContext, "frame");
	}

	/**
	 * Inserts a timestamp query into @p deviceCommandContext and appends the
	 * matching event to the most recent frame.
	 *
	 * @param id region name; only the pointer is stored, not a copy of the text.
	 * @param isEnter true when entering the region, false when leaving it.
	 */
	void RecordRegion(Renderer::Backend::IDeviceCommandContext* deviceCommandContext, const char* id, bool isEnter)
	{
		// A region can only be recorded after FrameStart opened a frame.
		ENSURE(!m_Frames.empty());
		SFrame& frame = m_Frames.back();

		SEvent event;
		event.id = id;
		event.queryHandle = NewQuery();
		event.isEnter = isEnter;

		deviceCommandContext->InsertTimestampQuery(event.queryHandle, isEnter);

		frame.events.push_back(event);
	}

	void RegionEnter(Renderer::Backend::IDeviceCommandContext* deviceCommandContext, const char* id)
	{
		RecordRegion(deviceCommandContext, id, true);
	}

	void RegionLeave(Renderer::Backend::IDeviceCommandContext* deviceCommandContext, const char* id)
	{
		RecordRegion(deviceCommandContext, id, false);
	}

private:
	/**
	 * Records every pending frame whose query results are available, oldest
	 * first, and stops at the first frame that is not ready yet.
	 */
	void ProcessFrames()
	{
		Renderer::Backend::IDevice* device{g_VideoMode.GetBackendDevice()};

		while (!m_Frames.empty())
		{
			SFrame& frame = m_Frames.front();

			// A frame without events has nothing to record. Retire it here
			// instead of calling back() on an empty vector, which would be
			// undefined behaviour.
			if (frame.events.empty())
			{
				PopFrontFrame();
				continue;
			}

			// We assume queries become available in order so we only need to
			// check the last one.
			if (!device->IsQueryResultAvailable(frame.events.back().queryHandle))
				break;

			// We use the first event GPU timestamp as a frame start.
			const uint64_t firstFrameTimestamp{
				device->GetQueryResult(frame.events.front().queryHandle)};

			const double timestampMultiplier{
				device->GetCapabilities().timestampMultiplier};

			// The frame's queries are now available, so retrieve and record all their results:
			for (size_t i = 0; i < frame.events.size(); ++i)
			{
				const SEvent& event = frame.events[i];

				// The first result was already fetched above; don't query it twice.
				const uint64_t queryTimestamp{
					i == 0 ? firstFrameTimestamp : device->GetQueryResult(event.queryHandle)};
				ENSURE(queryTimestamp >= firstFrameTimestamp);

				// Convert to absolute CPU-clock time
				const double t{
					frame.syncTimeStart +
					static_cast<double>(queryTimestamp - firstFrameTimestamp) * timestampMultiplier};

				// Record a frame-start for syncing
				if (i == 0)
					m_Storage.RecordFrameStart(t);

				if (event.isEnter)
					m_Storage.Record(CProfiler2::ITEM_ENTER, t, event.id);
				else
					m_Storage.RecordLeave(t);

				// Associate the frame number with the "frame" region
				if (i == 0)
					m_Storage.RecordAttributePrintf("%u", frame.num);
			}

			PopFrontFrame();
		}
	}

	/// Drops the oldest frame and returns its query handles to the free list.
	void PopFrontFrame()
	{
		ENSURE(!m_Frames.empty());
		const SFrame& frame = m_Frames.front();
		for (const SEvent& event : frame.events)
			m_FreeQueries.push_back(event.queryHandle);
		m_Frames.pop_front();
	}

	// Returns a new backend query handle (or a recycled old one).
	uint32_t NewQuery()
	{
		if (m_FreeQueries.empty())
			return g_VideoMode.GetBackendDevice()->AllocateQuery();

		const uint32_t queryHandle{m_FreeQueries.back()};
		m_FreeQueries.pop_back();
		return queryHandle;
	}

	CProfiler2& m_Profiler;
	CProfiler2::ThreadStorage& m_Storage;

	std::vector<uint32_t> m_FreeQueries; // query objects that are allocated but not currently in use
};

/**
 * Creates the implementation only when the "profiler2.gpu.enable" config
 * value is set and the backend device reports timestamp support. Otherwise
 * m_Impl is not assigned here and every method below is a no-op.
 */
CProfiler2GPU::CProfiler2GPU(CProfiler2& profiler) :
	m_Profiler(profiler)
{
	if (g_ConfigDB.Get("profiler2.gpu.enable", false) &&
		g_VideoMode.GetBackendDevice()->GetCapabilities().timestamps)
	{
		m_Impl = std::make_unique<CProfiler2GPUImpl>(m_Profiler);
	}
}

CProfiler2GPU::~CProfiler2GPU() = default;

void CProfiler2GPU::FrameStart(Renderer::Backend::IDeviceCommandContext* deviceCommandContext)
{
	if (m_Impl)
		m_Impl->FrameStart(deviceCommandContext);
}

void CProfiler2GPU::FrameEnd(Renderer::Backend::IDeviceCommandContext* deviceCommandContext)
{
	if (m_Impl)
		m_Impl->FrameEnd(deviceCommandContext);
}

void CProfiler2GPU::RegionEnter(
	Renderer::Backend::IDeviceCommandContext* deviceCommandContext, const char* id)
{
	if (m_Impl)
		m_Impl->RegionEnter(deviceCommandContext, id);
}

void CProfiler2GPU::RegionLeave(
	Renderer::Backend::IDeviceCommandContext* deviceCommandContext, const char* id)
{
	if (m_Impl)
		m_Impl->RegionLeave(deviceCommandContext, id);
}