/* Copyright (c) 2010 Wildfire Games
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "precompiled.h"
#include "lib/sysdep/numa.h"

#include "lib/bits.h"	// round_up, PopulationCount
#include "lib/timer.h"
#include "lib/sysdep/os_cpu.h"
#include "lib/sysdep/acpi.h"
#include "lib/sysdep/os/win/win.h"
#include "lib/sysdep/os/win/wutil.h"
#include "lib/sysdep/os/win/wcpu.h"
#include "lib/sysdep/os/win/winit.h"
#include <Psapi.h>	// PSAPI_WORKING_SET_EX_INFORMATION

WINIT_REGISTER_EARLY_INIT(wnuma_Init);


//-----------------------------------------------------------------------------
// node topology
//-----------------------------------------------------------------------------

// @return the number of NUMA nodes (highest node number + 1), or 1 if
// kernel32 does not export GetNumaHighestNodeNumber.
static size_t NumNodes()
{
	typedef BOOL (WINAPI *PGetNumaHighestNodeNumber)(PULONG highestNode);
	const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
	const PGetNumaHighestNodeNumber pGetNumaHighestNodeNumber = (PGetNumaHighestNodeNumber)GetProcAddress(hKernel32, "GetNumaHighestNodeNumber");

	// NUMA not supported
	if(!pGetNumaHighestNodeNumber)
		return 1;

	// zero-initialized so that a failed call yields a single node
	// instead of an indeterminate value.
	ULONG highestNode = 0;
	const BOOL ok = pGetNumaHighestNodeNumber(&highestNode);
	debug_assert(ok);
	debug_assert(highestNode < os_cpu_NumProcessors());	// #nodes <= #processors
	return highestNode+1;
}


// Stores the processor mask of each node in nodesProcessorMask[node].
// Iterates over numa_NumNodes() entries, so the node count must already
// have been recorded in s_nodeTopology (DetectNodeTopology does this
// before calling us).
static void FillNodesProcessorMask(uintptr_t* nodesProcessorMask)
{
	typedef BOOL (WINAPI *PGetNumaNodeProcessorMask)(UCHAR node, PULONGLONG affinity);
	const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
	const PGetNumaNodeProcessorMask pGetNumaNodeProcessorMask = (PGetNumaNodeProcessorMask)GetProcAddress(hKernel32, "GetNumaNodeProcessorMask");

	// NUMA not supported - consider node 0 to consist of all system processors
	if(!pGetNumaNodeProcessorMask)
	{
		nodesProcessorMask[0] = os_cpu_ProcessorMask();
		return;
	}

	// (outputs are zero-initialized so that they have a defined value
	// even if the API call fails)
	DWORD_PTR processAffinity = 0, systemAffinity = 0;
	{
		const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
		debug_assert(ok);
	}

	const size_t numNodes = numa_NumNodes();
	for(size_t node = 0; node < numNodes; node++)
	{
		ULONGLONG affinity = 0;
		{
			const BOOL ok = pGetNumaNodeProcessorMask((UCHAR)node, &affinity);
			debug_assert(ok);
		}
		const uintptr_t processorMask = wcpu_ProcessorMaskFromAffinity(processAffinity, (DWORD_PTR)affinity);
		nodesProcessorMask[node] = processorMask;
	}
}


// For every processor whose bit is set in a node's mask, stores that node
// in processorsNode[processor].
//
// note: it is easier to implement this in terms of nodesProcessorMask
// rather than the other way around because wcpu provides the
// wcpu_ProcessorMaskFromAffinity helper. there is no similar function to
// convert processor to processorNumber.
static void FillProcessorsNode(size_t numNodes, const uintptr_t* nodesProcessorMask, size_t* processorsNode)
{
	const size_t numProcessors = os_cpu_NumProcessors();
	for(size_t node = 0; node < numNodes; node++)
	{
		const uintptr_t processorMask = nodesProcessorMask[node];
		for(size_t processor = 0; processor < numProcessors; processor++)
		{
			if(IsBitSet(processorMask, processor))
				processorsNode[processor] = node;
		}
	}
}


//-----------------------------------------------------------------------------
// node topology interface

struct NodeTopology	// POD
{
	// number of nodes, as returned by NumNodes()
	size_t numNodes;
	// node to which each processor belongs (indexed by processor)
	size_t processorsNode[os_cpu_MaxProcessors];
	// mask of the processors belonging to each node (indexed by node)
	uintptr_t nodesProcessorMask[os_cpu_MaxProcessors];
};
static NodeTopology s_nodeTopology;

// Fills s_nodeTopology. numNodes must be set first because
// FillNodesProcessorMask reads it via numa_NumNodes().
static void DetectNodeTopology()
{
	s_nodeTopology.numNodes = NumNodes();
	FillNodesProcessorMask(s_nodeTopology.nodesProcessorMask);
	FillProcessorsNode(s_nodeTopology.numNodes, s_nodeTopology.nodesProcessorMask, s_nodeTopology.processorsNode);
}

// @return the node count recorded by DetectNodeTopology.
size_t numa_NumNodes()
{
	return s_nodeTopology.numNodes;
}

// @param processor index; must be less than os_cpu_NumProcessors().
// @return the node recorded for that processor.
size_t numa_NodeFromProcessor(size_t processor)
{
	debug_assert(processor < os_cpu_NumProcessors());
	return s_nodeTopology.processorsNode[processor];
}

// @param node index; must be less than the number of nodes.
// @return the processor mask recorded for that node.
uintptr_t numa_ProcessorMaskFromNode(size_t node)
{
	debug_assert(node < s_nodeTopology.numNodes);
	return s_nodeTopology.nodesProcessorMask[node];
}


//-----------------------------------------------------------------------------
// memory info
//-----------------------------------------------------------------------------

// @param node index; must be less than numa_NumNodes().
// @return the memory available on the node in MiB, or the result of
// os_cpu_MemoryAvailable() if GetNumaAvailableMemoryNode is not exported
// (presumably also MiB - confirm against os_cpu.h).
size_t numa_AvailableMemory(size_t node)
{
	debug_assert(node < numa_NumNodes());

	// note: it is said that GetNumaAvailableMemoryNode sometimes incorrectly
	// reports zero bytes. the actual cause may however be unexpected
	// RAM configuration, e.g. not all slots filled.
	typedef BOOL (WINAPI *PGetNumaAvailableMemoryNode)(UCHAR node, PULONGLONG availableBytes);
	static PGetNumaAvailableMemoryNode pGetNumaAvailableMemoryNode;
	if(!pGetNumaAvailableMemoryNode)
	{
		const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
		pGetNumaAvailableMemoryNode = (PGetNumaAvailableMemoryNode)GetProcAddress(hKernel32, "GetNumaAvailableMemoryNode");
	}

	// NUMA not supported - return available system memory
	if(!pGetNumaAvailableMemoryNode)
		return os_cpu_MemoryAvailable();

	// (zero-initialized so that a failed call reports no memory instead
	// of an indeterminate value)
	ULONGLONG availableBytes = 0;
	const BOOL ok = pGetNumaAvailableMemoryNode((UCHAR)node, &availableBytes);
	debug_assert(ok);
	const size_t availableMiB = size_t(availableBytes / MiB);
	return availableMiB;
}


// @return the ratio of the slowest to the fastest time taken to fill a
// buffer when running on the processors of each node in turn
// (1.0 if there is only one node). the result is computed once and cached.
double numa_Factor()
{
	WinScopedLock lock(WNUMA_CS);
	static double factor;
	if(factor == 0.0)
	{
		// if non-NUMA, skip the (expensive) measurements below.
		if(numa_NumNodes() == 1)
			factor = 1.0;
		else
		{
			// allocate memory on one node
			// (note the parameter order: node first, then size)
			const size_t size = 16*MiB;
			const size_t node0 = 0;
			shared_ptr<u8> buffer((u8*)numa_AllocateOnNode(node0, size, LPD_DEFAULT, 0), numa_Deleter<u8>());

			const uintptr_t previousProcessorMask = os_cpu_SetThreadAffinityMask(os_cpu_ProcessorMask());

			// measure min/max fill times required by a processor from each node
			double minTime = 1e10, maxTime = 0.0;
			for(size_t node = 0; node < numa_NumNodes(); node++)
			{
				const uintptr_t processorMask = numa_ProcessorMaskFromNode(node);
				os_cpu_SetThreadAffinityMask(processorMask);

				const double startTime = timer_Time();
				memset(buffer.get(), 0, size);
				const double elapsedTime = timer_Time() - startTime;

				minTime = std::min(minTime, elapsedTime);
				maxTime = std::max(maxTime, elapsedTime);
			}

			(void)os_cpu_SetThreadAffinityMask(previousProcessorMask);

			factor = maxTime / minTime;
		}

		debug_assert(factor >= 1.0);
		debug_assert(factor <= 3.0);	// (Microsoft guideline for NUMA systems)
	}

	return factor;
}


// @return true if ACPI could be initialized and no SRAT table was found;
// false otherwise (including when ACPI is unavailable and we can't tell).
// the result is computed once and cached.
bool numa_IsMemoryInterleaved()
{
	WinScopedLock lock(WNUMA_CS);
	static int isInterleaved = -1;	// -1 means "not yet determined"
	if(isInterleaved == -1)
	{
		if(acpi_Init())
		{
			// the BIOS only generates an SRAT (System Resource Affinity Table)
			// if node interleaving is disabled.
			isInterleaved = acpi_GetTable("SRAT") == 0;
			acpi_Shutdown();
		}
		else
			isInterleaved = 0;	// can't tell
	}

	return isInterleaved != 0;
}


//-----------------------------------------------------------------------------
// allocator
//-----------------------------------------------------------------------------

// set by numa_Allocate once a large-page allocation took more than one second.
static bool largePageAllocationTookTooLong = false;

// Decides whether numa_Allocate should attempt a large-page allocation.
// @param disposition LPD_NEVER / LPD_ALWAYS override the heuristic
//   (unless the OS reports no large page support).
// @param allocationSize requested size [bytes].
static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocationSize)
{
	// can't, OS does not support large pages
	if(os_cpu_LargePageSize() == 0)
		return false;

	// overrides
	if(disposition == LPD_NEVER)
		return false;
	if(disposition == LPD_ALWAYS)
		return true;

	// default disposition: use a heuristic
	{
		// allocation is rather small and would "only" use half of the
		// TLBs for its pages.
		if(allocationSize < 64/2 * os_cpu_PageSize())
			return false;

		// pre-Vista Windows OSes attempt to cope with page fragmentation by
		// trimming the working set of all processes, thus swapping them out,
		// and waiting for contiguous regions to appear. this is terribly
		// slow (multiple seconds), hence the following heuristics:
		if(wutil_WindowsVersion() < WUTIL_VERSION_VISTA)
		{
			// a previous attempt already took too long.
			if(largePageAllocationTookTooLong)
				return false;

			// if there's not plenty of free memory, then memory is surely
			// already fragmented.
			if(os_cpu_MemoryAvailable() < 2000)	// 2 GB
				return false;
		}
	}

	return true;
}


// Allocates committed, read/write memory via VirtualAlloc, trying large
// pages first if ShouldUseLargePages agrees, then regular pages.
// @param size requested size [bytes].
// @param largePageDisposition see ShouldUseLargePages.
// @param ppageSize if non-null, receives the page size that was used.
// @return pointer to the memory (never null).
// @throw std::bad_alloc if all attempts fail.
void* numa_Allocate(size_t size, LargePageDisposition largePageDisposition, size_t* ppageSize)
{
	void* mem = 0;

	// try allocating with large pages (reduces TLB misses)
	if(ShouldUseLargePages(largePageDisposition, size))
	{
		const size_t largePageSize = os_cpu_LargePageSize();
		const size_t paddedSize = round_up(size, largePageSize);	// required by MEM_LARGE_PAGES
		// note: this call can take SECONDS, which is why several checks are
		// undertaken before we even try. these aren't authoritative, so we
		// at least prevent future attempts if it takes too long.
		const double startTime = timer_Time();
		mem = VirtualAlloc(0, paddedSize, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE);
		// (overwritten below if the allocation failed and we fall back
		// to regular pages)
		if(ppageSize)
			*ppageSize = largePageSize;
		const double elapsedTime = timer_Time() - startTime;
		debug_printf(L"TIMER| NUMA large page allocation: %g\n", elapsedTime);
		if(elapsedTime > 1.0)
			largePageAllocationTookTooLong = true;
	}

	// try (again) with regular pages
	if(!mem)
	{
		mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
		if(ppageSize)
			*ppageSize = os_cpu_PageSize();
	}

	// all attempts failed - we're apparently out of memory.
	if(!mem)
		throw std::bad_alloc();

	return mem;
}


// Checks via QueryWorkingSetEx that every page of [mem, mem+size) is valid,
// has the expected (large or regular) page type and resides on the given node.
// @param pageSize page size that was used for the allocation (nonzero).
// @return false if a page fails a check or the query itself fails;
// true if all pages pass, or if verification isn't possible
// (QueryWorkingSetEx unavailable or WINVER < 0x600).
static bool VerifyPages(void* mem, size_t size, size_t pageSize, size_t node)
{
	typedef BOOL (WINAPI *PQueryWorkingSetEx)(HANDLE hProcess, PVOID buffer, DWORD bufferSize);
	static PQueryWorkingSetEx pQueryWorkingSetEx;
	if(!pQueryWorkingSetEx)
	{
		const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
		pQueryWorkingSetEx = (PQueryWorkingSetEx)GetProcAddress(hKernel32, "QueryWorkingSetEx");
		if(!pQueryWorkingSetEx)
			return true;	// can't do anything
	}

#if WINVER >= 0x600
	size_t largePageSize = os_cpu_LargePageSize();
	debug_assert(largePageSize != 0);	// this value is needed for later

	// retrieve attributes of all pages constituting mem
	const size_t numPages = (size + pageSize-1) / pageSize;
	PSAPI_WORKING_SET_EX_INFORMATION* wsi = new PSAPI_WORKING_SET_EX_INFORMATION[numPages];
	for(size_t i = 0; i < numPages; i++)
		wsi[i].VirtualAddress = (u8*)mem + i*pageSize;
	// if the query fails, the attributes are indeterminate, so don't
	// inspect them and report failure.
	bool pagesOk = pQueryWorkingSetEx(GetCurrentProcess(), wsi, DWORD(sizeof(PSAPI_WORKING_SET_EX_INFORMATION)*numPages)) != 0;

	// ensure each is valid and allocated on the correct node
	// (the loop stops at the first failure; wsi is freed on every path)
	for(size_t i = 0; pagesOk && i < numPages; i++)
	{
		const PSAPI_WORKING_SET_EX_BLOCK& attributes = wsi[i].VirtualAttributes;
		if(!attributes.Valid)
			pagesOk = false;
		else if((attributes.LargePage != 0) != (pageSize == largePageSize))
		{
			debug_printf(L"NUMA: is not a large page\n");
			pagesOk = false;
		}
		else if(attributes.Node != node)
		{
			debug_printf(L"NUMA: allocated from remote node\n");
			pagesOk = false;
		}
	}

	delete[] wsi;
	return pagesOk;
#else
	UNUSED2(mem);
	UNUSED2(size);
	UNUSED2(pageSize);
	UNUSED2(node);
	return true;
#endif
}


// Allocates memory via numa_Allocate, touches all of it and then verifies
// the page attributes against the given node.
// @param node index; must be less than numa_NumNodes().
// @param size requested size [bytes].
// @param largePageDisposition passed on to numa_Allocate.
// @param ppageSize if non-null, receives the page size that was used.
// @return pointer to the zero-filled memory.
// @throw std::bad_alloc (from numa_Allocate) if allocation fails.
void* numa_AllocateOnNode(size_t node, size_t size, LargePageDisposition largePageDisposition, size_t* ppageSize)
{
	debug_assert(node < numa_NumNodes());

	// see if there will be enough memory (non-authoritative, for debug purposes only)
	{
		const size_t sizeMiB = size/MiB;
		const size_t availableMiB = numa_AvailableMemory(node);
		// (size_t must not be passed for %d; convert explicitly)
		if(availableMiB < sizeMiB)
			debug_printf(L"NUMA: warning: node reports insufficient memory (%lu vs %lu MB)\n", (unsigned long)availableMiB, (unsigned long)sizeMiB);
	}

	size_t pageSize;	// (used below even if ppageSize is zero)
	void* const mem = numa_Allocate(size, largePageDisposition, &pageSize);
	if(ppageSize)
		*ppageSize = pageSize;

	// we can't use VirtualAllocExNuma - it's only available in Vista and Server 2008.
	// workaround: fault in all pages now to ensure they are allocated from the
	// current node, then verify page attributes.
	// (note: VirtualAlloc's MEM_COMMIT only maps virtual pages and does not
	// actually allocate page frames. Windows XP uses a first-touch heuristic -
	// the page will be taken from the node whose processor caused the fault.
	// Windows Vista allocates on the "preferred" node, so affinity should be
	// set such that this thread is running on the desired node.)
	memset(mem, 0, size);

	// (the result is informational only; failures are reported via debug_printf)
	(void)VerifyPages(mem, size, pageSize, node);

	return mem;
}


// Releases memory via VirtualFree(MEM_RELEASE).
void numa_Deallocate(void* mem)
{
	VirtualFree(mem, 0, MEM_RELEASE);
}


//-----------------------------------------------------------------------------

// Early-init hook (registered above): detects the node topology.
static LibError wnuma_Init()
{
	DetectNodeTopology();
	return INFO::OK;
}