mirror of
https://gitea.wildfiregames.com/0ad/0ad
synced 2026-06-18 22:33:56 -07:00
also rename wchar -> utf8 to avoid conflict with <wchar.h> (requires rebuild of workspace) (unfortunately copying history fails to "502 bad gateway"; had to delete old + add new independently) This was SVN commit r7340.
428 lines
13 KiB
C++
428 lines
13 KiB
C++
/* Copyright (c) 2010 Wildfire Games
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
|
|
|
|
#include "precompiled.h"
|
|
#include "lib/sysdep/numa.h"
|
|
|
|
#include "lib/bits.h" // round_up, PopulationCount
|
|
#include "lib/timer.h"
|
|
#include "lib/sysdep/os_cpu.h"
|
|
#include "lib/sysdep/acpi.h"
|
|
#include "lib/sysdep/os/win/win.h"
|
|
#include "lib/sysdep/os/win/wutil.h"
|
|
#include "lib/sysdep/os/win/wcpu.h"
|
|
#include "lib/sysdep/os/win/winit.h"
|
|
#include <Psapi.h>
|
|
|
|
|
|
// NOTE(review): presumably arranges for wnuma_Init (defined at the end of
// this file) to be called by the winit mechanism during early startup -
// confirm against winit.h.
WINIT_REGISTER_EARLY_INIT(wnuma_Init);
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// node topology
|
|
//-----------------------------------------------------------------------------
|
|
|
|
static size_t NumNodes()
|
|
{
|
|
typedef BOOL (WINAPI *PGetNumaHighestNodeNumber)(PULONG highestNode);
|
|
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
|
|
const PGetNumaHighestNodeNumber pGetNumaHighestNodeNumber = (PGetNumaHighestNodeNumber)GetProcAddress(hKernel32, "GetNumaHighestNodeNumber");
|
|
if(pGetNumaHighestNodeNumber)
|
|
{
|
|
ULONG highestNode;
|
|
const BOOL ok = pGetNumaHighestNodeNumber(&highestNode);
|
|
debug_assert(ok);
|
|
debug_assert(highestNode < os_cpu_NumProcessors()); // #nodes <= #processors
|
|
return highestNode+1;
|
|
}
|
|
// NUMA not supported
|
|
else
|
|
return 1;
|
|
}
|
|
|
|
|
|
static void FillNodesProcessorMask(uintptr_t* nodesProcessorMask)
|
|
{
|
|
typedef BOOL (WINAPI *PGetNumaNodeProcessorMask)(UCHAR node, PULONGLONG affinity);
|
|
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
|
|
const PGetNumaNodeProcessorMask pGetNumaNodeProcessorMask = (PGetNumaNodeProcessorMask)GetProcAddress(hKernel32, "GetNumaNodeProcessorMask");
|
|
if(pGetNumaNodeProcessorMask)
|
|
{
|
|
DWORD_PTR processAffinity, systemAffinity;
|
|
{
|
|
const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
|
|
debug_assert(ok);
|
|
}
|
|
|
|
for(size_t node = 0; node < numa_NumNodes(); node++)
|
|
{
|
|
ULONGLONG affinity;
|
|
{
|
|
const BOOL ok = pGetNumaNodeProcessorMask((UCHAR)node, &affinity);
|
|
debug_assert(ok);
|
|
}
|
|
const uintptr_t processorMask = wcpu_ProcessorMaskFromAffinity(processAffinity, (DWORD_PTR)affinity);
|
|
nodesProcessorMask[node] = processorMask;
|
|
}
|
|
}
|
|
// NUMA not supported - consider node 0 to consist of all system processors
|
|
else
|
|
nodesProcessorMask[0] = os_cpu_ProcessorMask();
|
|
}
|
|
|
|
|
|
// note: it is easier to implement this in terms of nodesProcessorMask
|
|
// rather than the other way around because wcpu provides the
|
|
// wcpu_ProcessorMaskFromAffinity helper. there is no similar function to
|
|
// convert processor to processorNumber.
|
|
static void FillProcessorsNode(size_t numNodes, const uintptr_t* nodesProcessorMask, size_t* processorsNode)
|
|
{
|
|
for(size_t node = 0; node < numNodes; node++)
|
|
{
|
|
const uintptr_t processorMask = nodesProcessorMask[node];
|
|
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
|
{
|
|
if(IsBitSet(processorMask, processor))
|
|
processorsNode[processor] = node;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// node topology interface
|
|
|
|
struct NodeTopology // POD
|
|
{
|
|
size_t numNodes;
|
|
size_t processorsNode[os_cpu_MaxProcessors];
|
|
uintptr_t nodesProcessorMask[os_cpu_MaxProcessors];
|
|
};
|
|
static NodeTopology s_nodeTopology;
|
|
|
|
static void DetectNodeTopology()
|
|
{
|
|
s_nodeTopology.numNodes = NumNodes();
|
|
FillNodesProcessorMask(s_nodeTopology.nodesProcessorMask);
|
|
FillProcessorsNode(s_nodeTopology.numNodes, s_nodeTopology.nodesProcessorMask, s_nodeTopology.processorsNode);
|
|
}
|
|
|
|
size_t numa_NumNodes()
|
|
{
|
|
return s_nodeTopology.numNodes;
|
|
}
|
|
|
|
size_t numa_NodeFromProcessor(size_t processor)
|
|
{
|
|
debug_assert(processor < os_cpu_NumProcessors());
|
|
return s_nodeTopology.processorsNode[processor];
|
|
}
|
|
|
|
uintptr_t numa_ProcessorMaskFromNode(size_t node)
|
|
{
|
|
debug_assert(node < s_nodeTopology.numNodes);
|
|
return s_nodeTopology.nodesProcessorMask[node];
|
|
}
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// memory info
|
|
//-----------------------------------------------------------------------------
|
|
|
|
size_t numa_AvailableMemory(size_t node)
|
|
{
|
|
debug_assert(node < numa_NumNodes());
|
|
|
|
// note: it is said that GetNumaAvailableMemoryNode sometimes incorrectly
|
|
// reports zero bytes. the actual cause may however be unexpected
|
|
// RAM configuration, e.g. not all slots filled.
|
|
typedef BOOL (WINAPI *PGetNumaAvailableMemoryNode)(UCHAR node, PULONGLONG availableBytes);
|
|
static PGetNumaAvailableMemoryNode pGetNumaAvailableMemoryNode;
|
|
if(!pGetNumaAvailableMemoryNode)
|
|
{
|
|
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
|
|
pGetNumaAvailableMemoryNode = (PGetNumaAvailableMemoryNode)GetProcAddress(hKernel32, "GetNumaAvailableMemoryNode");
|
|
}
|
|
|
|
if(pGetNumaAvailableMemoryNode)
|
|
{
|
|
ULONGLONG availableBytes;
|
|
const BOOL ok = pGetNumaAvailableMemoryNode((UCHAR)node, &availableBytes);
|
|
debug_assert(ok);
|
|
const size_t availableMiB = size_t(availableBytes / MiB);
|
|
return availableMiB;
|
|
}
|
|
// NUMA not supported - return available system memory
|
|
else
|
|
return os_cpu_MemoryAvailable();
|
|
}
|
|
|
|
|
|
double numa_Factor()
|
|
{
|
|
WinScopedLock lock(WNUMA_CS);
|
|
static double factor;
|
|
if(factor == 0.0)
|
|
{
|
|
// if non-NUMA, skip the (expensive) measurements below.
|
|
if(numa_NumNodes() == 1)
|
|
factor = 1.0;
|
|
else
|
|
{
|
|
// allocate memory on one node
|
|
const size_t size = 16*MiB;
|
|
shared_ptr<u8> buffer((u8*)numa_AllocateOnNode(size, 0), numa_Deleter<u8>());
|
|
|
|
const uintptr_t previousProcessorMask = os_cpu_SetThreadAffinityMask(os_cpu_ProcessorMask());
|
|
|
|
// measure min/max fill times required by a processor from each node
|
|
double minTime = 1e10, maxTime = 0.0;
|
|
for(size_t node = 0; node < numa_NumNodes(); node++)
|
|
{
|
|
const uintptr_t processorMask = numa_ProcessorMaskFromNode(node);
|
|
os_cpu_SetThreadAffinityMask(processorMask);
|
|
|
|
const double startTime = timer_Time();
|
|
memset(buffer.get(), 0, size);
|
|
const double elapsedTime = timer_Time() - startTime;
|
|
|
|
minTime = std::min(minTime, elapsedTime);
|
|
maxTime = std::max(maxTime, elapsedTime);
|
|
}
|
|
|
|
(void)os_cpu_SetThreadAffinityMask(previousProcessorMask);
|
|
|
|
factor = maxTime / minTime;
|
|
}
|
|
|
|
debug_assert(factor >= 1.0);
|
|
debug_assert(factor <= 3.0); // (Microsoft guideline for NUMA systems)
|
|
}
|
|
|
|
return factor;
|
|
}
|
|
|
|
|
|
bool numa_IsMemoryInterleaved()
|
|
{
|
|
WinScopedLock lock(WNUMA_CS);
|
|
static int isInterleaved = -1;
|
|
if(isInterleaved == -1)
|
|
{
|
|
if(acpi_Init())
|
|
{
|
|
// the BIOS only generates an SRAT (System Resource Affinity Table)
|
|
// if node interleaving is disabled.
|
|
isInterleaved = acpi_GetTable("SRAT") == 0;
|
|
acpi_Shutdown();
|
|
}
|
|
else
|
|
isInterleaved = 0; // can't tell
|
|
}
|
|
|
|
return isInterleaved != 0;
|
|
}
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// allocator
|
|
//-----------------------------------------------------------------------------
|
|
|
|
static bool largePageAllocationTookTooLong = false;
|
|
|
|
static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocationSize)
|
|
{
|
|
// can't, OS does not support large pages
|
|
if(os_cpu_LargePageSize() == 0)
|
|
return false;
|
|
|
|
// overrides
|
|
if(disposition == LPD_NEVER)
|
|
return false;
|
|
if(disposition == LPD_ALWAYS)
|
|
return true;
|
|
|
|
// default disposition: use a heuristic
|
|
{
|
|
// allocation is rather small and would "only" use half of the
|
|
// TLBs for its pages.
|
|
if(allocationSize < 64/2 * os_cpu_PageSize())
|
|
return false;
|
|
|
|
// pre-Vista Windows OSes attempt to cope with page fragmentation by
|
|
// trimming the working set of all processes, thus swapping them out,
|
|
// and waiting for contiguous regions to appear. this is terribly
|
|
// slow (multiple seconds), hence the following heuristics:
|
|
if(wutil_WindowsVersion() < WUTIL_VERSION_VISTA)
|
|
{
|
|
// a previous attempt already took too long.
|
|
if(largePageAllocationTookTooLong)
|
|
return false;
|
|
|
|
// if there's not plenty of free memory, then memory is surely
|
|
// already fragmented.
|
|
if(os_cpu_MemoryAvailable() < 2000) // 2 GB
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
void* numa_Allocate(size_t size, LargePageDisposition largePageDisposition, size_t* ppageSize)
|
|
{
|
|
void* mem = 0;
|
|
|
|
// try allocating with large pages (reduces TLB misses)
|
|
if(ShouldUseLargePages(largePageDisposition, size))
|
|
{
|
|
const size_t largePageSize = os_cpu_LargePageSize();
|
|
const size_t paddedSize = round_up(size, largePageSize); // required by MEM_LARGE_PAGES
|
|
// note: this call can take SECONDS, which is why several checks are
|
|
// undertaken before we even try. these aren't authoritative, so we
|
|
// at least prevent future attempts if it takes too long.
|
|
const double startTime = timer_Time();
|
|
mem = VirtualAlloc(0, paddedSize, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE);
|
|
if(ppageSize)
|
|
*ppageSize = largePageSize;
|
|
const double elapsedTime = timer_Time() - startTime;
|
|
debug_printf(L"TIMER| NUMA large page allocation: %g\n", elapsedTime);
|
|
if(elapsedTime > 1.0)
|
|
largePageAllocationTookTooLong = true;
|
|
}
|
|
|
|
// try (again) with regular pages
|
|
if(!mem)
|
|
{
|
|
mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
|
|
if(ppageSize)
|
|
*ppageSize = os_cpu_PageSize();
|
|
}
|
|
|
|
// all attempts failed - we're apparently out of memory.
|
|
if(!mem)
|
|
throw std::bad_alloc();
|
|
|
|
return mem;
|
|
}
|
|
|
|
|
|
static bool VerifyPages(void* mem, size_t size, size_t pageSize, size_t node)
|
|
{
|
|
typedef BOOL (WINAPI *PQueryWorkingSetEx)(HANDLE hProcess, PVOID buffer, DWORD bufferSize);
|
|
static PQueryWorkingSetEx pQueryWorkingSetEx;
|
|
if(!pQueryWorkingSetEx)
|
|
{
|
|
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
|
|
pQueryWorkingSetEx = (PQueryWorkingSetEx)GetProcAddress(hKernel32, "QueryWorkingSetEx");
|
|
if(!pQueryWorkingSetEx)
|
|
return true; // can't do anything
|
|
}
|
|
|
|
#if WINVER >= 0x600
|
|
size_t largePageSize = os_cpu_LargePageSize();
|
|
debug_assert(largePageSize != 0); // this value is needed for later
|
|
|
|
// retrieve attributes of all pages constituting mem
|
|
const size_t numPages = (size + pageSize-1) / pageSize;
|
|
PSAPI_WORKING_SET_EX_INFORMATION* wsi = new PSAPI_WORKING_SET_EX_INFORMATION[numPages];
|
|
for(size_t i = 0; i < numPages; i++)
|
|
wsi[i].VirtualAddress = (u8*)mem + i*pageSize;
|
|
pQueryWorkingSetEx(GetCurrentProcess(), wsi, DWORD(sizeof(PSAPI_WORKING_SET_EX_INFORMATION)*numPages));
|
|
|
|
// ensure each is valid and allocated on the correct node
|
|
for(size_t i = 0; i < numPages; i++)
|
|
{
|
|
const PSAPI_WORKING_SET_EX_BLOCK& attributes = wsi[i].VirtualAttributes;
|
|
if(!attributes.Valid)
|
|
return false;
|
|
if((attributes.LargePage != 0) != (pageSize == largePageSize))
|
|
{
|
|
debug_printf(L"NUMA: is not a large page\n");
|
|
return false;
|
|
}
|
|
if(attributes.Node != node)
|
|
{
|
|
debug_printf(L"NUMA: allocated from remote node\n");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
delete[] wsi;
|
|
#else
|
|
UNUSED2(mem);
|
|
UNUSED2(size);
|
|
UNUSED2(pageSize);
|
|
UNUSED2(node);
|
|
#endif
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
void* numa_AllocateOnNode(size_t node, size_t size, LargePageDisposition largePageDisposition, size_t* ppageSize)
|
|
{
|
|
debug_assert(node < numa_NumNodes());
|
|
|
|
// see if there will be enough memory (non-authoritative, for debug purposes only)
|
|
{
|
|
const size_t sizeMiB = size/MiB;
|
|
const size_t availableMiB = numa_AvailableMemory(node);
|
|
if(availableMiB < sizeMiB)
|
|
debug_printf(L"NUMA: warning: node reports insufficient memory (%d vs %d MB)\n", availableMiB, sizeMiB);
|
|
}
|
|
|
|
size_t pageSize; // (used below even if ppageSize is zero)
|
|
void* const mem = numa_Allocate(size, largePageDisposition, &pageSize);
|
|
if(ppageSize)
|
|
*ppageSize = pageSize;
|
|
|
|
// we can't use VirtualAllocExNuma - it's only available in Vista and Server 2008.
|
|
// workaround: fault in all pages now to ensure they are allocated from the
|
|
// current node, then verify page attributes.
|
|
// (note: VirtualAlloc's MEM_COMMIT only maps virtual pages and does not
|
|
// actually allocate page frames. Windows XP uses a first-touch heuristic -
|
|
// the page will be taken from the node whose processor caused the fault.
|
|
// Windows Vista allocates on the "preferred" node, so affinity should be
|
|
// set such that this thread is running on <node>.)
|
|
memset(mem, 0, size);
|
|
|
|
VerifyPages(mem, size, pageSize, node);
|
|
|
|
return mem;
|
|
}
|
|
|
|
|
|
void numa_Deallocate(void* mem)
|
|
{
|
|
VirtualFree(mem, 0, MEM_RELEASE);
|
|
}
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
// Init function registered via WINIT_REGISTER_EARLY_INIT at the top of
// this file: detects the node topology once so that the numa_* accessors
// can return the stored values.
//
// @return always INFO::OK.
static LibError wnuma_Init()
{
	DetectNodeTopology();

	const LibError status = INFO::OK;
	return status;
}
|