// 0ad/source/lib/sysdep/os/win/wnuma.cpp
// janwas a3696c0b91 premake: detect processor architecture from script
// lib/sysdep: clean up by moving OS and cpu-arch folders into "os" and
// "arch" folders
//
// This was SVN commit r6162.
// 2008-06-30 17:34:18 +00:00
//
// 394 lines
// 12 KiB
// C++

#include "precompiled.h"
#include "lib/sysdep/numa.h"
#include "lib/bits.h" // round_up, PopulationCount
#include "lib/timer.h"
#include "lib/sysdep/os_cpu.h"
#include "lib/sysdep/acpi.h"
#include "win.h"
#include "wutil.h"
#include "wcpu.h"
#include "winit.h"
#include <Psapi.h>
// NOTE(review): presumably registers wnuma_Init (defined at the end of this
// file) with the winit mechanism so that it runs during early startup -
// confirm against winit.h.
WINIT_REGISTER_EARLY_INIT(wnuma_Init);
//-----------------------------------------------------------------------------
// node topology
//-----------------------------------------------------------------------------
// Determine the number of NUMA nodes.
//
// GetNumaHighestNodeNumber is looked up dynamically because it is not
// exported by all versions of kernel32.
//
// @return number of nodes (highest node number + 1), or 1 if the API is
// unavailable or reports failure.
static size_t NumNodes()
{
	typedef BOOL (WINAPI *PGetNumaHighestNodeNumber)(PULONG highestNode);
	const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
	const PGetNumaHighestNodeNumber pGetNumaHighestNodeNumber = (PGetNumaHighestNodeNumber)GetProcAddress(hKernel32, "GetNumaHighestNodeNumber");

	// NUMA not supported
	if(!pGetNumaHighestNodeNumber)
		return 1;

	// (initialized so that a failed call cannot yield an indeterminate count)
	ULONG highestNode = 0;
	const BOOL ok = pGetNumaHighestNodeNumber(&highestNode);
	debug_assert(ok);
	if(!ok)
		return 1;	// treat as non-NUMA rather than returning garbage

	debug_assert(highestNode < os_cpu_NumProcessors());	// #nodes <= #processors
	return highestNode+1;
}
static void FillNodesProcessorMask(uintptr_t* nodesProcessorMask)
{
typedef BOOL (WINAPI *PGetNumaNodeProcessorMask)(UCHAR node, PULONGLONG affinity);
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
const PGetNumaNodeProcessorMask pGetNumaNodeProcessorMask = (PGetNumaNodeProcessorMask)GetProcAddress(hKernel32, "GetNumaNodeProcessorMask");
if(pGetNumaNodeProcessorMask)
{
DWORD_PTR processAffinity, systemAffinity;
const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
debug_assert(ok);
for(size_t node = 0; node < numa_NumNodes(); node++)
{
ULONGLONG affinity;
const BOOL ok = pGetNumaNodeProcessorMask((UCHAR)node, &affinity);
debug_assert(ok);
const uintptr_t processorMask = wcpu_ProcessorMaskFromAffinity(processAffinity, (DWORD_PTR)affinity);
nodesProcessorMask[node] = processorMask;
}
}
// NUMA not supported - consider node 0 to consist of all system processors
else
nodesProcessorMask[0] = os_cpu_ProcessorMask();
}
// Derive the processor -> node table from the per-node processor masks.
//
// note: going from nodesProcessorMask to processorsNode is the easier
// direction because wcpu provides wcpu_ProcessorMaskFromAffinity for building
// the masks; there is no comparable helper for the reverse conversion.
//
// If a processor's bit is set in several masks, the highest-numbered node
// wins (its assignment is the last one performed).
static void FillProcessorsNode(size_t numNodes, const uintptr_t* nodesProcessorMask, size_t* processorsNode)
{
	const size_t numProcessors = os_cpu_NumProcessors();
	for(size_t processor = 0; processor < numProcessors; processor++)
	{
		for(size_t node = 0; node < numNodes; node++)
		{
			if(!IsBitSet(nodesProcessorMask[node], processor))
				continue;
			processorsNode[processor] = node;
		}
	}
}
//-----------------------------------------------------------------------------
// node topology interface
// Results of node topology detection (filled in by DetectNodeTopology and
// read by the numa_* accessors below).
struct NodeTopology // POD
{
// number of nodes, as returned by NumNodes (1 if NUMA is not supported)
size_t numNodes;
// processorsNode[processor] = node to which that processor belongs
size_t processorsNode[os_cpu_MaxProcessors];
// nodesProcessorMask[node] = mask of the processors belonging to that node
uintptr_t nodesProcessorMask[os_cpu_MaxProcessors];
};
// the single instance; has static storage duration and is therefore
// zero-initialized until DetectNodeTopology runs.
static NodeTopology s_nodeTopology;
static void DetectNodeTopology()
{
s_nodeTopology.numNodes = NumNodes();
FillNodesProcessorMask(s_nodeTopology.nodesProcessorMask);
FillProcessorsNode(s_nodeTopology.numNodes, s_nodeTopology.nodesProcessorMask, s_nodeTopology.processorsNode);
}
size_t numa_NumNodes()
{
return s_nodeTopology.numNodes;
}
// @param processor index; must be less than os_cpu_NumProcessors().
// @return the node recorded for that processor by DetectNodeTopology.
size_t numa_NodeFromProcessor(size_t processor)
{
	debug_assert(processor < os_cpu_NumProcessors());
	const size_t* const processorsNode = s_nodeTopology.processorsNode;
	return processorsNode[processor];
}
uintptr_t numa_ProcessorMaskFromNode(size_t node)
{
debug_assert(node < s_nodeTopology.numNodes);
return s_nodeTopology.nodesProcessorMask[node];
}
//-----------------------------------------------------------------------------
// memory info
//-----------------------------------------------------------------------------
size_t numa_AvailableMemory(size_t node)
{
debug_assert(node < numa_NumNodes());
// note: it is said that GetNumaAvailableMemoryNode sometimes incorrectly
// reports zero bytes. the actual cause may however be unexpected
// RAM configuration, e.g. not all slots filled.
typedef BOOL (WINAPI *PGetNumaAvailableMemoryNode)(UCHAR node, PULONGLONG availableBytes);
static PGetNumaAvailableMemoryNode pGetNumaAvailableMemoryNode;
if(!pGetNumaAvailableMemoryNode)
{
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
pGetNumaAvailableMemoryNode = (PGetNumaAvailableMemoryNode)GetProcAddress(hKernel32, "GetNumaAvailableMemoryNode");
}
if(pGetNumaAvailableMemoryNode)
{
ULONGLONG availableBytes;
const BOOL ok = pGetNumaAvailableMemoryNode((UCHAR)node, &availableBytes);
debug_assert(ok);
const size_t availableMiB = size_t(availableBytes / MiB);
return availableMiB;
}
// NUMA not supported - return available system memory
else
return os_cpu_MemoryAvailable();
}
// Measure the NUMA factor: the ratio of the slowest to the fastest time
// needed by processors of the various nodes to fill a buffer that was
// allocated via numa_AllocateOnNode for node 0.
//
// The result is computed once (under WNUMA_CS) and cached thereafter.
//
// @return 1.0 on single-node systems, otherwise maxTime / minTime.
double numa_Factor()
{
	WinScopedLock lock(WNUMA_CS);

	static double factor;	// 0.0 = not yet determined
	if(factor == 0.0)
	{
		// if non-NUMA, skip the (expensive) measurements below.
		if(numa_NumNodes() == 1)
			factor = 1.0;
		else
		{
			// allocate memory on one node
			// (numa_AllocateOnNode takes the node first, then the size; the
			// arguments were previously swapped, which requested zero bytes
			// on a nonexistent node.)
			const size_t size = 16*MiB;
			shared_ptr<u8> buffer((u8*)numa_AllocateOnNode(0, size), numa_Deleter<u8>());

			// (called for its return value: the mask to restore afterwards)
			const uintptr_t previousProcessorMask = os_cpu_SetThreadAffinityMask(os_cpu_ProcessorMask());

			// measure min/max fill times required by a processor from each node
			double minTime = 1e10, maxTime = 0.0;
			for(size_t node = 0; node < numa_NumNodes(); node++)
			{
				const uintptr_t processorMask = numa_ProcessorMaskFromNode(node);
				os_cpu_SetThreadAffinityMask(processorMask);

				const double startTime = timer_Time();
				memset(buffer.get(), 0, size);
				const double elapsedTime = timer_Time() - startTime;

				minTime = std::min(minTime, elapsedTime);
				maxTime = std::max(maxTime, elapsedTime);
			}

			(void)os_cpu_SetThreadAffinityMask(previousProcessorMask);

			// avoid dividing by zero if the timer could not resolve the
			// fastest run; 1.0 also keeps the result from being cached as
			// "not yet determined".
			factor = (minTime > 0.0)? maxTime / minTime : 1.0;
		}

		debug_assert(factor >= 1.0);
		debug_assert(factor <= 3.0);	// (Microsoft guideline for NUMA systems)
	}

	return factor;
}
// Report whether node interleaving appears to be enabled, judging by the
// absence of an ACPI SRAT. The answer is determined once (under WNUMA_CS)
// and cached.
//
// @return true if ACPI could be initialized and no SRAT was found; false if
// an SRAT exists or ACPI is unavailable (in which case we can't tell).
bool numa_IsMemoryInterleaved()
{
	WinScopedLock lock(WNUMA_CS);

	// tri-state cache: -1 = unknown, 0 = no, 1 = yes
	static int isInterleaved = -1;
	if(isInterleaved != -1)
		return isInterleaved != 0;

	if(!acpi_Init())
		isInterleaved = 0;	// can't tell
	else
	{
		// the BIOS only generates an SRAT (System Resource Affinity Table)
		// if node interleaving is disabled.
		const bool haveSRAT = !(acpi_GetTable("SRAT") == 0);
		isInterleaved = haveSRAT? 0 : 1;
		acpi_Shutdown();
	}

	return isInterleaved != 0;
}
//-----------------------------------------------------------------------------
// allocator
//-----------------------------------------------------------------------------
// set by numa_Allocate once a large-page allocation has taken more than a
// second; the heuristic below then stops recommending large pages.
static bool largePageAllocationTookTooLong = false;

// Decide whether an allocation should be attempted with large pages.
//
// @param disposition LPD_NEVER / LPD_ALWAYS override the heuristic (provided
// the OS supports large pages at all); any other value selects it.
// @param allocationSize requested size [bytes].
static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocationSize)
{
	// can't (OS does not support large pages) or caller forbids it
	if(os_cpu_LargePageSize() == 0 || disposition == LPD_NEVER)
		return false;

	// caller insists
	if(disposition == LPD_ALWAYS)
		return true;

	// default disposition: use a heuristic. all of the following must hold:
	// - no previous attempt took too long (Windows is apparently shoveling
	//   aside lots of memory in that case);
	// - the allocation is not so small that it would "only" use half of
	//   the TLBs for its pages;
	// - plenty of memory is available, otherwise the page frames are going
	//   to be terribly fragmented and even a single allocation would take
	//   SECONDS.
	return !largePageAllocationTookTooLong
		&& allocationSize >= 64/2 * os_cpu_PageSize()
		&& os_cpu_MemoryAvailable() >= 2000;	// 2 GB
}
void* numa_Allocate(size_t size, LargePageDisposition largePageDisposition, size_t* ppageSize)
{
void* mem = 0;
// try allocating with large pages (reduces TLB misses)
if(ShouldUseLargePages(largePageDisposition, size))
{
const size_t largePageSize = os_cpu_LargePageSize();
const size_t paddedSize = round_up(size, largePageSize); // required by MEM_LARGE_PAGES
// note: this call can take SECONDS, which is why several checks are
// undertaken before we even try. these aren't authoritative, so we
// at least prevent future attempts if it takes too long.
const double startTime = timer_Time();
mem = VirtualAlloc(0, paddedSize, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE);
if(ppageSize)
*ppageSize = largePageSize;
const double elapsedTime = timer_Time() - startTime;
debug_printf("TIMER| NUMA large page allocation: %g\n", elapsedTime);
if(elapsedTime > 1.0)
largePageAllocationTookTooLong = true;
}
// try (again) with regular pages
if(!mem)
{
mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
if(ppageSize)
*ppageSize = os_cpu_PageSize();
}
// all attempts failed - we're apparently out of memory.
if(!mem)
throw std::bad_alloc();
return mem;
}
// Check that all pages of an allocation are resident, of the expected page
// size and located on the expected node.
//
// @param mem start of the allocation.
// @param size size of the allocation [bytes].
// @param pageSize page size that was used for the allocation.
// @param node node on which the pages are expected to reside.
// @return false if a page fails one of the checks; true if all pages pass
// or if verification is not possible (QueryWorkingSetEx not found or
// failing, or built with WINVER < 0x600).
//
// NOTE(review): QueryWorkingSetEx is looked up in kernel32.dll; the Windows
// SDK documents it as living in psapi.dll (and as K32QueryWorkingSetEx in
// kernel32.dll on newer systems), so this lookup may never succeed - confirm.
static bool VerifyPages(void* mem, size_t size, size_t pageSize, size_t node)
{
	typedef BOOL (WINAPI *PQueryWorkingSetEx)(HANDLE hProcess, PVOID buffer, DWORD bufferSize);
	static PQueryWorkingSetEx pQueryWorkingSetEx;
	if(!pQueryWorkingSetEx)
	{
		const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
		pQueryWorkingSetEx = (PQueryWorkingSetEx)GetProcAddress(hKernel32, "QueryWorkingSetEx");
		if(!pQueryWorkingSetEx)
			return true;	// can't do anything
	}

#if WINVER >= 0x600
	// retrieve attributes of all pages constituting mem
	const size_t numPages = (size + pageSize-1) / pageSize;
	PSAPI_WORKING_SET_EX_INFORMATION* wsi = new PSAPI_WORKING_SET_EX_INFORMATION[numPages];
	for(size_t i = 0; i < numPages; i++)
		wsi[i].VirtualAddress = (u8*)mem + i*pageSize;

	// (single exit point below so that wsi is always freed; the early
	// returns used previously leaked it.)
	bool verified = true;

	const DWORD wsiSize = (DWORD)(sizeof(PSAPI_WORKING_SET_EX_INFORMATION)*numPages);
	if(pQueryWorkingSetEx(GetCurrentProcess(), wsi, wsiSize))
	{
		const bool expectLargePages = (pageSize == os_cpu_LargePageSize());

		// ensure each is valid and allocated on the correct node
		for(size_t i = 0; i < numPages; i++)
		{
			const PSAPI_WORKING_SET_EX_BLOCK& attributes = wsi[i].VirtualAttributes;
			if(!attributes.Valid)
			{
				verified = false;
				break;
			}
			if((attributes.LargePage != 0) != expectLargePages)
			{
				debug_printf("NUMA: is not a large page\n");
				verified = false;
				break;
			}
			if(attributes.Node != node)
			{
				debug_printf("NUMA: allocated from remote node\n");
				verified = false;
				break;
			}
		}
	}
	// else: the query failed, so the attributes are meaningless - treat
	// this like the "can't do anything" case above.

	delete[] wsi;
	return verified;
#else
	UNUSED2(mem);
	UNUSED2(size);
	UNUSED2(pageSize);
	UNUSED2(node);
	return true;
#endif
}
// Allocate memory and fault in all of its pages immediately, so that they
// are taken from the node on which the calling thread is running.
//
// @param node node on which the memory is expected to end up; must be less
// than numa_NumNodes(). The caller should already be running on it (see
// the note on affinity below).
// @param size requested size [bytes].
// @param largePageDisposition passed on to numa_Allocate.
// @param ppageSize if non-zero, receives the page size reported by
// numa_Allocate.
// @return pointer to zero-filled memory; free via numa_Deallocate.
// Failure is reported by numa_Allocate throwing std::bad_alloc.
void* numa_AllocateOnNode(size_t node, size_t size, LargePageDisposition largePageDisposition, size_t* ppageSize)
{
	debug_assert(node < numa_NumNodes());

	// see if there will be enough memory (non-authoritative, for debug purposes only)
	{
		const size_t sizeMiB = size/MiB;
		const size_t availableMiB = numa_AvailableMemory(node);
		// (size_t does not match %d on 64-bit builds, hence the explicit
		// conversion to a type that has a portable format specifier)
		if(availableMiB < sizeMiB)
			debug_printf("NUMA: warning: node reports insufficient memory (%lu vs %lu MB)\n", (unsigned long)availableMiB, (unsigned long)sizeMiB);
	}

	size_t pageSize;	// (used below even if ppageSize is zero)
	void* const mem = numa_Allocate(size, largePageDisposition, &pageSize);
	if(ppageSize)
		*ppageSize = pageSize;

	// we can't use VirtualAllocExNuma - it's only available in Vista and Server 2008.
	// workaround: fault in all pages now to ensure they are allocated from the
	// current node, then verify page attributes.
	// (note: VirtualAlloc's MEM_COMMIT only maps virtual pages and does not
	// actually allocate page frames. Windows XP uses a first-touch heuristic -
	// the page will be taken from the node whose processor caused the fault.
	// Windows Vista allocates on the "preferred" node, so affinity should be
	// set such that this thread is running on <node>.)
	memset(mem, 0, size);

	// (result deliberately ignored: VerifyPages logs any mismatch itself)
	(void)VerifyPages(mem, size, pageSize, node);

	return mem;
}
void numa_Deallocate(void* mem)
{
VirtualFree(mem, 0, MEM_RELEASE);
}
//-----------------------------------------------------------------------------
// Init function named in WINIT_REGISTER_EARLY_INIT at the top of this file:
// detects the node topology so that the numa_* accessors return valid data.
// @return always INFO::OK.
static LibError wnuma_Init()
{
	DetectNodeTopology();

	const LibError status = INFO::OK;
	return status;
}