diff --git a/source/lib/bits.h b/source/lib/bits.h
index a9b2e6d4af..1ec4248b2f 100644
--- a/source/lib/bits.h
+++ b/source/lib/bits.h
@@ -40,7 +40,7 @@ template<typename T>
 T Bit(size_t n)
 {
 	const T one = T(1);
-	return (one << n);
+	return (T)(one << n);
 }
 
 /**
@@ -71,16 +71,14 @@ bool IsBitSet(T value, size_t index)
 template<typename T>
 T bit_mask(size_t numBits)
 {
-	if(numBits == 0)	// prevent shift count == bitsInT, which would be undefined.
-		return 0;
-	// notes:
-	// - the perhaps more intuitive (1 << numBits)-1 cannot
-	//   handle numBits == bitsInT, but this implementation does.
-	// - though bulky, the below statements avoid sign-conversion warnings.
 	const T bitsInT = sizeof(T)*CHAR_BIT;
-	T mask(0);
-	mask = T(~mask);
-	mask >>= T(bitsInT-numBits);
+	const T allBits = (T)~T(0);
+	// (shifts of at least bitsInT are undefined)
+	if(numBits >= bitsInT)
+		return allBits;
+	// (note: the previous allBits >> (bitsInT-numBits) is not portable for
+	// signed T: right-shifting a negative value is implementation-defined.)
+	const T mask = T(T(1) << numBits)-1;
 	return mask;
 }
 
@@ -98,12 +96,31 @@ T bit_mask(size_t numBits)
 template<typename T>
 inline T bits(T num, size_t lo_idx, size_t hi_idx)
 {
-	const size_t count = (hi_idx - lo_idx)+1;	// # bits to return
+	const size_t numBits = (hi_idx - lo_idx)+1;	// # bits to return
 	T result = T(num >> lo_idx);
-	result = T(result & bit_mask<T>(count));
+	result = T(result & bit_mask<T>(numBits));
 	return result;
 }
 
+/**
+ * @return num with bits hi_idx:lo_idx replaced by value
+ * @param num bit field to update (taken by value; the result is returned)
+ * @param lo_idx bit index of lowest  bit to include
+ * @param hi_idx bit index of highest bit to include
+ * @param value new value to be assigned to these bits (must fit in them)
+ **/
+template<typename T>
+inline T SetBitsTo(T num, size_t lo_idx, size_t hi_idx, size_t value)
+{
+	const size_t numBits = (hi_idx - lo_idx)+1;
+	const T fieldMask = bit_mask<T>(numBits);	// (valid even if numBits == bits in T)
+	debug_assert((value & ~size_t(fieldMask)) == 0);	// value must fit in the field
+	const T cleared = T(num & ~T(fieldMask << lo_idx));
+	// (widen value to T before shifting; T may be wider than size_t)
+	return T(cleared | T(T(value) << lo_idx));
+}
+
+
 /**
  * @return number of 1-bits in mask
  **/
@@ -127,7 +144,7 @@ size_t PopulationCount(T mask)
  * @return whether the given number is a power of two.
  **/
 template<typename T>
-bool is_pow2(T n)
+inline bool is_pow2(T n)
 {
 	// 0 would pass the test below but isn't a POT.
 	if(n == 0)
@@ -135,6 +152,19 @@ bool is_pow2(T n)
 	return (n & (n-1)) == 0;
 }
 
+template<typename T>
+inline T LeastSignificantBit(T x)
+{
+	const T negX = T(~x + 1);	// 2's complement, i.e. -x (avoids 'negating unsigned type' warning)
+	return x & negX;	// only the lowest 1-bit of x survives (result is 0 if x == 0)
+}
+
+template<typename T>
+inline T ClearLeastSignificantBit(T x)
+{
+	return x & (x-1);	// x-1 flips the lowest 1-bit and the zeros below it, so the AND clears that bit
+}
+
 /**
  * ceil(log2(x))
  *
diff --git a/source/lib/code_annotation.h b/source/lib/code_annotation.h
index a63acb6f0f..3aceeef810 100644
--- a/source/lib/code_annotation.h
+++ b/source/lib/code_annotation.h
@@ -43,60 +43,48 @@
 
 
 /**
-"unreachable code" helpers
-
-unreachable lines of code are often the source or symptom of subtle bugs.
-they are flagged by compiler warnings; however, the opposite problem -
-erroneously reaching certain spots (e.g. due to missing return statement)
-is worse and not detected automatically.
-
-to defend against this, the programmer can annotate their code to
-indicate to humans that a particular spot should never be reached.
-however, that isn't much help; better is a sentinel that raises an
-error if if it is actually reached. hence, the UNREACHABLE macro.
-
-ironically, if the code guarded by UNREACHABLE works as it should,
-compilers may flag the macro's code as unreachable. this would
-distract from genuine warnings, which is unacceptable.
-
-even worse, compilers differ in their code checking: GCC only complains if
-non-void functions end without returning a value (i.e. missing return
-statement), while VC checks if lines are unreachable (e.g. if they are
-preceded by a return on all paths).
-
-our implementation of UNREACHABLE solves this dilemna as follows:
-- on GCC: call abort(); since it has the noreturn attributes, the
-  "non-void" warning disappears.
-- on VC: avoid generating any code. we allow the compiler to assume the
-  spot is actually unreachable, which incidentally helps optimization.
-  if reached after all, a crash usually results. in that case, compile with
-  CONFIG_PARANOIA, which will cause an error message to be displayed.
-
-this approach still allows for the possiblity of automated
-checking, but does not cause any compiler warnings.
-**/
+ * "unreachable code" helpers
+ *
+ * unreachable lines of code are often the source or symptom of subtle bugs.
+ * they are flagged by compiler warnings; however, the opposite problem -
+ * erroneously reaching certain spots (e.g. due to missing return statement)
+ * is worse and not detected automatically.
+ *
+ * to defend against this, the programmer can annotate their code to
+ * indicate to humans that a particular spot should never be reached.
+ * however, that isn't much help; better is a sentinel that raises an
+ * error if it is actually reached. hence, the UNREACHABLE macro.
+ *
+ * ironically, if the code guarded by UNREACHABLE works as it should,
+ * compilers may flag the macro's code as unreachable. this would
+ * distract from genuine warnings, which is unacceptable.
+ *
+ * even worse, compilers differ in their code checking: GCC only complains if
+ * non-void functions end without returning a value (i.e. missing return
+ * statement), while VC checks if lines are unreachable (e.g. if they are
+ * preceded by a return on all paths).
+ *
+ * the implementation below enables optimization and automated checking
+ * without raising warnings.
+ **/
 #define UNREACHABLE	// actually defined below.. this is for
 # undef UNREACHABLE	// CppDoc's benefit only.
 
-// 1) final build: optimize assuming this location cannot be reached.
-//    may crash if that turns out to be untrue, but removes checking overhead.
-#if CONFIG_FINAL
+// compiler supports ASSUME_UNREACHABLE => allow it to assume the code is
+// never reached (improves optimization at the cost of undefined behavior
+// if the annotation turns out to be incorrect).
+#if HAVE_ASSUME_UNREACHABLE && !CONFIG_PARANOIA
 # define UNREACHABLE ASSUME_UNREACHABLE
-// 2) normal build:
+// otherwise (or if CONFIG_PARANOIA is set), add a user-visible
+// warning if the code is reached. note that abort() fails to stop
+// ICC from warning about the lack of a return statement, so we
+// use an infinite loop instead.
 #else
-//    a) normal implementation: includes "abort", which is declared with
-//       noreturn attribute and therefore avoids GCC's "execution reaches
-//       end of non-void function" warning.
-# if !MSC_VERSION || ICC_VERSION || CONFIG_PARANOIA
-#  define UNREACHABLE\
+# define UNREACHABLE\
 	STMT(\
 		debug_assert(0);	/* hit supposedly unreachable code */\
-		abort();\
+		for(;;){};\
 	)
-//    b) VC only: don't generate any code; squelch the warning and optimize.
-# else
-#  define UNREACHABLE ASSUME_UNREACHABLE
-# endif
 #endif
 
 /**
diff --git a/source/lib/sysdep/arch/x86_x64/topology.cpp b/source/lib/sysdep/arch/x86_x64/topology.cpp
index af24f6b51b..7326dd0f8c 100644
--- a/source/lib/sysdep/arch/x86_x64/topology.cpp
+++ b/source/lib/sysdep/arch/x86_x64/topology.cpp
@@ -116,7 +116,7 @@ static size_t MaxLogicalPerCache()
 
 
 //-----------------------------------------------------------------------------
-// determination of enabled cores/HTs
+// APIC IDs
 
 // APIC IDs consist of variable-length fields identifying the logical unit,
 // core, package and shared cache. if they are available, we can determine
@@ -174,106 +174,102 @@ const u8* ApicIds()
 }
 
 
-/**
- * count the number of unique APIC IDs after application of a mask.
- *
- * this is used to implement NumUniqueValuesInField and also required
- * for counting the number of caches.
- **/
-static size_t NumUniqueMaskedValues(const u8* apicIds, u8 mask)
+// (if maxValues == 1, the field is zero-width and thus zero)
+static size_t ApicField(size_t apicId, size_t indexOfLowestBit, size_t maxValues)
 {
-	std::set<u8> ids;
-	for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
-	{
-		const u8 apicId = apicIds[processor];
-		const u8 field = u8(apicId & mask);
-		ids.insert(field);
-	}
-
-	return ids.size();
+	const size_t numBits = ceil_log2(maxValues);
+	const size_t mask = bit_mask<size_t>(numBits);
+	return (apicId >> indexOfLowestBit) & mask;
 }
 
 
-/**
- * Count the number of values assumed by a certain field within APIC IDs.
- *
- * @param apicIds
- * @param offset Index of the lowest bit that is part of the field.
- * @param numValues Number of values that can be assumed by the field.
- *		  If equal to one, the field is zero-width.
- * @return number of unique values (for convenience of the topology code,
- * this is always at least one)
- **/
-static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t numValues)
-{
-	if(numValues == 1)	// see parameter description above
-		return 1;
-	const size_t numBits = ceil_log2(numValues);
-	const u8 mask = u8((bit_mask<u8>(numBits) << offset) & 0xFF);
-	return NumUniqueMaskedValues(apicIds, mask);
-}
-
-
-static size_t MinPackages(size_t maxCoresPerPackage, size_t maxLogicalPerCore)
-{
-	const size_t numNodes = numa_NumNodes();
-	const size_t logicalPerNode = PopulationCount(numa_ProcessorMaskFromNode(0));
-	// NB: some cores or logical processors may be disabled.
-	const size_t maxLogicalPerPackage = maxCoresPerPackage*maxLogicalPerCore;
-	const size_t minPackagesPerNode = DivideRoundUp(logicalPerNode, maxLogicalPerPackage);
-	return minPackagesPerNode*numNodes;
-}
-
+//-----------------------------------------------------------------------------
+// CPU topology interface
 
 struct CpuTopology	// POD
 {
-	size_t numPackages;
-	size_t coresPerPackage;
+	size_t maxLogicalPerCore;
+	size_t maxCoresPerPackage;
+
+	size_t logicalOffset;
+	size_t coreOffset;
+	size_t packageOffset;
+
+	// how many are actually enabled
 	size_t logicalPerCore;
+	size_t coresPerPackage;
+	size_t numPackages;
 };
 static CpuTopology cpuTopology;
 static ModuleInitState cpuInitState;
 
 static LibError InitCpuTopology()
 {
-	const size_t numProcessors = os_cpu_NumProcessors();
-	const size_t maxCoresPerPackage = MaxCoresPerPackage();
-	const size_t maxLogicalPerCore = MaxLogicalPerCore();
+	cpuTopology.maxLogicalPerCore = MaxLogicalPerCore();
+	cpuTopology.maxCoresPerPackage = MaxCoresPerPackage();
+
+	cpuTopology.logicalOffset = 0;
+	cpuTopology.coreOffset    = ceil_log2(cpuTopology.maxLogicalPerCore);
+	cpuTopology.packageOffset = cpuTopology.coreOffset + ceil_log2(cpuTopology.maxCoresPerPackage);
 
 	const u8* apicIds = ApicIds();
 	if(apicIds)
 	{
-		const size_t packageOffset = ceil_log2(maxCoresPerPackage) + ceil_log2(maxLogicalPerCore);
-		const size_t coreOffset    = ceil_log2(maxLogicalPerCore);
-		const size_t logicalOffset = 0;
-		cpuTopology.numPackages     = NumUniqueValuesInField(apicIds, packageOffset, 256);
-		cpuTopology.coresPerPackage = NumUniqueValuesInField(apicIds, coreOffset,    maxCoresPerPackage);
-		cpuTopology.logicalPerCore  = NumUniqueValuesInField(apicIds, logicalOffset, maxLogicalPerCore);
+		// counts the distinct values of one APIC ID field (numValues possible
+		// values, lowest bit at indexOfLowestBit) over all processors.
+		struct NumUniqueValuesInField
+		{
+			size_t operator()(const u8* apicIds, size_t indexOfLowestBit, size_t numValues) const
+			{
+				std::set<size_t> values;
+				// NB: ApicField takes the bit index before the number of values.
+				for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
+					values.insert(ApicField(apicIds[processor], indexOfLowestBit, numValues));
+				return values.size();
+			}
+		};
+
+		cpuTopology.logicalPerCore  = NumUniqueValuesInField()(apicIds, cpuTopology.logicalOffset, cpuTopology.maxLogicalPerCore);
+		cpuTopology.coresPerPackage = NumUniqueValuesInField()(apicIds, cpuTopology.coreOffset,    cpuTopology.maxCoresPerPackage);
+		cpuTopology.numPackages     = NumUniqueValuesInField()(apicIds, cpuTopology.packageOffset, 256);
 	}
 	else // the processor lacks an xAPIC, or the IDs are invalid
 	{
+		struct MinPackages
+		{
+			size_t operator()(size_t maxCoresPerPackage, size_t maxLogicalPerCore) const
+			{
+				const size_t numNodes = numa_NumNodes();
+				const size_t logicalPerNode = PopulationCount(numa_ProcessorMaskFromNode(0));
+				// NB: some cores or logical processors may be disabled.
+				const size_t maxLogicalPerPackage = maxCoresPerPackage*maxLogicalPerCore;
+				const size_t minPackagesPerNode = DivideRoundUp(logicalPerNode, maxLogicalPerPackage);
+				return minPackagesPerNode*numNodes;
+			}
+		};
+
 		// we can't differentiate between cores and logical processors.
 		// since the former are less likely to be disabled, we seek the
 		// maximum feasible number of cores and minimal number of packages:
-		const size_t minPackages = MinPackages(maxCoresPerPackage, maxLogicalPerCore);
-		const size_t maxPackages = numProcessors;
-		for(size_t numPackages = minPackages; numPackages <= maxPackages; numPackages++)
+		const size_t minPackages = MinPackages()(cpuTopology.maxCoresPerPackage, cpuTopology.maxLogicalPerCore);
+		const size_t numProcessors = os_cpu_NumProcessors();
+		for(size_t numPackages = minPackages; numPackages <= numProcessors; numPackages++)
 		{
 			if(numProcessors % numPackages != 0)
 				continue;
 			const size_t logicalPerPackage = numProcessors / numPackages;
-			const size_t minCoresPerPackage = DivideRoundUp(logicalPerPackage, maxLogicalPerCore);
-			for(size_t coresPerPackage = maxCoresPerPackage; coresPerPackage >= minCoresPerPackage; coresPerPackage--)
+			const size_t minCoresPerPackage = DivideRoundUp(logicalPerPackage, cpuTopology.maxLogicalPerCore);
+			for(size_t coresPerPackage = cpuTopology.maxCoresPerPackage; coresPerPackage >= minCoresPerPackage; coresPerPackage--)
 			{
 				if(logicalPerPackage % coresPerPackage != 0)
 					continue;
 				const size_t logicalPerCore = logicalPerPackage / coresPerPackage;
-				if(logicalPerCore <= maxLogicalPerCore)
+				if(logicalPerCore <= cpuTopology.maxLogicalPerCore)
 				{
 					debug_assert(numProcessors == numPackages*coresPerPackage*logicalPerCore);
-					cpuTopology.numPackages = numPackages;
-					cpuTopology.coresPerPackage = coresPerPackage;
 					cpuTopology.logicalPerCore = logicalPerCore;
+					cpuTopology.coresPerPackage = coresPerPackage;
+					cpuTopology.numPackages = numPackages;
 					return INFO::OK;
 				}
 			}
@@ -303,6 +299,24 @@ size_t cpu_topology_LogicalPerCore()
 	return cpuTopology.logicalPerCore;
 }
 
+size_t cpu_topology_LogicalFromId(size_t apicId)
+{
+	ModuleInit(&cpuInitState, InitCpuTopology);
+	return ApicField(apicId, cpuTopology.logicalOffset, cpuTopology.maxLogicalPerCore);	// logical-processor field: lowest bits of the ID (InitCpuTopology sets logicalOffset to 0)
+}
+
+size_t cpu_topology_CoreFromId(size_t apicId)
+{
+	ModuleInit(&cpuInitState, InitCpuTopology);
+	return ApicField(apicId, cpuTopology.coreOffset, cpuTopology.maxCoresPerPackage);	// core field: starts at coreOffset = ceil_log2(maxLogicalPerCore), see InitCpuTopology
+}
+
+size_t cpu_topology_PackageFromId(size_t apicId)
+{
+	ModuleInit(&cpuInitState, InitCpuTopology);
+	return ApicField(apicId, cpuTopology.packageOffset, 256);	// package field: starts above the core field; 256 is the value count InitCpuTopology also uses for it
+}
+
 
 //-----------------------------------------------------------------------------
 // cache topology
diff --git a/source/lib/sysdep/arch/x86_x64/topology.h b/source/lib/sysdep/arch/x86_x64/topology.h
index 5f0ae7969a..47e6b55106 100644
--- a/source/lib/sysdep/arch/x86_x64/topology.h
+++ b/source/lib/sysdep/arch/x86_x64/topology.h
@@ -65,6 +65,11 @@ LIB_API size_t cpu_topology_CoresPerPackage();
 LIB_API size_t cpu_topology_LogicalPerCore();
 
 
+LIB_API size_t cpu_topology_LogicalFromId(size_t apicId);
+LIB_API size_t cpu_topology_CoreFromId(size_t apicId);
+LIB_API size_t cpu_topology_PackageFromId(size_t apicId);
+
+
 //-----------------------------------------------------------------------------
 // L2 cache
 
diff --git a/source/lib/sysdep/arch/x86_x64/x86_x64.cpp b/source/lib/sysdep/arch/x86_x64/x86_x64.cpp
index df8a268ada..f093c0b7f1 100644
--- a/source/lib/sysdep/arch/x86_x64/x86_x64.cpp
+++ b/source/lib/sysdep/arch/x86_x64/x86_x64.cpp
@@ -157,7 +157,7 @@ bool x86_x64_cap(x86_x64_Cap cap)
 
 
 //-----------------------------------------------------------------------------
-// CPU identification
+// vendor
 
 static x86_x64_Vendors vendor;
 
@@ -197,10 +197,14 @@ x86_x64_Vendors x86_x64_Vendor()
 }
 
 
+//-----------------------------------------------------------------------------
+// signature
+
 static size_t model;
 static size_t family;
+static ModuleInitState signatureInitState;
 
-static void InitModelAndFamily()
+static LibError InitSignature()
 {
 	x86_x64_CpuidRegs regs = { 0 };
 	regs.eax = 1;
@@ -214,71 +218,19 @@ static void InitModelAndFamily()
 		family += extendedFamily;
 	if(family == 0xF || (x86_x64_Vendor() == X86_X64_VENDOR_INTEL && family == 6))
 		model += extendedModel << 4;
-}
-
-
-static size_t generation;
-
-static LibError InitGeneration()
-{
-	InitModelAndFamily();
-
-	switch(x86_x64_Vendor())
-	{
-	case X86_X64_VENDOR_AMD:
-		switch(family)
-		{
-		case 5:
-			if(model < 6)
-				generation = 5;	// K5
-			else
-				generation = 6;	// K6
-			break;
-
-		case 6:
-			generation = 7;	// K7 (Athlon)
-			break;
-
-		case 0xF:
-		case 0x10:
-			generation = 8;	// K8 (Opteron)
-			break;
-		}
-		break;
-
-	case X86_X64_VENDOR_INTEL:
-		switch(family)
-		{
-		case 5:
-			generation = 5;	// Pentium
-			break;
-
-		case 6:
-			if(model < 0xF)
-				generation = 6;	// Pentium Pro/II/III/M
-			else
-				generation = 8;	// Core2Duo
-			break;
-
-		case 0xF:
-			if(model <= 6)
-				generation = 7;	// Pentium 4/D
-			break;
-		}
-		if(family >= 0x10)
-			generation = 9;
-		break;
-	}
-
-	debug_assert(generation != 0);
 	return INFO::OK;
 }
 
-size_t x86_x64_Generation()
+size_t x86_x64_Model()
 {
-	static ModuleInitState initState;
-	ModuleInit(&initState, InitGeneration);
-	return generation;
+	ModuleInit(&signatureInitState, InitSignature);
+	return model;
+}
+
+size_t x86_x64_Family()
+{
+	ModuleInit(&signatureInitState, InitSignature);
+	return family;
 }
 
 
@@ -832,7 +784,8 @@ static LibError InitIdentifierString()
 	//   doesn't recognize.
 	if(!gotBrandString || strncmp(identifierString, "Unknow", 6) == 0)
 	{
-		InitModelAndFamily();
+		const size_t family = x86_x64_Family();
+		const size_t model = x86_x64_Model();
 		switch(x86_x64_Vendor())
 		{
 		case X86_X64_VENDOR_AMD:
diff --git a/source/lib/sysdep/arch/x86_x64/x86_x64.h b/source/lib/sysdep/arch/x86_x64/x86_x64.h
index 3b525759dc..c9f29c3024 100644
--- a/source/lib/sysdep/arch/x86_x64/x86_x64.h
+++ b/source/lib/sysdep/arch/x86_x64/x86_x64.h
@@ -73,6 +73,11 @@ enum x86_x64_Vendors
 LIB_API x86_x64_Vendors x86_x64_Vendor();
 
 
+LIB_API size_t x86_x64_Model();
+
+LIB_API size_t x86_x64_Family();
+
+
 /**
  * @return the colloquial processor generation
  * (5 = Pentium, 6 = Pentium Pro/II/III / K6, 7 = Pentium4 / Athlon, 8 = Core / Opteron)
@@ -96,6 +101,7 @@ enum x86_x64_Cap
 	// standard (edx)
 	X86_X64_CAP_FPU             = 32+0,  // Floating Point Unit
 	X86_X64_CAP_TSC             = 32+4,  // TimeStamp Counter
+	X86_X64_CAP_MSR             = 32+5,	 // Model Specific Registers
 	X86_X64_CAP_CMOV            = 32+15, // Conditional MOVe
 	X86_X64_CAP_TM_SCC          = 32+22, // Thermal Monitoring and Software Controlled Clock
 	X86_X64_CAP_MMX             = 32+23, // MultiMedia eXtensions
diff --git a/source/lib/sysdep/compiler.h b/source/lib/sysdep/compiler.h
index 7be48e1e7d..fdad9daa31 100644
--- a/source/lib/sysdep/compiler.h
+++ b/source/lib/sysdep/compiler.h
@@ -175,10 +175,15 @@
 // this macro should not generate any fallback code; it is merely the
 // compiler-specific backend for lib.h's UNREACHABLE.
 // #define it to nothing if the compiler doesn't support such a hint.
-#if MSC_VERSION
+#define HAVE_ASSUME_UNREACHABLE 1
+#if MSC_VERSION && !ICC_VERSION // (ICC ignores this)
 # define ASSUME_UNREACHABLE __assume(0)
+#elif GCC_VERSION >= 450
+# define ASSUME_UNREACHABLE __builtin_unreachable()
 #else
 # define ASSUME_UNREACHABLE
+# undef HAVE_ASSUME_UNREACHABLE
+# define HAVE_ASSUME_UNREACHABLE 0
 #endif
 
 
diff --git a/source/lib/sysdep/os/win/aken/aken.h b/source/lib/sysdep/os/win/aken/aken.h
index 77722cc7d7..582249ca5c 100644
--- a/source/lib/sysdep/os/win/aken/aken.h
+++ b/source/lib/sysdep/os/win/aken/aken.h
@@ -41,47 +41,75 @@
 #define IOCTL_AKEN_WRITE_PORT          CTL_CODE(FILE_DEVICE_AKEN, AKEN_IOCTL+1, METHOD_BUFFERED, FILE_ANY_ACCESS)
 #define IOCTL_AKEN_MAP                 CTL_CODE(FILE_DEVICE_AKEN, AKEN_IOCTL+2, METHOD_BUFFERED, FILE_ANY_ACCESS)
 #define IOCTL_AKEN_UNMAP               CTL_CODE(FILE_DEVICE_AKEN, AKEN_IOCTL+3, METHOD_BUFFERED, FILE_ANY_ACCESS)
+#define IOCTL_AKEN_READ_MSR            CTL_CODE(FILE_DEVICE_AKEN, AKEN_IOCTL+4, METHOD_BUFFERED, FILE_ANY_ACCESS)
+#define IOCTL_AKEN_WRITE_MSR           CTL_CODE(FILE_DEVICE_AKEN, AKEN_IOCTL+5, METHOD_BUFFERED, FILE_ANY_ACCESS)
+#define IOCTL_AKEN_READ_PMC            CTL_CODE(FILE_DEVICE_AKEN, AKEN_IOCTL+6, METHOD_BUFFERED, FILE_ANY_ACCESS)
 
 
 // input and output data structures for the IOCTLs
 
 #pragma pack(push, 1)
 
-struct AkenReadPortIn
+typedef struct AkenReadPortIn_
 {
 	USHORT port;
 	UCHAR numBytes;
-};
+}
+AkenReadPortIn;
 
-struct AkenReadPortOut
+typedef struct AkenReadPortOut_
 {
 	DWORD32 value;
-};
+}
+AkenReadPortOut;
 
-struct AkenWritePortIn
+typedef struct AkenWritePortIn_
 {
 	DWORD32 value;
 	USHORT port;
 	UCHAR numBytes;
-};
+}
+AkenWritePortIn;
 
-struct AkenMapIn
+typedef struct AkenMapIn_
 {
 	// note: fixed-width types allow the 32 or 64-bit Mahaf wrapper to
 	// interoperate with the 32 or 64-bit Aken driver.
 	DWORD64 physicalAddress;
 	DWORD64 numBytes;
-};
+}
+AkenMapIn;
 
-struct AkenMapOut
+typedef struct AkenMapOut_
 {
 	DWORD64 virtualAddress;
-};
+}
+AkenMapOut;
 
-struct AkenUnmapIn
+typedef struct AkenUnmapIn_
 {
 	DWORD64 virtualAddress;
-};
+}
+AkenUnmapIn;
+
+typedef struct AkenReadRegisterIn_
+{
+	DWORD64 reg;
+}
+AkenReadRegisterIn;
+
+typedef struct AkenReadRegisterOut_
+{
+	DWORD64 value;
+}
+AkenReadRegisterOut;
+
+typedef struct AkenWriteRegisterIn_
+{
+	DWORD64 reg;
+	DWORD64 value;
+}
+AkenWriteRegisterIn;
 
 #pragma pack(pop)
 
diff --git a/source/lib/sysdep/os/win/mahaf.cpp b/source/lib/sysdep/os/win/mahaf.cpp
index 9fd059e9e3..6f4425973e 100644
--- a/source/lib/sysdep/os/win/mahaf.cpp
+++ b/source/lib/sysdep/os/win/mahaf.cpp
@@ -25,6 +25,7 @@
  */
 
 #include "precompiled.h"
+#include "lib/sysdep/os/win/mahaf.h"
 
 #include "lib/sysdep/os/win/win.h"
 #include <winioctl.h>
@@ -56,8 +57,7 @@ static u32 ReadPort(u16 port, u8 numBytes)
 	}
 
 	debug_assert(bytesReturned == sizeof(out));
-	const u32 value = out.value;
-	return value;
+	return out.value;
 }
 
 u8 mahaf_ReadPort8(u16 port)
@@ -159,6 +159,48 @@ void mahaf_UnmapPhysicalMemory(volatile void* virtualAddress)
 }
 
 
+static u64 ReadRegister(DWORD ioctl, u64 reg)	// shared by the MSR and PMC readers; ioctl selects the Aken request
+{
+	AkenReadRegisterIn in;
+	in.reg = reg;
+	AkenReadRegisterOut out;	// passed to DeviceIoControl as the output buffer
+
+	DWORD bytesReturned;
+	LPOVERLAPPED ovl = 0;	// synchronous
+	BOOL ok = DeviceIoControl(hAken, ioctl, &in, sizeof(in), &out, sizeof(out), &bytesReturned, ovl);
+	if(!ok)
+	{
+		WARN_WIN32_ERR;
+		return 0;	// NB: callers cannot tell this apart from a register that reads as 0
+	}
+
+	debug_assert(bytesReturned == sizeof(out));
+	return out.value;
+}
+
+u64 mahaf_ReadModelSpecificRegister(u64 reg)
+{
+	return ReadRegister((DWORD)IOCTL_AKEN_READ_MSR, reg);
+}
+
+u64 mahaf_ReadPerformanceMonitoringCounter(u64 reg)
+{
+	return ReadRegister((DWORD)IOCTL_AKEN_READ_PMC, reg);
+}
+
+void mahaf_WriteModelSpecificRegister(u64 reg, u64 value)
+{
+	AkenWriteRegisterIn in;
+	in.reg = reg;
+	in.value = value;
+
+	DWORD bytesReturned;	// unused but must be passed to DeviceIoControl
+	LPOVERLAPPED ovl = 0;	// synchronous
+	BOOL ok = DeviceIoControl(hAken, (DWORD)IOCTL_AKEN_WRITE_MSR, &in, sizeof(in), 0, 0u, &bytesReturned, ovl);
+	WARN_IF_FALSE(ok);
+}
+
+
 //-----------------------------------------------------------------------------
 // driver installation
 //-----------------------------------------------------------------------------
diff --git a/source/lib/sysdep/os/win/mahaf.h b/source/lib/sysdep/os/win/mahaf.h
index 74b7dc804d..bea5a4fff3 100644
--- a/source/lib/sysdep/os/win/mahaf.h
+++ b/source/lib/sysdep/os/win/mahaf.h
@@ -39,20 +39,26 @@
  * note: mahaf_MapPhysicalMemory will complain if it
  * is called despite this function having returned true.
  **/
-extern bool mahaf_IsPhysicalMappingDangerous();
+LIB_API bool mahaf_IsPhysicalMappingDangerous();
 
 
-extern LibError mahaf_Init();
-extern void mahaf_Shutdown();
+LIB_API LibError mahaf_Init();
+LIB_API void mahaf_Shutdown();
 
-extern u8  mahaf_ReadPort8 (u16 port);
-extern u16 mahaf_ReadPort16(u16 port);
-extern u32 mahaf_ReadPort32(u16 port);
-extern void mahaf_WritePort8 (u16 port, u8  value);
-extern void mahaf_WritePort16(u16 port, u16 value);
-extern void mahaf_WritePort32(u16 port, u32 value);
+LIB_API u8  mahaf_ReadPort8 (u16 port);
+LIB_API u16 mahaf_ReadPort16(u16 port);
+LIB_API u32 mahaf_ReadPort32(u16 port);
+LIB_API void mahaf_WritePort8 (u16 port, u8  value);
+LIB_API void mahaf_WritePort16(u16 port, u16 value);
+LIB_API void mahaf_WritePort32(u16 port, u32 value);
 
-extern volatile void* mahaf_MapPhysicalMemory(uintptr_t physicalAddress, size_t numBytes);
-extern void mahaf_UnmapPhysicalMemory(volatile void* virtualAddress);
+LIB_API volatile void* mahaf_MapPhysicalMemory(uintptr_t physicalAddress, size_t numBytes);
+LIB_API void mahaf_UnmapPhysicalMemory(volatile void* virtualAddress);
+
+LIB_API u64 mahaf_ReadModelSpecificRegister(u64 reg);
+LIB_API void mahaf_WriteModelSpecificRegister(u64 reg, u64 value);
+
+// must be done in the driver because Windows clears CR4.PCE[8]
+LIB_API u64 mahaf_ReadPerformanceMonitoringCounter(u64 reg);
 
 #endif	// INCLUDED_MAHAF
diff --git a/source/lib/sysdep/os/win/whrt/tsc.cpp b/source/lib/sysdep/os/win/whrt/tsc.cpp
index d834df9e87..7abe12ceaa 100644
--- a/source/lib/sysdep/os/win/whrt/tsc.cpp
+++ b/source/lib/sysdep/os/win/whrt/tsc.cpp
@@ -38,6 +38,7 @@
 #if ARCH_X86_X64
 # include "lib/sysdep/arch/x86_x64/x86_x64.h"	// x86_x64_rdtsc
 # include "lib/sysdep/arch/x86_x64/topology.h"
+# include "lib/sysdep/arch/x86_x64/msr.h"
 #endif
 
 
@@ -173,7 +174,7 @@ public:
 
 #if ARCH_X86_X64
 		// recent CPU:
-		if(x86_x64_Generation() >= 7)
+		//if(x86_x64_Generation() >= 7)
 		{
 			// note: 8th generation CPUs support C1-clock ramping, which causes
 			// drift on multi-core systems, but those were excluded above.
@@ -183,7 +184,7 @@ public:
 			// the chipset thinks the system is dangerously overheated; the
 			// OS isn't even notified. this may be rare, but could cause
 			// incorrect results => unsafe.
-			return false;
+			//return false;
 		}
 #endif
 
@@ -217,6 +218,15 @@ public:
 		// note: even here, initial accuracy isn't critical because the
 		// clock is subject to thermal drift and would require continual
 		// recalibration anyway.
+#if ARCH_X86_X64
+		if(MSR::HasNehalem())
+		{
+			const u64 platformInfo = MSR::Read(MSR::PLATFORM_INFO);
+			const u8 maxNonTurboRatio = bits(platformInfo, 8, 15);
+			return maxNonTurboRatio * 133.33e6f;
+		}
+		else
+#endif
 		return os_cpu_ClockFrequency();
 	}