File kvm-backend-7.2.0-dev-20250903.patch of Package virtualbox-kvm
diff --git a/Config.kmk b/Config.kmk
index 26651f7..6028374 100644
--- a/Config.kmk
+++ b/Config.kmk
@@ -574,7 +574,7 @@ endif
# Enables the new breakpoint handling code, see @bugref{8650}
VBOX_WITH_DBGF_FLOW_TRACING = 1
# Enables ARMv8 API support and if possible virtualization, see @bugref{10383}
-VBOX_WITH_VIRT_ARMV8 = 1
+VBOX_WITH_VIRT_ARMV8 =
# Makes x86 emulation on ARM hosts available in the GUI.
if1of ($(KBUILD_TARGET).$(KBUILD_TARGET_ARCH), empty-set)
VBOX_WITH_X86_ON_ARM_ENABLED = 1
@@ -1702,6 +1702,19 @@ ifdef VBOX_HEADLESS
VBOX_WITH_VRDP_RDESKTOP =
endif
+#
+# Configure VirtualBox to use the KVM NEM backend.
+#
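+# Typically enabled by passing --with-kvm to ./configure (see the configure
+# changes below), which appends VBOX_WITH_KVM=1 to AutoConfig.kmk.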
+ifdef VBOX_WITH_KVM
+ VBOX_WITH_DRIVERLESS_FORCED = 1
+ VBOX_WITH_NATIVE_NEM=1
+ # KVM doesn't need the VirtualBox Ring 0 drivers
+ VBOX_WITH_VBOXDRV=
+ VBOX_WITH_NETFLT=
+ VBOX_WITH_NETFLT_CROSSBOW=
+ VBOX_WITH_NETADP=
+endif
+
#
# Undefined VBOX_WITH_MAIN implies exclusion of a few more items.
#
@@ -2119,6 +2132,14 @@ endif
ifdef VBOX_WITH_STATIC_ARM64_PAGE_SHIFT
DEFS.linux.arm64 += IPRT_STATIC_ARM64_PAGE_SHIFT=$(VBOX_WITH_STATIC_ARM64_PAGE_SHIFT)
endif
+CYBERUS_CXX_FLAGS = -Werror -Wall
+ifdef VBOX_WITH_KVM
+ DEFS += VBOX_WITH_KVM
+ DEFS += VBOX_WITH_KVM_NESTING
+endif
+ifndef VBOX_HEADLESS
+ DEFS += VBOX_WITH_GVT_RENDERING
+endif
# Don't flood CDEFS, old MASMs doesn't like too many defines.
ifdef VBOX_WITH_DEBUGGER
@@ -3641,6 +3662,8 @@ ifndef VBOX_GCC_std
VBOX_GCC_std := -std=c++17
# else if "$(VBOX_CLANG_VERSION_CXX)" vge 60000 # Most language features complete by v6. Lib stuff was less complete in v6, but hopefully acceptable for out purposes.
#VBOX_GCC_std := -std=c++17
+ else if "$(VBOX_WITH_KVM)" veq 1
+ VBOX_GCC_std := -std=c++17
else if "$(VBOX_CLANG_VERSION_CXX)" vge 50000 # darwin Xcode 5 allegedly knows what C++11 is
VBOX_GCC_std := -std=c++11
# else if "$(VBOX_GCC_VERSION_CXX)" vge 70000 # Language feature P0512R0 was v8, rest v7 or earlier. Most lib stuff present in 7, complete in v12.
diff --git a/configure b/configure
index 9e67b32..1d3de3c 100755
--- a/configure
+++ b/configure
@@ -86,6 +86,7 @@ SETUP_WINE=
ONLY_ADDITIONS=0
TARGET_MACHINE=""
TARGET_CPU=""
+WITH_KVM=0
WITH_XPCOM=1
WITH_PYTHON=1
WITH_JAVA=1
@@ -2489,6 +2490,7 @@ cat << EOF
--build-libssl build openssl from sources
--build-libtpms build libtpms from sources
--build-liblzma build liblzma from sources
+  --with-kvm build with the KVM backend
EOF
[ $OSE -eq 0 ] && cat << EOF
--build-libcurl build libcurl from sources
@@ -2643,6 +2645,9 @@ for option in "$@"; do
--with-linux=*)
LINUX=`echo $option | cut -d'=' -f2`
;;
+ --with-kvm)
+ WITH_KVM=1
+ ;;
--with-makeself=*)
MAKESELF=`echo $option | cut -d'=' -f2`
;;
@@ -2922,6 +2927,7 @@ fi
[ $WITH_JAVA -eq 0 ] && cnf_append "VBOX_WITH_JWS" ""
[ $WITH_HARDENING -eq 0 ] && cnf_append "VBOX_WITHOUT_HARDENING" "1"
[ $WITH_HARDENING -eq 2 ] && cnf_append "VBOX_WITH_HARDENING" "2"
+[ $WITH_KVM -eq 1 ] && cnf_append "VBOX_WITH_KVM" "1"
[ $WITH_LIBTPMS -eq 0 ] && cnf_append "VBOX_WITH_LIBTPMS" ""
[ $WITH_LIBLZMA -eq 0 ] && cnf_append "VBOX_WITH_LIBLZMA" ""
if [ $WITH_LIBVPX -eq 0 ]; then
diff --git a/include/VBox/vmm/nem.h b/include/VBox/vmm/nem.h
index 76414fb..13111c4 100644
--- a/include/VBox/vmm/nem.h
+++ b/include/VBox/vmm/nem.h
@@ -43,6 +43,14 @@
#include <VBox/vmm/vmapi.h>
#include <VBox/vmm/pgm.h>
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+// For KVMPICSTATE and KVMIRQCHIP
+#include <VBox/vmm/pdmdev.h>
+#endif
+
+#if defined(VBOX_WITH_KVM) && defined(IN_RING3)
+#include <VBox/vmm/cpum.h> /* for PCPUMCPUIDLEAF */
+#endif
RT_C_DECLS_BEGIN
@@ -163,6 +171,150 @@ VMMR3_INT_DECL(int) NEMR3NotifyPhysRomRegisterEarly(PVM pVM, RTGCPHYS GCPhys, R
VMMR3_INT_DECL(int) NEMR3NotifyPhysRomRegisterLate(PVM pVM, RTGCPHYS GCPhys, RTGCPHYS cb, void *pvPages,
uint32_t fFlags, uint8_t *pu2State, uint32_t *puNemRange);
+#if defined(VBOX_WITH_KVM) && defined(IN_RING3)
+
+/**
+ * Retrieves the value of a single model-specific register (MSR).
+ * @param pVCpu The vCPU in whose context the MSR should be read (can be any vCPU for global MSRs).
+ * @param msr The index of the MSR that should be read.
+ * @param val A buffer that receives the value of the specified MSR if the read was successful.
+ * @return VBox status code; VINF_SUCCESS if the read access was successful.
+ */
+VMMR3_INT_DECL(int) NEMR3KvmGetMsr(PVMCPU pVCpu, uint64_t msr, uint64_t* val);
+
+/**
+ * Writes a value to a single model-specific register (MSR).
+ * @param pVCpu The vCPU in whose context the MSR should be written (can be any vCPU for global MSRs).
+ * @param msr The index of the MSR that should be written.
+ * @param val The value that should be written to the MSR.
+ * @return VBox status code; VINF_SUCCESS if the write access was successful.
+ */
+VMMR3_INT_DECL(int) NEMR3KvmSetMsr(PVMCPU pVCpu, uint64_t msr, uint64_t val);
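+
+/*
+ * Illustrative sketch of an MSR round-trip through the two accessors above;
+ * the choice of MSR_K8_TSC_AUX is only an example and the error handling is
+ * abbreviated:
+ *
+ *     uint64_t uValue = 0;
+ *     int rc = NEMR3KvmGetMsr(pVCpu, MSR_K8_TSC_AUX, &uValue);
+ *     if (RT_SUCCESS(rc))
+ *         rc = NEMR3KvmSetMsr(pVCpu, MSR_K8_TSC_AUX, uValue);
+ */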
+
+/**
+ * Asserts a specific interrupt line on both PIC and I/O APIC.
+ * @param pVM The cross context VM structure.
+ * @param u16Gsi The GSI of the interrupt line that should be asserted.
+ * @param iLevel Line level, either PDM_IRQ_LEVEL_HIGH, PDM_IRQ_LEVEL_LOW or PDM_IRQ_LEVEL_FLIP_FLOP.
+ * @return VBox status code.
+ */
+VMMR3_INT_DECL(int) NEMR3KvmSetIrqLine(PVM pVM, uint16_t u16Gsi, int iLevel);
+
+/**
+ * Execute state load operation. This sets the correct KVM MP state depending on
+ * the state of the VBox vCPUs.
+ * @param pVM The cross context VM structure
+ */
+VMMR3_INT_DECL(int) NEMR3LoadExec(PVM pVM);
+
+/**
+ * Retrieves the local APIC state from the in-kernel irqchip.
+ * @param pVCpu The vCPU to retrieve the APIC state from.
+ * @param pXApicPage Pointer to the memory the APIC state is saved to. Must be
+ * at least of size KVM_APIC_REG_SIZE.
+ * @returns VBox status code
+ */
+VMMR3_INT_DECL(int) NEMR3KvmGetLapicState(PVMCPU pVCpu, void* pXApicPage);
+
+/**
+ * Configures the local APIC state of the in-kernel irqchip.
+ * @param pVCpu The vCPU for which to set the APIC state.
+ * @param pXApicPage Pointer to the memory containing APIC state. Must be at
+ * least of size KVM_APIC_REG_SIZE.
+ * @returns VBox status code
+ */
+VMMR3_INT_DECL(int) NEMR3KvmSetLapicState(PVMCPU pVCpu, void* pXApicPage);
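+
+/*
+ * Illustrative sketch of a LAPIC state round-trip; the buffer size follows
+ * the KVM_APIC_REG_SIZE requirement documented above (0x400 bytes in the
+ * KVM UAPI):
+ *
+ *     uint8_t abXApicPage[0x400];
+ *     int rc = NEMR3KvmGetLapicState(pVCpu, abXApicPage);
+ *     if (RT_SUCCESS(rc))
+ *         rc = NEMR3KvmSetLapicState(pVCpu, abXApicPage);
+ */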
+
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+
+/**
+ * Retrieves the PIC state from the in-kernel irqchip.
+ * @param pVM The VM to retrieve the PIC state from
+ * @param irqchip Whether to retrieve the state from the master or slave PIC
+ * @param state Buffer to store the PIC state in.
+ * @returns VBox status code
+ */
+VMMR3_INT_DECL(int) NEMR3KvmGetPicState(PVM pVM, KVMIRQCHIP irqchip, KVMPICSTATE* state);
+
+/**
+ * Configures the PIC state of the in-kernel irqchip.
+ * @param pVM The VM for which to set the PIC state
+ * @param irqchip Whether to set the state of the master or slave PIC
+ * @param state Pointer to the memory containing PIC state.
+ * @returns VBox status code
+ */
+VMMR3_INT_DECL(int) NEMR3KvmSetPicState(PVM pVM, KVMIRQCHIP irqchip, KVMPICSTATE* state);
+
+/**
+ * Retrieves the I/O APIC state from the in-kernel irqchip.
+ * @param pVM The VM to retrieve the I/O APIC state from
+ * @param state Buffer in which to store the I/O APIC state.
+ * @returns VBox status code
+ */
+VMMR3_INT_DECL(int) NEMR3KvmGetIoApicState(PVM pVM, KVMIOAPICSTATE* state);
+
+/**
+ * Configures the I/O APIC state of the in-kernel irqchip.
+ * @param pVM The VM for which to set the I/O APIC state
+ * @param state Pointer to the memory containing I/O APIC state.
+ * @returns VBox status code
+ */
+VMMR3_INT_DECL(int) NEMR3KvmSetIoApicState(PVM pVM, KVMIOAPICSTATE* state);
+#endif
+/**
+ * Deliver a MSI via the in-kernel irqchip.
+ *
+ * @returns VBox status code
+ * @param pVM The cross context VM structure
+ * @param pMsi The MSI to inject into the guest
+ */
+VMMR3_INT_DECL(int) NEMR3KvmSplitIrqchipDeliverMsi(PVM pVM, PCMSIMSG pMsi);
+
+/**
+ * Add or update the Redirection Table entry indexed by the GSI number.
+ *
+ * Interrupts configured via this interface will cause an EOI exit when the
+ * guest acknowledges them. Typically, this is only necessary for level
+ * triggered interrupts.
+ *
+ * @returns VBox status code
+ * @param pVM The cross context VM structure
+ * @param u16Gsi The GSI number
+ * @param pMsi The MSI that should be delivered when the interrupt fires
+ */
+VMMR3_INT_DECL(int) NEMR3KvmSplitIrqchipAddUpdateRTE(PVM pVM, uint16_t u16Gsi, PCMSIMSG pMsi);
+
+/**
+ * Remove a Redirection Table entry indexed by the GSI number.
+ *
+ * @returns VBox status code
+ * @param pVM The cross context VM structure
+ * @param u16Gsi The GSI number for which the Redirection Table entry should
+ * be removed
+ */
+VMMR3_INT_DECL(int) NEMR3KvmSplitIrqchipRemoveRTE(PVM pVM, uint16_t u16Gsi);
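+
+/*
+ * Illustrative sketch of the split-irqchip routing flow; the MSI address and
+ * data values are placeholders, the field layout is the one from VBox/msi.h:
+ *
+ *     MSIMSG Msi;
+ *     RT_ZERO(Msi);
+ *     Msi.Addr.u64 = 0xfee00000;  // example: fixed delivery to APIC ID 0
+ *     Msi.Data.u32 = 0x30;        // example: vector 0x30
+ *     int rc = NEMR3KvmSplitIrqchipAddUpdateRTE(pVM, u16Gsi, &Msi);
+ *     ...
+ *     rc = NEMR3KvmSplitIrqchipRemoveRTE(pVM, u16Gsi);
+ */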
+
+/**
+ * Returns an array of Hyper-V CPUID leaves supported by KVM.
+ *
+ * @returns VBox status code
+ * @param pVM The cross context VM structure
+ * @param outpCpuId The pointer where the CPUID leaves will be returned. Must be freed by the caller with RTMemFree!
+ * @param outcLeaves The pointer where the number of CPUID leaves will be returned.
+ */
+VMMR3_INT_DECL(int) NEMR3KvmGetHvCpuIdLeaves(PVM pVM, PCPUMCPUIDLEAF *outpCpuId, size_t *outcLeaves);
+
+/**
+ * Returns an array of CPUID leaves supported by KVM.
+ *
+ * @returns VBox status code
+ * @param pVM The cross context VM structure
+ * @param outpCpuId The pointer where the CPUID leaves will be returned. Must be freed by the caller with RTMemFree!
+ * @param outcLeaves The pointer where the number of CPUID leaves will be returned.
+ */
+VMMR3_INT_DECL(int) NEMR3KvmGetCpuIdLeaves(PVM pVM, PCPUMCPUIDLEAF *outpCpuId, size_t *outcLeaves);
+#endif
+
/** @name Flags for NEMR3NotifyPhysRomRegisterEarly and NEMR3NotifyPhysRomRegisterLate.
* @{ */
/** Set if the range is replacing RAM rather that unused space. */
diff --git a/include/VBox/vmm/pdmdev.h b/include/VBox/vmm/pdmdev.h
index 0befe41..7ce0de0 100644
--- a/include/VBox/vmm/pdmdev.h
+++ b/include/VBox/vmm/pdmdev.h
@@ -64,6 +64,49 @@
#include <iprt/stdarg.h>
#include <iprt/list.h>
+#ifdef VBOX_WITH_KVM
+#define KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS 24
+#define KVM_IRQCHIP_NUM_PIC_INTR_PINS 16
+#endif
+
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+struct KVMPICSTATE
+{
+ uint8_t last_irr;
+ uint8_t irr;
+ uint8_t imr;
+ uint8_t isr;
+ uint8_t priority_add;
+ uint8_t irq_base;
+ uint8_t read_reg_select;
+ uint8_t poll;
+ uint8_t special_mask;
+ uint8_t init_state;
+ uint8_t auto_eoi;
+ uint8_t rotate_on_auto_eoi;
+ uint8_t special_fully_nested_mode;
+ uint8_t init4;
+ uint8_t elcr;
+ uint8_t elcr_mask;
+};
+
+enum class KVMIRQCHIP
+{
+ PIC_MASTER = 0,
+ PIC_SLAVE = 1,
+};
+
+struct KVMIOAPICSTATE
+{
+ uint64_t base_address;
+ uint32_t ioregsel;
+ uint32_t id;
+ uint32_t irr;
+
+ uint64_t redirtbl[KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS];
+};
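+
+/* Note: KVMPICSTATE and KVMIOAPICSTATE deliberately mirror the field layout
+   of the kernel's struct kvm_pic_state and struct kvm_ioapic_state UAPI
+   types, so state can be copied field by field when talking to KVM. */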
+#endif
+
RT_C_DECLS_BEGIN
@@ -1762,6 +1805,35 @@ typedef struct PDMPICHLP
*/
DECLCALLBACKMEMBER(void, pfnUnlock,(PPDMDEVINS pDevIns));
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+ /**
+ * Asserts a PIC INTR Line.
+ * @param pDevIns The PIC device instance.
+     * @param u16Gsi The GSI of the line to assert.
+     * @param iLevel Either PDM_IRQ_LEVEL_HIGH, PDM_IRQ_LEVEL_LOW or PDM_IRQ_LEVEL_FLIP_FLOP.
+     * @return VBox status code.
+ */
+ DECLCALLBACKMEMBER(int, pfnKvmSetIrqLine,(PPDMDEVINS pDevIns, uint16_t u16Gsi, int iLevel));
+
+ /**
+ * Retrieves the PIC state from the in-kernel irqchip.
+ * @param pDevIns The PIC device instance.
+     * @param irqchip Whether to retrieve the state from the master or slave PIC.
+ * @param state Buffer to store the PIC state in.
+ * @returns VBox status code
+ */
+ DECLCALLBACKMEMBER(int, pfnKvmGetPicState,(PPDMDEVINS pDevIns, KVMIRQCHIP irqchip, KVMPICSTATE* state));
+
+ /**
+ * Configures the PIC state of the in-kernel irqchip.
+ * @param pDevIns The PIC device instance.
+     * @param irqchip Whether to set the state of the master or slave PIC.
+ * @param state Pointer to the memory containing PIC state.
+ * @returns VBox status code
+ */
+ DECLCALLBACKMEMBER(int, pfnKvmSetPicState,(PPDMDEVINS pDevIns, KVMIRQCHIP irqchip, KVMPICSTATE* state));
+#endif
+
/** Just a safety precaution. */
uint32_t u32TheEnd;
} PDMPICHLP;
@@ -1948,6 +2020,55 @@ typedef struct PDMIOAPICHLP
*/
DECLCALLBACKMEMBER(int, pfnIommuMsiRemap,(PPDMDEVINS pDevIns, uint16_t idDevice, PCMSIMSG pMsiIn, PMSIMSG pMsiOut));
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM)
+ DECLCALLBACKMEMBER(int, pfnKvmSetIrqLine,(PPDMDEVINS pDevIns, uint16_t u16Gsi, int iLevel));
+ /**
+ * Private interface between IOAPIC and KVM Split Irq Chip
+ *
+ * @returns status code.
+ * @param pDevIns Device instance of the IOAPIC.
+ * @param pMsi The MSI to deliver to the KVM Split Irq Chip
+ */
+ DECLCALLBACKMEMBER(int, pfnKvmSplitIrqchipDeliverMsi,(PPDMDEVINS pDevIns, PCMSIMSG pMsi));
+
+ /**
+     * Add or update the Redirection Table entry for the desired GSI.
+ *
+ * @returns status code.
+ * @param pDevIns Device instance of the IOAPIC
+ * @param u16Gsi The GSI number to change the redirection table entry for.
+ * @param pMsi The MSI that should be sent when GSI is triggered
+ */
+ DECLCALLBACKMEMBER(int, pfnKvmSplitIrqchipAddUpdateRTE, (PPDMDEVINS pDevIns, uint16_t u16Gsi, PCMSIMSG pMsi));
+
+ /**
+ * Remove the entry from the Redirection Table indicated by the GSI number.
+ *
+     * @returns status code.
+ * @param pDevIns Device instance of the IOAPIC
+ * @param u16Gsi The GSI number to remove from the Redirection Table
+ */
+ DECLCALLBACKMEMBER(int, pfnKvmSplitIrqchipRemoveRTE, (PPDMDEVINS pDevIns, uint16_t u16Gsi));
+#endif
+
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+ /**
+ * Retrieves the I/O APIC state from the in-kernel irqchip.
+ * @param pDevIns The I/O APIC device instance.
+ * @param state Buffer to store the I/O APIC state in.
+ * @returns VBox status code
+ */
+ DECLCALLBACKMEMBER(int, pfnKvmGetIoApicState,(PPDMDEVINS pDevIns, KVMIOAPICSTATE* state));
+
+ /**
+ * Configures the I/O APIC state of the in-kernel irqchip.
+ * @param pDevIns The I/O APIC device instance.
+ * @param state Pointer to the memory containing I/O APIC state.
+ * @returns VBox status code
+ */
+ DECLCALLBACKMEMBER(int, pfnKvmSetIoApicState,(PPDMDEVINS pDevIns, KVMIOAPICSTATE* state));
+#endif
+
/** Just a safety precaution. */
uint32_t u32TheEnd;
} PDMIOAPICHLP;
diff --git a/include/iprt/mangling.h b/include/iprt/mangling.h
index 2f2bc7e..a21e4ed 100644
--- a/include/iprt/mangling.h
+++ b/include/iprt/mangling.h
@@ -2705,6 +2705,7 @@
# define RTThreadIsSelfKnown RT_MANGLER(RTThreadIsSelfKnown)
# define RTThreadNativeSelf RT_MANGLER(RTThreadNativeSelf)
# define RTThreadControlPokeSignal RT_MANGLER(RTThreadControlPokeSignal) /* not-win not-os2 */
+# define RTThreadPokeSignal RT_MANGLER(RTThreadPokeSignal) /* not-win not-os2 */
# define RTThreadPoke RT_MANGLER(RTThreadPoke) /* not-win not-os2 */
# define RTThreadPreemptDisable RT_MANGLER(RTThreadPreemptDisable) /* r0drv */
# define RTThreadPreemptIsEnabled RT_MANGLER(RTThreadPreemptIsEnabled) /* r0drv */
diff --git a/include/iprt/thread.h b/include/iprt/thread.h
index d4d504c..49013eb 100644
--- a/include/iprt/thread.h
+++ b/include/iprt/thread.h
@@ -555,6 +555,12 @@ RTDECL(int) RTThreadPoke(RTTHREAD hThread);
*/
RTDECL(int) RTThreadControlPokeSignal(RTTHREAD hThread, bool fEnable);
+/**
+ * Returns the signal that is used to poke threads.
+ *
+ * @returns a signal number or -1.
+ */
+RTDECL(int) RTThreadPokeSignal(void);
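+
+/*
+ * Illustrative (hypothetical) usage: blocking the poke signal in a thread
+ * that must not be kicked:
+ *
+ *     int iSignal = RTThreadPokeSignal();
+ *     if (iSignal != -1)
+ *     {
+ *         sigset_t SigSet;
+ *         sigemptyset(&SigSet);
+ *         sigaddset(&SigSet, iSignal);
+ *         pthread_sigmask(SIG_BLOCK, &SigSet, NULL);
+ *     }
+ */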
# ifdef IN_RING0
diff --git a/include/iprt/x86.h b/include/iprt/x86.h
index 8b7ecd2..3ee9197 100644
--- a/include/iprt/x86.h
+++ b/include/iprt/x86.h
@@ -682,6 +682,8 @@ typedef const X86CPUIDFEATEDX *PCX86CPUIDFEATEDX;
#define X86_CPUID_STEXT_FEATURE_EBX_SMAP RT_BIT_32(20)
/** EBX Bit 23 - CLFLUSHOPT - Supports CLFLUSHOPT (Cache Line Flush). */
#define X86_CPUID_STEXT_FEATURE_EBX_CLFLUSHOPT RT_BIT_32(23)
+/** EBX Bit 24 - CLWB - Supports CLWB (Cache Line Write Back). */
+#define X86_CPUID_STEXT_FEATURE_EBX_CLWB RT_BIT_32(24)
/** EBX Bit 25 - INTEL_PT - Supports Intel Processor Trace. */
#define X86_CPUID_STEXT_FEATURE_EBX_INTEL_PT RT_BIT_32(25)
/** EBX Bit 26 - AVX512PF - Supports AVX512PF. */
@@ -703,6 +705,8 @@ typedef const X86CPUIDFEATEDX *PCX86CPUIDFEATEDX;
#define X86_CPUID_STEXT_FEATURE_ECX_OSPKE RT_BIT_32(4)
/** ECX Bit 7 - CET_SS - Supports CET shadow stack features. */
#define X86_CPUID_STEXT_FEATURE_ECX_CET_SS RT_BIT_32(7)
+/** ECX Bit 8 - GFNI - Supports Galois Field instructions. */
+#define X86_CPUID_STEXT_FEATURE_ECX_GFNI RT_BIT_32(8)
/** ECX Bits 17-21 - MAWAU - Value used by BNDLDX and BNDSTX. */
#define X86_CPUID_STEXT_FEATURE_ECX_MAWAU UINT32_C(0x003e0000)
/** ECX Bit 22 - RDPID - Support pread process ID. */
@@ -710,6 +714,8 @@ typedef const X86CPUIDFEATEDX *PCX86CPUIDFEATEDX;
/** ECX Bit 30 - SGX_LC - Supports SGX launch configuration. */
#define X86_CPUID_STEXT_FEATURE_ECX_SGX_LC RT_BIT_32(30)
+/** EDX Bit 4 - FSRM - Supports Fast Short REP MOVSB. */
+#define X86_CPUID_STEXT_FEATURE_EDX_FSRM RT_BIT_32(4)
/** EDX bit 9 - SRBDS_CTRL - (Special Register Buffer Data Sample Control)
* Supports IA32_MCU_OPT_CTRL and IA32_MCU_OPT_CTRL.RNGDS_MITG_DIS. */
#define X86_CPUID_STEXT_FEATURE_EDX_SRBDS_CTRL RT_BIT_32(9)
@@ -717,6 +723,8 @@ typedef const X86CPUIDFEATEDX *PCX86CPUIDFEATEDX;
#define X86_CPUID_STEXT_FEATURE_EDX_MD_CLEAR RT_BIT_32(10)
/** EDX Bit 11 - TSX_FORCE_ABORT - Supports for IA32_TSX_FORCE_ABORT MSR. */
#define X86_CPUID_STEXT_FEATURE_EDX_TSX_FORCE_ABORT RT_BIT_32(11)
+/** EDX Bit 14 - SERIALIZE - Supports the SERIALIZE CPU instruction. */
+#define X86_CPUID_STEXT_FEATURE_EDX_SERIALIZE RT_BIT_32(14)
/** EDX Bit 20 - CET_IBT - Supports CET indirect branch tracking features. */
#define X86_CPUID_STEXT_FEATURE_EDX_CET_IBT RT_BIT_32(20)
/** EDX Bit 26 - IBRS & IBPB - Supports the IBRS flag in IA32_SPEC_CTRL and
diff --git a/src/VBox/Devices/PC/DevACPI.cpp b/src/VBox/Devices/PC/DevACPI.cpp
index 50e1ca6..4efb074 100644
--- a/src/VBox/Devices/PC/DevACPI.cpp
+++ b/src/VBox/Devices/PC/DevACPI.cpp
@@ -814,7 +814,11 @@ struct ACPITBLISO
uint16_t u16Flags; /**< MPS INTI flags Global */
};
AssertCompileSize(ACPITBLISO, 10);
-#define NUMBER_OF_IRQ_SOURCE_OVERRIDES 2
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+#define NUMBER_OF_IRQ_SOURCE_OVERRIDES (10)
+#else
+#define NUMBER_OF_IRQ_SOURCE_OVERRIDES (2)
+#endif
/** HPET Descriptor Structure */
struct ACPITBLHPET
@@ -3319,8 +3323,73 @@ static void acpiR3SetupMadt(PPDMDEVINS pDevIns, PACPISTATE pThis, RTGCPHYS32 add
isos[1].u8Bus = 0; /* Must be 0 */
isos[1].u8Source = 9; /* IRQ9 */
isos[1].u32GSI = 9; /* connected to pin 9 */
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+ isos[1].u16Flags = 0xd; /* active high, level triggered */
+#else
isos[1].u16Flags = 0xf; /* active low, level triggered */
+#endif
+
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+ isos[2].u8Type = 2;
+ isos[2].u8Length = sizeof(ACPITBLISO);
+ isos[2].u8Bus = 0; /* Must be 0 */
+ isos[2].u8Source = 16; /* IRQ16 */
+ isos[2].u32GSI = 16; /* connected to pin 16 */
+ isos[2].u16Flags = 0xd; /* active high, level triggered */
+
+ isos[3].u8Type = 2;
+ isos[3].u8Length = sizeof(ACPITBLISO);
+ isos[3].u8Bus = 0; /* Must be 0 */
+ isos[3].u8Source = 17; /* IRQ17 */
+ isos[3].u32GSI = 17; /* connected to pin 17 */
+ isos[3].u16Flags = 0xd; /* active high, level triggered */
+
+ isos[4].u8Type = 2;
+ isos[4].u8Length = sizeof(ACPITBLISO);
+ isos[4].u8Bus = 0; /* Must be 0 */
+ isos[4].u8Source = 18; /* IRQ18 */
+ isos[4].u32GSI = 18; /* connected to pin 18 */
+ isos[4].u16Flags = 0xd; /* active high, level triggered */
+
+ isos[5].u8Type = 2;
+ isos[5].u8Length = sizeof(ACPITBLISO);
+ isos[5].u8Bus = 0; /* Must be 0 */
+ isos[5].u8Source = 19; /* IRQ19 */
+ isos[5].u32GSI = 19; /* connected to pin 19 */
+ isos[5].u16Flags = 0xd; /* active high, level triggered */
+
+ isos[6].u8Type = 2;
+ isos[6].u8Length = sizeof(ACPITBLISO);
+ isos[6].u8Bus = 0; /* Must be 0 */
+ isos[6].u8Source = 20; /* IRQ20 */
+ isos[6].u32GSI = 20; /* connected to pin 20 */
+ isos[6].u16Flags = 0xd; /* active high, level triggered */
+
+ isos[7].u8Type = 2;
+ isos[7].u8Length = sizeof(ACPITBLISO);
+ isos[7].u8Bus = 0; /* Must be 0 */
+ isos[7].u8Source = 21; /* IRQ21 */
+ isos[7].u32GSI = 21; /* connected to pin 21 */
+ isos[7].u16Flags = 0xd; /* active high, level triggered */
+
+ isos[8].u8Type = 2;
+ isos[8].u8Length = sizeof(ACPITBLISO);
+ isos[8].u8Bus = 0; /* Must be 0 */
+ isos[8].u8Source = 22; /* IRQ22 */
+ isos[8].u32GSI = 22; /* connected to pin 22 */
+ isos[8].u16Flags = 0xd; /* active high, level triggered */
+
+ isos[9].u8Type = 2;
+ isos[9].u8Length = sizeof(ACPITBLISO);
+ isos[9].u8Bus = 0; /* Must be 0 */
+ isos[9].u8Source = 23; /* IRQ23 */
+ isos[9].u32GSI = 23; /* connected to pin 23 */
+ isos[9].u16Flags = 0xd; /* active high, level triggered */
+
+ Assert(NUMBER_OF_IRQ_SOURCE_OVERRIDES == 10);
+#else
Assert(NUMBER_OF_IRQ_SOURCE_OVERRIDES == 2);
+#endif
madt.header_addr()->u8Checksum = acpiR3Checksum(madt.data(), madt.size());
acpiR3PhysCopy(pDevIns, addr, madt.data(), madt.size());
diff --git a/src/VBox/Devices/PC/DevIoApic.cpp b/src/VBox/Devices/PC/DevIoApic.cpp
index 2dd37c2..796b539 100644
--- a/src/VBox/Devices/PC/DevIoApic.cpp
+++ b/src/VBox/Devices/PC/DevIoApic.cpp
@@ -32,6 +32,14 @@
#define LOG_GROUP LOG_GROUP_DEV_IOAPIC
#include <VBox/log.h>
#include <VBox/vmm/hm.h>
+
+#ifdef VBOX_WITH_KVM
+#include <VBox/vmm/nem.h>
+#ifdef IN_RING3
+#include <vector>
+#endif
+#endif
+
#include <VBox/msi.h>
#include <VBox/pci.h>
#include <VBox/vmm/pdmdev.h>
@@ -40,7 +48,6 @@
#include <iprt/x86.h>
#include <iprt/string.h>
-
/*********************************************************************************************************************************
* Defined Constants And Macros *
*********************************************************************************************************************************/
@@ -68,6 +75,10 @@ Controller" */
/** The number of interrupt input pins. */
#define IOAPIC_NUM_INTR_PINS 24
+
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM)
+AssertCompile(IOAPIC_NUM_INTR_PINS == KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS);
+#endif
/** Maximum redirection entires. */
#define IOAPIC_MAX_RTE_INDEX (IOAPIC_NUM_INTR_PINS - 1)
/** Reduced RTEs used by SIO.A (82379AB). */
@@ -340,6 +351,19 @@ typedef struct IOAPIC
#endif
/** Per-vector stats. */
STAMCOUNTER aStatVectors[256];
+
+#if defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL) && defined(IN_RING3)
+ /** Handle to the timer that is used for delayed IRQ injection */
+ TMTIMERHANDLE hIoapicDelayedInjectionHandler;
+
+ /** List of PINs that need delayed injection handling, protected by IOAPIC_LOCK */
+ std::vector<uint8_t> delayed_interrupt_list;
+
+    /** A per-GSI counter that is increased whenever a level-triggered interrupt is
+ instantly pending following an EOI. The counter is reset to zero when no
+ interrupt is pending following an EOI. */
+ uint64_t gsi_counter[IOAPIC_NUM_INTR_PINS] {};
+#endif
} IOAPIC;
AssertCompileMemberAlignment(IOAPIC, au64RedirTable, 8);
/** Pointer to shared IOAPIC data. */
@@ -572,6 +596,35 @@ DECLINLINE(void) ioapicGetMsiFromRte(uint64_t u64Rte, IOAPICTYPE enmType, PMSIMS
#endif
+static bool handlePossibleInterruptStorm(PPDMDEVINS pDevIns, PIOAPIC pThis, unsigned idxRte)
+{
+#if defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL) && defined(IN_RING3)
+
+    /* There are buggy drivers that do not clear all interrupt conditions before sending an EOI to the IOAPIC.
+       On real HW, such drivers still make slow forward progress because the IOAPIC needs a few cycles before
+       the next interrupt is injected after an EOI. If we detect this situation, delay the interrupt and give
+       the guest driver the opportunity to fix this mess. */
+
+ static constexpr uint64_t NUM_EXCESSIVE_INTERRUPTS {10000};
+ if (++pThis->gsi_counter[idxRte] == NUM_EXCESSIVE_INTERRUPTS) {
+ LogRel(("Interrupt storm on GSI %d, delaying injection\n", idxRte));
+
+ // Reset our counter so the next injection of this GSI succeeds.
+ pThis->gsi_counter[idxRte] = 0;
+
+ // Remember which GSI we have to raise after our delay.
+ pThis->delayed_interrupt_list.push_back(idxRte);
+
+ // Arm the delayed injection handler.
+ PDMDevHlpTimerSetMillies(pDevIns, pThis->hIoapicDelayedInjectionHandler, 100 /* ms */);
+ return true;
+ }
+#else
+ NOREF(pDevIns); NOREF(pThis); NOREF(idxRte);
+#endif
+
+ return false;
+}
/**
* Signals the next pending interrupt for the specified Redirection Table Entry
* (RTE).
@@ -608,6 +661,10 @@ static void ioapicSignalIntrForRte(PPDMDEVINS pDevIns, PIOAPIC pThis, PIOAPICCC
STAM_COUNTER_INC(&pThis->StatSuppressedLevelIntr);
return;
}
+
+ if (handlePossibleInterruptStorm(pDevIns, pThis, idxRte)) {
+ return;
+ }
}
XAPICINTR ApicIntr;
@@ -655,6 +712,11 @@ static void ioapicSignalIntrForRte(PPDMDEVINS pDevIns, PIOAPIC pThis, PIOAPICCC
}
#endif
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+ AssertReleaseMsg(rcRemap == VERR_IOMMU_NOT_PRESENT || rcRemap == VERR_IOMMU_CANNOT_CALL_SELF,
+ ("Interrupt remapping not supported yet."));
+ int rc = pThisCC->pIoApicHlp->pfnKvmSplitIrqchipDeliverMsi(pDevIns, &MsiIn);
+#else
uint32_t const u32TagSrc = pThis->au32TagSrc[idxRte];
Log2(("IOAPIC: Signaling %s-triggered interrupt. Dest=%#x DestMode=%s Vector=%#x (%u)\n",
ApicIntr.u8TriggerMode == IOAPIC_RTE_TRIGGER_MODE_EDGE ? "edge" : "level", ApicIntr.u8Dest,
@@ -672,6 +734,7 @@ static void ioapicSignalIntrForRte(PPDMDEVINS pDevIns, PIOAPIC pThis, PIOAPICCC
ApicIntr.u8Polarity,
ApicIntr.u8TriggerMode,
u32TagSrc);
+#endif
/* Can't reschedule to R3. */
Assert(rc == VINF_SUCCESS || rc == VERR_APIC_INTR_DISCARDED);
#ifdef DEBUG_ramshankar
@@ -781,6 +844,16 @@ static VBOXSTRICTRC ioapicSetRedirTableEntry(PPDMDEVINS pDevIns, PIOAPIC pThis,
LogFlow(("IOAPIC: ioapicSetRedirTableEntry: uIndex=%#RX32 idxRte=%u uValue=%#RX32\n", uIndex, idxRte, uValue));
+#if defined(VBOX_WITH_KVM) && defined(IN_RING3) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+ const uint64_t u64RteNew { pThis->au64RedirTable[idxRte] };
+ if (not IOAPIC_RTE_IS_MASKED(u64RteNew)) {
+ MSIMSG msi;
+ RT_ZERO(msi);
+ ioapicGetMsiFromRte(u64RteNew, pThis->enmType, &msi);
+ rc = pThisCC->pIoApicHlp->pfnKvmSplitIrqchipAddUpdateRTE(pDevIns, idxRte, &msi);
+ }
+#endif
+
/*
* Signal the next pending interrupt for this RTE.
*/
@@ -790,7 +863,6 @@ static VBOXSTRICTRC ioapicSetRedirTableEntry(PPDMDEVINS pDevIns, PIOAPIC pThis,
LogFlow(("IOAPIC: ioapicSetRedirTableEntry: Signalling pending interrupt. idxRte=%u\n", idxRte));
ioapicSignalIntrForRte(pDevIns, pThis, pThisCC, idxRte);
}
-
IOAPIC_UNLOCK(pDevIns, pThis, pThisCC);
}
else
@@ -947,6 +1019,15 @@ static DECLCALLBACK(void) ioapicSetIrq(PPDMDEVINS pDevIns, PCIBDF uBusDevFn, int
PIOAPIC pThis = PDMDEVINS_2_DATA(pDevIns, PIOAPIC);
PIOAPICCC pThisCC = PDMDEVINS_2_DATA_CC(pDevIns, PIOAPICCC);
LogFlow(("IOAPIC: ioapicSetIrq: iIrq=%d iLevel=%d uTagSrc=%#x\n", iIrq, iLevel, uTagSrc));
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+ pThisCC->pIoApicHlp->pfnKvmSetIrqLine(pDevIns, iIrq, iLevel & PDM_IRQ_LEVEL_HIGH);
+
+ if ((iLevel & PDM_IRQ_LEVEL_FLIP_FLOP) == PDM_IRQ_LEVEL_FLIP_FLOP) {
+ pThisCC->pIoApicHlp->pfnKvmSetIrqLine(pDevIns, iIrq, PDM_IRQ_LEVEL_LOW);
+ }
+
+ return;
+#endif
STAM_COUNTER_INC(&pThis->CTX_SUFF_Z(StatSetIrq));
@@ -969,6 +1050,9 @@ static DECLCALLBACK(void) ioapicSetIrq(PPDMDEVINS pDevIns, PCIBDF uBusDevFn, int
#endif
if (!fActive)
{
+#if defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL) && defined(IN_RING3)
+ pThis->gsi_counter[idxRte] = 0;
+#endif
pThis->uIrr &= ~uPinMask;
pThis->au32TagSrc[idxRte] = 0;
IOAPIC_UNLOCK(pDevIns, pThis, pThisCC);
@@ -1087,7 +1171,11 @@ static DECLCALLBACK(void) ioapicSendMsi(PPDMDEVINS pDevIns, PCIBDF uBusDevFn, PC
#else
NOREF(uBusDevFn);
#endif
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM)
+ int rc = pThisCC->pIoApicHlp->pfnKvmSplitIrqchipDeliverMsi(pDevIns, pMsi);
+ AssertReleaseMsg(rc == VINF_SUCCESS || rc == VERR_APIC_INTR_DISCARDED, ("ioapicSendMsi: Could not deliver MSI! error %d\n", rc));
+#else
ioapicGetApicIntrFromMsi(pMsi, &ApicIntr);
/*
@@ -1105,6 +1193,7 @@ static DECLCALLBACK(void) ioapicSendMsi(PPDMDEVINS pDevIns, PCIBDF uBusDevFn, PC
uTagSrc);
/* Can't reschedule to R3. */
Assert(rc == VINF_SUCCESS || rc == VERR_APIC_INTR_DISCARDED); NOREF(rc);
+#endif
}
@@ -1451,10 +1540,33 @@ static DECLCALLBACK(void) ioapicR3DbgInfo(PPDMDEVINS pDevIns, PCDBGFINFOHLP pHlp
*/
static DECLCALLBACK(int) ioapicR3SaveExec(PPDMDEVINS pDevIns, PSSMHANDLE pSSM)
{
- PCIOAPIC pThis = PDMDEVINS_2_DATA(pDevIns, PCIOAPIC);
+ PIOAPIC pThis = PDMDEVINS_2_DATA(pDevIns, PIOAPIC);
PCPDMDEVHLPR3 pHlp = pDevIns->pHlpR3;
LogFlow(("IOAPIC: ioapicR3SaveExec\n"));
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+ PIOAPICCC pThisCC = PDMDEVINS_2_DATA_CC(pDevIns, PIOAPICCC);
+ KVMIOAPICSTATE kvm_ioapic_state;
+
+    /* There is only one I/O APIC; a single state fetch suffices (a
+       master/slave loop only makes sense for the PIC). */
+    int rcKvm = pThisCC->pIoApicHlp->pfnKvmGetIoApicState(pDevIns, &kvm_ioapic_state);
+    AssertLogRelMsg(RT_SUCCESS(rcKvm), ("Unable to retrieve I/O APIC state from KVM"));
+
+    /*
+     * There's no need to look at kvm_ioapic_state.base_address because
+     * VBox does not support IOAPIC relocation, thus, it will always be
+     * at IOAPIC_MMIO_BASE_PHYSADDR.
+     */
+    pThis->uIrr = kvm_ioapic_state.irr;
+    pThis->u8Id = kvm_ioapic_state.id;
+    pThis->u8Index = kvm_ioapic_state.ioregsel;
+
+    for (uint8_t idxRte = 0; idxRte < RT_ELEMENTS(pThis->au64RedirTable); idxRte++) {
+        pThis->au64RedirTable[idxRte] = kvm_ioapic_state.redirtbl[idxRte];
+    }
+#endif
+
pHlp->pfnSSMPutU32(pSSM, pThis->uIrr);
pHlp->pfnSSMPutU8(pSSM, pThis->u8Id);
pHlp->pfnSSMPutU8(pSSM, pThis->u8Index);
@@ -1497,6 +1609,39 @@ static DECLCALLBACK(int) ioapicR3LoadExec(PPDMDEVINS pDevIns, PSSMHANDLE pSSM, u
for (uint8_t idxRte = 0; idxRte < RT_ELEMENTS(pThis->au64RedirTable); idxRte++)
pHlp->pfnSSMGetU64(pSSM, &pThis->au64RedirTable[idxRte]);
+#if defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+ PIOAPICCC pThisCC = PDMDEVINS_2_DATA_CC(pDevIns, PIOAPICCC);
+ for (uint8_t idxRte = 0; idxRte < RT_ELEMENTS(pThis->au64RedirTable); idxRte++) {
+ const uint64_t u64RteNew { pThis->au64RedirTable[idxRte] };
+ if (not IOAPIC_RTE_IS_MASKED(u64RteNew) and (IOAPIC_RTE_GET_TRIGGER_MODE(u64RteNew) != IOAPIC_RTE_TRIGGER_MODE_EDGE)) {
+ MSIMSG msi;
+ RT_ZERO(msi);
+ ioapicGetMsiFromRte(u64RteNew, pThis->enmType, &msi);
+ int rc = pThisCC->pIoApicHlp->pfnKvmSplitIrqchipAddUpdateRTE(pDevIns, idxRte, &msi);
+ AssertLogRelMsg(RT_SUCCESS(rc), ("Adding redirection table entry failed."));
+ }
+ }
+#endif
+
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+ PIOAPICCC pThisCC = PDMDEVINS_2_DATA_CC(pDevIns, PIOAPICCC);
+ KVMIOAPICSTATE kvm_ioapic_state;
+
+    /* There is only one I/O APIC; a single state push suffices. */
+    kvm_ioapic_state.base_address = IOAPIC_MMIO_BASE_PHYSADDR;
+    kvm_ioapic_state.irr = pThis->uIrr;
+    kvm_ioapic_state.id = pThis->u8Id;
+    kvm_ioapic_state.ioregsel = pThis->u8Index;
+
+    for (uint8_t idxRte = 0; idxRte < RT_ELEMENTS(pThis->au64RedirTable); idxRte++) {
+        kvm_ioapic_state.redirtbl[idxRte] = pThis->au64RedirTable[idxRte];
+    }
+
+    int rcKvm = pThisCC->pIoApicHlp->pfnKvmSetIoApicState(pDevIns, &kvm_ioapic_state);
+    AssertLogRelMsg(RT_SUCCESS(rcKvm), ("Unable to push I/O APIC state to KVM"));
+#endif
+
if (uVersion > IOAPIC_SAVED_STATE_VERSION_NO_FLIPFLOP_MAP)
for (uint8_t idx = 0; idx < RT_ELEMENTS(pThis->bmFlipFlop); idx++)
pHlp->pfnSSMGetU64(pSSM, &pThis->bmFlipFlop[idx]);
@@ -1525,6 +1670,10 @@ static DECLCALLBACK(void) ioapicR3Reset(PPDMDEVINS pDevIns)
{
pThis->au64RedirTable[idxRte] = IOAPIC_RTE_MASK;
pThis->au32TagSrc[idxRte] = 0;
+#if defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+ int rc = pThisCC->pIoApicHlp->pfnKvmSplitIrqchipRemoveRTE(pDevIns, idxRte);
+ AssertLogRelMsg(RT_SUCCESS(rc), ("Removing redirection table entry failed."));
+#endif
}
IOAPIC_UNLOCK(pDevIns, pThis, pThisCC);
@@ -1552,6 +1701,10 @@ static DECLCALLBACK(int) ioapicR3Destruct(PPDMDEVINS pDevIns)
PIOAPIC pThis = PDMDEVINS_2_DATA(pDevIns, PIOAPIC);
LogFlow(("IOAPIC: ioapicR3Destruct: pThis=%p\n", pThis));
+#if defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL) && defined(IN_RING3)
+ PDMDevHlpTimerDestroy(pDevIns, pThis->hIoapicDelayedInjectionHandler);
+#endif
+
# ifndef IOAPIC_WITH_PDM_CRITSECT
/*
* Destroy the RTE critical section.
@@ -1565,6 +1718,26 @@ static DECLCALLBACK(int) ioapicR3Destruct(PPDMDEVINS pDevIns)
return VINF_SUCCESS;
}
+static DECLCALLBACK(void) ioapicDelayedInjectionHandler(PPDMDEVINS pDevIns, TMTIMERHANDLE hTimer, void *pvUser)
+{
+ NOREF(hTimer); NOREF(pvUser);
+#if defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL) && defined(IN_RING3)
+ PIOAPIC pThis = PDMDEVINS_2_DATA(pDevIns, PIOAPIC);
+ PIOAPICCC pThisCC = PDMDEVINS_2_DATA_CC(pDevIns, PIOAPICCC);
+
+ IOAPIC_LOCK(pDevIns, pThis, pThisCC, VERR_IGNORED);
+
+    for (auto iPin : pThis->delayed_interrupt_list) {
+ ioapicSignalIntrForRte(pDevIns, pThis, pThisCC, iPin);
+ }
+
+ pThis->delayed_interrupt_list.clear();
+
+ IOAPIC_UNLOCK(pDevIns, pThis, pThisCC);
+#else
+ NOREF(pDevIns);
+#endif
+}
/**
* @interface_method_impl{PDMDEVREG,pfnConstruct}
@@ -1578,6 +1751,12 @@ static DECLCALLBACK(int) ioapicR3Construct(PPDMDEVINS pDevIns, int iInstance, PC
LogFlow(("IOAPIC: ioapicR3Construct: pThis=%p iInstance=%d\n", pThis, iInstance));
Assert(iInstance == 0); RT_NOREF(iInstance);
+#if defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL) && defined(IN_RING3)
+ int rc_timer = PDMDevHlpTimerCreate(pDevIns, TMCLOCK_VIRTUAL, ioapicDelayedInjectionHandler, pThis,
+ TMTIMER_FLAGS_NO_CRIT_SECT | TMTIMER_FLAGS_NO_RING0, "IOAPIC Delayed IRQ", &pThis->hIoapicDelayedInjectionHandler);
+ AssertRCReturn(rc_timer, rc_timer);
+#endif
+
/*
* Validate and read the configuration.
*/
diff --git a/src/VBox/Devices/PC/DevPIC.cpp b/src/VBox/Devices/PC/DevPIC.cpp
index 4ad8d83..651b706 100644
--- a/src/VBox/Devices/PC/DevPIC.cpp
+++ b/src/VBox/Devices/PC/DevPIC.cpp
@@ -366,6 +366,16 @@ static DECLCALLBACK(void) picSetIrq(PPDMDEVINS pDevIns, int iIrq, int iLevel, ui
{
PDEVPIC pThis = PDMDEVINS_2_DATA(pDevIns, PDEVPIC);
PDEVPICCC pThisCC = PDMDEVINS_2_DATA_CC(pDevIns, PDEVPICCC);
+
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+ pThisCC->pPicHlp->pfnKvmSetIrqLine(pDevIns, iIrq, iLevel & PDM_IRQ_LEVEL_HIGH);
+
+ if ((iLevel & PDM_IRQ_LEVEL_FLIP_FLOP) == PDM_IRQ_LEVEL_FLIP_FLOP) {
+ pThisCC->pPicHlp->pfnKvmSetIrqLine(pDevIns, iIrq, PDM_IRQ_LEVEL_LOW);
+ }
+
+ return;
+#else
AssertMsgReturnVoid(iIrq < 16, ("iIrq=%d\n", iIrq));
Log(("picSetIrq %d %d\n", iIrq, iLevel));
@@ -383,6 +393,7 @@ static DECLCALLBACK(void) picSetIrq(PPDMDEVINS pDevIns, int iIrq, int iLevel, ui
}
pic_set_irq1(&RT_SAFE_SUBSCRIPT(pThis->aPics, iIrq >> 3), iIrq & 7, iLevel & PDM_IRQ_LEVEL_HIGH, uTagSrc);
pic_update_irq(pDevIns, pThis, pThisCC);
+#endif
}
@@ -830,6 +841,33 @@ static DECLCALLBACK(int) picR3SaveExec(PPDMDEVINS pDevIns, PSSMHANDLE pSSM)
PDEVPIC pThis = PDMDEVINS_2_DATA(pDevIns, PDEVPIC);
PCPDMDEVHLPR3 pHlp = pDevIns->pHlpR3;
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+ PDEVPICCC pThisCC = PDMDEVINS_2_DATA_CC(pDevIns, PDEVPICCC);
+ KVMPICSTATE kvm_pic_state;
+
+ for (unsigned pic = 0; pic < 2; ++pic) {
+ int rc = pThisCC->pPicHlp->pfnKvmGetPicState(pDevIns, pic == 0 ? KVMIRQCHIP::PIC_MASTER : KVMIRQCHIP::PIC_SLAVE, &kvm_pic_state);
+ AssertLogRelMsg(RT_SUCCESS(rc), ("Unable to retrieve PIC state from KVM"));
+
+ pThis->aPics[pic].last_irr = kvm_pic_state.last_irr;
+ pThis->aPics[pic].irr = kvm_pic_state.irr;
+ pThis->aPics[pic].imr = kvm_pic_state.imr;
+ pThis->aPics[pic].isr = kvm_pic_state.isr;
+ pThis->aPics[pic].priority_add = kvm_pic_state.priority_add;
+ pThis->aPics[pic].irq_base = kvm_pic_state.irq_base;
+ pThis->aPics[pic].read_reg_select = kvm_pic_state.read_reg_select;
+ pThis->aPics[pic].poll = kvm_pic_state.poll;
+ pThis->aPics[pic].special_mask = kvm_pic_state.special_mask;
+ pThis->aPics[pic].init_state = kvm_pic_state.init_state;
+ pThis->aPics[pic].auto_eoi = kvm_pic_state.auto_eoi;
+ pThis->aPics[pic].rotate_on_auto_eoi = kvm_pic_state.rotate_on_auto_eoi;
+ pThis->aPics[pic].special_fully_nested_mode = kvm_pic_state.special_fully_nested_mode;
+ pThis->aPics[pic].init4 = kvm_pic_state.init4;
+ pThis->aPics[pic].elcr = kvm_pic_state.elcr;
+ pThis->aPics[pic].elcr_mask = kvm_pic_state.elcr_mask;
+ }
+#endif
+
for (unsigned i = 0; i < RT_ELEMENTS(pThis->aPics); i++)
{
pHlp->pfnSSMPutU8(pSSM, pThis->aPics[i].last_irr);
@@ -883,6 +921,33 @@ static DECLCALLBACK(int) picR3LoadExec(PPDMDEVINS pDevIns, PSSMHANDLE pSSM, uint
pHlp->pfnSSMGetU8(pSSM, &pThis->aPics[i].elcr);
}
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+ PDEVPICCC pThisCC = PDMDEVINS_2_DATA_CC(pDevIns, PDEVPICCC);
+ KVMPICSTATE kvm_pic_state;
+
+ for (unsigned pic = 0; pic < 2; ++pic) {
+ kvm_pic_state.last_irr = pThis->aPics[pic].last_irr;
+ kvm_pic_state.irr = pThis->aPics[pic].irr;
+ kvm_pic_state.imr = pThis->aPics[pic].imr;
+ kvm_pic_state.isr = pThis->aPics[pic].isr;
+ kvm_pic_state.priority_add = pThis->aPics[pic].priority_add;
+ kvm_pic_state.irq_base = pThis->aPics[pic].irq_base;
+ kvm_pic_state.read_reg_select = pThis->aPics[pic].read_reg_select;
+ kvm_pic_state.poll = pThis->aPics[pic].poll;
+ kvm_pic_state.special_mask = pThis->aPics[pic].special_mask;
+ kvm_pic_state.init_state = pThis->aPics[pic].init_state;
+ kvm_pic_state.auto_eoi = pThis->aPics[pic].auto_eoi;
+ kvm_pic_state.rotate_on_auto_eoi = pThis->aPics[pic].rotate_on_auto_eoi;
+ kvm_pic_state.special_fully_nested_mode = pThis->aPics[pic].special_fully_nested_mode;
+ kvm_pic_state.init4 = pThis->aPics[pic].init4;
+ kvm_pic_state.elcr = pThis->aPics[pic].elcr;
+ kvm_pic_state.elcr_mask = pThis->aPics[pic].elcr_mask;
+
+ int rc = pThisCC->pPicHlp->pfnKvmSetPicState(pDevIns, pic == 0 ? KVMIRQCHIP::PIC_MASTER : KVMIRQCHIP::PIC_SLAVE, &kvm_pic_state);
+ AssertLogRelMsg(RT_SUCCESS(rc), ("Unable to push PIC state to KVM"));
+ }
+#endif
+
/* Note! PDM will restore the VMCPU_FF_INTERRUPT_PIC state. */
return VINF_SUCCESS;
}
diff --git a/src/VBox/HostDrivers/Support/Makefile.kmk b/src/VBox/HostDrivers/Support/Makefile.kmk
index 48a28d3..d4032db 100644
--- a/src/VBox/HostDrivers/Support/Makefile.kmk
+++ b/src/VBox/HostDrivers/Support/Makefile.kmk
@@ -196,6 +196,7 @@ SUPR3_DEFS = \
$(if $(VBOX_WITH_RAW_MODE),VBOX_WITH_RAW_MODE,) \
$(if $(VBOX_WITH_DRIVERLESS_NEM_FALLBACK),VBOX_WITH_DRIVERLESS_NEM_FALLBACK,) \
$(if $(VBOX_WITH_R0_MODULES),VBOX_WITH_R0_MODULES,) \
+ $(if $(VBOX_WITH_PREALLOC_RAM_BY_DEFAULT),VBOX_WITH_PREALLOC_RAM_BY_DEFAULT,) \
VBOX_PERMIT_MORE \
VBOX_PERMIT_EVEN_MORE
SUPR3_INCS := $(PATH_SUB_CURRENT)
diff --git a/src/VBox/HostDrivers/Support/linux/SUPLib-linux.cpp b/src/VBox/HostDrivers/Support/linux/SUPLib-linux.cpp
index 2591661..3b8a6e2 100644
--- a/src/VBox/HostDrivers/Support/linux/SUPLib-linux.cpp
+++ b/src/VBox/HostDrivers/Support/linux/SUPLib-linux.cpp
@@ -96,6 +96,11 @@ DECLHIDDEN(int) suplibOsInit(PSUPLIBDATA pThis, bool fPreInited, uint32_t fFlags
return VINF_SUCCESS;
Assert(pThis->hDevice == (intptr_t)NIL_RTFILE);
+#ifdef VBOX_WITH_KVM
+ pThis->fDriverless = true;
+ return VINF_SUCCESS;
+#endif
+
/*
* Check if madvise works.
*/
@@ -256,10 +261,15 @@ DECLHIDDEN(int) suplibOsPageAlloc(PSUPLIBDATA pThis, size_t cPages, uint32_t fFl
fMmap |= MAP_HUGETLB;
#endif
+#ifdef VBOX_WITH_PREALLOC_RAM_BY_DEFAULT
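+    /* MAP_POPULATE makes mmap(2) pre-fault the whole mapping, i.e. guest RAM
+       is allocated up front rather than on first touch. */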
+ fMmap |= MAP_POPULATE;
+#endif
+
uint32_t const cbPage = SUP_PAGE_SIZE;
uint32_t const cPageShift = SUP_PAGE_SHIFT;
size_t cbMmap = cPages << cPageShift;
+
if ( !pThis->fSysMadviseWorks
&& (fFlags & (SUP_PAGE_ALLOC_F_FOR_LOCKING | SUP_PAGE_ALLOC_F_LARGE_PAGES)) == SUP_PAGE_ALLOC_F_FOR_LOCKING)
cbMmap += cbPage * 2;
diff --git a/src/VBox/Main/Makefile.kmk b/src/VBox/Main/Makefile.kmk
index 2aeba32..a44cb53 100644
--- a/src/VBox/Main/Makefile.kmk
+++ b/src/VBox/Main/Makefile.kmk
@@ -1114,7 +1114,8 @@ if !defined(VBOX_ONLY_SDK) && !defined(VBOX_ONLY_EXTPACKS) # Note this goes on f
VBoxC_LIBS += \
- $(PATH_STAGE_LIB)/VBoxAPIWrap$(VBOX_SUFF_LIB)
+ $(PATH_STAGE_LIB)/VBoxAPIWrap$(VBOX_SUFF_LIB)
+
VBoxC_LIBS.win += \
$(PATH_SDK_$(VBOX_WINPSDK)_LIB)/psapi.lib \
$(PATH_TOOL_$(VBOX_VCC_TOOL)_LIB)/delayimp.lib
diff --git a/src/VBox/Main/src-server/HostImpl.cpp b/src/VBox/Main/src-server/HostImpl.cpp
index 7cbd92b..2ab5c17 100644
--- a/src/VBox/Main/src-server/HostImpl.cpp
+++ b/src/VBox/Main/src-server/HostImpl.cpp
@@ -82,6 +82,8 @@
# include <errno.h>
# include <net/if.h>
# include <net/if_arp.h>
+# include <fcntl.h>
+# include <unistd.h>
#endif /* RT_OS_LINUX */
#ifdef RT_OS_SOLARIS
diff --git a/src/VBox/Runtime/Makefile.kmk b/src/VBox/Runtime/Makefile.kmk
index f2c2498..22cbed4 100644
--- a/src/VBox/Runtime/Makefile.kmk
+++ b/src/VBox/Runtime/Makefile.kmk
@@ -3307,8 +3307,8 @@ if1of ($(KBUILD_TARGET).$(KBUILD_TARGET_ARCH), win.x86 win.amd64 linux.amd64 dar
$(if-expr "$(KBUILD_TARGET_ARCH)" == "amd64",-e "/not-amd64/d",-e "/only-amd64/d") \
$(if-expr "$(KBUILD_TARGET_ARCH)" == "arm64",-e "/not-arm64/d",-e "/only-arm64/d") \
$(if-expr "$(KBUILD_TARGET).$(KBUILD_TARGET_ARCH)" == "darwin.arm64",, -e "/only-darwin.arm64/d") \
- $(if-expr "$(substr $(if-expr $(KBUILD_TARGET) != 'win',$(VBOX_GCC_std), $(VBOX_VCC_std)),-2)" >= "17" \
- ,-e "/before-noexcept/d", -e "/after-noexcept/d") \
+ $(if-expr "$(VBOX_WITH_KVM)" != "1", $(if-expr "$(substr $(if-expr $(KBUILD_TARGET) != 'win',$(VBOX_GCC_std), $(VBOX_VCC_std)),-2)" >= "17" \
+ ,-e "/before-noexcept/d", -e "/after-noexcept/d"), -e "/after-noexcept/d") \
$(if-expr $(intersects $(KBUILD_TARGET), linux) && $(intersects $(KBUILD_TARGET_ARCH), amd64 arm64) \
,-e "/int64=llong/d", -e "/int64=long/d") \
-f "$<" $(filter %.def, $^)
diff --git a/src/VBox/Runtime/r3/posix/thread-posix.cpp b/src/VBox/Runtime/r3/posix/thread-posix.cpp
index 8b05377..70202f7 100644
--- a/src/VBox/Runtime/r3/posix/thread-posix.cpp
+++ b/src/VBox/Runtime/r3/posix/thread-posix.cpp
@@ -729,6 +729,10 @@ RTDECL(int) RTThreadControlPokeSignal(RTTHREAD hThread, bool fEnable)
return rc;
}
+RTDECL(int) RTThreadPokeSignal(void)
+{
+ return g_iSigPokeThread;
+}
#endif
diff --git a/src/VBox/Runtime/testcase/Makefile.kmk b/src/VBox/Runtime/testcase/Makefile.kmk
index 5fa0a11..6b5821a 100644
--- a/src/VBox/Runtime/testcase/Makefile.kmk
+++ b/src/VBox/Runtime/testcase/Makefile.kmk
@@ -610,6 +610,7 @@ ifdef VBOX_WITH_TESTCASES # The whole file
tstLog_CLEAN = $(tstLog_0_OUTDIR)/tstLogGroups.h
$$(tstLog_0_OUTDIR)/tstLogGroups.h: $(PATH_ROOT)/include/VBox/log.h
$(call MSG_GENERATE,,$@,$<)
+ $(QUIET)$(MKDIR) -p $(tstLog_0_OUTDIR)
$(QUIET)$(RM) -f -- "$@"
$(QUIET)$(SED) -n -e 's/^ *LOG_GROUP_\([A-Z0-9_]*\),.*$(DOLLAR)/{ LOG_GROUP_\1, "\1" },/p' --output "$@" "$<"
endif # !VBOX_ONLY_VALIDATIONKIT
diff --git a/src/VBox/VMM/Makefile.kmk b/src/VBox/VMM/Makefile.kmk
index 6cd7d4e..087819c 100644
--- a/src/VBox/VMM/Makefile.kmk
+++ b/src/VBox/VMM/Makefile.kmk
@@ -147,7 +147,8 @@ VBoxVMM_SOURCES = \
VMMR3/EMR3Nem.cpp \
VMMR3/GCM.cpp \
VMMR3/GIM.cpp \
- VMMR3/GIMHv.cpp \
+ $(if-expr !defined(VBOX_WITH_KVM), VMMR3/GIMHv.cpp,) \
+ $(if-expr defined(VBOX_WITH_KVM), VMMR3/GIMHvOnKvm.cpp,) \
VMMR3/GIMKvm.cpp \
VMMR3/GIMMinimal.cpp \
VMMR3/IEMR3.cpp \
@@ -237,7 +238,8 @@ VBoxVMM_SOURCES = \
VMMAll/EMAll.cpp \
VMMAll/GCMAll.cpp \
VMMAll/GIMAll.cpp \
- VMMAll/GIMAllHv.cpp \
+ $(if-expr !defined(VBOX_WITH_KVM), VMMAll/GIMAllHv.cpp,) \
+ $(if-expr defined(VBOX_WITH_KVM), VMMAll/GIMAllHvOnKvm.cpp,) \
VMMAll/GIMAllKvm.cpp \
VMMAll/TMAll.cpp \
VMMAll/TMAllCpu.cpp \
diff --git a/src/VBox/VMM/VMMAll/APICAll.cpp b/src/VBox/VMM/VMMAll/APICAll.cpp
index 192e824..6a2e63d 100644
--- a/src/VBox/VMM/VMMAll/APICAll.cpp
+++ b/src/VBox/VMM/VMMAll/APICAll.cpp
@@ -2654,6 +2654,16 @@ static DECLCALLBACK(VBOXSTRICTRC) apicSetLocalInterrupt(PVMCPUCC pVCpu, uint8_t
AssertReturn(u8Level <= 1, VERR_INVALID_PARAMETER);
VBOXSTRICTRC rcStrict = VINF_SUCCESS;
+#ifdef VBOX_WITH_KVM
+ /* TODO: Fix the local interrupt handling. See vbox-engineering#430. */
+ if (u8Level) {
+ apicSetInterruptFF(pVCpu, PDMAPICIRQ_EXTINT);
+ } else {
+ apicClearInterruptFF(pVCpu, PDMAPICIRQ_EXTINT);
+ }
+
+ return VINF_SUCCESS;
+#endif
/* If the APIC is enabled, the interrupt is subject to LVT programming. */
if (apicIsEnabled(pVCpu))
diff --git a/src/VBox/VMM/VMMAll/GIMAllHvOnKvm.cpp b/src/VBox/VMM/VMMAll/GIMAllHvOnKvm.cpp
new file mode 100644
index 0000000..f45a2d7
--- /dev/null
+++ b/src/VBox/VMM/VMMAll/GIMAllHvOnKvm.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) Cyberus Technology GmbH.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * SPDX-License-Identifier: GPL-3.0-or-later
+ */
+
+#define LOG_GROUP LOG_GROUP_GIM
+#include <VBox/vmm/dbgf.h>
+#include <VBox/vmm/gim.h>
+#include "GIMInternal.h"
+#include <VBox/vmm/vm.h>
+
+#include <VBox/err.h>
+
+#include <iprt/assert.h>
+
+/**
+ * With GIMHvOnKvm, userspace does not need to do any Hyper-V emulation because
+ * it all happens inside the kernel module. These stubs are merely here to make
+ * GIM.cpp happy.
+ */
+
+VMM_INT_DECL(void) gimHvStartStimer(PVMCPUCC pVCpu, PCGIMHVSTIMER pHvStimer)
+{
+ NOREF(pVCpu); NOREF(pHvStimer);
+ AssertLogRelMsg(false, ("%s", __PRETTY_FUNCTION__));
+}
+
+VMM_INT_DECL(VBOXSTRICTRC) gimHvHypercall(PVMCPUCC pVCpu, PCPUMCTX pCtx)
+{
+ NOREF(pVCpu); NOREF(pCtx);
+ AssertLogRelMsgReturn(false, ("%s", __PRETTY_FUNCTION__), VERR_NOT_SUPPORTED);
+}
+
+VMM_INT_DECL(VBOXSTRICTRC) gimHvHypercallEx(PVMCPUCC pVCpu, PCPUMCTX pCtx, unsigned uDisOpcode, uint8_t cbInstr)
+{
+ NOREF(pVCpu); NOREF(pCtx); NOREF(uDisOpcode); NOREF(cbInstr);
+ AssertLogRelMsgReturn(false, ("%s", __PRETTY_FUNCTION__), VERR_NOT_SUPPORTED);
+}
+
+VMM_INT_DECL(PGIMMMIO2REGION) gimHvGetMmio2Regions(PVM pVM, uint32_t *pcRegions)
+{
+ NOREF(pVM); NOREF(pcRegions);
+ return nullptr;
+}
+
+VMM_INT_DECL(bool) gimHvAreHypercallsEnabled(PCVM pVM)
+{
+ NOREF(pVM);
+ return false;
+}
+
+VMM_INT_DECL(bool) gimHvIsParavirtTscEnabled(PVM pVM)
+{
+ NOREF(pVM);
+ return false;
+}
+
+VMM_INT_DECL(bool) gimHvShouldTrapXcptUD(PVMCPU pVCpu)
+{
+ NOREF(pVCpu);
+ return false;
+}
+
+VMM_INT_DECL(VBOXSTRICTRC) gimHvXcptUD(PVMCPUCC pVCpu, PCPUMCTX pCtx, PDISSTATE pDis, uint8_t *pcbInstr)
+{
+ NOREF(pVCpu); NOREF(pCtx); NOREF(pDis); NOREF(pcbInstr);
+ AssertLogRelMsgReturn(false, ("%s", __PRETTY_FUNCTION__), VERR_NOT_SUPPORTED);
+}
+
+VMM_INT_DECL(VBOXSTRICTRC) gimHvReadMsr(PVMCPUCC pVCpu, uint32_t idMsr, PCCPUMMSRRANGE pRange, uint64_t *puValue)
+{
+ NOREF(pRange);
+
+ PVMCC pVM = pVCpu->CTX_SUFF(pVM);
+ PCGIMHV pHv = &pVM->gim.s.u.Hv;
+
+ switch (idMsr)
+ {
+ case MSR_GIM_HV_CRASH_CTL:
+ *puValue = pHv->uCrashCtlMsr;
+ return VINF_SUCCESS;
+
+ case MSR_GIM_HV_CRASH_P0: *puValue = pHv->uCrashP0Msr; return VINF_SUCCESS;
+ case MSR_GIM_HV_CRASH_P1: *puValue = pHv->uCrashP1Msr; return VINF_SUCCESS;
+ case MSR_GIM_HV_CRASH_P2: *puValue = pHv->uCrashP2Msr; return VINF_SUCCESS;
+ case MSR_GIM_HV_CRASH_P3: *puValue = pHv->uCrashP3Msr; return VINF_SUCCESS;
+ case MSR_GIM_HV_CRASH_P4: *puValue = pHv->uCrashP4Msr; return VINF_SUCCESS;
+ default: break;
+ }
+
+ AssertLogRelMsgReturn(false, ("%s", __PRETTY_FUNCTION__), VERR_NOT_SUPPORTED);
+}
+
+VMM_INT_DECL(VBOXSTRICTRC) gimHvWriteMsr(PVMCPUCC pVCpu, uint32_t idMsr, PCCPUMMSRRANGE pRange, uint64_t uRawValue)
+{
+ NOREF(pRange);
+
+ PVMCC pVM = pVCpu->CTX_SUFF(pVM);
+ PGIMHV pHv = &pVM->gim.s.u.Hv;
+
+ switch (idMsr) {
+ case MSR_GIM_HV_CRASH_CTL:
+ {
+ if (uRawValue & MSR_GIM_HV_CRASH_CTL_NOTIFY)
+ {
+ LogRel(("GIM: HyperV: Guest indicates a fatal condition! P0=%#RX64 P1=%#RX64 P2=%#RX64 P3=%#RX64 P4=%#RX64\n",
+ pHv->uCrashP0Msr, pHv->uCrashP1Msr, pHv->uCrashP2Msr, pHv->uCrashP3Msr, pHv->uCrashP4Msr));
+ DBGFR3ReportBugCheck(pVM, pVCpu, DBGFEVENT_BSOD_MSR, pHv->uCrashP0Msr, pHv->uCrashP1Msr,
+ pHv->uCrashP2Msr, pHv->uCrashP3Msr, pHv->uCrashP4Msr);
+ }
+ return VINF_SUCCESS;
+ }
+ case MSR_GIM_HV_CRASH_P0: pHv->uCrashP0Msr = uRawValue; return VINF_SUCCESS;
+ case MSR_GIM_HV_CRASH_P1: pHv->uCrashP1Msr = uRawValue; return VINF_SUCCESS;
+ case MSR_GIM_HV_CRASH_P2: pHv->uCrashP2Msr = uRawValue; return VINF_SUCCESS;
+ case MSR_GIM_HV_CRASH_P3: pHv->uCrashP3Msr = uRawValue; return VINF_SUCCESS;
+ case MSR_GIM_HV_CRASH_P4: pHv->uCrashP4Msr = uRawValue; return VINF_SUCCESS;
+ default: break;
+ }
+
+ AssertLogRelMsgReturn(false, ("%s", __PRETTY_FUNCTION__), VERR_NOT_SUPPORTED);
+}
diff --git a/src/VBox/VMM/VMMAll/PGMAllBth-x86.cpp.h b/src/VBox/VMM/VMMAll/PGMAllBth-x86.cpp.h
index 4a5cb34..45d6eb0 100644
--- a/src/VBox/VMM/VMMAll/PGMAllBth-x86.cpp.h
+++ b/src/VBox/VMM/VMMAll/PGMAllBth-x86.cpp.h
@@ -4981,7 +4981,10 @@ PGM_BTH_DECL(int, MapCR3)(PVMCPUCC pVCpu, RTGCPHYS GCPhysCR3)
|| PGM_GST_TYPE == PGM_TYPE_AMD64
LogFlow(("MapCR3: %RGp\n", GCPhysCR3));
+
+#ifndef VBOX_WITH_KVM_IRQCHIP_FULL
PGM_A20_ASSERT_MASKED(pVCpu, GCPhysCR3);
+#endif
# if PGM_GST_TYPE == PGM_TYPE_PAE
if ( !pVCpu->pgm.s.CTX_SUFF(fPaePdpesAndCr3Mapped)
diff --git a/src/VBox/VMM/VMMAll/TMAll.cpp b/src/VBox/VMM/VMMAll/TMAll.cpp
index 21adc11..055d821 100644
--- a/src/VBox/VMM/VMMAll/TMAll.cpp
+++ b/src/VBox/VMM/VMMAll/TMAll.cpp
@@ -211,6 +211,10 @@ VMMDECL(void) TMNotifyEndOfExecution(PVMCC pVM, PVMCPUCC pVCpu, uint64_t uTsc)
# ifndef VBOX_VMM_TARGET_ARMV8 /* This is perfectly valid on ARM if the guest is halting in the hypervisor. */
AssertStmt(cTicks <= uCpuHz << 2, cTicks = uCpuHz << 2); /* max 4 sec */
# endif
+    /* Clamp the measured TSC delta to at most 4s worth of ticks. */
+ AssertMsgStmt(cTicks <= uCpuHz << 2,
+ ("TM/%u: execution took longer than 4s: cTicks=%llu uCpuHz=%llu\n", pVCpu->idCpu, cTicks, uCpuHz),
+ cTicks = uCpuHz << 2);
uint64_t cNsExecutingDelta;
if (uCpuHz < _4G)
diff --git a/src/VBox/VMM/VMMAll/TMAllVirtual.cpp b/src/VBox/VMM/VMMAll/TMAllVirtual.cpp
index 283ace3..26ed51d 100644
--- a/src/VBox/VMM/VMMAll/TMAllVirtual.cpp
+++ b/src/VBox/VMM/VMMAll/TMAllVirtual.cpp
@@ -985,7 +985,11 @@ VMM_INT_DECL(uint64_t) TMVirtualSyncGetWithDeadlineNoCheck(PVMCC pVM, uint64_t *
VMMDECL(uint64_t) TMVirtualSyncGetNsToDeadline(PVMCC pVM, uint64_t *puDeadlineVersion, uint64_t *puTscNow)
{
uint64_t cNsToDeadline;
+#ifdef VBOX_WITH_KVM
+ tmVirtualSyncGetEx(pVM, true /*fCheckTimers*/, &cNsToDeadline, puDeadlineVersion, puTscNow);
+#else
tmVirtualSyncGetEx(pVM, false /*fCheckTimers*/, &cNsToDeadline, puDeadlineVersion, puTscNow);
+#endif
return cNsToDeadline;
}
diff --git a/src/VBox/VMM/VMMR3/APIC.cpp b/src/VBox/VMM/VMMR3/APIC.cpp
index bdbef9c..55f7e53 100644
--- a/src/VBox/VMM/VMMR3/APIC.cpp
+++ b/src/VBox/VMM/VMMR3/APIC.cpp
@@ -35,6 +35,7 @@
#include <VBox/vmm/cpum.h>
#include <VBox/vmm/hm.h>
#include <VBox/vmm/mm.h>
+#include <VBox/vmm/nem.h>
#include <VBox/vmm/pdmdev.h>
#include <VBox/vmm/ssm.h>
#ifndef VBOX_DEVICE_STRUCT_TESTCASE
@@ -325,6 +326,10 @@ static DECLCALLBACK(void) apicR3Info(PVM pVM, PCDBGFINFOHLP pHlp, const char *ps
PCXAPICPAGE pXApicPage = VMCPU_TO_CXAPICPAGE(pVCpu);
PCX2APICPAGE pX2ApicPage = VMCPU_TO_CX2APICPAGE(pVCpu);
+#ifdef VBOX_WITH_KVM
+ NEMR3KvmGetLapicState(pVCpu, VMCPU_TO_XAPICPAGE(pVCpu));
+#endif
+
uint64_t const uBaseMsr = pApicCpu->uApicBaseMsr;
APICMODE const enmMode = apicGetMode(uBaseMsr);
bool const fX2ApicMode = XAPIC_IN_X2APIC_MODE(pVCpu);
@@ -953,6 +958,10 @@ static DECLCALLBACK(int) apicR3SaveExec(PPDMDEVINS pDevIns, PSSMHANDLE pSSM)
PVMCPU pVCpu = pVM->apCpusR3[idCpu];
PCAPICCPU pApicCpu = VMCPU_TO_APICCPU(pVCpu);
+#ifdef VBOX_WITH_KVM
+ NEMR3KvmGetLapicState(pVCpu, pApicCpu->pvApicPageR3);
+#endif
+
/* Update interrupts from the pending-interrupts bitmaps to the IRR. */
PDMApicUpdatePendingInterrupts(pVCpu);
@@ -1046,6 +1055,10 @@ static DECLCALLBACK(int) apicR3LoadExec(PPDMDEVINS pDevIns, PSSMHANDLE pSSM, uin
else
pHlp->pfnSSMGetStruct(pSSM, pApicCpu->pvApicPageR3, &g_aXApicPageFields[0]);
+#ifdef VBOX_WITH_KVM
+ NEMR3KvmSetLapicState(pVCpu, pApicCpu->pvApicPageR3);
+#endif
+
/* Load the timer. */
rc = pHlp->pfnSSMGetU64(pSSM, &pApicCpu->u64TimerInitial); AssertRCReturn(rc, rc);
rc = PDMDevHlpTimerLoad(pDevIns, pApicCpu->hTimer, pSSM); AssertRCReturn(rc, rc);
@@ -1174,6 +1187,11 @@ DECLCALLBACK(void) apicR3Reset(PPDMDEVINS pDevIns)
/* Clear the interrupt pending force flag. */
apicClearInterruptFF(pVCpuDest, PDMAPICIRQ_HARDWARE);
+
+#ifdef VBOX_WITH_KVM
+ PXAPICPAGE pXApicPage = VMCPU_TO_XAPICPAGE(pVCpuDest);
+ NEMR3KvmSetLapicState(pVCpuDest, pXApicPage);
+#endif
}
}
@@ -1531,6 +1549,9 @@ DECLCALLBACK(int) apicR3Construct(PPDMDEVINS pDevIns, int iInstance, PCFGMNODE p
{
PVMCPU pVCpu = pVM->apCpusR3[idCpu];
PAPICCPU pApicCpu = VMCPU_TO_APICCPU(pVCpu);
+#ifdef VBOX_WITH_KVM
+ NEMR3KvmSetLapicState(pVCpu, pApicCpu->pvApicPageR3);
+#endif
APIC_REG_COUNTER(&pApicCpu->StatPostIntrCnt, "%u", "APIC/VCPU stats / number of apicPostInterrupt calls.");
for (size_t i = 0; i < RT_ELEMENTS(pApicCpu->aStatVectors); i++)
diff --git a/src/VBox/VMM/VMMR3/EM.cpp b/src/VBox/VMM/VMMR3/EM.cpp
index 41c52bc..7b755ed 100644
--- a/src/VBox/VMM/VMMR3/EM.cpp
+++ b/src/VBox/VMM/VMMR3/EM.cpp
@@ -223,7 +223,11 @@ VMMR3_INT_DECL(int) EMR3Init(PVM pVM)
{
PVMCPU pVCpu = pVM->apCpusR3[idCpu];
+#ifdef VBOX_WITH_KVM
+ pVCpu->em.s.enmState = EMSTATE_NONE;
+#else
pVCpu->em.s.enmState = idCpu == 0 ? EMSTATE_NONE : EMSTATE_WAIT_SIPI;
+#endif
pVCpu->em.s.enmPrevState = EMSTATE_NONE;
pVCpu->em.s.msTimeSliceStart = 0; /* paranoia */
pVCpu->em.s.idxContinueExitRec = UINT16_MAX;
@@ -2341,7 +2345,14 @@ VMMR3_INT_DECL(int) EMR3ExecuteVM(PVM pVM, PVMCPU pVCpu)
else
{
/* All other VCPUs go into the wait for SIPI state. */
+#ifdef VBOX_WITH_KVM
+ /* In case the KVM split irq chip is used, KVM manages
+ * the wait for SIPI state for us and we need to stay in
+ * the NEM state. */
+ pVCpu->em.s.enmState = EMSTATE_NEM;
+#else
pVCpu->em.s.enmState = EMSTATE_WAIT_SIPI;
+#endif
}
break;
}
diff --git a/src/VBox/VMM/VMMR3/GIMHv.cpp b/src/VBox/VMM/VMMR3/GIMHv.cpp
index a4a282a..0ab7fd7 100644
--- a/src/VBox/VMM/VMMR3/GIMHv.cpp
+++ b/src/VBox/VMM/VMMR3/GIMHv.cpp
@@ -34,6 +34,9 @@
#include <VBox/vmm/gim.h>
#include <VBox/vmm/cpum.h>
#include <VBox/vmm/mm.h>
+#if defined(VBOX_WITH_KVM)
+#include <VBox/vmm/nem.h>
+#endif
#include <VBox/vmm/ssm.h>
#include <VBox/vmm/hm.h>
#include <VBox/vmm/pdmapi.h>
@@ -270,6 +273,51 @@ VMMR3_INT_DECL(int) gimR3HvInit(PVM pVM, PCFGMNODE pGimCfg)
rc = CFGMR3QueryBoolDef(pCfgHv, "HypercallDebugInterface", &pHv->fDbgHypercallInterface, false);
AssertLogRelRCReturn(rc, rc);
+#ifdef VBOX_WITH_KVM
+ uint32_t uKvmBaseFeat = 0;
+ uint32_t uKvmPartFlags = 0;
+ uint32_t uKvmPowMgmtFeat = 0;
+ uint32_t uKvmMiscFeat = 0;
+ uint32_t uKvmHyperHints = 0;
+
+ {
+ PCPUMCPUIDLEAF pKvmCpuidLeaves = nullptr;
+ size_t cKvmCpuidLeaves = 0;
+
+ rc = NEMR3KvmGetHvCpuIdLeaves(pVM, &pKvmCpuidLeaves, &cKvmCpuidLeaves);
+ AssertLogRelRCReturn(rc, rc);
+
+ for (size_t uLeaf = 0; uLeaf < cKvmCpuidLeaves; uLeaf++) {
+ LogRel(("GIM: KVM CPUID[%08x] eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
+ pKvmCpuidLeaves[uLeaf].uLeaf,
+ pKvmCpuidLeaves[uLeaf].uEax, pKvmCpuidLeaves[uLeaf].uEbx,
+ pKvmCpuidLeaves[uLeaf].uEcx, pKvmCpuidLeaves[uLeaf].uEdx));
+
+ /*
+ See this documentation for an overview of Hyper-V CPUID flags:
+ https://learn.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/feature-discovery
+ */
+
+ switch (pKvmCpuidLeaves[uLeaf].uLeaf) {
+ case 0x40000003: /* Features */
+ uKvmBaseFeat = pKvmCpuidLeaves[uLeaf].uEax;
+ uKvmPartFlags = pKvmCpuidLeaves[uLeaf].uEbx;
+ uKvmPowMgmtFeat = pKvmCpuidLeaves[uLeaf].uEcx;
+ uKvmMiscFeat = pKvmCpuidLeaves[uLeaf].uEdx;
+ break;
+ case 0x40000004: /* Implementation Recommendations */
+ uKvmHyperHints = pKvmCpuidLeaves[uLeaf].uEax;
+ break;
+ default:
+ // Ignore
+ break;
+ }
+ }
+
+ RTMemFree(pKvmCpuidLeaves);
+ }
+#endif
+
/*
* Determine interface capabilities based on the version.
*/
@@ -277,7 +325,11 @@ VMMR3_INT_DECL(int) gimR3HvInit(PVM pVM, PCFGMNODE pGimCfg)
{
/* Basic features. */
pHv->uBaseFeat = 0
+#ifdef VBOX_WITH_KVM
+ | GIM_HV_BASE_FEAT_VP_RUNTIME_MSR
+#else
//| GIM_HV_BASE_FEAT_VP_RUNTIME_MSR
+#endif
| GIM_HV_BASE_FEAT_PART_TIME_REF_COUNT_MSR
//| GIM_HV_BASE_FEAT_BASIC_SYNIC_MSRS // Both required for synethetic timers
//| GIM_HV_BASE_FEAT_STIMER_MSRS // Both required for synethetic timers
@@ -300,15 +352,29 @@ VMMR3_INT_DECL(int) gimR3HvInit(PVM pVM, PCFGMNODE pGimCfg)
| GIM_HV_MISC_FEAT_GUEST_CRASH_MSRS
//| GIM_HV_MISC_FEAT_DEBUG_MSRS
;
-
+#ifdef VBOX_WITH_KVM
+ /* Hypervisor recommendations to the guest. */
+ pHv->uHyperHints = GIM_HV_HINT_RELAX_TIME_CHECKS
+ /* Causes assertion failures in interrupt injection. */
+ //| GIM_HV_HINT_MSR_FOR_APIC_ACCESS
+ /* Inform the guest whether the host has hyperthreading disabled. */
+ | (GIM_HV_HINT_NO_NONARCH_CORESHARING & uKvmHyperHints)
+ ;
+#else
/* Hypervisor recommendations to the guest. */
pHv->uHyperHints = GIM_HV_HINT_MSR_FOR_SYS_RESET
| GIM_HV_HINT_RELAX_TIME_CHECKS
| GIM_HV_HINT_X2APIC_MSRS
;
+#endif
/* Partition features. */
+#ifdef VBOX_WITH_KVM
+        /* Extended hypercalls require KVM_EXIT_HYPER_HCALL exits to be forwarded to gimHvHypercall,
+           so we don't expose them for now. */
+#else
pHv->uPartFlags |= GIM_HV_PART_FLAGS_EXTENDED_HYPERCALLS;
+#endif
/* Expose more if we're posing as Microsoft. We can, if needed, force MSR-based Hv
debugging by not exposing these bits while exposing the VS interface. The better
@@ -320,6 +386,15 @@ VMMR3_INT_DECL(int) gimR3HvInit(PVM pVM, PCFGMNODE pGimCfg)
pHv->uPartFlags |= GIM_HV_PART_FLAGS_DEBUGGING;
}
+
+#ifdef VBOX_WITH_KVM
+ // We should not enable features and hints that KVM doesn't know about.
+ Assert((pHv->uHyperHints & ~uKvmHyperHints) == 0);
+ Assert((pHv->uBaseFeat & ~uKvmBaseFeat) == 0);
+ Assert((pHv->uMiscFeat & ~uKvmMiscFeat) == 0);
+ Assert((pHv->uPartFlags & ~uKvmPartFlags) == 0);
+ Assert((pHv->uPowMgmtFeat & ~uKvmPowMgmtFeat) == 0);
+#endif
}
/*
diff --git a/src/VBox/VMM/VMMR3/GIMHvOnKvm.cpp b/src/VBox/VMM/VMMR3/GIMHvOnKvm.cpp
new file mode 100644
index 0000000..362cc69
--- /dev/null
+++ b/src/VBox/VMM/VMMR3/GIMHvOnKvm.cpp
@@ -0,0 +1,640 @@
+/* $Id: GIMHvOnKvm.cpp $ */
+/** @file
+ * GIM - Guest Interface Manager, Hyper-V implementation for the KVM-Backend.
+ */
+
+/*
+ * Copyright (C) 2014-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define LOG_GROUP LOG_GROUP_GIM
+#include <VBox/vmm/gim.h>
+#include <VBox/vmm/nem.h>
+#include <VBox/vmm/ssm.h>
+#include <VBox/vmm/hm.h>
+#include "GIMInternal.h"
+#include <VBox/vmm/vm.h>
+
+#include <VBox/err.h>
+#include <VBox/version.h>
+
+#include <iprt/assert.h>
+#include <iprt/string.h>
+#include <iprt/mem.h>
+
+/*********************************************************************************************************************************
+* Defined Constants And Macros *
+*********************************************************************************************************************************/
+/**
+ * GIM Hyper-V saved-state version.
+ *
+ * We use a number that is far away from the original GIMHv saved state version
+ * to prevent future collisions.
+ */
+#define GIM_HV_SAVED_STATE_VERSION UINT32_C(0x1000)
+
+#ifdef VBOX_WITH_STATISTICS
+# define GIMHV_MSRRANGE(a_uFirst, a_uLast, a_szName) \
+ { (a_uFirst), (a_uLast), kCpumMsrRdFn_Gim, kCpumMsrWrFn_Gim, 0, 0, 0, 0, 0, a_szName, { 0 }, { 0 }, { 0 }, { 0 } }
+#else
+# define GIMHV_MSRRANGE(a_uFirst, a_uLast, a_szName) \
+ { (a_uFirst), (a_uLast), kCpumMsrRdFn_Gim, kCpumMsrWrFn_Gim, 0, 0, 0, 0, 0, a_szName }
+#endif
+
+
+/*********************************************************************************************************************************
+* Global Variables *
+*********************************************************************************************************************************/
+/**
+ * Array of MSR ranges supported by Hyper-V.
+ */
+static CPUMMSRRANGE const g_aMsrRanges_HyperV[] =
+{
+ GIMHV_MSRRANGE(MSR_GIM_HV_RANGE0_FIRST, MSR_GIM_HV_RANGE0_LAST, "Hyper-V range 0"),
+ GIMHV_MSRRANGE(MSR_GIM_HV_RANGE1_FIRST, MSR_GIM_HV_RANGE1_LAST, "Hyper-V range 1"),
+ GIMHV_MSRRANGE(MSR_GIM_HV_RANGE2_FIRST, MSR_GIM_HV_RANGE2_LAST, "Hyper-V range 2"),
+ GIMHV_MSRRANGE(MSR_GIM_HV_RANGE3_FIRST, MSR_GIM_HV_RANGE3_LAST, "Hyper-V range 3"),
+ GIMHV_MSRRANGE(MSR_GIM_HV_RANGE4_FIRST, MSR_GIM_HV_RANGE4_LAST, "Hyper-V range 4"),
+ GIMHV_MSRRANGE(MSR_GIM_HV_RANGE5_FIRST, MSR_GIM_HV_RANGE5_LAST, "Hyper-V range 5"),
+ GIMHV_MSRRANGE(MSR_GIM_HV_RANGE6_FIRST, MSR_GIM_HV_RANGE6_LAST, "Hyper-V range 6"),
+ GIMHV_MSRRANGE(MSR_GIM_HV_RANGE7_FIRST, MSR_GIM_HV_RANGE7_LAST, "Hyper-V range 7"),
+ GIMHV_MSRRANGE(MSR_GIM_HV_RANGE8_FIRST, MSR_GIM_HV_RANGE8_LAST, "Hyper-V range 8"),
+ GIMHV_MSRRANGE(MSR_GIM_HV_RANGE9_FIRST, MSR_GIM_HV_RANGE9_LAST, "Hyper-V range 9"),
+ GIMHV_MSRRANGE(MSR_GIM_HV_RANGE10_FIRST, MSR_GIM_HV_RANGE10_LAST, "Hyper-V range 10"),
+ GIMHV_MSRRANGE(MSR_GIM_HV_RANGE11_FIRST, MSR_GIM_HV_RANGE11_LAST, "Hyper-V range 11"),
+ GIMHV_MSRRANGE(MSR_GIM_HV_RANGE12_FIRST, MSR_GIM_HV_RANGE12_LAST, "Hyper-V range 12")
+};
+#undef GIMHV_MSRRANGE
+
+/*********************************************************************************************************************************
+* Internal Functions *
+*********************************************************************************************************************************/
+
+/**
+ * Initializes the Hyper-V GIM provider.
+ *
+ * @returns VBox status code.
+ * @param pVM The cross context VM structure.
+ * @param pGimCfg The GIM CFGM node.
+ */
+VMMR3_INT_DECL(int) gimR3HvInit(PVM pVM, PCFGMNODE pGimCfg)
+{
+ AssertReturn(pVM, VERR_INVALID_PARAMETER);
+ AssertReturn(pVM->gim.s.enmProviderId == GIMPROVIDERID_HYPERV, VERR_INTERNAL_ERROR_5);
+
+ PGIMHV pHv = &pVM->gim.s.u.Hv;
+
+ /*
+ * Read configuration.
+ */
+ PCFGMNODE pCfgHv = CFGMR3GetChild(pGimCfg, "HyperV");
+ if (pCfgHv)
+ {
+ /*
+ * Validate the Hyper-V settings.
+ */
+ int rc2 = CFGMR3ValidateConfig(pCfgHv, "/HyperV/",
+ "VendorID"
+ "|VSInterface"
+ "|HypercallDebugInterface"
+ "|VirtioGPU",
+ "" /* pszValidNodes */, "GIM/HyperV" /* pszWho */, 0 /* uInstance */);
+ if (RT_FAILURE(rc2))
+ return rc2;
+ }
+
+    /*
+     * If virtio-gpu is in use, fall back to VBoxVBoxVBox as the Hyper-V vendor because
+     * otherwise the Intel GPU driver does not load.
+     */
+ bool withVirtioGPU {false};
+ int rc = CFGMR3QueryBoolDef(pCfgHv, "VirtioGPU", &withVirtioGPU, false);
+ AssertLogRelRCReturn(rc, rc);
+
+ /** @cfgm{/GIM/HyperV/VendorID, string, 'VBoxVBoxVBox'}
+ * The Hyper-V vendor signature, must be 12 characters. */
+ char szVendor[13];
+ rc = CFGMR3QueryStringDef(pCfgHv, "VendorID", szVendor, sizeof(szVendor), withVirtioGPU ? "VBoxVBoxVBox" : "Microsoft Hv");
+ AssertLogRelRCReturn(rc, rc);
+ AssertLogRelMsgReturn(strlen(szVendor) == 12,
+ ("The VendorID config value must be exactly 12 chars, '%s' isn't!\n", szVendor),
+ VERR_INVALID_PARAMETER);
+
+    AssertReleaseMsg(   !RTStrNCmp(szVendor, GIM_HV_VENDOR_MICROSOFT, sizeof(GIM_HV_VENDOR_MICROSOFT) - 1)
+                     || !RTStrNCmp(szVendor, GIM_HV_VENDOR_VBOX, sizeof(GIM_HV_VENDOR_VBOX) - 1),
+                     ("GIM Vendors other than Microsoft Hv and VBox are unsupported"));
+
+ LogRel(("GIM: HyperV: Reporting vendor as '%s'\n", szVendor));
+
+ pHv->fIsInterfaceVs = false;
+ pHv->fDbgHypercallInterface = false;
+
+ uint32_t uKvmBaseFeat = 0;
+ uint32_t uKvmPartFlags = 0;
+ uint32_t uKvmPowMgmtFeat = 0;
+ uint32_t uKvmMiscFeat = 0;
+ uint32_t uKvmHyperHints = 0;
+
+ {
+ PCPUMCPUIDLEAF pKvmCpuidLeaves = nullptr;
+ size_t cKvmCpuidLeaves = 0;
+
+ rc = NEMR3KvmGetHvCpuIdLeaves(pVM, &pKvmCpuidLeaves, &cKvmCpuidLeaves);
+ AssertLogRelRCReturn(rc, rc);
+
+ for (size_t uLeaf = 0; uLeaf < cKvmCpuidLeaves; uLeaf++) {
+ LogRel(("GIM: KVM CPUID[%08x] eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
+ pKvmCpuidLeaves[uLeaf].uLeaf,
+ pKvmCpuidLeaves[uLeaf].uEax, pKvmCpuidLeaves[uLeaf].uEbx,
+ pKvmCpuidLeaves[uLeaf].uEcx, pKvmCpuidLeaves[uLeaf].uEdx));
+
+ /*
+ See this documentation for an overview of Hyper-V CPUID flags:
+ https://learn.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/feature-discovery
+ */
+
+ switch (pKvmCpuidLeaves[uLeaf].uLeaf) {
+ case 0x40000003: /* Features */
+ uKvmBaseFeat = pKvmCpuidLeaves[uLeaf].uEax;
+ uKvmPartFlags = pKvmCpuidLeaves[uLeaf].uEbx;
+ uKvmPowMgmtFeat = pKvmCpuidLeaves[uLeaf].uEcx;
+ uKvmMiscFeat = pKvmCpuidLeaves[uLeaf].uEdx;
+ break;
+ case 0x40000004: /* Implementation Recommendations */
+ uKvmHyperHints = pKvmCpuidLeaves[uLeaf].uEax;
+ break;
+ default:
+ // Ignore
+ break;
+ }
+ }
+
+ RTMemFree(pKvmCpuidLeaves);
+ }
+
+ /*
+ * Determine interface capabilities based on the version.
+ */
+ if (!pVM->gim.s.u32Version)
+ {
+ /* Basic features. */
+ pHv->uBaseFeat = 0
+ | GIM_HV_BASE_FEAT_VP_RUNTIME_MSR
+ | GIM_HV_BASE_FEAT_PART_TIME_REF_COUNT_MSR
+ | GIM_HV_BASE_FEAT_BASIC_SYNIC_MSRS
+ | GIM_HV_BASE_FEAT_STIMER_MSRS
+ | GIM_HV_BASE_FEAT_APIC_ACCESS_MSRS
+ | GIM_HV_BASE_FEAT_HYPERCALL_MSRS
+ | GIM_HV_BASE_FEAT_VP_ID_MSR
+ | GIM_HV_BASE_FEAT_VIRT_SYS_RESET_MSR
+ //| GIM_HV_BASE_FEAT_STAT_PAGES_MSR
+ | GIM_HV_BASE_FEAT_PART_REF_TSC_MSR
+ //| GIM_HV_BASE_FEAT_GUEST_IDLE_STATE_MSR
+ | GIM_HV_BASE_FEAT_TIMER_FREQ_MSRS
+ //| GIM_HV_BASE_FEAT_DEBUG_MSRS
+ ;
+
+ /* Miscellaneous features. */
+ pHv->uMiscFeat = 0
+ //| GIM_HV_MISC_FEAT_GUEST_DEBUGGING
+ //| GIM_HV_MISC_FEAT_XMM_HYPERCALL_INPUT
+ | GIM_HV_MISC_FEAT_TIMER_FREQ
+ | GIM_HV_MISC_FEAT_GUEST_CRASH_MSRS
+ //| GIM_HV_MISC_FEAT_DEBUG_MSRS
+ | GIM_HV_MISC_FEAT_USE_DIRECT_SYNTH_MSRS
+ ;
+
+ /* Hypervisor recommendations to the guest. */
+ pHv->uHyperHints = GIM_HV_HINT_RELAX_TIME_CHECKS
+ /* Causes assertion failures in interrupt injection. */
+ //| GIM_HV_HINT_MSR_FOR_APIC_ACCESS
+ //|GIM_HV_HINT_MSR_FOR_SYS_RESET
+ | GIM_HV_HINT_DEPRECATE_AUTO_EOI
+ /* Inform the guest whether the host has hyperthreading disabled. */
+ | (GIM_HV_HINT_NO_NONARCH_CORESHARING & uKvmHyperHints)
+ ;
+
+
+ // We should not enable features and hints that KVM doesn't know about.
+ AssertRelease((pHv->uHyperHints & ~uKvmHyperHints) == 0);
+ AssertRelease((pHv->uBaseFeat & ~uKvmBaseFeat) == 0);
+ AssertRelease((pHv->uMiscFeat & ~uKvmMiscFeat) == 0);
+ AssertRelease((pHv->uPartFlags & ~uKvmPartFlags) == 0);
+ AssertRelease((pHv->uPowMgmtFeat & ~uKvmPowMgmtFeat) == 0);
+ }
+
+ /*
+ * Make sure the CPUID bits are in accordance with the Hyper-V
+ * requirement and other paranoia checks.
+ * See "Requirements for implementing the Microsoft hypervisor interface" spec.
+ */
+ AssertRelease(!(pHv->uPartFlags & ( GIM_HV_PART_FLAGS_CREATE_PART
+ | GIM_HV_PART_FLAGS_ACCESS_MEMORY_POOL
+ | GIM_HV_PART_FLAGS_ACCESS_PART_ID
+ | GIM_HV_PART_FLAGS_ADJUST_MSG_BUFFERS
+ | GIM_HV_PART_FLAGS_CREATE_PORT
+ | GIM_HV_PART_FLAGS_ACCESS_STATS
+ | GIM_HV_PART_FLAGS_CPU_MGMT
+ | GIM_HV_PART_FLAGS_CPU_PROFILER)));
+
+ AssertRelease((pHv->uBaseFeat & (GIM_HV_BASE_FEAT_HYPERCALL_MSRS | GIM_HV_BASE_FEAT_VP_ID_MSR))
+ == (GIM_HV_BASE_FEAT_HYPERCALL_MSRS | GIM_HV_BASE_FEAT_VP_ID_MSR));
+
+ /*
+ * Expose HVP (Hypervisor Present) bit to the guest.
+ */
+ CPUMR3SetGuestCpuIdFeature(pVM, CPUMCPUIDFEATURE_HVP);
+
+ /*
+ * Modify the standard hypervisor leaves for Hyper-V.
+ */
+ CPUMCPUIDLEAF HyperLeaf;
+ RT_ZERO(HyperLeaf);
+ HyperLeaf.uLeaf = UINT32_C(0x40000000);
+ HyperLeaf.uEax = UINT32_C(0x40000006); /* Minimum value for Hyper-V default is 0x40000005. */
+ /*
+ * Don't report vendor as 'Microsoft Hv'[1] by default, see @bugref{7270#c152}.
+ * [1]: ebx=0x7263694d ('rciM') ecx=0x666f736f ('foso') edx=0x76482074 ('vH t')
+ */
+ {
+ uint32_t uVendorEbx;
+ uint32_t uVendorEcx;
+ uint32_t uVendorEdx;
+ uVendorEbx = ((uint32_t)szVendor[ 3]) << 24 | ((uint32_t)szVendor[ 2]) << 16 | ((uint32_t)szVendor[1]) << 8
+ | (uint32_t)szVendor[ 0];
+ uVendorEcx = ((uint32_t)szVendor[ 7]) << 24 | ((uint32_t)szVendor[ 6]) << 16 | ((uint32_t)szVendor[5]) << 8
+ | (uint32_t)szVendor[ 4];
+ uVendorEdx = ((uint32_t)szVendor[11]) << 24 | ((uint32_t)szVendor[10]) << 16 | ((uint32_t)szVendor[9]) << 8
+ | (uint32_t)szVendor[ 8];
+ HyperLeaf.uEbx = uVendorEbx;
+ HyperLeaf.uEcx = uVendorEcx;
+ HyperLeaf.uEdx = uVendorEdx;
+ }
+ rc = CPUMR3CpuIdInsert(pVM, &HyperLeaf);
+ AssertLogRelRCReturn(rc, rc);
+
+ HyperLeaf.uLeaf = UINT32_C(0x40000001);
+ HyperLeaf.uEax = 0x31237648; /* 'Hv#1' */
+ HyperLeaf.uEbx = 0; /* Reserved */
+ HyperLeaf.uEcx = 0; /* Reserved */
+ HyperLeaf.uEdx = 0; /* Reserved */
+ rc = CPUMR3CpuIdInsert(pVM, &HyperLeaf);
+ AssertLogRelRCReturn(rc, rc);
+
+ /*
+ * Add Hyper-V specific leaves.
+ */
+ HyperLeaf.uLeaf = UINT32_C(0x40000002); /* MBZ until MSR_GIM_HV_GUEST_OS_ID is set by the guest. */
+ HyperLeaf.uEax = 0;
+ HyperLeaf.uEbx = 0;
+ HyperLeaf.uEcx = 0;
+ HyperLeaf.uEdx = 0;
+ rc = CPUMR3CpuIdInsert(pVM, &HyperLeaf);
+ AssertLogRelRCReturn(rc, rc);
+
+ HyperLeaf.uLeaf = UINT32_C(0x40000003);
+ HyperLeaf.uEax = pHv->uBaseFeat;
+ HyperLeaf.uEbx = pHv->uPartFlags;
+ HyperLeaf.uEcx = pHv->uPowMgmtFeat;
+ HyperLeaf.uEdx = pHv->uMiscFeat;
+ rc = CPUMR3CpuIdInsert(pVM, &HyperLeaf);
+ AssertLogRelRCReturn(rc, rc);
+
+ HyperLeaf.uLeaf = UINT32_C(0x40000004);
+ HyperLeaf.uEax = pHv->uHyperHints;
+ /* Recommended number of spinlock retries before notifying the Hypervisor. 0xffffffff means that the Hypervisor is never notified */
+ HyperLeaf.uEbx = 0xffffffff;
+ HyperLeaf.uEcx = 0;
+ HyperLeaf.uEdx = 0;
+ rc = CPUMR3CpuIdInsert(pVM, &HyperLeaf);
+ AssertLogRelRCReturn(rc, rc);
+
+ RT_ZERO(HyperLeaf);
+ HyperLeaf.uLeaf = UINT32_C(0x40000005);
+ rc = CPUMR3CpuIdInsert(pVM, &HyperLeaf);
+ AssertLogRelRCReturn(rc, rc);
+
+ // Let the guest OS know that we're running HyperV PV on KVM.
+ static constexpr char kvmVendor[] = "KVMKVMKVM\0\0\0";
+ HyperLeaf.uLeaf = 0x40000100;
+ {
+ uint32_t uVendorEbx;
+ uint32_t uVendorEcx;
+ uint32_t uVendorEdx;
+ uVendorEbx = ((uint32_t)kvmVendor[ 3]) << 24 | ((uint32_t)kvmVendor[ 2]) << 16 | ((uint32_t)kvmVendor[1]) << 8
+ | (uint32_t)kvmVendor[ 0];
+ uVendorEcx = ((uint32_t)kvmVendor[ 7]) << 24 | ((uint32_t)kvmVendor[ 6]) << 16 | ((uint32_t)kvmVendor[5]) << 8
+ | (uint32_t)kvmVendor[ 4];
+ uVendorEdx = ((uint32_t)kvmVendor[11]) << 24 | ((uint32_t)kvmVendor[10]) << 16 | ((uint32_t)kvmVendor[9]) << 8
+ | (uint32_t)kvmVendor[ 8];
+ HyperLeaf.uEbx = uVendorEbx;
+ HyperLeaf.uEcx = uVendorEcx;
+ HyperLeaf.uEdx = uVendorEdx;
+ }
+
+ rc = CPUMR3CpuIdInsert(pVM, &HyperLeaf);
+ AssertLogRelRCReturn(rc, rc);
+
+
+ /*
+ * Insert all MSR ranges of Hyper-V.
+ */
+ for (unsigned i = 0; i < RT_ELEMENTS(g_aMsrRanges_HyperV); i++)
+ {
+ int rc2 = CPUMR3MsrRangesInsert(pVM, &g_aMsrRanges_HyperV[i]);
+ AssertLogRelRCReturn(rc2, rc2);
+ }
+
+ /*
+ * Setup non-zero MSRs.
+ */
+ if (pHv->uMiscFeat & GIM_HV_MISC_FEAT_GUEST_CRASH_MSRS)
+ pHv->uCrashCtlMsr = MSR_GIM_HV_CRASH_CTL_NOTIFY;
+
+ return VINF_SUCCESS;
+}
+
+
+/**
+ * Initializes remaining bits of the Hyper-V provider.
+ *
+ * This is called after initializing HM and almost all other VMM components.
+ *
+ * @returns VBox status code.
+ * @param pVM The cross context VM structure.
+ */
+VMMR3_INT_DECL(int) gimR3HvInitCompleted(PVM pVM)
+{
+ PGIMHV pHv = &pVM->gim.s.u.Hv;
+ pHv->cTscTicksPerSecond = TMCpuTicksPerSecond(pVM);
+
+ /*
+ * Determine interface capabilities based on the version.
+ */
+ if (!pVM->gim.s.u32Version)
+ {
+ /* Hypervisor capabilities; features used by the hypervisor. */
+ pHv->uHyperCaps = HMIsNestedPagingActive(pVM) ? GIM_HV_HOST_FEAT_NESTED_PAGING : 0;
+ pHv->uHyperCaps |= HMIsMsrBitmapActive(pVM) ? GIM_HV_HOST_FEAT_MSR_BITMAP : 0;
+ }
+
+ CPUMCPUIDLEAF HyperLeaf;
+ RT_ZERO(HyperLeaf);
+ HyperLeaf.uLeaf = UINT32_C(0x40000006);
+ HyperLeaf.uEax = pHv->uHyperCaps;
+ HyperLeaf.uEbx = 0;
+ HyperLeaf.uEcx = 0;
+ HyperLeaf.uEdx = 0;
+ int rc = CPUMR3CpuIdInsert(pVM, &HyperLeaf);
+ AssertLogRelRCReturn(rc, rc);
+
+ return rc;
+}
+
+
+/**
+ * Terminates the Hyper-V GIM provider.
+ *
+ * @returns VBox status code.
+ * @param pVM The cross context VM structure.
+ */
+VMMR3_INT_DECL(int) gimR3HvTerm(PVM pVM)
+{
+ gimR3HvReset(pVM);
+
+ return VINF_SUCCESS;
+}
+
+
+/**
+ * Applies relocations to data and code managed by this
+ * component. This function will be called at init and
+ * whenever the VMM need to relocate it self inside the GC.
+ *
+ * @param pVM The cross context VM structure.
+ * @param offDelta Relocation delta relative to old location.
+ */
+VMMR3_INT_DECL(void) gimR3HvRelocate(PVM pVM, RTGCINTPTR offDelta)
+{
+ RT_NOREF(pVM, offDelta);
+}
+
+
+static bool isSynICAllowed(PGIMHV pHv)
+{
+ return pHv->uBaseFeat & GIM_HV_BASE_FEAT_BASIC_SYNIC_MSRS;
+}
+
+/**
+ * This resets Hyper-V provider MSRs and unmaps whatever Hyper-V regions that
+ * the guest may have mapped.
+ *
+ * This is called when the VM is being reset.
+ *
+ * @param pVM The cross context VM structure.
+ *
+ * @thread EMT(0)
+ */
+VMMR3_INT_DECL(void) gimR3HvReset(PVM pVM)
+{
+ VM_ASSERT_EMT0(pVM);
+
+ /*
+ * Unmap MMIO2 pages that the guest may have setup.
+ */
+ LogRel(("GIM: HyperV: Resetting MMIO2 regions and MSRs\n"));
+ PGIMHV pHv = &pVM->gim.s.u.Hv;
+
+ /*
+ * Reset MSRs.
+ */
+ pHv->u64GuestOsIdMsr = 0;
+ pHv->u64HypercallMsr = 0;
+ pHv->u64TscPageMsr = 0;
+ pHv->uCrashP0Msr = 0;
+ pHv->uCrashP1Msr = 0;
+ pHv->uCrashP2Msr = 0;
+ pHv->uCrashP3Msr = 0;
+ pHv->uCrashP4Msr = 0;
+ pHv->uDbgStatusMsr = 0;
+ pHv->uDbgPendingBufferMsr = 0;
+ pHv->uDbgSendBufferMsr = 0;
+ pHv->uDbgRecvBufferMsr = 0;
+
+ PVMCPU pVCpuBsp = pVM->apCpusR3[0];
+ NEMR3KvmSetMsr(pVCpuBsp, MSR_GIM_HV_GUEST_OS_ID, pHv->u64GuestOsIdMsr);
+ NEMR3KvmSetMsr(pVCpuBsp, MSR_GIM_HV_HYPERCALL, pHv->u64HypercallMsr);
+ NEMR3KvmSetMsr(pVCpuBsp, MSR_GIM_HV_REF_TSC, pHv->u64TscPageMsr);
+ NEMR3KvmSetMsr(pVCpuBsp, MSR_GIM_HV_SYNTH_DEBUG_STATUS, pHv->uDbgStatusMsr);
+ NEMR3KvmSetMsr(pVCpuBsp, MSR_GIM_HV_SYNTH_DEBUG_PENDING_BUFFER, pHv->uDbgPendingBufferMsr);
+ NEMR3KvmSetMsr(pVCpuBsp, MSR_GIM_HV_SYNTH_DEBUG_SEND_BUFFER, pHv->uDbgSendBufferMsr);
+ NEMR3KvmSetMsr(pVCpuBsp, MSR_GIM_HV_SYNTH_DEBUG_RECEIVE_BUFFER, pHv->uDbgRecvBufferMsr);
+
+ for (VMCPUID idCpu = 0; idCpu < pVM->cCpus; idCpu++)
+ {
+ PGIMHVCPU pHvCpu = &pVM->apCpusR3[idCpu]->gim.s.u.HvCpu;
+ PVMCPU pVCpu = pVM->apCpusR3[idCpu];
+
+ pHvCpu->uSControlMsr = 0;
+ pHvCpu->uSimpMsr = 0;
+ pHvCpu->uSiefpMsr = 0;
+ pHvCpu->uApicAssistPageMsr = 0;
+
+ NEMR3KvmSetMsr(pVCpu, MSR_GIM_HV_SCONTROL, pHvCpu->uSControlMsr);
+ NEMR3KvmSetMsr(pVCpu, MSR_GIM_HV_SIMP, pHvCpu->uSimpMsr);
+ NEMR3KvmSetMsr(pVCpu, MSR_GIM_HV_SIEFP, pHvCpu->uSiefpMsr);
+ NEMR3KvmSetMsr(pVCpu, MSR_GIM_HV_APIC_ASSIST_PAGE, pHvCpu->uApicAssistPageMsr);
+
+ for (uint8_t idxSint = 0; idxSint < RT_ELEMENTS(pHvCpu->auSintMsrs); idxSint++) {
+ pHvCpu->auSintMsrs[idxSint] = MSR_GIM_HV_SINT_MASKED;
+ if (isSynICAllowed(pHv)) {
+ NEMR3KvmSetMsr(pVCpu, MSR_GIM_HV_SINT0 + idxSint, pHvCpu->auSintMsrs[idxSint]);
+ }
+ }
+
+ for (uint8_t idxStimer = 0; idxStimer < RT_ELEMENTS(pHvCpu->aStimers); idxStimer++)
+ {
+ PGIMHVSTIMER pHvStimer = &pHvCpu->aStimers[idxStimer];
+ pHvStimer->uStimerConfigMsr = 0;
+ pHvStimer->uStimerCountMsr = 0;
+ NEMR3KvmSetMsr(pVCpu, MSR_GIM_HV_STIMER0_CONFIG + idxStimer, pHvStimer->uStimerConfigMsr);
+ NEMR3KvmSetMsr(pVCpu, MSR_GIM_HV_STIMER0_COUNT + idxStimer, pHvStimer->uStimerCountMsr);
+ }
+ }
+}
+
+
+/**
+ * Hyper-V state-load operation, final pass.
+ *
+ * @returns VBox status code.
+ * @param pVM The cross context VM structure.
+ * @param pSSM The saved state handle.
+ */
+VMMR3_INT_DECL(int) gimR3HvLoad(PVM pVM, PSSMHANDLE pSSM)
+{
+ uint32_t uHvSavedStateVersion;
+ int rc = SSMR3GetU32(pSSM, &uHvSavedStateVersion);
+ AssertRCReturn(rc, rc);
+
+ if (uHvSavedStateVersion != GIM_HV_SAVED_STATE_VERSION) {
+ return SSMR3SetLoadError(pSSM, VERR_SSM_UNSUPPORTED_DATA_UNIT_VERSION, RT_SRC_POS,
+ N_("Unsupported Hyper-V saved-state version %u (current %u)!"),
+ uHvSavedStateVersion, GIM_HV_SAVED_STATE_VERSION);
+ }
+
+ for (unsigned i = 0; i < RT_ELEMENTS(g_aMsrRanges_HyperV); i++) {
+ for (unsigned msr {g_aMsrRanges_HyperV[i].uFirst}; msr <= g_aMsrRanges_HyperV[i].uLast; ++msr) {
+
+ // See gimR3HvSave to understand why we skip this MSR.
+ if (msr == MSR_GIM_HV_EOI) {
+ continue;
+ }
+
+ uint64_t val {0};
+ PVMCPU pVCpu = pVM->apCpusR3[0];
+
+ SSMR3GetU64(pSSM, &val);
+
+ rc = NEMR3KvmSetMsr(pVCpu, msr, val);
+ if (rc != VINF_SUCCESS) {
+                // Some MSRs can only be written when HYPERV_SYNIC2 has been enabled.
+                // We don't actually care here because if we are unable to write the MSR,
+                // the guest couldn't have read/written it either.
+                LogRel2(("Unable to write HV MSR: 0x%x\n", msr));
+ }
+ }
+ }
+
+ return VINF_SUCCESS;
+}
+
+
+/**
+ * Hyper-V load-done callback.
+ *
+ * @returns VBox status code.
+ * @param pVM The cross context VM structure.
+ * @param pSSM The saved state handle.
+ */
+VMMR3_INT_DECL(int) gimR3HvLoadDone(PVM pVM, PSSMHANDLE pSSM)
+{
+ NOREF(pVM); NOREF(pSSM);
+ return VINF_SUCCESS;
+}
+
+/**
+ * Hyper-V state-save operation.
+ *
+ * @returns VBox status code.
+ * @param pVM The cross context VM structure.
+ * @param pSSM The saved state handle.
+ */
+VMMR3_INT_DECL(int) gimR3HvSave(PVM pVM, PSSMHANDLE pSSM)
+{
+ /*
+ * Save the Hyper-V SSM version.
+ */
+ SSMR3PutU32(pSSM, GIM_HV_SAVED_STATE_VERSION);
+
+ for (unsigned i = 0; i < RT_ELEMENTS(g_aMsrRanges_HyperV); i++) {
+ for (unsigned msr {g_aMsrRanges_HyperV[i].uFirst}; msr <= g_aMsrRanges_HyperV[i].uLast; ++msr) {
+
+            // This register is write-only for the guest and the last value written isn't interesting at all.
+            // Thus, there is no need to save it here.
+ if (msr == MSR_GIM_HV_EOI) {
+ continue;
+ }
+
+ uint64_t val {0};
+ PVMCPU pVCpu = pVM->apCpusR3[0];
+
+ int rc {NEMR3KvmGetMsr(pVCpu, msr, &val)};
+ if (rc != VINF_SUCCESS) {
+                // Some MSRs can only be read when HYPERV_SYNIC2 has been enabled.
+                // We don't actually care here because if we are unable to read the MSR,
+ // the guest couldn't have read/written it either. Simply save it as
+ // zero and call it good.
+ LogRel2(("Unable to read HV MSR: 0x%x\n", msr));
+ }
+
+ SSMR3PutU64(pSSM, val);
+ }
+ }
+
+ return VINF_SUCCESS;
+}
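
For reference, the save loop above and the matching loop in gimR3HvLoad imply a flat saved-state layout; a sketch of the stream (derived from the code, not a normative format description):

    u32  GIM_HV_SAVED_STATE_VERSION (0x1000)
    u64  one value per MSR in g_aMsrRanges_HyperV, ranges 0 through 12 in order,
         ascending MSR index within each range, MSR_GIM_HV_EOI skipped (write-only)

Unreadable MSRs are stored as zero on save and write failures are tolerated on load, so the stream length stays deterministic.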
+
+/**
+ * Get Hyper-V debug setup parameters.
+ *
+ * @returns VBox status code.
+ * @param pVM The cross context VM structure.
+ * @param pDbgSetup Where to store the debug setup details.
+ */
+VMMR3_INT_DECL(int) gimR3HvGetDebugSetup(PVM pVM, PGIMDEBUGSETUP pDbgSetup)
+{
+ NOREF(pVM); NOREF(pDbgSetup);
+ return VERR_GIM_NO_DEBUG_CONNECTION;
+}
diff --git a/src/VBox/VMM/VMMR3/NEMR3Native-linux.cpp b/src/VBox/VMM/VMMR3/NEMR3Native-linux.cpp
index 26611df..36dd594 100644
--- a/src/VBox/VMM/VMMR3/NEMR3Native-linux.cpp
+++ b/src/VBox/VMM/VMMR3/NEMR3Native-linux.cpp
@@ -37,30 +37,124 @@
#include <VBox/vmm/pdmapic.h>
#include <VBox/vmm/pdm.h>
#include <VBox/vmm/trpm.h>
+#include "CPUMInternal.h"
#include "NEMInternal.h"
+#include "HMInternal.h"
+#include "GIMInternal.h"
+#include "GIMHvInternal.h"
#include <VBox/vmm/vmcc.h>
#include <iprt/alloca.h>
+#include <iprt/mem.h>
#include <iprt/string.h>
#include <iprt/system.h>
#include <iprt/x86.h>
#include <errno.h>
#include <unistd.h>
+#include <signal.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/mman.h>
+#include <sys/prctl.h>
#include <linux/kvm.h>
-
/* Forward declarations of things called by the template. */
static int nemR3LnxInitSetupVm(PVM pVM, PRTERRINFO pErrInfo);
+#include <algorithm>
+#include <string_view>
+#include <vector>
+/**
+ * The MMIO address of the TPR register of the LAPIC.
+ */
+static constexpr uint64_t XAPIC_TPR_ADDR {0xfee00080};
/* Instantiate the common bits we share with the ARMv8 KVM backend. */
#include "NEMR3NativeTemplate-linux.cpp.h"
+/**
+ * The class priority shift for the TPR register.
+ */
+static constexpr uint64_t LAPIC_TPR_SHIFT {4};
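The two constants bracketing the template include pin down the xAPIC TPR MMIO location and the CR8/TPR relationship; a minimal sketch of the conversion they imply (hypothetical helper names, for illustration only):

    // CR8 carries only the task-priority class, i.e. TPR bits 7:4.
    static inline uint8_t tprToCr8(uint8_t bTpr) { return (uint8_t)(bTpr >> LAPIC_TPR_SHIFT); }
    static inline uint8_t cr8ToTpr(uint8_t bCr8) { return (uint8_t)(bCr8 << LAPIC_TPR_SHIFT); }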
+
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+static int kvmSetGsiRoutingFullIrqChip(PVM pVM);
+#endif
+
+
+
+#ifdef VBOX_WITH_KVM_NESTING
+static int KvmGetGuestModeOffsetFromStatsFd(PVMCPU pVCpu, size_t *offset)
+{
+ // See https://www.kernel.org/doc/html/latest/virt/kvm/api.html to learn more
+ // about the KVM binary statistics (look for KVM_GET_STATS_FD).
+
+ struct kvm_stats_header stats_header;
+ RT_ZERO(stats_header);
+
+ int rcRead = pread(pVCpu->nem.s.statsFd, &stats_header, sizeof(struct kvm_stats_header), 0);
+ AssertReleaseMsg(rcRead == sizeof(struct kvm_stats_header), ("Unable to read stats header"));
+
+ if (offset == nullptr) {
+        LogRel(("KvmGetGuestModeOffsetFromStatsFd: Invalid pointer\n"));
+ return VERR_INVALID_POINTER;
+ }
+
+ int real_desc_size = sizeof(struct kvm_stats_desc) + stats_header.name_size;
+ void *desc_backing = RTMemAllocZ(real_desc_size);
+
+ int rc = VERR_NOT_IMPLEMENTED;
+
+ for (unsigned i = 0; i < stats_header.num_desc; ++i) {
+ memset(desc_backing, 0, real_desc_size);
+
+ struct kvm_stats_desc* desc = static_cast<struct kvm_stats_desc*>(desc_backing);
+ rcRead = pread(pVCpu->nem.s.statsFd, desc, real_desc_size, stats_header.desc_offset + i * real_desc_size);
+ AssertReleaseMsg(rcRead == real_desc_size, ("Unable to read descriptor"));
+
+ std::basic_string_view name(desc->name);
+ if (name == "guest_mode") {
+ unsigned value_offset = stats_header.data_offset + desc->offset;
+
+ if (desc->size != 1) {
+ LogRel(("Invalid guest_mode stat size: %d\n", desc->size * 8));
+ rc = VERR_NOT_SUPPORTED;
+ break;
+ }
+ *offset = value_offset;
+
+ rc = VINF_SUCCESS;
+ break;
+ }
+ }
+
+ RTMemFree(desc_backing);
+ return rc;
+}
+#endif
+
+bool KvmIsNestedGuestExit(PVM pVM, PVMCPU pVCpu)
+{
+#ifdef VBOX_WITH_KVM_NESTING
+ if (not pVM->cpum.s.GuestFeatures.fVmx) {
+ return false;
+ }
+
+ uint64_t value {0};
+
+ AssertReleaseMsg(pVCpu->nem.s.guestModeStatOffset != 0, ("Invalid guest_mode offset"));
+
+ int rcRead = pread(pVCpu->nem.s.statsFd, &value, 8, pVCpu->nem.s.guestModeStatOffset);
+ AssertReleaseMsg(rcRead == 8, ("pread did not read all bytes: %d\n", rcRead));
+
+ return value != 0;
+#else
+ NOREF(pVM); NOREF(pVCpu);
+ return false;
+#endif
+}
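The binary-stats scheme used above generalizes to any named counter exposed through KVM_GET_STATS_FD; a minimal sketch under the same kvm_stats_header layout (hypothetical helper, error handling reduced to a zero fallback):

    // Hypothetical: read a single-u64 stat at the value offset previously
    // discovered by scanning the descriptor table, as
    // KvmGetGuestModeOffsetFromStatsFd does for "guest_mode".
    static uint64_t kvmReadStatU64(int statsFd, size_t offValue)
    {
        uint64_t uValue = 0;
        ssize_t cbRead = pread(statsFd, &uValue, sizeof(uValue), offValue);
        return cbRead == (ssize_t)sizeof(uValue) ? uValue : 0;
    }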
/**
* Does the early setup of a KVM VM.
@@ -86,6 +180,23 @@ static int nemR3LnxInitSetupVm(PVM pVM, PRTERRINFO pErrInfo)
if (rcLnx == -1)
         return RTErrInfoSetF(pErrInfo, VERR_NEM_VM_CREATE_FAILED, "Failed to enable KVM_CAP_X86_USER_SPACE_MSR: %u", errno);
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+ rcLnx = ioctl(pVM->nem.s.fdVm, KVM_CREATE_IRQCHIP, 0);
+ if (rcLnx == -1)
+        return RTErrInfoSetF(pErrInfo, VERR_NEM_VM_CREATE_FAILED, "Failed to execute KVM_CREATE_IRQCHIP: %u", errno);
+
+ kvmSetGsiRoutingFullIrqChip(pVM);
+#else
+ struct kvm_enable_cap CapSplitIrqChip =
+ {
+ KVM_CAP_SPLIT_IRQCHIP, 0,
+ { KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS, 0, 0, 0}
+ };
+ rcLnx = ioctl(pVM->nem.s.fdVm, KVM_ENABLE_CAP, &CapSplitIrqChip);
+ if (rcLnx == -1)
+ return RTErrInfoSetF(pErrInfo, VERR_NEM_VM_CREATE_FAILED, "Failed to enable KVM_CAP_SPLIT_IRQCHIP: %u", errno);
+#endif
+
/*
* Create the VCpus.
*/
@@ -106,10 +217,128 @@ static int nemR3LnxInitSetupVm(PVM pVM, PRTERRINFO pErrInfo)
/* We want all x86 registers and events on each exit. */
pVCpu->nem.s.pRun->kvm_valid_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS | KVM_SYNC_X86_EVENTS;
+
+#ifdef VBOX_WITH_KVM_NESTING
+ pVCpu->nem.s.statsFd = ioctl(pVCpu->nem.s.fdVCpu, KVM_GET_STATS_FD, 0);
+
+ if (pVCpu->nem.s.statsFd < 0) {
+ return RTErrInfoSetF(pErrInfo, VERR_NEM_VM_CREATE_FAILED, "Failed to get stats FD");
+ }
+
+ int rc = KvmGetGuestModeOffsetFromStatsFd(pVCpu, &pVCpu->nem.s.guestModeStatOffset);
+ if (not RT_SUCCESS(rc)) {
+ // Instead of failing here, we could also de-feature nested hardware virtualization.
+ return RTErrInfoSetF(pErrInfo, VERR_NEM_VM_CREATE_FAILED, "Failed to get guest_mode offset");
+ }
+
+ if (idCpu == 0) {
+ // Log the offset once, just for debugging purposes.
+        LogRel2(("KVM: guest_mode offset is at %zu\n", pVCpu->nem.s.guestModeStatOffset));
+ }
+#endif
}
+
+ pVM->nem.s.pARedirectionTable = std::make_unique<std::array<std::optional<MSIMSG>, KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS>>();
+
return VINF_SUCCESS;
}
+static void nemR3LnxConsumePokeSignal()
+{
+ int iPokeSignal = RTThreadPokeSignal();
+ AssertReturnVoid(iPokeSignal >= 0);
+
+ sigset_t sigset;
+ sigemptyset(&sigset);
+ sigaddset(&sigset, iPokeSignal);
+
+ struct timespec timeout;
+
+ /* Don't wait for a signal, just poll. */
+ timeout.tv_sec = 0;
+ timeout.tv_nsec = 0;
+
+ int rc = sigtimedwait(&sigset, nullptr, &timeout);
+ AssertLogRelMsg(rc >= 0 || errno == EAGAIN || errno == EINTR, ("Failed to consume signal: %d", errno));
+}
+
+static PCPUMCPUIDLEAF findKvmLeaf(PCPUMCPUIDLEAF paKvmSupportedLeaves,
+ uint32_t cKvmSupportedLeaves,
+ uint32_t leaf,
+ uint32_t subleaf)
+{
+ for (uint32_t i = 0; i < cKvmSupportedLeaves; i++) {
+ auto& kvmLeaf = paKvmSupportedLeaves[i];
+
+ if (kvmLeaf.uLeaf == leaf && kvmLeaf.uSubLeaf == subleaf) {
+ return &kvmLeaf;
+ }
+ }
+
+ return nullptr;
+}
+
+static void maybeMaskUnsupportedKVMCpuidLeafValues(PCPUMCPUIDLEAF paKvmSupportedLeaves,
+ uint32_t cKvmSupportedLeaves,
+ uint32_t leaf,
+ uint32_t subleaf,
+ uint32_t& eax,
+ uint32_t& ebx,
+ uint32_t& ecx,
+ uint32_t& edx)
+{
+ static const uint32_t CPUID_FEATURE_INFORMATION_LEAF = 0x1;
+
+ /*
+ * A list of CPUID leaves that we want to mask with the KVM
+ * supported values. For example, we want to make sure that FSGSBASE
+ * support is supported by KVM before we offer it to the guest.
+ * VirtualBox detects the features it wants to offer via CPUID,
+ * which bypasses Linux/KVM.
+ */
+ const std::vector<uint32_t> leavesToMask = {
+ CPUID_FEATURE_INFORMATION_LEAF,
+ 0x6, // Thermal and power management
+ 0x7, // Structured Extended Feature Flags Enumeration
+ 0x12, // SGX capabilities
+ 0x14, // Processor Trace
+ 0x19, // AES Key Locker features
+ 0x24, // AVX10 Features
+ 0x80000001, // Extended Processor Info and Feature Bits
+ 0x80000007, // Processor Power Management Information and RAS Capabilities
+ 0x80000008, // Virtual and Physical address Sizes
+ 0x8000000A, // Secure Virtual Machine features
+ 0x8000001F, // Encrypted Memory Capabilities
+ 0x80000021, // Extended Feature Identification 2
+ };
+
+ if (std::find(leavesToMask.begin(), leavesToMask.end(), leaf) == leavesToMask.end()) {
+ return;
+ }
+
+ auto* paKvmSupportedLeaf = findKvmLeaf(paKvmSupportedLeaves, cKvmSupportedLeaves, leaf, subleaf);
+
+ if (paKvmSupportedLeaf == nullptr) {
+ return;
+ }
+
+ switch (leaf) {
+ case CPUID_FEATURE_INFORMATION_LEAF:
+ eax &= paKvmSupportedLeaf->uEax;
+            // ebx reports APIC IDs, which we would clobber if we masked it
+            // with the KVM supported values.
+ ecx &= paKvmSupportedLeaf->uEcx;
+ ecx |= X86_CPUID_FEATURE_ECX_HVP; // The hypervisor bit is not enabled in the KVM values.
+ edx &= paKvmSupportedLeaf->uEdx;
+ break;
+ default:
+ eax &= paKvmSupportedLeaf->uEax;
+ ebx &= paKvmSupportedLeaf->uEbx;
+ ecx &= paKvmSupportedLeaf->uEcx;
+ edx &= paKvmSupportedLeaf->uEdx;
+ break;
+ }
+}
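To make the masking concrete, an illustrative walk-through for the feature-information leaf (the numbers are invented for the example):

    // Suppose CPUMGetGuestCpuId yields ecx = 0x80202001 for leaf 0x1 while KVM
    // reports only 0x00202001 as supported:
    //   ecx &= 0x00202001;                // unsupported bit 31 is dropped
    //   ecx |= X86_CPUID_FEATURE_ECX_HVP; // hypervisor bit forced back on
    // ebx is left untouched because it carries APIC IDs rather than feature bits.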
/**
* Update the CPUID leaves for a VCPU.
@@ -128,6 +357,12 @@ static int nemR3LnxUpdateCpuIdsLeaves(PVM pVM, PVMCPU pVCpu)
pReq->nent = cLeaves;
pReq->padding = 0;
+ size_t cKvmSupportedLeaves = 0;
+ PCPUMCPUIDLEAF paKvmSupportedLeaves = nullptr;
+ int rc = NEMR3KvmGetCpuIdLeaves(pVM, &paKvmSupportedLeaves, &cKvmSupportedLeaves);
+ AssertLogRelMsgReturn(RT_SUCCESS(rc), ("Could not retrieve supported CPUID leaves"), rc);
+
+
for (uint32_t i = 0; i < cLeaves; i++)
{
CPUMGetGuestCpuId(pVCpu, paLeaves[i].uLeaf, paLeaves[i].uSubLeaf, -1 /*f64BitMode*/,
@@ -135,6 +370,16 @@ static int nemR3LnxUpdateCpuIdsLeaves(PVM pVM, PVMCPU pVCpu)
&pReq->entries[i].ebx,
&pReq->entries[i].ecx,
&pReq->entries[i].edx);
+
+ maybeMaskUnsupportedKVMCpuidLeafValues(paKvmSupportedLeaves,
+ cKvmSupportedLeaves,
+ paLeaves[i].uLeaf,
+ paLeaves[i].uSubLeaf,
+ pReq->entries[i].eax,
+ pReq->entries[i].ebx,
+ pReq->entries[i].ecx,
+ pReq->entries[i].edx);
+
pReq->entries[i].function = paLeaves[i].uLeaf;
pReq->entries[i].index = paLeaves[i].uSubLeaf;
pReq->entries[i].flags = !paLeaves[i].fSubLeafMask ? 0 : KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
@@ -149,6 +394,111 @@ static int nemR3LnxUpdateCpuIdsLeaves(PVM pVM, PVMCPU pVCpu)
return VINF_SUCCESS;
}
+static int nemR3LnxInitGuestInterface(PVM pVM)
+{
+ switch (pVM->gim.s.enmProviderId) {
+ case GIMPROVIDERID_HYPERV:
+ /*
+        SynIC is currently disabled (its CPUID feature bits are not exposed to the guest) pending investigation of interrupt issues. See #19.
+
+        Enabling this capability is not sufficient to enable SynIC. The corresponding features in the Hyper-V CPUID
+ leaves also have to be enabled. Look for SYNIC and STIMER in GIMHv.cpp.
+
+ The CPUID implementation hints must also indicate deprecating AutoEOI to make APICv work.
+ */
+#if 1
+ LogRel(("NEM: Enabling SYNIC.\n"));
+
+ for (VMCPUID idCpu = 0; idCpu < pVM->cCpus; idCpu++)
+ {
+ PVMCPU pVCpu = pVM->apCpusR3[idCpu];
+
+ struct kvm_enable_cap CapSynIC =
+ {
+ KVM_CAP_HYPERV_SYNIC2, 0, { 0, 0, 0, 0 }
+ };
+
+ int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_ENABLE_CAP, &CapSynIC);
+ AssertLogRelMsgReturn(rcLnx == 0, ("Failed to enable SYNIC: rcLnx=%d errno=%d\n", rcLnx, errno),
+ RTErrConvertFromErrno(errno));
+ }
+#endif
+
+ break;
+
+ default:
+ /* Other guest interfaces are not fully supported. */
+ break;
+ }
+
+ return VINF_SUCCESS;
+}
+
+namespace
+{
+
+enum class KvmCpuIdIoctl : uint32_t
+{
+ CPUID = KVM_GET_SUPPORTED_CPUID,
+ HV_CPUID = KVM_GET_SUPPORTED_HV_CPUID
+};
+
+int KvmGetCpuIdLeavesGeneric(PVM pVM, KvmCpuIdIoctl ioctlNum, PCPUMCPUIDLEAF *outpCpuId, size_t *outcLeaves)
+{
+ struct kvm_cpuid2 *pKvmCpuid;
+ uint32_t cLeaves = 0;
+ int rc;
+
+ /* In case we exit due to errors. */
+ *outpCpuId = nullptr;
+ *outcLeaves = 0;
+
+ /* There is no way to query how many leaves there are. We just try until we hit the right size. */
+ do
+ {
+ cLeaves += 1;
+ Log(("Querying for %u leaves\n", cLeaves));
+
+ pKvmCpuid = static_cast<struct kvm_cpuid2 *>(alloca(RT_UOFFSETOF_DYN(struct kvm_cpuid2, entries[cLeaves])));
+
+ pKvmCpuid->nent = cLeaves;
+ pKvmCpuid->padding = 0;
+
+ rc = ioctl(pVM->nem.s.fdKvm, static_cast<uint32_t>(ioctlNum), pKvmCpuid);
+ } while (rc != 0 && errno == E2BIG);
+ AssertLogRelMsgReturn(rc == 0, ("Failed to query supported CPUID leaves: errno=%d", errno), RTErrConvertFromErrno(errno));
+ AssertFatal(cLeaves == pKvmCpuid->nent);
+
+ PCPUMCPUIDLEAF pCpuId = static_cast<PCPUMCPUIDLEAF>(RTMemAllocZ(sizeof(*pCpuId) * cLeaves));
+
+ for (uint32_t uLeaf = 0; uLeaf < cLeaves; uLeaf++)
+ {
+ pCpuId[uLeaf].uLeaf = pKvmCpuid->entries[uLeaf].function;
+ pCpuId[uLeaf].uSubLeaf = pKvmCpuid->entries[uLeaf].index;
+
+ pCpuId[uLeaf].uEax = pKvmCpuid->entries[uLeaf].eax;
+ pCpuId[uLeaf].uEbx = pKvmCpuid->entries[uLeaf].ebx;
+ pCpuId[uLeaf].uEcx = pKvmCpuid->entries[uLeaf].ecx;
+ pCpuId[uLeaf].uEdx = pKvmCpuid->entries[uLeaf].edx;
+ }
+
+ *outpCpuId = pCpuId;
+ *outcLeaves = cLeaves;
+
+ return VINF_SUCCESS;
+}
+
+} // anonymous namespace
+
+int NEMR3KvmGetHvCpuIdLeaves(PVM pVM, PCPUMCPUIDLEAF *outpCpuId, size_t *outcLeaves)
+{
+ return KvmGetCpuIdLeavesGeneric(pVM, KvmCpuIdIoctl::HV_CPUID, outpCpuId, outcLeaves);
+}
+
+int NEMR3KvmGetCpuIdLeaves(PVM pVM, PCPUMCPUIDLEAF *outpCpuId, size_t *outcLeaves)
+{
+ return KvmGetCpuIdLeavesGeneric(pVM, KvmCpuIdIoctl::CPUID, outpCpuId, outcLeaves);
+}
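The returned leaf array is heap-allocated and owned by the caller; a minimal usage sketch (mirroring what gimR3HvInit does with the Hyper-V variant):

    PCPUMCPUIDLEAF paLeaves = nullptr;
    size_t cLeaves = 0;
    int rc = NEMR3KvmGetCpuIdLeaves(pVM, &paLeaves, &cLeaves);
    if (RT_SUCCESS(rc))
    {
        for (size_t i = 0; i < cLeaves; i++)
            Log(("CPUID[%08x/%u]\n", paLeaves[i].uLeaf, paLeaves[i].uSubLeaf));
        RTMemFree(paLeaves); /* caller owns the allocation */
    }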
DECLHIDDEN(int) nemR3NativeInitCompletedRing3(PVM pVM)
{
@@ -163,10 +513,28 @@ DECLHIDDEN(int) nemR3NativeInitCompletedRing3(PVM pVM)
*/
for (VMCPUID idCpu = 0; idCpu < pVM->cCpus; idCpu++)
{
+ PCPUMCTXMSRS const pCtxMsrs = CPUMQueryGuestCtxMsrsPtr(pVM->apCpusR3[idCpu]);
+
int rc = nemR3LnxUpdateCpuIdsLeaves(pVM, pVM->apCpusR3[idCpu]);
AssertRCReturn(rc, rc);
+
+#ifdef VBOX_WITH_KVM_NESTING
+ if (pVM->cpum.s.GuestFeatures.fVmx) {
+ NEMR3KvmSetMsr(pVM->apCpusR3[idCpu], MSR_IA32_FEATURE_CONTROL, MSR_IA32_FEATURE_CONTROL_VMXON | MSR_IA32_FEATURE_CONTROL_LOCK);
+ }
+#endif
+
+ uint64_t val {0};
+ NEMR3KvmGetMsr(pVM->apCpusR3[idCpu], MSR_IA32_ARCH_CAPABILITIES, &val);
+ pCtxMsrs->msr.ArchCaps = val;
+
+ NEMR3KvmGetMsr(pVM->apCpusR3[idCpu], MSR_IA32_SPEC_CTRL, &val);
+ pCtxMsrs->msr.SpecCtrl = val;
}
+ int rcLnxGI = nemR3LnxInitGuestInterface(pVM);
+ AssertRCReturn(rcLnxGI, rcLnxGI);
+
/*
* Configure MSRs after ring-3 init is done.
*
@@ -193,6 +561,8 @@ DECLHIDDEN(int) nemR3NativeInitCompletedRing3(PVM pVM)
MsrFilters.ranges[iRange].bitmap = (uint8_t *)&RT_CONCAT(bm, a_uBase)[0]
#define MSR_RANGE_ADD(a_Msr) \
do { Assert((uint32_t)(a_Msr) - uBase < cMsrs); ASMBitSet(pbm, (uint32_t)(a_Msr) - uBase); } while (0)
+#define MSR_RANGE_ADD_CLOSED_IVL(first_Msr, last_Msr) \
+    for (uint32_t uMsr = (first_Msr); uMsr <= (last_Msr); uMsr++) { MSR_RANGE_ADD(uMsr); }
#define MSR_RANGE_END(a_cMinMsrs) \
/* optimize the range size before closing: */ \
uint32_t cBitmap = cMsrs / 64; \
@@ -204,11 +574,44 @@ DECLHIDDEN(int) nemR3NativeInitCompletedRing3(PVM pVM)
/* 1st Intel range: 0000_0000 to 0000_3000. */
MSR_RANGE_BEGIN(0x00000000, 0x00003000, KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE);
+ MSR_RANGE_ADD(MSR_IA32_BIOS_SIGN_ID);
MSR_RANGE_ADD(MSR_IA32_TSC);
+ MSR_RANGE_ADD(MSR_IA32_APICBASE);
MSR_RANGE_ADD(MSR_IA32_SYSENTER_CS);
MSR_RANGE_ADD(MSR_IA32_SYSENTER_ESP);
MSR_RANGE_ADD(MSR_IA32_SYSENTER_EIP);
MSR_RANGE_ADD(MSR_IA32_CR_PAT);
+ MSR_RANGE_ADD(MSR_IA32_ARCH_CAPABILITIES);
+ MSR_RANGE_ADD(MSR_IA32_SPEC_CTRL);
+ MSR_RANGE_ADD(MSR_IA32_PRED_CMD);
+ MSR_RANGE_ADD(MSR_IA32_FLUSH_CMD);
+#ifdef VBOX_WITH_KVM_NESTING
+ if (pVM->cpum.s.GuestFeatures.fVmx) {
+ /* VMX MSRS */
+ MSR_RANGE_ADD(MSR_IA32_FEATURE_CONTROL);
+ MSR_RANGE_ADD(MSR_IA32_MISC_ENABLE);
+ MSR_RANGE_ADD(MSR_IA32_VMX_BASIC);
+ MSR_RANGE_ADD(MSR_IA32_VMX_PINBASED_CTLS);
+ MSR_RANGE_ADD(MSR_IA32_VMX_PROCBASED_CTLS);
+ MSR_RANGE_ADD(MSR_IA32_VMX_EXIT_CTLS);
+ MSR_RANGE_ADD(MSR_IA32_VMX_ENTRY_CTLS);
+ MSR_RANGE_ADD(MSR_IA32_VMX_MISC);
+ MSR_RANGE_ADD(MSR_IA32_VMX_CR0_FIXED0);
+ MSR_RANGE_ADD(MSR_IA32_VMX_CR0_FIXED1);
+ MSR_RANGE_ADD(MSR_IA32_VMX_CR4_FIXED0);
+ MSR_RANGE_ADD(MSR_IA32_VMX_CR4_FIXED1);
+ MSR_RANGE_ADD(MSR_IA32_VMX_VMCS_ENUM);
+ MSR_RANGE_ADD(MSR_IA32_VMX_PROCBASED_CTLS2);
+ MSR_RANGE_ADD(MSR_IA32_VMX_EPT_VPID_CAP);
+ MSR_RANGE_ADD(MSR_IA32_VMX_TRUE_PINBASED_CTLS);
+ MSR_RANGE_ADD(MSR_IA32_VMX_TRUE_PROCBASED_CTLS);
+ MSR_RANGE_ADD(MSR_IA32_VMX_TRUE_EXIT_CTLS);
+ MSR_RANGE_ADD(MSR_IA32_VMX_TRUE_ENTRY_CTLS);
+ MSR_RANGE_ADD(MSR_IA32_VMX_VMFUNC);
+ MSR_RANGE_ADD(MSR_IA32_VMX_PROCBASED_CTLS3);
+ MSR_RANGE_ADD(MSR_IA32_VMX_EXIT_CTLS2);
+ }
+#endif
/** @todo more? */
MSR_RANGE_END(64);
@@ -216,6 +619,13 @@ DECLHIDDEN(int) nemR3NativeInitCompletedRing3(PVM pVM)
MSR_RANGE_BEGIN(0xc0000000, 0xc0003000, KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE);
MSR_RANGE_ADD(MSR_K6_EFER);
MSR_RANGE_ADD(MSR_K6_STAR);
+
+ /*
+ * If we don't allow direct access to FS_BASE, we clobber the FS base for the guest. This sounds like a bug in
+ * our state synchronization with KVM.
+ */
+ MSR_RANGE_ADD(MSR_K8_FS_BASE);
+
MSR_RANGE_ADD(MSR_K8_GS_BASE);
MSR_RANGE_ADD(MSR_K8_KERNEL_GS_BASE);
MSR_RANGE_ADD(MSR_K8_LSTAR);
@@ -225,6 +635,49 @@ DECLHIDDEN(int) nemR3NativeInitCompletedRing3(PVM pVM)
/** @todo add more? */
MSR_RANGE_END(64);
+ if (pVM->gim.s.enmProviderId == GIMPROVIDERID_HYPERV)
+ {
+ MSR_RANGE_BEGIN(0x40000000, 0x40003000, KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE);
+
+ MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE0_FIRST, MSR_GIM_HV_RANGE0_LAST);
+ MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE1_FIRST, MSR_GIM_HV_RANGE1_LAST);
+ MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE2_FIRST, MSR_GIM_HV_RANGE2_LAST);
+ MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE3_FIRST, MSR_GIM_HV_RANGE3_LAST);
+
+ /* SynIC / STimer */
+ MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE4_FIRST, MSR_GIM_HV_RANGE4_LAST);
+ MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE5_FIRST, MSR_GIM_HV_RANGE5_LAST);
+ MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE6_FIRST, MSR_GIM_HV_RANGE6_LAST);
+
+ MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE7_FIRST, MSR_GIM_HV_RANGE7_LAST);
+ MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE8_FIRST, MSR_GIM_HV_RANGE8_LAST);
+ MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE9_FIRST, MSR_GIM_HV_RANGE9_LAST);
+ MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE10_FIRST, MSR_GIM_HV_RANGE10_LAST);
+ MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE11_FIRST, MSR_GIM_HV_RANGE11_LAST);
+
+ /*
+ * Crash MSRs
+ *
+ * We deliberately don't add them here, so we can handle them instead of KVM. This allows us to log the
+ * crash reason into VM log instead of it ending up in the kernel's log.
+ */
+ // MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE12_FIRST, MSR_GIM_HV_RANGE12_LAST);
+
+ /*
+ * These should be available to the guest with feature bit 23 in the base features, which we don't
+ * expose. But Windows touches them anyway?
+ */
+ MSR_RANGE_ADD(0x40000114 /* HV_X64_MSR_STIME_UNHALTED_TIMER_CONFIG */);
+ MSR_RANGE_ADD(0x40000115 /* HV_X64_MSR_STIME_UNHALTED_TIMER_COUNT */);
+
+ /*
+ * These are available to the guest with feature bit 15 in the base features (undocumented).
+ */
+ MSR_RANGE_ADD(0x40000118 /* HV_X64_MSR_TSC_INVARIANT_CONTROL */);
+
+ MSR_RANGE_END(64);
+ }
+
/** @todo Specify other ranges too? Like hyper-V and KVM to make sure we get
* the MSR requests instead of KVM. */
@@ -237,6 +690,330 @@ DECLHIDDEN(int) nemR3NativeInitCompletedRing3(PVM pVM)
}
+
+/*********************************************************************************************************************************
+* Memory management *
+*********************************************************************************************************************************/
+
+VMMR3_INT_DECL(int) NEMR3LoadExec(PVM pVM)
+{
+    // TODO: This code leaves a small window between the guest sending an INIT IPI
+    // and a subsequent SIPI IPI: if the VM is saved in that window, we would need
+    // to restore the MP state `KVM_MP_STATE_INIT_RECEIVED`, which requires some
+    // serious interaction between the NEM and SSM. For now, we hope that no one
+    // suspends a VM during VCPU bringup. See vbox-engineering#426.
+ for (VMCPUID i = 0; i < pVM->cCpus; i++) {
+ PVMCPU pVCpu = pVM->apCpusR3[i];
+ auto state = VMCPU_GET_STATE(pVCpu);
+ if (state == VMCPUSTATE_STARTED || state == VMCPUSTATE_STARTED_EXEC_NEM || state == VMCPUSTATE_STARTED_EXEC_NEM_WAIT )
+ {
+ struct kvm_mp_state mp;
+ mp.mp_state = KVM_MP_STATE_RUNNABLE;
+ int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_SET_MP_STATE, &mp);
+ AssertLogRelMsgReturn(rcLnx == 0, ("NEMR3Load: Failed to set MP state. Error: %d, errno %d\n", rcLnx, errno), VERR_NEM_IPE_5);
+ }
+ }
+ return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmGetMsr(PVMCPU pVCpu, uint64_t msr, uint64_t* val)
+{
+ alignas(struct kvm_msrs) char backing[sizeof(struct kvm_msrs) + sizeof(struct kvm_msr_entry)];
+ struct kvm_msrs* msr_data {reinterpret_cast<struct kvm_msrs*>(&backing[0])};
+ RT_ZERO(backing);
+
+ msr_data->nmsrs = 1;
+ msr_data->entries[0].index = msr;
+
+ int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_GET_MSRS, msr_data);
+ AssertLogRelMsgReturn(rcLnx == 1, ("NEMR3KvmGetMsr: \
+ Failed to get MSR data. Error: %d, errno %d\n", rcLnx, errno), VERR_NOT_SUPPORTED);
+
+    AssertLogRelMsgReturn(val != nullptr, ("NEMR3KvmGetMsr: \
+                Invalid buffer\n"), VERR_NEM_IPE_5);
+
+ *val = msr_data->entries[0].data;
+
+ return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmSetMsr(PVMCPU pVCpu, uint64_t msr, uint64_t val)
+{
+ alignas(struct kvm_msrs) char backing[sizeof(struct kvm_msrs) + sizeof(struct kvm_msr_entry)];
+ struct kvm_msrs* msr_data {reinterpret_cast<struct kvm_msrs*>(&backing[0])};
+ RT_ZERO(backing);
+
+ msr_data->nmsrs = 1;
+ msr_data->entries[0].index = msr;
+ msr_data->entries[0].data = val;
+
+ int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_SET_MSRS, msr_data);
+ AssertLogRelMsgReturn(rcLnx == 1, ("NEMR3KvmSetMsr: \
+ Failed to set MSR[%lx] data. Error: %d, errno %d\n", msr, rcLnx, errno), VERR_NOT_SUPPORTED);
+
+ return VINF_SUCCESS;
+}
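A caller-side round trip through the two helpers looks like this (mirroring the MSR_IA32_SPEC_CTRL handling in nemR3NativeInitCompletedRing3 below; illustration only):

    uint64_t uSpecCtrl = 0;
    int rc = NEMR3KvmGetMsr(pVCpu, MSR_IA32_SPEC_CTRL, &uSpecCtrl);
    if (RT_SUCCESS(rc))
        rc = NEMR3KvmSetMsr(pVCpu, MSR_IA32_SPEC_CTRL, uSpecCtrl);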
+
+VMMR3_INT_DECL(int) NEMR3KvmGetLapicState(PVMCPU pVCpu, void* pXApicPage)
+{
+ struct kvm_lapic_state state;
+
+ int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_GET_LAPIC, &state);
+ AssertLogRelMsgReturn(rcLnx == 0, ("NEMR3KvmGetLapicState: \
+ Failed to get APIC state. Error: %d, errno %d\n", rcLnx, errno), VERR_NEM_IPE_5);
+
+ memcpy(pXApicPage, &state.regs[0], KVM_APIC_REG_SIZE);
+ return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmSetLapicState(PVMCPU pVCpu, void* pXApicPage)
+{
+ struct kvm_lapic_state state;
+
+ memcpy(&state.regs[0], pXApicPage, KVM_APIC_REG_SIZE);
+
+ int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_SET_LAPIC, &state);
+ AssertLogRelMsgReturn(rcLnx == 0, ("NEMR3KvmSetApicState: \
+ Failed to set APIC state. Error %d, errno %d\n", rcLnx, errno), VERR_NEM_IPE_5);
+
+ return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmSetIrqLine(PVM pVM, uint16_t u16Gsi, int iLevel)
+{
+ struct kvm_irq_level irq;
+ RT_ZERO(irq);
+
+ irq.irq = u16Gsi;
+ irq.level = iLevel;
+
+ int rcLnx = ioctl(pVM->nem.s.fdVm, KVM_IRQ_LINE, &irq);
+ AssertLogRelMsgReturn(rcLnx == 0, ("NEMR3KvmSetIrqLine: Failed to set irq line %d! error: %d, errno %d\n", u16Gsi, rcLnx, errno), VERR_NEM_IPE_5);
+
+ return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmSplitIrqchipDeliverMsi(PVM pVM, PCMSIMSG pMsi)
+{
+ AssertLogRelReturn(pVM != nullptr, VERR_INVALID_POINTER);
+ AssertLogRelReturn(pMsi != nullptr, VERR_INVALID_POINTER);
+
+ struct kvm_msi msi;
+ RT_ZERO(msi);
+ msi.address_lo = pMsi->Addr.au32[0];
+ msi.address_hi = pMsi->Addr.au32[1];
+ msi.data = pMsi->Data.u32;
+
+ int rcLnx = ioctl(pVM->nem.s.fdVm, KVM_SIGNAL_MSI, &msi);
+ AssertLogRelMsgReturn(rcLnx >= 0, ("NEMR3KvmSplitIrqchipDeliverMsi: Failed to deliver MSI! error: %d, errno %d\n", rcLnx, errno), VERR_NEM_IPE_5);
+
+ return rcLnx == 0 ? VERR_APIC_INTR_DISCARDED : VINF_SUCCESS;
+}
+
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+static int kvmSetGsiRoutingFullIrqChip(PVM pVM)
+{
+ alignas(kvm_irq_routing) char backing[ sizeof(struct kvm_irq_routing) +
+ (KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS + KVM_IRQCHIP_NUM_PIC_INTR_PINS) * sizeof(struct kvm_irq_routing_entry) ] {};
+ kvm_irq_routing* routing = reinterpret_cast<kvm_irq_routing*>(backing);
+
+ for (unsigned i = 0; i < KVM_IRQCHIP_NUM_PIC_INTR_PINS; ++i) {
+ routing->entries[i].gsi = i;
+ routing->entries[i].type = KVM_IRQ_ROUTING_IRQCHIP;
+ routing->entries[i].u.irqchip.irqchip = (i < 8) ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE;
+ routing->entries[i].u.irqchip.pin = (i < 8) ? i : (i - 8);
+ }
+
+ for (unsigned i = 0; i < KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS; ++i) {
+ uint64_t arr_idx = i + KVM_IRQCHIP_NUM_PIC_INTR_PINS;
+ routing->entries[arr_idx].gsi = i;
+ routing->entries[arr_idx].type = KVM_IRQ_ROUTING_IRQCHIP;
+ routing->entries[arr_idx].u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC;
+ if (i == 0) {
+ routing->entries[arr_idx].u.irqchip.pin = 2;
+ } else {
+ routing->entries[arr_idx].u.irqchip.pin = i;
+ }
+ }
+ routing->nr = KVM_IRQCHIP_NUM_PIC_INTR_PINS + KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS;
+
+ int rc = ioctl(pVM->nem.s.fdVm, KVM_SET_GSI_ROUTING, routing);
+
+ AssertLogRelMsgReturn(rc >= 0, ("NEM/KVM: Unable to set GSI routing! rc: %d errno %d \n", rc, errno), VERR_INTERNAL_ERROR);
+
+ return VINF_SUCCESS;
+}
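The resulting table follows the classic PC wiring, where GSI 0 (the PIT) shows up on IOAPIC pin 2; spelled out (derived from the loops above):

    GSI 0      -> PIC master pin 0, IOAPIC pin 2
    GSI 1..7   -> PIC master pins 1..7, IOAPIC pins 1..7
    GSI 8..15  -> PIC slave pins 0..7,  IOAPIC pins 8..15
    GSI 16..23 -> IOAPIC pins 16..23 only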
+
+VMMR3_INT_DECL(int) NEMR3KvmGetPicState(PVM pVM, KVMIRQCHIP irqchip, KVMPICSTATE* state)
+{
+ struct kvm_irqchip irqchip_state;
+ irqchip_state.chip_id = irqchip == KVMIRQCHIP::PIC_MASTER ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE;
+
+ if (state == nullptr) {
+ return VERR_INVALID_POINTER;
+ }
+
+ int rcLnx = ioctl(pVM->nem.s.fdVm, KVM_GET_IRQCHIP, &irqchip_state);
+ AssertLogRelMsgReturn(rcLnx == 0, ("NEMR3KvmGetPicState: \
+ Failed to get PIC state. Error: %d, errno %d\n", rcLnx, errno), VERR_NEM_IPE_5);
+
+ state->last_irr = irqchip_state.chip.pic.last_irr;
+ state->irr = irqchip_state.chip.pic.irr;
+ state->imr = irqchip_state.chip.pic.imr;
+ state->isr = irqchip_state.chip.pic.isr;
+ state->priority_add = irqchip_state.chip.pic.priority_add;
+ state->irq_base = irqchip_state.chip.pic.irq_base;
+ state->read_reg_select = irqchip_state.chip.pic.read_reg_select;
+ state->poll = irqchip_state.chip.pic.poll;
+ state->special_mask = irqchip_state.chip.pic.special_mask;
+ state->init_state = irqchip_state.chip.pic.init_state;
+ state->auto_eoi = irqchip_state.chip.pic.auto_eoi;
+ state->rotate_on_auto_eoi = irqchip_state.chip.pic.rotate_on_auto_eoi;
+ state->special_fully_nested_mode = irqchip_state.chip.pic.special_fully_nested_mode;
+ state->init4 = irqchip_state.chip.pic.init4;
+ state->elcr = irqchip_state.chip.pic.elcr;
+ state->elcr_mask = irqchip_state.chip.pic.elcr_mask;
+
+ return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmSetPicState(PVM pVM, KVMIRQCHIP irqchip, KVMPICSTATE* state)
+{
+ struct kvm_irqchip irqchip_state;
+ irqchip_state.chip_id = irqchip == KVMIRQCHIP::PIC_MASTER ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE;
+
+ if (state == nullptr) {
+ return VERR_INVALID_POINTER;
+ }
+
+ irqchip_state.chip.pic.last_irr = state->last_irr;
+ irqchip_state.chip.pic.irr = state->irr;
+ irqchip_state.chip.pic.imr = state->imr;
+ irqchip_state.chip.pic.isr = state->isr;
+ irqchip_state.chip.pic.priority_add = state->priority_add;
+ irqchip_state.chip.pic.irq_base = state->irq_base;
+ irqchip_state.chip.pic.read_reg_select = state->read_reg_select;
+ irqchip_state.chip.pic.poll = state->poll;
+ irqchip_state.chip.pic.special_mask = state->special_mask;
+ irqchip_state.chip.pic.init_state = state->init_state;
+ irqchip_state.chip.pic.auto_eoi = state->auto_eoi;
+ irqchip_state.chip.pic.rotate_on_auto_eoi = state->rotate_on_auto_eoi;
+ irqchip_state.chip.pic.special_fully_nested_mode = state->special_fully_nested_mode;
+ irqchip_state.chip.pic.init4 = state->init4;
+ irqchip_state.chip.pic.elcr = state->elcr;
+ irqchip_state.chip.pic.elcr_mask = state->elcr_mask;
+
+    int rcLnx = ioctl(pVM->nem.s.fdVm, KVM_SET_IRQCHIP, &irqchip_state);
+    AssertLogRelMsgReturn(rcLnx == 0, ("NEMR3KvmSetPicState: \
+                Failed to set PIC state. Error: %d, errno %d\n", rcLnx, errno), VERR_NEM_IPE_5);
+
+ return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmGetIoApicState(PVM pVM, KVMIOAPICSTATE* state)
+{
+ struct kvm_irqchip irqchip_state;
+ irqchip_state.chip_id = KVM_IRQCHIP_IOAPIC;
+
+ if (state == nullptr) {
+ return VERR_INVALID_POINTER;
+ }
+
+ int rcLnx = ioctl(pVM->nem.s.fdVm, KVM_GET_IRQCHIP, &irqchip_state);
+ AssertLogRelMsgReturn(rcLnx == 0, ("NEMR3KvmGetIoApicState: \
+ Failed to get IOAPIC state. Error: %d, errno %d\n", rcLnx, errno), VERR_NEM_IPE_5);
+
+ state->base_address = irqchip_state.chip.ioapic.base_address;
+ state->ioregsel = irqchip_state.chip.ioapic.ioregsel;
+ state->id = irqchip_state.chip.ioapic.id;
+ state->irr = irqchip_state.chip.ioapic.irr;
+
+ for (unsigned i = 0; i < KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS; ++i) {
+ state->redirtbl[i] = irqchip_state.chip.ioapic.redirtbl[i].bits;
+ }
+
+ return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmSetIoApicState(PVM pVM, KVMIOAPICSTATE* state)
+{
+ struct kvm_irqchip irqchip_state;
+ irqchip_state.chip_id = KVM_IRQCHIP_IOAPIC;
+
+ if (state == nullptr) {
+ return VERR_INVALID_POINTER;
+ }
+
+ irqchip_state.chip.ioapic.base_address = state->base_address;
+ irqchip_state.chip.ioapic.ioregsel = state->ioregsel;
+ irqchip_state.chip.ioapic.id = state->id;
+ irqchip_state.chip.ioapic.irr = state->irr;
+
+ for (unsigned i = 0; i < KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS; ++i) {
+ irqchip_state.chip.ioapic.redirtbl[i].bits = state->redirtbl[i];
+ }
+
+ int rcLnx = ioctl(pVM->nem.s.fdVm, KVM_SET_IRQCHIP, &irqchip_state);
+    AssertLogRelMsgReturn(rcLnx == 0, ("NEMR3KvmSetIoApicState: \
+                Failed to set IOAPIC state. Error: %d, errno %d\n", rcLnx, errno), VERR_NEM_IPE_5);
+
+ return VINF_SUCCESS;
+}
+#endif
+
+static int kvmSetGsiRouting(PVM pVM)
+{
+ alignas(kvm_irq_routing) char backing[ sizeof(struct kvm_irq_routing) + KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS * sizeof(struct kvm_irq_routing_entry) ] {};
+ kvm_irq_routing* routing = reinterpret_cast<kvm_irq_routing*>(backing);
+
+ unsigned routingCount {0};
+
+ for(unsigned i {0}; i < KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS; ++i)
+ {
+ if (pVM->nem.s.pARedirectionTable->at(i).has_value())
+ {
+ PMSIMSG msi = &(pVM->nem.s.pARedirectionTable->at(i).value());
+ routing->entries[routingCount].gsi = i;
+ routing->entries[routingCount].type = KVM_IRQ_ROUTING_MSI;
+ routing->entries[routingCount].u.msi.address_lo = msi->Addr.au32[0];
+ routing->entries[routingCount].u.msi.address_hi = msi->Addr.au32[1];
+ routing->entries[routingCount].u.msi.data = msi->Data.u32;
+ routingCount++;
+ }
+ }
+
+ routing->nr = routingCount;
+
+ int rc = ioctl(pVM->nem.s.fdVm, KVM_SET_GSI_ROUTING, routing);
+
+ AssertLogRelMsgReturn(rc >= 0, ("NEM/KVM: Unable to set GSI routing! rc: %d errno %d \n", rc, errno), VERR_INTERNAL_ERROR);
+
+ return VINF_SUCCESS;
+}
+
+
+VMMR3_INT_DECL(int) NEMR3KvmSplitIrqchipAddUpdateRTE(PVM pVM, uint16_t u16Gsi, PCMSIMSG pMsi)
+{
+ AssertRelease(pVM->nem.s.pARedirectionTable != nullptr);
+ AssertRelease(u16Gsi < KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS);
+
+ pVM->nem.s.pARedirectionTable->at(u16Gsi) = *pMsi;
+
+ return kvmSetGsiRouting(pVM);
+}
+
+
+VMMR3_INT_DECL(int) NEMR3KvmSplitIrqchipRemoveRTE(PVM pVM, uint16_t u16Gsi)
+{
+ AssertRelease(pVM->nem.s.pARedirectionTable != nullptr);
+ AssertRelease(u16Gsi < KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS);
+
+ pVM->nem.s.pARedirectionTable->at(u16Gsi) = std::nullopt;
+
+ return kvmSetGsiRouting(pVM);
+}
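An IOAPIC-side caller programs a pin by translating its redirection-table entry into an MSI message and handing it to the two functions above; a minimal sketch (the address/data values are hypothetical, the MSIMSG accessors match the code above):

    MSIMSG Msi;
    RT_ZERO(Msi);
    Msi.Addr.au32[0] = UINT32_C(0xfee00000); /* xAPIC MSI base, physical dest 0 */
    Msi.Addr.au32[1] = 0;
    Msi.Data.u32     = UINT32_C(0x51);       /* vector 0x51, fixed delivery, edge */
    NEMR3KvmSplitIrqchipAddUpdateRTE(pVM, 10 /* GSI */, &Msi);
    /* ... and when the pin gets masked again: */
    NEMR3KvmSplitIrqchipRemoveRTE(pVM, 10 /* GSI */);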
+
+
/*********************************************************************************************************************************
* CPU State *
*********************************************************************************************************************************/
@@ -379,8 +1156,7 @@ static int nemHCLnxImportState(PVMCPUCC pVCpu, uint64_t fWhat, PCPUMCTX pCtx, st
}
}
}
- if (fWhat & CPUMCTX_EXTRN_APIC_TPR)
- APICSetTpr(pVCpu, (uint8_t)pRun->s.regs.sregs.cr8 << 4);
+
if (fWhat & CPUMCTX_EXTRN_EFER)
{
if (pCtx->msrEFER != pRun->s.regs.sregs.efer)
@@ -447,6 +1223,7 @@ static int nemHCLnxImportState(PVMCPUCC pVCpu, uint64_t fWhat, PCPUMCTX pCtx, st
pCtx->aXcr[0] = Xcrs.xcrs[0].value;
pCtx->aXcr[1] = Xcrs.xcrs[1].value;
+ pCtx->fXStateMask = Xcrs.xcrs[0].value;
}
}
@@ -494,6 +1271,8 @@ static int nemHCLnxImportState(PVMCPUCC pVCpu, uint64_t fWhat, PCPUMCTX pCtx, st
if (fWhat & CPUMCTX_EXTRN_OTHER_MSRS)
{
ADD_MSR(MSR_IA32_CR_PAT, pCtx->msrPAT);
+ ADD_MSR(MSR_IA32_ARCH_CAPABILITIES, pCtxMsrs->msr.ArchCaps);
+ ADD_MSR(MSR_IA32_SPEC_CTRL, pCtxMsrs->msr.SpecCtrl);
/** @todo What do we _have_ to add here?
* We also have: Mttr*, MiscEnable, FeatureControl. */
}
@@ -531,12 +1310,6 @@ static int nemHCLnxImportState(PVMCPUCC pVCpu, uint64_t fWhat, PCPUMCTX pCtx, st
pVCpu->cpum.GstCtx.rip);
CPUMUpdateInterruptInhibitingByNmi(&pVCpu->cpum.GstCtx, KvmEvents.nmi.masked != 0);
- if (KvmEvents.interrupt.injected)
- {
- STAM_REL_COUNTER_INC(&pVCpu->nem.s.StatImportPendingInterrupt);
- TRPMAssertTrap(pVCpu, KvmEvents.interrupt.nr, !KvmEvents.interrupt.soft ? TRPM_HARDWARE_INT : TRPM_SOFTWARE_INT);
- }
-
Assert(KvmEvents.nmi.injected == 0);
Assert(KvmEvents.nmi.pending == 0);
}
@@ -647,6 +1420,13 @@ VMM_INT_DECL(int) NEMImportStateOnDemand(PVMCPUCC pVCpu, uint64_t fWhat)
*/
static int nemHCLnxExportState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx, struct kvm_run *pRun)
{
+#define NEM_UPDATE_IF_CHANGED(dst, src, dirty_flag) \
+    do { \
+        if ((src) != (dst)) { \
+            (dst) = (src); \
+            (dirty_flag) = true; \
+        } \
+    } while (0)
+
+
uint64_t const fExtrn = ~pCtx->fExtrn & CPUMCTX_EXTRN_ALL;
Assert((~fExtrn & CPUMCTX_EXTRN_ALL) != CPUMCTX_EXTRN_ALL);
@@ -655,39 +1435,53 @@ static int nemHCLnxExportState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx, struct kvm_
*/
if (fExtrn & (CPUMCTX_EXTRN_RIP | CPUMCTX_EXTRN_RFLAGS | CPUMCTX_EXTRN_GPRS_MASK))
{
- if (fExtrn & CPUMCTX_EXTRN_RIP)
- pRun->s.regs.regs.rip = pCtx->rip;
- if (fExtrn & CPUMCTX_EXTRN_RFLAGS)
- pRun->s.regs.regs.rflags = pCtx->rflags.u;
-
- if (fExtrn & CPUMCTX_EXTRN_RAX)
- pRun->s.regs.regs.rax = pCtx->rax;
- if (fExtrn & CPUMCTX_EXTRN_RCX)
- pRun->s.regs.regs.rcx = pCtx->rcx;
- if (fExtrn & CPUMCTX_EXTRN_RDX)
- pRun->s.regs.regs.rdx = pCtx->rdx;
- if (fExtrn & CPUMCTX_EXTRN_RBX)
- pRun->s.regs.regs.rbx = pCtx->rbx;
- if (fExtrn & CPUMCTX_EXTRN_RSP)
- pRun->s.regs.regs.rsp = pCtx->rsp;
- if (fExtrn & CPUMCTX_EXTRN_RBP)
- pRun->s.regs.regs.rbp = pCtx->rbp;
- if (fExtrn & CPUMCTX_EXTRN_RSI)
- pRun->s.regs.regs.rsi = pCtx->rsi;
- if (fExtrn & CPUMCTX_EXTRN_RDI)
- pRun->s.regs.regs.rdi = pCtx->rdi;
+ bool dirty_gprs {false};
+
+ if (fExtrn & CPUMCTX_EXTRN_RIP) {
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rip, pCtx->rip, dirty_gprs);
+ }
+ if (fExtrn & CPUMCTX_EXTRN_RFLAGS) {
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rflags, pCtx->rflags.u, dirty_gprs);
+ }
+
+ if (fExtrn & CPUMCTX_EXTRN_RAX) {
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rax, pCtx->rax, dirty_gprs);
+ }
+ if (fExtrn & CPUMCTX_EXTRN_RCX) {
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rcx, pCtx->rcx, dirty_gprs);
+ }
+ if (fExtrn & CPUMCTX_EXTRN_RDX) {
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rdx, pCtx->rdx, dirty_gprs);
+ }
+ if (fExtrn & CPUMCTX_EXTRN_RBX) {
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rbx, pCtx->rbx, dirty_gprs);
+ }
+ if (fExtrn & CPUMCTX_EXTRN_RSP) {
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rsp, pCtx->rsp, dirty_gprs);
+ }
+ if (fExtrn & CPUMCTX_EXTRN_RBP) {
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rbp, pCtx->rbp, dirty_gprs);
+ }
+ if (fExtrn & CPUMCTX_EXTRN_RSI) {
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rsi, pCtx->rsi, dirty_gprs);
+ }
+ if (fExtrn & CPUMCTX_EXTRN_RDI) {
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rdi, pCtx->rdi, dirty_gprs);
+ }
if (fExtrn & CPUMCTX_EXTRN_R8_R15)
{
- pRun->s.regs.regs.r8 = pCtx->r8;
- pRun->s.regs.regs.r9 = pCtx->r9;
- pRun->s.regs.regs.r10 = pCtx->r10;
- pRun->s.regs.regs.r11 = pCtx->r11;
- pRun->s.regs.regs.r12 = pCtx->r12;
- pRun->s.regs.regs.r13 = pCtx->r13;
- pRun->s.regs.regs.r14 = pCtx->r14;
- pRun->s.regs.regs.r15 = pCtx->r15;
- }
- pRun->kvm_dirty_regs |= KVM_SYNC_X86_REGS;
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.r8, pCtx->r8, dirty_gprs);
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.r9, pCtx->r9, dirty_gprs);
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.r10, pCtx->r10, dirty_gprs);
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.r11, pCtx->r11, dirty_gprs);
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.r12, pCtx->r12, dirty_gprs);
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.r13, pCtx->r13, dirty_gprs);
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.r14, pCtx->r14, dirty_gprs);
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.r15, pCtx->r15, dirty_gprs);
+ }
+ if (dirty_gprs) {
+ pRun->kvm_dirty_regs |= KVM_SYNC_X86_REGS;
+ }
}
/*
@@ -701,15 +1495,7 @@ static int nemHCLnxExportState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx, struct kvm_
| CPUMCTX_EXTRN_EFER | CPUMCTX_EXTRN_APIC_TPR))
|| uApicBase != pVCpu->nem.s.uKvmApicBase)
{
- if ((pVCpu->nem.s.uKvmApicBase ^ uApicBase) & MSR_IA32_APICBASE_EN)
- Log(("NEM/%u: APICBASE_EN changed %#010RX64 -> %#010RX64\n", pVCpu->idCpu, pVCpu->nem.s.uKvmApicBase, uApicBase));
- pRun->s.regs.sregs.apic_base = uApicBase;
- pVCpu->nem.s.uKvmApicBase = uApicBase;
-
- if (fExtrn & CPUMCTX_EXTRN_APIC_TPR)
- pRun->s.regs.sregs.cr8 = CPUMGetGuestCR8(pVCpu);
-
-#define NEM_LNX_EXPORT_SEG(a_KvmSeg, a_CtxSeg) do { \
+#define NEM_LNX_EXPORT_SEG(a_KvmSeg, a_CtxSeg, dirty_flag) do { \
(a_KvmSeg).base = (a_CtxSeg).u64Base; \
(a_KvmSeg).limit = (a_CtxSeg).u32Limit; \
(a_KvmSeg).selector = (a_CtxSeg).Sel; \
@@ -723,64 +1509,123 @@ static int nemHCLnxExportState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx, struct kvm_
(a_KvmSeg).g = (a_CtxSeg).Attr.n.u1Granularity; \
(a_KvmSeg).unusable = (a_CtxSeg).Attr.n.u1Unusable; \
(a_KvmSeg).padding = 0; \
+ dirty_flag = true; \
} while (0)
+#define NEM_LNX_SREG_IDENTICAL(a_KvmSeg, a_CtxSeg) ( \
+ (a_KvmSeg).base == (a_CtxSeg).u64Base && \
+ (a_KvmSeg).limit == (a_CtxSeg).u32Limit && \
+ (a_KvmSeg).selector == (a_CtxSeg).Sel && \
+ (a_KvmSeg).type == (a_CtxSeg).Attr.n.u4Type && \
+ (a_KvmSeg).s == (a_CtxSeg).Attr.n.u1DescType && \
+ (a_KvmSeg).dpl == (a_CtxSeg).Attr.n.u2Dpl && \
+ (a_KvmSeg).present == (a_CtxSeg).Attr.n.u1Present && \
+ (a_KvmSeg).avl == (a_CtxSeg).Attr.n.u1Available && \
+ (a_KvmSeg).l == (a_CtxSeg).Attr.n.u1Long && \
+ (a_KvmSeg).db == (a_CtxSeg).Attr.n.u1DefBig && \
+ (a_KvmSeg).g == (a_CtxSeg).Attr.n.u1Granularity && \
+ (a_KvmSeg).unusable == (a_CtxSeg).Attr.n.u1Unusable \
+ )
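+ // Note: a field-wise comparison is used instead of memcmp() because struct
+ // kvm_segment contains padding bytes and the VBox attributes live in
+ // bitfields that have to be unpacked anyway.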
+ bool dirty_sregs = false;
+
+ if ((pVCpu->nem.s.uKvmApicBase ^ uApicBase) & MSR_IA32_APICBASE_EN)
+ Log(("NEM/%u: APICBASE_EN changed %#010RX64 -> %#010RX64\n", pVCpu->idCpu, pVCpu->nem.s.uKvmApicBase, uApicBase));
+
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.apic_base, uApicBase, dirty_sregs);
+ NEM_UPDATE_IF_CHANGED(pVCpu->nem.s.uKvmApicBase, uApicBase, dirty_sregs);
if (fExtrn & CPUMCTX_EXTRN_SREG_MASK)
{
- if (fExtrn & CPUMCTX_EXTRN_ES)
- NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.es, pCtx->es);
- if (fExtrn & CPUMCTX_EXTRN_CS)
- NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.cs, pCtx->cs);
- if (fExtrn & CPUMCTX_EXTRN_SS)
- NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.ss, pCtx->ss);
- if (fExtrn & CPUMCTX_EXTRN_DS)
- NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.ds, pCtx->ds);
- if (fExtrn & CPUMCTX_EXTRN_FS)
- NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.fs, pCtx->fs);
- if (fExtrn & CPUMCTX_EXTRN_GS)
- NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.gs, pCtx->gs);
+ if (fExtrn & CPUMCTX_EXTRN_ES and not NEM_LNX_SREG_IDENTICAL(pRun->s.regs.sregs.es, pCtx->es)) {
+ NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.es, pCtx->es, dirty_sregs);
+ }
+ if (fExtrn & CPUMCTX_EXTRN_CS and not NEM_LNX_SREG_IDENTICAL(pRun->s.regs.sregs.cs, pCtx->cs)) {
+ NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.cs, pCtx->cs, dirty_sregs);
+ }
+ if (fExtrn & CPUMCTX_EXTRN_SS and not NEM_LNX_SREG_IDENTICAL(pRun->s.regs.sregs.ss, pCtx->ss)) {
+ NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.ss, pCtx->ss, dirty_sregs);
+ }
+ if (fExtrn & CPUMCTX_EXTRN_DS and not NEM_LNX_SREG_IDENTICAL(pRun->s.regs.sregs.ds, pCtx->ds)) {
+ NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.ds, pCtx->ds, dirty_sregs);
+ }
+ if (fExtrn & CPUMCTX_EXTRN_FS and not NEM_LNX_SREG_IDENTICAL(pRun->s.regs.sregs.fs, pCtx->fs)) {
+ NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.fs, pCtx->fs, dirty_sregs);
+ }
+ if (fExtrn & CPUMCTX_EXTRN_GS and not NEM_LNX_SREG_IDENTICAL(pRun->s.regs.sregs.gs, pCtx->gs)) {
+ NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.gs, pCtx->gs, dirty_sregs);
+ }
+
}
if (fExtrn & CPUMCTX_EXTRN_TABLE_MASK)
{
if (fExtrn & CPUMCTX_EXTRN_GDTR)
{
- pRun->s.regs.sregs.gdt.base = pCtx->gdtr.pGdt;
- pRun->s.regs.sregs.gdt.limit = pCtx->gdtr.cbGdt;
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.gdt.base, pCtx->gdtr.pGdt, dirty_sregs);
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.gdt.limit, pCtx->gdtr.cbGdt, dirty_sregs);
pRun->s.regs.sregs.gdt.padding[0] = 0;
pRun->s.regs.sregs.gdt.padding[1] = 0;
pRun->s.regs.sregs.gdt.padding[2] = 0;
}
if (fExtrn & CPUMCTX_EXTRN_IDTR)
{
- pRun->s.regs.sregs.idt.base = pCtx->idtr.pIdt;
- pRun->s.regs.sregs.idt.limit = pCtx->idtr.cbIdt;
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.idt.base, pCtx->idtr.pIdt, dirty_sregs);
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.idt.limit, pCtx->idtr.cbIdt, dirty_sregs);
pRun->s.regs.sregs.idt.padding[0] = 0;
pRun->s.regs.sregs.idt.padding[1] = 0;
pRun->s.regs.sregs.idt.padding[2] = 0;
}
- if (fExtrn & CPUMCTX_EXTRN_LDTR)
- NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.ldt, pCtx->ldtr);
- if (fExtrn & CPUMCTX_EXTRN_TR)
- NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.tr, pCtx->tr);
+ if (fExtrn & CPUMCTX_EXTRN_LDTR and not NEM_LNX_SREG_IDENTICAL(pRun->s.regs.sregs.ldt, pCtx->ldtr)) {
+ NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.ldt, pCtx->ldtr, dirty_sregs);
+ }
+ if (fExtrn & CPUMCTX_EXTRN_TR and not NEM_LNX_SREG_IDENTICAL(pRun->s.regs.sregs.tr, pCtx->tr)) {
+ NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.tr, pCtx->tr, dirty_sregs);
+ }
+
}
if (fExtrn & CPUMCTX_EXTRN_CR_MASK)
{
- if (fExtrn & CPUMCTX_EXTRN_CR0)
- pRun->s.regs.sregs.cr0 = pCtx->cr0;
- if (fExtrn & CPUMCTX_EXTRN_CR2)
- pRun->s.regs.sregs.cr2 = pCtx->cr2;
- if (fExtrn & CPUMCTX_EXTRN_CR3)
- pRun->s.regs.sregs.cr3 = pCtx->cr3;
- if (fExtrn & CPUMCTX_EXTRN_CR4)
- pRun->s.regs.sregs.cr4 = pCtx->cr4;
+ if (fExtrn & CPUMCTX_EXTRN_CR0) {
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.cr0, pCtx->cr0, dirty_sregs);
+ }
+ if (fExtrn & CPUMCTX_EXTRN_CR2) {
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.cr2, pCtx->cr2, dirty_sregs);
+ }
+ if (fExtrn & CPUMCTX_EXTRN_CR3) {
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.cr3, pCtx->cr3, dirty_sregs);
+ }
+ if (fExtrn & CPUMCTX_EXTRN_CR4) {
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.cr4, pCtx->cr4, dirty_sregs);
+ }
}
- if (fExtrn & CPUMCTX_EXTRN_EFER)
- pRun->s.regs.sregs.efer = pCtx->msrEFER;
+ if (fExtrn & CPUMCTX_EXTRN_EFER) {
+ NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.efer, pCtx->msrEFER, dirty_sregs);
+ }
+
- RT_ZERO(pRun->s.regs.sregs.interrupt_bitmap); /* this is an alternative interrupt injection interface */
+ if (dirty_sregs) {
+ pRun->kvm_dirty_regs |= KVM_SYNC_X86_SREGS;
+ } else {
+ // This is a poorly documented corner of the kvm_run structure.
+ // https://www.kernel.org/doc/html/latest/virt/kvm/api.html describes it as follows:
+ //
+ //   interrupt_bitmap is a bitmap of pending external interrupts. At most one bit may be set.
+ //   This interrupt has been acknowledged by the APIC but not yet injected into the cpu core.
+ //
+ // Looking at the kernel side of SET/GET_SREGS, this is mostly but not quite accurate:
+ // the kernel sets at most one bit, but never clears any. To keep at most one bit set,
+ // userspace must clear the bitmap iff we haven't modified any SREGS. If we have modified
+ // SREGS, we have to transfer the unmodified bitmap back to KVM, because otherwise we
+ // would tell KVM that the injection is no longer pending.
+ //
+ // This is a nasty interface; we should probably do what Qemu does and use
+ // KVM_SET/GET_SREGS2, where this field is no longer present.
+ RT_ZERO(pRun->s.regs.sregs.interrupt_bitmap);
+ }
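+ // A rough sketch of the SREGS2 alternative mentioned above (illustrative
+ // only; assumes the host kernel offers KVM_CAP_SREGS2, where struct
+ // kvm_sregs2 drops the interrupt_bitmap field entirely):
+ //     struct kvm_sregs2 Sregs2;
+ //     ioctl(pVCpu->nem.s.fdVCpu, KVM_GET_SREGS2, &Sregs2);
+ //     /* ...update segments/CRs/EFER... */
+ //     ioctl(pVCpu->nem.s.fdVCpu, KVM_SET_SREGS2, &Sregs2);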
- pRun->kvm_dirty_regs |= KVM_SYNC_X86_SREGS;
}
+#undef NEM_LNX_EXPORT_SEG
+#undef NEM_LNX_SREG_IDENTICAL
+#undef NEM_UPDATE_IF_CHANGED
/*
* Debug registers.
@@ -886,6 +1731,8 @@ static int nemHCLnxExportState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx, struct kvm_
if (fExtrn & CPUMCTX_EXTRN_OTHER_MSRS)
{
ADD_MSR(MSR_IA32_CR_PAT, pCtx->msrPAT);
+ ADD_MSR(MSR_IA32_ARCH_CAPABILITIES, pCtxMsrs->msr.ArchCaps);
+ ADD_MSR(MSR_IA32_SPEC_CTRL, pCtxMsrs->msr.SpecCtrl);
/** @todo What do we _have_ to add here?
* We also have: Mtrr*, MiscEnable, FeatureControl. */
}
@@ -912,6 +1759,8 @@ static int nemHCLnxExportState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx, struct kvm_
== (CPUMCTX_EXTRN_INHIBIT_INT | CPUMCTX_EXTRN_INHIBIT_NMI));
struct kvm_vcpu_events KvmEvents = {0};
+ int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_GET_VCPU_EVENTS, &KvmEvents);
+ AssertLogRelMsgReturn(rcLnx == 0, ("rcLnx=%d errno=%d\n", rcLnx, errno), VERR_NEM_IPE_5);
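+ // Read-modify-write: fetch the current event state first so that fields
+ // this code doesn't manage (e.g. pending exceptions) are preserved when
+ // the structure is written back below.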
KvmEvents.flags = KVM_VCPUEVENT_VALID_SHADOW;
if (!CPUMIsInInterruptShadowWithUpdate(&pVCpu->cpum.GstCtx))
@@ -923,26 +1772,7 @@ static int nemHCLnxExportState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx, struct kvm_
/* No flag - this is updated unconditionally. */
KvmEvents.nmi.masked = CPUMAreInterruptsInhibitedByNmi(&pVCpu->cpum.GstCtx);
- if (TRPMHasTrap(pVCpu))
- {
- TRPMEVENT enmType = TRPM_32BIT_HACK;
- uint8_t bTrapNo = 0;
- TRPMQueryTrap(pVCpu, &bTrapNo, &enmType);
- Log(("nemHCLnxExportState: Pending trap: bTrapNo=%#x enmType=%d\n", bTrapNo, enmType));
- if ( enmType == TRPM_HARDWARE_INT
- || enmType == TRPM_SOFTWARE_INT)
- {
- KvmEvents.interrupt.soft = enmType == TRPM_SOFTWARE_INT;
- KvmEvents.interrupt.nr = bTrapNo;
- KvmEvents.interrupt.injected = 1;
- STAM_REL_COUNTER_INC(&pVCpu->nem.s.StatExportPendingInterrupt);
- TRPMResetTrap(pVCpu);
- }
- else
- AssertFailed();
- }
-
- int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_SET_VCPU_EVENTS, &KvmEvents);
+ rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_SET_VCPU_EVENTS, &KvmEvents);
AssertLogRelMsgReturn(rcLnx == 0, ("rcLnx=%d errno=%d\n", rcLnx, errno), VERR_NEM_IPE_3);
}
@@ -967,8 +1797,31 @@ static int nemHCLnxExportState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx, struct kvm_
VMM_INT_DECL(int) NEMHCQueryCpuTick(PVMCPUCC pVCpu, uint64_t *pcTicks, uint32_t *puAux)
{
STAM_REL_COUNTER_INC(&pVCpu->nem.s.StatQueryCpuTick);
- // KVM_GET_CLOCK?
- RT_NOREF(pVCpu, pcTicks, puAux);
+
+ // This function is called when the VM is paused or
+ // suspended. It's called for all vCPUs.
+
+ const size_t NMSRS = 2;
+
+ size_t szReq = RT_UOFFSETOF_DYN(struct kvm_msrs, entries[NMSRS]);
+ struct kvm_msrs *pReq = static_cast<kvm_msrs *>(alloca(szReq));
+ memset(pReq, 0, szReq);
+
+ pReq->nmsrs = NMSRS;
+ pReq->entries[0].index = MSR_IA32_TSC;
+ pReq->entries[1].index = MSR_K8_TSC_AUX;
+
+ int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_GET_MSRS, pReq);
+ AssertLogRelMsgReturn(rcLnx == NMSRS, ("rcLnx=%d errno=%d\n", rcLnx, errno), VERR_NEM_IPE_5);
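+ // Note: on success, KVM_GET_MSRS returns the number of MSRs read rather
+ // than 0, hence the comparison against NMSRS.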
+
+ if (pcTicks) {
+ *pcTicks = pReq->entries[0].data;
+ }
+
+ if (puAux) {
+ *puAux = static_cast<uint32_t>(pReq->entries[1].data);
+ }
+
return VINF_SUCCESS;
}
@@ -985,8 +1838,39 @@ VMM_INT_DECL(int) NEMHCQueryCpuTick(PVMCPUCC pVCpu, uint64_t *pcTicks, uint32_t
*/
VMM_INT_DECL(int) NEMHCResumeCpuTickOnAll(PVMCC pVM, PVMCPUCC pVCpu, uint64_t uPausedTscValue)
{
- // KVM_SET_CLOCK?
- RT_NOREF(pVM, pVCpu, uPausedTscValue);
+ RT_NOREF(pVCpu);
+
+ // This function is called once during unpause or resume. Despite
+ // the pVCpu parameter it is _not_ called for all vCPUs.
+
+ const size_t NMSRS = 1;
+
+ size_t szReq = RT_UOFFSETOF_DYN(struct kvm_msrs, entries[NMSRS]);
+ struct kvm_msrs *pReq = static_cast<kvm_msrs *>(alloca(szReq));
+ memset(pReq, 0, szReq);
+
+ pReq->nmsrs = NMSRS;
+ pReq->entries[0].index = MSR_IA32_TSC;
+ pReq->entries[0].data = uPausedTscValue;
+
+ // Setting the individual TSC values of all CPUs is fundamentally
+ // flawed, because the TSCs keep ticking while we set them. That
+ // means that we never really end up with synchronized TSC values
+ // unless KVM's built-in TSC synchronization magic fixes things up
+ // for us. But the interface doesn't leave us a lot of choice here
+ // for now.
+ //
+ // A better approach would be to use KVM_GET_CLOCK/KVM_SET_CLOCK
+ // and restore TSC_ADJUST values. We should first validate that this
+ // does the right thing, though.
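+ //
+ // A sketch of that approach (illustrative only; field names as per the
+ // KVM API documentation for KVM_SET_CLOCK):
+ //     struct kvm_clock_data Clock = {};
+ //     Clock.clock = /* paused kvmclock value */;
+ //     ioctl(pVM->nem.s.fdVm, KVM_SET_CLOCK, &Clock);
+ // i.e. one VM-wide call instead of one MSR write per vCPU.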
+ for (VMCPUID idCpu = 0; idCpu < pVM->cCpus; idCpu++)
+ {
+ PVMCPU pVCpuCur = pVM->apCpusR3[idCpu];
+
+ int rcLnx = ioctl(pVCpuCur->nem.s.fdVCpu, KVM_SET_MSRS, pReq);
+ AssertLogRelMsgReturn(rcLnx == NMSRS, ("rcLnx=%d errno=%d\n", rcLnx, errno), VERR_NEM_IPE_5);
+ }
+
return VINF_SUCCESS;
}
@@ -1008,6 +1892,7 @@ VMM_INT_DECL(uint32_t) NEMHCGetFeatures(PVMCC pVM)
VMMR3_INT_DECL(bool) NEMR3CanExecuteGuest(PVM pVM, PVMCPU pVCpu)
{
+#ifndef VBOX_WITH_KVM_IRQCHIP_FULL
/*
* Only execute when the A20 gate is enabled as I cannot immediately
* spot any A20 support in KVM.
@@ -1015,6 +1900,15 @@ VMMR3_INT_DECL(bool) NEMR3CanExecuteGuest(PVM pVM, PVMCPU pVCpu)
RT_NOREF(pVM);
Assert(VM_IS_NEM_ENABLED(pVM));
return PGMPhysIsA20Enabled(pVCpu);
+#else
+ /*
+ * In full-irqchip mode, we must always execute via KVM because we have no
+ * other way to inject interrupts into the guest (the PIC lives in the
+ * kernel!); anything else would break non-UEFI boot. Ignoring the A20 gate
+ * state here does break DOS support, though.
+ */
+ return true;
+#endif
}
@@ -1027,6 +1921,14 @@ DECLHIDDEN(bool) nemR3NativeSetSingleInstruction(PVM pVM, PVMCPU pVCpu, bool fEn
DECLHIDDEN(void) nemR3NativeNotifyFF(PVM pVM, PVMCPU pVCpu, uint32_t fFlags)
{
+ if (pVCpu->hThread == RTThreadSelf()) {
+ // RTThreadPoke doesn't like poking the current thread. We can
+ // safely return here because the vCPU thread is currently handling
+ // an exit and will check all conditions again when we re-enter
+ // the run-loop.
+ return;
+ }
+
int rc = RTThreadPoke(pVCpu->hThread);
LogFlow(("nemR3NativeNotifyFF: #%u -> %Rrc\n", pVCpu->idCpu, rc));
AssertRC(rc);
@@ -1060,12 +1962,10 @@ static VBOXSTRICTRC nemHCLnxHandleInterruptFF(PVM pVM, PVMCPU pVCpu, struct kvm_
* only inject one event per KVM_RUN call. This can only happen if we
* come directly from the loop in EM, so the inhibit bits must be internal.
*/
- if (!TRPMHasTrap(pVCpu))
- { /* semi likely */ }
- else
+ if (TRPMHasTrap(pVCpu))
{
- Assert(!(pVCpu->cpum.GstCtx.fExtrn & (CPUMCTX_EXTRN_INHIBIT_INT | CPUMCTX_EXTRN_INHIBIT_NMI)));
Log8(("nemHCLnxHandleInterruptFF: TRPM has an pending event already\n"));
+
return VINF_SUCCESS;
}
@@ -1074,12 +1974,12 @@ static VBOXSTRICTRC nemHCLnxHandleInterruptFF(PVM pVM, PVMCPU pVCpu, struct kvm_
*/
if (VMCPU_FF_TEST_AND_CLEAR(pVCpu, VMCPU_FF_UPDATE_APIC))
{
- PDMApicUpdatePendingInterrupts(pVCpu);
- if (!VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC
- | VMCPU_FF_INTERRUPT_NMI | VMCPU_FF_INTERRUPT_SMI))
- return VINF_SUCCESS;
+ AssertLogRelMsgReturn(false, ("VMCPU_FF_UPDATE_APIC is set"), VERR_NEM_IPE_5);
}
+ if (!VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_PIC | VMCPU_FF_INTERRUPT_NMI | VMCPU_FF_INTERRUPT_SMI))
+ return VINF_SUCCESS;
+
/*
* We don't currently implement SMIs.
*/
@@ -1135,35 +2035,24 @@ static VBOXSTRICTRC nemHCLnxHandleInterruptFF(PVM pVM, PVMCPU pVCpu, struct kvm_
Log8(("Queuing NMI on %u\n", pVCpu->idCpu));
}
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+ AssertLogRelMsg(!VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_PIC), ("PDM has a PIC interrupt but full irqchip is enabled"));
+#else
/*
- * APIC or PIC interrupt?
+ * PIC interrupt?
*/
- if (VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC))
+ if (VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_PIC))
{
if (pRun->s.regs.regs.rflags & X86_EFL_IF)
{
- if (KvmEvents.interrupt.shadow == 0)
+ if (pRun->ready_for_interrupt_injection)
{
- /*
- * If CR8 is in KVM, update the VBox copy so PDMGetInterrupt will
- * work correctly.
- */
- if (pVCpu->cpum.GstCtx.fExtrn & CPUMCTX_EXTRN_APIC_TPR)
- PDMApicSetTpr(pVCpu, (uint8_t)pRun->cr8 << 4);
-
uint8_t bInterrupt;
int rc = PDMGetInterrupt(pVCpu, &bInterrupt);
if (RT_SUCCESS(rc))
{
- Assert(KvmEvents.interrupt.injected == false);
-#if 0
- int rcLnx = ioctl(pVCpu->nem.s.fdVm, KVM_INTERRUPT, (unsigned long)bInterrupt);
- AssertLogRelMsgReturn(rcLnx == 0, ("rcLnx=%d errno=%d\n", rcLnx, errno), VERR_NEM_IPE_5);
-#else
- KvmEvents.interrupt.nr = bInterrupt;
- KvmEvents.interrupt.soft = false;
- KvmEvents.interrupt.injected = true;
-#endif
+ TRPMAssertTrap(pVCpu, bInterrupt, TRPM_HARDWARE_INT);
+
Log8(("Queuing interrupt %#x on %u: %04x:%08RX64 efl=%#x\n", bInterrupt, pVCpu->idCpu,
pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, pVCpu->cpum.GstCtx.eflags.u));
}
@@ -1184,7 +2073,7 @@ static VBOXSTRICTRC nemHCLnxHandleInterruptFF(PVM pVM, PVMCPU pVCpu, struct kvm_
Log8(("Interrupt window pending on %u (#1)\n", pVCpu->idCpu));
}
}
-
+#endif
/*
* Now, update the state.
*/
@@ -1371,6 +2260,16 @@ static VBOXSTRICTRC nemHCLnxHandleExitMmio(PVMCC pVM, PVMCPUCC pVCpu, struct kvm
VBOXSTRICTRC rcStrict;
if (pRun->mmio.is_write)
{
+ /*
+ * Keep KVM's cr8 copy in sync with guest writes to the LAPIC TPR. This is
+ * required as long as we don't use KVM's IRQCHIP feature.
+ *
+ * This doesn't cover x2APIC mode, but the whole CR8 code will be gone soon
+ * anyway once we switch to KVM's split-irqchip.
+ */
+ if (pRun->mmio.phys_addr == XAPIC_TPR_ADDR) {
+ pRun->cr8 = *pRun->mmio.data >> LAPIC_TPR_SHIFT;
+ }
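+ // (Architecturally, CR8 holds TPR bits 7:4, hence the LAPIC_TPR_SHIFT
+ // applied to the value the guest wrote.)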
rcStrict = PGMPhysWrite(pVM, pRun->mmio.phys_addr, pRun->mmio.data, pRun->mmio.len, PGMACCESSORIGIN_HM);
Log4(("MmioExit/%u: %04x:%08RX64: WRITE %#x LB %u, %.*Rhxs -> rcStrict=%Rrc\n",
pVCpu->idCpu, pRun->s.regs.sregs.cs.selector, pRun->s.regs.regs.rip,
@@ -1470,8 +2369,6 @@ static VBOXSTRICTRC nemHCLnxHandleExitWrMsr(PVMCPUCC pVCpu, struct kvm_run *pRun
return rcStrict;
}
-
-
static VBOXSTRICTRC nemHCLnxHandleExit(PVMCC pVM, PVMCPUCC pVCpu, struct kvm_run *pRun, bool *pfStatefulExit)
{
STAM_REL_COUNTER_INC(&pVCpu->nem.s.StatExitTotal);
@@ -1500,12 +2397,10 @@ static VBOXSTRICTRC nemHCLnxHandleExit(PVMCC pVM, PVMCPUCC pVCpu, struct kvm_run
return VINF_SUCCESS;
case KVM_EXIT_SET_TPR:
- STAM_REL_COUNTER_INC(&pVCpu->nem.s.StatExitSetTpr);
AssertFailed();
break;
case KVM_EXIT_TPR_ACCESS:
- STAM_REL_COUNTER_INC(&pVCpu->nem.s.StatExitTprAccess);
AssertFailed();
break;
@@ -1531,6 +2426,10 @@ static VBOXSTRICTRC nemHCLnxHandleExit(PVMCC pVM, PVMCPUCC pVCpu, struct kvm_run
pRun->s.regs.regs.rip + pRun->s.regs.sregs.cs.base, ASMReadTSC());
STAM_REL_COUNTER_INC(&pVCpu->nem.s.StatExitIntr);
Log5(("Intr/%u\n", pVCpu->idCpu));
+
+ /* If we don't consume the poke signal, subsequent KVM_RUN invocations will immediately return EINTR again. */
+ nemR3LnxConsumePokeSignal();
+
return VINF_SUCCESS;
case KVM_EXIT_HYPERCALL:
@@ -1547,11 +2446,48 @@ static VBOXSTRICTRC nemHCLnxHandleExit(PVMCC pVM, PVMCPUCC pVCpu, struct kvm_run
AssertFailed();
break;
case KVM_EXIT_IOAPIC_EOI:
- AssertFailed();
- break;
+ PDMIoApicBroadcastEoi(pVM, pRun->eoi.vector);
+ return VINF_SUCCESS;
case KVM_EXIT_HYPERV:
- AssertFailed();
- break;
+ Assert(pVM->gim.s.enmProviderId == GIMPROVIDERID_HYPERV);
+
+ switch (pRun->hyperv.type)
+ {
+ case KVM_EXIT_HYPERV_SYNDBG:
+ /* The synthetic debugger is not enabled and we should not get these exits. */
+ AssertFailed();
+ break;
+ case KVM_EXIT_HYPERV_HCALL:
+ LogRel2(("Hyper-V hcall input:%lx p0:%lx p1:%lx\n", pRun->hyperv.u.hcall.input, pRun->hyperv.u.hcall.params[0], pRun->hyperv.u.hcall.params[1]));
+
+ /* TODO KVM handles the performance-critical hypercalls on its own. We get mostly extended hypercalls
+ here. We would need to forward them to gimHvHypercall. None of these features are enabled right now,
+ so we can just deny the hypercall right away. */
+
+ pRun->hyperv.u.hcall.result = GIM_HV_STATUS_ACCESS_DENIED;
+ break;
+ case KVM_EXIT_HYPERV_SYNIC:
+ LogRel2(("HyperV synic msr:%lx control:%lx evt_page:%lx msg_page:%lx\n",
+ pRun->hyperv.u.synic.msr,
+ pRun->hyperv.u.synic.control,
+ pRun->hyperv.u.synic.evt_page,
+ pRun->hyperv.u.synic.msg_page));
+
+ switch (pRun->hyperv.u.synic.msr)
+ {
+ case MSR_GIM_HV_SCONTROL:
+ case MSR_GIM_HV_SIMP:
+ case MSR_GIM_HV_SIEFP:
+ break;
+ default:
+ AssertReleaseFailed();
+ }
+ break;
+ default:
+ AssertReleaseFailed();
+ }
+
+ return VINF_SUCCESS;
case KVM_EXIT_DIRTY_RING_FULL:
AssertFailed();
@@ -1619,6 +2555,82 @@ static VBOXSTRICTRC nemHCLnxHandleExit(PVMCC pVM, PVMCPUCC pVCpu, struct kvm_run
return VERR_NOT_IMPLEMENTED;
}
+static VBOXSTRICTRC nemHCLnxHandleTimers(PVMCC pVM, PVMCPUCC pVCpu)
+{
+ uint64_t nsAbsNextTimerEvt;
+ uint64_t uTscNow;
+ uint64_t nsDelta = TMVirtualSyncGetNsToDeadline(pVM, &nsAbsNextTimerEvt, &uTscNow);
+
+ [[maybe_unused]] uint64_t const nsAbsOldTimerEvt = pVCpu->nem.s.nsAbsNextTimerEvt;
+
+ pVCpu->nem.s.nsAbsNextTimerEvt = nsAbsNextTimerEvt;
+
+ /*
+ * With this optimization we only reprogram the timer when the deadline has changed.
+ * We can enable it once we are confident that everything works correctly.
+ */
+#ifdef VBOX_KVM_DONT_REPROGRAM_TIMERS
+ if (nsAbsOldTimerEvt == nsAbsNextTimerEvt) {
+ return VINF_SUCCESS;
+ }
+#endif
+
+ if (nsDelta == 0) {
+ /* If there is no timeout, program a catch-all timer instead. */
+ nsDelta = RT_NS_1MS_64;
+ } else if (nsDelta >= RT_NS_1SEC_64) {
+ /* Cap the timeout so that we exit at least once every second. */
+ nsDelta = RT_NS_1SEC_64;
+ }
+
+ struct itimerspec timeout {};
+
+ /*
+ * It would be nice to program absolute timeouts here instead for better accuracy, but VBox times do not correlate
+ * to any Linux timer.
+ */
+ timeout.it_value.tv_sec = nsDelta / RT_NS_1SEC_64;
+ timeout.it_value.tv_nsec = nsDelta % RT_NS_1SEC_64;
+
+ int rcTimer = timer_settime(pVCpu->nem.s.pTimer, 0 /* relative timeout */,
+ &timeout, nullptr);
+ AssertLogRel(rcTimer == 0);
+
+ return VINF_SUCCESS;
+}
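+// Note: pVCpu->nem.s.pTimer is the per-vCPU POSIX timer created in
+// nemR3LnxFixThreadPoke. When it fires, it delivers the poke signal to the
+// vCPU thread, kicking it out of KVM_RUN with EINTR (KVM_EXIT_INTR) so the
+// run loop re-evaluates timers and pending work.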
+
+static VBOXSTRICTRC nemHCLnxCheckAndInjectInterrupts(PVMCPUCC pVCpu)
+{
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+ NOREF(pVCpu);
+ AssertLogRelMsg(!TRPMHasTrap(pVCpu), ("TRPM has a trap but full irqchip is enabled"));
+ return VINF_SUCCESS;
+#else
+ if (TRPMHasTrap(pVCpu))
+ {
+ TRPMEVENT enmType = TRPM_32BIT_HACK;
+ uint8_t bTrapNo = 0;
+ TRPMQueryTrap(pVCpu, &bTrapNo, &enmType);
+ Log(("nemHCLnxCheckAndInjectInterrupts: Pending trap: bTrapNo=%#x enmType=%d\n", bTrapNo, enmType));
+ if (enmType == TRPM_HARDWARE_INT)
+ {
+ struct kvm_interrupt kvm_int;
+ RT_ZERO(kvm_int);
+ kvm_int.irq = bTrapNo;
+ int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_INTERRUPT, &kvm_int);
+ AssertLogRelMsgReturn(rcLnx == 0, ("rcLnx=%d errno=%d\n", rcLnx, errno), VERR_NEM_IPE_5);
+
+ TRPMResetTrap(pVCpu);
+ }
+ else
+ {
+ return VERR_NOT_SUPPORTED;
+ }
+
+ }
+ return VINF_SUCCESS;
+#endif
+}
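+// Note: KVM_INTERRUPT is only usable without an in-kernel (local) APIC, and
+// the vector must be injectable right away; the run loop therefore only ends
+// up here after nemHCLnxHandleInterruptFF has checked RFLAGS.IF and
+// pRun->ready_for_interrupt_injection before asserting the TRPM trap.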
VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
{
@@ -1636,6 +2648,28 @@ VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
return VINF_SUCCESS;
}
+ /*
+ * The first time we come here, we have to apply Spectre mitigations. The prctl
+ * interface only allows us to set these for the current thread.
+ */
+ if (!pVCpu->nem.s.fMitigationsApplied) {
+ Log(("NEM/%u: applying mitigations\n", pVCpu->idCpu));
+ if (pVM->hm.s.fIbpbOnVmEntry || pVM->hm.s.fIbpbOnVmExit) {
+ int rcLnx = prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0);
+
+ if (rcLnx != 0 && errno == EPERM) {
+ LogRel(("WARNING: requested IBPB, but kernel API is not activated! Boot Linux with spectre_v2_user=prctl.\n", pVCpu->idCpu));
+ } else {
+ AssertLogRelMsgReturn(rcLnx == 0,
+ ("rcLnx=%d errno=%d\n", rcLnx, errno),
+ VERR_NEM_MISSING_KERNEL_API_1);
+ Log(("NEM/%u: enabled IBPB\n", pVCpu->idCpu));
+ }
+ }
+
+ pVCpu->nem.s.fMitigationsApplied = true;
+ }
+
/*
* The run loop.
*/
@@ -1664,6 +2698,8 @@ VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
}
}
+ // See NEMR3CanExecuteGuest for details on why we ignore the A20 state at this point.
+#ifndef VBOX_WITH_KVM_IRQCHIP_FULL
/*
* Do not execute in KVM if the A20 isn't enabled.
*/
@@ -1675,6 +2711,7 @@ VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
LogFlow(("NEM/%u: breaking: A20 disabled\n", pVCpu->idCpu));
break;
}
+#endif
/*
* Ensure KVM has the whole state.
@@ -1685,17 +2722,9 @@ VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
AssertRCReturn(rc2, rc2);
}
- /*
- * Poll timers and run for a bit.
- *
- * With the VID approach (ring-0 or ring-3) we can specify a timeout here,
- * so we take the time of the next timer event and uses that as a deadline.
- * The rounding heuristics are "tuned" so that rhel5 (1K timer) will boot fine.
- */
- /** @todo See if we cannot optimize this TMTimerPollGIP by only redoing
- * the whole polling job when timers have changed... */
- uint64_t offDeltaIgnored;
- uint64_t const nsNextTimerEvt = TMTimerPollGIP(pVM, pVCpu, &offDeltaIgnored); NOREF(nsNextTimerEvt);
+ /* Poll timers and run for a bit. */
+ nemHCLnxHandleTimers(pVM, pVCpu);
+
if ( !VM_FF_IS_ANY_SET(pVM, VM_FF_EMT_RENDEZVOUS | VM_FF_TM_VIRTUAL_SYNC)
&& !VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_HM_TO_R3_MASK))
{
@@ -1705,13 +2734,25 @@ VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
pVCpu->idCpu, pRun->s.regs.sregs.cs.selector, pRun->s.regs.regs.rip,
!!(pRun->s.regs.regs.rflags & X86_EFL_IF), pRun->s.regs.regs.rflags,
pRun->s.regs.sregs.ss.selector, pRun->s.regs.regs.rsp, pRun->s.regs.sregs.cr0));
+
+ VBOXSTRICTRC rc2 = nemHCLnxCheckAndInjectInterrupts(pVCpu);
+ AssertLogRelMsg(RT_SUCCESS(rc2), ("Failed to inject interrupt"));
+
TMNotifyStartOfExecution(pVM, pVCpu);
+#ifdef VBOX_WITH_KVM_NESTING
+ AssertReleaseMsg(not (pVCpu->nem.s.nestedGuestActive and pRun->kvm_dirty_regs),
+ ("Bug: Nested guest actitive and dirty regs are set: %x", pRun->kvm_dirty_regs));
+#endif
+
int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_RUN, 0UL);
+ int errno_ = errno;
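+ // (errno is captured immediately; the TM/state calls below may clobber it.)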
VMCPU_CMPXCHG_STATE(pVCpu, VMCPUSTATE_STARTED_EXEC_NEM, VMCPUSTATE_STARTED_EXEC_NEM_WAIT);
TMNotifyEndOfExecution(pVM, pVCpu, ASMReadTSC());
+ pVCpu->nem.s.pRun->immediate_exit = 0;
+
#ifdef LOG_ENABLED
if (LogIsFlowEnabled())
{
@@ -1724,8 +2765,15 @@ VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
}
#endif
fStatefulExit = false;
- if (RT_LIKELY(rcLnx == 0 || errno == EINTR))
+ if (RT_LIKELY(rcLnx == 0 || errno_ == EINTR))
{
+#ifdef VBOX_WITH_KVM_NESTING
+ if (pRun->exit_reason == KVM_EXIT_INTR) {
+ pVCpu->nem.s.nestedGuestActive = KvmIsNestedGuestExit(pVM, pVCpu);
+ } else {
+ pVCpu->nem.s.nestedGuestActive = false;
+ }
+#endif
/*
* Deal with the exit.
*/
@@ -1739,10 +2787,19 @@ VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
break;
}
}
+ else if (errno_ == EAGAIN) {
+ /*
+ * We might drop out of KVM_RUN if the vCPU is still in an
+ * uninitialized state (e.g. WAIT_FOR_INIT) and some spurious
+ * wakeup event is received. In this case, simply do nothing
+ * and let the run loop enter KVM_RUN again.
+ * See https://elixir.bootlin.com/linux/v6.6/source/arch/x86/kvm/x86.c#L11138
+ */
+ }
else
{
- int rc2 = RTErrConvertFromErrno(errno);
- AssertLogRelMsgFailedReturn(("KVM_RUN failed: rcLnx=%d errno=%u rc=%Rrc\n", rcLnx, errno, rc2), rc2);
+ rc2 = RTErrConvertFromErrno(errno_);
+ AssertLogRelMsgFailedReturn(("KVM_RUN failed: rcLnx=%d errno=%u rc=%Rrc\n", rcLnx, errno_, rc2), rc2);
}
/*
@@ -1887,4 +2944,3 @@ VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
* This is using KVM.
*
*/
-
diff --git a/src/VBox/VMM/VMMR3/NEMR3NativeTemplate-linux.cpp.h b/src/VBox/VMM/VMMR3/NEMR3NativeTemplate-linux.cpp.h
index edce310..62c788f 100644
--- a/src/VBox/VMM/VMMR3/NEMR3NativeTemplate-linux.cpp.h
+++ b/src/VBox/VMM/VMMR3/NEMR3NativeTemplate-linux.cpp.h
@@ -431,6 +431,7 @@ static int nemR3LnxInitCheckCapabilities(PVM pVM, PRTERRINFO pErrInfo)
/** @callback_method_impl{FNVMMEMTRENDEZVOUS} */
+#ifndef VBOX_WITH_KVM
static DECLCALLBACK(VBOXSTRICTRC) nemR3LnxFixThreadPoke(PVM pVM, PVMCPU pVCpu, void *pvUser)
{
RT_NOREF(pVM, pvUser);
@@ -438,11 +439,107 @@ static DECLCALLBACK(VBOXSTRICTRC) nemR3LnxFixThreadPoke(PVM pVM, PVMCPU pVCpu, v
AssertLogRelRC(rc);
return VINF_SUCCESS;
}
+#else
+static VBOXSTRICTRC nemR3LnxSetVCpuSignalMask(PVMCPU pVCpu, sigset_t *pSigset)
+{
+ /*
+ * glibc and Linux/KVM do not agree on the size of sigset_t.
+ */
+ constexpr size_t kernel_sigset_size = 8;
+
+ alignas(kvm_signal_mask) char backing[sizeof(kvm_signal_mask) + kernel_sigset_size];
+ kvm_signal_mask *pKvmSignalMask = reinterpret_cast<kvm_signal_mask *>(backing);
+
+ static_assert(sizeof(sigset_t) >= kernel_sigset_size);
+
+ pKvmSignalMask->len = kernel_sigset_size;
+ memcpy(pKvmSignalMask->sigset, pSigset, kernel_sigset_size);
+
+ int rc = ioctl(pVCpu->nem.s.fdVCpu, KVM_SET_SIGNAL_MASK, pKvmSignalMask);
+ AssertLogRelMsgReturn(rc == 0, ("Failed to set vCPU signal mask: %d", errno),
+ VERR_NEM_INIT_FAILED);
+
+ return VINF_SUCCESS;
+}
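+// Note: similar in spirit to sigsuspend(2), the mask handed to
+// KVM_SET_SIGNAL_MASK is only in effect for the duration of KVM_RUN. The poke
+// signal thus stays blocked for normal ring-3 code but interrupts KVM_RUN
+// with EINTR.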
+
+static DECLCALLBACK(VBOXSTRICTRC) nemR3LnxFixThreadPoke(PVM pVM, PVMCPU pVCpu, void *pvUser)
+{
+ RT_NOREF(pVM, pvUser);
+
+ int iPokeSignal = RTThreadPokeSignal();
+ AssertReturn(iPokeSignal >= 0, VERR_NEM_INIT_FAILED);
+
+ /* We disable the poke signal for the host. We never want that signal to be delivered. */
+ int rc = RTThreadControlPokeSignal(pVCpu->hThread, false /*fEnable*/);
+ AssertLogRelRC(rc);
+
+ sigset_t sigset;
+
+ /* Fetch the current signal mask. */
+ int rcProcMask = pthread_sigmask(SIG_BLOCK /* ignored */, nullptr, &sigset);
+ AssertLogRelMsgReturn(rcProcMask == 0, ("Failed to retrieve thread signal mask"), VERR_NEM_INIT_FAILED);
+
+ sigdelset(&sigset, iPokeSignal);
+
+ /* We enable the poke signal for the vCPU. Any poke will kick the vCPU out of guest execution. */
+ VBOXSTRICTRC rcVcpuMask = nemR3LnxSetVCpuSignalMask(pVCpu, &sigset);
+ AssertRCSuccessReturn(rcVcpuMask, rcVcpuMask);
+
+ /* Create a timer that delivers the poke signal. */
+ struct sigevent sev {};
+
+ sev.sigev_notify = SIGEV_THREAD_ID;
+ sev.sigev_signo = iPokeSignal;
+ sev._sigev_un._tid = gettid();
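+ // glibc has no public sigev_notify_thread_id member, hence the access to
+ // the internal _sigev_un._tid field; SIGEV_THREAD_ID directs the signal at
+ // this specific vCPU thread.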
+
+ int rcTimer = timer_create(CLOCK_MONOTONIC, &sev, &pVCpu->nem.s.pTimer);
+ AssertLogRelMsgReturn(rcTimer == 0, ("Failed to create timer: %d", errno), VERR_NEM_INIT_FAILED);
+
+ return VINF_SUCCESS;
+}
+#endif
+#ifdef VBOX_WITH_KVM
+/**
+ * Check common environment problems and inform the user about misconfigurations.
+ */
+int nemR3CheckEnvironment(void)
+{
+ static const char szSplitLockMitigationFile[] = "/proc/sys/kernel/split_lock_mitigate";
+
+ char buf[64] {};
+ int fd = open(szSplitLockMitigationFile, O_RDONLY | O_CLOEXEC);
+
+ // Older kernels might not have this. A hard error feels unjustified here.
+ AssertLogRelMsgReturn(fd >= 0, ("Failed to check %s (%d). Assuming there is no problem.\n", szSplitLockMitigationFile, fd),
+ VINF_SUCCESS);
+
+ /* Leave one character to ensure that the string is zero-terminated. */
+ ssize_t bytes = read(fd, buf, sizeof(buf) - 1);
+ AssertLogRelMsgReturn(bytes >= 0, ("Failed to read %s (%zd)\n", szSplitLockMitigationFile, bytes),
+ VERR_NEM_INIT_FAILED);
+
+ int mitigationStatus = atoi(buf);
+
+ if (mitigationStatus != 0) {
+ LogRel(("NEM: WARNING: %s is %d. This can cause VM hangs, unless you set split_lock_detect=off on the host kernel command line! Please set it to 0.\n",
+ szSplitLockMitigationFile, mitigationStatus));
+ }
+
+ return VINF_SUCCESS;
+}
+#endif
+
DECLHIDDEN(int) nemR3NativeInit(PVM pVM, bool fFallback, bool fForced)
{
RT_NOREF(pVM, fFallback, fForced);
+
+#ifdef VBOX_WITH_KVM
+ int rcCheck = nemR3CheckEnvironment();
+ AssertLogRelMsgReturn(RT_SUCCESS(rcCheck), ("Failed to check environment\n"), VERR_NEM_INIT_FAILED);
+#endif
+
/*
* Some state init.
*/
@@ -623,13 +720,32 @@ DECLHIDDEN(int) nemR3NativeTerm(PVM pVM)
close(pVM->nem.s.fdKvm);
pVM->nem.s.fdKvm = -1;
}
+
+#ifdef VBOX_WITH_KVM
+ pVM->nem.s.pARedirectionTable.reset();
+#endif
return VINF_SUCCESS;
}
DECLHIDDEN(void) nemR3NativeReset(PVM pVM)
{
+#ifndef VBOX_WITH_KVM
RT_NOREF(pVM);
+#else
+ pVM->nem.s.pARedirectionTable->fill(std::nullopt);
+
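+ // vCPU 0 is the bootstrap processor and comes up runnable; all other vCPUs
+ // stay in the wait-for-INIT/SIPI state until the guest starts them.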
+ for (VMCPUID idCpu = 0; idCpu < pVM->cCpus; idCpu++)
+ {
+ PVMCPU pVCpu = pVM->apCpusR3[idCpu];
+
+ struct kvm_mp_state mp;
+ mp.mp_state = pVCpu->idCpu == 0 ? KVM_MP_STATE_RUNNABLE : KVM_MP_STATE_UNINITIALIZED;
+
+ int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_SET_MP_STATE, &mp);
+ AssertLogRelMsg(rcLnx == 0, ("nemR3NativeReset: Failed to set MP state. Error: %d, errno %d\n", rcLnx, errno));
+ }
+#endif
}
diff --git a/src/VBox/VMM/VMMR3/PDMDevMiscHlp.cpp b/src/VBox/VMM/VMMR3/PDMDevMiscHlp.cpp
index 6ae3e07..3eaa3be 100644
--- a/src/VBox/VMM/VMMR3/PDMDevMiscHlp.cpp
+++ b/src/VBox/VMM/VMMR3/PDMDevMiscHlp.cpp
@@ -37,6 +37,7 @@
#ifdef VBOX_VMM_TARGET_X86
# include <VBox/vmm/pdmapic.h>
#endif
+#include <VBox/vmm/nem.h>
#include <VBox/vmm/vm.h>
#include <VBox/vmm/vmm.h>
@@ -116,6 +117,34 @@ static DECLCALLBACK(void) pdmR3PicHlp_Unlock(PPDMDEVINS pDevIns)
pdmUnlock(pDevIns->Internal.s.pVMR3);
}
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+/** @interface_method_impl{PDMPICHLP,pfnKvmSetIrqLine} */
+static DECLCALLBACK(int) pdmR3PicHlp_KvmSetIrqLine(PPDMDEVINS pDevIns, uint16_t u16Gsi, int iLevel)
+{
+ PDMDEV_ASSERT_DEVINS(pDevIns);
+ PVM pVM = pDevIns->Internal.s.pVMR3;
+
+ return NEMR3KvmSetIrqLine(pVM, u16Gsi, iLevel);
+}
+
+/** @interface_method_impl{PDMPICHLP,pfnKvmGetPicState} */
+static DECLCALLBACK(int) pdmR3PicHlp_KvmGetPicState(PPDMDEVINS pDevIns, KVMIRQCHIP irqchip, KVMPICSTATE* state)
+{
+ PDMDEV_ASSERT_DEVINS(pDevIns);
+ PVM pVM = pDevIns->Internal.s.pVMR3;
+
+ return NEMR3KvmGetPicState(pVM, irqchip, state);
+}
+
+/** @interface_method_impl{PDMPICHLP,pfnKvmSetPicState} */
+static DECLCALLBACK(int) pdmR3PicHlp_KvmSetPicState(PPDMDEVINS pDevIns, KVMIRQCHIP irqchip, KVMPICSTATE* state)
+{
+ PDMDEV_ASSERT_DEVINS(pDevIns);
+ PVM pVM = pDevIns->Internal.s.pVMR3;
+
+ return NEMR3KvmSetPicState(pVM, irqchip, state);
+}
+#endif
/**
* PIC Device Helpers.
@@ -127,6 +156,11 @@ const PDMPICHLP g_pdmR3DevPicHlp =
pdmR3PicHlp_ClearInterruptFF,
pdmR3PicHlp_Lock,
pdmR3PicHlp_Unlock,
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+ pdmR3PicHlp_KvmSetIrqLine,
+ pdmR3PicHlp_KvmGetPicState,
+ pdmR3PicHlp_KvmSetPicState,
+#endif
PDM_PICHLP_VERSION /* the end */
};
@@ -199,7 +233,64 @@ static DECLCALLBACK(int) pdmR3IoApicHlp_IommuMsiRemap(PPDMDEVINS pDevIns, uint16
return VERR_IOMMU_NOT_PRESENT;
}
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM)
+/** @interface_method_impl{PDMIOAPICHLP,pfnKvmSetIrqLine} */
+static DECLCALLBACK(int) pdmR3IoApicHlp_KvmSetIrqLine(PPDMDEVINS pDevIns, uint16_t u16Gsi, int iLevel) {
+ PDMDEV_ASSERT_DEVINS(pDevIns);
+ PVM pVM = pDevIns->Internal.s.pVMR3;
+
+ return NEMR3KvmSetIrqLine(pVM, u16Gsi, iLevel);
+}
+
+/** @interface_method_impl{PDMIOAPICHLP,pfnKvmSplitIrqchipDeliverMsi} */
+static DECLCALLBACK(int) pdmR3IoApicHlp_KvmSplitIrqchipDeliverMsi(PPDMDEVINS pDevIns, PCMSIMSG pMsi)
+{
+ PDMDEV_ASSERT_DEVINS(pDevIns);
+ PVM pVM = pDevIns->Internal.s.pVMR3;
+
+ return NEMR3KvmSplitIrqchipDeliverMsi(pVM, pMsi);
+}
+
+/** @interface_method_impl{PDMIOAPICHLP,pfnKvmSplitIrqchipAddUpdateRTE} */
+static DECLCALLBACK(int) pdmR3IoApicHlp_KvmSplitIrqchipAddUpdateRTE(PPDMDEVINS pDevIns, uint16_t gsi, PCMSIMSG pMsi)
+{
+ PDMDEV_ASSERT_DEVINS(pDevIns);
+ PVM pVM = pDevIns->Internal.s.pVMR3;
+
+ return NEMR3KvmSplitIrqchipAddUpdateRTE(pVM, gsi, pMsi);
+}
+
+
+/** @interface_method_impl{PDMIOAPICHLP,pfnKvmSplitIrqchipRemoveRTE} */
+static DECLCALLBACK(int) pdmR3IoApicHlp_KvmSplitIrqchipRemoveRTE(PPDMDEVINS pDevIns, uint16_t gsi)
+{
+ PDMDEV_ASSERT_DEVINS(pDevIns);
+ PVM pVM = pDevIns->Internal.s.pVMR3;
+
+ return NEMR3KvmSplitIrqchipRemoveRTE(pVM, gsi);
+}
+#endif
+
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+/** @interface_method_impl{PDMIOAPICHLP,pfnKvmGetIoApicState} */
+static DECLCALLBACK(int) pdmR3IoApicHlp_pfnKvmGetIoApicState(PPDMDEVINS pDevIns, KVMIOAPICSTATE* state)
+{
+ PDMDEV_ASSERT_DEVINS(pDevIns);
+ PVM pVM = pDevIns->Internal.s.pVMR3;
+
+ return NEMR3KvmGetIoApicState(pVM, state);
+}
+
+/** @interface_method_impl{PDMIOAPICHLP,pfnKvmSetIoApicState} */
+static DECLCALLBACK(int) pdmR3IoApicHlp_pfnKvmSetIoApicState(PPDMDEVINS pDevIns, KVMIOAPICSTATE* state)
+{
+ PDMDEV_ASSERT_DEVINS(pDevIns);
+ PVM pVM = pDevIns->Internal.s.pVMR3;
+
+ return NEMR3KvmSetIoApicState(pVM, state);
+}
+#endif
/**
* I/O APIC Device Helpers.
*/
@@ -211,6 +302,17 @@ const PDMIOAPICHLP g_pdmR3DevIoApicHlp =
pdmR3IoApicHlp_Unlock,
pdmR3IoApicHlp_LockIsOwner,
pdmR3IoApicHlp_IommuMsiRemap,
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM)
+ pdmR3IoApicHlp_KvmSetIrqLine,
+ pdmR3IoApicHlp_KvmSplitIrqchipDeliverMsi,
+ pdmR3IoApicHlp_KvmSplitIrqchipAddUpdateRTE,
+ pdmR3IoApicHlp_KvmSplitIrqchipRemoveRTE,
+#endif
+
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+ pdmR3IoApicHlp_pfnKvmGetIoApicState,
+ pdmR3IoApicHlp_pfnKvmSetIoApicState,
+#endif
PDM_IOAPICHLP_VERSION /* the end */
};
diff --git a/src/VBox/VMM/VMMR3/PGMPhys.cpp b/src/VBox/VMM/VMMR3/PGMPhys.cpp
index 4b98cf6..55eb2ca 100644
--- a/src/VBox/VMM/VMMR3/PGMPhys.cpp
+++ b/src/VBox/VMM/VMMR3/PGMPhys.cpp
@@ -2123,11 +2123,16 @@ int pgmR3PhysRamPreAllocate(PVM pVM)
Assert(pVM->pgm.s.fRamPreAlloc);
Log(("pgmR3PhysRamPreAllocate: enter\n"));
# ifdef VBOX_WITH_PGM_NEM_MODE
+#ifdef VBOX_WITH_PREALLOC_RAM_BY_DEFAULT
+ Log(("pgmR3PhysRamPreAllocate: Handled by default in NEM mode, skip\n"));
+ return VINF_SUCCESS;
+#else
if (VM_IS_NEM_ENABLED(pVM))
{
LogRel(("PGM: Pre-alloc ignored in NEM mode.\n"));
return VINF_SUCCESS;
}
+#endif
# endif
/*
diff --git a/src/VBox/VMM/VMMR3/VMM.cpp b/src/VBox/VMM/VMMR3/VMM.cpp
index aa51e5f..c65adcd 100644
--- a/src/VBox/VMM/VMMR3/VMM.cpp
+++ b/src/VBox/VMM/VMMR3/VMM.cpp
@@ -1104,6 +1104,11 @@ static DECLCALLBACK(int) vmmR3Load(PVM pVM, PSSMHANDLE pSSM, uint32_t uVersion,
AssertMsgFailed(("u32=%#x\n", u32));
return VERR_SSM_DATA_UNIT_FORMAT_CHANGED;
}
+
+#ifdef VBOX_WITH_KVM
+ NEMR3LoadExec(pVM);
+#endif
+
return VINF_SUCCESS;
}
diff --git a/src/VBox/VMM/VMMR3/target-x86/CPUMR3-x86.cpp b/src/VBox/VMM/VMMR3/target-x86/CPUMR3-x86.cpp
index cb13659..06bf326 100644
--- a/src/VBox/VMM/VMMR3/target-x86/CPUMR3-x86.cpp
+++ b/src/VBox/VMM/VMMR3/target-x86/CPUMR3-x86.cpp
@@ -1569,6 +1569,7 @@ DECLHIDDEN(void) cpumR3InitVmxGuestFeaturesAndMsrs(PVM pVM, PCFGMNODE pCpumCfg,
if (fVmxEpt)
{
const char *pszWhy = NULL;
+#ifndef VBOX_WITH_KVM_NESTING
if (!VM_IS_HM_ENABLED(pVM) && !VM_IS_EXEC_ENGINE_IEM(pVM))
pszWhy = "execution engine is neither HM nor IEM";
#ifdef RT_ARCH_AMD64
@@ -1576,6 +1577,9 @@ DECLHIDDEN(void) cpumR3InitVmxGuestFeaturesAndMsrs(PVM pVM, PCFGMNODE pCpumCfg,
pszWhy = "nested paging is not enabled for the VM or it is not supported by the host";
else if (VM_IS_HM_ENABLED(pVM) && !pVM->cpum.s.HostFeatures.s.fNoExecute)
pszWhy = "NX is not available on the host";
+#endif
+#else
+ if (VM_IS_HM_ENABLED(pVM) && !HMIsNestedPagingActive(pVM))
#endif
if (pszWhy)
{
@@ -2453,10 +2457,20 @@ DECLCALLBACK(int) cpumR3LoadExecTarget(PVM pVM, PSSMHANDLE pSSM, uint32_t uVersi
rc = SSMR3GetStructEx(pSSM, &pGstCtx->XState.Hdr, sizeof(pGstCtx->XState.Hdr),
0, g_aCpumXSaveHdrFields, NULL);
AssertRCReturn(rc, rc);
+#ifndef VBOX_WITH_KVM
+ /*
+ * This assertion triggers on resume when the guest was
+ * suspended early during boot. The hypothesis is that this
+ * happens when XSAVE is not enabled yet. Seems harmless for
+ * now.
+ *
+ * See virtualbox#69.
+ */
AssertLogRelMsgReturn(!(pGstCtx->XState.Hdr.bmXState & ~pGstCtx->fXStateMask),
("bmXState=%#RX64 fXStateMask=%#RX64\n",
pGstCtx->XState.Hdr.bmXState, pGstCtx->fXStateMask),
VERR_CPUM_INVALID_XSAVE_HDR);
+#endif
}
if (pGstCtx->fXStateMask & XSAVE_C_YMM)
{
diff --git a/src/VBox/VMM/VMMR3/target-x86/CPUMR3CpuId-x86.cpp b/src/VBox/VMM/VMMR3/target-x86/CPUMR3CpuId-x86.cpp
index 913e00a..db60ee1 100644
--- a/src/VBox/VMM/VMMR3/target-x86/CPUMR3CpuId-x86.cpp
+++ b/src/VBox/VMM/VMMR3/target-x86/CPUMR3CpuId-x86.cpp
@@ -1325,6 +1325,13 @@ static int cpumR3CpuIdSanitize(PVM pVM, PCPUM pCpum, PCPUMCPUIDCONFIG pConfig)
PASSTHRU_FEATURE_EX(enmConfig, fHostFeature, !VM_IS_EXEC_ENGINE_IEM(pVM), fConst)
#define PASSTHRU_FEATURE_TODO(enmConfig, fConst) ((enmConfig) ? (fConst) : 0)
+#ifdef VBOX_WITH_KVM
+#define PASSTHRU_FEATURE_KVM_ONLY(fConst) (fConst)
+#else
+#define PASSTHRU_FEATURE_KVM_ONLY(fConst) (0)
+#endif
+
+
/* Cpuid 1:
* EAX: CPU model, family and stepping.
*
@@ -1584,7 +1591,7 @@ static int cpumR3CpuIdSanitize(PVM pVM, PCPUM pCpum, PCPUMCPUIDCONFIG pConfig)
| X86_CPUID_AMD_FEATURE_EDX_MMX
| X86_CPUID_AMD_FEATURE_EDX_FXSR
| X86_CPUID_AMD_FEATURE_EDX_FFXSR
- //| X86_CPUID_EXT_FEATURE_EDX_PAGE1GB
+ | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_EXT_FEATURE_EDX_PAGE1GB)
| X86_CPUID_EXT_FEATURE_EDX_RDTSCP
//| RT_BIT_32(28) - reserved
//| X86_CPUID_EXT_FEATURE_EDX_LONG_MODE - turned on when necessary
@@ -1846,9 +1853,9 @@ static int cpumR3CpuIdSanitize(PVM pVM, PCPUM pCpum, PCPUMCPUIDCONFIG pConfig)
//| X86_CPUID_STEXT_FEATURE_EBX_HLE RT_BIT(4)
| PASSTHRU_FEATURE(pConfig->enmAvx2, pHstFeat->fAvx2, X86_CPUID_STEXT_FEATURE_EBX_AVX2)
| X86_CPUID_STEXT_FEATURE_EBX_FDP_EXCPTN_ONLY
- //| X86_CPUID_STEXT_FEATURE_EBX_SMEP RT_BIT(7)
+ | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EBX_SMEP)
| X86_CPUID_STEXT_FEATURE_EBX_BMI2
- //| X86_CPUID_STEXT_FEATURE_EBX_ERMS RT_BIT(9)
+ | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EBX_ERMS)
| PASSTHRU_FEATURE_NOT_IEM(pConfig->enmInvpcid, pHstFeat->fInvpcid, X86_CPUID_STEXT_FEATURE_EBX_INVPCID)
//| X86_CPUID_STEXT_FEATURE_EBX_RTM RT_BIT(11)
//| X86_CPUID_STEXT_FEATURE_EBX_PQM RT_BIT(12)
@@ -1860,10 +1867,11 @@ static int cpumR3CpuIdSanitize(PVM pVM, PCPUM pCpum, PCPUMCPUIDCONFIG pConfig)
| PASSTHRU_FEATURE_TODO(pConfig->enmRdSeed, X86_CPUID_STEXT_FEATURE_EBX_RDSEED)
| PASSTHRU_FEATURE(pConfig->enmAdx, pHstFeat->fAdx, X86_CPUID_STEXT_FEATURE_EBX_ADX)
//| X86_CPUID_STEXT_FEATURE_EBX_SMAP RT_BIT(20)
+ | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EBX_SMAP)
//| RT_BIT(21) - reserved
//| RT_BIT(22) - reserved
| PASSTHRU_FEATURE(pConfig->enmCLFlushOpt, pHstFeat->fClFlushOpt, X86_CPUID_STEXT_FEATURE_EBX_CLFLUSHOPT)
- //| RT_BIT(24) - reserved
+ | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EBX_CLWB)
//| X86_CPUID_STEXT_FEATURE_EBX_INTEL_PT RT_BIT(25)
//| X86_CPUID_STEXT_FEATURE_EBX_AVX512PF RT_BIT(26)
//| X86_CPUID_STEXT_FEATURE_EBX_AVX512ER RT_BIT(27)
@@ -1874,18 +1882,21 @@ static int cpumR3CpuIdSanitize(PVM pVM, PCPUM pCpum, PCPUMCPUIDCONFIG pConfig)
;
pCurLeaf->uEcx &= 0
//| X86_CPUID_STEXT_FEATURE_ECX_PREFETCHWT1 - we do not do vector functions yet.
+ | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_ECX_GFNI)
;
pCurLeaf->uEdx &= 0
+ | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EDX_FSRM)
+ | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EDX_SERIALIZE)
//| X86_CPUID_STEXT_FEATURE_EDX_SRBDS_CTRL RT_BIT(9)
| PASSTHRU_FEATURE(pConfig->enmMdsClear, pHstFeat->fMdsClear, X86_CPUID_STEXT_FEATURE_EDX_MD_CLEAR)
//| X86_CPUID_STEXT_FEATURE_EDX_TSX_FORCE_ABORT RT_BIT_32(11)
//| X86_CPUID_STEXT_FEATURE_EDX_CET_IBT RT_BIT(20)
- //| X86_CPUID_STEXT_FEATURE_EDX_IBRS_IBPB RT_BIT(26)
- //| X86_CPUID_STEXT_FEATURE_EDX_STIBP RT_BIT(27)
+ | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EDX_IBRS_IBPB)
+ | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EDX_STIBP)
| PASSTHRU_FEATURE(pConfig->enmFlushCmdMsr, pHstFeat->fFlushCmd, X86_CPUID_STEXT_FEATURE_EDX_FLUSH_CMD)
| PASSTHRU_FEATURE(pConfig->enmArchCapMsr, pHstFeat->fArchCap, X86_CPUID_STEXT_FEATURE_EDX_ARCHCAP)
//| X86_CPUID_STEXT_FEATURE_EDX_CORECAP RT_BIT_32(30)
- //| X86_CPUID_STEXT_FEATURE_EDX_SSBD RT_BIT_32(31)
+ | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EDX_SSBD)
;
/* Mask out INVPCID unless FSGSBASE is exposed due to a bug in Windows 10 SMP guests, see @bugref{9089#c15}. */
@@ -2915,6 +2926,7 @@ static int cpumR3CpuIdReadConfig(PVM pVM, PCPUMCPUIDCONFIG pConfig, PCFGMNODE pC
AssertLogRelRCReturn(rc, rc);
if (pConfig->fNestedHWVirt)
{
+#ifndef VBOX_WITH_KVM_NESTING
/** @todo Think about enabling this later with NEM/KVM. */
if (VM_IS_NEM_ENABLED(pVM))
{
@@ -2924,6 +2936,7 @@ static int cpumR3CpuIdReadConfig(PVM pVM, PCPUMCPUIDCONFIG pConfig, PCFGMNODE pC
else if (!fNestedPagingAndFullGuestExec)
return VMSetError(pVM, VERR_CPUM_INVALID_HWVIRT_CONFIG, RT_SRC_POS,
"Cannot enable nested VT-x/AMD-V without nested-paging and unrestricted guest execution!\n");
+#endif
}
}
#endif /** @todo */
@@ -3882,6 +3895,7 @@ VMMR3_INT_DECL(void) CPUMR3SetGuestCpuIdFeature(PVM pVM, CPUMCPUIDFEATURE enmFea
* Note! ASSUMES CPUMCPUIDFEATURE_APIC is called first.
*/
case CPUMCPUIDFEATURE_X2APIC:
+#ifndef VBOX_WITH_KVM
pLeaf = cpumCpuIdGetLeaf(pVM, UINT32_C(0x00000001));
if (pLeaf)
pVM->cpum.s.aGuestCpuIdPatmStd[1].uEcx = pLeaf->uEcx |= X86_CPUID_FEATURE_ECX_X2APIC;
@@ -3896,6 +3910,7 @@ VMMR3_INT_DECL(void) CPUMR3SetGuestCpuIdFeature(PVM pVM, CPUMCPUIDFEATURE enmFea
}
LogRel(("CPUM: SetGuestCpuIdFeature: Enabled x2APIC\n"));
+#endif
break;
/*
diff --git a/src/VBox/VMM/include/GIMHvInternal.h b/src/VBox/VMM/include/GIMHvInternal.h
index 4397207..66a8510 100644
--- a/src/VBox/VMM/include/GIMHvInternal.h
+++ b/src/VBox/VMM/include/GIMHvInternal.h
@@ -202,6 +202,8 @@
#define GIM_HV_HINT_INT_FOR_MBEC_SYSCALLS RT_BIT(13)
/** Recommend using enlightened VMCS interfacea and nested enlightenments. */
#define GIM_HV_HINT_NESTED_ENLIGHTENED_VMCS_INTERFACE RT_BIT(14)
+/** Indicates that core-sharing is not possible. */
+#define GIM_HV_HINT_NO_NONARCH_CORESHARING RT_BIT(18)
/** @} */
@@ -1117,6 +1119,15 @@ AssertCompile(sizeof(GIMHVEXTGETBOOTZEROMEM) <= GIM_HV_PAGE_SIZE);
/** @} */
+/** Hyper-V page size. */
+#define GIM_HV_PAGE_SIZE 4096
+/** Hyper-V page shift. */
+#define GIM_HV_PAGE_SHIFT 12
+
+/** Microsoft Hyper-V vendor signature. */
+#define GIM_HV_VENDOR_MICROSOFT "Microsoft Hv"
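+/** VirtualBox vendor signature. */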
+#define GIM_HV_VENDOR_VBOX "VBoxVBoxVBox"
+
/**
* MMIO2 region indices.
*/
diff --git a/src/VBox/VMM/include/NEMInternal.h b/src/VBox/VMM/include/NEMInternal.h
index be5377c..1d53a8c 100644
--- a/src/VBox/VMM/include/NEMInternal.h
+++ b/src/VBox/VMM/include/NEMInternal.h
@@ -35,8 +35,17 @@
#include <VBox/types.h>
#include <VBox/vmm/nem.h>
#include <VBox/vmm/cpum.h> /* For CPUMCPUVENDOR. */
+#ifdef VBOX_WITH_KVM
+#include <VBox/vmm/pdmdev.h> /* For KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS */
+#endif
#include <VBox/vmm/stam.h>
#include <VBox/vmm/vmapi.h>
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM)
+#include <array>
+#include <memory>
+#include <optional>
+#include <VBox/msi.h>
+#endif
#ifdef RT_OS_WINDOWS
# include <iprt/nt/hyperv.h>
# include <iprt/critsect.h>
@@ -46,6 +55,8 @@
# else
# include "VMXInternal.h"
# endif
+#elif defined(RT_OS_LINUX)
+# include <time.h>
#endif
RT_C_DECLS_BEGIN
@@ -246,6 +257,9 @@ typedef struct NEM
uint16_t idPrevSlot;
/** Memory slot ID allocation bitmap. */
uint64_t bmSlotIds[_32K / 8 / sizeof(uint64_t)];
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM)
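+ /** Shadow of the I/O APIC redirection table; used to program KVM's
+ * split-irqchip GSI routing (see NEMR3KvmSplitIrqchipAddUpdateRTE). */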
+ std::unique_ptr<std::array<std::optional<MSIMSG>, KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS>> pARedirectionTable;
+#endif
#elif defined(RT_OS_WINDOWS)
/** Set if we've created the EMTs. */
@@ -453,7 +467,9 @@ typedef struct NEMCPU
#endif
#if defined(RT_OS_LINUX)
- uint8_t abPadding[3];
+ uint8_t abPadding[2];
+ /** Whether processor bug mitigations have already been applied. */
+ bool fMitigationsApplied;
/** The KVM VCpu file descriptor. */
int32_t fdVCpu;
/** Pointer to the KVM_RUN data exchange region. */
@@ -466,6 +482,21 @@ typedef struct NEMCPU
/** Status of the FIQ line when last seen. */
bool fFiqLastSeen;
# elif defined(VBOX_VMM_TARGET_X86)
+#ifdef VBOX_WITH_KVM_NESTING
+ /** KVM stats file descriptor for binary statistics. */
+ int statsFd;
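+ /** Offset of the guest-mode statistic within the binary stats data
+ * (see KvmIsNestedGuestExit). */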
+ size_t guestModeStatOffset;
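+ /** Whether the last KVM exit happened while a nested (L2) guest was active. */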
+ bool nestedGuestActive;
+#endif
+
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM)
+ /** The vCPU timer. */
+ timer_t pTimer;
+
+ /** The next timeout (absolute). */
+ uint64_t nsAbsNextTimerEvt;
+#endif
+
/** The MSR_IA32_APICBASE value known to KVM. */
uint64_t uKvmApicBase;
# endif
@@ -849,4 +880,3 @@ DECLHIDDEN(int) nemHCNativeNotifyPhysPageAllocated(PVMCC pVM, RTGCPHYS GCPhy
RT_C_DECLS_END
#endif /* !VMM_INCLUDED_SRC_include_NEMInternal_h */
-