File kvm-backend-7.2.0-dev-20250903.patch of Package virtualbox-kvm

diff --git a/Config.kmk b/Config.kmk
index 26651f7..6028374 100644
--- a/Config.kmk
+++ b/Config.kmk
@@ -574,7 +574,7 @@ endif
 # Enables the new breakpoint handling code, see @bugref{8650}
 VBOX_WITH_DBGF_FLOW_TRACING = 1
 # Enables ARMv8 API support and if possible virtualization, see @bugref{10383}
-VBOX_WITH_VIRT_ARMV8 = 1
+VBOX_WITH_VIRT_ARMV8 =
 # Makes x86 emulation on ARM hosts available in the GUI.
 if1of ($(KBUILD_TARGET).$(KBUILD_TARGET_ARCH), empty-set)
  VBOX_WITH_X86_ON_ARM_ENABLED = 1
@@ -1702,6 +1702,19 @@ ifdef VBOX_HEADLESS
  VBOX_WITH_VRDP_RDESKTOP =
 endif
 
+#
+# Configure VirtualBox to use the KVM NEM backend.
+#
+ifdef VBOX_WITH_KVM
+ VBOX_WITH_DRIVERLESS_FORCED = 1
+ VBOX_WITH_NATIVE_NEM=1
+ # KVM doesn't need the VirtualBox Ring 0 drivers
+ VBOX_WITH_VBOXDRV=
+ VBOX_WITH_NETFLT=
+ VBOX_WITH_NETFLT_CROSSBOW=
+ VBOX_WITH_NETADP=
+endif
+
 #
 # Undefined VBOX_WITH_MAIN implies exclusion of a few more items.
 #
@@ -2119,6 +2132,14 @@ endif
 ifdef VBOX_WITH_STATIC_ARM64_PAGE_SHIFT
  DEFS.linux.arm64 += IPRT_STATIC_ARM64_PAGE_SHIFT=$(VBOX_WITH_STATIC_ARM64_PAGE_SHIFT)
 endif
+CYBERUS_CXX_FLAGS = -Werror -Wall
+ifdef VBOX_WITH_KVM
+ DEFS += VBOX_WITH_KVM
+ DEFS += VBOX_WITH_KVM_NESTING
+endif
+ifndef VBOX_HEADLESS
+ DEFS += VBOX_WITH_GVT_RENDERING
+endif
 
 # Don't flood CDEFS, old MASMs doesn't like too many defines.
 ifdef VBOX_WITH_DEBUGGER
@@ -3641,6 +3662,8 @@ ifndef VBOX_GCC_std
   VBOX_GCC_std := -std=c++17
   # else if "$(VBOX_CLANG_VERSION_CXX)" vge 60000 # Most language features complete by v6. Lib stuff was less complete in v6, but hopefully acceptable for out purposes.
   #VBOX_GCC_std := -std=c++17
+ else if "$(VBOX_WITH_KVM)" veq 1
+  VBOX_GCC_std := -std=c++17
  else if "$(VBOX_CLANG_VERSION_CXX)" vge 50000 # darwin Xcode 5 allegedly knows what C++11 is
   VBOX_GCC_std := -std=c++11
   # else if "$(VBOX_GCC_VERSION_CXX)" vge 70000 # Language feature P0512R0 was v8, rest v7 or earlier. Most lib stuff present in 7, complete in v12.
diff --git a/configure b/configure
index 9e67b32..1d3de3c 100755
--- a/configure
+++ b/configure
@@ -86,6 +86,7 @@ SETUP_WINE=
 ONLY_ADDITIONS=0
 TARGET_MACHINE=""
 TARGET_CPU=""
+WITH_KVM=0
 WITH_XPCOM=1
 WITH_PYTHON=1
 WITH_JAVA=1
@@ -2489,6 +2490,7 @@ cat << EOF
   --build-libssl           build openssl from sources
   --build-libtpms          build libtpms from sources
   --build-liblzma          build liblzma from sources
+  --with-kvm               build with kvm backend
 EOF
 [ $OSE -eq 0 ] && cat << EOF
   --build-libcurl          build libcurl from sources
@@ -2643,6 +2645,9 @@ for option in "$@"; do
     --with-linux=*)
       LINUX=`echo $option | cut -d'=' -f2`
       ;;
+    --with-kvm)
+      WITH_KVM=1
+      ;;
     --with-makeself=*)
       MAKESELF=`echo $option | cut -d'=' -f2`
       ;;
@@ -2922,6 +2927,7 @@ fi
 [ $WITH_JAVA      -eq 0 ] && cnf_append "VBOX_WITH_JWS" ""
 [ $WITH_HARDENING -eq 0 ] && cnf_append "VBOX_WITHOUT_HARDENING" "1"
 [ $WITH_HARDENING -eq 2 ] && cnf_append "VBOX_WITH_HARDENING" "2"
+[ $WITH_KVM       -eq 1 ] && cnf_append "VBOX_WITH_KVM" "1"
 [ $WITH_LIBTPMS   -eq 0 ] && cnf_append "VBOX_WITH_LIBTPMS" ""
 [ $WITH_LIBLZMA   -eq 0 ] && cnf_append "VBOX_WITH_LIBLZMA" ""
 if [ $WITH_LIBVPX -eq 0 ]; then
diff --git a/include/VBox/vmm/nem.h b/include/VBox/vmm/nem.h
index 76414fb..13111c4 100644
--- a/include/VBox/vmm/nem.h
+++ b/include/VBox/vmm/nem.h
@@ -43,6 +43,14 @@
 #include <VBox/vmm/vmapi.h>
 #include <VBox/vmm/pgm.h>
 
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+// For KVMPICSTATE and KVMIRQCHIP
+#include <VBox/vmm/pdmdev.h>
+#endif
+
+#if defined(VBOX_WITH_KVM) && defined(IN_RING3)
+#include <VBox/vmm/cpum.h>      /* for PCPUMCPUIDLEAF */
+#endif
 
 RT_C_DECLS_BEGIN
 
@@ -163,6 +171,150 @@ VMMR3_INT_DECL(int)  NEMR3NotifyPhysRomRegisterEarly(PVM pVM, RTGCPHYS GCPhys, R
 VMMR3_INT_DECL(int)  NEMR3NotifyPhysRomRegisterLate(PVM pVM, RTGCPHYS GCPhys, RTGCPHYS cb, void *pvPages,
                                                     uint32_t fFlags, uint8_t *pu2State, uint32_t *puNemRange);
 
+#if defined(VBOX_WITH_KVM) && defined(IN_RING3)
+
+/**
+ * Retrieves the value of a single model specific register (MSR).
+ * @param pVCpu The vCPU in which context the MSR should be read (can be any vCPU for global MSRs).
+ * @param msr The index of the MSR that should be read.
+ * @param val A buffer that will contain the value of the specified MSR, if reading was successful.
+ * @return VBox status code, VINF_SUCCESS, if the read access was successful.
+ */
+VMMR3_INT_DECL(int) NEMR3KvmGetMsr(PVMCPU pVCpu, uint64_t msr, uint64_t* val);
+
+/**
+ * Writes a value to single model specific register (MSR).
+ * @param pVCpu The vCPU in which context the MSR should be written (can be any vCPU for global MSRs).
+ * @param msr The index of the MSR that should be written.
+ * @param val The value that should be written to the MSR.
+ * @return VBox status code, VINF_SUCCESS, if the write access was successful.
+ */
+VMMR3_INT_DECL(int) NEMR3KvmSetMsr(PVMCPU pVCpu, uint64_t msr, uint64_t val);
+
+/**
+ * Asserts a specific interrupt line on both PIC and I/O APIC.
+ * @param  pVM The cross context VM structure.
+ * @param  u16Gsi The GSI of the interrupt line that should be asserted.
+ * @param  iLevel Line level, either PDM_IRQ_LEVEL_HIGH, PDM_IRQ_LEVEL_LOW or PDM_IRQ_LEVEL_FLIP_FLOP.
+ * @return VBox status code.
+ */
+VMMR3_INT_DECL(int) NEMR3KvmSetIrqLine(PVM pVM, uint16_t u16Gsi, int iLevel);
+
+/**
+ * Execute state load operation. This sets the correct KVM MP state depending on
+ * the VBox vCPUs state.
+ * @param pVM The cross context VM structure
+ */
+VMMR3_INT_DECL(int) NEMR3LoadExec(PVM pVM);
+
+/**
+ * Retrieves the local APIC state from the in-kernel irqchip.
+ * @param pVCpu The vCpu to retrieve the APIC state from
+ * @param pXApicPage Pointer to the memory the APIC state is saved to. Must be
+ *                   at least of size KVM_APIC_REG_SIZE.
+ * @returns VBox status code
+ */
+VMMR3_INT_DECL(int) NEMR3KvmGetLapicState(PVMCPU pVCpu, void* pXApicPage);
+
+/**
+ * Configures the local APIC state of the in-kernel irqchip.
+ * @param pVCpu The vCpu for which to set the APIC state
+ * @param pXApicPage Pointer to the memory containing APIC state. Must be at
+ *                   least of size KVM_APIC_REG_SIZE.
+ * @returns VBox status code
+ */
+VMMR3_INT_DECL(int) NEMR3KvmSetLapicState(PVMCPU pVCpu, void* pXApicPage);
+
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+
+/**
+ * Retrieves the PIC state from the in-kernel irqchip.
+ * @param pVM The VM to retrieve the PIC state from
+ * @param irqchip Whether to retrieve the state from the master or slave pic
+ * @param state Buffer to store the PIC state in.
+ * @returns VBox status code
+ */
+VMMR3_INT_DECL(int) NEMR3KvmGetPicState(PVM pVM, KVMIRQCHIP irqchip, KVMPICSTATE* state);
+
+/**
+ * Configures the PIC state of the in-kernel irqchip.
+ * @param pVM The VM for which to set the PIC state
+ * @param irqchip Whether to set the state of the master or slave PIC
+ * @param state Pointer to the memory containing PIC state.
+ * @returns VBox status code
+ */
+VMMR3_INT_DECL(int) NEMR3KvmSetPicState(PVM pVM, KVMIRQCHIP irqchip, KVMPICSTATE* state);
+
+/**
+ * Retrieves the I/O APIC state from the in-kernel irqchip.
+ * @param pVM The VM to retrieve the I/O APIC state from
+ * @param state Buffer where to store I/O APIC state.
+ * @returns VBox status code
+ */
+VMMR3_INT_DECL(int) NEMR3KvmGetIoApicState(PVM pVM, KVMIOAPICSTATE* state);
+
+/**
+ * Configures the I/O APIC state of the in-kernel irqchip.
+ * @param pVM The VM for which to set the I/O APIC state
+ * @param state Pointer to the memory containing I/O APIC state.
+ * @returns VBox status code
+ */
+VMMR3_INT_DECL(int) NEMR3KvmSetIoApicState(PVM pVM, KVMIOAPICSTATE* state);
+#endif
+/**
+ * Deliver a MSI via the in-kernel irqchip.
+ *
+ * @returns VBox status code
+ * @param pVM The cross context VM structure
+ * @param pMsi The MSI to inject into the guest
+ */
+VMMR3_INT_DECL(int) NEMR3KvmSplitIrqchipDeliverMsi(PVM pVM, PCMSIMSG pMsi);
+
+/**
+ * Add or update the entry in the Redirection Table indexed by the GSI number.
+ *
+ * Interrupts configured via this interface will cause an EOI exit when the
+ * guest acknowledges them. Typically, this is only necessary for level
+ * triggered interrupts.
+ *
+ * @returns VBox status code
+ * @param pVM The cross context VM structure
+ * @param u16Gsi The GSI number
+ * @param pMsi The MSI that should be delivered when the interrupt fires
+ */
+VMMR3_INT_DECL(int) NEMR3KvmSplitIrqchipAddUpdateRTE(PVM pVM, uint16_t u16Gsi, PCMSIMSG pMsi);
+
+/**
+ *  Removes the Redirection Table entry indexed by the GSI number.
+ *
+ *  @returns VBox status code
+ *  @param pVM The cross context VM structure
+ *  @param u16Gsi The GSI number for which the Redirection Table entry should be
+ *  removed
+ */
+VMMR3_INT_DECL(int) NEMR3KvmSplitIrqchipRemoveRTE(PVM pVM, uint16_t u16Gsi);
+
+/**
+ * Returns an array of Hyper-V CPUID leaves supported by KVM.
+ *
+ * @returns VBox status code
+ * @param pVM The cross context VM structure
+ * @param outpCpuId The pointer where the CPUID leaves will be returned. Must be freed by the caller!
+ * @param outcLeaves The pointer where the number of CPUID leaves will be returned.
+ */
+VMMR3_INT_DECL(int) NEMR3KvmGetHvCpuIdLeaves(PVM pVM, PCPUMCPUIDLEAF *outpCpuId, size_t *outcLeaves);
+
+/**
+ * Returns an array of CPUID leaves supported by KVM.
+ *
+ * @returns VBox status code
+ * @param pVM The cross context VM structure
+ * @param outpCpuId The pointer where the CPUID leaves will be returned. Must be freed by the caller!
+ * @param outcLeaves The pointer where the number of CPUID leaves will be returned.
+ */
+VMMR3_INT_DECL(int) NEMR3KvmGetCpuIdLeaves(PVM pVM, PCPUMCPUIDLEAF *outpCpuId, size_t *outcLeaves);
+#endif
+
 /** @name Flags for NEMR3NotifyPhysRomRegisterEarly and NEMR3NotifyPhysRomRegisterLate.
  * @{ */
 /** Set if the range is replacing RAM rather that unused space. */
diff --git a/include/VBox/vmm/pdmdev.h b/include/VBox/vmm/pdmdev.h
index 0befe41..7ce0de0 100644
--- a/include/VBox/vmm/pdmdev.h
+++ b/include/VBox/vmm/pdmdev.h
@@ -64,6 +64,49 @@
 #include <iprt/stdarg.h>
 #include <iprt/list.h>
 
+#ifdef VBOX_WITH_KVM
+#define KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS 24
+#define KVM_IRQCHIP_NUM_PIC_INTR_PINS 16
+#endif
+
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+struct KVMPICSTATE /* State of one PIC as exchanged with the in-kernel irqchip. NOTE(review): field set looks like Linux's struct kvm_pic_state - confirm before reordering. */
+{
+    uint8_t         last_irr;
+    uint8_t         irr;
+    uint8_t         imr;
+    uint8_t         isr;
+    uint8_t         priority_add;
+    uint8_t         irq_base;
+    uint8_t         read_reg_select;
+    uint8_t         poll;
+    uint8_t         special_mask;
+    uint8_t         init_state;
+    uint8_t         auto_eoi;
+    uint8_t         rotate_on_auto_eoi;
+    uint8_t         special_fully_nested_mode;
+    uint8_t         init4;
+    uint8_t         elcr;
+    uint8_t         elcr_mask;
+};
+
+enum class KVMIRQCHIP /* Selects which PIC of the in-kernel irqchip a state get/set call targets. */
+{
+    PIC_MASTER = 0, /**< The master PIC. */
+    PIC_SLAVE = 1,  /**< The slave PIC. */
+};
+
+struct KVMIOAPICSTATE /* I/O APIC state as exchanged with the in-kernel irqchip. */
+{
+    uint64_t base_address; /**< MMIO base address; DevIoApic always sets IOAPIC_MMIO_BASE_PHYSADDR and ignores it on read. */
+    uint32_t ioregsel;     /**< Register select; DevIoApic maps this to its u8Index. */
+    uint32_t id;           /**< I/O APIC ID; DevIoApic maps this to its u8Id. */
+    uint32_t irr;          /**< Interrupt request bits; DevIoApic maps this to its uIrr. */
+
+    uint64_t redirtbl[KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS]; /**< One redirection table entry per pin (au64RedirTable). */
+};
+#endif
+
 
 RT_C_DECLS_BEGIN
 
@@ -1762,6 +1805,35 @@ typedef struct PDMPICHLP
      */
     DECLCALLBACKMEMBER(void, pfnUnlock,(PPDMDEVINS pDevIns));
 
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+    /**
+     * Asserts a PIC INTR Line.
+     * @param   pDevIns The PIC device instance.
+     * @param   u16Gsi  The GSI of the line to assert.
+     * @param   iLevel  Either PDM_IRQ_LEVEL_HIGH, PDM_IRQ_LEVEL_LOW or PDM_IRQ_LEVEL_FLIP_FLOP.
+     * @return  VBox status code.
+     */
+    DECLCALLBACKMEMBER(int, pfnKvmSetIrqLine,(PPDMDEVINS pDevIns, uint16_t u16Gsi, int iLevel));
+
+    /**
+     * Retrieves the PIC state from the in-kernel irqchip.
+     * @param   pDevIns The PIC device instance.
+     * @param   irqchip Whether to retrieve the state from the master or slave pic
+     * @param   state   Buffer to store the PIC state in.
+     * @returns VBox status code
+     */
+    DECLCALLBACKMEMBER(int, pfnKvmGetPicState,(PPDMDEVINS pDevIns, KVMIRQCHIP irqchip, KVMPICSTATE* state));
+
+    /**
+     * Configures the PIC state of the in-kernel irqchip.
+     * @param   pDevIns The PIC device instance.
+     * @param   irqchip Whether to set the state of the master or slave pic.
+     * @param   state   Pointer to the memory containing PIC state.
+     * @returns VBox status code
+     */
+    DECLCALLBACKMEMBER(int, pfnKvmSetPicState,(PPDMDEVINS pDevIns, KVMIRQCHIP irqchip, KVMPICSTATE* state));
+#endif
+
     /** Just a safety precaution. */
     uint32_t                u32TheEnd;
 } PDMPICHLP;
@@ -1948,6 +2020,55 @@ typedef struct PDMIOAPICHLP
      */
     DECLCALLBACKMEMBER(int, pfnIommuMsiRemap,(PPDMDEVINS pDevIns, uint16_t idDevice, PCMSIMSG pMsiIn, PMSIMSG pMsiOut));
 
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM)
+    DECLCALLBACKMEMBER(int, pfnKvmSetIrqLine,(PPDMDEVINS pDevIns, uint16_t u16Gsi, int iLevel));
+    /**
+     * Private interface between IOAPIC and KVM Split Irq Chip
+     *
+     * @returns status code.
+     * @param pDevIns Device instance of the IOAPIC.
+     * @param pMsi The MSI to deliver to the KVM Split Irq Chip
+     */
+    DECLCALLBACKMEMBER(int, pfnKvmSplitIrqchipDeliverMsi,(PPDMDEVINS pDevIns, PCMSIMSG pMsi));
+
+    /**
+     * Add or Update Redirection Table Entry for the desired GSI
+     *
+     * @returns status code.
+     * @param pDevIns Device instance of the IOAPIC
+     * @param u16Gsi The GSI number to change the redirection table entry for.
+     * @param pMsi The MSI that should be sent when GSI is triggered
+     */
+    DECLCALLBACKMEMBER(int, pfnKvmSplitIrqchipAddUpdateRTE, (PPDMDEVINS pDevIns, uint16_t u16Gsi, PCMSIMSG pMsi));
+
+    /**
+     * Remove the entry from the Redirection Table indicated by the GSI number.
+     *
+     * @returns status code.
+     * @param pDevIns Device instance of the IOAPIC
+     * @param u16Gsi The GSI number to remove from the Redirection Table
+     */
+    DECLCALLBACKMEMBER(int, pfnKvmSplitIrqchipRemoveRTE, (PPDMDEVINS pDevIns, uint16_t u16Gsi));
+#endif
+
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+    /**
+     * Retrieves the I/O APIC state from the in-kernel irqchip.
+     * @param   pDevIns The I/O APIC device instance.
+     * @param   state   Buffer to store the I/O APIC state in.
+     * @returns VBox status code
+     */
+    DECLCALLBACKMEMBER(int, pfnKvmGetIoApicState,(PPDMDEVINS pDevIns, KVMIOAPICSTATE* state));
+
+    /**
+     * Configures the I/O APIC state of the in-kernel irqchip.
+     * @param   pDevIns The I/O APIC device instance.
+     * @param   state Pointer to the memory containing I/O APIC state.
+     * @returns VBox status code
+     */
+    DECLCALLBACKMEMBER(int, pfnKvmSetIoApicState,(PPDMDEVINS pDevIns, KVMIOAPICSTATE* state));
+#endif
+
     /** Just a safety precaution. */
     uint32_t                u32TheEnd;
 } PDMIOAPICHLP;
diff --git a/include/iprt/mangling.h b/include/iprt/mangling.h
index 2f2bc7e..a21e4ed 100644
--- a/include/iprt/mangling.h
+++ b/include/iprt/mangling.h
@@ -2705,6 +2705,7 @@
 # define RTThreadIsSelfKnown                            RT_MANGLER(RTThreadIsSelfKnown)
 # define RTThreadNativeSelf                             RT_MANGLER(RTThreadNativeSelf)
 # define RTThreadControlPokeSignal                      RT_MANGLER(RTThreadControlPokeSignal) /* not-win not-os2 */
+# define RTThreadPokeSignal                             RT_MANGLER(RTThreadPokeSignal) /* not-win not-os2 */
 # define RTThreadPoke                                   RT_MANGLER(RTThreadPoke) /* not-win not-os2 */
 # define RTThreadPreemptDisable                         RT_MANGLER(RTThreadPreemptDisable)     /* r0drv */
 # define RTThreadPreemptIsEnabled                       RT_MANGLER(RTThreadPreemptIsEnabled)   /* r0drv */
diff --git a/include/iprt/thread.h b/include/iprt/thread.h
index d4d504c..49013eb 100644
--- a/include/iprt/thread.h
+++ b/include/iprt/thread.h
@@ -555,6 +555,12 @@ RTDECL(int) RTThreadPoke(RTTHREAD hThread);
  */
 RTDECL(int) RTThreadControlPokeSignal(RTTHREAD hThread, bool fEnable);
 
+/**
+ * Returns the signal that is used to poke threads.
+ *
+ * @returns a signal number or -1.
+ */
+RTDECL(int) RTThreadPokeSignal(void);
 
 # ifdef IN_RING0
 
diff --git a/include/iprt/x86.h b/include/iprt/x86.h
index 8b7ecd2..3ee9197 100644
--- a/include/iprt/x86.h
+++ b/include/iprt/x86.h
@@ -682,6 +682,8 @@ typedef const X86CPUIDFEATEDX *PCX86CPUIDFEATEDX;
 #define X86_CPUID_STEXT_FEATURE_EBX_SMAP              RT_BIT_32(20)
 /** EBX Bit 23 - CLFLUSHOPT - Supports CLFLUSHOPT (Cache Line Flush). */
 #define X86_CPUID_STEXT_FEATURE_EBX_CLFLUSHOPT        RT_BIT_32(23)
+/** EBX Bit 24 - CLWB - Supports CLWB (Cache Line write-back). */
+#define X86_CPUID_STEXT_FEATURE_EBX_CLWB              RT_BIT_32(24)
 /** EBX Bit 25 - INTEL_PT - Supports Intel Processor Trace. */
 #define X86_CPUID_STEXT_FEATURE_EBX_INTEL_PT          RT_BIT_32(25)
 /** EBX Bit 26 - AVX512PF - Supports AVX512PF. */
@@ -703,6 +705,8 @@ typedef const X86CPUIDFEATEDX *PCX86CPUIDFEATEDX;
 #define X86_CPUID_STEXT_FEATURE_ECX_OSPKE             RT_BIT_32(4)
 /** ECX Bit 7 - CET_SS - Supports CET shadow stack features. */
 #define X86_CPUID_STEXT_FEATURE_ECX_CET_SS            RT_BIT_32(7)
+/** ECX Bit 8 - GFNI - Supports Galois Field instructions. */
+#define X86_CPUID_STEXT_FEATURE_ECX_GFNI              RT_BIT_32(8)
 /** ECX Bits 17-21 - MAWAU - Value used by BNDLDX and BNDSTX. */
 #define X86_CPUID_STEXT_FEATURE_ECX_MAWAU             UINT32_C(0x003e0000)
 /** ECX Bit 22 - RDPID - Support pread process ID. */
@@ -710,6 +714,8 @@ typedef const X86CPUIDFEATEDX *PCX86CPUIDFEATEDX;
 /** ECX Bit 30 - SGX_LC - Supports SGX launch configuration. */
 #define X86_CPUID_STEXT_FEATURE_ECX_SGX_LC            RT_BIT_32(30)
 
+/** EDX Bit 4 - FSRM - Supports Fast Short REP MOVSB. */
+#define X86_CPUID_STEXT_FEATURE_EDX_FSRM              RT_BIT_32(4)
 /** EDX bit 9 - SRBDS_CTRL - (Special Register Buffer Data Sample Control)
  *  Supports IA32_MCU_OPT_CTRL and IA32_MCU_OPT_CTRL.RNGDS_MITG_DIS. */
 #define X86_CPUID_STEXT_FEATURE_EDX_SRBDS_CTRL        RT_BIT_32(9)
@@ -717,6 +723,8 @@ typedef const X86CPUIDFEATEDX *PCX86CPUIDFEATEDX;
 #define X86_CPUID_STEXT_FEATURE_EDX_MD_CLEAR          RT_BIT_32(10)
 /** EDX Bit 11 - TSX_FORCE_ABORT - Supports for IA32_TSX_FORCE_ABORT MSR. */
 #define X86_CPUID_STEXT_FEATURE_EDX_TSX_FORCE_ABORT   RT_BIT_32(11)
+/** EDX Bit 14 - SERIALIZE - Supports the SERIALIZE CPU instruction. */
+#define X86_CPUID_STEXT_FEATURE_EDX_SERIALIZE         RT_BIT_32(14)
 /** EDX Bit 20 - CET_IBT - Supports CET indirect branch tracking features. */
 #define X86_CPUID_STEXT_FEATURE_EDX_CET_IBT           RT_BIT_32(20)
 /** EDX Bit 26 - IBRS & IBPB - Supports the IBRS flag in IA32_SPEC_CTRL and
diff --git a/src/VBox/Devices/PC/DevACPI.cpp b/src/VBox/Devices/PC/DevACPI.cpp
index 50e1ca6..4efb074 100644
--- a/src/VBox/Devices/PC/DevACPI.cpp
+++ b/src/VBox/Devices/PC/DevACPI.cpp
@@ -814,7 +814,11 @@ struct ACPITBLISO
     uint16_t            u16Flags;               /**< MPS INTI flags Global */
 };
 AssertCompileSize(ACPITBLISO, 10);
-#define NUMBER_OF_IRQ_SOURCE_OVERRIDES 2
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+#define NUMBER_OF_IRQ_SOURCE_OVERRIDES (10)
+#else
+#define NUMBER_OF_IRQ_SOURCE_OVERRIDES (2)
+#endif
 
 /** HPET Descriptor Structure */
 struct ACPITBLHPET
@@ -3319,8 +3323,73 @@ static void acpiR3SetupMadt(PPDMDEVINS pDevIns, PACPISTATE pThis, RTGCPHYS32 add
     isos[1].u8Bus      = 0; /* Must be 0 */
     isos[1].u8Source   = 9; /* IRQ9 */
     isos[1].u32GSI     = 9; /* connected to pin 9 */
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+    isos[1].u16Flags   = 0xd; /* active high, level triggered */
+#else
     isos[1].u16Flags   = 0xf; /* active low, level triggered */
+#endif
+
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+    isos[2].u8Type     = 2;
+    isos[2].u8Length   = sizeof(ACPITBLISO);
+    isos[2].u8Bus      = 0; /* Must be 0 */
+    isos[2].u8Source   = 16; /* IRQ16 */
+    isos[2].u32GSI     = 16; /* connected to pin 16 */
+    isos[2].u16Flags   = 0xd; /* active high, level triggered */
+
+    isos[3].u8Type     = 2;
+    isos[3].u8Length   = sizeof(ACPITBLISO);
+    isos[3].u8Bus      = 0; /* Must be 0 */
+    isos[3].u8Source   = 17; /* IRQ17 */
+    isos[3].u32GSI     = 17; /* connected to pin 17 */
+    isos[3].u16Flags   = 0xd; /* active high, level triggered */
+
+    isos[4].u8Type     = 2;
+    isos[4].u8Length   = sizeof(ACPITBLISO);
+    isos[4].u8Bus      = 0; /* Must be 0 */
+    isos[4].u8Source   = 18; /* IRQ18 */
+    isos[4].u32GSI     = 18; /* connected to pin 18 */
+    isos[4].u16Flags   = 0xd; /* active high, level triggered */
+
+    isos[5].u8Type     = 2;
+    isos[5].u8Length   = sizeof(ACPITBLISO);
+    isos[5].u8Bus      = 0; /* Must be 0 */
+    isos[5].u8Source   = 19; /* IRQ19 */
+    isos[5].u32GSI     = 19; /* connected to pin 19 */
+    isos[5].u16Flags   = 0xd; /* active high, level triggered */
+
+    isos[6].u8Type     = 2;
+    isos[6].u8Length   = sizeof(ACPITBLISO);
+    isos[6].u8Bus      = 0; /* Must be 0 */
+    isos[6].u8Source   = 20; /* IRQ20 */
+    isos[6].u32GSI     = 20; /* connected to pin 20 */
+    isos[6].u16Flags   = 0xd; /* active high, level triggered */
+
+    isos[7].u8Type     = 2;
+    isos[7].u8Length   = sizeof(ACPITBLISO);
+    isos[7].u8Bus      = 0; /* Must be 0 */
+    isos[7].u8Source   = 21; /* IRQ21 */
+    isos[7].u32GSI     = 21; /* connected to pin 21 */
+    isos[7].u16Flags   = 0xd; /* active high, level triggered */
+
+    isos[8].u8Type     = 2;
+    isos[8].u8Length   = sizeof(ACPITBLISO);
+    isos[8].u8Bus      = 0; /* Must be 0 */
+    isos[8].u8Source   = 22; /* IRQ22 */
+    isos[8].u32GSI     = 22; /* connected to pin 22 */
+    isos[8].u16Flags   = 0xd; /* active high, level triggered */
+
+    isos[9].u8Type     = 2;
+    isos[9].u8Length   = sizeof(ACPITBLISO);
+    isos[9].u8Bus      = 0; /* Must be 0 */
+    isos[9].u8Source   = 23; /* IRQ23 */
+    isos[9].u32GSI     = 23; /* connected to pin 23 */
+    isos[9].u16Flags   = 0xd; /* active high, level triggered */
+
+    Assert(NUMBER_OF_IRQ_SOURCE_OVERRIDES == 10);
+#else
     Assert(NUMBER_OF_IRQ_SOURCE_OVERRIDES == 2);
+#endif
 
     madt.header_addr()->u8Checksum = acpiR3Checksum(madt.data(), madt.size());
     acpiR3PhysCopy(pDevIns, addr, madt.data(), madt.size());
diff --git a/src/VBox/Devices/PC/DevIoApic.cpp b/src/VBox/Devices/PC/DevIoApic.cpp
index 2dd37c2..796b539 100644
--- a/src/VBox/Devices/PC/DevIoApic.cpp
+++ b/src/VBox/Devices/PC/DevIoApic.cpp
@@ -32,6 +32,14 @@
 #define LOG_GROUP LOG_GROUP_DEV_IOAPIC
 #include <VBox/log.h>
 #include <VBox/vmm/hm.h>
+
+#ifdef VBOX_WITH_KVM
+#include <VBox/vmm/nem.h>
+#ifdef IN_RING3
+#include <vector>
+#endif
+#endif
+
 #include <VBox/msi.h>
 #include <VBox/pci.h>
 #include <VBox/vmm/pdmdev.h>
@@ -40,7 +48,6 @@
 #include <iprt/x86.h>
 #include <iprt/string.h>
 
-
 /*********************************************************************************************************************************
 *   Defined Constants And Macros                                                                                                 *
 *********************************************************************************************************************************/
@@ -68,6 +75,10 @@ Controller" */
 
 /** The number of interrupt input pins. */
 #define IOAPIC_NUM_INTR_PINS                    24
+
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM)
+AssertCompile(IOAPIC_NUM_INTR_PINS == KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS);
+#endif
 /** Maximum redirection entires. */
 #define IOAPIC_MAX_RTE_INDEX                    (IOAPIC_NUM_INTR_PINS - 1)
 /** Reduced RTEs used by SIO.A (82379AB). */
@@ -340,6 +351,19 @@ typedef struct IOAPIC
 #endif
     /** Per-vector stats. */
     STAMCOUNTER             aStatVectors[256];
+
+#if defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL) && defined(IN_RING3)
+    /** Handle to the timer that is used for delayed IRQ injection */
+    TMTIMERHANDLE           hIoapicDelayedInjectionHandler;
+
+    /** List of PINs that need delayed injection handling, protected by IOAPIC_LOCK */
+    std::vector<uint8_t> delayed_interrupt_list;
+
+    /** A per-GSI counter that is increased whenever a level triggered interrupt is
+        instantly pending following an EOI. The counter is reset to zero when no
+        interrupt is pending following an EOI. */
+    uint64_t gsi_counter[IOAPIC_NUM_INTR_PINS] {};
+#endif
 } IOAPIC;
 AssertCompileMemberAlignment(IOAPIC, au64RedirTable, 8);
 /** Pointer to shared IOAPIC data. */
@@ -572,6 +596,35 @@ DECLINLINE(void) ioapicGetMsiFromRte(uint64_t u64Rte, IOAPICTYPE enmType, PMSIMS
 #endif
 
 
+/** Detects an interrupt storm on a GSI and, if one is found, defers the injection (KVM split irqchip, ring-3 only).
+ * @returns true if the GSI was queued for the delayed-injection timer (caller must not inject now), false otherwise.
+ * @param   pDevIns  The IOAPIC device instance.
+ * @param   pThis    The shared IOAPIC instance data.
+ * @param   idxRte   The redirection table entry (GSI) that is about to be signalled. */
+static bool handlePossibleInterruptStorm(PPDMDEVINS pDevIns, PIOAPIC pThis, unsigned idxRte)
+{
+#if defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL) && defined(IN_RING3)
+    /* There are buggy drivers that do not clear all interrupt conditions before sending an EOI to the
+       IOAPIC. On real HW such drivers make slow forward progress because the IOAPIC needs a few cycles
+       before the next interrupt is injected after an EOI. Delay the interrupt to give the guest that slack. */
+    static constexpr uint64_t NUM_EXCESSIVE_INTERRUPTS {10000};
+    if (++pThis->gsi_counter[idxRte] == NUM_EXCESSIVE_INTERRUPTS) {
+        LogRel(("Interrupt storm on GSI %u, delaying injection\n", idxRte));
+
+        // Reset our counter so the next injection of this GSI succeeds.
+        pThis->gsi_counter[idxRte] = 0;
+
+        // Remember which GSI to raise after the delay; idxRte indexes gsi_counter, so it fits into uint8_t.
+        pThis->delayed_interrupt_list.push_back(static_cast<uint8_t>(idxRte));
+        // Arm the delayed injection handler.
+        PDMDevHlpTimerSetMillies(pDevIns, pThis->hIoapicDelayedInjectionHandler, 100 /* ms */);
+        return true;
+    }
+#else
+    NOREF(pDevIns); NOREF(pThis); NOREF(idxRte);
+#endif
+    return false;
+}
 /**
  * Signals the next pending interrupt for the specified Redirection Table Entry
  * (RTE).
@@ -608,6 +661,10 @@ static void ioapicSignalIntrForRte(PPDMDEVINS pDevIns, PIOAPIC pThis, PIOAPICCC
             STAM_COUNTER_INC(&pThis->StatSuppressedLevelIntr);
             return;
         }
+
+        if (handlePossibleInterruptStorm(pDevIns, pThis, idxRte)) {
+            return;
+        }
     }
 
     XAPICINTR ApicIntr;
@@ -655,6 +712,11 @@ static void ioapicSignalIntrForRte(PPDMDEVINS pDevIns, PIOAPIC pThis, PIOAPICCC
     }
 #endif
 
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+    AssertReleaseMsg(rcRemap == VERR_IOMMU_NOT_PRESENT || rcRemap == VERR_IOMMU_CANNOT_CALL_SELF,
+                     ("Interrupt remapping not supported yet."));
+    int rc = pThisCC->pIoApicHlp->pfnKvmSplitIrqchipDeliverMsi(pDevIns, &MsiIn);
+#else
     uint32_t const u32TagSrc = pThis->au32TagSrc[idxRte];
     Log2(("IOAPIC: Signaling %s-triggered interrupt. Dest=%#x DestMode=%s Vector=%#x (%u)\n",
           ApicIntr.u8TriggerMode == IOAPIC_RTE_TRIGGER_MODE_EDGE ? "edge" : "level", ApicIntr.u8Dest,
@@ -672,6 +734,7 @@ static void ioapicSignalIntrForRte(PPDMDEVINS pDevIns, PIOAPIC pThis, PIOAPICCC
                                                     ApicIntr.u8Polarity,
                                                     ApicIntr.u8TriggerMode,
                                                     u32TagSrc);
+#endif
     /* Can't reschedule to R3. */
     Assert(rc == VINF_SUCCESS || rc == VERR_APIC_INTR_DISCARDED);
 #ifdef DEBUG_ramshankar
@@ -781,6 +844,16 @@ static VBOXSTRICTRC ioapicSetRedirTableEntry(PPDMDEVINS pDevIns, PIOAPIC pThis,
 
         LogFlow(("IOAPIC: ioapicSetRedirTableEntry: uIndex=%#RX32 idxRte=%u uValue=%#RX32\n", uIndex, idxRte, uValue));
 
+#if defined(VBOX_WITH_KVM) && defined(IN_RING3) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+        const uint64_t u64RteNew { pThis->au64RedirTable[idxRte] };
+        if (not IOAPIC_RTE_IS_MASKED(u64RteNew)) {
+            MSIMSG msi;
+            RT_ZERO(msi);
+            ioapicGetMsiFromRte(u64RteNew, pThis->enmType, &msi);
+            rc = pThisCC->pIoApicHlp->pfnKvmSplitIrqchipAddUpdateRTE(pDevIns, idxRte, &msi);
+        }
+#endif
+
         /*
          * Signal the next pending interrupt for this RTE.
          */
@@ -790,7 +863,6 @@ static VBOXSTRICTRC ioapicSetRedirTableEntry(PPDMDEVINS pDevIns, PIOAPIC pThis,
             LogFlow(("IOAPIC: ioapicSetRedirTableEntry: Signalling pending interrupt. idxRte=%u\n", idxRte));
             ioapicSignalIntrForRte(pDevIns, pThis, pThisCC, idxRte);
         }
-
         IOAPIC_UNLOCK(pDevIns, pThis, pThisCC);
     }
     else
@@ -947,6 +1019,15 @@ static DECLCALLBACK(void) ioapicSetIrq(PPDMDEVINS pDevIns, PCIBDF uBusDevFn, int
     PIOAPIC   pThis   = PDMDEVINS_2_DATA(pDevIns, PIOAPIC);
     PIOAPICCC pThisCC = PDMDEVINS_2_DATA_CC(pDevIns, PIOAPICCC);
     LogFlow(("IOAPIC: ioapicSetIrq: iIrq=%d iLevel=%d uTagSrc=%#x\n", iIrq, iLevel, uTagSrc));
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+    pThisCC->pIoApicHlp->pfnKvmSetIrqLine(pDevIns, iIrq, iLevel & PDM_IRQ_LEVEL_HIGH);
+
+    if ((iLevel & PDM_IRQ_LEVEL_FLIP_FLOP) == PDM_IRQ_LEVEL_FLIP_FLOP) {
+        pThisCC->pIoApicHlp->pfnKvmSetIrqLine(pDevIns, iIrq, PDM_IRQ_LEVEL_LOW);
+    }
+
+    return;
+#endif
 
     STAM_COUNTER_INC(&pThis->CTX_SUFF_Z(StatSetIrq));
 
@@ -969,6 +1050,9 @@ static DECLCALLBACK(void) ioapicSetIrq(PPDMDEVINS pDevIns, PCIBDF uBusDevFn, int
 #endif
         if (!fActive)
         {
+#if defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL) && defined(IN_RING3)
+            pThis->gsi_counter[idxRte] = 0;
+#endif
             pThis->uIrr &= ~uPinMask;
             pThis->au32TagSrc[idxRte] = 0;
             IOAPIC_UNLOCK(pDevIns, pThis, pThisCC);
@@ -1087,7 +1171,11 @@ static DECLCALLBACK(void) ioapicSendMsi(PPDMDEVINS pDevIns, PCIBDF uBusDevFn, PC
 #else
     NOREF(uBusDevFn);
 #endif
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM)
+    int rc = pThisCC->pIoApicHlp->pfnKvmSplitIrqchipDeliverMsi(pDevIns, pMsi);
 
+    AssertReleaseMsg(rc == VINF_SUCCESS || rc == VERR_APIC_INTR_DISCARDED, ("ioapicSendMsi: Could not deliver MSI! error %d\n", rc));
+#else
     ioapicGetApicIntrFromMsi(pMsi, &ApicIntr);
 
     /*
@@ -1105,6 +1193,7 @@ static DECLCALLBACK(void) ioapicSendMsi(PPDMDEVINS pDevIns, PCIBDF uBusDevFn, PC
                                                     uTagSrc);
     /* Can't reschedule to R3. */
     Assert(rc == VINF_SUCCESS || rc == VERR_APIC_INTR_DISCARDED); NOREF(rc);
+#endif
 }
 
 
@@ -1451,10 +1540,33 @@ static DECLCALLBACK(void) ioapicR3DbgInfo(PPDMDEVINS pDevIns, PCDBGFINFOHLP pHlp
  */
 static DECLCALLBACK(int) ioapicR3SaveExec(PPDMDEVINS pDevIns, PSSMHANDLE pSSM)
 {
-    PCIOAPIC        pThis = PDMDEVINS_2_DATA(pDevIns, PCIOAPIC);
+    PIOAPIC         pThis = PDMDEVINS_2_DATA(pDevIns, PIOAPIC);
     PCPDMDEVHLPR3   pHlp  = pDevIns->pHlpR3;
     LogFlow(("IOAPIC: ioapicR3SaveExec\n"));
 
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+    /* Pull the I/O APIC state out of the in-kernel irqchip into pThis before it is written
+       to the saved state below. One query is enough: the helper takes no chip selector
+       (unlike the PIC master/slave pair). Bail out on failure instead of saving garbage. */
+    PIOAPICCC pThisCC = PDMDEVINS_2_DATA_CC(pDevIns, PIOAPICCC);
+    KVMIOAPICSTATE kvm_ioapic_state;
+    RT_ZERO(kvm_ioapic_state);
+
+    int const rcKvm = pThisCC->pIoApicHlp->pfnKvmGetIoApicState(pDevIns, &kvm_ioapic_state);
+    AssertLogRelMsgReturn(RT_SUCCESS(rcKvm), ("Unable to retrieve IOAPIC state from KVM"), rcKvm);
+
+    /*
+     * There's no need to look at kvm_ioapic_state.base_address because
+     * VBox does not support IOAPIC relocation, thus, it will always be
+     * at IOAPIC_MMIO_BASE_PHYSADDR.
+     */
+    pThis->uIrr = kvm_ioapic_state.irr;
+    pThis->u8Id = kvm_ioapic_state.id;
+    pThis->u8Index = kvm_ioapic_state.ioregsel;
+    for (uint8_t idxRte = 0; idxRte < RT_ELEMENTS(pThis->au64RedirTable); idxRte++)
+        pThis->au64RedirTable[idxRte] = kvm_ioapic_state.redirtbl[idxRte];
+#endif
+
     pHlp->pfnSSMPutU32(pSSM, pThis->uIrr);
     pHlp->pfnSSMPutU8(pSSM,  pThis->u8Id);
     pHlp->pfnSSMPutU8(pSSM,  pThis->u8Index);
@@ -1497,6 +1609,39 @@ static DECLCALLBACK(int) ioapicR3LoadExec(PPDMDEVINS pDevIns, PSSMHANDLE pSSM, u
     for (uint8_t idxRte = 0; idxRte < RT_ELEMENTS(pThis->au64RedirTable); idxRte++)
         pHlp->pfnSSMGetU64(pSSM, &pThis->au64RedirTable[idxRte]);
 
+#if defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+    PIOAPICCC pThisCC = PDMDEVINS_2_DATA_CC(pDevIns, PIOAPICCC);
+    for (uint8_t idxRte = 0; idxRte < RT_ELEMENTS(pThis->au64RedirTable); idxRte++) {
+        const uint64_t u64RteNew { pThis->au64RedirTable[idxRte] };
+        if (not IOAPIC_RTE_IS_MASKED(u64RteNew) and (IOAPIC_RTE_GET_TRIGGER_MODE(u64RteNew) != IOAPIC_RTE_TRIGGER_MODE_EDGE)) {
+            MSIMSG msi;
+            RT_ZERO(msi);
+            ioapicGetMsiFromRte(u64RteNew, pThis->enmType, &msi);
+            int rc = pThisCC->pIoApicHlp->pfnKvmSplitIrqchipAddUpdateRTE(pDevIns, idxRte, &msi);
+            AssertLogRelMsg(RT_SUCCESS(rc), ("Adding redirection table entry failed."));
+        }
+    }
+#endif
+
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+    PIOAPICCC pThisCC = PDMDEVINS_2_DATA_CC(pDevIns, PIOAPICCC);
+    KVMIOAPICSTATE kvm_ioapic_state;
+
+    { /* Push the state just loaded into pThis to KVM once; the setter takes no chip selector. */
+        kvm_ioapic_state.base_address = IOAPIC_MMIO_BASE_PHYSADDR;
+        kvm_ioapic_state.irr = pThis->uIrr;
+        kvm_ioapic_state.id = pThis->u8Id;
+        kvm_ioapic_state.ioregsel = pThis->u8Index;
+
+        for (uint8_t idxRte = 0; idxRte < RT_ELEMENTS(pThis->au64RedirTable); idxRte++) {
+            kvm_ioapic_state.redirtbl[idxRte] = pThis->au64RedirTable[idxRte];
+        }
+
+        int rc = pThisCC->pIoApicHlp->pfnKvmSetIoApicState(pDevIns, &kvm_ioapic_state);
+        AssertLogRelMsgReturn(RT_SUCCESS(rc), ("Unable to push IOAPIC state to KVM\n"), rc);
+    }
+#endif
+
     if (uVersion > IOAPIC_SAVED_STATE_VERSION_NO_FLIPFLOP_MAP)
         for (uint8_t idx = 0; idx < RT_ELEMENTS(pThis->bmFlipFlop); idx++)
             pHlp->pfnSSMGetU64(pSSM, &pThis->bmFlipFlop[idx]);
@@ -1525,6 +1670,10 @@ static DECLCALLBACK(void) ioapicR3Reset(PPDMDEVINS pDevIns)
     {
         pThis->au64RedirTable[idxRte] = IOAPIC_RTE_MASK;
         pThis->au32TagSrc[idxRte] = 0;
+#if defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+        int rc = pThisCC->pIoApicHlp->pfnKvmSplitIrqchipRemoveRTE(pDevIns, idxRte);
+        AssertLogRelMsg(RT_SUCCESS(rc), ("Removing redirection table entry failed."));
+#endif
     }
 
     IOAPIC_UNLOCK(pDevIns, pThis, pThisCC);
@@ -1552,6 +1701,10 @@ static DECLCALLBACK(int) ioapicR3Destruct(PPDMDEVINS pDevIns)
     PIOAPIC pThis = PDMDEVINS_2_DATA(pDevIns, PIOAPIC);
     LogFlow(("IOAPIC: ioapicR3Destruct: pThis=%p\n", pThis));
 
+#if defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL) && defined(IN_RING3)
+    PDMDevHlpTimerDestroy(pDevIns, pThis->hIoapicDelayedInjectionHandler);
+#endif
+
 # ifndef IOAPIC_WITH_PDM_CRITSECT
     /*
      * Destroy the RTE critical section.
@@ -1565,6 +1718,26 @@ static DECLCALLBACK(int) ioapicR3Destruct(PPDMDEVINS pDevIns)
     return VINF_SUCCESS;
 }
 
+static DECLCALLBACK(void) ioapicDelayedInjectionHandler(PPDMDEVINS pDevIns, TMTIMERHANDLE hTimer, void *pvUser)
+{
+    NOREF(hTimer); NOREF(pvUser);
+#if defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL) && defined(IN_RING3)
+    PIOAPIC         pThis   = PDMDEVINS_2_DATA(pDevIns, PIOAPIC);
+    PIOAPICCC       pThisCC = PDMDEVINS_2_DATA_CC(pDevIns, PIOAPICCC);
+
+    IOAPIC_LOCK(pDevIns, pThis, pThisCC, VERR_IGNORED);
+
+    for(auto iPin : pThis->delayed_interrupt_list) {
+        ioapicSignalIntrForRte(pDevIns, pThis, pThisCC, iPin);
+    }
+
+    pThis->delayed_interrupt_list.clear();
+
+    IOAPIC_UNLOCK(pDevIns, pThis, pThisCC);
+#else
+    NOREF(pDevIns);
+#endif
+}
 
 /**
  * @interface_method_impl{PDMDEVREG,pfnConstruct}
@@ -1578,6 +1751,12 @@ static DECLCALLBACK(int) ioapicR3Construct(PPDMDEVINS pDevIns, int iInstance, PC
     LogFlow(("IOAPIC: ioapicR3Construct: pThis=%p iInstance=%d\n", pThis, iInstance));
     Assert(iInstance == 0); RT_NOREF(iInstance);
 
+#if defined(VBOX_WITH_KVM) && !defined(VBOX_WITH_KVM_IRQCHIP_FULL) && defined(IN_RING3)
+    int rc_timer = PDMDevHlpTimerCreate(pDevIns, TMCLOCK_VIRTUAL, ioapicDelayedInjectionHandler, pThis,
+            TMTIMER_FLAGS_NO_CRIT_SECT | TMTIMER_FLAGS_NO_RING0, "IOAPIC Delayed IRQ", &pThis->hIoapicDelayedInjectionHandler);
+    AssertRCReturn(rc_timer, rc_timer);
+#endif
+
     /*
      * Validate and read the configuration.
      */
diff --git a/src/VBox/Devices/PC/DevPIC.cpp b/src/VBox/Devices/PC/DevPIC.cpp
index 4ad8d83..651b706 100644
--- a/src/VBox/Devices/PC/DevPIC.cpp
+++ b/src/VBox/Devices/PC/DevPIC.cpp
@@ -366,6 +366,16 @@ static DECLCALLBACK(void) picSetIrq(PPDMDEVINS pDevIns, int iIrq, int iLevel, ui
 {
     PDEVPIC     pThis   = PDMDEVINS_2_DATA(pDevIns, PDEVPIC);
     PDEVPICCC   pThisCC = PDMDEVINS_2_DATA_CC(pDevIns, PDEVPICCC);
+
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+    AssertMsgReturnVoid(iIrq < 16, ("iIrq=%d\n", iIrq)); /* Same range check as the emulated path below. */
+    RT_NOREF(pThis, uTagSrc); /* Only the emulated path below uses these. */
+    pThisCC->pPicHlp->pfnKvmSetIrqLine(pDevIns, iIrq, iLevel & PDM_IRQ_LEVEL_HIGH);
+    if ((iLevel & PDM_IRQ_LEVEL_FLIP_FLOP) == PDM_IRQ_LEVEL_FLIP_FLOP) {
+        pThisCC->pPicHlp->pfnKvmSetIrqLine(pDevIns, iIrq, PDM_IRQ_LEVEL_LOW); /* Flip-flop: lower the line again. */
+    }
+    return;
+#else
     AssertMsgReturnVoid(iIrq < 16, ("iIrq=%d\n", iIrq));
 
     Log(("picSetIrq %d %d\n", iIrq, iLevel));
@@ -383,6 +393,7 @@ static DECLCALLBACK(void) picSetIrq(PPDMDEVINS pDevIns, int iIrq, int iLevel, ui
     }
     pic_set_irq1(&RT_SAFE_SUBSCRIPT(pThis->aPics, iIrq >> 3), iIrq & 7, iLevel & PDM_IRQ_LEVEL_HIGH, uTagSrc);
     pic_update_irq(pDevIns, pThis, pThisCC);
+#endif
 }
 
 
@@ -830,6 +841,33 @@ static DECLCALLBACK(int) picR3SaveExec(PPDMDEVINS pDevIns, PSSMHANDLE pSSM)
     PDEVPIC         pThis = PDMDEVINS_2_DATA(pDevIns, PDEVPIC);
     PCPDMDEVHLPR3   pHlp  = pDevIns->pHlpR3;
 
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+    PDEVPICCC   pThisCC = PDMDEVINS_2_DATA_CC(pDevIns, PDEVPICCC);
+    KVMPICSTATE kvm_pic_state;
+    /* Refresh pThis from KVM so the code below saves the current state; abort the save if KVM fails. */
+    for (unsigned pic = 0; pic < 2; ++pic) {
+        int rc = pThisCC->pPicHlp->pfnKvmGetPicState(pDevIns, pic == 0 ? KVMIRQCHIP::PIC_MASTER : KVMIRQCHIP::PIC_SLAVE, &kvm_pic_state);
+        AssertLogRelMsgReturn(RT_SUCCESS(rc), ("Unable to retrieve PIC state from KVM\n"), rc);
+
+        pThis->aPics[pic].last_irr = kvm_pic_state.last_irr;
+        pThis->aPics[pic].irr = kvm_pic_state.irr;
+        pThis->aPics[pic].imr = kvm_pic_state.imr;
+        pThis->aPics[pic].isr = kvm_pic_state.isr;
+        pThis->aPics[pic].priority_add = kvm_pic_state.priority_add;
+        pThis->aPics[pic].irq_base = kvm_pic_state.irq_base;
+        pThis->aPics[pic].read_reg_select = kvm_pic_state.read_reg_select;
+        pThis->aPics[pic].poll = kvm_pic_state.poll;
+        pThis->aPics[pic].special_mask = kvm_pic_state.special_mask;
+        pThis->aPics[pic].init_state = kvm_pic_state.init_state;
+        pThis->aPics[pic].auto_eoi = kvm_pic_state.auto_eoi;
+        pThis->aPics[pic].rotate_on_auto_eoi = kvm_pic_state.rotate_on_auto_eoi;
+        pThis->aPics[pic].special_fully_nested_mode = kvm_pic_state.special_fully_nested_mode;
+        pThis->aPics[pic].init4 = kvm_pic_state.init4;
+        pThis->aPics[pic].elcr = kvm_pic_state.elcr;
+        pThis->aPics[pic].elcr_mask = kvm_pic_state.elcr_mask;
+    }
+#endif
+
     for (unsigned i = 0; i < RT_ELEMENTS(pThis->aPics); i++)
     {
         pHlp->pfnSSMPutU8(pSSM, pThis->aPics[i].last_irr);
@@ -883,6 +921,33 @@ static DECLCALLBACK(int) picR3LoadExec(PPDMDEVINS pDevIns, PSSMHANDLE pSSM, uint
         pHlp->pfnSSMGetU8(pSSM, &pThis->aPics[i].elcr);
     }
 
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+    PDEVPICCC   pThisCC = PDMDEVINS_2_DATA_CC(pDevIns, PDEVPICCC);
+    KVMPICSTATE kvm_pic_state;
+    /* Push the state just loaded into pThis to KVM; fail the load if KVM rejects it. */
+    for (unsigned pic = 0; pic < 2; ++pic) {
+        kvm_pic_state.last_irr = pThis->aPics[pic].last_irr;
+        kvm_pic_state.irr = pThis->aPics[pic].irr;
+        kvm_pic_state.imr = pThis->aPics[pic].imr;
+        kvm_pic_state.isr = pThis->aPics[pic].isr;
+        kvm_pic_state.priority_add = pThis->aPics[pic].priority_add;
+        kvm_pic_state.irq_base = pThis->aPics[pic].irq_base;
+        kvm_pic_state.read_reg_select = pThis->aPics[pic].read_reg_select;
+        kvm_pic_state.poll = pThis->aPics[pic].poll;
+        kvm_pic_state.special_mask = pThis->aPics[pic].special_mask;
+        kvm_pic_state.init_state = pThis->aPics[pic].init_state;
+        kvm_pic_state.auto_eoi = pThis->aPics[pic].auto_eoi;
+        kvm_pic_state.rotate_on_auto_eoi = pThis->aPics[pic].rotate_on_auto_eoi;
+        kvm_pic_state.special_fully_nested_mode = pThis->aPics[pic].special_fully_nested_mode;
+        kvm_pic_state.init4 = pThis->aPics[pic].init4;
+        kvm_pic_state.elcr = pThis->aPics[pic].elcr;
+        kvm_pic_state.elcr_mask = pThis->aPics[pic].elcr_mask;
+
+        int rc = pThisCC->pPicHlp->pfnKvmSetPicState(pDevIns, pic == 0 ? KVMIRQCHIP::PIC_MASTER : KVMIRQCHIP::PIC_SLAVE, &kvm_pic_state);
+        AssertLogRelMsgReturn(RT_SUCCESS(rc), ("Unable to push PIC state to KVM\n"), rc);
+    }
+#endif
+
     /* Note! PDM will restore the VMCPU_FF_INTERRUPT_PIC state. */
     return VINF_SUCCESS;
 }
diff --git a/src/VBox/HostDrivers/Support/Makefile.kmk b/src/VBox/HostDrivers/Support/Makefile.kmk
index 48a28d3..d4032db 100644
--- a/src/VBox/HostDrivers/Support/Makefile.kmk
+++ b/src/VBox/HostDrivers/Support/Makefile.kmk
@@ -196,6 +196,7 @@ SUPR3_DEFS          = \
 	$(if $(VBOX_WITH_RAW_MODE),VBOX_WITH_RAW_MODE,) \
 	$(if $(VBOX_WITH_DRIVERLESS_NEM_FALLBACK),VBOX_WITH_DRIVERLESS_NEM_FALLBACK,) \
 	$(if $(VBOX_WITH_R0_MODULES),VBOX_WITH_R0_MODULES,) \
+        $(if $(VBOX_WITH_PREALLOC_RAM_BY_DEFAULT),VBOX_WITH_PREALLOC_RAM_BY_DEFAULT,) \
 	VBOX_PERMIT_MORE \
 	VBOX_PERMIT_EVEN_MORE
 SUPR3_INCS         := $(PATH_SUB_CURRENT)
diff --git a/src/VBox/HostDrivers/Support/linux/SUPLib-linux.cpp b/src/VBox/HostDrivers/Support/linux/SUPLib-linux.cpp
index 2591661..3b8a6e2 100644
--- a/src/VBox/HostDrivers/Support/linux/SUPLib-linux.cpp
+++ b/src/VBox/HostDrivers/Support/linux/SUPLib-linux.cpp
@@ -96,6 +96,11 @@ DECLHIDDEN(int) suplibOsInit(PSUPLIBDATA pThis, bool fPreInited, uint32_t fFlags
         return VINF_SUCCESS;
     Assert(pThis->hDevice == (intptr_t)NIL_RTFILE);
 
+#ifdef VBOX_WITH_KVM
+    pThis->fDriverless = true;
+    return VINF_SUCCESS;
+#endif
+
     /*
      * Check if madvise works.
      */
@@ -256,10 +261,15 @@ DECLHIDDEN(int) suplibOsPageAlloc(PSUPLIBDATA pThis, size_t cPages, uint32_t fFl
         fMmap |= MAP_HUGETLB;
 #endif
 
+#ifdef VBOX_WITH_PREALLOC_RAM_BY_DEFAULT
+    fMmap |= MAP_POPULATE;
+#endif
+
     uint32_t const cbPage = SUP_PAGE_SIZE;
     uint32_t const cPageShift = SUP_PAGE_SHIFT;
 
     size_t cbMmap = cPages << cPageShift;
+
     if (   !pThis->fSysMadviseWorks
         && (fFlags & (SUP_PAGE_ALLOC_F_FOR_LOCKING | SUP_PAGE_ALLOC_F_LARGE_PAGES)) == SUP_PAGE_ALLOC_F_FOR_LOCKING)
         cbMmap += cbPage * 2;
diff --git a/src/VBox/Main/Makefile.kmk b/src/VBox/Main/Makefile.kmk
index 2aeba32..a44cb53 100644
--- a/src/VBox/Main/Makefile.kmk
+++ b/src/VBox/Main/Makefile.kmk
@@ -1114,7 +1114,8 @@ if !defined(VBOX_ONLY_SDK) && !defined(VBOX_ONLY_EXTPACKS) # Note this goes on f
 
 
  VBoxC_LIBS += \
- 	$(PATH_STAGE_LIB)/VBoxAPIWrap$(VBOX_SUFF_LIB)
+	$(PATH_STAGE_LIB)/VBoxAPIWrap$(VBOX_SUFF_LIB)
+
  VBoxC_LIBS.win += \
  	$(PATH_SDK_$(VBOX_WINPSDK)_LIB)/psapi.lib \
  	$(PATH_TOOL_$(VBOX_VCC_TOOL)_LIB)/delayimp.lib
diff --git a/src/VBox/Main/src-server/HostImpl.cpp b/src/VBox/Main/src-server/HostImpl.cpp
index 7cbd92b..2ab5c17 100644
--- a/src/VBox/Main/src-server/HostImpl.cpp
+++ b/src/VBox/Main/src-server/HostImpl.cpp
@@ -82,6 +82,8 @@
 # include <errno.h>
 # include <net/if.h>
 # include <net/if_arp.h>
+# include <fcntl.h>
+# include <unistd.h>
 #endif /* RT_OS_LINUX */
 
 #ifdef RT_OS_SOLARIS
diff --git a/src/VBox/Runtime/Makefile.kmk b/src/VBox/Runtime/Makefile.kmk
index f2c2498..22cbed4 100644
--- a/src/VBox/Runtime/Makefile.kmk
+++ b/src/VBox/Runtime/Makefile.kmk
@@ -3307,8 +3307,8 @@ if1of ($(KBUILD_TARGET).$(KBUILD_TARGET_ARCH), win.x86 win.amd64 linux.amd64 dar
 		$(if-expr "$(KBUILD_TARGET_ARCH)" == "amd64",-e "/not-amd64/d",-e "/only-amd64/d") \
 		$(if-expr "$(KBUILD_TARGET_ARCH)" == "arm64",-e "/not-arm64/d",-e "/only-arm64/d") \
 		$(if-expr "$(KBUILD_TARGET).$(KBUILD_TARGET_ARCH)" == "darwin.arm64",, -e "/only-darwin.arm64/d") \
-		$(if-expr "$(substr $(if-expr $(KBUILD_TARGET) != 'win',$(VBOX_GCC_std), $(VBOX_VCC_std)),-2)" >= "17" \
-			,-e "/before-noexcept/d", -e "/after-noexcept/d") \
+		$(if-expr "$(VBOX_WITH_KVM)" != "1", $(if-expr "$(substr $(if-expr $(KBUILD_TARGET) != 'win',$(VBOX_GCC_std), $(VBOX_VCC_std)),-2)" >= "17" \
+			,-e "/before-noexcept/d", -e "/after-noexcept/d"), -e "/after-noexcept/d") \
 		$(if-expr $(intersects $(KBUILD_TARGET), linux) && $(intersects $(KBUILD_TARGET_ARCH), amd64 arm64) \
 			,-e "/int64=llong/d", -e "/int64=long/d") \
 		-f "$<" $(filter %.def, $^)
diff --git a/src/VBox/Runtime/r3/posix/thread-posix.cpp b/src/VBox/Runtime/r3/posix/thread-posix.cpp
index 8b05377..70202f7 100644
--- a/src/VBox/Runtime/r3/posix/thread-posix.cpp
+++ b/src/VBox/Runtime/r3/posix/thread-posix.cpp
@@ -729,6 +729,10 @@ RTDECL(int) RTThreadControlPokeSignal(RTTHREAD hThread, bool fEnable)
     return rc;
 }
 
+RTDECL(int) RTThreadPokeSignal(void)
+{
+    return g_iSigPokeThread;
+}
 
 #endif
 
diff --git a/src/VBox/Runtime/testcase/Makefile.kmk b/src/VBox/Runtime/testcase/Makefile.kmk
index 5fa0a11..6b5821a 100644
--- a/src/VBox/Runtime/testcase/Makefile.kmk
+++ b/src/VBox/Runtime/testcase/Makefile.kmk
@@ -610,6 +610,7 @@ ifdef VBOX_WITH_TESTCASES # The whole file
   tstLog_CLEAN        = $(tstLog_0_OUTDIR)/tstLogGroups.h
   $$(tstLog_0_OUTDIR)/tstLogGroups.h: $(PATH_ROOT)/include/VBox/log.h
 	$(call MSG_GENERATE,,$@,$<)
+	$(QUIET)$(MKDIR) -p $(tstLog_0_OUTDIR)
 	$(QUIET)$(RM) -f -- "$@"
 	$(QUIET)$(SED) -n -e 's/^ *LOG_GROUP_\([A-Z0-9_]*\),.*$(DOLLAR)/{ LOG_GROUP_\1, "\1" },/p' --output "$@" "$<"
  endif # !VBOX_ONLY_VALIDATIONKIT
diff --git a/src/VBox/VMM/Makefile.kmk b/src/VBox/VMM/Makefile.kmk
index 6cd7d4e..087819c 100644
--- a/src/VBox/VMM/Makefile.kmk
+++ b/src/VBox/VMM/Makefile.kmk
@@ -147,7 +147,8 @@ VBoxVMM_SOURCES  = \
 	VMMR3/EMR3Nem.cpp \
 	VMMR3/GCM.cpp \
 	VMMR3/GIM.cpp \
-	VMMR3/GIMHv.cpp \
+	$(if-expr !defined(VBOX_WITH_KVM), VMMR3/GIMHv.cpp,) \
+	$(if-expr  defined(VBOX_WITH_KVM), VMMR3/GIMHvOnKvm.cpp,) \
 	VMMR3/GIMKvm.cpp \
 	VMMR3/GIMMinimal.cpp \
 	VMMR3/IEMR3.cpp \
@@ -237,7 +238,8 @@ VBoxVMM_SOURCES  = \
 	VMMAll/EMAll.cpp \
 	VMMAll/GCMAll.cpp \
 	VMMAll/GIMAll.cpp \
-	VMMAll/GIMAllHv.cpp \
+	$(if-expr !defined(VBOX_WITH_KVM), VMMAll/GIMAllHv.cpp,) \
+	$(if-expr  defined(VBOX_WITH_KVM), VMMAll/GIMAllHvOnKvm.cpp,) \
 	VMMAll/GIMAllKvm.cpp \
 	VMMAll/TMAll.cpp \
 	VMMAll/TMAllCpu.cpp \
diff --git a/src/VBox/VMM/VMMAll/APICAll.cpp b/src/VBox/VMM/VMMAll/APICAll.cpp
index 192e824..6a2e63d 100644
--- a/src/VBox/VMM/VMMAll/APICAll.cpp
+++ b/src/VBox/VMM/VMMAll/APICAll.cpp
@@ -2654,6 +2654,16 @@ static DECLCALLBACK(VBOXSTRICTRC) apicSetLocalInterrupt(PVMCPUCC pVCpu, uint8_t
     AssertReturn(u8Level <= 1, VERR_INVALID_PARAMETER);
 
     VBOXSTRICTRC rcStrict = VINF_SUCCESS;
+#ifdef VBOX_WITH_KVM
+    /* TODO: Fix the local interrupt handling. See vbox-engineering#430. */
+    if (u8Level) {
+        apicSetInterruptFF(pVCpu, PDMAPICIRQ_EXTINT);
+    } else {
+        apicClearInterruptFF(pVCpu, PDMAPICIRQ_EXTINT);
+    }
+
+    return VINF_SUCCESS;
+#endif
 
     /* If the APIC is enabled, the interrupt is subject to LVT programming. */
     if (apicIsEnabled(pVCpu))
diff --git a/src/VBox/VMM/VMMAll/GIMAllHvOnKvm.cpp b/src/VBox/VMM/VMMAll/GIMAllHvOnKvm.cpp
new file mode 100644
index 0000000..f45a2d7
--- /dev/null
+++ b/src/VBox/VMM/VMMAll/GIMAllHvOnKvm.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) Cyberus Technology GmbH.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * SPDX-License-Identifier: GPL-3.0-or-later
+ */
+
+#define LOG_GROUP LOG_GROUP_GIM
+#include <VBox/vmm/dbgf.h>
+#include <VBox/vmm/gim.h>
+#include "GIMInternal.h"
+#include <VBox/vmm/vm.h>
+
+#include <VBox/err.h>
+
+#include <iprt/assert.h>
+
+/**
+ * With GIMHvOnKvm, userspace does not need to do any HyperV emulation because
+ * it all happens inside the kernel module. These stubs are merely here to make
+ * GIM.cpp happy.
+ */
+
+VMM_INT_DECL(void) gimHvStartStimer(PVMCPUCC pVCpu, PCGIMHVSTIMER pHvStimer)
+{
+    NOREF(pVCpu); NOREF(pHvStimer);
+    AssertLogRelMsg(false, ("%s", __PRETTY_FUNCTION__));
+}
+
+VMM_INT_DECL(VBOXSTRICTRC) gimHvHypercall(PVMCPUCC pVCpu, PCPUMCTX pCtx)
+{
+    NOREF(pVCpu); NOREF(pCtx);
+    AssertLogRelMsgReturn(false, ("%s", __PRETTY_FUNCTION__), VERR_NOT_SUPPORTED);
+}
+
+VMM_INT_DECL(VBOXSTRICTRC) gimHvHypercallEx(PVMCPUCC pVCpu, PCPUMCTX pCtx, unsigned uDisOpcode, uint8_t cbInstr)
+{
+    NOREF(pVCpu); NOREF(pCtx); NOREF(uDisOpcode); NOREF(cbInstr);
+    AssertLogRelMsgReturn(false, ("%s", __PRETTY_FUNCTION__), VERR_NOT_SUPPORTED);
+}
+
+VMM_INT_DECL(PGIMMMIO2REGION) gimHvGetMmio2Regions(PVM pVM, uint32_t *pcRegions)
+{
+    NOREF(pVM); if (pcRegions) *pcRegions = 0; /* Empty list: never leave the caller's count unset. */
+    return nullptr;
+}
+
+VMM_INT_DECL(bool) gimHvAreHypercallsEnabled(PCVM pVM)
+{
+    NOREF(pVM);
+    return false;
+}
+
+VMM_INT_DECL(bool) gimHvIsParavirtTscEnabled(PVM pVM)
+{
+    NOREF(pVM);
+    return false;
+}
+
+VMM_INT_DECL(bool) gimHvShouldTrapXcptUD(PVMCPU pVCpu)
+{
+    NOREF(pVCpu);
+    return false;
+}
+
+VMM_INT_DECL(VBOXSTRICTRC) gimHvXcptUD(PVMCPUCC pVCpu, PCPUMCTX pCtx, PDISSTATE pDis, uint8_t *pcbInstr)
+{
+    NOREF(pVCpu); NOREF(pCtx); NOREF(pDis); NOREF(pcbInstr);
+    AssertLogRelMsgReturn(false, ("%s", __PRETTY_FUNCTION__), VERR_NOT_SUPPORTED);
+}
+
+VMM_INT_DECL(VBOXSTRICTRC) gimHvReadMsr(PVMCPUCC pVCpu, uint32_t idMsr, PCCPUMMSRRANGE pRange, uint64_t *puValue)
+{
+    NOREF(pRange);
+    /* Only the guest crash MSRs are handled here; their values live in the Hyper-V GIM state. */
+    PVMCC   pVM = pVCpu->CTX_SUFF(pVM);
+    PCGIMHV pHv = &pVM->gim.s.u.Hv;
+
+    switch (idMsr)
+    {
+        case MSR_GIM_HV_CRASH_CTL:
+            *puValue = pHv->uCrashCtlMsr;
+            return VINF_SUCCESS;
+
+        case MSR_GIM_HV_CRASH_P0: *puValue = pHv->uCrashP0Msr;   return VINF_SUCCESS;
+        case MSR_GIM_HV_CRASH_P1: *puValue = pHv->uCrashP1Msr;   return VINF_SUCCESS;
+        case MSR_GIM_HV_CRASH_P2: *puValue = pHv->uCrashP2Msr;   return VINF_SUCCESS;
+        case MSR_GIM_HV_CRASH_P3: *puValue = pHv->uCrashP3Msr;   return VINF_SUCCESS;
+        case MSR_GIM_HV_CRASH_P4: *puValue = pHv->uCrashP4Msr;   return VINF_SUCCESS;
+        default: break;
+    }
+
+    AssertLogRelMsgReturn(false, ("%s: unhandled MSR %#RX32\n", __PRETTY_FUNCTION__, idMsr), VERR_NOT_SUPPORTED);
+}
+
+VMM_INT_DECL(VBOXSTRICTRC) gimHvWriteMsr(PVMCPUCC pVCpu, uint32_t idMsr, PCCPUMMSRRANGE pRange, uint64_t uRawValue)
+{
+    NOREF(pRange);
+    /* Only the guest crash MSRs are handled here; a CRASH_CTL notify write reports the stored P0..P4. */
+    PVMCC  pVM = pVCpu->CTX_SUFF(pVM);
+    PGIMHV pHv = &pVM->gim.s.u.Hv;
+
+    switch (idMsr) {
+        case MSR_GIM_HV_CRASH_CTL:
+        {
+            if (uRawValue & MSR_GIM_HV_CRASH_CTL_NOTIFY)
+            {
+                LogRel(("GIM: HyperV: Guest indicates a fatal condition! P0=%#RX64 P1=%#RX64 P2=%#RX64 P3=%#RX64 P4=%#RX64\n",
+                        pHv->uCrashP0Msr, pHv->uCrashP1Msr, pHv->uCrashP2Msr, pHv->uCrashP3Msr, pHv->uCrashP4Msr));
+                DBGFR3ReportBugCheck(pVM, pVCpu, DBGFEVENT_BSOD_MSR, pHv->uCrashP0Msr, pHv->uCrashP1Msr,
+                                     pHv->uCrashP2Msr, pHv->uCrashP3Msr, pHv->uCrashP4Msr);
+            }
+            return VINF_SUCCESS;
+        }
+        case MSR_GIM_HV_CRASH_P0:  pHv->uCrashP0Msr = uRawValue;  return VINF_SUCCESS;
+        case MSR_GIM_HV_CRASH_P1:  pHv->uCrashP1Msr = uRawValue;  return VINF_SUCCESS;
+        case MSR_GIM_HV_CRASH_P2:  pHv->uCrashP2Msr = uRawValue;  return VINF_SUCCESS;
+        case MSR_GIM_HV_CRASH_P3:  pHv->uCrashP3Msr = uRawValue;  return VINF_SUCCESS;
+        case MSR_GIM_HV_CRASH_P4:  pHv->uCrashP4Msr = uRawValue;  return VINF_SUCCESS;
+        default: break;
+    }
+
+    AssertLogRelMsgReturn(false, ("%s: unhandled MSR %#RX32\n", __PRETTY_FUNCTION__, idMsr), VERR_NOT_SUPPORTED);
+}
diff --git a/src/VBox/VMM/VMMAll/PGMAllBth-x86.cpp.h b/src/VBox/VMM/VMMAll/PGMAllBth-x86.cpp.h
index 4a5cb34..45d6eb0 100644
--- a/src/VBox/VMM/VMMAll/PGMAllBth-x86.cpp.h
+++ b/src/VBox/VMM/VMMAll/PGMAllBth-x86.cpp.h
@@ -4981,7 +4981,10 @@ PGM_BTH_DECL(int, MapCR3)(PVMCPUCC pVCpu, RTGCPHYS GCPhysCR3)
  || PGM_GST_TYPE == PGM_TYPE_AMD64
 
     LogFlow(("MapCR3: %RGp\n", GCPhysCR3));
+
+#ifndef VBOX_WITH_KVM_IRQCHIP_FULL
     PGM_A20_ASSERT_MASKED(pVCpu, GCPhysCR3);
+#endif
 
 # if PGM_GST_TYPE == PGM_TYPE_PAE
     if (   !pVCpu->pgm.s.CTX_SUFF(fPaePdpesAndCr3Mapped)
diff --git a/src/VBox/VMM/VMMAll/TMAll.cpp b/src/VBox/VMM/VMMAll/TMAll.cpp
index 21adc11..055d821 100644
--- a/src/VBox/VMM/VMMAll/TMAll.cpp
+++ b/src/VBox/VMM/VMMAll/TMAll.cpp
@@ -211,6 +211,10 @@ VMMDECL(void) TMNotifyEndOfExecution(PVMCC pVM, PVMCPUCC pVCpu, uint64_t uTsc)
 # ifndef VBOX_VMM_TARGET_ARMV8 /* This is perfectly valid on ARM if the guest is halting in the hypervisor. */
     AssertStmt(cTicks <= uCpuHz << 2, cTicks = uCpuHz << 2); /* max 4 sec */
 # endif
+    /* Execute for at most 4s. */
+    AssertMsgStmt(cTicks <= uCpuHz << 2,
+                  ("TM/%u: execution took longer than 4s: cTicks=%llu uCpuHz=%llu\n", pVCpu->idCpu, cTicks, uCpuHz),
+                  cTicks = uCpuHz << 2);
 
     uint64_t cNsExecutingDelta;
     if (uCpuHz < _4G)
diff --git a/src/VBox/VMM/VMMAll/TMAllVirtual.cpp b/src/VBox/VMM/VMMAll/TMAllVirtual.cpp
index 283ace3..26ed51d 100644
--- a/src/VBox/VMM/VMMAll/TMAllVirtual.cpp
+++ b/src/VBox/VMM/VMMAll/TMAllVirtual.cpp
@@ -985,7 +985,11 @@ VMM_INT_DECL(uint64_t) TMVirtualSyncGetWithDeadlineNoCheck(PVMCC pVM, uint64_t *
 VMMDECL(uint64_t) TMVirtualSyncGetNsToDeadline(PVMCC pVM, uint64_t *puDeadlineVersion, uint64_t *puTscNow)
 {
     uint64_t cNsToDeadline;
+#ifdef VBOX_WITH_KVM
+    tmVirtualSyncGetEx(pVM, true /*fCheckTimers*/, &cNsToDeadline, puDeadlineVersion, puTscNow);
+#else
     tmVirtualSyncGetEx(pVM, false /*fCheckTimers*/, &cNsToDeadline, puDeadlineVersion, puTscNow);
+#endif
     return cNsToDeadline;
 }
 
diff --git a/src/VBox/VMM/VMMR3/APIC.cpp b/src/VBox/VMM/VMMR3/APIC.cpp
index bdbef9c..55f7e53 100644
--- a/src/VBox/VMM/VMMR3/APIC.cpp
+++ b/src/VBox/VMM/VMMR3/APIC.cpp
@@ -35,6 +35,7 @@
 #include <VBox/vmm/cpum.h>
 #include <VBox/vmm/hm.h>
 #include <VBox/vmm/mm.h>
+#include <VBox/vmm/nem.h>
 #include <VBox/vmm/pdmdev.h>
 #include <VBox/vmm/ssm.h>
 #ifndef VBOX_DEVICE_STRUCT_TESTCASE
@@ -325,6 +326,10 @@ static DECLCALLBACK(void) apicR3Info(PVM pVM, PCDBGFINFOHLP pHlp, const char *ps
     PCXAPICPAGE  pXApicPage  = VMCPU_TO_CXAPICPAGE(pVCpu);
     PCX2APICPAGE pX2ApicPage = VMCPU_TO_CX2APICPAGE(pVCpu);
 
+#ifdef VBOX_WITH_KVM
+    NEMR3KvmGetLapicState(pVCpu, VMCPU_TO_XAPICPAGE(pVCpu));
+#endif
+
     uint64_t const uBaseMsr  = pApicCpu->uApicBaseMsr;
     APICMODE const enmMode   = apicGetMode(uBaseMsr);
     bool const   fX2ApicMode = XAPIC_IN_X2APIC_MODE(pVCpu);
@@ -953,6 +958,10 @@ static DECLCALLBACK(int) apicR3SaveExec(PPDMDEVINS pDevIns, PSSMHANDLE pSSM)
         PVMCPU pVCpu = pVM->apCpusR3[idCpu];
         PCAPICCPU pApicCpu = VMCPU_TO_APICCPU(pVCpu);
 
+#ifdef VBOX_WITH_KVM
+        NEMR3KvmGetLapicState(pVCpu, pApicCpu->pvApicPageR3);
+#endif
+
         /* Update interrupts from the pending-interrupts bitmaps to the IRR. */
         PDMApicUpdatePendingInterrupts(pVCpu);
 
@@ -1046,6 +1055,10 @@ static DECLCALLBACK(int) apicR3LoadExec(PPDMDEVINS pDevIns, PSSMHANDLE pSSM, uin
             else
                 pHlp->pfnSSMGetStruct(pSSM, pApicCpu->pvApicPageR3, &g_aXApicPageFields[0]);
 
+#ifdef VBOX_WITH_KVM
+            NEMR3KvmSetLapicState(pVCpu, pApicCpu->pvApicPageR3);
+#endif
+
             /* Load the timer. */
             rc = pHlp->pfnSSMGetU64(pSSM, &pApicCpu->u64TimerInitial);     AssertRCReturn(rc, rc);
             rc = PDMDevHlpTimerLoad(pDevIns, pApicCpu->hTimer, pSSM);      AssertRCReturn(rc, rc);
@@ -1174,6 +1187,11 @@ DECLCALLBACK(void) apicR3Reset(PPDMDEVINS pDevIns)
 
         /* Clear the interrupt pending force flag. */
         apicClearInterruptFF(pVCpuDest, PDMAPICIRQ_HARDWARE);
+
+#ifdef VBOX_WITH_KVM
+        PXAPICPAGE  pXApicPage  = VMCPU_TO_XAPICPAGE(pVCpuDest);
+        NEMR3KvmSetLapicState(pVCpuDest, pXApicPage);
+#endif
     }
 }
 
@@ -1531,6 +1549,9 @@ DECLCALLBACK(int) apicR3Construct(PPDMDEVINS pDevIns, int iInstance, PCFGMNODE p
     {
         PVMCPU   pVCpu     = pVM->apCpusR3[idCpu];
         PAPICCPU pApicCpu  = VMCPU_TO_APICCPU(pVCpu);
+#ifdef VBOX_WITH_KVM
+        NEMR3KvmSetLapicState(pVCpu, pApicCpu->pvApicPageR3);
+#endif
 
         APIC_REG_COUNTER(&pApicCpu->StatPostIntrCnt,   "%u",  "APIC/VCPU stats / number of apicPostInterrupt calls.");
         for (size_t i = 0; i < RT_ELEMENTS(pApicCpu->aStatVectors); i++)
diff --git a/src/VBox/VMM/VMMR3/EM.cpp b/src/VBox/VMM/VMMR3/EM.cpp
index 41c52bc..7b755ed 100644
--- a/src/VBox/VMM/VMMR3/EM.cpp
+++ b/src/VBox/VMM/VMMR3/EM.cpp
@@ -223,7 +223,11 @@ VMMR3_INT_DECL(int) EMR3Init(PVM pVM)
     {
         PVMCPU pVCpu = pVM->apCpusR3[idCpu];
 
+#ifdef VBOX_WITH_KVM
+        pVCpu->em.s.enmState            = EMSTATE_NONE;
+#else
         pVCpu->em.s.enmState            = idCpu == 0 ? EMSTATE_NONE : EMSTATE_WAIT_SIPI;
+#endif
         pVCpu->em.s.enmPrevState        = EMSTATE_NONE;
         pVCpu->em.s.msTimeSliceStart    = 0; /* paranoia */
         pVCpu->em.s.idxContinueExitRec  = UINT16_MAX;
@@ -2341,7 +2345,14 @@ VMMR3_INT_DECL(int) EMR3ExecuteVM(PVM pVM, PVMCPU pVCpu)
                     else
                     {
                         /* All other VCPUs go into the wait for SIPI state. */
+#ifdef VBOX_WITH_KVM
+                        /* In case the KVM split irq chip is used, KVM manages
+                         * the wait for SIPI state for us and we need to stay in
+                         * the NEM state. */
+                        pVCpu->em.s.enmState = EMSTATE_NEM;
+#else
                         pVCpu->em.s.enmState = EMSTATE_WAIT_SIPI;
+#endif
                     }
                     break;
                 }
diff --git a/src/VBox/VMM/VMMR3/GIMHv.cpp b/src/VBox/VMM/VMMR3/GIMHv.cpp
index a4a282a..0ab7fd7 100644
--- a/src/VBox/VMM/VMMR3/GIMHv.cpp
+++ b/src/VBox/VMM/VMMR3/GIMHv.cpp
@@ -34,6 +34,9 @@
 #include <VBox/vmm/gim.h>
 #include <VBox/vmm/cpum.h>
 #include <VBox/vmm/mm.h>
+#if defined(VBOX_WITH_KVM)
+#include <VBox/vmm/nem.h>
+#endif
 #include <VBox/vmm/ssm.h>
 #include <VBox/vmm/hm.h>
 #include <VBox/vmm/pdmapi.h>
@@ -270,6 +273,51 @@ VMMR3_INT_DECL(int) gimR3HvInit(PVM pVM, PCFGMNODE pGimCfg)
     rc = CFGMR3QueryBoolDef(pCfgHv, "HypercallDebugInterface", &pHv->fDbgHypercallInterface, false);
     AssertLogRelRCReturn(rc, rc);
 
+#ifdef VBOX_WITH_KVM
+    uint32_t uKvmBaseFeat = 0;
+    uint32_t uKvmPartFlags = 0;
+    uint32_t uKvmPowMgmtFeat = 0;
+    uint32_t uKvmMiscFeat = 0;
+    uint32_t uKvmHyperHints = 0;
+
+    {
+        PCPUMCPUIDLEAF pKvmCpuidLeaves = nullptr;
+        size_t cKvmCpuidLeaves = 0;
+
+        rc = NEMR3KvmGetHvCpuIdLeaves(pVM, &pKvmCpuidLeaves, &cKvmCpuidLeaves);
+        AssertLogRelRCReturn(rc, rc);
+
+        for (size_t uLeaf = 0; uLeaf < cKvmCpuidLeaves; uLeaf++) {
+            LogRel(("GIM: KVM CPUID[%08x] eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
+                    pKvmCpuidLeaves[uLeaf].uLeaf,
+                    pKvmCpuidLeaves[uLeaf].uEax, pKvmCpuidLeaves[uLeaf].uEbx,
+                    pKvmCpuidLeaves[uLeaf].uEcx, pKvmCpuidLeaves[uLeaf].uEdx));
+
+            /*
+              See this documentation for an overview of Hyper-V CPUID flags:
+              https://learn.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/feature-discovery
+             */
+
+            switch (pKvmCpuidLeaves[uLeaf].uLeaf) {
+            case 0x40000003: /* Features */
+                uKvmBaseFeat = pKvmCpuidLeaves[uLeaf].uEax;
+                uKvmPartFlags = pKvmCpuidLeaves[uLeaf].uEbx;
+                uKvmPowMgmtFeat = pKvmCpuidLeaves[uLeaf].uEcx;
+                uKvmMiscFeat = pKvmCpuidLeaves[uLeaf].uEdx;
+                break;
+            case 0x40000004: /* Implementation Recommendations */
+                uKvmHyperHints = pKvmCpuidLeaves[uLeaf].uEax;
+                break;
+            default:
+                // Ignore
+                break;
+            }
+        }
+
+        RTMemFree(pKvmCpuidLeaves);
+    }
+#endif
+
     /*
      * Determine interface capabilities based on the version.
      */
@@ -277,7 +325,11 @@ VMMR3_INT_DECL(int) gimR3HvInit(PVM pVM, PCFGMNODE pGimCfg)
     {
         /* Basic features. */
         pHv->uBaseFeat = 0
+#ifdef VBOX_WITH_KVM
+                       | GIM_HV_BASE_FEAT_VP_RUNTIME_MSR
+#else
                      //| GIM_HV_BASE_FEAT_VP_RUNTIME_MSR
+#endif
                        | GIM_HV_BASE_FEAT_PART_TIME_REF_COUNT_MSR
                      //| GIM_HV_BASE_FEAT_BASIC_SYNIC_MSRS          // Both required for synethetic timers
                      //| GIM_HV_BASE_FEAT_STIMER_MSRS               // Both required for synethetic timers
@@ -300,15 +352,29 @@ VMMR3_INT_DECL(int) gimR3HvInit(PVM pVM, PCFGMNODE pGimCfg)
                          | GIM_HV_MISC_FEAT_GUEST_CRASH_MSRS
                        //| GIM_HV_MISC_FEAT_DEBUG_MSRS
                          ;
-
+#ifdef VBOX_WITH_KVM
+        /* Hypervisor recommendations to the guest. */
+        pHv->uHyperHints = GIM_HV_HINT_RELAX_TIME_CHECKS
+                         /* Causes assertion failures in interrupt injection. */
+                       //| GIM_HV_HINT_MSR_FOR_APIC_ACCESS
+                         /* Inform the guest whether the host has hyperthreading disabled. */
+                         | (GIM_HV_HINT_NO_NONARCH_CORESHARING & uKvmHyperHints)
+                         ;
+#else
         /* Hypervisor recommendations to the guest. */
         pHv->uHyperHints = GIM_HV_HINT_MSR_FOR_SYS_RESET
                          | GIM_HV_HINT_RELAX_TIME_CHECKS
                          | GIM_HV_HINT_X2APIC_MSRS
                          ;
+#endif
 
         /* Partition features. */
+#ifdef VBOX_WITH_KVM
+        /* Extended hypercalls require KVM_EXIT_HYPERV_HCALL exits to be forwarded to gimHvHypercall,
+           so we don't expose them for now. */
+#else
         pHv->uPartFlags |= GIM_HV_PART_FLAGS_EXTENDED_HYPERCALLS;
+#endif
 
         /* Expose more if we're posing as Microsoft. We can, if needed, force MSR-based Hv
            debugging by not exposing these bits while exposing the VS interface. The better
@@ -320,6 +386,15 @@ VMMR3_INT_DECL(int) gimR3HvInit(PVM pVM, PCFGMNODE pGimCfg)
 
             pHv->uPartFlags |= GIM_HV_PART_FLAGS_DEBUGGING;
         }
+
+#ifdef VBOX_WITH_KVM
+        // We should not enable features and hints that KVM doesn't know about.
+        Assert((pHv->uHyperHints & ~uKvmHyperHints) == 0);
+        Assert((pHv->uBaseFeat & ~uKvmBaseFeat) == 0);
+        Assert((pHv->uMiscFeat & ~uKvmMiscFeat) == 0);
+        Assert((pHv->uPartFlags & ~uKvmPartFlags) == 0);
+        Assert((pHv->uPowMgmtFeat & ~uKvmPowMgmtFeat) == 0);
+#endif
     }
 
     /*
diff --git a/src/VBox/VMM/VMMR3/GIMHvOnKvm.cpp b/src/VBox/VMM/VMMR3/GIMHvOnKvm.cpp
new file mode 100644
index 0000000..362cc69
--- /dev/null
+++ b/src/VBox/VMM/VMMR3/GIMHvOnKvm.cpp
@@ -0,0 +1,640 @@
+/* $Id: GIMHvOnKvm.cpp $ */
+/** @file
+ * GIM - Guest Interface Manager, Hyper-V implementation for the KVM-Backend.
+ */
+
+/*
+ * Copyright (C) 2014-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+
+/*********************************************************************************************************************************
+*   Header Files                                                                                                                 *
+*********************************************************************************************************************************/
+#define LOG_GROUP LOG_GROUP_GIM
+#include <VBox/vmm/gim.h>
+#include <VBox/vmm/nem.h>
+#include <VBox/vmm/ssm.h>
+#include <VBox/vmm/hm.h>
+#include "GIMInternal.h"
+#include <VBox/vmm/vm.h>
+
+#include <VBox/err.h>
+#include <VBox/version.h>
+
+#include <iprt/assert.h>
+#include <iprt/string.h>
+#include <iprt/mem.h>
+
+/*********************************************************************************************************************************
+*   Defined Constants And Macros                                                                                                 *
+*********************************************************************************************************************************/
+/**
+ * GIM Hyper-V saved-state version, written by gimR3HvSave and checked by gimR3HvLoad.
+ *
+ * We use a number that is far away from the original GIMHv saved state version
+ * to prevent future collisions.
+ */
+#define GIM_HV_SAVED_STATE_VERSION                      UINT32_C(0x1000)
+
+#ifdef VBOX_WITH_STATISTICS /* Statistics builds append four zeroed initializers to each range entry. */
+# define GIMHV_MSRRANGE(a_uFirst, a_uLast, a_szName) \
+    { (a_uFirst), (a_uLast), kCpumMsrRdFn_Gim, kCpumMsrWrFn_Gim, 0, 0, 0, 0, 0, a_szName, { 0 }, { 0 }, { 0 }, { 0 } }
+#else
+# define GIMHV_MSRRANGE(a_uFirst, a_uLast, a_szName) \
+    { (a_uFirst), (a_uLast), kCpumMsrRdFn_Gim, kCpumMsrWrFn_Gim, 0, 0, 0, 0, 0, a_szName }
+#endif /* VBOX_WITH_STATISTICS */
+
+
+/*********************************************************************************************************************************
+*   Global Variables                                                                                                             *
+*********************************************************************************************************************************/
+/**
+ * Array of MSR ranges supported by Hyper-V; inserted into CPUM by gimR3HvInit and walked by gimR3HvSave/gimR3HvLoad.
+ */
+static CPUMMSRRANGE const g_aMsrRanges_HyperV[] =
+{
+    GIMHV_MSRRANGE(MSR_GIM_HV_RANGE0_FIRST,  MSR_GIM_HV_RANGE0_LAST,  "Hyper-V range 0"),
+    GIMHV_MSRRANGE(MSR_GIM_HV_RANGE1_FIRST,  MSR_GIM_HV_RANGE1_LAST,  "Hyper-V range 1"),
+    GIMHV_MSRRANGE(MSR_GIM_HV_RANGE2_FIRST,  MSR_GIM_HV_RANGE2_LAST,  "Hyper-V range 2"),
+    GIMHV_MSRRANGE(MSR_GIM_HV_RANGE3_FIRST,  MSR_GIM_HV_RANGE3_LAST,  "Hyper-V range 3"),
+    GIMHV_MSRRANGE(MSR_GIM_HV_RANGE4_FIRST,  MSR_GIM_HV_RANGE4_LAST,  "Hyper-V range 4"),
+    GIMHV_MSRRANGE(MSR_GIM_HV_RANGE5_FIRST,  MSR_GIM_HV_RANGE5_LAST,  "Hyper-V range 5"),
+    GIMHV_MSRRANGE(MSR_GIM_HV_RANGE6_FIRST,  MSR_GIM_HV_RANGE6_LAST,  "Hyper-V range 6"),
+    GIMHV_MSRRANGE(MSR_GIM_HV_RANGE7_FIRST,  MSR_GIM_HV_RANGE7_LAST,  "Hyper-V range 7"),
+    GIMHV_MSRRANGE(MSR_GIM_HV_RANGE8_FIRST,  MSR_GIM_HV_RANGE8_LAST,  "Hyper-V range 8"),
+    GIMHV_MSRRANGE(MSR_GIM_HV_RANGE9_FIRST,  MSR_GIM_HV_RANGE9_LAST,  "Hyper-V range 9"),
+    GIMHV_MSRRANGE(MSR_GIM_HV_RANGE10_FIRST, MSR_GIM_HV_RANGE10_LAST, "Hyper-V range 10"),
+    GIMHV_MSRRANGE(MSR_GIM_HV_RANGE11_FIRST, MSR_GIM_HV_RANGE11_LAST, "Hyper-V range 11"),
+    GIMHV_MSRRANGE(MSR_GIM_HV_RANGE12_FIRST, MSR_GIM_HV_RANGE12_LAST, "Hyper-V range 12")
+};
+#undef GIMHV_MSRRANGE
+
+/*********************************************************************************************************************************
+*   Internal Functions                                                                                                           *
+*********************************************************************************************************************************/
+
+/**
+ * Initializes the Hyper-V GIM provider: validates the config, derives the feature set and inserts the CPUID leaves and MSR ranges.
+ *
+ * @returns VBox status code.
+ * @param   pVM         The cross context VM structure.
+ * @param   pGimCfg     The GIM CFGM node.
+ */
+VMMR3_INT_DECL(int) gimR3HvInit(PVM pVM, PCFGMNODE pGimCfg)
+{
+    AssertReturn(pVM, VERR_INVALID_PARAMETER);
+    AssertReturn(pVM->gim.s.enmProviderId == GIMPROVIDERID_HYPERV, VERR_INTERNAL_ERROR_5);
+
+    PGIMHV pHv = &pVM->gim.s.u.Hv;
+
+    /*
+     * Read configuration.
+     */
+    PCFGMNODE pCfgHv = CFGMR3GetChild(pGimCfg, "HyperV");
+    if (pCfgHv)
+    {
+        /*
+         * Validate the Hyper-V settings.
+         */
+        int rc2 = CFGMR3ValidateConfig(pCfgHv, "/HyperV/",
+                                  "VendorID"
+                                  "|VSInterface"
+                                  "|HypercallDebugInterface"
+                                  "|VirtioGPU",
+                                  "" /* pszValidNodes */, "GIM/HyperV" /* pszWho */, 0 /* uInstance */);
+        if (RT_FAILURE(rc2))
+            return rc2;
+    }
+
+    /*
+     * If virtio-gpu is in use, default the Hyper-V vendor to VBoxVBoxVBox because otherwise,
+     * the Intel GPU driver does not load.
+     */
+    bool withVirtioGPU {false};
+    int rc = CFGMR3QueryBoolDef(pCfgHv, "VirtioGPU", &withVirtioGPU, false);
+    AssertLogRelRCReturn(rc, rc);
+
+    /** @cfgm{/GIM/HyperV/VendorID, string, 'Microsoft Hv'}
+     * The Hyper-V vendor signature, must be 12 characters. Defaults to 'VBoxVBoxVBox' when VirtioGPU is set. */
+    char szVendor[13];
+    rc = CFGMR3QueryStringDef(pCfgHv, "VendorID", szVendor, sizeof(szVendor), withVirtioGPU ? "VBoxVBoxVBox" : "Microsoft Hv");
+    AssertLogRelRCReturn(rc, rc);
+    AssertLogRelMsgReturn(strlen(szVendor) == 12,
+                          ("The VendorID config value must be exactly 12 chars, '%s' isn't!\n", szVendor),
+                          VERR_INVALID_PARAMETER);
+
+    AssertReleaseMsg(!RTStrNCmp(szVendor, GIM_HV_VENDOR_MICROSOFT, sizeof(GIM_HV_VENDOR_MICROSOFT) - 1) ||
+                     !RTStrNCmp(szVendor, GIM_HV_VENDOR_VBOX, sizeof(GIM_HV_VENDOR_VBOX) - 1), (("GIM Vendors other than Microsoft Hv and VBox are unsupported")));
+
+    LogRel(("GIM: HyperV: Reporting vendor as '%s'\n", szVendor));
+
+    pHv->fIsInterfaceVs = false;
+    pHv->fDbgHypercallInterface = false;
+
+    uint32_t uKvmBaseFeat = 0;
+    uint32_t uKvmPartFlags = 0;
+    uint32_t uKvmPowMgmtFeat = 0;
+    uint32_t uKvmMiscFeat = 0;
+    uint32_t uKvmHyperHints = 0;
+
+    { /* Collect the feature and hint bits from the CPUID leaves returned by NEMR3KvmGetHvCpuIdLeaves. */
+        PCPUMCPUIDLEAF pKvmCpuidLeaves = nullptr;
+        size_t cKvmCpuidLeaves = 0;
+
+        rc = NEMR3KvmGetHvCpuIdLeaves(pVM, &pKvmCpuidLeaves, &cKvmCpuidLeaves);
+        AssertLogRelRCReturn(rc, rc);
+
+        for (size_t uLeaf = 0; uLeaf < cKvmCpuidLeaves; uLeaf++) {
+            LogRel(("GIM: KVM CPUID[%08x] eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
+                    pKvmCpuidLeaves[uLeaf].uLeaf,
+                    pKvmCpuidLeaves[uLeaf].uEax, pKvmCpuidLeaves[uLeaf].uEbx,
+                    pKvmCpuidLeaves[uLeaf].uEcx, pKvmCpuidLeaves[uLeaf].uEdx));
+
+            /*
+              See this documentation for an overview of Hyper-V CPUID flags:
+              https://learn.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/feature-discovery
+             */
+
+            switch (pKvmCpuidLeaves[uLeaf].uLeaf) {
+            case 0x40000003: /* Features */
+                uKvmBaseFeat = pKvmCpuidLeaves[uLeaf].uEax;
+                uKvmPartFlags = pKvmCpuidLeaves[uLeaf].uEbx;
+                uKvmPowMgmtFeat = pKvmCpuidLeaves[uLeaf].uEcx;
+                uKvmMiscFeat = pKvmCpuidLeaves[uLeaf].uEdx;
+                break;
+            case 0x40000004: /* Implementation Recommendations */
+                uKvmHyperHints = pKvmCpuidLeaves[uLeaf].uEax;
+                break;
+            default:
+                // Ignore
+                break;
+            }
+        }
+
+        RTMemFree(pKvmCpuidLeaves);
+    }
+
+    /*
+     * Determine interface capabilities based on the version.
+     */
+    if (!pVM->gim.s.u32Version)
+    {
+        /* Basic features. */
+        pHv->uBaseFeat = 0
+                       | GIM_HV_BASE_FEAT_VP_RUNTIME_MSR
+                       | GIM_HV_BASE_FEAT_PART_TIME_REF_COUNT_MSR
+                       | GIM_HV_BASE_FEAT_BASIC_SYNIC_MSRS
+                       | GIM_HV_BASE_FEAT_STIMER_MSRS
+                       | GIM_HV_BASE_FEAT_APIC_ACCESS_MSRS
+                       | GIM_HV_BASE_FEAT_HYPERCALL_MSRS
+                       | GIM_HV_BASE_FEAT_VP_ID_MSR
+                       | GIM_HV_BASE_FEAT_VIRT_SYS_RESET_MSR
+                     //| GIM_HV_BASE_FEAT_STAT_PAGES_MSR
+                       | GIM_HV_BASE_FEAT_PART_REF_TSC_MSR
+                     //| GIM_HV_BASE_FEAT_GUEST_IDLE_STATE_MSR
+                       | GIM_HV_BASE_FEAT_TIMER_FREQ_MSRS
+                     //| GIM_HV_BASE_FEAT_DEBUG_MSRS
+                       ;
+
+        /* Miscellaneous features. */
+        pHv->uMiscFeat = 0
+                       //| GIM_HV_MISC_FEAT_GUEST_DEBUGGING
+                       //| GIM_HV_MISC_FEAT_XMM_HYPERCALL_INPUT
+                         | GIM_HV_MISC_FEAT_TIMER_FREQ
+                         | GIM_HV_MISC_FEAT_GUEST_CRASH_MSRS
+                       //| GIM_HV_MISC_FEAT_DEBUG_MSRS
+                         | GIM_HV_MISC_FEAT_USE_DIRECT_SYNTH_MSRS
+                         ;
+
+        /* Hypervisor recommendations to the guest. */
+        pHv->uHyperHints = GIM_HV_HINT_RELAX_TIME_CHECKS
+                         /* Causes assertion failures in interrupt injection. */
+                       //| GIM_HV_HINT_MSR_FOR_APIC_ACCESS
+                       //|GIM_HV_HINT_MSR_FOR_SYS_RESET
+                         | GIM_HV_HINT_DEPRECATE_AUTO_EOI
+                         /* Inform the guest whether the host has hyperthreading disabled. */
+                         | (GIM_HV_HINT_NO_NONARCH_CORESHARING & uKvmHyperHints)
+                         ;
+
+
+        // We should not enable features and hints that KVM doesn't know about.
+        AssertRelease((pHv->uHyperHints & ~uKvmHyperHints) == 0);
+        AssertRelease((pHv->uBaseFeat & ~uKvmBaseFeat) == 0);
+        AssertRelease((pHv->uMiscFeat & ~uKvmMiscFeat) == 0);
+        AssertRelease((pHv->uPartFlags & ~uKvmPartFlags) == 0);
+        AssertRelease((pHv->uPowMgmtFeat & ~uKvmPowMgmtFeat) == 0);
+    }
+
+    /*
+     * Make sure the CPUID bits are in accordance with the Hyper-V
+     * requirement and other paranoia checks.
+     * See "Requirements for implementing the Microsoft hypervisor interface" spec.
+     */
+    AssertRelease(!(pHv->uPartFlags & (  GIM_HV_PART_FLAGS_CREATE_PART
+                                        | GIM_HV_PART_FLAGS_ACCESS_MEMORY_POOL
+                                        | GIM_HV_PART_FLAGS_ACCESS_PART_ID
+                                        | GIM_HV_PART_FLAGS_ADJUST_MSG_BUFFERS
+                                        | GIM_HV_PART_FLAGS_CREATE_PORT
+                                        | GIM_HV_PART_FLAGS_ACCESS_STATS
+                                        | GIM_HV_PART_FLAGS_CPU_MGMT
+                                        | GIM_HV_PART_FLAGS_CPU_PROFILER)));
+
+    AssertRelease((pHv->uBaseFeat & (GIM_HV_BASE_FEAT_HYPERCALL_MSRS | GIM_HV_BASE_FEAT_VP_ID_MSR))
+            == (GIM_HV_BASE_FEAT_HYPERCALL_MSRS | GIM_HV_BASE_FEAT_VP_ID_MSR));
+
+    /*
+     * Expose HVP (Hypervisor Present) bit to the guest.
+     */
+    CPUMR3SetGuestCpuIdFeature(pVM, CPUMCPUIDFEATURE_HVP);
+
+    /*
+     * Modify the standard hypervisor leaves for Hyper-V.
+     */
+    CPUMCPUIDLEAF HyperLeaf;
+    RT_ZERO(HyperLeaf);
+    HyperLeaf.uLeaf = UINT32_C(0x40000000);
+    HyperLeaf.uEax  = UINT32_C(0x40000006); /* Minimum value for Hyper-V default is 0x40000005. */
+    /*
+     * NOTE(review): this comment said not to report 'Microsoft Hv'[1] by default (see @bugref{7270#c152}), yet it is the default here unless VirtioGPU is set.
+     * [1]: ebx=0x7263694d ('rciM') ecx=0x666f736f ('foso') edx=0x76482074 ('vH t')
+     */
+    {
+        uint32_t uVendorEbx;
+        uint32_t uVendorEcx;
+        uint32_t uVendorEdx;
+        uVendorEbx = ((uint32_t)szVendor[ 3]) << 24 | ((uint32_t)szVendor[ 2]) << 16 | ((uint32_t)szVendor[1]) << 8
+                    | (uint32_t)szVendor[ 0];
+        uVendorEcx = ((uint32_t)szVendor[ 7]) << 24 | ((uint32_t)szVendor[ 6]) << 16 | ((uint32_t)szVendor[5]) << 8
+                    | (uint32_t)szVendor[ 4];
+        uVendorEdx = ((uint32_t)szVendor[11]) << 24 | ((uint32_t)szVendor[10]) << 16 | ((uint32_t)szVendor[9]) << 8
+                    | (uint32_t)szVendor[ 8];
+        HyperLeaf.uEbx         = uVendorEbx;
+        HyperLeaf.uEcx         = uVendorEcx;
+        HyperLeaf.uEdx         = uVendorEdx;
+    }
+    rc = CPUMR3CpuIdInsert(pVM, &HyperLeaf);
+    AssertLogRelRCReturn(rc, rc);
+
+    HyperLeaf.uLeaf        = UINT32_C(0x40000001);
+    HyperLeaf.uEax         = 0x31237648;           /* 'Hv#1' */
+    HyperLeaf.uEbx         = 0;                    /* Reserved */
+    HyperLeaf.uEcx         = 0;                    /* Reserved */
+    HyperLeaf.uEdx         = 0;                    /* Reserved */
+    rc = CPUMR3CpuIdInsert(pVM, &HyperLeaf);
+    AssertLogRelRCReturn(rc, rc);
+
+    /*
+     * Add Hyper-V specific leaves.
+     */
+    HyperLeaf.uLeaf        = UINT32_C(0x40000002); /* MBZ until MSR_GIM_HV_GUEST_OS_ID is set by the guest. */
+    HyperLeaf.uEax         = 0;
+    HyperLeaf.uEbx         = 0;
+    HyperLeaf.uEcx         = 0;
+    HyperLeaf.uEdx         = 0;
+    rc = CPUMR3CpuIdInsert(pVM, &HyperLeaf);
+    AssertLogRelRCReturn(rc, rc);
+
+    HyperLeaf.uLeaf        = UINT32_C(0x40000003);
+    HyperLeaf.uEax         = pHv->uBaseFeat;
+    HyperLeaf.uEbx         = pHv->uPartFlags;
+    HyperLeaf.uEcx         = pHv->uPowMgmtFeat;
+    HyperLeaf.uEdx         = pHv->uMiscFeat;
+    rc = CPUMR3CpuIdInsert(pVM, &HyperLeaf);
+    AssertLogRelRCReturn(rc, rc);
+
+    HyperLeaf.uLeaf        = UINT32_C(0x40000004);
+    HyperLeaf.uEax         = pHv->uHyperHints;
+    /* Recommended number of spinlock retries before notifying the Hypervisor. 0xffffffff means that the Hypervisor is never notified */
+    HyperLeaf.uEbx         = 0xffffffff;
+    HyperLeaf.uEcx         = 0;
+    HyperLeaf.uEdx         = 0;
+    rc = CPUMR3CpuIdInsert(pVM, &HyperLeaf);
+    AssertLogRelRCReturn(rc, rc);
+
+    RT_ZERO(HyperLeaf);
+    HyperLeaf.uLeaf        = UINT32_C(0x40000005);
+    rc = CPUMR3CpuIdInsert(pVM, &HyperLeaf);
+    AssertLogRelRCReturn(rc, rc);
+
+    // Let the guest OS know that we're running HyperV PV on KVM.
+    static constexpr char kvmVendor[] = "KVMKVMKVM\0\0\0";
+    HyperLeaf.uLeaf = 0x40000100;
+    {
+        uint32_t uVendorEbx;
+        uint32_t uVendorEcx;
+        uint32_t uVendorEdx;
+        uVendorEbx = ((uint32_t)kvmVendor[ 3]) << 24 | ((uint32_t)kvmVendor[ 2]) << 16 | ((uint32_t)kvmVendor[1]) << 8
+            | (uint32_t)kvmVendor[ 0];
+        uVendorEcx = ((uint32_t)kvmVendor[ 7]) << 24 | ((uint32_t)kvmVendor[ 6]) << 16 | ((uint32_t)kvmVendor[5]) << 8
+            | (uint32_t)kvmVendor[ 4];
+        uVendorEdx = ((uint32_t)kvmVendor[11]) << 24 | ((uint32_t)kvmVendor[10]) << 16 | ((uint32_t)kvmVendor[9]) << 8
+            | (uint32_t)kvmVendor[ 8];
+        HyperLeaf.uEbx = uVendorEbx;
+        HyperLeaf.uEcx = uVendorEcx;
+        HyperLeaf.uEdx = uVendorEdx;
+    }
+
+    rc = CPUMR3CpuIdInsert(pVM, &HyperLeaf);
+    AssertLogRelRCReturn(rc, rc);
+
+
+    /*
+     * Insert all MSR ranges of Hyper-V.
+     */
+    for (unsigned i = 0; i < RT_ELEMENTS(g_aMsrRanges_HyperV); i++)
+    {
+        int rc2 = CPUMR3MsrRangesInsert(pVM, &g_aMsrRanges_HyperV[i]);
+        AssertLogRelRCReturn(rc2, rc2);
+    }
+
+    /*
+     * Setup non-zero MSRs.
+     */
+    if (pHv->uMiscFeat & GIM_HV_MISC_FEAT_GUEST_CRASH_MSRS)
+        pHv->uCrashCtlMsr = MSR_GIM_HV_CRASH_CTL_NOTIFY;
+
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Finishes the Hyper-V provider setup.
+ *
+ * This is called after initializing HM and almost all other VMM components.
+ * It caches the TSC frequency and inserts CPUID leaf 0x40000006, which
+ * carries the hypervisor capabilities.
+ *
+ * @returns VBox status code.
+ * @param   pVM     The cross context VM structure.
+ */
+VMMR3_INT_DECL(int) gimR3HvInitCompleted(PVM pVM)
+{
+    PGIMHV const pGimHv = &pVM->gim.s.u.Hv;
+
+    pGimHv->cTscTicksPerSecond = TMCpuTicksPerSecond(pVM);
+
+    /* Interface version 0: report the features used by the hypervisor. */
+    if (pVM->gim.s.u32Version == 0)
+    {
+        pGimHv->uHyperCaps = HMIsNestedPagingActive(pVM) ? GIM_HV_HOST_FEAT_NESTED_PAGING : 0;
+        if (HMIsMsrBitmapActive(pVM))
+            pGimHv->uHyperCaps |= GIM_HV_HOST_FEAT_MSR_BITMAP;
+    }
+
+    /* RT_ZERO leaves every register other than EAX at zero. */
+    CPUMCPUIDLEAF CapsLeaf;
+    RT_ZERO(CapsLeaf);
+    CapsLeaf.uLeaf = UINT32_C(0x40000006);
+    CapsLeaf.uEax  = pGimHv->uHyperCaps;
+
+    int const rcInsert = CPUMR3CpuIdInsert(pVM, &CapsLeaf);
+    AssertLogRelRCReturn(rcInsert, rcInsert);
+
+    return rcInsert;
+}
+
+
+/**
+ * Shuts down the Hyper-V GIM provider by resetting its state.
+ *
+ * @returns VBox status code (always VINF_SUCCESS).
+ * @param   pVM         The cross context VM structure.
+ */
+VMMR3_INT_DECL(int) gimR3HvTerm(PVM pVM)
+{
+    /* Termination needs nothing beyond the regular reset. */
+    gimR3HvReset(pVM);
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Relocation callback for data and code managed by this component.
+ *
+ * Nothing needs relocating here, so both arguments are ignored.
+ *
+ * @param   pVM         The cross context VM structure.
+ * @param   offDelta    Relocation delta relative to old location.
+ */
+VMMR3_INT_DECL(void) gimR3HvRelocate(PVM pVM, RTGCINTPTR offDelta)
+{
+    NOREF(pVM); NOREF(offDelta);
+}
+
+
+static bool isSynICAllowed(PGIMHV pHv) /* True when the basic SynIC MSRs feature bit is set in uBaseFeat. */
+{
+    return (pHv->uBaseFeat & GIM_HV_BASE_FEAT_BASIC_SYNIC_MSRS) != 0;
+}
+
+/**
+ * This resets the cached Hyper-V provider MSR values to their defaults and
+ * writes them to KVM via NEMR3KvmSetMsr.
+ *
+ * This is called when the VM is being reset.
+ *
+ * @param   pVM     The cross context VM structure.
+ *
+ * @thread  EMT(0)
+ */
+VMMR3_INT_DECL(void) gimR3HvReset(PVM pVM)
+{
+    VM_ASSERT_EMT0(pVM);
+
+    /*
+     * Nothing is unmapped in this function; it only logs the reset and clears MSR state.
+     */
+    LogRel(("GIM: HyperV: Resetting MMIO2 regions and MSRs\n"));
+    PGIMHV pHv = &pVM->gim.s.u.Hv;
+
+    /*
+     * Reset the per-VM MSR values.
+     */
+    pHv->u64GuestOsIdMsr      = 0;
+    pHv->u64HypercallMsr      = 0;
+    pHv->u64TscPageMsr        = 0;
+    pHv->uCrashP0Msr          = 0;
+    pHv->uCrashP1Msr          = 0;
+    pHv->uCrashP2Msr          = 0;
+    pHv->uCrashP3Msr          = 0;
+    pHv->uCrashP4Msr          = 0;
+    pHv->uDbgStatusMsr        = 0;
+    pHv->uDbgPendingBufferMsr = 0;
+    pHv->uDbgSendBufferMsr    = 0;
+    pHv->uDbgRecvBufferMsr    = 0;
+
+    PVMCPU pVCpuBsp = pVM->apCpusR3[0]; /* The per-VM MSR values are written to KVM through vCPU 0 only. */
+    NEMR3KvmSetMsr(pVCpuBsp, MSR_GIM_HV_GUEST_OS_ID, pHv->u64GuestOsIdMsr);
+    NEMR3KvmSetMsr(pVCpuBsp, MSR_GIM_HV_HYPERCALL, pHv->u64HypercallMsr);
+    NEMR3KvmSetMsr(pVCpuBsp, MSR_GIM_HV_REF_TSC, pHv->u64TscPageMsr);
+    NEMR3KvmSetMsr(pVCpuBsp, MSR_GIM_HV_SYNTH_DEBUG_STATUS, pHv->uDbgStatusMsr);
+    NEMR3KvmSetMsr(pVCpuBsp, MSR_GIM_HV_SYNTH_DEBUG_PENDING_BUFFER, pHv->uDbgPendingBufferMsr);
+    NEMR3KvmSetMsr(pVCpuBsp, MSR_GIM_HV_SYNTH_DEBUG_SEND_BUFFER, pHv->uDbgSendBufferMsr);
+    NEMR3KvmSetMsr(pVCpuBsp, MSR_GIM_HV_SYNTH_DEBUG_RECEIVE_BUFFER, pHv->uDbgRecvBufferMsr);
+
+    for (VMCPUID idCpu = 0; idCpu < pVM->cCpus; idCpu++) /* Per-vCPU state: SynIC control, SINTs and synthetic timers. */
+    {
+        PGIMHVCPU pHvCpu = &pVM->apCpusR3[idCpu]->gim.s.u.HvCpu;
+        PVMCPU pVCpu = pVM->apCpusR3[idCpu];
+
+        pHvCpu->uSControlMsr = 0;
+        pHvCpu->uSimpMsr  = 0;
+        pHvCpu->uSiefpMsr = 0;
+        pHvCpu->uApicAssistPageMsr = 0;
+
+        NEMR3KvmSetMsr(pVCpu, MSR_GIM_HV_SCONTROL, pHvCpu->uSControlMsr);
+        NEMR3KvmSetMsr(pVCpu, MSR_GIM_HV_SIMP, pHvCpu->uSimpMsr);
+        NEMR3KvmSetMsr(pVCpu, MSR_GIM_HV_SIEFP, pHvCpu->uSiefpMsr);
+        NEMR3KvmSetMsr(pVCpu, MSR_GIM_HV_APIC_ASSIST_PAGE, pHvCpu->uApicAssistPageMsr);
+
+        for (uint8_t idxSint = 0; idxSint < RT_ELEMENTS(pHvCpu->auSintMsrs); idxSint++) {
+            pHvCpu->auSintMsrs[idxSint] = MSR_GIM_HV_SINT_MASKED;
+            if (isSynICAllowed(pHv)) { /* The SINT MSRs are only written to KVM when the SynIC feature bit is set. */
+                NEMR3KvmSetMsr(pVCpu, MSR_GIM_HV_SINT0 + idxSint, pHvCpu->auSintMsrs[idxSint]);
+            }
+        }
+
+        for (uint8_t idxStimer = 0; idxStimer < RT_ELEMENTS(pHvCpu->aStimers); idxStimer++)
+        {
+            PGIMHVSTIMER pHvStimer = &pHvCpu->aStimers[idxStimer];
+            pHvStimer->uStimerConfigMsr = 0;
+            pHvStimer->uStimerCountMsr  = 0;
+            NEMR3KvmSetMsr(pVCpu, MSR_GIM_HV_STIMER0_CONFIG + idxStimer, pHvStimer->uStimerConfigMsr);
+            NEMR3KvmSetMsr(pVCpu, MSR_GIM_HV_STIMER0_COUNT + idxStimer, pHvStimer->uStimerCountMsr);
+        }
+    }
+}
+
+
+/**
+ * Hyper-V state-load operation, final pass: restores every saved Hyper-V MSR value to vCPU 0 via KVM.
+ *
+ * @returns VBox status code; a stream read error is returned, a failed MSR write is only logged.
+ * @param   pVM             The cross context VM structure.
+ * @param   pSSM            The saved state handle.
+ */
+VMMR3_INT_DECL(int) gimR3HvLoad(PVM pVM, PSSMHANDLE pSSM)
+{
+    uint32_t uHvSavedStateVersion;
+    int rc = SSMR3GetU32(pSSM, &uHvSavedStateVersion);
+    AssertRCReturn(rc, rc);
+
+    if (uHvSavedStateVersion != GIM_HV_SAVED_STATE_VERSION) {
+        return SSMR3SetLoadError(pSSM, VERR_SSM_UNSUPPORTED_DATA_UNIT_VERSION, RT_SRC_POS,
+                                 N_("Unsupported Hyper-V saved-state version %u (current %u)!"),
+                                 uHvSavedStateVersion, GIM_HV_SAVED_STATE_VERSION);
+    }
+
+    for (unsigned i = 0; i < RT_ELEMENTS(g_aMsrRanges_HyperV); i++) {
+        for (unsigned msr {g_aMsrRanges_HyperV[i].uFirst}; msr <= g_aMsrRanges_HyperV[i].uLast; ++msr) {
+
+            // MSR_GIM_HV_EOI is never saved (see gimR3HvSave), so there is no value to restore.
+            if (msr == MSR_GIM_HV_EOI) {
+                continue;
+            }
+
+            uint64_t val {0};
+            PVMCPU pVCpu = pVM->apCpusR3[0];
+
+            rc = SSMR3GetU64(pSSM, &val);
+            AssertRCReturn(rc, rc); // Do not push a value to KVM that was never read from the stream.
+            rc = NEMR3KvmSetMsr(pVCpu, msr, val);
+            if (rc != VINF_SUCCESS) {
+                // Some MSRs can only be written when HYPERV_SYNIC2 has been enabled.
+                // We don't actually care here because if we are unable to write the MSR,
+                // the guest couldn't have read/written it either.
+                LogRel2(("Unable to write HV MSR: 0x%x\n", msr));
+            }
+        }
+    }
+
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Hyper-V load-done callback; there is nothing to do for this provider.
+ *
+ * @returns VBox status code (always VINF_SUCCESS).
+ * @param   pVM             The cross context VM structure.
+ * @param   pSSM            The saved state handle.
+ */
+VMMR3_INT_DECL(int) gimR3HvLoadDone(PVM pVM, PSSMHANDLE pSSM)
+{
+    RT_NOREF(pVM, pSSM);
+    return VINF_SUCCESS;
+}
+
+/**
+ * Hyper-V state-save operation.
+ *
+ * Writes the saved-state version followed by one 64-bit value for every MSR
+ * in g_aMsrRanges_HyperV except MSR_GIM_HV_EOI.
+ *
+ * @returns VBox status code.
+ * @param   pVM     The cross context VM structure.
+ * @param   pSSM    The saved state handle.
+ */
+VMMR3_INT_DECL(int) gimR3HvSave(PVM pVM, PSSMHANDLE pSSM)
+{
+    /* The unit starts with the Hyper-V saved-state version. */
+    SSMR3PutU32(pSSM, GIM_HV_SAVED_STATE_VERSION);
+
+    /* All MSR values are read from the first vCPU. */
+    PVMCPU const pVCpuFirst = pVM->apCpusR3[0];
+
+    for (CPUMMSRRANGE const &Range : g_aMsrRanges_HyperV)
+    {
+        for (unsigned idMsr = Range.uFirst; idMsr <= Range.uLast; ++idMsr)
+        {
+            /* MSR_GIM_HV_EOI is write-only for the guest and the last value
+               written is of no interest, so it is left out of the saved state. */
+            if (idMsr == MSR_GIM_HV_EOI)
+                continue;
+
+            /* Some MSRs can only be read when HYPERV_SYNIC2 has been enabled.
+               A failed read is not a problem: if we cannot read the MSR, the
+               guest could not have read or written it either, so zero is saved. */
+            uint64_t uMsrValue = 0;
+            int const rcGet = NEMR3KvmGetMsr(pVCpuFirst, idMsr, &uMsrValue);
+            if (rcGet != VINF_SUCCESS)
+                LogRel2(("Unable to read HV MSR: 0x%x\n", idMsr));
+
+            SSMR3PutU64(pSSM, uMsrValue);
+        }
+    }
+
+    return VINF_SUCCESS;
+}
+
+/**
+ * Get Hyper-V debug setup parameters; not available in this implementation.
+ *
+ * @returns VBox status code (always VERR_GIM_NO_DEBUG_CONNECTION).
+ * @param   pVM         The cross context VM structure.
+ * @param   pDbgSetup   Where to store the debug setup details (left untouched).
+ */
+VMMR3_INT_DECL(int) gimR3HvGetDebugSetup(PVM pVM, PGIMDEBUGSETUP pDbgSetup)
+{
+    RT_NOREF(pVM, pDbgSetup);
+    return VERR_GIM_NO_DEBUG_CONNECTION;
+}
diff --git a/src/VBox/VMM/VMMR3/NEMR3Native-linux.cpp b/src/VBox/VMM/VMMR3/NEMR3Native-linux.cpp
index 26611df..36dd594 100644
--- a/src/VBox/VMM/VMMR3/NEMR3Native-linux.cpp
+++ b/src/VBox/VMM/VMMR3/NEMR3Native-linux.cpp
@@ -37,30 +37,124 @@
 #include <VBox/vmm/pdmapic.h>
 #include <VBox/vmm/pdm.h>
 #include <VBox/vmm/trpm.h>
+#include "CPUMInternal.h"
 #include "NEMInternal.h"
+#include "HMInternal.h"
+#include "GIMInternal.h"
+#include "GIMHvInternal.h"
 #include <VBox/vmm/vmcc.h>
 
 #include <iprt/alloca.h>
+#include <iprt/mem.h>
 #include <iprt/string.h>
 #include <iprt/system.h>
 #include <iprt/x86.h>
 
 #include <errno.h>
 #include <unistd.h>
+#include <signal.h>
 #include <sys/ioctl.h>
 #include <sys/fcntl.h>
 #include <sys/mman.h>
+#include <sys/prctl.h>
 #include <linux/kvm.h>
 
-
 /* Forward declarations of things called by the template. */
 static int nemR3LnxInitSetupVm(PVM pVM, PRTERRINFO pErrInfo);
+#include <algorithm>
+#include <string_view>
+#include <vector>
 
+/**
+ * The MMIO address of the TPR register of the LAPIC.
+ */
+static constexpr uint64_t XAPIC_TPR_ADDR {0xfee00080};
 
 /* Instantiate the common bits we share with the ARMv8 KVM backend. */
 #include "NEMR3NativeTemplate-linux.cpp.h"
 
+/**
+ * The class priority shift for the TPR register.
+ */
+static constexpr uint64_t LAPIC_TPR_SHIFT {4};
+
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+static int kvmSetGsiRoutingFullIrqChip(PVM pVM);
+#endif
+
+
+
+#ifdef VBOX_WITH_KVM_NESTING
+static int KvmGetGuestModeOffsetFromStatsFd(PVMCPU pVCpu, size_t *offset)
+{
+    // Finds the file offset of the "guest_mode" value in the vCPU's KVM binary statistics. See
+    // https://www.kernel.org/doc/html/latest/virt/kvm/api.html (look for KVM_GET_STATS_FD).
+    if (offset == nullptr) {
+        LogRel(("KVM: guest_mode offset output pointer is NULL\n"));
+        return VERR_INVALID_POINTER;
+    }
+
+    struct kvm_stats_header stats_header;
+    RT_ZERO(stats_header);
+
+    int rcRead = pread(pVCpu->nem.s.statsFd, &stats_header, sizeof(struct kvm_stats_header), 0);
+    AssertReleaseMsg(rcRead == sizeof(struct kvm_stats_header), ("Unable to read stats header"));
+
+    // Each descriptor is the fixed struct followed by name_size bytes holding the name.
+    int real_desc_size = sizeof(struct kvm_stats_desc) + stats_header.name_size;
+    void *desc_backing = RTMemAllocZ(real_desc_size);
+    AssertReturn(desc_backing, VERR_NO_MEMORY); // The buffer is written by memset/pread below.
+
+    int rc = VERR_NOT_IMPLEMENTED; // Returned when no "guest_mode" descriptor is found.
+    for (unsigned i = 0; i < stats_header.num_desc; ++i) {
+        memset(desc_backing, 0, real_desc_size);
+
+        struct kvm_stats_desc* desc = static_cast<struct kvm_stats_desc*>(desc_backing);
+        rcRead = pread(pVCpu->nem.s.statsFd, desc, real_desc_size, stats_header.desc_offset + i * real_desc_size);
+        AssertReleaseMsg(rcRead == real_desc_size, ("Unable to read descriptor"));
+
+        std::basic_string_view name(desc->name);
+        if (name == "guest_mode") {
+            unsigned value_offset = stats_header.data_offset + desc->offset;
+
+            if (desc->size != 1) {
+                LogRel(("Invalid guest_mode stat size: %d\n", desc->size * 8));
+                rc = VERR_NOT_SUPPORTED;
+                break;
+            }
 
+            *offset = value_offset;
+
+            rc = VINF_SUCCESS;
+            break;
+        }
+    }
+
+    RTMemFree(desc_backing);
+    return rc;
+}
+#endif
+
+bool KvmIsNestedGuestExit(PVM pVM, PVMCPU pVCpu)
+{
+#ifdef VBOX_WITH_KVM_NESTING
+    // Without VMX in the guest feature set the answer is always false.
+    if (!pVM->cpum.s.GuestFeatures.fVmx)
+        return false;
+
+    AssertReleaseMsg(pVCpu->nem.s.guestModeStatOffset != 0, ("Invalid guest_mode offset"));
+
+    // Read the 64-bit guest_mode statistic; any non-zero value is reported as true.
+    uint64_t uGuestMode = 0;
+    int cbRead = pread(pVCpu->nem.s.statsFd, &uGuestMode, 8, pVCpu->nem.s.guestModeStatOffset);
+    AssertReleaseMsg(cbRead == 8, ("pread did not read all bytes: %d\n", cbRead));
+
+    return uGuestMode != 0;
+#else
+    NOREF(pVM); NOREF(pVCpu);
+    return false;
+#endif
+}
 
 /**
  * Does the early setup of a KVM VM.
@@ -86,6 +180,23 @@ static int nemR3LnxInitSetupVm(PVM pVM, PRTERRINFO pErrInfo)
     if (rcLnx == -1)
         return RTErrInfoSetF(pErrInfo, VERR_NEM_VM_CREATE_FAILED, "Failed to enable KVM_CAP_X86_USER_SPACE_MSR failed: %u", errno);
 
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+    rcLnx = ioctl(pVM->nem.s.fdVm, KVM_CREATE_IRQCHIP, 0);
+    if (rcLnx == -1)
+        return RTErrInfoSetF(pErrInfo, VERR_NEM_VM_CREATE_FAILED, "Failed to execute KVM_CREATE_IRQCHIP: %u", errno);
+
+    kvmSetGsiRoutingFullIrqChip(pVM);
+#else
+    struct kvm_enable_cap CapSplitIrqChip =
+    {
+        KVM_CAP_SPLIT_IRQCHIP, 0,
+        { KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS, 0, 0, 0}
+    };
+    rcLnx = ioctl(pVM->nem.s.fdVm, KVM_ENABLE_CAP, &CapSplitIrqChip);
+    if (rcLnx == -1)
+        return RTErrInfoSetF(pErrInfo, VERR_NEM_VM_CREATE_FAILED, "Failed to enable KVM_CAP_SPLIT_IRQCHIP: %u", errno);
+#endif
+
     /*
      * Create the VCpus.
      */
@@ -106,10 +217,128 @@ static int nemR3LnxInitSetupVm(PVM pVM, PRTERRINFO pErrInfo)
 
         /* We want all x86 registers and events on each exit. */
         pVCpu->nem.s.pRun->kvm_valid_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS | KVM_SYNC_X86_EVENTS;
+
+#ifdef VBOX_WITH_KVM_NESTING
+        pVCpu->nem.s.statsFd = ioctl(pVCpu->nem.s.fdVCpu, KVM_GET_STATS_FD, 0);
+
+        if (pVCpu->nem.s.statsFd < 0) {
+            return RTErrInfoSetF(pErrInfo, VERR_NEM_VM_CREATE_FAILED, "Failed to get stats FD");
+        }
+
+        int rc = KvmGetGuestModeOffsetFromStatsFd(pVCpu, &pVCpu->nem.s.guestModeStatOffset);
+        if (not RT_SUCCESS(rc)) {
+            // Instead of failing here, we could also de-feature nested hardware virtualization.
+            return RTErrInfoSetF(pErrInfo, VERR_NEM_VM_CREATE_FAILED, "Failed to get guest_mode offset");
+        }
+
+        if (idCpu == 0) {
+            // Log the offset once, just for debugging purposes.
+            LogRel2(("KVM: guest_mode offset is at %d\n", pVCpu->nem.s.guestModeStatOffset));
+        }
+#endif
     }
+
+    pVM->nem.s.pARedirectionTable = std::make_unique<std::array<std::optional<MSIMSG>, KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS>>();
+
     return VINF_SUCCESS;
 }
 
+/** Polls with a zero timeout for a pending thread poke signal and takes it off the pending set if there is one. */
+static void nemR3LnxConsumePokeSignal()
+{
+    int const iSigPoke = RTThreadPokeSignal();
+    AssertReturnVoid(iSigPoke >= 0);
+
+    /* Wait set containing only the poke signal. */
+    sigset_t SigSet;
+    sigemptyset(&SigSet);
+    sigaddset(&SigSet, iSigPoke);
+
+    /* Don't wait for a signal, just poll: both timeout fields are zero. */
+    struct timespec const NoWait = { 0, 0 };
+
+    int const rcWait = sigtimedwait(&SigSet, nullptr, &NoWait);
+    /* EAGAIN and EINTR are tolerated; any other failure is logged as an assertion. */
+    AssertLogRelMsg(rcWait >= 0 || errno == EAGAIN || errno == EINTR, ("Failed to consume signal: %d", errno));
+}
+
+/** Linear search of the given leaf array for an exact (leaf, subleaf) match; returns nullptr when there is none. */
+static PCPUMCPUIDLEAF findKvmLeaf(PCPUMCPUIDLEAF paKvmSupportedLeaves,
+                                  uint32_t cKvmSupportedLeaves,
+                                  uint32_t leaf,
+                                  uint32_t subleaf)
+{
+    PCPUMCPUIDLEAF const pEnd = paKvmSupportedLeaves + cKvmSupportedLeaves;
+    for (PCPUMCPUIDLEAF pCur = paKvmSupportedLeaves; pCur != pEnd; ++pCur) {
+        if (pCur->uLeaf != leaf || pCur->uSubLeaf != subleaf) {
+            continue;
+        }
+        return pCur;
+    }
+    return nullptr;
+}
+
+/** ANDs eax..edx of selected CPUID leaves with the matching KVM-reported leaf; all other leaves are left untouched. */
+static void maybeMaskUnsupportedKVMCpuidLeafValues(PCPUMCPUIDLEAF paKvmSupportedLeaves,
+                                                   uint32_t cKvmSupportedLeaves,
+                                                   uint32_t leaf,
+                                                   uint32_t subleaf,
+                                                   uint32_t& eax,
+                                                   uint32_t& ebx,
+                                                   uint32_t& ecx,
+                                                   uint32_t& edx)
+{
+    static const uint32_t CPUID_FEATURE_INFORMATION_LEAF = 0x1;
+
+    /*
+     * A list of CPUID leaves that we want to mask with the KVM
+     * supported values. For example, we want to make sure that FSGSBASE
+     * is supported by KVM before we offer it to the guest, because
+     * VirtualBox detects the features it wants to offer via CPUID,
+     * which bypasses Linux/KVM. Static, so it is not rebuilt per call.
+     */
+    static const uint32_t s_aLeavesToMask[] = {
+        CPUID_FEATURE_INFORMATION_LEAF,
+        0x6,        // Thermal and power management
+        0x7,        // Structured Extended Feature Flags Enumeration
+        0x12,       // SGX capabilities
+        0x14,       // Processor Trace
+        0x19,       // AES Key Locker features
+        0x24,       // AVX10 Features
+        0x80000001, // Extended Processor Info and Feature Bits
+        0x80000007, // Processor Power Management Information and RAS Capabilities
+        0x80000008, // Virtual and Physical address Sizes
+        0x8000000A, // Secure Virtual Machine features
+        0x8000001F, // Encrypted Memory Capabilities
+        0x80000021, // Extended Feature Identification 2
+    };
+
+    if (std::find(std::begin(s_aLeavesToMask), std::end(s_aLeavesToMask), leaf) == std::end(s_aLeavesToMask)) {
+        return;
+    }
+
+    auto* paKvmSupportedLeaf = findKvmLeaf(paKvmSupportedLeaves, cKvmSupportedLeaves, leaf, subleaf);
+    if (paKvmSupportedLeaf == nullptr) {
+        return;
+    }
+
+    switch (leaf) {
+    case CPUID_FEATURE_INFORMATION_LEAF:
+        eax &= paKvmSupportedLeaf->uEax;
+        // ebx reports APIC IDs which we would mask if we use the
+        // KVM supported values.
+        ecx &= paKvmSupportedLeaf->uEcx;
+        ecx |= X86_CPUID_FEATURE_ECX_HVP; // The hypervisor bit is not enabled in the KVM values.
+        edx &= paKvmSupportedLeaf->uEdx;
+        break;
+    default:
+        eax &= paKvmSupportedLeaf->uEax;
+        ebx &= paKvmSupportedLeaf->uEbx;
+        ecx &= paKvmSupportedLeaf->uEcx;
+        edx &= paKvmSupportedLeaf->uEdx;
+        break;
+    }
+}
 
 /**
  * Update the CPUID leaves for a VCPU.
@@ -128,6 +357,12 @@ static int nemR3LnxUpdateCpuIdsLeaves(PVM pVM, PVMCPU pVCpu)
     pReq->nent    = cLeaves;
     pReq->padding = 0;
 
+    size_t cKvmSupportedLeaves = 0;
+    PCPUMCPUIDLEAF paKvmSupportedLeaves = nullptr;
+    int rc = NEMR3KvmGetCpuIdLeaves(pVM, &paKvmSupportedLeaves, &cKvmSupportedLeaves);
+    AssertLogRelMsgReturn(RT_SUCCESS(rc), ("Could not retrieve supported CPUID leaves"), rc);
+    /* The leaf array is RTMemAllocZ'ed for us and was never released; free it on every return path from here on. */
+    std::unique_ptr<CPUMCPUIDLEAF, void (*)(PCPUMCPUIDLEAF)> const kvmLeavesOwner(paKvmSupportedLeaves, [](PCPUMCPUIDLEAF p) { RTMemFree(p); });
     for (uint32_t i = 0; i < cLeaves; i++)
     {
         CPUMGetGuestCpuId(pVCpu, paLeaves[i].uLeaf, paLeaves[i].uSubLeaf, -1 /*f64BitMode*/,
@@ -135,6 +370,16 @@ static int nemR3LnxUpdateCpuIdsLeaves(PVM pVM, PVMCPU pVCpu)
                           &pReq->entries[i].ebx,
                           &pReq->entries[i].ecx,
                           &pReq->entries[i].edx);
+
+        maybeMaskUnsupportedKVMCpuidLeafValues(paKvmSupportedLeaves,
+                                               cKvmSupportedLeaves,
+                                               paLeaves[i].uLeaf,
+                                               paLeaves[i].uSubLeaf,
+                                               pReq->entries[i].eax,
+                                               pReq->entries[i].ebx,
+                                               pReq->entries[i].ecx,
+                                               pReq->entries[i].edx);
+
         pReq->entries[i].function   = paLeaves[i].uLeaf;
         pReq->entries[i].index      = paLeaves[i].uSubLeaf;
         pReq->entries[i].flags      = !paLeaves[i].fSubLeafMask ? 0 : KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
@@ -149,6 +394,111 @@ static int nemR3LnxUpdateCpuIdsLeaves(PVM pVM, PVMCPU pVCpu)
     return VINF_SUCCESS;
 }
 
+/** Per-provider guest interface setup: for the Hyper-V provider, enables KVM_CAP_HYPERV_SYNIC2 on every vCPU. */
+static int nemR3LnxInitGuestInterface(PVM pVM)
+{
+    switch (pVM->gim.s.enmProviderId) {
+    case GIMPROVIDERID_HYPERV:
+        /*
+          NOTE(review): this comment used to say SynIC is disabled pending investigation of interrupt issues (see #19),
+          but the '#if 1' below compiles the enabling code in - confirm which of the two is intended.
+
+          Enabling this capability is not sufficient to enable SynIC. The corresponding features in the Hyper-V CPUID
+          leaves also have to be enabled. Look for SYNIC and STIMER in GIMHv.cpp. The CPUID implementation hints must
+          also indicate deprecating AutoEOI to make APICv work.
+         */
+#if 1
+        LogRel(("NEM: Enabling SYNIC.\n"));
+
+        for (VMCPUID idCpu = 0; idCpu < pVM->cCpus; idCpu++)
+        {
+            PVMCPU pVCpu = pVM->apCpusR3[idCpu];
+
+            struct kvm_enable_cap CapSynIC =
+            {
+                KVM_CAP_HYPERV_SYNIC2, 0, { 0, 0, 0, 0 }
+            };
+
+            int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_ENABLE_CAP, &CapSynIC);
+            AssertLogRelMsgReturn(rcLnx == 0, ("Failed to enable SYNIC: rcLnx=%d errno=%d\n", rcLnx, errno),
+                                  RTErrConvertFromErrno(errno));
+        }
+#endif
+        break;
+
+    default:
+        /* Other guest interfaces are not fully supported. */
+        break;
+    }
+
+    return VINF_SUCCESS;
+}
+
+namespace
+{
+
+enum class KvmCpuIdIoctl : uint32_t /* selects the ioctl KvmGetCpuIdLeavesGeneric issues */
+{
+    CPUID = KVM_GET_SUPPORTED_CPUID,
+    HV_CPUID = KVM_GET_SUPPORTED_HV_CPUID
+};
+
+/**
+ * Queries a CPUID leaf list from KVM through pVM->nem.s.fdKvm; ioctlNum selects which list.
+ * On success *outpCpuId is an RTMemAllocZ'ed array of *outcLeaves entries which the caller has to RTMemFree();
+ * on failure a VBox error status is returned and they are left as nullptr / 0.
+ */
+int KvmGetCpuIdLeavesGeneric(PVM pVM, KvmCpuIdIoctl ioctlNum, PCPUMCPUIDLEAF *outpCpuId, size_t *outcLeaves)
+{
+    /* In case we exit due to errors. */
+    *outpCpuId = nullptr;
+    *outcLeaves = 0;
+    /* There is no way to query how many leaves there are, so retry with one more entry until KVM stops reporting
+       E2BIG. One heap buffer is reused: alloca() in the loop would keep every earlier attempt on the stack. */
+    std::vector<uint8_t> buffer;
+    struct kvm_cpuid2 *pKvmCpuid = nullptr;
+    uint32_t cLeaves = 0;
+    int rc = 0, iErrno = 0;
+    do
+    {
+        cLeaves += 1;
+        Log(("Querying for %u leaves\n", cLeaves));
+        buffer.assign(RT_UOFFSETOF_DYN(struct kvm_cpuid2, entries[cLeaves]), 0); /* also zeroes the padding field */
+        pKvmCpuid = reinterpret_cast<struct kvm_cpuid2 *>(buffer.data());
+        pKvmCpuid->nent = cLeaves;
+        rc = ioctl(pVM->nem.s.fdKvm, static_cast<uint32_t>(ioctlNum), pKvmCpuid);
+        iErrno = errno; /* Capture right away, the logging below may overwrite errno. */
+    } while (rc != 0 && iErrno == E2BIG);
+    AssertLogRelMsgReturn(rc == 0, ("Failed to query supported CPUID leaves: errno=%d", iErrno), RTErrConvertFromErrno(iErrno));
+    AssertFatal(cLeaves == pKvmCpuid->nent);
+
+    PCPUMCPUIDLEAF pCpuId = static_cast<PCPUMCPUIDLEAF>(RTMemAllocZ(sizeof(*pCpuId) * cLeaves));
+    AssertLogRelReturn(pCpuId != nullptr, VERR_NO_MEMORY);
+    for (uint32_t uLeaf = 0; uLeaf < cLeaves; uLeaf++)
+    {
+        pCpuId[uLeaf].uLeaf = pKvmCpuid->entries[uLeaf].function;
+        pCpuId[uLeaf].uSubLeaf = pKvmCpuid->entries[uLeaf].index;
+        pCpuId[uLeaf].uEax = pKvmCpuid->entries[uLeaf].eax;
+        pCpuId[uLeaf].uEbx = pKvmCpuid->entries[uLeaf].ebx;
+        pCpuId[uLeaf].uEcx = pKvmCpuid->entries[uLeaf].ecx;
+        pCpuId[uLeaf].uEdx = pKvmCpuid->entries[uLeaf].edx;
+    }
+    *outpCpuId = pCpuId;
+    *outcLeaves = cLeaves;
+    return VINF_SUCCESS;
+}
+
+} // anonymous namespace
+
+int NEMR3KvmGetHvCpuIdLeaves(PVM pVM, PCPUMCPUIDLEAF *outpCpuId, size_t *outcLeaves)
+{   /* Thin wrapper selecting KVM_GET_SUPPORTED_HV_CPUID; the returned array is RTMemAllocZ'ed by the helper. */
+    return KvmGetCpuIdLeavesGeneric(pVM, KvmCpuIdIoctl::HV_CPUID, outpCpuId, outcLeaves);
+}
+
+int NEMR3KvmGetCpuIdLeaves(PVM pVM, PCPUMCPUIDLEAF *outpCpuId, size_t *outcLeaves)
+{   /* Thin wrapper selecting KVM_GET_SUPPORTED_CPUID; the returned array is RTMemAllocZ'ed by the helper. */
+    return KvmGetCpuIdLeavesGeneric(pVM, KvmCpuIdIoctl::CPUID, outpCpuId, outcLeaves);
+}
 
 DECLHIDDEN(int) nemR3NativeInitCompletedRing3(PVM pVM)
 {
@@ -163,10 +513,28 @@ DECLHIDDEN(int) nemR3NativeInitCompletedRing3(PVM pVM)
      */
     for (VMCPUID idCpu = 0; idCpu < pVM->cCpus; idCpu++)
     {
+        PCPUMCTXMSRS const  pCtxMsrs    = CPUMQueryGuestCtxMsrsPtr(pVM->apCpusR3[idCpu]);
+
         int rc = nemR3LnxUpdateCpuIdsLeaves(pVM, pVM->apCpusR3[idCpu]);
         AssertRCReturn(rc, rc);
+
+#ifdef VBOX_WITH_KVM_NESTING
+        if (pVM->cpum.s.GuestFeatures.fVmx) {
+            NEMR3KvmSetMsr(pVM->apCpusR3[idCpu], MSR_IA32_FEATURE_CONTROL, MSR_IA32_FEATURE_CONTROL_VMXON | MSR_IA32_FEATURE_CONTROL_LOCK);
+        }
+#endif
+
+        uint64_t val {0};
+        NEMR3KvmGetMsr(pVM->apCpusR3[idCpu], MSR_IA32_ARCH_CAPABILITIES, &val);
+        pCtxMsrs->msr.ArchCaps = val;
+
+        NEMR3KvmGetMsr(pVM->apCpusR3[idCpu], MSR_IA32_SPEC_CTRL, &val);
+        pCtxMsrs->msr.SpecCtrl = val;
     }
 
+    int rcLnxGI = nemR3LnxInitGuestInterface(pVM);
+    AssertRCReturn(rcLnxGI, rcLnxGI);
+
     /*
      * Configure MSRs after ring-3 init is done.
      *
@@ -193,6 +561,8 @@ DECLHIDDEN(int) nemR3NativeInitCompletedRing3(PVM pVM)
         MsrFilters.ranges[iRange].bitmap = (uint8_t *)&RT_CONCAT(bm, a_uBase)[0]
 #define MSR_RANGE_ADD(a_Msr) \
     do { Assert((uint32_t)(a_Msr) - uBase < cMsrs); ASMBitSet(pbm, (uint32_t)(a_Msr) - uBase); } while (0)
+#define MSR_RANGE_ADD_CLOSED_IVL(first_Msr, last_Msr) /* MSR_RANGE_ADD for every MSR in [first_Msr, last_Msr] */ \
+    do { for (uint32_t uMsr = (first_Msr); uMsr <= (last_Msr); uMsr++) { MSR_RANGE_ADD(uMsr); } } while (0)
 #define MSR_RANGE_END(a_cMinMsrs) \
         /* optimize the range size before closing: */ \
         uint32_t cBitmap = cMsrs / 64; \
@@ -204,11 +574,44 @@ DECLHIDDEN(int) nemR3NativeInitCompletedRing3(PVM pVM)
 
     /* 1st Intel range: 0000_0000 to 0000_3000. */
     MSR_RANGE_BEGIN(0x00000000, 0x00003000, KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE);
+    MSR_RANGE_ADD(MSR_IA32_BIOS_SIGN_ID);
     MSR_RANGE_ADD(MSR_IA32_TSC);
+    MSR_RANGE_ADD(MSR_IA32_APICBASE);
     MSR_RANGE_ADD(MSR_IA32_SYSENTER_CS);
     MSR_RANGE_ADD(MSR_IA32_SYSENTER_ESP);
     MSR_RANGE_ADD(MSR_IA32_SYSENTER_EIP);
     MSR_RANGE_ADD(MSR_IA32_CR_PAT);
+    MSR_RANGE_ADD(MSR_IA32_ARCH_CAPABILITIES);
+    MSR_RANGE_ADD(MSR_IA32_SPEC_CTRL);
+    MSR_RANGE_ADD(MSR_IA32_PRED_CMD);
+    MSR_RANGE_ADD(MSR_IA32_FLUSH_CMD);
+#ifdef VBOX_WITH_KVM_NESTING
+    if (pVM->cpum.s.GuestFeatures.fVmx) {
+        /* VMX MSRS */
+        MSR_RANGE_ADD(MSR_IA32_FEATURE_CONTROL);
+        MSR_RANGE_ADD(MSR_IA32_MISC_ENABLE);
+        MSR_RANGE_ADD(MSR_IA32_VMX_BASIC);
+        MSR_RANGE_ADD(MSR_IA32_VMX_PINBASED_CTLS);
+        MSR_RANGE_ADD(MSR_IA32_VMX_PROCBASED_CTLS);
+        MSR_RANGE_ADD(MSR_IA32_VMX_EXIT_CTLS);
+        MSR_RANGE_ADD(MSR_IA32_VMX_ENTRY_CTLS);
+        MSR_RANGE_ADD(MSR_IA32_VMX_MISC);
+        MSR_RANGE_ADD(MSR_IA32_VMX_CR0_FIXED0);
+        MSR_RANGE_ADD(MSR_IA32_VMX_CR0_FIXED1);
+        MSR_RANGE_ADD(MSR_IA32_VMX_CR4_FIXED0);
+        MSR_RANGE_ADD(MSR_IA32_VMX_CR4_FIXED1);
+        MSR_RANGE_ADD(MSR_IA32_VMX_VMCS_ENUM);
+        MSR_RANGE_ADD(MSR_IA32_VMX_PROCBASED_CTLS2);
+        MSR_RANGE_ADD(MSR_IA32_VMX_EPT_VPID_CAP);
+        MSR_RANGE_ADD(MSR_IA32_VMX_TRUE_PINBASED_CTLS);
+        MSR_RANGE_ADD(MSR_IA32_VMX_TRUE_PROCBASED_CTLS);
+        MSR_RANGE_ADD(MSR_IA32_VMX_TRUE_EXIT_CTLS);
+        MSR_RANGE_ADD(MSR_IA32_VMX_TRUE_ENTRY_CTLS);
+        MSR_RANGE_ADD(MSR_IA32_VMX_VMFUNC);
+        MSR_RANGE_ADD(MSR_IA32_VMX_PROCBASED_CTLS3);
+        MSR_RANGE_ADD(MSR_IA32_VMX_EXIT_CTLS2);
+    }
+#endif
     /** @todo more? */
     MSR_RANGE_END(64);
 
@@ -216,6 +619,13 @@ DECLHIDDEN(int) nemR3NativeInitCompletedRing3(PVM pVM)
     MSR_RANGE_BEGIN(0xc0000000, 0xc0003000, KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE);
     MSR_RANGE_ADD(MSR_K6_EFER);
     MSR_RANGE_ADD(MSR_K6_STAR);
+
+    /*
+     * If we don't allow direct access to FS_BASE, we clobber the FS base for the guest. This sounds like a bug in
+     * our state synchronization with KVM.
+     */
+    MSR_RANGE_ADD(MSR_K8_FS_BASE);
+
     MSR_RANGE_ADD(MSR_K8_GS_BASE);
     MSR_RANGE_ADD(MSR_K8_KERNEL_GS_BASE);
     MSR_RANGE_ADD(MSR_K8_LSTAR);
@@ -225,6 +635,49 @@ DECLHIDDEN(int) nemR3NativeInitCompletedRing3(PVM pVM)
     /** @todo add more? */
     MSR_RANGE_END(64);
 
+    if (pVM->gim.s.enmProviderId == GIMPROVIDERID_HYPERV)
+    {
+        MSR_RANGE_BEGIN(0x40000000, 0x40003000, KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE);
+
+        MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE0_FIRST, MSR_GIM_HV_RANGE0_LAST);
+        MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE1_FIRST, MSR_GIM_HV_RANGE1_LAST);
+        MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE2_FIRST, MSR_GIM_HV_RANGE2_LAST);
+        MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE3_FIRST, MSR_GIM_HV_RANGE3_LAST);
+
+        /* SynIC / STimer */
+        MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE4_FIRST, MSR_GIM_HV_RANGE4_LAST);
+        MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE5_FIRST, MSR_GIM_HV_RANGE5_LAST);
+        MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE6_FIRST, MSR_GIM_HV_RANGE6_LAST);
+
+        MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE7_FIRST, MSR_GIM_HV_RANGE7_LAST);
+        MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE8_FIRST, MSR_GIM_HV_RANGE8_LAST);
+        MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE9_FIRST, MSR_GIM_HV_RANGE9_LAST);
+        MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE10_FIRST, MSR_GIM_HV_RANGE10_LAST);
+        MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE11_FIRST, MSR_GIM_HV_RANGE11_LAST);
+
+        /*
+         * Crash MSRs
+         *
+         * We deliberately don't add them here, so we can handle them instead of KVM. This allows us to log the
+         * crash reason into VM log instead of it ending up in the kernel's log.
+         */
+        // MSR_RANGE_ADD_CLOSED_IVL(MSR_GIM_HV_RANGE12_FIRST, MSR_GIM_HV_RANGE12_LAST);
+
+        /*
+         * These should be available to the guest with feature bit 23 in the base features, which we don't
+         * expose. But Windows touches them anyway?
+         */
+        MSR_RANGE_ADD(0x40000114 /* HV_X64_MSR_STIME_UNHALTED_TIMER_CONFIG */);
+        MSR_RANGE_ADD(0x40000115 /* HV_X64_MSR_STIME_UNHALTED_TIMER_COUNT */);
+
+        /*
+         * These are available to the guest with feature bit 15 in the base features (undocumented).
+         */
+        MSR_RANGE_ADD(0x40000118 /* HV_X64_MSR_TSC_INVARIANT_CONTROL */);
+
+        MSR_RANGE_END(64);
+    }
+
     /** @todo Specify other ranges too? Like hyper-V and KVM to make sure we get
      *        the MSR requests instead of KVM. */
 
@@ -237,6 +690,330 @@ DECLHIDDEN(int) nemR3NativeInitCompletedRing3(PVM pVM)
 }
 
 
+
+/*********************************************************************************************************************************
+*   Memory management                                                                                                            *
+*********************************************************************************************************************************/
+
+VMMR3_INT_DECL(int) NEMR3LoadExec(PVM pVM)
+{
+    /*
+     * Puts every vCPU that is in one of the started states into KVM_MP_STATE_RUNNABLE.
+     * TODO: this leaves a small window between the guest sending an INIT IPI and the subsequent SIPI. In that
+     * case we would need the MP state `KVM_MP_STATE_INIT_RECEIVED`, which requires some serious interaction
+     * between the NEM and SSM. For now, we hope that no one suspends a VM during VCPU bringup. See vbox-engineering#426.
+     */
+    for (VMCPUID idCpu = 0; idCpu < pVM->cCpus; idCpu++) {
+        PVMCPU const pVCpu = pVM->apCpusR3[idCpu];
+        auto const enmState = VMCPU_GET_STATE(pVCpu);
+        if (enmState != VMCPUSTATE_STARTED && enmState != VMCPUSTATE_STARTED_EXEC_NEM && enmState != VMCPUSTATE_STARTED_EXEC_NEM_WAIT) {
+            continue;
+        }
+        struct kvm_mp_state MpState = { KVM_MP_STATE_RUNNABLE };
+        int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_SET_MP_STATE, &MpState);
+        AssertLogRelMsgReturn(rcLnx == 0, ("NEMR3Load: Failed to set MP state. Error: %d, errno %d\n", rcLnx, errno), VERR_NEM_IPE_5);
+    }
+    return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmGetMsr(PVMCPU pVCpu, uint64_t msr, uint64_t* val)
+{
+    /* Reads one MSR of the vCPU into *val. A missing output buffer is rejected before talking to KVM. */
+    AssertLogRelMsgReturn(val != nullptr, ("NEMR3KvmGetMsr: Invalid buffer\n"), VERR_NEM_IPE_5);
+
+    /* Backing store for the kvm_msrs header plus the single entry used below. */
+    alignas(struct kvm_msrs) char backing[sizeof(struct kvm_msrs) + sizeof(struct kvm_msr_entry)];
+    struct kvm_msrs* msr_data {reinterpret_cast<struct kvm_msrs*>(&backing[0])};
+    RT_ZERO(backing);
+
+    msr_data->nmsrs = 1;
+    msr_data->entries[0].index = msr;
+
+    /* Exactly one entry was requested, so anything but a return value of 1 is treated as a failure. */
+    int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_GET_MSRS, msr_data);
+    AssertLogRelMsgReturn(rcLnx == 1, ("NEMR3KvmGetMsr: Failed to get MSR data. Error: %d, errno %d\n", rcLnx, errno), VERR_NOT_SUPPORTED);
+
+    *val = msr_data->entries[0].data;
+    return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmSetMsr(PVMCPU pVCpu, uint64_t msr, uint64_t val)
+{
+    /* Writes one MSR of the vCPU through KVM_SET_MSRS; the buffer holds the kvm_msrs header plus a single entry. */
+    alignas(struct kvm_msrs) char backing[sizeof(struct kvm_msrs) + sizeof(struct kvm_msr_entry)];
+    struct kvm_msrs* msr_data {reinterpret_cast<struct kvm_msrs*>(&backing[0])};
+    RT_ZERO(backing);
+
+    msr_data->nmsrs = 1;
+    msr_data->entries[0].index = msr;
+    msr_data->entries[0].data = val;
+
+    int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_SET_MSRS, msr_data);
+    /* Message kept on one line: a backslash continuation inside the literal puts the indentation into the log. */
+    AssertLogRelMsgReturn(rcLnx == 1, ("NEMR3KvmSetMsr: Failed to set MSR[%lx] data. Error: %d, errno %d\n", msr, rcLnx, errno), VERR_NOT_SUPPORTED);
+    return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmGetLapicState(PVMCPU pVCpu, void* pXApicPage)
+{
+    struct kvm_lapic_state state;
+    /* KVM_GET_LAPIC on the vCPU fd fills 'state'; the message is one line so no indentation ends up in the log. */
+    int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_GET_LAPIC, &state);
+    AssertLogRelMsgReturn(rcLnx == 0, ("NEMR3KvmGetLapicState: Failed to get APIC state. Error: %d, errno %d\n", rcLnx, errno), VERR_NEM_IPE_5);
+
+    /* pXApicPage has to provide room for KVM_APIC_REG_SIZE bytes. */
+    memcpy(pXApicPage, &state.regs[0], KVM_APIC_REG_SIZE);
+    return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmSetLapicState(PVMCPU pVCpu, void* pXApicPage)
+{
+    /* Hands KVM_APIC_REG_SIZE bytes from pXApicPage to KVM_SET_LAPIC on the vCPU fd. */
+    struct kvm_lapic_state state;
+    memcpy(&state.regs[0], pXApicPage, KVM_APIC_REG_SIZE);
+
+    int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_SET_LAPIC, &state);
+    /* The message now carries this function's real name and no longer embeds source indentation. */
+    AssertLogRelMsgReturn(rcLnx == 0, ("NEMR3KvmSetLapicState: Failed to set APIC state. Error %d, errno %d\n", rcLnx, errno), VERR_NEM_IPE_5);
+
+    return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmSetIrqLine(PVM pVM, uint16_t u16Gsi, int iLevel)
+{
+    /* Zero the whole request first, then fill in the GSI and the requested level for KVM_IRQ_LINE. */
+    struct kvm_irq_level IrqLevel;
+    RT_ZERO(IrqLevel);
+    IrqLevel.irq   = u16Gsi;
+    IrqLevel.level = iLevel;
+
+    int const rcLnx = ioctl(pVM->nem.s.fdVm, KVM_IRQ_LINE, &IrqLevel);
+    AssertLogRelMsgReturn(rcLnx == 0, ("NEMR3KvmSetIrqLine: Failed to set irq line %d! error: %d, errno %d\n", u16Gsi, rcLnx, errno), VERR_NEM_IPE_5);
+
+    return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmSplitIrqchipDeliverMsi(PVM pVM, PCMSIMSG pMsi)
+{
+    AssertLogRelReturn(pVM != nullptr, VERR_INVALID_POINTER);
+    AssertLogRelReturn(pMsi != nullptr, VERR_INVALID_POINTER);
+
+    /* Copy address and data of the MSI message into a zeroed KVM request. */
+    struct kvm_msi KvmMsi;
+    RT_ZERO(KvmMsi);
+    KvmMsi.address_lo = pMsi->Addr.au32[0];
+    KvmMsi.address_hi = pMsi->Addr.au32[1];
+    KvmMsi.data       = pMsi->Data.u32;
+    int const rcLnx = ioctl(pVM->nem.s.fdVm, KVM_SIGNAL_MSI, &KvmMsi);
+    AssertLogRelMsgReturn(rcLnx >= 0, ("NEMR3KvmSplitIrqchipDeliverMsi: Failed to deliver MSI! error: %d, errno %d\n", rcLnx, errno), VERR_NEM_IPE_5);
+    /* Negative values were handled above: 0 is reported as a discarded interrupt, anything positive as success. */
+    return rcLnx > 0 ? VINF_SUCCESS : VERR_APIC_INTR_DISCARDED;
+}
+
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+static int kvmSetGsiRoutingFullIrqChip(PVM pVM)
+{
+    /* Zeroed backing store: the routing header followed by one entry per PIC pin and one per IOAPIC pin. */
+    alignas(kvm_irq_routing) char abTable[ sizeof(struct kvm_irq_routing) +
+        (KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS + KVM_IRQCHIP_NUM_PIC_INTR_PINS) * sizeof(struct kvm_irq_routing_entry) ] {};
+    kvm_irq_routing* const pTable = reinterpret_cast<kvm_irq_routing*>(abTable);
+
+    /* PIC part: GSIs below 8 go to the master with pin = GSI, the others to the slave with pin = GSI - 8. */
+    for (unsigned iGsi = 0; iGsi < KVM_IRQCHIP_NUM_PIC_INTR_PINS; ++iGsi) {
+        auto& rEntry = pTable->entries[iGsi];
+        bool const fMaster = iGsi < 8;
+        rEntry.gsi = iGsi;
+        rEntry.type = KVM_IRQ_ROUTING_IRQCHIP;
+        rEntry.u.irqchip.irqchip = fMaster ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE;
+        rEntry.u.irqchip.pin = fMaster ? iGsi : (iGsi - 8);
+    }
+
+    /* IOAPIC part, stored behind the PIC entries: GSI n goes to pin n, except GSI 0 which goes to pin 2. */
+    for (unsigned iGsi = 0; iGsi < KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS; ++iGsi) {
+        auto& rEntry = pTable->entries[iGsi + KVM_IRQCHIP_NUM_PIC_INTR_PINS];
+        rEntry.gsi = iGsi;
+        rEntry.type = KVM_IRQ_ROUTING_IRQCHIP;
+        rEntry.u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC;
+        rEntry.u.irqchip.pin = iGsi == 0 ? 2u : iGsi;
+    }
+    pTable->nr = KVM_IRQCHIP_NUM_PIC_INTR_PINS + KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS;
+
+    int const rc = ioctl(pVM->nem.s.fdVm, KVM_SET_GSI_ROUTING, pTable);
+    AssertLogRelMsgReturn(rc >= 0, ("NEM/KVM: Unable to set GSI routing! rc: %d errno %d \n", rc, errno), VERR_INTERNAL_ERROR);
+
+    return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmGetPicState(PVM pVM, KVMIRQCHIP irqchip, KVMPICSTATE* state)
+{
+    /* Reads the state of the master or slave PIC via KVM_GET_IRQCHIP and copies it field by field into *state. */
+    if (state == nullptr) {
+        return VERR_INVALID_POINTER;
+    }
+
+    struct kvm_irqchip irqchip_state;
+    irqchip_state.chip_id = irqchip == KVMIRQCHIP::PIC_MASTER ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE;
+
+    int rcLnx = ioctl(pVM->nem.s.fdVm, KVM_GET_IRQCHIP, &irqchip_state);
+    AssertLogRelMsgReturn(rcLnx == 0, ("NEMR3KvmGetPicState: Failed to get PIC state. Error: %d, errno %d\n", rcLnx, errno), VERR_NEM_IPE_5);
+
+    const auto& pic = irqchip_state.chip.pic;
+    state->last_irr = pic.last_irr;
+    state->irr = pic.irr;
+    state->imr = pic.imr;
+    state->isr = pic.isr;
+    state->priority_add = pic.priority_add;
+    state->irq_base = pic.irq_base;
+    state->read_reg_select = pic.read_reg_select;
+    state->poll = pic.poll;
+    state->special_mask = pic.special_mask;
+    state->init_state = pic.init_state;
+    state->auto_eoi = pic.auto_eoi;
+    state->rotate_on_auto_eoi = pic.rotate_on_auto_eoi;
+    state->special_fully_nested_mode = pic.special_fully_nested_mode;
+    state->init4 = pic.init4;
+    state->elcr = pic.elcr;
+    state->elcr_mask = pic.elcr_mask;
+    return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmSetPicState(PVM pVM, KVMIRQCHIP irqchip, KVMPICSTATE* state)
+{
+    /* Writes *state to the master or slave PIC, selected by 'irqchip', through KVM_SET_IRQCHIP. */
+    struct kvm_irqchip irqchip_state;
+    irqchip_state.chip_id = irqchip == KVMIRQCHIP::PIC_MASTER ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE;
+    if (state == nullptr) {
+        return VERR_INVALID_POINTER;
+    }
+
+    irqchip_state.chip.pic.last_irr = state->last_irr;
+    irqchip_state.chip.pic.irr = state->irr;
+    irqchip_state.chip.pic.imr = state->imr;
+    irqchip_state.chip.pic.isr = state->isr;
+    irqchip_state.chip.pic.priority_add = state->priority_add;
+    irqchip_state.chip.pic.irq_base = state->irq_base;
+    irqchip_state.chip.pic.read_reg_select = state->read_reg_select;
+    irqchip_state.chip.pic.poll = state->poll;
+    irqchip_state.chip.pic.special_mask = state->special_mask;
+    irqchip_state.chip.pic.init_state = state->init_state;
+    irqchip_state.chip.pic.auto_eoi = state->auto_eoi;
+    irqchip_state.chip.pic.rotate_on_auto_eoi = state->rotate_on_auto_eoi;
+    irqchip_state.chip.pic.special_fully_nested_mode = state->special_fully_nested_mode;
+    irqchip_state.chip.pic.init4 = state->init4;
+    irqchip_state.chip.pic.elcr = state->elcr;
+    irqchip_state.chip.pic.elcr_mask = state->elcr_mask;
+
+    /* This must be the SET ioctl: with KVM_GET_IRQCHIP the values filled in above were overwritten and never applied. */
+    int rcLnx = ioctl(pVM->nem.s.fdVm, KVM_SET_IRQCHIP, &irqchip_state);
+    AssertLogRelMsgReturn(rcLnx == 0, ("NEMR3KvmSetPicState: Failed to set PIC state. Error: %d, errno %d\n", rcLnx, errno), VERR_NEM_IPE_5);
+
+    return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmGetIoApicState(PVM pVM, KVMIOAPICSTATE* state)
+{
+    /* Reads the IOAPIC state via KVM_GET_IRQCHIP and copies the registers and the redirection table into *state. */
+    struct kvm_irqchip irqchip_state;
+    irqchip_state.chip_id = KVM_IRQCHIP_IOAPIC;
+
+    if (state == nullptr) {
+        return VERR_INVALID_POINTER;
+    }
+
+    int rcLnx = ioctl(pVM->nem.s.fdVm, KVM_GET_IRQCHIP, &irqchip_state);
+    AssertLogRelMsgReturn(rcLnx == 0, ("NEMR3KvmGetIoApicState: Failed to get IOAPIC state. Error: %d, errno %d\n", rcLnx, errno), VERR_NEM_IPE_5);
+
+    state->base_address = irqchip_state.chip.ioapic.base_address;
+    state->ioregsel = irqchip_state.chip.ioapic.ioregsel;
+    state->id = irqchip_state.chip.ioapic.id;
+    state->irr = irqchip_state.chip.ioapic.irr;
+
+    for (unsigned i = 0; i < KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS; ++i) {
+        state->redirtbl[i] = irqchip_state.chip.ioapic.redirtbl[i].bits;
+    }
+
+    return VINF_SUCCESS;
+}
+
+VMMR3_INT_DECL(int) NEMR3KvmSetIoApicState(PVM pVM, KVMIOAPICSTATE* state)
+{
+    /* Writes *state to the IOAPIC via KVM_SET_IRQCHIP; the request starts zeroed so unset fields are not garbage. */
+    struct kvm_irqchip irqchip_state = {};
+    irqchip_state.chip_id = KVM_IRQCHIP_IOAPIC;
+
+    if (state == nullptr) {
+        return VERR_INVALID_POINTER;
+    }
+
+    irqchip_state.chip.ioapic.base_address = state->base_address;
+    irqchip_state.chip.ioapic.ioregsel = state->ioregsel;
+    irqchip_state.chip.ioapic.id = state->id;
+    irqchip_state.chip.ioapic.irr = state->irr;
+
+    for (unsigned i = 0; i < KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS; ++i) {
+        irqchip_state.chip.ioapic.redirtbl[i].bits = state->redirtbl[i];
+    }
+
+    int rcLnx = ioctl(pVM->nem.s.fdVm, KVM_SET_IRQCHIP, &irqchip_state);
+    AssertLogRelMsgReturn(rcLnx == 0, ("NEMR3KvmSetIoApicState: Failed to set IOAPIC state. Error: %d, errno %d\n", rcLnx, errno), VERR_NEM_IPE_5);
+
+    return VINF_SUCCESS;
+}
+#endif
+
+static int kvmSetGsiRouting(PVM pVM)
+{
+    /* Zeroed storage for the routing header plus at most one MSI entry per IOAPIC pin. */
+    alignas(kvm_irq_routing) char abTable[ sizeof(struct kvm_irq_routing) + KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS * sizeof(struct kvm_irq_routing_entry) ] {};
+    kvm_irq_routing* const pTable = reinterpret_cast<kvm_irq_routing*>(abTable);
+
+    /* Only redirection table slots that hold a message produce a route; the routes are packed without gaps. */
+    unsigned cRoutes = 0;
+    for (unsigned iGsi = 0; iGsi < KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS; ++iGsi)
+    {
+        auto& rSlot = pVM->nem.s.pARedirectionTable->at(iGsi);
+        if (!rSlot.has_value())
+            continue;
+
+        PMSIMSG const pMsi = &rSlot.value();
+        auto& rRoute = pTable->entries[cRoutes++];
+        rRoute.gsi = iGsi;
+        rRoute.type = KVM_IRQ_ROUTING_MSI;
+        rRoute.u.msi.address_lo = pMsi->Addr.au32[0];
+        rRoute.u.msi.address_hi = pMsi->Addr.au32[1];
+        rRoute.u.msi.data = pMsi->Data.u32;
+    }
+    pTable->nr = cRoutes;
+
+    int const rc = ioctl(pVM->nem.s.fdVm, KVM_SET_GSI_ROUTING, pTable);
+    AssertLogRelMsgReturn(rc >= 0, ("NEM/KVM: Unable to set GSI routing! rc: %d errno %d \n", rc, errno), VERR_INTERNAL_ERROR);
+
+    return VINF_SUCCESS;
+}
+
+
+VMMR3_INT_DECL(int) NEMR3KvmSplitIrqchipAddUpdateRTE(PVM pVM, uint16_t u16Gsi, PCMSIMSG pMsi)
+{
+    /* Stores a copy of *pMsi in the redirection table slot of the GSI, then reprograms the KVM GSI routing. */
+    AssertRelease(pVM->nem.s.pARedirectionTable != nullptr);
+    AssertRelease(u16Gsi < KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS);
+    auto& rSlot = pVM->nem.s.pARedirectionTable->at(u16Gsi);
+    rSlot = *pMsi;
+    return kvmSetGsiRouting(pVM);
+}
+
+
+VMMR3_INT_DECL(int) NEMR3KvmSplitIrqchipRemoveRTE(PVM pVM, uint16_t u16Gsi)
+{
+    /* Empties the redirection table slot of the GSI, then reprograms the KVM GSI routing without it. */
+    AssertRelease(pVM->nem.s.pARedirectionTable != nullptr);
+    AssertRelease(u16Gsi < KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS);
+    auto& rSlot = pVM->nem.s.pARedirectionTable->at(u16Gsi);
+    rSlot.reset();
+    return kvmSetGsiRouting(pVM);
+}
+
+
 /*********************************************************************************************************************************
 *   CPU State                                                                                                                    *
 *********************************************************************************************************************************/
@@ -379,8 +1156,7 @@ static int nemHCLnxImportState(PVMCPUCC pVCpu, uint64_t fWhat, PCPUMCTX pCtx, st
                 }
             }
         }
-        if (fWhat & CPUMCTX_EXTRN_APIC_TPR)
-            APICSetTpr(pVCpu, (uint8_t)pRun->s.regs.sregs.cr8 << 4);
+
         if (fWhat & CPUMCTX_EXTRN_EFER)
         {
             if (pCtx->msrEFER != pRun->s.regs.sregs.efer)
@@ -447,6 +1223,7 @@ static int nemHCLnxImportState(PVMCPUCC pVCpu, uint64_t fWhat, PCPUMCTX pCtx, st
 
             pCtx->aXcr[0] = Xcrs.xcrs[0].value;
             pCtx->aXcr[1] = Xcrs.xcrs[1].value;
+            pCtx->fXStateMask = Xcrs.xcrs[0].value;
         }
     }
 
@@ -494,6 +1271,8 @@ static int nemHCLnxImportState(PVMCPUCC pVCpu, uint64_t fWhat, PCPUMCTX pCtx, st
         if (fWhat & CPUMCTX_EXTRN_OTHER_MSRS)
         {
             ADD_MSR(MSR_IA32_CR_PAT, pCtx->msrPAT);
+            ADD_MSR(MSR_IA32_ARCH_CAPABILITIES, pCtxMsrs->msr.ArchCaps);
+            ADD_MSR(MSR_IA32_SPEC_CTRL, pCtxMsrs->msr.SpecCtrl);
             /** @todo What do we _have_ to add here?
              * We also have: Mttr*, MiscEnable, FeatureControl. */
         }
@@ -531,12 +1310,6 @@ static int nemHCLnxImportState(PVMCPUCC pVCpu, uint64_t fWhat, PCPUMCTX pCtx, st
                                          pVCpu->cpum.GstCtx.rip);
         CPUMUpdateInterruptInhibitingByNmi(&pVCpu->cpum.GstCtx, KvmEvents.nmi.masked != 0);
 
-        if (KvmEvents.interrupt.injected)
-        {
-            STAM_REL_COUNTER_INC(&pVCpu->nem.s.StatImportPendingInterrupt);
-            TRPMAssertTrap(pVCpu, KvmEvents.interrupt.nr, !KvmEvents.interrupt.soft ? TRPM_HARDWARE_INT : TRPM_SOFTWARE_INT);
-        }
-
         Assert(KvmEvents.nmi.injected == 0);
         Assert(KvmEvents.nmi.pending  == 0);
     }
@@ -647,6 +1420,13 @@ VMM_INT_DECL(int) NEMImportStateOnDemand(PVMCPUCC pVCpu, uint64_t fWhat)
  */
 static int nemHCLnxExportState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx, struct kvm_run *pRun)
 {
+    /* Copy src to dst and raise dirty_flag only if they differ, so unchanged registers don't set a KVM dirty bit. */
+#define NEM_UPDATE_IF_CHANGED(dst, src, dirty_flag) \
+        do { if ((src) != (dst)) { \
+            (dst) = (src); \
+            (dirty_flag) = true; \
+        } } while (0)
+
     uint64_t const fExtrn = ~pCtx->fExtrn & CPUMCTX_EXTRN_ALL;
     Assert((~fExtrn & CPUMCTX_EXTRN_ALL) != CPUMCTX_EXTRN_ALL);
 
@@ -655,39 +1435,53 @@ static int nemHCLnxExportState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx, struct kvm_
      */
     if (fExtrn & (CPUMCTX_EXTRN_RIP | CPUMCTX_EXTRN_RFLAGS | CPUMCTX_EXTRN_GPRS_MASK))
     {
-        if (fExtrn & CPUMCTX_EXTRN_RIP)
-            pRun->s.regs.regs.rip    = pCtx->rip;
-        if (fExtrn & CPUMCTX_EXTRN_RFLAGS)
-            pRun->s.regs.regs.rflags = pCtx->rflags.u;
-
-        if (fExtrn & CPUMCTX_EXTRN_RAX)
-            pRun->s.regs.regs.rax    = pCtx->rax;
-        if (fExtrn & CPUMCTX_EXTRN_RCX)
-            pRun->s.regs.regs.rcx    = pCtx->rcx;
-        if (fExtrn & CPUMCTX_EXTRN_RDX)
-            pRun->s.regs.regs.rdx    = pCtx->rdx;
-        if (fExtrn & CPUMCTX_EXTRN_RBX)
-            pRun->s.regs.regs.rbx    = pCtx->rbx;
-        if (fExtrn & CPUMCTX_EXTRN_RSP)
-            pRun->s.regs.regs.rsp    = pCtx->rsp;
-        if (fExtrn & CPUMCTX_EXTRN_RBP)
-            pRun->s.regs.regs.rbp    = pCtx->rbp;
-        if (fExtrn & CPUMCTX_EXTRN_RSI)
-            pRun->s.regs.regs.rsi    = pCtx->rsi;
-        if (fExtrn & CPUMCTX_EXTRN_RDI)
-            pRun->s.regs.regs.rdi    = pCtx->rdi;
+        bool dirty_gprs {false};
+
+        if (fExtrn & CPUMCTX_EXTRN_RIP) {
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rip, pCtx->rip, dirty_gprs);
+        }
+        if (fExtrn & CPUMCTX_EXTRN_RFLAGS) {
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rflags, pCtx->rflags.u, dirty_gprs);
+        }
+
+        if (fExtrn & CPUMCTX_EXTRN_RAX) {
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rax, pCtx->rax, dirty_gprs);
+        }
+        if (fExtrn & CPUMCTX_EXTRN_RCX) {
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rcx, pCtx->rcx, dirty_gprs);
+        }
+        if (fExtrn & CPUMCTX_EXTRN_RDX) {
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rdx, pCtx->rdx, dirty_gprs);
+        }
+        if (fExtrn & CPUMCTX_EXTRN_RBX) {
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rbx, pCtx->rbx, dirty_gprs);
+        }
+        if (fExtrn & CPUMCTX_EXTRN_RSP) {
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rsp, pCtx->rsp, dirty_gprs);
+        }
+        if (fExtrn & CPUMCTX_EXTRN_RBP) {
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rbp, pCtx->rbp, dirty_gprs);
+        }
+        if (fExtrn & CPUMCTX_EXTRN_RSI) {
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rsi, pCtx->rsi, dirty_gprs);
+        }
+        if (fExtrn & CPUMCTX_EXTRN_RDI) {
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.rdi, pCtx->rdi, dirty_gprs);
+        }
         if (fExtrn & CPUMCTX_EXTRN_R8_R15)
         {
-            pRun->s.regs.regs.r8     = pCtx->r8;
-            pRun->s.regs.regs.r9     = pCtx->r9;
-            pRun->s.regs.regs.r10    = pCtx->r10;
-            pRun->s.regs.regs.r11    = pCtx->r11;
-            pRun->s.regs.regs.r12    = pCtx->r12;
-            pRun->s.regs.regs.r13    = pCtx->r13;
-            pRun->s.regs.regs.r14    = pCtx->r14;
-            pRun->s.regs.regs.r15    = pCtx->r15;
-        }
-        pRun->kvm_dirty_regs |= KVM_SYNC_X86_REGS;
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.r8, pCtx->r8, dirty_gprs);
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.r9, pCtx->r9, dirty_gprs);
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.r10, pCtx->r10, dirty_gprs);
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.r11, pCtx->r11, dirty_gprs);
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.r12, pCtx->r12, dirty_gprs);
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.r13, pCtx->r13, dirty_gprs);
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.r14, pCtx->r14, dirty_gprs);
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.regs.r15, pCtx->r15, dirty_gprs);
+        }
+        if (dirty_gprs) {
+            pRun->kvm_dirty_regs |= KVM_SYNC_X86_REGS;
+        }
     }
 
     /*
@@ -701,15 +1495,7 @@ static int nemHCLnxExportState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx, struct kvm_
                       | CPUMCTX_EXTRN_EFER      | CPUMCTX_EXTRN_APIC_TPR))
         || uApicBase != pVCpu->nem.s.uKvmApicBase)
     {
-        if ((pVCpu->nem.s.uKvmApicBase ^ uApicBase) & MSR_IA32_APICBASE_EN)
-            Log(("NEM/%u: APICBASE_EN changed %#010RX64 -> %#010RX64\n", pVCpu->idCpu, pVCpu->nem.s.uKvmApicBase, uApicBase));
-        pRun->s.regs.sregs.apic_base = uApicBase;
-        pVCpu->nem.s.uKvmApicBase    = uApicBase;
-
-        if (fExtrn & CPUMCTX_EXTRN_APIC_TPR)
-            pRun->s.regs.sregs.cr8   = CPUMGetGuestCR8(pVCpu);
-
-#define NEM_LNX_EXPORT_SEG(a_KvmSeg, a_CtxSeg) do { \
+#define NEM_LNX_EXPORT_SEG(a_KvmSeg, a_CtxSeg, dirty_flag) do { \
             (a_KvmSeg).base     = (a_CtxSeg).u64Base; \
             (a_KvmSeg).limit    = (a_CtxSeg).u32Limit; \
             (a_KvmSeg).selector = (a_CtxSeg).Sel; \
@@ -723,64 +1509,123 @@ static int nemHCLnxExportState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx, struct kvm_
             (a_KvmSeg).g        = (a_CtxSeg).Attr.n.u1Granularity; \
             (a_KvmSeg).unusable = (a_CtxSeg).Attr.n.u1Unusable; \
             (a_KvmSeg).padding  = 0; \
+            dirty_flag = true; \
         } while (0)
+#define NEM_LNX_SREG_IDENTICAL(a_KvmSeg, a_CtxSeg) ( \
+            (a_KvmSeg).base     == (a_CtxSeg).u64Base && \
+            (a_KvmSeg).limit    == (a_CtxSeg).u32Limit && \
+            (a_KvmSeg).selector == (a_CtxSeg).Sel && \
+            (a_KvmSeg).type     == (a_CtxSeg).Attr.n.u4Type && \
+            (a_KvmSeg).s        == (a_CtxSeg).Attr.n.u1DescType && \
+            (a_KvmSeg).dpl      == (a_CtxSeg).Attr.n.u2Dpl && \
+            (a_KvmSeg).present  == (a_CtxSeg).Attr.n.u1Present && \
+            (a_KvmSeg).avl      == (a_CtxSeg).Attr.n.u1Available && \
+            (a_KvmSeg).l        == (a_CtxSeg).Attr.n.u1Long && \
+            (a_KvmSeg).db       == (a_CtxSeg).Attr.n.u1DefBig && \
+            (a_KvmSeg).g        == (a_CtxSeg).Attr.n.u1Granularity && \
+            (a_KvmSeg).unusable == (a_CtxSeg).Attr.n.u1Unusable \
+        )
+        bool dirty_sregs = false;
+
+        if ((pVCpu->nem.s.uKvmApicBase ^ uApicBase) & MSR_IA32_APICBASE_EN)
+            Log(("NEM/%u: APICBASE_EN changed %#010RX64 -> %#010RX64\n", pVCpu->idCpu, pVCpu->nem.s.uKvmApicBase, uApicBase));
+
+        NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.apic_base, uApicBase, dirty_sregs);
+        NEM_UPDATE_IF_CHANGED(pVCpu->nem.s.uKvmApicBase, uApicBase, dirty_sregs);
 
         if (fExtrn & CPUMCTX_EXTRN_SREG_MASK)
         {
-            if (fExtrn & CPUMCTX_EXTRN_ES)
-                NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.es, pCtx->es);
-            if (fExtrn & CPUMCTX_EXTRN_CS)
-                NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.cs, pCtx->cs);
-            if (fExtrn & CPUMCTX_EXTRN_SS)
-                NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.ss, pCtx->ss);
-            if (fExtrn & CPUMCTX_EXTRN_DS)
-                NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.ds, pCtx->ds);
-            if (fExtrn & CPUMCTX_EXTRN_FS)
-                NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.fs, pCtx->fs);
-            if (fExtrn & CPUMCTX_EXTRN_GS)
-                NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.gs, pCtx->gs);
+            if (fExtrn & CPUMCTX_EXTRN_ES and not NEM_LNX_SREG_IDENTICAL(pRun->s.regs.sregs.es, pCtx->es)) {
+                NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.es, pCtx->es, dirty_sregs);
+            }
+            if (fExtrn & CPUMCTX_EXTRN_CS and not NEM_LNX_SREG_IDENTICAL(pRun->s.regs.sregs.cs, pCtx->cs)) {
+                NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.cs, pCtx->cs, dirty_sregs);
+            }
+            if (fExtrn & CPUMCTX_EXTRN_SS and not NEM_LNX_SREG_IDENTICAL(pRun->s.regs.sregs.ss, pCtx->ss)) {
+                NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.ss, pCtx->ss, dirty_sregs);
+            }
+            if (fExtrn & CPUMCTX_EXTRN_DS and not NEM_LNX_SREG_IDENTICAL(pRun->s.regs.sregs.ds, pCtx->ds)) {
+                NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.ds, pCtx->ds, dirty_sregs);
+            }
+            if (fExtrn & CPUMCTX_EXTRN_FS and not NEM_LNX_SREG_IDENTICAL(pRun->s.regs.sregs.fs, pCtx->fs)) {
+                NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.fs, pCtx->fs, dirty_sregs);
+            }
+            if (fExtrn & CPUMCTX_EXTRN_GS and not NEM_LNX_SREG_IDENTICAL(pRun->s.regs.sregs.gs, pCtx->gs)) {
+                NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.gs, pCtx->gs, dirty_sregs);
+            }
+
         }
         if (fExtrn & CPUMCTX_EXTRN_TABLE_MASK)
         {
             if (fExtrn & CPUMCTX_EXTRN_GDTR)
             {
-                pRun->s.regs.sregs.gdt.base  = pCtx->gdtr.pGdt;
-                pRun->s.regs.sregs.gdt.limit = pCtx->gdtr.cbGdt;
+                NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.gdt.base, pCtx->gdtr.pGdt, dirty_sregs);
+                NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.gdt.limit, pCtx->gdtr.cbGdt, dirty_sregs);
                 pRun->s.regs.sregs.gdt.padding[0] = 0;
                 pRun->s.regs.sregs.gdt.padding[1] = 0;
                 pRun->s.regs.sregs.gdt.padding[2] = 0;
             }
             if (fExtrn & CPUMCTX_EXTRN_IDTR)
             {
-                pRun->s.regs.sregs.idt.base  = pCtx->idtr.pIdt;
-                pRun->s.regs.sregs.idt.limit = pCtx->idtr.cbIdt;
+                NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.idt.base, pCtx->idtr.pIdt, dirty_sregs);
+                NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.idt.limit, pCtx->idtr.cbIdt, dirty_sregs);
                 pRun->s.regs.sregs.idt.padding[0] = 0;
                 pRun->s.regs.sregs.idt.padding[1] = 0;
                 pRun->s.regs.sregs.idt.padding[2] = 0;
             }
-            if (fExtrn & CPUMCTX_EXTRN_LDTR)
-                NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.ldt, pCtx->ldtr);
-            if (fExtrn & CPUMCTX_EXTRN_TR)
-                NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.tr, pCtx->tr);
+            if (fExtrn & CPUMCTX_EXTRN_LDTR and not NEM_LNX_SREG_IDENTICAL(pRun->s.regs.sregs.ldt, pCtx->ldtr)) {
+                NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.ldt, pCtx->ldtr, dirty_sregs);
+            }
+            if (fExtrn & CPUMCTX_EXTRN_TR and not NEM_LNX_SREG_IDENTICAL(pRun->s.regs.sregs.tr, pCtx->tr)) {
+                NEM_LNX_EXPORT_SEG(pRun->s.regs.sregs.tr, pCtx->tr, dirty_sregs);
+            }
+
         }
         if (fExtrn & CPUMCTX_EXTRN_CR_MASK)
         {
-            if (fExtrn & CPUMCTX_EXTRN_CR0)
-                pRun->s.regs.sregs.cr0   = pCtx->cr0;
-            if (fExtrn & CPUMCTX_EXTRN_CR2)
-                pRun->s.regs.sregs.cr2   = pCtx->cr2;
-            if (fExtrn & CPUMCTX_EXTRN_CR3)
-                pRun->s.regs.sregs.cr3   = pCtx->cr3;
-            if (fExtrn & CPUMCTX_EXTRN_CR4)
-                pRun->s.regs.sregs.cr4   = pCtx->cr4;
+            if (fExtrn & CPUMCTX_EXTRN_CR0) {
+                NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.cr0, pCtx->cr0, dirty_sregs);
+            }
+            if (fExtrn & CPUMCTX_EXTRN_CR2) {
+                NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.cr2, pCtx->cr2, dirty_sregs);
+            }
+            if (fExtrn & CPUMCTX_EXTRN_CR3) {
+                NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.cr3, pCtx->cr3, dirty_sregs);
+            }
+            if (fExtrn & CPUMCTX_EXTRN_CR4) {
+                NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.cr4, pCtx->cr4, dirty_sregs);
+            }
         }
-        if (fExtrn & CPUMCTX_EXTRN_EFER)
-            pRun->s.regs.sregs.efer   = pCtx->msrEFER;
+        if (fExtrn & CPUMCTX_EXTRN_EFER) {
+            NEM_UPDATE_IF_CHANGED(pRun->s.regs.sregs.efer, pCtx->msrEFER, dirty_sregs);
+        }
+
 
-        RT_ZERO(pRun->s.regs.sregs.interrupt_bitmap); /* this is an alternative interrupt injection interface */
+        if (dirty_sregs) {
+            pRun->kvm_dirty_regs |= KVM_SYNC_X86_SREGS;
+        } else {
+            // This is a very weird and poorly documented part of the kvm_run structure.
+            // https://www.kernel.org/doc/html/latest/virt/kvm/api.html explains this the following way:
+            //
+            //   interrupt_bitmap is a bitmap of pending external interrupts. At most one bit may be set.
+            //   This interrupt has been acknowledged by the APIC but not yet injected into the cpu core.
+            //
+            // Looking at the kernel part of SET/GET_SREGS, we can see that this is kinda true, but not quite.
+            // The kernel sets only 1 bit, but never clears any of the fields. Thus, in order to have only
+            // a single bit set, userspace must clear the bitmap iff we haven't modified any SREGS. If we have
+            // modified SREGS, we have to transfer the unmodified bitmap back to KVM, because otherwise, we
+            // would tell KVM that the injection is no longer pending.
+            //
+            //
+            // This is a nasty interface and we should probably do what Qemu does, that is, using SET/GET_SREGS2
+            // where this field is no longer present.
+            RT_ZERO(pRun->s.regs.sregs.interrupt_bitmap);
+        }
 
-        pRun->kvm_dirty_regs |= KVM_SYNC_X86_SREGS;
     }
+#undef NEM_LNX_EXPORT_SEG
+#undef NEM_LNX_SREG_IDENTICAL
+#undef NEM_UPDATE_IF_CHANGED
 
     /*
      * Debug registers.
@@ -886,6 +1731,8 @@ static int nemHCLnxExportState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx, struct kvm_
         if (fExtrn & CPUMCTX_EXTRN_OTHER_MSRS)
         {
             ADD_MSR(MSR_IA32_CR_PAT, pCtx->msrPAT);
+            ADD_MSR(MSR_IA32_ARCH_CAPABILITIES, pCtxMsrs->msr.ArchCaps);
+            ADD_MSR(MSR_IA32_SPEC_CTRL, pCtxMsrs->msr.SpecCtrl);
             /** @todo What do we _have_ to add here?
              * We also have: Mttr*, MiscEnable, FeatureControl. */
         }
@@ -912,6 +1759,8 @@ static int nemHCLnxExportState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx, struct kvm_
                ==           (CPUMCTX_EXTRN_INHIBIT_INT | CPUMCTX_EXTRN_INHIBIT_NMI));
 
         struct kvm_vcpu_events KvmEvents = {0};
+        int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_GET_VCPU_EVENTS, &KvmEvents);
+        AssertLogRelMsgReturn(rcLnx == 0, ("rcLnx=%d errno=%d\n", rcLnx, errno), VERR_NEM_IPE_5);
 
         KvmEvents.flags = KVM_VCPUEVENT_VALID_SHADOW;
         if (!CPUMIsInInterruptShadowWithUpdate(&pVCpu->cpum.GstCtx))
@@ -923,26 +1772,7 @@ static int nemHCLnxExportState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx, struct kvm_
         /* No flag - this is updated unconditionally. */
         KvmEvents.nmi.masked = CPUMAreInterruptsInhibitedByNmi(&pVCpu->cpum.GstCtx);
 
-        if (TRPMHasTrap(pVCpu))
-        {
-            TRPMEVENT enmType = TRPM_32BIT_HACK;
-            uint8_t   bTrapNo = 0;
-            TRPMQueryTrap(pVCpu, &bTrapNo, &enmType);
-            Log(("nemHCLnxExportState: Pending trap: bTrapNo=%#x enmType=%d\n", bTrapNo, enmType));
-            if (   enmType == TRPM_HARDWARE_INT
-                || enmType == TRPM_SOFTWARE_INT)
-            {
-                KvmEvents.interrupt.soft     = enmType == TRPM_SOFTWARE_INT;
-                KvmEvents.interrupt.nr       = bTrapNo;
-                KvmEvents.interrupt.injected = 1;
-                STAM_REL_COUNTER_INC(&pVCpu->nem.s.StatExportPendingInterrupt);
-                TRPMResetTrap(pVCpu);
-            }
-            else
-                AssertFailed();
-        }
-
-        int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_SET_VCPU_EVENTS, &KvmEvents);
+        rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_SET_VCPU_EVENTS, &KvmEvents);
         AssertLogRelMsgReturn(rcLnx == 0, ("rcLnx=%d errno=%d\n", rcLnx, errno), VERR_NEM_IPE_3);
     }
 
@@ -967,8 +1797,31 @@ static int nemHCLnxExportState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx, struct kvm_
 VMM_INT_DECL(int) NEMHCQueryCpuTick(PVMCPUCC pVCpu, uint64_t *pcTicks, uint32_t *puAux)
 {
     STAM_REL_COUNTER_INC(&pVCpu->nem.s.StatQueryCpuTick);
-    // KVM_GET_CLOCK?
-    RT_NOREF(pVCpu, pcTicks, puAux);
+
+    /*
+     * Called when the VM is paused or suspended, once for every vCPU. A single
+     * KVM_GET_MSRS request fetches both the TSC and the TSC_AUX value.
+     */
+    const size_t NMSRS = 2;
+    size_t const cbRequest = RT_UOFFSETOF_DYN(struct kvm_msrs, entries[NMSRS]);
+
+    struct kvm_msrs *pMsrs = static_cast<kvm_msrs *>(alloca(cbRequest));
+    memset(pMsrs, 0, cbRequest);
+    pMsrs->nmsrs            = NMSRS;
+    pMsrs->entries[0].index = MSR_IA32_TSC;
+    pMsrs->entries[1].index = MSR_K8_TSC_AUX;
+
+    /* KVM_GET_MSRS returns the number of MSRs it read; anything but all of them is an error. */
+    int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_GET_MSRS, pMsrs);
+    AssertLogRelMsgReturn(rcLnx == NMSRS, ("rcLnx=%d errno=%d\n", rcLnx, errno), VERR_NEM_IPE_5);
+
+    /* Both output pointers are optional. */
+    if (pcTicks)
+        *pcTicks = pMsrs->entries[0].data;
+
+    if (puAux)
+        *puAux = static_cast<uint32_t>(pMsrs->entries[1].data);
+
     return VINF_SUCCESS;
 }
 
@@ -985,8 +1838,39 @@ VMM_INT_DECL(int) NEMHCQueryCpuTick(PVMCPUCC pVCpu, uint64_t *pcTicks, uint32_t
  */
 VMM_INT_DECL(int) NEMHCResumeCpuTickOnAll(PVMCC pVM, PVMCPUCC pVCpu, uint64_t uPausedTscValue)
 {
-    // KVM_SET_CLOCK?
-    RT_NOREF(pVM, pVCpu, uPausedTscValue);
+    /*
+     * Called exactly once on unpause/resume - despite the pVCpu parameter it
+     * is _not_ invoked per vCPU - so pVCpu is unused and we walk all vCPUs.
+     */
+    RT_NOREF(pVCpu);
+
+    const size_t NMSRS = 1;
+    size_t const cbRequest = RT_UOFFSETOF_DYN(struct kvm_msrs, entries[NMSRS]);
+
+    struct kvm_msrs *pMsrs = static_cast<kvm_msrs *>(alloca(cbRequest));
+    memset(pMsrs, 0, cbRequest);
+    pMsrs->nmsrs            = NMSRS;
+    pMsrs->entries[0].index = MSR_IA32_TSC;
+    pMsrs->entries[0].data  = uPausedTscValue;
+
+    /*
+     * Writing the TSC of each vCPU individually is fundamentally flawed: the
+     * TSCs keep ticking while we set them, so the values only end up
+     * synchronized if KVM's built-in TSC synchronization magic fixes things
+     * up for us. The interface doesn't leave us much choice for now.
+     *
+     * A better approach would be KVM_GET_CLOCK/KVM_SET_CLOCK plus restoring
+     * the TSC_ADJUST values, but that needs validation first.
+     */
+    for (VMCPUID iCpu = 0; iCpu < pVM->cCpus; iCpu++)
+    {
+        PVMCPU pVCpuCur = pVM->apCpusR3[iCpu];
+        int rcLnx = ioctl(pVCpuCur->nem.s.fdVCpu, KVM_SET_MSRS, pMsrs);
+
+        /* KVM_SET_MSRS reports how many MSRs were written. */
+        AssertLogRelMsgReturn(rcLnx == NMSRS, ("rcLnx=%d errno=%d\n", rcLnx, errno), VERR_NEM_IPE_5);
+    }
+
     return VINF_SUCCESS;
 }
 
@@ -1008,6 +1892,7 @@ VMM_INT_DECL(uint32_t) NEMHCGetFeatures(PVMCC pVM)
 
 VMMR3_INT_DECL(bool) NEMR3CanExecuteGuest(PVM pVM, PVMCPU pVCpu)
 {
+#ifndef VBOX_WITH_KVM_IRQCHIP_FULL
     /*
      * Only execute when the A20 gate is enabled as I cannot immediately
      * spot any A20 support in KVM.
@@ -1015,6 +1900,15 @@ VMMR3_INT_DECL(bool) NEMR3CanExecuteGuest(PVM pVM, PVMCPU pVCpu)
     RT_NOREF(pVM);
     Assert(VM_IS_NEM_ENABLED(pVM));
     return PGMPhysIsA20Enabled(pVCpu);
+#else
+    /*
+     * In full-irqchip mode, we always need to execute via KVM because we
+     * have no other way to inject interrupts into the guest (because the PIC
+     * is in the kernel!). Otherwise, we would break non-UEFI boot. Ignoring
+     * the A20 gate state this way will break DOS support.
+     */
+    return true;
+#endif
 }
 
 
@@ -1027,6 +1921,14 @@ DECLHIDDEN(bool) nemR3NativeSetSingleInstruction(PVM pVM, PVMCPU pVCpu, bool fEn
 
 DECLHIDDEN(void) nemR3NativeNotifyFF(PVM pVM, PVMCPU pVCpu, uint32_t fFlags)
 {
+    if (pVCpu->hThread == RTThreadSelf()) {
+        // RTThreadPoke doesn't like poking the current thread. We can
+        // safely return here because the vCPU thread is currently handling
+        // an exit and will check all conditions again when we re-enter
+        // the run-loop.
+        return;
+    }
+
     int rc = RTThreadPoke(pVCpu->hThread);
     LogFlow(("nemR3NativeNotifyFF: #%u -> %Rrc\n", pVCpu->idCpu, rc));
     AssertRC(rc);
@@ -1060,12 +1962,10 @@ static VBOXSTRICTRC nemHCLnxHandleInterruptFF(PVM pVM, PVMCPU pVCpu, struct kvm_
      * only inject one event per KVM_RUN call.  This can only happend if we
      * can directly from the loop in EM, so the inhibit bits must be internal.
      */
-    if (!TRPMHasTrap(pVCpu))
-    { /* semi likely */ }
-    else
+    if (TRPMHasTrap(pVCpu))
     {
-        Assert(!(pVCpu->cpum.GstCtx.fExtrn & (CPUMCTX_EXTRN_INHIBIT_INT | CPUMCTX_EXTRN_INHIBIT_NMI)));
         Log8(("nemHCLnxHandleInterruptFF: TRPM has an pending event already\n"));
+
         return VINF_SUCCESS;
     }
 
@@ -1074,12 +1974,12 @@ static VBOXSTRICTRC nemHCLnxHandleInterruptFF(PVM pVM, PVMCPU pVCpu, struct kvm_
      */
     if (VMCPU_FF_TEST_AND_CLEAR(pVCpu, VMCPU_FF_UPDATE_APIC))
     {
-        PDMApicUpdatePendingInterrupts(pVCpu);
-        if (!VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC
-                                      | VMCPU_FF_INTERRUPT_NMI  | VMCPU_FF_INTERRUPT_SMI))
-            return VINF_SUCCESS;
+        AssertLogRelMsgReturn(false, ("VMCPU_FF_UPDATE_APIC is set"), VERR_NEM_IPE_5);
     }
 
+    if (!VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_PIC | VMCPU_FF_INTERRUPT_NMI  | VMCPU_FF_INTERRUPT_SMI))
+        return VINF_SUCCESS;
+
     /*
      * We don't currently implement SMIs.
      */
@@ -1135,35 +2035,24 @@ static VBOXSTRICTRC nemHCLnxHandleInterruptFF(PVM pVM, PVMCPU pVCpu, struct kvm_
         Log8(("Queuing NMI on %u\n", pVCpu->idCpu));
     }
 
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+    AssertLogRelMsg(!VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_PIC), ("PDM has pic interrupt but full irqchip is enabled"));
+#else
     /*
-     * APIC or PIC interrupt?
+     * PIC interrupt?
      */
-    if (VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC))
+    if (VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_PIC))
     {
         if (pRun->s.regs.regs.rflags & X86_EFL_IF)
         {
-            if (KvmEvents.interrupt.shadow == 0)
+            if (pRun->ready_for_interrupt_injection)
             {
-                /*
-                 * If CR8 is in KVM, update the VBox copy so PDMGetInterrupt will
-                 * work correctly.
-                 */
-                if (pVCpu->cpum.GstCtx.fExtrn & CPUMCTX_EXTRN_APIC_TPR)
-                    PDMApicSetTpr(pVCpu, (uint8_t)pRun->cr8 << 4);
-
                 uint8_t bInterrupt;
                 int rc = PDMGetInterrupt(pVCpu, &bInterrupt);
                 if (RT_SUCCESS(rc))
                 {
-                    Assert(KvmEvents.interrupt.injected == false);
-#if 0
-                    int rcLnx = ioctl(pVCpu->nem.s.fdVm, KVM_INTERRUPT, (unsigned long)bInterrupt);
-                    AssertLogRelMsgReturn(rcLnx == 0, ("rcLnx=%d errno=%d\n", rcLnx, errno), VERR_NEM_IPE_5);
-#else
-                    KvmEvents.interrupt.nr       = bInterrupt;
-                    KvmEvents.interrupt.soft     = false;
-                    KvmEvents.interrupt.injected = true;
-#endif
+                    TRPMAssertTrap(pVCpu, bInterrupt, TRPM_HARDWARE_INT);
+
                     Log8(("Queuing interrupt %#x on %u: %04x:%08RX64 efl=%#x\n", bInterrupt, pVCpu->idCpu,
                           pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, pVCpu->cpum.GstCtx.eflags.u));
                 }
@@ -1184,7 +2073,7 @@ static VBOXSTRICTRC nemHCLnxHandleInterruptFF(PVM pVM, PVMCPU pVCpu, struct kvm_
             Log8(("Interrupt window pending on %u (#1)\n", pVCpu->idCpu));
         }
     }
-
+#endif
     /*
      * Now, update the state.
      */
@@ -1371,6 +2260,16 @@ static VBOXSTRICTRC nemHCLnxHandleExitMmio(PVMCC pVM, PVMCPUCC pVCpu, struct kvm
     VBOXSTRICTRC rcStrict;
     if (pRun->mmio.is_write)
     {
+        /*
+         * Sync LAPIC TPR register with cr8 from KVM. This is required as long
+         * as we don't use KVM's IRQCHIP feature.
+         *
+         * This doesn't cover the X2APIC mode. But the whole cr8-code will be
+         * gone very soon anyway as we will use KVM's split-irqchip.
+         */
+        if (pRun->mmio.phys_addr == XAPIC_TPR_ADDR) {
+            pRun->cr8 = *pRun->mmio.data >> LAPIC_TPR_SHIFT;
+        }
         rcStrict = PGMPhysWrite(pVM, pRun->mmio.phys_addr, pRun->mmio.data, pRun->mmio.len, PGMACCESSORIGIN_HM);
         Log4(("MmioExit/%u: %04x:%08RX64: WRITE %#x LB %u, %.*Rhxs -> rcStrict=%Rrc\n",
               pVCpu->idCpu, pRun->s.regs.sregs.cs.selector, pRun->s.regs.regs.rip,
@@ -1470,8 +2369,6 @@ static VBOXSTRICTRC nemHCLnxHandleExitWrMsr(PVMCPUCC pVCpu, struct kvm_run *pRun
     return rcStrict;
 }
 
-
-
 static VBOXSTRICTRC nemHCLnxHandleExit(PVMCC pVM, PVMCPUCC pVCpu, struct kvm_run *pRun, bool *pfStatefulExit)
 {
     STAM_REL_COUNTER_INC(&pVCpu->nem.s.StatExitTotal);
@@ -1500,12 +2397,10 @@ static VBOXSTRICTRC nemHCLnxHandleExit(PVMCC pVM, PVMCPUCC pVCpu, struct kvm_run
             return VINF_SUCCESS;
 
         case KVM_EXIT_SET_TPR:
-            STAM_REL_COUNTER_INC(&pVCpu->nem.s.StatExitSetTpr);
             AssertFailed();
             break;
 
         case KVM_EXIT_TPR_ACCESS:
-            STAM_REL_COUNTER_INC(&pVCpu->nem.s.StatExitTprAccess);
             AssertFailed();
             break;
 
@@ -1531,6 +2426,10 @@ static VBOXSTRICTRC nemHCLnxHandleExit(PVMCC pVM, PVMCPUCC pVCpu, struct kvm_run
                              pRun->s.regs.regs.rip + pRun->s.regs.sregs.cs.base, ASMReadTSC());
             STAM_REL_COUNTER_INC(&pVCpu->nem.s.StatExitIntr);
             Log5(("Intr/%u\n", pVCpu->idCpu));
+
+            /* If we don't consume the poke signal, subsequent KVM_RUN invocations will immediately return EINTR again. */
+            nemR3LnxConsumePokeSignal();
+
             return VINF_SUCCESS;
 
         case KVM_EXIT_HYPERCALL:
@@ -1547,11 +2446,48 @@ static VBOXSTRICTRC nemHCLnxHandleExit(PVMCC pVM, PVMCPUCC pVCpu, struct kvm_run
             AssertFailed();
             break;
         case KVM_EXIT_IOAPIC_EOI:
-            AssertFailed();
-            break;
+            PDMIoApicBroadcastEoi(pVM, pRun->eoi.vector);
+            return VINF_SUCCESS;
         case KVM_EXIT_HYPERV:
-            AssertFailed();
-            break;
+            Assert(pVM->gim.s.enmProviderId == GIMPROVIDERID_HYPERV);
+
+            switch (pRun->hyperv.type)
+            {
+            case KVM_EXIT_HYPERV_SYNDBG:
+                /* The synthetic debugger is not enabled and we should not get these exits. */
+                AssertFailed();
+                break;
+            case KVM_EXIT_HYPERV_HCALL:
+                LogRel2(("Hyper-V hcall input:%lx p0:%lx p1:%lx\n", pRun->hyperv.u.hcall.input, pRun->hyperv.u.hcall.params[0], pRun->hyperv.u.hcall.params[1]));
+
+                /* TODO KVM handles the performance-critical hypercalls on its own. We get mostly extended hypercalls
+                   here. We would need to forward them to gimHvHypercall. None of these features are enabled right now,
+                   so we can just deny the hypercall right away. */
+
+                pRun->hyperv.u.hcall.result = GIM_HV_STATUS_ACCESS_DENIED;
+                break;
+            case KVM_EXIT_HYPERV_SYNIC:
+                LogRel2(("HyperV synic msr:%lx control:%lx evt_page:%lx msg_page:%lx\n",
+                         pRun->hyperv.u.synic.msr,
+                         pRun->hyperv.u.synic.control,
+                         pRun->hyperv.u.synic.evt_page,
+                         pRun->hyperv.u.synic.msg_page));
+
+                switch (pRun->hyperv.u.synic.msr)
+                {
+                case MSR_GIM_HV_SCONTROL:
+                case MSR_GIM_HV_SIMP:
+                case MSR_GIM_HV_SIEFP:
+                    break;
+                default:
+                    AssertReleaseFailed();
+                }
+                break;
+            default:
+                AssertReleaseFailed();
+            }
+
+            return VINF_SUCCESS;
 
         case KVM_EXIT_DIRTY_RING_FULL:
             AssertFailed();
@@ -1619,6 +2555,82 @@ static VBOXSTRICTRC nemHCLnxHandleExit(PVMCC pVM, PVMCPUCC pVCpu, struct kvm_run
     return VERR_NOT_IMPLEMENTED;
 }
 
+static VBOXSTRICTRC nemHCLnxHandleTimers(PVMCC pVM, PVMCPUCC pVCpu)
+{
+    uint64_t nsAbsNextTimerEvt;
+    uint64_t uTscNow;
+    uint64_t nsDelta = TMVirtualSyncGetNsToDeadline(pVM, &nsAbsNextTimerEvt, &uTscNow);
+
+    [[maybe_unused]] uint64_t const nsAbsOldTimerEvt = pVCpu->nem.s.nsAbsNextTimerEvt;
+
+    pVCpu->nem.s.nsAbsNextTimerEvt = nsAbsNextTimerEvt;
+
+    /*
+     * Optional optimization (VBOX_KVM_DONT_REPROGRAM_TIMERS): skip re-arming the timer when the next timer event is
+     * unchanged. We can enable this when we are confident that everything works correctly.
+     */
+#ifdef VBOX_KVM_DONT_REPROGRAM_TIMERS
+    if (nsAbsOldTimerEvt == nsAbsNextTimerEvt) {
+        return VINF_SUCCESS;
+    }
+#endif
+
+    if (nsDelta == 0) {
+        /* If there is no timeout, program a 1 ms catch-all timer instead (a zero it_value would disarm the timer). */
+        nsDelta = RT_NS_1MS_64;
+    } else if (nsDelta >= RT_NS_1SEC_64) {
+        /* We need to exit regularly, so long timeouts are capped at one second. */
+        nsDelta = RT_NS_1SEC_64;
+    }
+
+    struct itimerspec timeout {};
+
+    /*
+     * Arm the per-vCPU timer as a relative one-shot (it_interval stays zero). Absolute timeouts would be more
+     * accurate, but VBox times do not correlate to any Linux timer.
+     */
+    timeout.it_value.tv_sec = nsDelta / RT_NS_1SEC_64;
+    timeout.it_value.tv_nsec = nsDelta % RT_NS_1SEC_64;
+
+    int rcTimer = timer_settime(pVCpu->nem.s.pTimer, 0 /* relative timeout */,
+                                    &timeout, nullptr);
+    AssertLogRel(rcTimer == 0);
+
+    return VINF_SUCCESS;
+}
+
+/**
+ * Hands an interrupt queued in TRPM over to KVM via KVM_INTERRUPT. With the full
+ * in-kernel irqchip a pending TRPM trap is only reported by a release assertion;
+ * otherwise only hardware interrupts are forwarded, the rest is VERR_NOT_SUPPORTED.
+ */
+static VBOXSTRICTRC nemHCLnxCheckAndInjectInterrupts(PVMCPUCC pVCpu)
+{
+#ifdef VBOX_WITH_KVM_IRQCHIP_FULL
+    NOREF(pVCpu);
+    AssertLogRelMsg(!TRPMHasTrap(pVCpu), ("TRPM has trap but full irqchip is enabled"));
+    return VINF_SUCCESS;
+#else
+    if (!TRPMHasTrap(pVCpu))
+        return VINF_SUCCESS;
+
+    TRPMEVENT enmType = TRPM_32BIT_HACK;
+    uint8_t   bTrapNo = 0;
+    TRPMQueryTrap(pVCpu, &bTrapNo, &enmType);
+    Log(("nemHCLnxCheckAndInjectInterrupts: Pending trap: bTrapNo=%#x enmType=%d\n", bTrapNo, enmType));
+    if (enmType != TRPM_HARDWARE_INT)
+        return VERR_NOT_SUPPORTED;
+
+    struct kvm_interrupt KvmIrq;
+    RT_ZERO(KvmIrq);
+    KvmIrq.irq = bTrapNo;
+    int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_INTERRUPT, &KvmIrq);
+    AssertLogRelMsgReturn(rcLnx == 0, ("rcLnx=%d errno=%d\n", rcLnx, errno), VERR_NEM_IPE_5);
+
+    TRPMResetTrap(pVCpu);
+    return VINF_SUCCESS;
+#endif
+}
 
 VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
 {
@@ -1636,6 +2648,28 @@ VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
         return VINF_SUCCESS;
     }
 
+    /*
+     * The first time we come here, we have to apply Spectre mitigations. The prctl interface allows us to set
+     * these only for the current thread, hence the per-vCPU fMitigationsApplied flag.
+     */
+    if (!pVCpu->nem.s.fMitigationsApplied) {
+        Log(("NEM/%u: applying mitigations\n", pVCpu->idCpu));
+        if (pVM->hm.s.fIbpbOnVmEntry || pVM->hm.s.fIbpbOnVmExit) {
+            int rcLnx = prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0);
+            /* EPERM means the kernel's prctl control is not activated (see message); warn and carry on. */
+            if (rcLnx != 0 && errno == EPERM) {
+                LogRel(("WARNING: requested IBPB, but kernel API is not activated! Boot Linux with spectre_v2_user=prctl.\n"));
+            } else {
+                AssertLogRelMsgReturn(rcLnx == 0,
+                                      ("rcLnx=%d errno=%d\n", rcLnx, errno),
+                                      VERR_NEM_MISSING_KERNEL_API_1);
+                Log(("NEM/%u: enabled IBPB\n", pVCpu->idCpu));
+            }
+        }
+
+        pVCpu->nem.s.fMitigationsApplied = true;
+    }
+
     /*
      * The run loop.
      */
@@ -1664,6 +2698,8 @@ VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
             }
         }
 
+    // See NEMR3CanExecuteGuest for details why we ignore A20 at this point.
+#ifndef VBOX_WITH_KVM_IRQCHIP_FULL
         /*
          * Do not execute in KVM if the A20 isn't enabled.
          */
@@ -1675,6 +2711,7 @@ VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
             LogFlow(("NEM/%u: breaking: A20 disabled\n", pVCpu->idCpu));
             break;
         }
+#endif
 
         /*
          * Ensure KVM has the whole state.
@@ -1685,17 +2722,9 @@ VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
             AssertRCReturn(rc2, rc2);
         }
 
-        /*
-         * Poll timers and run for a bit.
-         *
-         * With the VID approach (ring-0 or ring-3) we can specify a timeout here,
-         * so we take the time of the next timer event and uses that as a deadline.
-         * The rounding heuristics are "tuned" so that rhel5 (1K timer) will boot fine.
-         */
-        /** @todo See if we cannot optimize this TMTimerPollGIP by only redoing
-         *        the whole polling job when timers have changed... */
-        uint64_t       offDeltaIgnored;
-        uint64_t const nsNextTimerEvt = TMTimerPollGIP(pVM, pVCpu, &offDeltaIgnored); NOREF(nsNextTimerEvt);
+        /* Poll timers and run for a bit. */
+        nemHCLnxHandleTimers(pVM, pVCpu);
+
         if (   !VM_FF_IS_ANY_SET(pVM, VM_FF_EMT_RENDEZVOUS | VM_FF_TM_VIRTUAL_SYNC)
             && !VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_HM_TO_R3_MASK))
         {
@@ -1705,13 +2734,25 @@ VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
                          pVCpu->idCpu, pRun->s.regs.sregs.cs.selector, pRun->s.regs.regs.rip,
                          !!(pRun->s.regs.regs.rflags & X86_EFL_IF), pRun->s.regs.regs.rflags,
                          pRun->s.regs.sregs.ss.selector, pRun->s.regs.regs.rsp, pRun->s.regs.sregs.cr0));
+
+                VBOXSTRICTRC rc2 = nemHCLnxCheckAndInjectInterrupts(pVCpu);
+                AssertLogRelMsg(RT_SUCCESS(rc2), ("Failed to inject interrupt"));
+
                 TMNotifyStartOfExecution(pVM, pVCpu);
 
+#ifdef VBOX_WITH_KVM_NESTING
+                AssertReleaseMsg(not (pVCpu->nem.s.nestedGuestActive and pRun->kvm_dirty_regs),
+                            ("Bug: Nested guest actitive and dirty regs are set: %x", pRun->kvm_dirty_regs));
+#endif
+
                 int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_RUN, 0UL);
+                int errno_ = errno;
 
                 VMCPU_CMPXCHG_STATE(pVCpu, VMCPUSTATE_STARTED_EXEC_NEM, VMCPUSTATE_STARTED_EXEC_NEM_WAIT);
                 TMNotifyEndOfExecution(pVM, pVCpu, ASMReadTSC());
 
+                pVCpu->nem.s.pRun->immediate_exit = 0;
+
 #ifdef LOG_ENABLED
                 if (LogIsFlowEnabled())
                 {
@@ -1724,8 +2765,15 @@ VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
                 }
 #endif
                 fStatefulExit = false;
-                if (RT_LIKELY(rcLnx == 0 || errno == EINTR))
+                if (RT_LIKELY(rcLnx == 0 || errno_ == EINTR))
                 {
+#ifdef VBOX_WITH_KVM_NESTING
+                    if (pRun->exit_reason == KVM_EXIT_INTR) {
+                        pVCpu->nem.s.nestedGuestActive = KvmIsNestedGuestExit(pVM, pVCpu);
+                    } else {
+                        pVCpu->nem.s.nestedGuestActive = false;
+                    }
+#endif
                     /*
                      * Deal with the exit.
                      */
@@ -1739,10 +2787,19 @@ VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
                         break;
                     }
                 }
+                else if (errno_ == EAGAIN) {
+                    /*
+                    * We might drop out of KVM_RUN if the vCPU is still in an
+                    * uninitialized state (e.g. WAIT_FOR_INIT) and some spurious
+                    * wakeup event is received. In this case, simply do nothing
+                    * and let the run loop enter KVM_RUN again.
+                    * See https://elixir.bootlin.com/linux/v6.6/source/arch/x86/kvm/x86.c#L11138
+                    */
+                }
                 else
                 {
-                    int rc2 = RTErrConvertFromErrno(errno);
-                    AssertLogRelMsgFailedReturn(("KVM_RUN failed: rcLnx=%d errno=%u rc=%Rrc\n", rcLnx, errno, rc2), rc2);
+                    rc2 = RTErrConvertFromErrno(errno_);
+                    AssertLogRelMsgFailedReturn(("KVM_RUN failed: rcLnx=%d errno=%u rc=%Rrc\n", rcLnx, errno_, rc2), rc2);
                 }
 
                 /*
@@ -1887,4 +2944,3 @@ VMMR3_INT_DECL(VBOXSTRICTRC) NEMR3RunGC(PVM pVM, PVMCPU pVCpu)
  * This is using KVM.
  *
  */
-
diff --git a/src/VBox/VMM/VMMR3/NEMR3NativeTemplate-linux.cpp.h b/src/VBox/VMM/VMMR3/NEMR3NativeTemplate-linux.cpp.h
index edce310..62c788f 100644
--- a/src/VBox/VMM/VMMR3/NEMR3NativeTemplate-linux.cpp.h
+++ b/src/VBox/VMM/VMMR3/NEMR3NativeTemplate-linux.cpp.h
@@ -431,6 +431,7 @@ static int nemR3LnxInitCheckCapabilities(PVM pVM, PRTERRINFO pErrInfo)
 
 
 /** @callback_method_impl{FNVMMEMTRENDEZVOUS}   */
+#ifndef VBOX_WITH_KVM
 static DECLCALLBACK(VBOXSTRICTRC) nemR3LnxFixThreadPoke(PVM pVM, PVMCPU pVCpu, void *pvUser)
 {
     RT_NOREF(pVM, pvUser);
@@ -438,11 +439,107 @@ static DECLCALLBACK(VBOXSTRICTRC) nemR3LnxFixThreadPoke(PVM pVM, PVMCPU pVCpu, v
     AssertLogRelRC(rc);
     return VINF_SUCCESS;
 }
+#else
+/** Hands the first 8 bytes of @a pSigset to KVM_SET_SIGNAL_MASK for this vCPU; VERR_NEM_INIT_FAILED if that fails. */
+static VBOXSTRICTRC nemR3LnxSetVCpuSignalMask(PVMCPU pVCpu, sigset_t *pSigset)
+{
+    /*
+     * glibc and Linux/KVM do not agree on the size of sigset_t, so only the
+     * kernel-sized prefix of the glibc set is copied.
+     */
+    constexpr size_t cbKernelSigset = 8;
+    static_assert(sizeof(sigset_t) >= cbKernelSigset);
+
+    /* Header and mask bytes share one buffer: sized for both, aligned for the header. */
+    alignas(kvm_signal_mask) char abStorage[sizeof(kvm_signal_mask) + cbKernelSigset];
+    kvm_signal_mask * const pMask = reinterpret_cast<kvm_signal_mask *>(abStorage);
+    pMask->len = cbKernelSigset;
+    memcpy(pMask->sigset, pSigset, cbKernelSigset);
+
+    int const rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_SET_SIGNAL_MASK, pMask);
+    AssertLogRelMsgReturn(rcLnx == 0, ("Failed to set vCPU signal mask: %d", errno),
+                          VERR_NEM_INIT_FAILED);
+    return VINF_SUCCESS;
+}
+
+/** @callback_method_impl{FNVMMEMTRENDEZVOUS, KVM variant: moves the poke signal to the vCPU mask and creates the vCPU timer.} */
+static DECLCALLBACK(VBOXSTRICTRC) nemR3LnxFixThreadPoke(PVM pVM, PVMCPU pVCpu, void *pvUser)
+{
+    RT_NOREF(pVM, pvUser);
+
+    int iPokeSignal = RTThreadPokeSignal();
+    AssertReturn(iPokeSignal >= 0, VERR_NEM_INIT_FAILED);
+
+    /* We disable the poke signal for the host. We never want that signal to be delivered. */
+    int rc = RTThreadControlPokeSignal(pVCpu->hThread, false /*fEnable*/);
+    AssertLogRelRC(rc); /* rc is not checked any further; the setup below runs regardless. */
+
+    sigset_t sigset;
+    /* Fetch the current signal mask. */
+    int rcProcMask = pthread_sigmask(SIG_BLOCK /* ignored */, nullptr, &sigset);
+    AssertLogRelMsgReturn(rcProcMask == 0, ("Failed to retrieve thread signal mask"), VERR_NEM_INIT_FAILED);
+
+    /* Drop the poke signal from the fetched copy; that copy is what gets handed to the vCPU below. */
+    sigdelset(&sigset, iPokeSignal);
+
+    /* We enable the poke signal for the vCPU. Any poke will kick the vCPU out of guest execution. */
+    VBOXSTRICTRC rcVcpuMask = nemR3LnxSetVCpuSignalMask(pVCpu, &sigset);
+    AssertRCSuccessReturn(rcVcpuMask, rcVcpuMask);
+
+    /* Create a timer that delivers the poke signal. */
+    struct sigevent sev {};
+    sev.sigev_notify = SIGEV_THREAD_ID;
+    sev.sigev_signo = iPokeSignal;
+    sev._sigev_un._tid = gettid(); /* The signal is addressed to the thread executing this callback. */
+
+    int rcTimer = timer_create(CLOCK_MONOTONIC, &sev, &pVCpu->nem.s.pTimer); /* Only created here, not armed. */
+    AssertLogRelMsgReturn(rcTimer == 0, ("Failed to create timer: %d", errno), VERR_NEM_INIT_FAILED);
+
+    return VINF_SUCCESS;
+}
+#endif
 
 
+#ifdef VBOX_WITH_KVM
+/**
+ * Check common environment problems and inform the user about misconfigurations.
+ *
+ * @returns VINF_SUCCESS if the split-lock sysctl was evaluated or could not be opened,
+ *          VERR_NEM_INIT_FAILED if it was opened but could not be read.
+ */
+int nemR3CheckEnvironment(void)
+{
+    static const char szSplitLockMitigationFile[] = "/proc/sys/kernel/split_lock_mitigate";
+    char buf[64] {};
+
+    // Older kernels might not have this. A hard error feels unjustified here.
+    int fd = open(szSplitLockMitigationFile, O_RDONLY | O_CLOEXEC);
+    AssertLogRelMsgReturn(fd >= 0, ("Failed to check %s (%d). Assuming there is no problem.\n", szSplitLockMitigationFile, errno),
+                          VINF_SUCCESS);
+
+    /* Leave one character to ensure that the string is zero-terminated. */
+    ssize_t bytes = read(fd, buf, sizeof(buf) - 1);
+    int const iReadErrno = errno; /* Saved first: close() may overwrite errno. */
+    close(fd);                    /* A single read is all we need; release the descriptor on every path. */
+    AssertLogRelMsgReturn(bytes >= 0, ("Failed to read %s (%d)\n", szSplitLockMitigationFile, iReadErrno), VERR_NEM_INIT_FAILED);
+
+    int mitigationStatus = atoi(buf);
+    if (mitigationStatus != 0) {
+        LogRel(("NEM: WARNING: %s is %d. This can cause VM hangs, unless you set split_lock_detect=off on the host kernel command line! Please set it to 0.\n",
+                szSplitLockMitigationFile, mitigationStatus));
+    }
+    return VINF_SUCCESS;
+}
+#endif
+
 DECLHIDDEN(int) nemR3NativeInit(PVM pVM, bool fFallback, bool fForced)
 {
     RT_NOREF(pVM, fFallback, fForced);
+
+#ifdef VBOX_WITH_KVM
+    int rcCheck = nemR3CheckEnvironment();
+    AssertLogRelMsgReturn(RT_SUCCESS(rcCheck), ("Failed to check environment\n"), VERR_NEM_INIT_FAILED);
+#endif
+
     /*
      * Some state init.
      */
@@ -623,13 +720,32 @@ DECLHIDDEN(int) nemR3NativeTerm(PVM pVM)
         close(pVM->nem.s.fdKvm);
         pVM->nem.s.fdKvm = -1;
     }
+
+#ifdef VBOX_WITH_KVM
+    pVM->nem.s.pARedirectionTable.reset();
+#endif
     return VINF_SUCCESS;
 }
 
 
 DECLHIDDEN(void) nemR3NativeReset(PVM pVM)
 {
+#ifndef VBOX_WITH_KVM
     RT_NOREF(pVM);
+#else
+    pVM->nem.s.pARedirectionTable->fill(std::nullopt); /* Empty every slot of the redirection table. */
+    /* Set each vCPU's KVM MP state: vCPU 0 becomes runnable, all others uninitialized. */
+    for (VMCPUID idCpu = 0; idCpu < pVM->cCpus; idCpu++)
+    {
+        PVMCPU pVCpu = pVM->apCpusR3[idCpu];
+
+        struct kvm_mp_state mp;
+        mp.mp_state = pVCpu->idCpu == 0 ? KVM_MP_STATE_RUNNABLE : KVM_MP_STATE_UNINITIALIZED;
+        /* A failing ioctl is only asserted/logged; the loop still visits the remaining vCPUs. */
+        int rcLnx = ioctl(pVCpu->nem.s.fdVCpu, KVM_SET_MP_STATE, &mp);
+        AssertLogRelMsg(rcLnx == 0, ("nemR3NativeReset: Failed to set MP state. Error: %d, errno %d\n", rcLnx, errno));
+    }
+#endif
 }
 
 
diff --git a/src/VBox/VMM/VMMR3/PDMDevMiscHlp.cpp b/src/VBox/VMM/VMMR3/PDMDevMiscHlp.cpp
index 6ae3e07..3eaa3be 100644
--- a/src/VBox/VMM/VMMR3/PDMDevMiscHlp.cpp
+++ b/src/VBox/VMM/VMMR3/PDMDevMiscHlp.cpp
@@ -37,6 +37,7 @@
 #ifdef VBOX_VMM_TARGET_X86
 # include <VBox/vmm/pdmapic.h>
 #endif
+#include <VBox/vmm/nem.h>
 #include <VBox/vmm/vm.h>
 #include <VBox/vmm/vmm.h>
 
@@ -116,6 +117,34 @@ static DECLCALLBACK(void) pdmR3PicHlp_Unlock(PPDMDEVINS pDevIns)
     pdmUnlock(pDevIns->Internal.s.pVMR3);
 }
 
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+/** @interface_method_impl{PDMPICHLP,pfnKvmSetIrqLine} */
+static DECLCALLBACK(int) pdmR3PicHlp_KvmSetIrqLine(PPDMDEVINS pDevIns, uint16_t u16Gsi, int iLevel)
+{
+    PDMDEV_ASSERT_DEVINS(pDevIns);
+
+    /* Forward to NEM, using the VM recorded in the device instance. */
+    return NEMR3KvmSetIrqLine(pDevIns->Internal.s.pVMR3, u16Gsi, iLevel);
+}
+
+/** @interface_method_impl{PDMPICHLP,pfnKvmGetPicState} */
+static DECLCALLBACK(int) pdmR3PicHlp_KvmGetPicState(PPDMDEVINS pDevIns, KVMIRQCHIP irqchip, KVMPICSTATE* state)
+{
+    PDMDEV_ASSERT_DEVINS(pDevIns);
+
+    /* Forward to NEM, using the VM recorded in the device instance. */
+    return NEMR3KvmGetPicState(pDevIns->Internal.s.pVMR3, irqchip, state);
+}
+
+/** @interface_method_impl{PDMPICHLP,pfnKvmSetPicState} */
+static DECLCALLBACK(int) pdmR3PicHlp_KvmSetPicState(PPDMDEVINS pDevIns, KVMIRQCHIP irqchip, KVMPICSTATE* state)
+{
+    PDMDEV_ASSERT_DEVINS(pDevIns);
+
+    /* Forward to NEM, using the VM recorded in the device instance. */
+    return NEMR3KvmSetPicState(pDevIns->Internal.s.pVMR3, irqchip, state);
+}
+#endif
 
 /**
  * PIC Device Helpers.
@@ -127,6 +156,11 @@ const PDMPICHLP g_pdmR3DevPicHlp =
     pdmR3PicHlp_ClearInterruptFF,
     pdmR3PicHlp_Lock,
     pdmR3PicHlp_Unlock,
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+    pdmR3PicHlp_KvmSetIrqLine,
+    pdmR3PicHlp_KvmGetPicState,
+    pdmR3PicHlp_KvmSetPicState,
+#endif
     PDM_PICHLP_VERSION /* the end */
 };
 
@@ -199,7 +233,64 @@ static DECLCALLBACK(int) pdmR3IoApicHlp_IommuMsiRemap(PPDMDEVINS pDevIns, uint16
     return VERR_IOMMU_NOT_PRESENT;
 }
 
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM)
+/** @interface_method_impl{PDMIOAPICHLP,pfnKvmSetIrqLine} */
+static DECLCALLBACK(int) pdmR3IoApicHlp_KvmSetIrqLine(PPDMDEVINS pDevIns, uint16_t u16Gsi, int iLevel)
+{
+    PDMDEV_ASSERT_DEVINS(pDevIns);
+    PVM const pOwnerVM = pDevIns->Internal.s.pVMR3;
+    return NEMR3KvmSetIrqLine(pOwnerVM, u16Gsi, iLevel);
+}
+
+/** @interface_method_impl{PDMIOAPICHLP,pfnKvmSplitIrqchipDeliverMsi} */
+static DECLCALLBACK(int) pdmR3IoApicHlp_KvmSplitIrqchipDeliverMsi(PPDMDEVINS pDevIns, PCMSIMSG pMsi)
+{
+    PDMDEV_ASSERT_DEVINS(pDevIns);
+
+    /* Forward to NEM, using the VM recorded in the device instance. */
+    return NEMR3KvmSplitIrqchipDeliverMsi(pDevIns->Internal.s.pVMR3, pMsi);
+}
+
 
+/** @interface_method_impl{PDMIOAPICHLP,pfnKvmSplitIrqchipAddUpdateRTE} */
+static DECLCALLBACK(int) pdmR3IoApicHlp_KvmSplitIrqchipAddUpdateRTE(PPDMDEVINS pDevIns, uint16_t gsi, PCMSIMSG pMsi)
+{
+    PDMDEV_ASSERT_DEVINS(pDevIns);
+
+    /* Forward to NEM, using the VM recorded in the device instance. */
+    return NEMR3KvmSplitIrqchipAddUpdateRTE(pDevIns->Internal.s.pVMR3, gsi, pMsi);
+}
+
+
+/** @interface_method_impl{PDMIOAPICHLP,pfnKvmSplitIrqchipRemoveRTE} */
+static DECLCALLBACK(int) pdmR3IoApicHlp_KvmSplitIrqchipRemoveRTE(PPDMDEVINS pDevIns, uint16_t gsi)
+{
+    PDMDEV_ASSERT_DEVINS(pDevIns);
+
+    /* Forward to NEM, using the VM recorded in the device instance. */
+    return NEMR3KvmSplitIrqchipRemoveRTE(pDevIns->Internal.s.pVMR3, gsi);
+}
+#endif
+
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+/** @interface_method_impl{PDMIOAPICHLP,pfnKvmGetIoApicState} */
+static DECLCALLBACK(int) pdmR3IoApicHlp_pfnKvmGetIoApicState(PPDMDEVINS pDevIns, KVMIOAPICSTATE* state)
+{
+    PDMDEV_ASSERT_DEVINS(pDevIns);
+
+    /* Forward to NEM, using the VM recorded in the device instance. */
+    return NEMR3KvmGetIoApicState(pDevIns->Internal.s.pVMR3, state);
+}
+
+/** @interface_method_impl{PDMIOAPICHLP,pfnKvmSetIoApicState} */
+static DECLCALLBACK(int) pdmR3IoApicHlp_pfnKvmSetIoApicState(PPDMDEVINS pDevIns, KVMIOAPICSTATE* state)
+{
+    PDMDEV_ASSERT_DEVINS(pDevIns);
+
+    /* Forward to NEM, using the VM recorded in the device instance. */
+    return NEMR3KvmSetIoApicState(pDevIns->Internal.s.pVMR3, state);
+}
+#endif
 /**
  * I/O APIC Device Helpers.
  */
@@ -211,6 +302,17 @@ const PDMIOAPICHLP g_pdmR3DevIoApicHlp =
     pdmR3IoApicHlp_Unlock,
     pdmR3IoApicHlp_LockIsOwner,
     pdmR3IoApicHlp_IommuMsiRemap,
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM)
+    pdmR3IoApicHlp_KvmSetIrqLine,
+    pdmR3IoApicHlp_KvmSplitIrqchipDeliverMsi,
+    pdmR3IoApicHlp_KvmSplitIrqchipAddUpdateRTE,
+    pdmR3IoApicHlp_KvmSplitIrqchipRemoveRTE,
+#endif
+
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM_IRQCHIP_FULL)
+    pdmR3IoApicHlp_pfnKvmGetIoApicState,
+    pdmR3IoApicHlp_pfnKvmSetIoApicState,
+#endif
     PDM_IOAPICHLP_VERSION /* the end */
 };
 
diff --git a/src/VBox/VMM/VMMR3/PGMPhys.cpp b/src/VBox/VMM/VMMR3/PGMPhys.cpp
index 4b98cf6..55eb2ca 100644
--- a/src/VBox/VMM/VMMR3/PGMPhys.cpp
+++ b/src/VBox/VMM/VMMR3/PGMPhys.cpp
@@ -2123,11 +2123,16 @@ int pgmR3PhysRamPreAllocate(PVM pVM)
     Assert(pVM->pgm.s.fRamPreAlloc);
     Log(("pgmR3PhysRamPreAllocate: enter\n"));
 # ifdef VBOX_WITH_PGM_NEM_MODE
+#ifdef VBOX_WITH_PREALLOC_RAM_BY_DEFAULT
+    Log(("pgmR3PhysRamPreAllocate: Handled by default in NEM mode, skip\n"));
+    return VINF_SUCCESS;
+#else
     if (VM_IS_NEM_ENABLED(pVM))
     {
         LogRel(("PGM: Pre-alloc ignored in NEM mode.\n"));
         return VINF_SUCCESS;
     }
+#endif
 # endif
 
     /*
diff --git a/src/VBox/VMM/VMMR3/VMM.cpp b/src/VBox/VMM/VMMR3/VMM.cpp
index aa51e5f..c65adcd 100644
--- a/src/VBox/VMM/VMMR3/VMM.cpp
+++ b/src/VBox/VMM/VMMR3/VMM.cpp
@@ -1104,6 +1104,11 @@ static DECLCALLBACK(int) vmmR3Load(PVM pVM, PSSMHANDLE pSSM, uint32_t uVersion,
         AssertMsgFailed(("u32=%#x\n", u32));
         return VERR_SSM_DATA_UNIT_FORMAT_CHANGED;
     }
+
+#ifdef VBOX_WITH_KVM
+    NEMR3LoadExec(pVM);
+#endif
+
     return VINF_SUCCESS;
 }
 
diff --git a/src/VBox/VMM/VMMR3/target-x86/CPUMR3-x86.cpp b/src/VBox/VMM/VMMR3/target-x86/CPUMR3-x86.cpp
index cb13659..06bf326 100644
--- a/src/VBox/VMM/VMMR3/target-x86/CPUMR3-x86.cpp
+++ b/src/VBox/VMM/VMMR3/target-x86/CPUMR3-x86.cpp
@@ -1569,6 +1569,7 @@ DECLHIDDEN(void) cpumR3InitVmxGuestFeaturesAndMsrs(PVM pVM, PCFGMNODE pCpumCfg,
     if (fVmxEpt)
     {
         const char *pszWhy = NULL;
+#ifndef VBOX_WITH_KVM_NESTING
         if (!VM_IS_HM_ENABLED(pVM) && !VM_IS_EXEC_ENGINE_IEM(pVM))
             pszWhy = "execution engine is neither HM nor IEM";
 #ifdef RT_ARCH_AMD64
@@ -1576,6 +1577,9 @@ DECLHIDDEN(void) cpumR3InitVmxGuestFeaturesAndMsrs(PVM pVM, PCFGMNODE pCpumCfg,
             pszWhy = "nested paging is not enabled for the VM or it is not supported by the host";
         else if (VM_IS_HM_ENABLED(pVM) && !pVM->cpum.s.HostFeatures.s.fNoExecute)
             pszWhy = "NX is not available on the host";
+#endif
+#else
+        if (VM_IS_HM_ENABLED(pVM) && !HMIsNestedPagingActive(pVM))
 #endif
         if (pszWhy)
         {
@@ -2453,10 +2457,20 @@ DECLCALLBACK(int) cpumR3LoadExecTarget(PVM pVM, PSSMHANDLE pSSM, uint32_t uVersi
                     rc = SSMR3GetStructEx(pSSM, &pGstCtx->XState.Hdr, sizeof(pGstCtx->XState.Hdr),
                                           0, g_aCpumXSaveHdrFields, NULL);
                     AssertRCReturn(rc, rc);
+#ifndef VBOX_WITH_KVM
+                    /*
+                     * This assertion triggers on resume when the guest was
+                     * suspended early during boot. The hypothesis is that this
+                     * happens when XSAVE is not enabled yet. Seems harmless for
+                     * now.
+                     *
+                     * See virtualbox#69.
+                     */
                     AssertLogRelMsgReturn(!(pGstCtx->XState.Hdr.bmXState & ~pGstCtx->fXStateMask),
                                           ("bmXState=%#RX64 fXStateMask=%#RX64\n",
                                            pGstCtx->XState.Hdr.bmXState, pGstCtx->fXStateMask),
                                           VERR_CPUM_INVALID_XSAVE_HDR);
+#endif
                 }
                 if (pGstCtx->fXStateMask & XSAVE_C_YMM)
                 {
diff --git a/src/VBox/VMM/VMMR3/target-x86/CPUMR3CpuId-x86.cpp b/src/VBox/VMM/VMMR3/target-x86/CPUMR3CpuId-x86.cpp
index 913e00a..db60ee1 100644
--- a/src/VBox/VMM/VMMR3/target-x86/CPUMR3CpuId-x86.cpp
+++ b/src/VBox/VMM/VMMR3/target-x86/CPUMR3CpuId-x86.cpp
@@ -1325,6 +1325,13 @@ static int cpumR3CpuIdSanitize(PVM pVM, PCPUM pCpum, PCPUMCPUIDCONFIG pConfig)
     PASSTHRU_FEATURE_EX(enmConfig, fHostFeature, !VM_IS_EXEC_ENGINE_IEM(pVM), fConst)
 #define PASSTHRU_FEATURE_TODO(enmConfig, fConst) ((enmConfig) ? (fConst) : 0)
 
+/* Yields fConst in VBOX_WITH_KVM builds and 0 otherwise, so the bit stays in the CPUID keep-masks below only with KVM. */
+#ifdef VBOX_WITH_KVM
+#define PASSTHRU_FEATURE_KVM_ONLY(fConst) (fConst)
+#else
+#define PASSTHRU_FEATURE_KVM_ONLY(fConst) (0)
+#endif
+
     /* Cpuid 1:
      * EAX: CPU model, family and stepping.
      *
@@ -1584,7 +1591,7 @@ static int cpumR3CpuIdSanitize(PVM pVM, PCPUM pCpum, PCPUMCPUIDCONFIG pConfig)
                                | X86_CPUID_AMD_FEATURE_EDX_MMX
                                | X86_CPUID_AMD_FEATURE_EDX_FXSR
                                | X86_CPUID_AMD_FEATURE_EDX_FFXSR
-                               //| X86_CPUID_EXT_FEATURE_EDX_PAGE1GB
+                               | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_EXT_FEATURE_EDX_PAGE1GB)
                                | X86_CPUID_EXT_FEATURE_EDX_RDTSCP
                                //| RT_BIT_32(28)                    - reserved
                                //| X86_CPUID_EXT_FEATURE_EDX_LONG_MODE - turned on when necessary
@@ -1846,9 +1853,9 @@ static int cpumR3CpuIdSanitize(PVM pVM, PCPUM pCpum, PCPUMCPUIDCONFIG pConfig)
                                //| X86_CPUID_STEXT_FEATURE_EBX_HLE               RT_BIT(4)
                                | PASSTHRU_FEATURE(pConfig->enmAvx2, pHstFeat->fAvx2, X86_CPUID_STEXT_FEATURE_EBX_AVX2)
                                | X86_CPUID_STEXT_FEATURE_EBX_FDP_EXCPTN_ONLY
-                               //| X86_CPUID_STEXT_FEATURE_EBX_SMEP              RT_BIT(7)
+                               | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EBX_SMEP)
                                | X86_CPUID_STEXT_FEATURE_EBX_BMI2
-                               //| X86_CPUID_STEXT_FEATURE_EBX_ERMS              RT_BIT(9)
+                               | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EBX_ERMS)
                                | PASSTHRU_FEATURE_NOT_IEM(pConfig->enmInvpcid, pHstFeat->fInvpcid, X86_CPUID_STEXT_FEATURE_EBX_INVPCID)
                                //| X86_CPUID_STEXT_FEATURE_EBX_RTM               RT_BIT(11)
                                //| X86_CPUID_STEXT_FEATURE_EBX_PQM               RT_BIT(12)
@@ -1860,10 +1867,11 @@ static int cpumR3CpuIdSanitize(PVM pVM, PCPUM pCpum, PCPUMCPUIDCONFIG pConfig)
                                | PASSTHRU_FEATURE_TODO(pConfig->enmRdSeed, X86_CPUID_STEXT_FEATURE_EBX_RDSEED)
                                | PASSTHRU_FEATURE(pConfig->enmAdx, pHstFeat->fAdx, X86_CPUID_STEXT_FEATURE_EBX_ADX)
                                //| X86_CPUID_STEXT_FEATURE_EBX_SMAP              RT_BIT(20)
+                               | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EBX_SMAP)
                                //| RT_BIT(21) - reserved
                                //| RT_BIT(22) - reserved
                                | PASSTHRU_FEATURE(pConfig->enmCLFlushOpt, pHstFeat->fClFlushOpt, X86_CPUID_STEXT_FEATURE_EBX_CLFLUSHOPT)
-                               //| RT_BIT(24) - reserved
+                               | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EBX_CLWB)
                                //| X86_CPUID_STEXT_FEATURE_EBX_INTEL_PT          RT_BIT(25)
                                //| X86_CPUID_STEXT_FEATURE_EBX_AVX512PF          RT_BIT(26)
                                //| X86_CPUID_STEXT_FEATURE_EBX_AVX512ER          RT_BIT(27)
@@ -1874,18 +1882,21 @@ static int cpumR3CpuIdSanitize(PVM pVM, PCPUM pCpum, PCPUMCPUIDCONFIG pConfig)
                                ;
                 pCurLeaf->uEcx &= 0
                                //| X86_CPUID_STEXT_FEATURE_ECX_PREFETCHWT1 - we do not do vector functions yet.
+                               | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_ECX_GFNI)
                                ;
                 pCurLeaf->uEdx &= 0
+                               | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EDX_FSRM)
+                               | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EDX_SERIALIZE)
                                //| X86_CPUID_STEXT_FEATURE_EDX_SRBDS_CTRL        RT_BIT(9)
                                | PASSTHRU_FEATURE(pConfig->enmMdsClear,   pHstFeat->fMdsClear, X86_CPUID_STEXT_FEATURE_EDX_MD_CLEAR)
                                //| X86_CPUID_STEXT_FEATURE_EDX_TSX_FORCE_ABORT   RT_BIT_32(11)
                                //| X86_CPUID_STEXT_FEATURE_EDX_CET_IBT           RT_BIT(20)
-                               //| X86_CPUID_STEXT_FEATURE_EDX_IBRS_IBPB         RT_BIT(26)
-                               //| X86_CPUID_STEXT_FEATURE_EDX_STIBP             RT_BIT(27)
+                               | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EDX_IBRS_IBPB)
+                               | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EDX_STIBP)
                                | PASSTHRU_FEATURE(pConfig->enmFlushCmdMsr, pHstFeat->fFlushCmd, X86_CPUID_STEXT_FEATURE_EDX_FLUSH_CMD)
                                | PASSTHRU_FEATURE(pConfig->enmArchCapMsr,  pHstFeat->fArchCap,  X86_CPUID_STEXT_FEATURE_EDX_ARCHCAP)
                                //| X86_CPUID_STEXT_FEATURE_EDX_CORECAP           RT_BIT_32(30)
-                               //| X86_CPUID_STEXT_FEATURE_EDX_SSBD              RT_BIT_32(31)
+                               | PASSTHRU_FEATURE_KVM_ONLY(X86_CPUID_STEXT_FEATURE_EDX_SSBD)
                                ;
 
                 /* Mask out INVPCID unless FSGSBASE is exposed due to a bug in Windows 10 SMP guests, see @bugref{9089#c15}. */
@@ -2915,6 +2926,7 @@ static int cpumR3CpuIdReadConfig(PVM pVM, PCPUMCPUIDCONFIG pConfig, PCFGMNODE pC
         AssertLogRelRCReturn(rc, rc);
         if (pConfig->fNestedHWVirt)
         {
+#ifndef VBOX_WITH_KVM_NESTING
             /** @todo Think about enabling this later with NEM/KVM. */
             if (VM_IS_NEM_ENABLED(pVM))
             {
@@ -2924,6 +2936,7 @@ static int cpumR3CpuIdReadConfig(PVM pVM, PCPUMCPUIDCONFIG pConfig, PCFGMNODE pC
             else if (!fNestedPagingAndFullGuestExec)
                 return VMSetError(pVM, VERR_CPUM_INVALID_HWVIRT_CONFIG, RT_SRC_POS,
                                   "Cannot enable nested VT-x/AMD-V without nested-paging and unrestricted guest execution!\n");
+#endif
         }
     }
 #endif /** @todo */
@@ -3882,6 +3895,7 @@ VMMR3_INT_DECL(void) CPUMR3SetGuestCpuIdFeature(PVM pVM, CPUMCPUIDFEATURE enmFea
          * Note! ASSUMES CPUMCPUIDFEATURE_APIC is called first.
          */
         case CPUMCPUIDFEATURE_X2APIC:
+#ifndef VBOX_WITH_KVM
             pLeaf = cpumCpuIdGetLeaf(pVM, UINT32_C(0x00000001));
             if (pLeaf)
                 pVM->cpum.s.aGuestCpuIdPatmStd[1].uEcx = pLeaf->uEcx |= X86_CPUID_FEATURE_ECX_X2APIC;
@@ -3896,6 +3910,7 @@ VMMR3_INT_DECL(void) CPUMR3SetGuestCpuIdFeature(PVM pVM, CPUMCPUIDFEATURE enmFea
             }
 
             LogRel(("CPUM: SetGuestCpuIdFeature: Enabled x2APIC\n"));
+#endif
             break;
 
         /*
diff --git a/src/VBox/VMM/include/GIMHvInternal.h b/src/VBox/VMM/include/GIMHvInternal.h
index 4397207..66a8510 100644
--- a/src/VBox/VMM/include/GIMHvInternal.h
+++ b/src/VBox/VMM/include/GIMHvInternal.h
@@ -202,6 +202,8 @@
 #define GIM_HV_HINT_INT_FOR_MBEC_SYSCALLS                   RT_BIT(13)
 /** Recommend using enlightened VMCS interfacea and nested enlightenments. */
 #define GIM_HV_HINT_NESTED_ENLIGHTENED_VMCS_INTERFACE       RT_BIT(14)
+/** Indicates that core-sharing is not possible. */
+#define GIM_HV_HINT_NO_NONARCH_CORESHARING                  RT_BIT(18)
 /** @}  */
 
 
@@ -1117,6 +1119,15 @@ AssertCompile(sizeof(GIMHVEXTGETBOOTZEROMEM) <= GIM_HV_PAGE_SIZE);
 /** @} */
 
 
+/** Hyper-V page size.  */
+#define GIM_HV_PAGE_SIZE                          4096
+/** Hyper-V page shift. */
+#define GIM_HV_PAGE_SHIFT                         12
+
+/** Microsoft Hyper-V vendor signature. */
+#define GIM_HV_VENDOR_MICROSOFT                   "Microsoft Hv"
+#define GIM_HV_VENDOR_VBOX                        "VBoxVBoxVBox"
+
 /**
  * MMIO2 region indices.
  */
diff --git a/src/VBox/VMM/include/NEMInternal.h b/src/VBox/VMM/include/NEMInternal.h
index be5377c..1d53a8c 100644
--- a/src/VBox/VMM/include/NEMInternal.h
+++ b/src/VBox/VMM/include/NEMInternal.h
@@ -35,8 +35,17 @@
 #include <VBox/types.h>
 #include <VBox/vmm/nem.h>
 #include <VBox/vmm/cpum.h> /* For CPUMCPUVENDOR. */
+#ifdef VBOX_WITH_KVM
+#include <VBox/vmm/pdmdev.h> /* For KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS */
+#endif
 #include <VBox/vmm/stam.h>
 #include <VBox/vmm/vmapi.h>
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM)
+#include <array>
+#include <memory>
+#include <optional>
+#include <VBox/msi.h>
+#endif
 #ifdef RT_OS_WINDOWS
 # include <iprt/nt/hyperv.h>
 # include <iprt/critsect.h>
@@ -46,6 +55,8 @@
 # else
 #  include "VMXInternal.h"
 # endif
+#elif defined(RT_OS_LINUX)
+# include <time.h>
 #endif
 
 RT_C_DECLS_BEGIN
@@ -246,6 +257,9 @@ typedef struct NEM
     uint16_t                    idPrevSlot;
     /** Memory slot ID allocation bitmap. */
     uint64_t                    bmSlotIds[_32K / 8 / sizeof(uint64_t)];
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM)
+    std::unique_ptr<std::array<std::optional<MSIMSG>, KVM_IRQCHIP_NUM_IOAPIC_INTR_PINS>> pARedirectionTable;
+#endif
 
 #elif defined(RT_OS_WINDOWS)
     /** Set if we've created the EMTs. */
@@ -453,7 +467,9 @@ typedef struct NEMCPU
 #endif
 
 #if defined(RT_OS_LINUX)
-    uint8_t                     abPadding[3];
+    uint8_t                     abPadding[2];
+    /** Whether processor bug mitigations have already been applied. */
+    bool                        fMitigationsApplied;
     /** The KVM VCpu file descriptor. */
     int32_t                     fdVCpu;
     /** Pointer to the KVM_RUN data exchange region. */
@@ -466,6 +482,21 @@ typedef struct NEMCPU
     /** Status of the FIQ line when last seen. */
     bool                        fFiqLastSeen;
 # elif defined(VBOX_VMM_TARGET_X86)
+#ifdef VBOX_WITH_KVM_NESTING
+    /** KVM stats file descriptor for binary statistics */
+    int                         statsFd;
+    size_t                      guestModeStatOffset;
+    bool                        nestedGuestActive;
+#endif
+
+#if defined(IN_RING3) && defined(VBOX_WITH_KVM)
+    /** The vCPU timer. */
+    timer_t                     pTimer;
+
+    /** The next timeout (absolute). */
+    uint64_t                    nsAbsNextTimerEvt;
+#endif
+
     /** The MSR_IA32_APICBASE value known to KVM. */
     uint64_t                    uKvmApicBase;
 # endif
@@ -849,4 +880,3 @@ DECLHIDDEN(int)     nemHCNativeNotifyPhysPageAllocated(PVMCC pVM, RTGCPHYS GCPhy
 RT_C_DECLS_END
 
 #endif /* !VMM_INCLUDED_SRC_include_NEMInternal_h */
-
openSUSE Build Service is sponsored by