[BACK]Return to nvmm_x86_vmx.c CVS log [TXT][DIR] Up to [cvs.NetBSD.org] / src / sys / dev / nvmm / x86

Annotation of src/sys/dev/nvmm/x86/nvmm_x86_vmx.c, Revision 1.25

1.25    ! maxv        1: /*     $NetBSD: nvmm_x86_vmx.c,v 1.24 2019/04/06 11:49:53 maxv Exp $   */
1.1       maxv        2:
                      3: /*
                      4:  * Copyright (c) 2018 The NetBSD Foundation, Inc.
                      5:  * All rights reserved.
                      6:  *
                      7:  * This code is derived from software contributed to The NetBSD Foundation
                      8:  * by Maxime Villard.
                      9:  *
                     10:  * Redistribution and use in source and binary forms, with or without
                     11:  * modification, are permitted provided that the following conditions
                     12:  * are met:
                     13:  * 1. Redistributions of source code must retain the above copyright
                     14:  *    notice, this list of conditions and the following disclaimer.
                     15:  * 2. Redistributions in binary form must reproduce the above copyright
                     16:  *    notice, this list of conditions and the following disclaimer in the
                     17:  *    documentation and/or other materials provided with the distribution.
                     18:  *
                     19:  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
                     20:  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
                     21:  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
                     22:  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
                     23:  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     24:  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     25:  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     26:  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     27:  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     28:  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     29:  * POSSIBILITY OF SUCH DAMAGE.
                     30:  */
                     31:
                     32: #include <sys/cdefs.h>
1.25    ! maxv       33: __KERNEL_RCSID(0, "$NetBSD: nvmm_x86_vmx.c,v 1.24 2019/04/06 11:49:53 maxv Exp $");
1.1       maxv       34:
                     35: #include <sys/param.h>
                     36: #include <sys/systm.h>
                     37: #include <sys/kernel.h>
                     38: #include <sys/kmem.h>
                     39: #include <sys/cpu.h>
                     40: #include <sys/xcall.h>
1.20      maxv       41: #include <sys/mman.h>
1.1       maxv       42:
                     43: #include <uvm/uvm.h>
                     44: #include <uvm/uvm_page.h>
                     45:
                     46: #include <x86/cputypes.h>
                     47: #include <x86/specialreg.h>
                     48: #include <x86/pmap.h>
                     49: #include <x86/dbregs.h>
1.4       maxv       50: #include <x86/cpu_counter.h>
1.1       maxv       51: #include <machine/cpuvar.h>
                     52:
                     53: #include <dev/nvmm/nvmm.h>
                     54: #include <dev/nvmm/nvmm_internal.h>
                     55: #include <dev/nvmm/x86/nvmm_x86.h>
                     56:
                     57: int _vmx_vmxon(paddr_t *pa);
                     58: int _vmx_vmxoff(void);
                     59: int _vmx_invept(uint64_t op, void *desc);
                     60: int _vmx_invvpid(uint64_t op, void *desc);
                     61: int _vmx_vmread(uint64_t op, uint64_t *val);
                     62: int _vmx_vmwrite(uint64_t op, uint64_t val);
                     63: int _vmx_vmptrld(paddr_t *pa);
                     64: int _vmx_vmptrst(paddr_t *pa);
                     65: int _vmx_vmclear(paddr_t *pa);
                     66: int vmx_vmlaunch(uint64_t *gprs);
                     67: int vmx_vmresume(uint64_t *gprs);
                     68:
                     69: #define vmx_vmxon(a) \
                     70:        if (__predict_false(_vmx_vmxon(a) != 0)) { \
                     71:                panic("%s: VMXON failed", __func__); \
                     72:        }
                     73: #define vmx_vmxoff() \
                     74:        if (__predict_false(_vmx_vmxoff() != 0)) { \
                     75:                panic("%s: VMXOFF failed", __func__); \
                     76:        }
                     77: #define vmx_invept(a, b) \
                     78:        if (__predict_false(_vmx_invept(a, b) != 0)) { \
                     79:                panic("%s: INVEPT failed", __func__); \
                     80:        }
                     81: #define vmx_invvpid(a, b) \
                     82:        if (__predict_false(_vmx_invvpid(a, b) != 0)) { \
                     83:                panic("%s: INVVPID failed", __func__); \
                     84:        }
                     85: #define vmx_vmread(a, b) \
                     86:        if (__predict_false(_vmx_vmread(a, b) != 0)) { \
                     87:                panic("%s: VMREAD failed", __func__); \
                     88:        }
                     89: #define vmx_vmwrite(a, b) \
                     90:        if (__predict_false(_vmx_vmwrite(a, b) != 0)) { \
                     91:                panic("%s: VMWRITE failed", __func__); \
                     92:        }
                     93: #define vmx_vmptrld(a) \
                     94:        if (__predict_false(_vmx_vmptrld(a) != 0)) { \
                     95:                panic("%s: VMPTRLD failed", __func__); \
                     96:        }
                     97: #define vmx_vmptrst(a) \
                     98:        if (__predict_false(_vmx_vmptrst(a) != 0)) { \
                     99:                panic("%s: VMPTRST failed", __func__); \
                    100:        }
                    101: #define vmx_vmclear(a) \
                    102:        if (__predict_false(_vmx_vmclear(a) != 0)) { \
                    103:                panic("%s: VMCLEAR failed", __func__); \
                    104:        }
                    105:
                    106: #define MSR_IA32_FEATURE_CONTROL       0x003A
                    107: #define                IA32_FEATURE_CONTROL_LOCK       __BIT(0)
                    108: #define                IA32_FEATURE_CONTROL_IN_SMX     __BIT(1)
                    109: #define                IA32_FEATURE_CONTROL_OUT_SMX    __BIT(2)
                    110:
                    111: #define MSR_IA32_VMX_BASIC             0x0480
                    112: #define                IA32_VMX_BASIC_IDENT            __BITS(30,0)
                    113: #define                IA32_VMX_BASIC_DATA_SIZE        __BITS(44,32)
                    114: #define                IA32_VMX_BASIC_MEM_WIDTH        __BIT(48)
                    115: #define                IA32_VMX_BASIC_DUAL             __BIT(49)
                    116: #define                IA32_VMX_BASIC_MEM_TYPE         __BITS(53,50)
                    117: #define                        MEM_TYPE_UC             0
                    118: #define                        MEM_TYPE_WB             6
                    119: #define                IA32_VMX_BASIC_IO_REPORT        __BIT(54)
                    120: #define                IA32_VMX_BASIC_TRUE_CTLS        __BIT(55)
                    121:
                    122: #define MSR_IA32_VMX_PINBASED_CTLS             0x0481
                    123: #define MSR_IA32_VMX_PROCBASED_CTLS            0x0482
                    124: #define MSR_IA32_VMX_EXIT_CTLS                 0x0483
                    125: #define MSR_IA32_VMX_ENTRY_CTLS                        0x0484
                    126: #define MSR_IA32_VMX_PROCBASED_CTLS2           0x048B
                    127:
                    128: #define MSR_IA32_VMX_TRUE_PINBASED_CTLS                0x048D
                    129: #define MSR_IA32_VMX_TRUE_PROCBASED_CTLS       0x048E
                    130: #define MSR_IA32_VMX_TRUE_EXIT_CTLS            0x048F
                    131: #define MSR_IA32_VMX_TRUE_ENTRY_CTLS           0x0490
                    132:
                    133: #define MSR_IA32_VMX_CR0_FIXED0                        0x0486
                    134: #define MSR_IA32_VMX_CR0_FIXED1                        0x0487
                    135: #define MSR_IA32_VMX_CR4_FIXED0                        0x0488
                    136: #define MSR_IA32_VMX_CR4_FIXED1                        0x0489
                    137:
                    138: #define MSR_IA32_VMX_EPT_VPID_CAP      0x048C
                    139: #define                IA32_VMX_EPT_VPID_WALKLENGTH_4          __BIT(6)
                    140: #define                IA32_VMX_EPT_VPID_UC                    __BIT(8)
                    141: #define                IA32_VMX_EPT_VPID_WB                    __BIT(14)
                    142: #define                IA32_VMX_EPT_VPID_INVEPT                __BIT(20)
                    143: #define                IA32_VMX_EPT_VPID_FLAGS_AD              __BIT(21)
                    144: #define                IA32_VMX_EPT_VPID_INVEPT_CONTEXT        __BIT(25)
                    145: #define                IA32_VMX_EPT_VPID_INVEPT_ALL            __BIT(26)
                    146: #define                IA32_VMX_EPT_VPID_INVVPID               __BIT(32)
                    147: #define                IA32_VMX_EPT_VPID_INVVPID_ADDR          __BIT(40)
                    148: #define                IA32_VMX_EPT_VPID_INVVPID_CONTEXT       __BIT(41)
                    149: #define                IA32_VMX_EPT_VPID_INVVPID_ALL           __BIT(42)
                    150: #define                IA32_VMX_EPT_VPID_INVVPID_CONTEXT_NOG   __BIT(43)
                    151:
                    152: /* -------------------------------------------------------------------------- */
                    153:
                    154: /* 16-bit control fields */
                    155: #define VMCS_VPID                              0x00000000
                    156: #define VMCS_PIR_VECTOR                                0x00000002
                    157: #define VMCS_EPTP_INDEX                                0x00000004
                    158: /* 16-bit guest-state fields */
                    159: #define VMCS_GUEST_ES_SELECTOR                 0x00000800
                    160: #define VMCS_GUEST_CS_SELECTOR                 0x00000802
                    161: #define VMCS_GUEST_SS_SELECTOR                 0x00000804
                    162: #define VMCS_GUEST_DS_SELECTOR                 0x00000806
                    163: #define VMCS_GUEST_FS_SELECTOR                 0x00000808
                    164: #define VMCS_GUEST_GS_SELECTOR                 0x0000080A
                    165: #define VMCS_GUEST_LDTR_SELECTOR               0x0000080C
                    166: #define VMCS_GUEST_TR_SELECTOR                 0x0000080E
                    167: #define VMCS_GUEST_INTR_STATUS                 0x00000810
                    168: #define VMCS_PML_INDEX                         0x00000812
                    169: /* 16-bit host-state fields */
                    170: #define VMCS_HOST_ES_SELECTOR                  0x00000C00
                    171: #define VMCS_HOST_CS_SELECTOR                  0x00000C02
                    172: #define VMCS_HOST_SS_SELECTOR                  0x00000C04
                    173: #define VMCS_HOST_DS_SELECTOR                  0x00000C06
                    174: #define VMCS_HOST_FS_SELECTOR                  0x00000C08
                    175: #define VMCS_HOST_GS_SELECTOR                  0x00000C0A
                    176: #define VMCS_HOST_TR_SELECTOR                  0x00000C0C
                    177: /* 64-bit control fields */
                    178: #define VMCS_IO_BITMAP_A                       0x00002000
                    179: #define VMCS_IO_BITMAP_B                       0x00002002
                    180: #define VMCS_MSR_BITMAP                                0x00002004
                    181: #define VMCS_EXIT_MSR_STORE_ADDRESS            0x00002006
                    182: #define VMCS_EXIT_MSR_LOAD_ADDRESS             0x00002008
                    183: #define VMCS_ENTRY_MSR_LOAD_ADDRESS            0x0000200A
                    184: #define VMCS_EXECUTIVE_VMCS                    0x0000200C
                    185: #define VMCS_PML_ADDRESS                       0x0000200E
                    186: #define VMCS_TSC_OFFSET                                0x00002010
                    187: #define VMCS_VIRTUAL_APIC                      0x00002012
                    188: #define VMCS_APIC_ACCESS                       0x00002014
                    189: #define VMCS_PIR_DESC                          0x00002016
                    190: #define VMCS_VM_CONTROL                                0x00002018
                    191: #define VMCS_EPTP                              0x0000201A
                    192: #define                EPTP_TYPE                       __BITS(2,0)
                    193: #define                        EPTP_TYPE_UC            0
                    194: #define                        EPTP_TYPE_WB            6
                    195: #define                EPTP_WALKLEN                    __BITS(5,3)
                    196: #define                EPTP_FLAGS_AD                   __BIT(6)
                    197: #define                EPTP_PHYSADDR                   __BITS(63,12)
                    198: #define VMCS_EOI_EXIT0                         0x0000201C
                    199: #define VMCS_EOI_EXIT1                         0x0000201E
                    200: #define VMCS_EOI_EXIT2                         0x00002020
                    201: #define VMCS_EOI_EXIT3                         0x00002022
                    202: #define VMCS_EPTP_LIST                         0x00002024
                    203: #define VMCS_VMREAD_BITMAP                     0x00002026
                    204: #define VMCS_VMWRITE_BITMAP                    0x00002028
                    205: #define VMCS_VIRTUAL_EXCEPTION                 0x0000202A
                    206: #define VMCS_XSS_EXIT_BITMAP                   0x0000202C
                    207: #define VMCS_ENCLS_EXIT_BITMAP                 0x0000202E
1.22      maxv      208: #define VMCS_SUBPAGE_PERM_TABLE_PTR            0x00002030
1.1       maxv      209: #define VMCS_TSC_MULTIPLIER                    0x00002032
                    210: /* 64-bit read-only fields */
                    211: #define VMCS_GUEST_PHYSICAL_ADDRESS            0x00002400
                    212: /* 64-bit guest-state fields */
                    213: #define VMCS_LINK_POINTER                      0x00002800
                    214: #define VMCS_GUEST_IA32_DEBUGCTL               0x00002802
                    215: #define VMCS_GUEST_IA32_PAT                    0x00002804
                    216: #define VMCS_GUEST_IA32_EFER                   0x00002806
                    217: #define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL       0x00002808
                    218: #define VMCS_GUEST_PDPTE0                      0x0000280A
                    219: #define VMCS_GUEST_PDPTE1                      0x0000280C
                    220: #define VMCS_GUEST_PDPTE2                      0x0000280E
                    221: #define VMCS_GUEST_PDPTE3                      0x00002810
                    222: #define VMCS_GUEST_BNDCFGS                     0x00002812
                    223: /* 64-bit host-state fields */
                    224: #define VMCS_HOST_IA32_PAT                     0x00002C00
                    225: #define VMCS_HOST_IA32_EFER                    0x00002C02
                    226: #define VMCS_HOST_IA32_PERF_GLOBAL_CTRL                0x00002C04
                    227: /* 32-bit control fields */
                    228: #define VMCS_PINBASED_CTLS                     0x00004000
                    229: #define                PIN_CTLS_INT_EXITING            __BIT(0)
                    230: #define                PIN_CTLS_NMI_EXITING            __BIT(3)
                    231: #define                PIN_CTLS_VIRTUAL_NMIS           __BIT(5)
                    232: #define                PIN_CTLS_ACTIVATE_PREEMPT_TIMER __BIT(6)
1.22      maxv      233: #define                PIN_CTLS_PROCESS_POSTED_INTS    __BIT(7)
1.1       maxv      234: #define VMCS_PROCBASED_CTLS                    0x00004002
                    235: #define                PROC_CTLS_INT_WINDOW_EXITING    __BIT(2)
                    236: #define                PROC_CTLS_USE_TSC_OFFSETTING    __BIT(3)
                    237: #define                PROC_CTLS_HLT_EXITING           __BIT(7)
                    238: #define                PROC_CTLS_INVLPG_EXITING        __BIT(9)
                    239: #define                PROC_CTLS_MWAIT_EXITING         __BIT(10)
                    240: #define                PROC_CTLS_RDPMC_EXITING         __BIT(11)
                    241: #define                PROC_CTLS_RDTSC_EXITING         __BIT(12)
                    242: #define                PROC_CTLS_RCR3_EXITING          __BIT(15)
                    243: #define                PROC_CTLS_LCR3_EXITING          __BIT(16)
                    244: #define                PROC_CTLS_RCR8_EXITING          __BIT(19)
                    245: #define                PROC_CTLS_LCR8_EXITING          __BIT(20)
                    246: #define                PROC_CTLS_USE_TPR_SHADOW        __BIT(21)
                    247: #define                PROC_CTLS_NMI_WINDOW_EXITING    __BIT(22)
                    248: #define                PROC_CTLS_DR_EXITING            __BIT(23)
                    249: #define                PROC_CTLS_UNCOND_IO_EXITING     __BIT(24)
                    250: #define                PROC_CTLS_USE_IO_BITMAPS        __BIT(25)
                    251: #define                PROC_CTLS_MONITOR_TRAP_FLAG     __BIT(27)
                    252: #define                PROC_CTLS_USE_MSR_BITMAPS       __BIT(28)
                    253: #define                PROC_CTLS_MONITOR_EXITING       __BIT(29)
                    254: #define                PROC_CTLS_PAUSE_EXITING         __BIT(30)
                    255: #define                PROC_CTLS_ACTIVATE_CTLS2        __BIT(31)
                    256: #define VMCS_EXCEPTION_BITMAP                  0x00004004
                    257: #define VMCS_PF_ERROR_MASK                     0x00004006
                    258: #define VMCS_PF_ERROR_MATCH                    0x00004008
                    259: #define VMCS_CR3_TARGET_COUNT                  0x0000400A
                    260: #define VMCS_EXIT_CTLS                         0x0000400C
                    261: #define                EXIT_CTLS_SAVE_DEBUG_CONTROLS   __BIT(2)
                    262: #define                EXIT_CTLS_HOST_LONG_MODE        __BIT(9)
                    263: #define                EXIT_CTLS_LOAD_PERFGLOBALCTRL   __BIT(12)
                    264: #define                EXIT_CTLS_ACK_INTERRUPT         __BIT(15)
                    265: #define                EXIT_CTLS_SAVE_PAT              __BIT(18)
                    266: #define                EXIT_CTLS_LOAD_PAT              __BIT(19)
                    267: #define                EXIT_CTLS_SAVE_EFER             __BIT(20)
                    268: #define                EXIT_CTLS_LOAD_EFER             __BIT(21)
                    269: #define                EXIT_CTLS_SAVE_PREEMPT_TIMER    __BIT(22)
                    270: #define                EXIT_CTLS_CLEAR_BNDCFGS         __BIT(23)
                    271: #define                EXIT_CTLS_CONCEAL_PT            __BIT(24)
                    272: #define VMCS_EXIT_MSR_STORE_COUNT              0x0000400E
                    273: #define VMCS_EXIT_MSR_LOAD_COUNT               0x00004010
                    274: #define VMCS_ENTRY_CTLS                                0x00004012
                    275: #define                ENTRY_CTLS_LOAD_DEBUG_CONTROLS  __BIT(2)
                    276: #define                ENTRY_CTLS_LONG_MODE            __BIT(9)
                    277: #define                ENTRY_CTLS_SMM                  __BIT(10)
                    278: #define                ENTRY_CTLS_DISABLE_DUAL         __BIT(11)
                    279: #define                ENTRY_CTLS_LOAD_PERFGLOBALCTRL  __BIT(13)
                    280: #define                ENTRY_CTLS_LOAD_PAT             __BIT(14)
                    281: #define                ENTRY_CTLS_LOAD_EFER            __BIT(15)
                    282: #define                ENTRY_CTLS_LOAD_BNDCFGS         __BIT(16)
                    283: #define                ENTRY_CTLS_CONCEAL_PT           __BIT(17)
                    284: #define VMCS_ENTRY_MSR_LOAD_COUNT              0x00004014
                    285: #define VMCS_ENTRY_INTR_INFO                   0x00004016
                    286: #define                INTR_INFO_VECTOR                __BITS(7,0)
1.17      maxv      287: #define                INTR_INFO_TYPE                  __BITS(10,8)
                    288: #define                        INTR_TYPE_EXT_INT       0
                    289: #define                        INTR_TYPE_NMI           2
                    290: #define                        INTR_TYPE_HW_EXC        3
                    291: #define                        INTR_TYPE_SW_INT        4
                    292: #define                        INTR_TYPE_PRIV_SW_EXC   5
                    293: #define                        INTR_TYPE_SW_EXC        6
                    294: #define                        INTR_TYPE_OTHER         7
1.1       maxv      295: #define                INTR_INFO_ERROR                 __BIT(11)
                    296: #define                INTR_INFO_VALID                 __BIT(31)
                    297: #define VMCS_ENTRY_EXCEPTION_ERROR             0x00004018
                    298: #define VMCS_ENTRY_INST_LENGTH                 0x0000401A
                    299: #define VMCS_TPR_THRESHOLD                     0x0000401C
                    300: #define VMCS_PROCBASED_CTLS2                   0x0000401E
                    301: #define                PROC_CTLS2_VIRT_APIC_ACCESSES   __BIT(0)
                    302: #define                PROC_CTLS2_ENABLE_EPT           __BIT(1)
                    303: #define                PROC_CTLS2_DESC_TABLE_EXITING   __BIT(2)
                    304: #define                PROC_CTLS2_ENABLE_RDTSCP        __BIT(3)
                    305: #define                PROC_CTLS2_VIRT_X2APIC          __BIT(4)
                    306: #define                PROC_CTLS2_ENABLE_VPID          __BIT(5)
                    307: #define                PROC_CTLS2_WBINVD_EXITING       __BIT(6)
                    308: #define                PROC_CTLS2_UNRESTRICTED_GUEST   __BIT(7)
                    309: #define                PROC_CTLS2_APIC_REG_VIRT        __BIT(8)
                    310: #define                PROC_CTLS2_VIRT_INT_DELIVERY    __BIT(9)
                    311: #define                PROC_CTLS2_PAUSE_LOOP_EXITING   __BIT(10)
                    312: #define                PROC_CTLS2_RDRAND_EXITING       __BIT(11)
                    313: #define                PROC_CTLS2_INVPCID_ENABLE       __BIT(12)
                    314: #define                PROC_CTLS2_VMFUNC_ENABLE        __BIT(13)
                    315: #define                PROC_CTLS2_VMCS_SHADOWING       __BIT(14)
                    316: #define                PROC_CTLS2_ENCLS_EXITING        __BIT(15)
                    317: #define                PROC_CTLS2_RDSEED_EXITING       __BIT(16)
                    318: #define                PROC_CTLS2_PML_ENABLE           __BIT(17)
                    319: #define                PROC_CTLS2_EPT_VIOLATION        __BIT(18)
                    320: #define                PROC_CTLS2_CONCEAL_VMX_FROM_PT  __BIT(19)
                    321: #define                PROC_CTLS2_XSAVES_ENABLE        __BIT(20)
                    322: #define                PROC_CTLS2_MODE_BASED_EXEC_EPT  __BIT(22)
1.22      maxv      323: #define                PROC_CTLS2_SUBPAGE_PERMISSIONS  __BIT(23)
1.1       maxv      324: #define                PROC_CTLS2_USE_TSC_SCALING      __BIT(25)
1.22      maxv      325: #define                PROC_CTLS2_ENCLV_EXITING        __BIT(28)
1.1       maxv      326: #define VMCS_PLE_GAP                           0x00004020
                    327: #define VMCS_PLE_WINDOW                                0x00004022
                    328: /* 32-bit read-only data fields */
                    329: #define VMCS_INSTRUCTION_ERROR                 0x00004400
                    330: #define VMCS_EXIT_REASON                       0x00004402
                    331: #define VMCS_EXIT_INTR_INFO                    0x00004404
                    332: #define VMCS_EXIT_INTR_ERRCODE                 0x00004406
                    333: #define VMCS_IDT_VECTORING_INFO                        0x00004408
                    334: #define VMCS_IDT_VECTORING_ERROR               0x0000440A
                    335: #define VMCS_EXIT_INSTRUCTION_LENGTH           0x0000440C
                    336: #define VMCS_EXIT_INSTRUCTION_INFO             0x0000440E
                    337: /* 32-bit guest-state fields */
                    338: #define VMCS_GUEST_ES_LIMIT                    0x00004800
                    339: #define VMCS_GUEST_CS_LIMIT                    0x00004802
                    340: #define VMCS_GUEST_SS_LIMIT                    0x00004804
                    341: #define VMCS_GUEST_DS_LIMIT                    0x00004806
                    342: #define VMCS_GUEST_FS_LIMIT                    0x00004808
                    343: #define VMCS_GUEST_GS_LIMIT                    0x0000480A
                    344: #define VMCS_GUEST_LDTR_LIMIT                  0x0000480C
                    345: #define VMCS_GUEST_TR_LIMIT                    0x0000480E
                    346: #define VMCS_GUEST_GDTR_LIMIT                  0x00004810
                    347: #define VMCS_GUEST_IDTR_LIMIT                  0x00004812
                    348: #define VMCS_GUEST_ES_ACCESS_RIGHTS            0x00004814
                    349: #define VMCS_GUEST_CS_ACCESS_RIGHTS            0x00004816
                    350: #define VMCS_GUEST_SS_ACCESS_RIGHTS            0x00004818
                    351: #define VMCS_GUEST_DS_ACCESS_RIGHTS            0x0000481A
                    352: #define VMCS_GUEST_FS_ACCESS_RIGHTS            0x0000481C
                    353: #define VMCS_GUEST_GS_ACCESS_RIGHTS            0x0000481E
                    354: #define VMCS_GUEST_LDTR_ACCESS_RIGHTS          0x00004820
                    355: #define VMCS_GUEST_TR_ACCESS_RIGHTS            0x00004822
                    356: #define VMCS_GUEST_INTERRUPTIBILITY            0x00004824
                    357: #define                INT_STATE_STI                   __BIT(0)
                    358: #define                INT_STATE_MOVSS                 __BIT(1)
                    359: #define                INT_STATE_SMI                   __BIT(2)
                    360: #define                INT_STATE_NMI                   __BIT(3)
                    361: #define                INT_STATE_ENCLAVE               __BIT(4)
                    362: #define VMCS_GUEST_ACTIVITY                    0x00004826
                    363: #define VMCS_GUEST_SMBASE                      0x00004828
                    364: #define VMCS_GUEST_IA32_SYSENTER_CS            0x0000482A
                    365: #define VMCS_PREEMPTION_TIMER_VALUE            0x0000482E
                    366: /* 32-bit host state fields */
                    367: #define VMCS_HOST_IA32_SYSENTER_CS             0x00004C00
                    368: /* Natural-Width control fields */
                    369: #define VMCS_CR0_MASK                          0x00006000
                    370: #define VMCS_CR4_MASK                          0x00006002
                    371: #define VMCS_CR0_SHADOW                                0x00006004
                    372: #define VMCS_CR4_SHADOW                                0x00006006
                    373: #define VMCS_CR3_TARGET0                       0x00006008
                    374: #define VMCS_CR3_TARGET1                       0x0000600A
                    375: #define VMCS_CR3_TARGET2                       0x0000600C
                    376: #define VMCS_CR3_TARGET3                       0x0000600E
                    377: /* Natural-Width read-only fields */
                    378: #define VMCS_EXIT_QUALIFICATION                        0x00006400
                    379: #define VMCS_IO_RCX                            0x00006402
                    380: #define VMCS_IO_RSI                            0x00006404
                    381: #define VMCS_IO_RDI                            0x00006406
                    382: #define VMCS_IO_RIP                            0x00006408
                    383: #define VMCS_GUEST_LINEAR_ADDRESS              0x0000640A
                    384: /* Natural-Width guest-state fields */
                    385: #define VMCS_GUEST_CR0                         0x00006800
                    386: #define VMCS_GUEST_CR3                         0x00006802
                    387: #define VMCS_GUEST_CR4                         0x00006804
                    388: #define VMCS_GUEST_ES_BASE                     0x00006806
                    389: #define VMCS_GUEST_CS_BASE                     0x00006808
                    390: #define VMCS_GUEST_SS_BASE                     0x0000680A
                    391: #define VMCS_GUEST_DS_BASE                     0x0000680C
                    392: #define VMCS_GUEST_FS_BASE                     0x0000680E
                    393: #define VMCS_GUEST_GS_BASE                     0x00006810
                    394: #define VMCS_GUEST_LDTR_BASE                   0x00006812
                    395: #define VMCS_GUEST_TR_BASE                     0x00006814
                    396: #define VMCS_GUEST_GDTR_BASE                   0x00006816
                    397: #define VMCS_GUEST_IDTR_BASE                   0x00006818
                    398: #define VMCS_GUEST_DR7                         0x0000681A
                    399: #define VMCS_GUEST_RSP                         0x0000681C
                    400: #define VMCS_GUEST_RIP                         0x0000681E
                    401: #define VMCS_GUEST_RFLAGS                      0x00006820
                    402: #define VMCS_GUEST_PENDING_DBG_EXCEPTIONS      0x00006822
                    403: #define VMCS_GUEST_IA32_SYSENTER_ESP           0x00006824
                    404: #define VMCS_GUEST_IA32_SYSENTER_EIP           0x00006826
/*
 * Natural-Width host-state fields, loaded by the CPU on VM-exit.
 * Encodings 0x6C00-0x6C16 per the Intel SDM field-encoding scheme.
 */
#define VMCS_HOST_CR0				0x00006C00
#define VMCS_HOST_CR3				0x00006C02
#define VMCS_HOST_CR4				0x00006C04
#define VMCS_HOST_FS_BASE			0x00006C06
#define VMCS_HOST_GS_BASE			0x00006C08
#define VMCS_HOST_TR_BASE			0x00006C0A
#define VMCS_HOST_GDTR_BASE			0x00006C0C
#define VMCS_HOST_IDTR_BASE			0x00006C0E
#define VMCS_HOST_IA32_SYSENTER_ESP		0x00006C10
#define VMCS_HOST_IA32_SYSENTER_EIP		0x00006C12
#define VMCS_HOST_RSP				0x00006C14
#define VMCS_HOST_RIP				0x00006c16
                    418:
/*
 * VMX basic exit reasons (low 16 bits of the VMCS exit-reason field).
 * Values 35, 38 and 42 are reserved and intentionally absent.
 */
#define VMCS_EXITCODE_EXC_NMI			0
#define VMCS_EXITCODE_EXT_INT			1
#define VMCS_EXITCODE_SHUTDOWN			2
#define VMCS_EXITCODE_INIT			3
#define VMCS_EXITCODE_SIPI			4
#define VMCS_EXITCODE_SMI			5
#define VMCS_EXITCODE_OTHER_SMI			6
#define VMCS_EXITCODE_INT_WINDOW		7
#define VMCS_EXITCODE_NMI_WINDOW		8
#define VMCS_EXITCODE_TASK_SWITCH		9
#define VMCS_EXITCODE_CPUID			10
#define VMCS_EXITCODE_GETSEC			11
#define VMCS_EXITCODE_HLT			12
#define VMCS_EXITCODE_INVD			13
#define VMCS_EXITCODE_INVLPG			14
#define VMCS_EXITCODE_RDPMC			15
#define VMCS_EXITCODE_RDTSC			16
#define VMCS_EXITCODE_RSM			17
#define VMCS_EXITCODE_VMCALL			18
#define VMCS_EXITCODE_VMCLEAR			19
#define VMCS_EXITCODE_VMLAUNCH			20
#define VMCS_EXITCODE_VMPTRLD			21
#define VMCS_EXITCODE_VMPTRST			22
#define VMCS_EXITCODE_VMREAD			23
#define VMCS_EXITCODE_VMRESUME			24
#define VMCS_EXITCODE_VMWRITE			25
#define VMCS_EXITCODE_VMXOFF			26
#define VMCS_EXITCODE_VMXON			27
#define VMCS_EXITCODE_CR			28
#define VMCS_EXITCODE_DR			29
#define VMCS_EXITCODE_IO			30
#define VMCS_EXITCODE_RDMSR			31
#define VMCS_EXITCODE_WRMSR			32
#define VMCS_EXITCODE_FAIL_GUEST_INVALID	33
#define VMCS_EXITCODE_FAIL_MSR_INVALID		34
						/* 35 is reserved */
#define VMCS_EXITCODE_MWAIT			36
#define VMCS_EXITCODE_TRAP_FLAG			37
						/* 38 is reserved */
#define VMCS_EXITCODE_MONITOR			39
#define VMCS_EXITCODE_PAUSE			40
#define VMCS_EXITCODE_FAIL_MACHINE_CHECK	41
						/* 42 is reserved */
#define VMCS_EXITCODE_TPR_BELOW			43
#define VMCS_EXITCODE_APIC_ACCESS		44
#define VMCS_EXITCODE_VEOI			45
#define VMCS_EXITCODE_GDTR_IDTR			46
#define VMCS_EXITCODE_LDTR_TR			47
#define VMCS_EXITCODE_EPT_VIOLATION		48
#define VMCS_EXITCODE_EPT_MISCONFIG		49
#define VMCS_EXITCODE_INVEPT			50
#define VMCS_EXITCODE_RDTSCP			51
#define VMCS_EXITCODE_PREEMPT_TIMEOUT		52
#define VMCS_EXITCODE_INVVPID			53
#define VMCS_EXITCODE_WBINVD			54
#define VMCS_EXITCODE_XSETBV			55
#define VMCS_EXITCODE_APIC_WRITE		56
#define VMCS_EXITCODE_RDRAND			57
#define VMCS_EXITCODE_INVPCID			58
#define VMCS_EXITCODE_VMFUNC			59
#define VMCS_EXITCODE_ENCLS			60
#define VMCS_EXITCODE_RDSEED			61
#define VMCS_EXITCODE_PAGE_LOG_FULL		62
#define VMCS_EXITCODE_XSAVES			63
#define VMCS_EXITCODE_XRSTORS			64
                    482:
                    483: /* -------------------------------------------------------------------------- */
                    484:
/*
 * Slot indices in the guest MSR save/load list (struct msr_entry array).
 * The exit list always holds VMX_MSRLIST_EXIT_NMSR entries; L1DFLUSH
 * shares slot 5 and is only appended to the entry list (see below).
 */
#define VMX_MSRLIST_STAR		0
#define VMX_MSRLIST_LSTAR		1
#define VMX_MSRLIST_CSTAR		2
#define VMX_MSRLIST_SFMASK		3
#define VMX_MSRLIST_KERNELGSBASE	4
#define VMX_MSRLIST_EXIT_NMSR		5
#define VMX_MSRLIST_L1DFLUSH		5

/* On entry, we may do +1 to include L1DFLUSH. */
static size_t vmx_msrlist_entry_nmsr __read_mostly = VMX_MSRLIST_EXIT_NMSR;
                    495:
/*
 * VMXON region: one page handed to the VMXON instruction. Software only
 * initializes the revision identifier; the rest is CPU-owned.
 */
struct vmxon {
	uint32_t ident;
#define VMXON_IDENT_REVISION	__BITS(30,0)

	uint8_t data[PAGE_SIZE - 4];
} __packed;

CTASSERT(sizeof(struct vmxon) == PAGE_SIZE);
                    504:
/* Per-CPU VMXON region, tracked as a VA/PA pair. */
struct vmxoncpu {
	vaddr_t va;
	paddr_t pa;
};

/* One VMXON region per possible CPU. */
static struct vmxoncpu vmxoncpu[MAXCPUS];
                    511:
/*
 * VMCS region: one page. Software touches only the revision identifier
 * and the abort indicator; the data area has a CPU-private layout and is
 * accessed through VMREAD/VMWRITE only.
 */
struct vmcs {
	uint32_t ident;
#define VMCS_IDENT_REVISION	__BITS(30,0)
#define VMCS_IDENT_SHADOW	__BIT(31)

	uint32_t abort;
	uint8_t data[PAGE_SIZE - 8];
} __packed;

CTASSERT(sizeof(struct vmcs) == PAGE_SIZE);
                    522:
/* Entry format of the VM-entry/VM-exit MSR load/store lists. */
struct msr_entry {
	uint32_t msr;	/* MSR index */
	uint32_t rsvd;	/* reserved */
	uint64_t val;	/* MSR value */
} __packed;
                    528:
/* INVEPT descriptor: EPT pointer plus a must-be-zero quadword. */
struct ept_desc {
	uint64_t eptp;
	uint64_t mbz;
} __packed;
                    533:
/* INVVPID descriptor: VPID plus a linear address. */
struct vpid_desc {
	uint64_t vpid;
	uint64_t addr;
} __packed;

#define VPID_MAX	0xFFFF

/* Make sure we never run out of VPIDs. */
CTASSERT(VPID_MAX-1 >= NVMM_MAX_MACHINES * NVMM_MAX_VCPUS);
                    543:
/*
 * Flush scopes for INVVPID/INVEPT and the EPTP memory type, chosen at
 * initialization time from the CPU's capabilities.
 */
static uint64_t vmx_tlb_flush_op __read_mostly;
static uint64_t vmx_ept_flush_op __read_mostly;
static uint64_t vmx_eptp_type __read_mostly;

/* Values programmed into the VMX execution/entry/exit control fields. */
static uint64_t vmx_pinbased_ctls __read_mostly;
static uint64_t vmx_procbased_ctls __read_mostly;
static uint64_t vmx_procbased_ctls2 __read_mostly;
static uint64_t vmx_entry_ctls __read_mostly;
static uint64_t vmx_exit_ctls __read_mostly;

/*
 * CR0/CR4 fixed-bit masks — presumably read from the
 * IA32_VMX_CR{0,4}_FIXED{0,1} MSRs at init; confirm against the init code.
 */
static uint64_t vmx_cr0_fixed0 __read_mostly;
static uint64_t vmx_cr0_fixed1 __read_mostly;
static uint64_t vmx_cr4_fixed0 __read_mostly;
static uint64_t vmx_cr4_fixed1 __read_mostly;
                    558:
/* Provided by the x86 pmap: whether EPT Accessed/Dirty bits are usable. */
extern bool pmap_ept_has_ad;

/*
 * For each VMX control field, the bits this driver requires to be set
 * (_ONE) and the bits it requires to be clear (_ZERO). These are checked
 * against the capability MSRs when the driver initializes.
 */
#define VMX_PINBASED_CTLS_ONE	\
	(PIN_CTLS_INT_EXITING| \
	 PIN_CTLS_NMI_EXITING| \
	 PIN_CTLS_VIRTUAL_NMIS)

#define VMX_PINBASED_CTLS_ZERO	0

#define VMX_PROCBASED_CTLS_ONE	\
	(PROC_CTLS_USE_TSC_OFFSETTING| \
	 PROC_CTLS_HLT_EXITING| \
	 PROC_CTLS_MWAIT_EXITING | \
	 PROC_CTLS_RDPMC_EXITING | \
	 PROC_CTLS_RCR8_EXITING | \
	 PROC_CTLS_LCR8_EXITING | \
	 PROC_CTLS_UNCOND_IO_EXITING | /* no I/O bitmap */ \
	 PROC_CTLS_USE_MSR_BITMAPS | \
	 PROC_CTLS_MONITOR_EXITING | \
	 PROC_CTLS_ACTIVATE_CTLS2)

#define VMX_PROCBASED_CTLS_ZERO	\
	(PROC_CTLS_RCR3_EXITING| \
	 PROC_CTLS_LCR3_EXITING)

#define VMX_PROCBASED_CTLS2_ONE	\
	(PROC_CTLS2_ENABLE_EPT| \
	 PROC_CTLS2_ENABLE_VPID| \
	 PROC_CTLS2_UNRESTRICTED_GUEST)

#define VMX_PROCBASED_CTLS2_ZERO	0

#define VMX_ENTRY_CTLS_ONE	\
	(ENTRY_CTLS_LOAD_DEBUG_CONTROLS| \
	 ENTRY_CTLS_LOAD_EFER| \
	 ENTRY_CTLS_LOAD_PAT)

#define VMX_ENTRY_CTLS_ZERO	\
	(ENTRY_CTLS_SMM| \
	 ENTRY_CTLS_DISABLE_DUAL)

#define VMX_EXIT_CTLS_ONE	\
	(EXIT_CTLS_SAVE_DEBUG_CONTROLS| \
	 EXIT_CTLS_HOST_LONG_MODE| \
	 EXIT_CTLS_SAVE_PAT| \
	 EXIT_CTLS_LOAD_PAT| \
	 EXIT_CTLS_SAVE_EFER| \
	 EXIT_CTLS_LOAD_EFER)

#define VMX_EXIT_CTLS_ZERO	0

/* ASID (VPID) allocation map, protected by vmx_asidlock. */
static uint8_t *vmx_asidmap __read_mostly;
static uint32_t vmx_maxasid __read_mostly;
static kmutex_t vmx_asidlock __cacheline_aligned;
                    613:
/* XCR0 features always exposed to the guest; actual mask set at init. */
#define VMX_XCR0_MASK_DEFAULT	(XCR0_X87|XCR0_SSE)
static uint64_t vmx_xcr0_mask __read_mostly;

/* Maximum number of configurable CPUID leaves per machine. */
#define VMX_NCPUIDS	32

#define VMCS_NPAGES	1
#define VMCS_SIZE	(VMCS_NPAGES * PAGE_SIZE)

#define MSRBM_NPAGES	1
#define MSRBM_SIZE	(MSRBM_NPAGES * PAGE_SIZE)

/*
 * Bits whose modification by the guest requires a guest TLB flush —
 * presumably consumed by the CR/MSR write handlers; verify there.
 */
#define EFER_TLB_FLUSH \
	(EFER_NXE|EFER_LMA|EFER_LME)
#define CR0_TLB_FLUSH \
	(CR0_PG|CR0_WP|CR0_CD|CR0_NW)
#define CR4_TLB_FLUSH \
	(CR4_PGE|CR4_PAE|CR4_PSE)
                    631:
                    632: /* -------------------------------------------------------------------------- */
                    633:
/* Per-machine (VM-wide) backend data. */
struct vmx_machdata {
	bool cpuidpresent[VMX_NCPUIDS];			/* valid cpuid[] slots */
	struct nvmm_x86_conf_cpuid cpuid[VMX_NCPUIDS];	/* CPUID overrides */
	volatile uint64_t mach_htlb_gen;	/* host TLB generation (cf. vcpu_htlb_gen) */
};

/* Size of each machine configuration payload, indexed by conf op. */
static const size_t vmx_conf_sizes[NVMM_X86_NCONF] = {
	[NVMM_X86_CONF_CPUID] = sizeof(struct nvmm_x86_conf_cpuid)
};
                    643:
/* Per-vCPU backend data. */
struct vmx_cpudata {
	/* General */
	uint64_t asid;			/* VPID allocated to this vCPU */
	bool gtlb_want_flush;		/* guest TLB flush wanted before entry */
	bool gtsc_want_update;		/* guest TSC offset needs refresh */
	uint64_t vcpu_htlb_gen;		/* last host TLB generation seen */
	kcpuset_t *htlb_want_flush;	/* CPUs needing a host TLB flush */

	/* VMCS */
	struct vmcs *vmcs;		/* the VMCS page (VA) */
	paddr_t vmcs_pa;		/* its physical address */
	size_t vmcs_refcnt;		/* vmx_vmcs_enter() nesting count */
	struct cpu_info *vmcs_ci;	/* CPU the VMCS was last active on */
	bool vmcs_launched;		/* true: VMRESUME, false: VMLAUNCH */

	/* MSR bitmap */
	uint8_t *msrbm;
	paddr_t msrbm_pa;

	/* Host state, saved while the guest runs. */
	uint64_t hxcr0;
	uint64_t star;
	uint64_t lstar;
	uint64_t cstar;
	uint64_t sfmask;
	uint64_t kernelgsbase;
	bool ts_set;
	struct xsave_header hfpu __aligned(64);

	/* Intr state */
	bool int_window_exit;	/* interrupt-window exiting armed */
	bool nmi_window_exit;	/* NMI-window exiting armed */
	bool evt_pending;	/* an event was queued for injection */

	/* Guest state, not otherwise held in the VMCS. */
	struct msr_entry *gmsr;		/* MSR save/load list */
	paddr_t gmsr_pa;
	uint64_t gmsr_misc_enable;
	uint64_t gcr2;
	uint64_t gcr8;
	uint64_t gxcr0;
	uint64_t gprs[NVMM_X64_NGPR];
	uint64_t drs[NVMM_X64_NDR];
	uint64_t gtsc;
	struct xsave_header gfpu __aligned(64);
};
                    690:
/*
 * Map from NVMM segment index to the four VMCS fields (selector, access
 * rights, limit, base) describing that segment. GDT/IDT have no selector
 * or attributes, so those entries are zero.
 */
static const struct {
	uint64_t selector;
	uint64_t attrib;
	uint64_t limit;
	uint64_t base;
} vmx_guest_segs[NVMM_X64_NSEG] = {
	[NVMM_X64_SEG_ES] = {
		VMCS_GUEST_ES_SELECTOR,
		VMCS_GUEST_ES_ACCESS_RIGHTS,
		VMCS_GUEST_ES_LIMIT,
		VMCS_GUEST_ES_BASE
	},
	[NVMM_X64_SEG_CS] = {
		VMCS_GUEST_CS_SELECTOR,
		VMCS_GUEST_CS_ACCESS_RIGHTS,
		VMCS_GUEST_CS_LIMIT,
		VMCS_GUEST_CS_BASE
	},
	[NVMM_X64_SEG_SS] = {
		VMCS_GUEST_SS_SELECTOR,
		VMCS_GUEST_SS_ACCESS_RIGHTS,
		VMCS_GUEST_SS_LIMIT,
		VMCS_GUEST_SS_BASE
	},
	[NVMM_X64_SEG_DS] = {
		VMCS_GUEST_DS_SELECTOR,
		VMCS_GUEST_DS_ACCESS_RIGHTS,
		VMCS_GUEST_DS_LIMIT,
		VMCS_GUEST_DS_BASE
	},
	[NVMM_X64_SEG_FS] = {
		VMCS_GUEST_FS_SELECTOR,
		VMCS_GUEST_FS_ACCESS_RIGHTS,
		VMCS_GUEST_FS_LIMIT,
		VMCS_GUEST_FS_BASE
	},
	[NVMM_X64_SEG_GS] = {
		VMCS_GUEST_GS_SELECTOR,
		VMCS_GUEST_GS_ACCESS_RIGHTS,
		VMCS_GUEST_GS_LIMIT,
		VMCS_GUEST_GS_BASE
	},
	[NVMM_X64_SEG_GDT] = {
		0, /* doesn't exist */
		0, /* doesn't exist */
		VMCS_GUEST_GDTR_LIMIT,
		VMCS_GUEST_GDTR_BASE
	},
	[NVMM_X64_SEG_IDT] = {
		0, /* doesn't exist */
		0, /* doesn't exist */
		VMCS_GUEST_IDTR_LIMIT,
		VMCS_GUEST_IDTR_BASE
	},
	[NVMM_X64_SEG_LDT] = {
		VMCS_GUEST_LDTR_SELECTOR,
		VMCS_GUEST_LDTR_ACCESS_RIGHTS,
		VMCS_GUEST_LDTR_LIMIT,
		VMCS_GUEST_LDTR_BASE
	},
	[NVMM_X64_SEG_TR] = {
		VMCS_GUEST_TR_SELECTOR,
		VMCS_GUEST_TR_ACCESS_RIGHTS,
		VMCS_GUEST_TR_LIMIT,
		VMCS_GUEST_TR_BASE
	}
};
                    758:
                    759: /* -------------------------------------------------------------------------- */
                    760:
                    761: static uint64_t
                    762: vmx_get_revision(void)
                    763: {
                    764:        uint64_t msr;
                    765:
                    766:        msr = rdmsr(MSR_IA32_VMX_BASIC);
                    767:        msr &= IA32_VMX_BASIC_IDENT;
                    768:
                    769:        return msr;
                    770: }
                    771:
                    772: static void
1.19      maxv      773: vmx_vmclear_ipi(void *arg1, void *arg2)
                    774: {
                    775:        paddr_t vmcs_pa = (paddr_t)arg1;
                    776:        vmx_vmclear(&vmcs_pa);
                    777: }
                    778:
/*
 * VMCLEAR a VMCS that is currently active on a remote CPU, by sending a
 * high-priority cross-call to that CPU and waiting for it to complete.
 * Waiting requires preemption to be enabled, so we temporarily re-enable
 * it; the LWP is bound first so we stay on this CPU in the meantime.
 */
static void
vmx_vmclear_remote(struct cpu_info *ci, paddr_t vmcs_pa)
{
	uint64_t xc;
	int bound;

	KASSERT(kpreempt_disabled());

	bound = curlwp_bind();
	kpreempt_enable();

	xc = xc_unicast(XC_HIGHPRI, vmx_vmclear_ipi, (void *)vmcs_pa, NULL, ci);
	xc_wait(xc);

	/* Restore the state the caller expects: preemption disabled. */
	kpreempt_disable();
	curlwp_bindx(bound);
}
                    796:
/*
 * Make this vCPU's VMCS current on the local CPU, leaving preemption
 * disabled until the matching vmx_vmcs_leave()/vmx_vmcs_destroy().
 * Calls may nest (vmcs_refcnt); only the outermost call migrates the
 * VMCS and executes VMPTRLD.
 */
static void
vmx_vmcs_enter(struct nvmm_cpu *vcpu)
{
	struct vmx_cpudata *cpudata = vcpu->cpudata;
	struct cpu_info *vmcs_ci;
	paddr_t oldpa __diagused;

	cpudata->vmcs_refcnt++;
	if (cpudata->vmcs_refcnt > 1) {
#ifdef DIAGNOSTIC
		/* Nested call: the VMCS must already be the current one. */
		KASSERT(kpreempt_disabled());
		vmx_vmptrst(&oldpa);
		KASSERT(oldpa == cpudata->vmcs_pa);
#endif
		return;
	}

	/*
	 * Take the CPU the VMCS was last active on, and poison the field
	 * so a stale read before vmx_vmcs_leave() rewrites it is caught.
	 */
	vmcs_ci = cpudata->vmcs_ci;
	cpudata->vmcs_ci = (void *)0x00FFFFFFFFFFFFFF; /* clobber */

	kpreempt_disable();

	if (vmcs_ci == NULL) {
		/* This VMCS is loaded for the first time. */
		vmx_vmclear(&cpudata->vmcs_pa);
		cpudata->vmcs_launched = false;
	} else if (vmcs_ci != curcpu()) {
		/* This VMCS is active on a remote CPU. */
		vmx_vmclear_remote(vmcs_ci, cpudata->vmcs_pa);
		cpudata->vmcs_launched = false;
	} else {
		/* This VMCS is active on curcpu, nothing to do. */
	}

	vmx_vmptrld(&cpudata->vmcs_pa);
}
                    833:
/*
 * Drop one reference taken by vmx_vmcs_enter(). On the last reference,
 * record curcpu() as the CPU where the VMCS remains active (it is NOT
 * vmcleared, so a later enter on the same CPU is cheap) and re-enable
 * preemption.
 */
static void
vmx_vmcs_leave(struct nvmm_cpu *vcpu)
{
	struct vmx_cpudata *cpudata = vcpu->cpudata;
	paddr_t oldpa __diagused;

	KASSERT(kpreempt_disabled());
#ifdef DIAGNOSTIC
	/* The current VMCS must be ours. */
	vmx_vmptrst(&oldpa);
	KASSERT(oldpa == cpudata->vmcs_pa);
#endif
	KASSERT(cpudata->vmcs_refcnt > 0);
	cpudata->vmcs_refcnt--;

	if (cpudata->vmcs_refcnt > 0) {
		/* Still nested; keep preemption disabled. */
		return;
	}

	cpudata->vmcs_ci = curcpu();
	kpreempt_enable();
}
                    855:
/*
 * Teardown counterpart of vmx_vmcs_enter(): drop the last (and only)
 * reference and VMCLEAR the VMCS so it is no longer active anywhere,
 * then re-enable preemption.
 */
static void
vmx_vmcs_destroy(struct nvmm_cpu *vcpu)
{
	struct vmx_cpudata *cpudata = vcpu->cpudata;
	paddr_t oldpa __diagused;

	KASSERT(kpreempt_disabled());
#ifdef DIAGNOSTIC
	/* The current VMCS must be ours. */
	vmx_vmptrst(&oldpa);
	KASSERT(oldpa == cpudata->vmcs_pa);
#endif
	KASSERT(cpudata->vmcs_refcnt == 1);
	cpudata->vmcs_refcnt--;

	vmx_vmclear(&cpudata->vmcs_pa);
	kpreempt_enable();
}
                    873:
                    874: /* -------------------------------------------------------------------------- */
                    875:
                    876: static void
                    877: vmx_event_waitexit_enable(struct nvmm_cpu *vcpu, bool nmi)
                    878: {
                    879:        struct vmx_cpudata *cpudata = vcpu->cpudata;
                    880:        uint64_t ctls1;
                    881:
                    882:        vmx_vmread(VMCS_PROCBASED_CTLS, &ctls1);
                    883:
                    884:        if (nmi) {
                    885:                // XXX INT_STATE_NMI?
                    886:                ctls1 |= PROC_CTLS_NMI_WINDOW_EXITING;
                    887:                cpudata->nmi_window_exit = true;
                    888:        } else {
                    889:                ctls1 |= PROC_CTLS_INT_WINDOW_EXITING;
                    890:                cpudata->int_window_exit = true;
                    891:        }
                    892:
                    893:        vmx_vmwrite(VMCS_PROCBASED_CTLS, ctls1);
                    894: }
                    895:
                    896: static void
                    897: vmx_event_waitexit_disable(struct nvmm_cpu *vcpu, bool nmi)
                    898: {
                    899:        struct vmx_cpudata *cpudata = vcpu->cpudata;
                    900:        uint64_t ctls1;
                    901:
                    902:        vmx_vmread(VMCS_PROCBASED_CTLS, &ctls1);
                    903:
                    904:        if (nmi) {
                    905:                ctls1 &= ~PROC_CTLS_NMI_WINDOW_EXITING;
                    906:                cpudata->nmi_window_exit = false;
                    907:        } else {
                    908:                ctls1 &= ~PROC_CTLS_INT_WINDOW_EXITING;
                    909:                cpudata->int_window_exit = false;
                    910:        }
                    911:
                    912:        vmx_vmwrite(VMCS_PROCBASED_CTLS, ctls1);
                    913: }
                    914:
                    915: static inline int
                    916: vmx_event_has_error(uint64_t vector)
                    917: {
                    918:        switch (vector) {
                    919:        case 8:         /* #DF */
                    920:        case 10:        /* #TS */
                    921:        case 11:        /* #NP */
                    922:        case 12:        /* #SS */
                    923:        case 13:        /* #GP */
                    924:        case 14:        /* #PF */
                    925:        case 17:        /* #AC */
                    926:        case 30:        /* #SX */
                    927:                return 1;
                    928:        default:
                    929:                return 0;
                    930:        }
                    931: }
                    932:
/*
 * Queue an event (hardware interrupt, NMI or exception) for injection
 * into the guest, by programming the VM-entry interruption-information
 * field of the VMCS.
 *
 * Returns 0 on success; EINVAL for vectors/types we refuse to inject;
 * EAGAIN when the guest cannot take the event right now — in that case
 * an interrupt/NMI window exit is armed so the caller can retry later.
 */
static int
vmx_vcpu_inject(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
    struct nvmm_event *event)
{
	struct vmx_cpudata *cpudata = vcpu->cpudata;
	int type = 0, err = 0, ret = 0;
	uint64_t info, intstate, rflags;

	if (event->vector >= 256) {
		return EINVAL;
	}

	vmx_vmcs_enter(vcpu);

	switch (event->type) {
	case NVMM_EVENT_INTERRUPT_HW:
		type = INTR_TYPE_EXT_INT;
		if (event->vector == 2) {
			/* Vector 2 is delivered as an NMI. */
			type = INTR_TYPE_NMI;
		}
		vmx_vmread(VMCS_GUEST_INTERRUPTIBILITY, &intstate);
		if (type == INTR_TYPE_NMI) {
			/* Only one NMI can be pending at a time. */
			if (cpudata->nmi_window_exit) {
				ret = EAGAIN;
				goto out;
			}
			vmx_event_waitexit_enable(vcpu, true);
		} else {
			/*
			 * Interrupts are blocked if IF is clear, or if the
			 * guest is in an STI/MOV-SS interrupt shadow.
			 */
			vmx_vmread(VMCS_GUEST_RFLAGS, &rflags);
			if ((rflags & PSL_I) == 0 ||
			    (intstate & (INT_STATE_STI|INT_STATE_MOVSS)) != 0) {
				vmx_event_waitexit_enable(vcpu, false);
				ret = EAGAIN;
				goto out;
			}
		}
		err = 0;
		break;
	case NVMM_EVENT_INTERRUPT_SW:
		/* Software interrupt injection is not supported. */
		ret = EINVAL;
		goto out;
	case NVMM_EVENT_EXCEPTION:
		/* NMI (2) and vectors beyond the exception range rejected. */
		if (event->vector == 2 || event->vector >= 32) {
			ret = EINVAL;
			goto out;
		}
		/* Vectors 0 and 3 are rejected as well. */
		if (event->vector == 3 || event->vector == 0) {
			ret = EINVAL;
			goto out;
		}
		type = INTR_TYPE_HW_EXC;
		err = vmx_event_has_error(event->vector);
		break;
	default:
		ret = EAGAIN;
		goto out;
	}

	/* Assemble the VM-entry interruption-information field. */
	info =
	    __SHIFTIN(event->vector, INTR_INFO_VECTOR) |
	    __SHIFTIN(type, INTR_INFO_TYPE) |
	    __SHIFTIN(err, INTR_INFO_ERROR) |
	    __SHIFTIN(1, INTR_INFO_VALID);
	vmx_vmwrite(VMCS_ENTRY_INTR_INFO, info);
	vmx_vmwrite(VMCS_ENTRY_EXCEPTION_ERROR, event->u.error);

	cpudata->evt_pending = true;

out:
	vmx_vmcs_leave(vcpu);
	return ret;
}
                   1005:
                   1006: static void
                   1007: vmx_inject_ud(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
                   1008: {
                   1009:        struct nvmm_event event;
                   1010:        int ret __diagused;
                   1011:
                   1012:        event.type = NVMM_EVENT_EXCEPTION;
                   1013:        event.vector = 6;
                   1014:        event.u.error = 0;
                   1015:
                   1016:        ret = vmx_vcpu_inject(mach, vcpu, &event);
                   1017:        KASSERT(ret == 0);
                   1018: }
                   1019:
                   1020: static void
                   1021: vmx_inject_gp(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
                   1022: {
                   1023:        struct nvmm_event event;
                   1024:        int ret __diagused;
                   1025:
                   1026:        event.type = NVMM_EVENT_EXCEPTION;
                   1027:        event.vector = 13;
                   1028:        event.u.error = 0;
                   1029:
                   1030:        ret = vmx_vcpu_inject(mach, vcpu, &event);
                   1031:        KASSERT(ret == 0);
                   1032: }
                   1033:
                   1034: static inline void
                   1035: vmx_inkernel_advance(void)
                   1036: {
                   1037:        uint64_t rip, inslen, intstate;
                   1038:
                   1039:        /*
                   1040:         * Maybe we should also apply single-stepping and debug exceptions.
                   1041:         * Matters for guest-ring3, because it can execute 'cpuid' under a
                   1042:         * debugger.
                   1043:         */
                   1044:        vmx_vmread(VMCS_EXIT_INSTRUCTION_LENGTH, &inslen);
                   1045:        vmx_vmread(VMCS_GUEST_RIP, &rip);
                   1046:        vmx_vmwrite(VMCS_GUEST_RIP, rip + inslen);
                   1047:        vmx_vmread(VMCS_GUEST_INTERRUPTIBILITY, &intstate);
                   1048:        vmx_vmwrite(VMCS_GUEST_INTERRUPTIBILITY,
                   1049:            intstate & ~(INT_STATE_STI|INT_STATE_MOVSS));
                   1050: }
                   1051:
                   1052: static void
1.17      maxv     1053: vmx_exit_exc_nmi(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
                   1054:     struct nvmm_exit *exit)
                   1055: {
                   1056:        uint64_t qual;
                   1057:
                   1058:        vmx_vmread(VMCS_EXIT_INTR_INFO, &qual);
                   1059:
                   1060:        if ((qual & INTR_INFO_VALID) == 0) {
                   1061:                goto error;
                   1062:        }
                   1063:        if (__SHIFTOUT(qual, INTR_INFO_TYPE) != INTR_TYPE_NMI) {
                   1064:                goto error;
                   1065:        }
                   1066:
                   1067:        exit->reason = NVMM_EXIT_NONE;
                   1068:        return;
                   1069:
                   1070: error:
                   1071:        exit->reason = NVMM_EXIT_INVALID;
                   1072: }
                   1073:
/*
 * Post-process a CPUID result on behalf of the guest: mask out features
 * NVMM does not expose, and overwrite the fields that must be
 * guest-specific (APIC ID, hypervisor signature, XSAVE sizes).
 * Modifies cpudata->gprs in place. 'eax'/'ecx' are the leaf and
 * sub-leaf the guest requested.
 */
static void
vmx_inkernel_handle_cpuid(struct nvmm_cpu *vcpu, uint64_t eax, uint64_t ecx)
{
	struct vmx_cpudata *cpudata = vcpu->cpudata;
	uint64_t cr4;

	switch (eax) {
	case 0x00000001:
		/* Basic features: restrict to the NVMM-supported set. */
		cpudata->gprs[NVMM_X64_GPR_RAX] &= nvmm_cpuid_00000001.eax;

		/* Report the VCPU number as the local APIC ID. */
		cpudata->gprs[NVMM_X64_GPR_RBX] &= ~CPUID_LOCAL_APIC_ID;
		cpudata->gprs[NVMM_X64_GPR_RBX] |= __SHIFTIN(vcpu->cpuid,
		    CPUID_LOCAL_APIC_ID);

		cpudata->gprs[NVMM_X64_GPR_RCX] &= nvmm_cpuid_00000001.ecx;
		/* Always set the hypervisor-reserved RAZ bit. */
		cpudata->gprs[NVMM_X64_GPR_RCX] |= CPUID2_RAZ;

		cpudata->gprs[NVMM_X64_GPR_RDX] &= nvmm_cpuid_00000001.edx;

		/* CPUID2_OSXSAVE depends on CR4. */
		vmx_vmread(VMCS_GUEST_CR4, &cr4);
		if (!(cr4 & CR4_OSXSAVE)) {
			cpudata->gprs[NVMM_X64_GPR_RCX] &= ~CPUID2_OSXSAVE;
		}
		break;
	case 0x00000005:
	case 0x00000006:
		/* Leaves 5 and 6 are hidden: report them as all-zero. */
		cpudata->gprs[NVMM_X64_GPR_RAX] = 0;
		cpudata->gprs[NVMM_X64_GPR_RBX] = 0;
		cpudata->gprs[NVMM_X64_GPR_RCX] = 0;
		cpudata->gprs[NVMM_X64_GPR_RDX] = 0;
		break;
	case 0x00000007:
		/* Structured extended features: apply the NVMM masks. */
		cpudata->gprs[NVMM_X64_GPR_RAX] &= nvmm_cpuid_00000007.eax;
		cpudata->gprs[NVMM_X64_GPR_RBX] &= nvmm_cpuid_00000007.ebx;
		cpudata->gprs[NVMM_X64_GPR_RCX] &= nvmm_cpuid_00000007.ecx;
		cpudata->gprs[NVMM_X64_GPR_RDX] &= nvmm_cpuid_00000007.edx;
		break;
	case 0x0000000D:
		/* XSAVE enumeration. Nothing to report if XSAVE is off. */
		if (vmx_xcr0_mask == 0) {
			break;
		}
		switch (ecx) {
		case 0:
			/* Main leaf: supported XCR0 mask in RAX/RDX... */
			cpudata->gprs[NVMM_X64_GPR_RAX] = vmx_xcr0_mask & 0xFFFFFFFF;
			/* ...and the size of the current guest save area. */
			if (cpudata->gxcr0 & XCR0_SSE) {
				cpudata->gprs[NVMM_X64_GPR_RBX] = sizeof(struct fxsave);
			} else {
				cpudata->gprs[NVMM_X64_GPR_RBX] = sizeof(struct save87);
			}
			cpudata->gprs[NVMM_X64_GPR_RBX] += 64; /* XSAVE header */
			cpudata->gprs[NVMM_X64_GPR_RCX] = sizeof(struct fxsave);
			cpudata->gprs[NVMM_X64_GPR_RDX] = vmx_xcr0_mask >> 32;
			break;
		case 1:
			/* Sub-leaf 1: hide XSAVES from the guest. */
			cpudata->gprs[NVMM_X64_GPR_RAX] &= ~CPUID_PES1_XSAVES;
			break;
		}
		break;
	case 0x40000000:
		/* Hypervisor leaf: "___ NVMM ___" signature in EBX:ECX:EDX. */
		cpudata->gprs[NVMM_X64_GPR_RBX] = 0;
		cpudata->gprs[NVMM_X64_GPR_RCX] = 0;
		cpudata->gprs[NVMM_X64_GPR_RDX] = 0;
		memcpy(&cpudata->gprs[NVMM_X64_GPR_RBX], "___ ", 4);
		memcpy(&cpudata->gprs[NVMM_X64_GPR_RCX], "NVMM", 4);
		memcpy(&cpudata->gprs[NVMM_X64_GPR_RDX], " ___", 4);
		break;
	case 0x80000001:
		/* Extended features: apply the NVMM masks. */
		cpudata->gprs[NVMM_X64_GPR_RAX] &= nvmm_cpuid_80000001.eax;
		cpudata->gprs[NVMM_X64_GPR_RBX] &= nvmm_cpuid_80000001.ebx;
		cpudata->gprs[NVMM_X64_GPR_RCX] &= nvmm_cpuid_80000001.ecx;
		cpudata->gprs[NVMM_X64_GPR_RDX] &= nvmm_cpuid_80000001.edx;
		break;
	default:
		/* Other leaves are left as the host CPU reported them. */
		break;
	}
}
                   1151:
                   1152: static void
                   1153: vmx_exit_cpuid(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
                   1154:     struct nvmm_exit *exit)
                   1155: {
                   1156:        struct vmx_machdata *machdata = mach->machdata;
                   1157:        struct vmx_cpudata *cpudata = vcpu->cpudata;
                   1158:        struct nvmm_x86_conf_cpuid *cpuid;
                   1159:        uint64_t eax, ecx;
                   1160:        u_int descs[4];
                   1161:        size_t i;
                   1162:
                   1163:        eax = cpudata->gprs[NVMM_X64_GPR_RAX];
                   1164:        ecx = cpudata->gprs[NVMM_X64_GPR_RCX];
                   1165:        x86_cpuid2(eax, ecx, descs);
                   1166:
                   1167:        cpudata->gprs[NVMM_X64_GPR_RAX] = descs[0];
                   1168:        cpudata->gprs[NVMM_X64_GPR_RBX] = descs[1];
                   1169:        cpudata->gprs[NVMM_X64_GPR_RCX] = descs[2];
                   1170:        cpudata->gprs[NVMM_X64_GPR_RDX] = descs[3];
                   1171:
1.25    ! maxv     1172:        vmx_inkernel_handle_cpuid(vcpu, eax, ecx);
        !          1173:
1.1       maxv     1174:        for (i = 0; i < VMX_NCPUIDS; i++) {
                   1175:                cpuid = &machdata->cpuid[i];
                   1176:                if (!machdata->cpuidpresent[i]) {
                   1177:                        continue;
                   1178:                }
                   1179:                if (cpuid->leaf != eax) {
                   1180:                        continue;
                   1181:                }
                   1182:
                   1183:                /* del */
                   1184:                cpudata->gprs[NVMM_X64_GPR_RAX] &= ~cpuid->del.eax;
                   1185:                cpudata->gprs[NVMM_X64_GPR_RBX] &= ~cpuid->del.ebx;
                   1186:                cpudata->gprs[NVMM_X64_GPR_RCX] &= ~cpuid->del.ecx;
                   1187:                cpudata->gprs[NVMM_X64_GPR_RDX] &= ~cpuid->del.edx;
                   1188:
                   1189:                /* set */
                   1190:                cpudata->gprs[NVMM_X64_GPR_RAX] |= cpuid->set.eax;
                   1191:                cpudata->gprs[NVMM_X64_GPR_RBX] |= cpuid->set.ebx;
                   1192:                cpudata->gprs[NVMM_X64_GPR_RCX] |= cpuid->set.ecx;
                   1193:                cpudata->gprs[NVMM_X64_GPR_RDX] |= cpuid->set.edx;
                   1194:
                   1195:                break;
                   1196:        }
                   1197:
                   1198:        vmx_inkernel_advance();
                   1199:        exit->reason = NVMM_EXIT_NONE;
                   1200: }
                   1201:
                   1202: static void
                   1203: vmx_exit_hlt(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
                   1204:     struct nvmm_exit *exit)
                   1205: {
                   1206:        struct vmx_cpudata *cpudata = vcpu->cpudata;
                   1207:        uint64_t rflags;
                   1208:
                   1209:        if (cpudata->int_window_exit) {
                   1210:                vmx_vmread(VMCS_GUEST_RFLAGS, &rflags);
                   1211:                if (rflags & PSL_I) {
                   1212:                        vmx_event_waitexit_disable(vcpu, false);
                   1213:                }
                   1214:        }
                   1215:
                   1216:        vmx_inkernel_advance();
                   1217:        exit->reason = NVMM_EXIT_HALTED;
                   1218: }
                   1219:
                   1220: #define VMX_QUAL_CR_NUM                __BITS(3,0)
                   1221: #define VMX_QUAL_CR_TYPE       __BITS(5,4)
                   1222: #define                CR_TYPE_WRITE   0
                   1223: #define                CR_TYPE_READ    1
                   1224: #define                CR_TYPE_CLTS    2
                   1225: #define                CR_TYPE_LMSW    3
                   1226: #define VMX_QUAL_CR_LMSW_OPMEM __BIT(6)
                   1227: #define VMX_QUAL_CR_GPR                __BITS(11,8)
                   1228: #define VMX_QUAL_CR_LMSW_SRC   __BIT(31,16)
                   1229:
                   1230: static inline int
                   1231: vmx_check_cr(uint64_t crval, uint64_t fixed0, uint64_t fixed1)
                   1232: {
                   1233:        /* Bits set to 1 in fixed0 are fixed to 1. */
                   1234:        if ((crval & fixed0) != fixed0) {
                   1235:                return -1;
                   1236:        }
                   1237:        /* Bits set to 0 in fixed1 are fixed to 0. */
                   1238:        if (crval & ~fixed1) {
                   1239:                return -1;
                   1240:        }
                   1241:        return 0;
                   1242: }
                   1243:
/*
 * Emulate a guest write to %cr0 (MOV-to-CR0 only). Forces the bits
 * required for VMX operation, validates the result against the CPU's
 * fixed-bit constraints, and keeps EFER.LMA and the VM-entry long-mode
 * control in sync with CR0.PG. Returns 0 on success, -1 if the access
 * is unsupported or the value is invalid (the caller injects #GP).
 */
static int
vmx_inkernel_handle_cr0(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
    uint64_t qual)
{
	struct vmx_cpudata *cpudata = vcpu->cpudata;
	uint64_t type, gpr, cr0;
	uint64_t efer, ctls1;

	/* Only MOV-to-CR0 is emulated in the kernel. */
	type = __SHIFTOUT(qual, VMX_QUAL_CR_TYPE);
	if (type != CR_TYPE_WRITE) {
		return -1;
	}

	gpr = __SHIFTOUT(qual, VMX_QUAL_CR_GPR);
	KASSERT(gpr < 16);

	/* %rsp is not in the gprs array; fetch it from the VMCS. */
	if (gpr == NVMM_X64_GPR_RSP) {
		vmx_vmread(VMCS_GUEST_RSP, &gpr);
	} else {
		gpr = cpudata->gprs[gpr];
	}

	/* Force NE+ET set, and clear NW/CD (keep caching enabled). */
	cr0 = gpr | CR0_NE | CR0_ET;
	cr0 &= ~(CR0_NW|CR0_CD);

	if (vmx_check_cr(cr0, vmx_cr0_fixed0, vmx_cr0_fixed1) == -1) {
		return -1;
	}

	/*
	 * XXX Handle 32bit PAE paging, need to set PDPTEs, fetched manually
	 * from CR3.
	 */

	if (cr0 & CR0_PG) {
		/* Paging enabled: enter/leave long mode per EFER.LME. */
		vmx_vmread(VMCS_ENTRY_CTLS, &ctls1);
		vmx_vmread(VMCS_GUEST_IA32_EFER, &efer);
		if (efer & EFER_LME) {
			ctls1 |= ENTRY_CTLS_LONG_MODE;
			efer |= EFER_LMA;
		} else {
			ctls1 &= ~ENTRY_CTLS_LONG_MODE;
			efer &= ~EFER_LMA;
		}
		vmx_vmwrite(VMCS_GUEST_IA32_EFER, efer);
		vmx_vmwrite(VMCS_ENTRY_CTLS, ctls1);
	}

	vmx_vmwrite(VMCS_GUEST_CR0, cr0);
	vmx_inkernel_advance();
	return 0;
}
                   1296:
                   1297: static int
                   1298: vmx_inkernel_handle_cr4(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
                   1299:     uint64_t qual)
                   1300: {
                   1301:        struct vmx_cpudata *cpudata = vcpu->cpudata;
                   1302:        uint64_t type, gpr, cr4;
                   1303:
                   1304:        type = __SHIFTOUT(qual, VMX_QUAL_CR_TYPE);
                   1305:        if (type != CR_TYPE_WRITE) {
                   1306:                return -1;
                   1307:        }
                   1308:
                   1309:        gpr = __SHIFTOUT(qual, VMX_QUAL_CR_GPR);
                   1310:        KASSERT(gpr < 16);
                   1311:
                   1312:        if (gpr == NVMM_X64_GPR_RSP) {
                   1313:                vmx_vmread(VMCS_GUEST_RSP, &gpr);
                   1314:        } else {
                   1315:                gpr = cpudata->gprs[gpr];
                   1316:        }
                   1317:
                   1318:        cr4 = gpr | CR4_VMXE;
                   1319:
                   1320:        if (vmx_check_cr(cr4, vmx_cr4_fixed0, vmx_cr4_fixed1) == -1) {
                   1321:                return -1;
                   1322:        }
                   1323:
                   1324:        vmx_vmwrite(VMCS_GUEST_CR4, cr4);
                   1325:        vmx_inkernel_advance();
                   1326:        return 0;
                   1327: }
                   1328:
                   1329: static int
                   1330: vmx_inkernel_handle_cr8(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
                   1331:     uint64_t qual)
                   1332: {
                   1333:        struct vmx_cpudata *cpudata = vcpu->cpudata;
                   1334:        uint64_t type, gpr;
                   1335:        bool write;
                   1336:
                   1337:        type = __SHIFTOUT(qual, VMX_QUAL_CR_TYPE);
                   1338:        if (type == CR_TYPE_WRITE) {
                   1339:                write = true;
                   1340:        } else if (type == CR_TYPE_READ) {
                   1341:                write = false;
                   1342:        } else {
                   1343:                return -1;
                   1344:        }
                   1345:
                   1346:        gpr = __SHIFTOUT(qual, VMX_QUAL_CR_GPR);
                   1347:        KASSERT(gpr < 16);
                   1348:
                   1349:        if (write) {
                   1350:                if (gpr == NVMM_X64_GPR_RSP) {
                   1351:                        vmx_vmread(VMCS_GUEST_RSP, &cpudata->gcr8);
                   1352:                } else {
                   1353:                        cpudata->gcr8 = cpudata->gprs[gpr];
                   1354:                }
                   1355:        } else {
                   1356:                if (gpr == NVMM_X64_GPR_RSP) {
                   1357:                        vmx_vmwrite(VMCS_GUEST_RSP, cpudata->gcr8);
                   1358:                } else {
                   1359:                        cpudata->gprs[gpr] = cpudata->gcr8;
                   1360:                }
                   1361:        }
                   1362:
                   1363:        vmx_inkernel_advance();
                   1364:        return 0;
                   1365: }
                   1366:
                   1367: static void
                   1368: vmx_exit_cr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
                   1369:     struct nvmm_exit *exit)
                   1370: {
                   1371:        uint64_t qual;
                   1372:        int ret;
                   1373:
                   1374:        vmx_vmread(VMCS_EXIT_QUALIFICATION, &qual);
                   1375:
                   1376:        switch (__SHIFTOUT(qual, VMX_QUAL_CR_NUM)) {
                   1377:        case 0:
                   1378:                ret = vmx_inkernel_handle_cr0(mach, vcpu, qual);
                   1379:                break;
                   1380:        case 4:
                   1381:                ret = vmx_inkernel_handle_cr4(mach, vcpu, qual);
                   1382:                break;
                   1383:        case 8:
                   1384:                ret = vmx_inkernel_handle_cr8(mach, vcpu, qual);
                   1385:                break;
                   1386:        default:
                   1387:                ret = -1;
                   1388:                break;
                   1389:        }
                   1390:
                   1391:        if (ret == -1) {
                   1392:                vmx_inject_gp(mach, vcpu);
                   1393:        }
                   1394:
                   1395:        exit->reason = NVMM_EXIT_NONE;
                   1396: }
                   1397:
/* Exit qualification layout for I/O instruction exits. */
#define VMX_QUAL_IO_SIZE	__BITS(2,0)
#define		IO_SIZE_8	0
#define		IO_SIZE_16	1
#define		IO_SIZE_32	3
#define VMX_QUAL_IO_IN		__BIT(3)
#define VMX_QUAL_IO_STR		__BIT(4)
#define VMX_QUAL_IO_REP		__BIT(5)
#define VMX_QUAL_IO_DX		__BIT(6)
#define VMX_QUAL_IO_PORT	__BITS(31,16)

/* VM-exit instruction information fields for I/O exits. */
#define VMX_INFO_IO_ADRSIZE	__BITS(9,7)
#define		IO_ADRSIZE_16	0
#define		IO_ADRSIZE_32	1
#define		IO_ADRSIZE_64	2
#define VMX_INFO_IO_SEG		__BITS(17,15)
                   1413:
                   1414: static void
                   1415: vmx_exit_io(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
                   1416:     struct nvmm_exit *exit)
                   1417: {
                   1418:        uint64_t qual, info, inslen, rip;
                   1419:
                   1420:        vmx_vmread(VMCS_EXIT_QUALIFICATION, &qual);
                   1421:        vmx_vmread(VMCS_EXIT_INSTRUCTION_INFO, &info);
                   1422:
                   1423:        exit->reason = NVMM_EXIT_IO;
                   1424:
                   1425:        if (qual & VMX_QUAL_IO_IN) {
                   1426:                exit->u.io.type = NVMM_EXIT_IO_IN;
                   1427:        } else {
                   1428:                exit->u.io.type = NVMM_EXIT_IO_OUT;
                   1429:        }
                   1430:
                   1431:        exit->u.io.port = __SHIFTOUT(qual, VMX_QUAL_IO_PORT);
                   1432:
                   1433:        KASSERT(__SHIFTOUT(info, VMX_INFO_IO_SEG) < 6);
1.15      maxv     1434:        exit->u.io.seg = __SHIFTOUT(info, VMX_INFO_IO_SEG);
1.1       maxv     1435:
                   1436:        if (__SHIFTOUT(info, VMX_INFO_IO_ADRSIZE) == IO_ADRSIZE_64) {
                   1437:                exit->u.io.address_size = 8;
                   1438:        } else if (__SHIFTOUT(info, VMX_INFO_IO_ADRSIZE) == IO_ADRSIZE_32) {
                   1439:                exit->u.io.address_size = 4;
                   1440:        } else if (__SHIFTOUT(info, VMX_INFO_IO_ADRSIZE) == IO_ADRSIZE_16) {
                   1441:                exit->u.io.address_size = 2;
                   1442:        }
                   1443:
                   1444:        if (__SHIFTOUT(qual, VMX_QUAL_IO_SIZE) == IO_SIZE_32) {
                   1445:                exit->u.io.operand_size = 4;
                   1446:        } else if (__SHIFTOUT(qual, VMX_QUAL_IO_SIZE) == IO_SIZE_16) {
                   1447:                exit->u.io.operand_size = 2;
                   1448:        } else if (__SHIFTOUT(qual, VMX_QUAL_IO_SIZE) == IO_SIZE_8) {
                   1449:                exit->u.io.operand_size = 1;
                   1450:        }
                   1451:
                   1452:        exit->u.io.rep = (qual & VMX_QUAL_IO_REP) != 0;
                   1453:        exit->u.io.str = (qual & VMX_QUAL_IO_STR) != 0;
                   1454:
                   1455:        if ((exit->u.io.type == NVMM_EXIT_IO_IN) && exit->u.io.str) {
                   1456:                exit->u.io.seg = NVMM_X64_SEG_ES;
                   1457:        }
                   1458:
                   1459:        vmx_vmread(VMCS_EXIT_INSTRUCTION_LENGTH, &inslen);
                   1460:        vmx_vmread(VMCS_GUEST_RIP, &rip);
                   1461:        exit->u.io.npc = rip + inslen;
                   1462: }
                   1463:
/*
 * MSRs the guest may touch but whose contents are not emulated:
 * reads return zero, writes are discarded (see vmx_inkernel_handle_msr).
 */
static const uint64_t msr_ignore_list[] = {
	MSR_BIOS_SIGN,
	MSR_IA32_PLATFORM_ID
};
                   1468:
/*
 * Try to emulate an MSR access entirely in the kernel. Returns true if
 * the access was handled here (including the error path, where #GP is
 * injected into the guest), false if it must be forwarded to userland.
 */
static bool
vmx_inkernel_handle_msr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
    struct nvmm_exit *exit)
{
	struct vmx_cpudata *cpudata = vcpu->cpudata;
	uint64_t val;
	size_t i;

	switch (exit->u.msr.type) {
	case NVMM_EXIT_MSR_RDMSR:
		/* The guest PAT is kept in the VMCS. */
		if (exit->u.msr.msr == MSR_CR_PAT) {
			vmx_vmread(VMCS_GUEST_IA32_PAT, &val);
			cpudata->gprs[NVMM_X64_GPR_RAX] = (val & 0xFFFFFFFF);
			cpudata->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
			goto handled;
		}
		/* MISC_ENABLE is served from the per-VCPU shadow copy. */
		if (exit->u.msr.msr == MSR_MISC_ENABLE) {
			val = cpudata->gmsr_misc_enable;
			cpudata->gprs[NVMM_X64_GPR_RAX] = (val & 0xFFFFFFFF);
			cpudata->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
			goto handled;
		}
		/* MSRs in the ignore list read as zero. */
		for (i = 0; i < __arraycount(msr_ignore_list); i++) {
			if (msr_ignore_list[i] != exit->u.msr.msr)
				continue;
			val = 0;
			cpudata->gprs[NVMM_X64_GPR_RAX] = (val & 0xFFFFFFFF);
			cpudata->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
			goto handled;
		}
		break;
	case NVMM_EXIT_MSR_WRMSR:
		/*
		 * TSC writes are recorded and flagged for update —
		 * presumably applied at the next VM-entry (gtsc_want_update).
		 */
		if (exit->u.msr.msr == MSR_TSC) {
			cpudata->gtsc = exit->u.msr.val;
			cpudata->gtsc_want_update = true;
			goto handled;
		}
		/* PAT: validate the encoding before storing in the VMCS. */
		if (exit->u.msr.msr == MSR_CR_PAT) {
			val = exit->u.msr.val;
			if (__predict_false(!nvmm_x86_pat_validate(val))) {
				goto error;
			}
			vmx_vmwrite(VMCS_GUEST_IA32_PAT, val);
			goto handled;
		}
		if (exit->u.msr.msr == MSR_MISC_ENABLE) {
			/* Don't care. */
			goto handled;
		}
		/* Writes to ignored MSRs are silently discarded. */
		for (i = 0; i < __arraycount(msr_ignore_list); i++) {
			if (msr_ignore_list[i] != exit->u.msr.msr)
				continue;
			goto handled;
		}
		break;
	}

	/* Not handled in the kernel: let userland emulate it. */
	return false;

handled:
	vmx_inkernel_advance();
	return true;

error:
	vmx_inject_gp(mach, vcpu);
	return true;
}
                   1536:
                   1537: static void
                   1538: vmx_exit_msr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
                   1539:     struct nvmm_exit *exit, bool rdmsr)
                   1540: {
                   1541:        struct vmx_cpudata *cpudata = vcpu->cpudata;
                   1542:        uint64_t inslen, rip;
                   1543:
                   1544:        if (rdmsr) {
                   1545:                exit->u.msr.type = NVMM_EXIT_MSR_RDMSR;
                   1546:        } else {
                   1547:                exit->u.msr.type = NVMM_EXIT_MSR_WRMSR;
                   1548:        }
                   1549:
                   1550:        exit->u.msr.msr = (cpudata->gprs[NVMM_X64_GPR_RCX] & 0xFFFFFFFF);
                   1551:
                   1552:        if (rdmsr) {
                   1553:                exit->u.msr.val = 0;
                   1554:        } else {
                   1555:                uint64_t rdx, rax;
                   1556:                rdx = cpudata->gprs[NVMM_X64_GPR_RDX];
                   1557:                rax = cpudata->gprs[NVMM_X64_GPR_RAX];
                   1558:                exit->u.msr.val = (rdx << 32) | (rax & 0xFFFFFFFF);
                   1559:        }
                   1560:
                   1561:        if (vmx_inkernel_handle_msr(mach, vcpu, exit)) {
                   1562:                exit->reason = NVMM_EXIT_NONE;
                   1563:                return;
                   1564:        }
                   1565:
                   1566:        exit->reason = NVMM_EXIT_MSR;
                   1567:        vmx_vmread(VMCS_EXIT_INSTRUCTION_LENGTH, &inslen);
                   1568:        vmx_vmread(VMCS_GUEST_RIP, &rip);
                   1569:        exit->u.msr.npc = rip + inslen;
                   1570: }
                   1571:
                   1572: static void
                   1573: vmx_exit_xsetbv(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
                   1574:     struct nvmm_exit *exit)
                   1575: {
                   1576:        struct vmx_cpudata *cpudata = vcpu->cpudata;
                   1577:        uint16_t val;
                   1578:
                   1579:        exit->reason = NVMM_EXIT_NONE;
                   1580:
                   1581:        val = (cpudata->gprs[NVMM_X64_GPR_RDX] << 32) |
                   1582:            (cpudata->gprs[NVMM_X64_GPR_RAX] & 0xFFFFFFFF);
                   1583:
                   1584:        if (__predict_false(cpudata->gprs[NVMM_X64_GPR_RCX] != 0)) {
                   1585:                goto error;
                   1586:        } else if (__predict_false((val & ~vmx_xcr0_mask) != 0)) {
                   1587:                goto error;
                   1588:        } else if (__predict_false((val & XCR0_X87) == 0)) {
                   1589:                goto error;
                   1590:        }
                   1591:
                   1592:        cpudata->gxcr0 = val;
                   1593:
                   1594:        vmx_inkernel_advance();
                   1595:        return;
                   1596:
                   1597: error:
                   1598:        vmx_inject_gp(mach, vcpu);
                   1599: }
                   1600:
                   1601: #define VMX_EPT_VIOLATION_READ         __BIT(0)
                   1602: #define VMX_EPT_VIOLATION_WRITE                __BIT(1)
                   1603: #define VMX_EPT_VIOLATION_EXECUTE      __BIT(2)
                   1604:
                   1605: static void
                   1606: vmx_exit_epf(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
                   1607:     struct nvmm_exit *exit)
                   1608: {
                   1609:        uint64_t perm;
                   1610:        gpaddr_t gpa;
                   1611:
                   1612:        vmx_vmread(VMCS_GUEST_PHYSICAL_ADDRESS, &gpa);
                   1613:
1.7       maxv     1614:        exit->reason = NVMM_EXIT_MEMORY;
                   1615:        vmx_vmread(VMCS_EXIT_QUALIFICATION, &perm);
                   1616:        if (perm & VMX_EPT_VIOLATION_WRITE)
1.20      maxv     1617:                exit->u.mem.prot = PROT_WRITE;
1.7       maxv     1618:        else if (perm & VMX_EPT_VIOLATION_EXECUTE)
1.20      maxv     1619:                exit->u.mem.prot = PROT_EXEC;
1.7       maxv     1620:        else
1.20      maxv     1621:                exit->u.mem.prot = PROT_READ;
1.7       maxv     1622:        exit->u.mem.gpa = gpa;
                   1623:        exit->u.mem.inst_len = 0;
1.1       maxv     1624: }
                   1625:
1.9       maxv     1626: /* -------------------------------------------------------------------------- */
                   1627:
/*
 * Switch the FPU state from host to guest, right before VM entry (called
 * with interrupts blocked, from the splhigh section of vmx_vcpu_run).
 * Saves the host FPU area and XCR0, and installs the guest's.
 */
static void
vmx_vcpu_guest_fpu_enter(struct nvmm_cpu *vcpu)
{
	struct vmx_cpudata *cpudata = vcpu->cpudata;

	/* Remember whether the host had CR0.TS set, to re-set it on leave. */
	cpudata->ts_set = (rcr0() & CR0_TS) != 0;

	fpu_area_save(&cpudata->hfpu, vmx_xcr0_mask);
	fpu_area_restore(&cpudata->gfpu, vmx_xcr0_mask);

	if (vmx_xcr0_mask != 0) {
		/* XSAVE is available: switch XCR0 to the guest's value. */
		cpudata->hxcr0 = rdxcr(0);
		wrxcr(0, cpudata->gxcr0);
	}
}
                   1643:
/*
 * Switch the FPU state back from guest to host, right after VM exit.
 * Exact mirror of vmx_vcpu_guest_fpu_enter().
 */
static void
vmx_vcpu_guest_fpu_leave(struct nvmm_cpu *vcpu)
{
	struct vmx_cpudata *cpudata = vcpu->cpudata;

	if (vmx_xcr0_mask != 0) {
		/* Save the guest's XCR0, restore the host's. */
		cpudata->gxcr0 = rdxcr(0);
		wrxcr(0, cpudata->hxcr0);
	}

	fpu_area_save(&cpudata->gfpu, vmx_xcr0_mask);
	fpu_area_restore(&cpudata->hfpu, vmx_xcr0_mask);

	if (cpudata->ts_set) {
		/* The host had CR0.TS set on enter: set it again. */
		stts();
	}
}
                   1661:
/*
 * Install the guest's debug registers on the host CPU. The host's are
 * saved first via x86_dbregs_save().
 */
static void
vmx_vcpu_guest_dbregs_enter(struct nvmm_cpu *vcpu)
{
	struct vmx_cpudata *cpudata = vcpu->cpudata;

	x86_dbregs_save(curlwp);

	/* Clear DR7 first, so no breakpoint fires while DR0-DR3 are loaded. */
	ldr7(0);

	ldr0(cpudata->drs[NVMM_X64_DR_DR0]);
	ldr1(cpudata->drs[NVMM_X64_DR_DR1]);
	ldr2(cpudata->drs[NVMM_X64_DR_DR2]);
	ldr3(cpudata->drs[NVMM_X64_DR_DR3]);
	ldr6(cpudata->drs[NVMM_X64_DR_DR6]);
}
                   1677:
/*
 * Save the guest's debug registers and restore the host's (saved earlier
 * by vmx_vcpu_guest_dbregs_enter) via x86_dbregs_restore().
 */
static void
vmx_vcpu_guest_dbregs_leave(struct nvmm_cpu *vcpu)
{
	struct vmx_cpudata *cpudata = vcpu->cpudata;

	cpudata->drs[NVMM_X64_DR_DR0] = rdr0();
	cpudata->drs[NVMM_X64_DR_DR1] = rdr1();
	cpudata->drs[NVMM_X64_DR_DR2] = rdr2();
	cpudata->drs[NVMM_X64_DR_DR3] = rdr3();
	cpudata->drs[NVMM_X64_DR_DR6] = rdr6();

	x86_dbregs_restore(curlwp);
}
                   1691:
/*
 * Capture the host context that VM entry/exit does not handle by itself:
 * refresh the VMCS host-state fields that can change between runs, and
 * save the host MSRs that the guest run may clobber.
 */
static void
vmx_vcpu_guest_misc_enter(struct nvmm_cpu *vcpu)
{
	struct vmx_cpudata *cpudata = vcpu->cpudata;

	/* This gets restored automatically by the CPU on VM exit. */
	vmx_vmwrite(VMCS_HOST_FS_BASE, rdmsr(MSR_FSBASE));
	vmx_vmwrite(VMCS_HOST_CR3, rcr3());
	vmx_vmwrite(VMCS_HOST_CR4, rcr4());

	/* Note: MSR_LSTAR is not static, because of SVS. */
	cpudata->lstar = rdmsr(MSR_LSTAR);
	cpudata->kernelgsbase = rdmsr(MSR_KERNELGSBASE);
}
                   1706:
/*
 * Restore the host syscall-related MSRs after the guest ran. LSTAR and
 * KERNELGSBASE were saved in vmx_vcpu_guest_misc_enter(); star/cstar/
 * sfmask are presumably captured once at VCPU setup (not visible in this
 * chunk) — NOTE(review): confirm against vmx_vcpu_create.
 */
static void
vmx_vcpu_guest_misc_leave(struct nvmm_cpu *vcpu)
{
	struct vmx_cpudata *cpudata = vcpu->cpudata;

	wrmsr(MSR_STAR, cpudata->star);
	wrmsr(MSR_LSTAR, cpudata->lstar);
	wrmsr(MSR_CSTAR, cpudata->cstar);
	wrmsr(MSR_SFMASK, cpudata->sfmask);
	wrmsr(MSR_KERNELGSBASE, cpudata->kernelgsbase);
}
                   1718:
1.9       maxv     1719: /* -------------------------------------------------------------------------- */
1.8       maxv     1720:
1.1       maxv     1721: #define VMX_INVVPID_ADDRESS            0
                   1722: #define VMX_INVVPID_CONTEXT            1
                   1723: #define VMX_INVVPID_ALL                        2
                   1724: #define VMX_INVVPID_CONTEXT_NOGLOBAL   3
                   1725:
                   1726: #define VMX_INVEPT_CONTEXT             1
                   1727: #define VMX_INVEPT_ALL                 2
                   1728:
1.8       maxv     1729: static inline void
                   1730: vmx_gtlb_catchup(struct nvmm_cpu *vcpu, int hcpu)
                   1731: {
                   1732:        struct vmx_cpudata *cpudata = vcpu->cpudata;
                   1733:
                   1734:        if (vcpu->hcpu_last != hcpu) {
                   1735:                cpudata->gtlb_want_flush = true;
                   1736:        }
                   1737: }
                   1738:
1.9       maxv     1739: static inline void
                   1740: vmx_htlb_catchup(struct nvmm_cpu *vcpu, int hcpu)
                   1741: {
                   1742:        struct vmx_cpudata *cpudata = vcpu->cpudata;
                   1743:        struct ept_desc ept_desc;
                   1744:
                   1745:        if (__predict_true(!kcpuset_isset(cpudata->htlb_want_flush, hcpu))) {
                   1746:                return;
                   1747:        }
                   1748:
                   1749:        vmx_vmread(VMCS_EPTP, &ept_desc.eptp);
                   1750:        ept_desc.mbz = 0;
                   1751:        vmx_invept(vmx_ept_flush_op, &ept_desc);
                   1752:        kcpuset_clear(cpudata->htlb_want_flush, hcpu);
                   1753: }
                   1754:
                   1755: static inline uint64_t
                   1756: vmx_htlb_flush(struct vmx_machdata *machdata, struct vmx_cpudata *cpudata)
                   1757: {
                   1758:        struct ept_desc ept_desc;
                   1759:        uint64_t machgen;
                   1760:
                   1761:        machgen = machdata->mach_htlb_gen;
                   1762:        if (__predict_true(machgen == cpudata->vcpu_htlb_gen)) {
                   1763:                return machgen;
                   1764:        }
                   1765:
                   1766:        kcpuset_copy(cpudata->htlb_want_flush, kcpuset_running);
                   1767:
                   1768:        vmx_vmread(VMCS_EPTP, &ept_desc.eptp);
                   1769:        ept_desc.mbz = 0;
                   1770:        vmx_invept(vmx_ept_flush_op, &ept_desc);
                   1771:
                   1772:        return machgen;
                   1773: }
                   1774:
/*
 * Acknowledge a host-TLB flush: record the machine generation we are now
 * synchronized with, and clear this CPU from the want-flush set (the
 * local flush was already done by vmx_htlb_flush).
 */
static inline void
vmx_htlb_flush_ack(struct vmx_cpudata *cpudata, uint64_t machgen)
{
	cpudata->vcpu_htlb_gen = machgen;
	kcpuset_clear(cpudata->htlb_want_flush, cpu_number());
}
                   1781:
/*
 * Run the VCPU until there is a reason to return to userland (an exit
 * that must be emulated there, a pending softint/preemption, ...).
 *
 * Outline:
 *  - Enter the VMCS of this VCPU, catch up guest/host TLB state if we
 *    migrated to a different host CPU since last run.
 *  - Loop: flush TLBs if requested, update the TSC offset if requested,
 *    then at splhigh switch FPU+CR2 to the guest, VMLAUNCH/VMRESUME,
 *    switch back, and dispatch on the exit code.
 *  - On return, save the guest TSC, restore host MSRs and debug
 *    registers, and fill in the exit state for userland.
 *
 * Returns 0; errors are reported through exit->reason (NVMM_EXIT_INVALID).
 */
static int
vmx_vcpu_run(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
    struct nvmm_exit *exit)
{
	struct vmx_machdata *machdata = mach->machdata;
	struct vmx_cpudata *cpudata = vcpu->cpudata;
	struct vpid_desc vpid_desc;
	struct cpu_info *ci;
	uint64_t exitcode;
	uint64_t intstate;
	uint64_t machgen;
	int hcpu, s, ret;
	bool launched;

	vmx_vmcs_enter(vcpu);
	ci = curcpu();
	hcpu = cpu_number();
	launched = cpudata->vmcs_launched;

	vmx_gtlb_catchup(vcpu, hcpu);
	vmx_htlb_catchup(vcpu, hcpu);

	if (vcpu->hcpu_last != hcpu) {
		/*
		 * We migrated to a different host CPU: refresh the per-CPU
		 * host state in the VMCS, and schedule a TSC offset update
		 * (the TSCs of distinct CPUs are not synchronized here).
		 */
		vmx_vmwrite(VMCS_HOST_TR_SELECTOR, ci->ci_tss_sel);
		vmx_vmwrite(VMCS_HOST_TR_BASE, (uint64_t)ci->ci_tss);
		vmx_vmwrite(VMCS_HOST_GDTR_BASE, (uint64_t)ci->ci_gdt);
		vmx_vmwrite(VMCS_HOST_GS_BASE, rdmsr(MSR_GSBASE));
		cpudata->gtsc_want_update = true;
		vcpu->hcpu_last = hcpu;
	}

	vmx_vcpu_guest_dbregs_enter(vcpu);
	vmx_vcpu_guest_misc_enter(vcpu);

	while (1) {
		if (cpudata->gtlb_want_flush) {
			/* Flush the guest TLB entries for this VPID. */
			vpid_desc.vpid = cpudata->asid;
			vpid_desc.addr = 0;
			vmx_invvpid(vmx_tlb_flush_op, &vpid_desc);
			cpudata->gtlb_want_flush = false;
		}

		if (__predict_false(cpudata->gtsc_want_update)) {
			/* Re-base the guest TSC on the current host TSC. */
			vmx_vmwrite(VMCS_TSC_OFFSET, cpudata->gtsc - rdtsc());
			cpudata->gtsc_want_update = false;
		}

		/* No interrupts between FPU/CR2 switch and VM entry. */
		s = splhigh();
		machgen = vmx_htlb_flush(machdata, cpudata);
		vmx_vcpu_guest_fpu_enter(vcpu);
		lcr2(cpudata->gcr2);
		if (launched) {
			ret = vmx_vmresume(cpudata->gprs);
		} else {
			ret = vmx_vmlaunch(cpudata->gprs);
		}
		cpudata->gcr2 = rcr2();
		vmx_vcpu_guest_fpu_leave(vcpu);
		vmx_htlb_flush_ack(cpudata, machgen);
		splx(s);

		if (__predict_false(ret != 0)) {
			/* VMLAUNCH/VMRESUME itself failed. */
			exit->reason = NVMM_EXIT_INVALID;
			break;
		}
		/* An entry succeeded: any pending event was delivered. */
		cpudata->evt_pending = false;

		launched = true;

		/* The basic exit reason is in the low 16 bits. */
		vmx_vmread(VMCS_EXIT_REASON, &exitcode);
		exitcode &= __BITS(15,0);

		switch (exitcode) {
		case VMCS_EXITCODE_EXC_NMI:
			vmx_exit_exc_nmi(mach, vcpu, exit);
			break;
		case VMCS_EXITCODE_EXT_INT:
			/* Host interrupt: just re-enter the guest. */
			exit->reason = NVMM_EXIT_NONE;
			break;
		case VMCS_EXITCODE_CPUID:
			vmx_exit_cpuid(mach, vcpu, exit);
			break;
		case VMCS_EXITCODE_HLT:
			vmx_exit_hlt(mach, vcpu, exit);
			break;
		case VMCS_EXITCODE_CR:
			vmx_exit_cr(mach, vcpu, exit);
			break;
		case VMCS_EXITCODE_IO:
			vmx_exit_io(mach, vcpu, exit);
			break;
		case VMCS_EXITCODE_RDMSR:
			vmx_exit_msr(mach, vcpu, exit, true);
			break;
		case VMCS_EXITCODE_WRMSR:
			vmx_exit_msr(mach, vcpu, exit, false);
			break;
		case VMCS_EXITCODE_SHUTDOWN:
			exit->reason = NVMM_EXIT_SHUTDOWN;
			break;
		case VMCS_EXITCODE_MONITOR:
			exit->reason = NVMM_EXIT_MONITOR;
			break;
		case VMCS_EXITCODE_MWAIT:
			exit->reason = NVMM_EXIT_MWAIT;
			break;
		case VMCS_EXITCODE_XSETBV:
			vmx_exit_xsetbv(mach, vcpu, exit);
			break;
		/* VMX and other privileged instructions: #UD the guest. */
		case VMCS_EXITCODE_RDPMC:
		case VMCS_EXITCODE_RDTSCP:
		case VMCS_EXITCODE_INVVPID:
		case VMCS_EXITCODE_INVEPT:
		case VMCS_EXITCODE_VMCALL:
		case VMCS_EXITCODE_VMCLEAR:
		case VMCS_EXITCODE_VMLAUNCH:
		case VMCS_EXITCODE_VMPTRLD:
		case VMCS_EXITCODE_VMPTRST:
		case VMCS_EXITCODE_VMREAD:
		case VMCS_EXITCODE_VMRESUME:
		case VMCS_EXITCODE_VMWRITE:
		case VMCS_EXITCODE_VMXOFF:
		case VMCS_EXITCODE_VMXON:
			vmx_inject_ud(mach, vcpu);
			exit->reason = NVMM_EXIT_NONE;
			break;
		case VMCS_EXITCODE_EPT_VIOLATION:
			vmx_exit_epf(mach, vcpu, exit);
			break;
		case VMCS_EXITCODE_INT_WINDOW:
			vmx_event_waitexit_disable(vcpu, false);
			exit->reason = NVMM_EXIT_INT_READY;
			break;
		case VMCS_EXITCODE_NMI_WINDOW:
			vmx_event_waitexit_disable(vcpu, true);
			exit->reason = NVMM_EXIT_NMI_READY;
			break;
		default:
			exit->reason = NVMM_EXIT_INVALID;
			break;
		}

		/* If no reason to return to userland, keep rolling. */
		if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) {
			break;
		}
		if (curcpu()->ci_data.cpu_softints != 0) {
			break;
		}
		if (curlwp->l_flag & LW_USERRET) {
			break;
		}
		if (exit->reason != NVMM_EXIT_NONE) {
			break;
		}
	}

	cpudata->vmcs_launched = launched;

	/* Save the guest TSC value, re-based on the current host TSC. */
	vmx_vmread(VMCS_TSC_OFFSET, &cpudata->gtsc);
	cpudata->gtsc += rdtsc();

	vmx_vcpu_guest_misc_leave(vcpu);
	vmx_vcpu_guest_dbregs_leave(vcpu);

	/* Export the exit state userland needs for emulation. */
	exit->exitstate[NVMM_X64_EXITSTATE_CR8] = cpudata->gcr8;
	vmx_vmread(VMCS_GUEST_RFLAGS,
	    &exit->exitstate[NVMM_X64_EXITSTATE_RFLAGS]);
	vmx_vmread(VMCS_GUEST_INTERRUPTIBILITY, &intstate);
	exit->exitstate[NVMM_X64_EXITSTATE_INT_SHADOW] =
	    (intstate & (INT_STATE_STI|INT_STATE_MOVSS)) != 0;
	exit->exitstate[NVMM_X64_EXITSTATE_INT_WINDOW_EXIT] =
	    cpudata->int_window_exit;
	exit->exitstate[NVMM_X64_EXITSTATE_NMI_WINDOW_EXIT] =
	    cpudata->nmi_window_exit;
	exit->exitstate[NVMM_X64_EXITSTATE_EVT_PENDING] =
	    cpudata->evt_pending;

	vmx_vmcs_leave(vcpu);

	return 0;
}
                   1964:
                   1965: /* -------------------------------------------------------------------------- */
                   1966:
/*
 * Allocate 'npages' physically contiguous, page-aligned pages, and map
 * them contiguously (write-back) in kernel virtual space, zeroed.
 * Returns 0 on success with *pa/*va filled, ENOMEM on failure.
 */
static int
vmx_memalloc(paddr_t *pa, vaddr_t *va, size_t npages)
{
	struct pglist pglist;
	paddr_t _pa;
	vaddr_t _va;
	size_t i;
	int ret;

	/* alignment = PAGE_SIZE, nsegs = 1: the range is contiguous. */
	ret = uvm_pglistalloc(npages * PAGE_SIZE, 0, ~0UL, PAGE_SIZE, 0,
	    &pglist, 1, 0);
	if (ret != 0)
		return ENOMEM;
	/* Contiguous allocation: the first page gives the base PA. */
	_pa = TAILQ_FIRST(&pglist)->phys_addr;
	_va = uvm_km_alloc(kernel_map, npages * PAGE_SIZE, 0,
	    UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
	if (_va == 0)
		goto error;

	for (i = 0; i < npages; i++) {
		pmap_kenter_pa(_va + i * PAGE_SIZE, _pa + i * PAGE_SIZE,
		    VM_PROT_READ | VM_PROT_WRITE, PMAP_WRITE_BACK);
	}
	pmap_update(pmap_kernel());

	memset((void *)_va, 0, npages * PAGE_SIZE);

	*pa = _pa;
	*va = _va;
	return 0;

error:
	/*
	 * Free the pages one by one through their vm_page structures
	 * (NOTE(review): presumably equivalent to uvm_pglistfree(&pglist)
	 * here — confirm against uvm documentation).
	 */
	for (i = 0; i < npages; i++) {
		uvm_pagefree(PHYS_TO_VM_PAGE(_pa + i * PAGE_SIZE));
	}
	return ENOMEM;
}
                   2004:
/*
 * Release memory obtained with vmx_memalloc(): unmap the kernel virtual
 * range, free the VA window, then free the physical pages.
 */
static void
vmx_memfree(paddr_t pa, vaddr_t va, size_t npages)
{
	size_t i;

	pmap_kremove(va, npages * PAGE_SIZE);
	pmap_update(pmap_kernel());
	uvm_km_free(kernel_map, va, npages * PAGE_SIZE, UVM_KMF_VAONLY);
	for (i = 0; i < npages; i++) {
		uvm_pagefree(PHYS_TO_VM_PAGE(pa + i * PAGE_SIZE));
	}
}
                   2017:
                   2018: /* -------------------------------------------------------------------------- */
                   2019:
                   2020: static void
                   2021: vmx_vcpu_msr_allow(uint8_t *bitmap, uint64_t msr, bool read, bool write)
                   2022: {
                   2023:        uint64_t byte;
                   2024:        uint8_t bitoff;
                   2025:
                   2026:        if (msr < 0x00002000) {
                   2027:                /* Range 1 */
                   2028:                byte = ((msr - 0x00000000) / 8) + 0;
                   2029:        } else if (msr >= 0xC0000000 && msr < 0xC0002000) {
                   2030:                /* Range 2 */
                   2031:                byte = ((msr - 0xC0000000) / 8) + 1024;
                   2032:        } else {
                   2033:                panic("%s: wrong range", __func__);
                   2034:        }
                   2035:
                   2036:        bitoff = (msr & 0x7);
                   2037:
                   2038:        if (read) {
                   2039:                bitmap[byte] &= ~__BIT(bitoff);
                   2040:        }
                   2041:        if (write) {
                   2042:                bitmap[2048 + byte] &= ~__BIT(bitoff);
                   2043:        }
                   2044: }
                   2045:
/* Bit layout of the VMCS segment "access rights" (attrib) field. */
#define VMX_SEG_ATTRIB_TYPE		__BITS(3,0)
#define VMX_SEG_ATTRIB_S		__BIT(4)
#define VMX_SEG_ATTRIB_DPL		__BITS(6,5)
#define VMX_SEG_ATTRIB_P		__BIT(7)
#define VMX_SEG_ATTRIB_AVL		__BIT(12)
#define VMX_SEG_ATTRIB_L		__BIT(13)
#define VMX_SEG_ATTRIB_DEF		__BIT(14)
#define VMX_SEG_ATTRIB_G		__BIT(15)
#define VMX_SEG_ATTRIB_UNUSABLE		__BIT(16)

/*
 * Write one NVMM segment descriptor into the VMCS guest-segment fields.
 * GDT/IDT have no selector or attributes, only base and limit.
 */
static void
vmx_vcpu_setstate_seg(const struct nvmm_x64_state_seg *segs, int idx)
{
	uint64_t attrib;

	/* Pack the NVMM attrib bits into the VMCS access-rights format.
	 * A non-present segment is marked "unusable" for VMX. */
	attrib =
	    __SHIFTIN(segs[idx].attrib.type, VMX_SEG_ATTRIB_TYPE) |
	    __SHIFTIN(segs[idx].attrib.s, VMX_SEG_ATTRIB_S) |
	    __SHIFTIN(segs[idx].attrib.dpl, VMX_SEG_ATTRIB_DPL) |
	    __SHIFTIN(segs[idx].attrib.p, VMX_SEG_ATTRIB_P) |
	    __SHIFTIN(segs[idx].attrib.avl, VMX_SEG_ATTRIB_AVL) |
	    __SHIFTIN(segs[idx].attrib.l, VMX_SEG_ATTRIB_L) |
	    __SHIFTIN(segs[idx].attrib.def, VMX_SEG_ATTRIB_DEF) |
	    __SHIFTIN(segs[idx].attrib.g, VMX_SEG_ATTRIB_G) |
	    (!segs[idx].attrib.p ? VMX_SEG_ATTRIB_UNUSABLE : 0);

	if (idx != NVMM_X64_SEG_GDT && idx != NVMM_X64_SEG_IDT) {
		vmx_vmwrite(vmx_guest_segs[idx].selector, segs[idx].selector);
		vmx_vmwrite(vmx_guest_segs[idx].attrib, attrib);
	}
	vmx_vmwrite(vmx_guest_segs[idx].limit, segs[idx].limit);
	vmx_vmwrite(vmx_guest_segs[idx].base, segs[idx].base);
}
1.1       maxv     2079:
/*
 * Read one guest segment out of the VMCS into an NVMM segment descriptor.
 * Inverse of vmx_vcpu_setstate_seg(); GDT/IDT have no selector/attrib.
 */
static void
vmx_vcpu_getstate_seg(struct nvmm_x64_state_seg *segs, int idx)
{
	/* attrib = 0 so GDT/IDT decode as all-clear attributes. */
	uint64_t selector, base, limit, attrib = 0;

	if (idx != NVMM_X64_SEG_GDT && idx != NVMM_X64_SEG_IDT) {
		vmx_vmread(vmx_guest_segs[idx].selector, &selector);
		vmx_vmread(vmx_guest_segs[idx].attrib, &attrib);
	}
	vmx_vmread(vmx_guest_segs[idx].limit, &limit);
	vmx_vmread(vmx_guest_segs[idx].base, &base);

	segs[idx].selector = selector;
	segs[idx].limit = limit;
	segs[idx].base = base;
	segs[idx].attrib.type = __SHIFTOUT(attrib, VMX_SEG_ATTRIB_TYPE);
	segs[idx].attrib.s = __SHIFTOUT(attrib, VMX_SEG_ATTRIB_S);
	segs[idx].attrib.dpl = __SHIFTOUT(attrib, VMX_SEG_ATTRIB_DPL);
	segs[idx].attrib.p = __SHIFTOUT(attrib, VMX_SEG_ATTRIB_P);
	segs[idx].attrib.avl = __SHIFTOUT(attrib, VMX_SEG_ATTRIB_AVL);
	segs[idx].attrib.l = __SHIFTOUT(attrib, VMX_SEG_ATTRIB_L);
	segs[idx].attrib.def = __SHIFTOUT(attrib, VMX_SEG_ATTRIB_DEF);
	segs[idx].attrib.g = __SHIFTOUT(attrib, VMX_SEG_ATTRIB_G);
	if (attrib & VMX_SEG_ATTRIB_UNUSABLE) {
		/* A VMX "unusable" segment is reported as not present. */
		segs[idx].attrib.p = 0;
	}
}
1.1       maxv     2107:
1.12      maxv     2108: static inline bool
                   2109: vmx_state_tlb_flush(const struct nvmm_x64_state *state, uint64_t flags)
                   2110: {
                   2111:        uint64_t cr0, cr3, cr4, efer;
1.1       maxv     2112:
1.12      maxv     2113:        if (flags & NVMM_X64_STATE_CRS) {
                   2114:                vmx_vmread(VMCS_GUEST_CR0, &cr0);
                   2115:                if ((cr0 ^ state->crs[NVMM_X64_CR_CR0]) & CR0_TLB_FLUSH) {
                   2116:                        return true;
                   2117:                }
                   2118:                vmx_vmread(VMCS_GUEST_CR3, &cr3);
                   2119:                if (cr3 != state->crs[NVMM_X64_CR_CR3]) {
                   2120:                        return true;
                   2121:                }
                   2122:                vmx_vmread(VMCS_GUEST_CR4, &cr4);
                   2123:                if ((cr4 ^ state->crs[NVMM_X64_CR_CR4]) & CR4_TLB_FLUSH) {
                   2124:                        return true;
                   2125:                }
                   2126:        }
1.1       maxv     2127:
1.12      maxv     2128:        if (flags & NVMM_X64_STATE_MSRS) {
                   2129:                vmx_vmread(VMCS_GUEST_IA32_EFER, &efer);
                   2130:                if ((efer ^
                   2131:                     state->msrs[NVMM_X64_MSR_EFER]) & EFER_TLB_FLUSH) {
                   2132:                        return true;
                   2133:                }
                   2134:        }
1.1       maxv     2135:
1.12      maxv     2136:        return false;
                   2137: }
1.1       maxv     2138:
/*
 * Install the components of the given nvmm_x64_state ('data') selected
 * by 'flags' into the VCPU: some fields go directly into the VMCS,
 * others into the software-tracked cpudata.
 */
static void
vmx_vcpu_setstate(struct nvmm_cpu *vcpu, const void *data, uint64_t flags)
{
	const struct nvmm_x64_state *state = data;
	struct vmx_cpudata *cpudata = vcpu->cpudata;
	struct fxsave *fpustate;
	uint64_t ctls1, intstate;

	vmx_vmcs_enter(vcpu);

	/*
	 * Schedule a guest TLB flush if the incoming state changes a
	 * TLB-affecting bit of CR0/CR4/EFER, or changes CR3.
	 */
	if (vmx_state_tlb_flush(state, flags)) {
		cpudata->gtlb_want_flush = true;
	}

	if (flags & NVMM_X64_STATE_SEGS) {
		vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_CS);
		vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_DS);
		vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_ES);
		vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_FS);
		vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_GS);
		vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_SS);
		vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_GDT);
		vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_IDT);
		vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_LDT);
		vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_TR);
	}

	CTASSERT(sizeof(cpudata->gprs) == sizeof(state->gprs));
	if (flags & NVMM_X64_STATE_GPRS) {
		memcpy(cpudata->gprs, state->gprs, sizeof(state->gprs));

		/* RIP/RSP/RFLAGS live in the VMCS, not in cpudata->gprs. */
		vmx_vmwrite(VMCS_GUEST_RIP, state->gprs[NVMM_X64_GPR_RIP]);
		vmx_vmwrite(VMCS_GUEST_RSP, state->gprs[NVMM_X64_GPR_RSP]);
		vmx_vmwrite(VMCS_GUEST_RFLAGS, state->gprs[NVMM_X64_GPR_RFLAGS]);
	}

	if (flags & NVMM_X64_STATE_CRS) {
		/*
		 * CR0_NE and CR4_VMXE are mandatory, so they are OR'ed in
		 * regardless of what the caller supplied.
		 */
		vmx_vmwrite(VMCS_GUEST_CR0,
		    state->crs[NVMM_X64_CR_CR0] | CR0_NE);
		cpudata->gcr2 = state->crs[NVMM_X64_CR_CR2];
		vmx_vmwrite(VMCS_GUEST_CR3, state->crs[NVMM_X64_CR_CR3]); // XXX PDPTE?
		vmx_vmwrite(VMCS_GUEST_CR4,
		    state->crs[NVMM_X64_CR_CR4] | CR4_VMXE);
		cpudata->gcr8 = state->crs[NVMM_X64_CR_CR8];

		if (vmx_xcr0_mask != 0) {
			/* Clear illegal XCR0 bits, set mandatory X87 bit. */
			cpudata->gxcr0 = state->crs[NVMM_X64_CR_XCR0];
			cpudata->gxcr0 &= vmx_xcr0_mask;
			cpudata->gxcr0 |= XCR0_X87;
		}
	}

	CTASSERT(sizeof(cpudata->drs) == sizeof(state->drs));
	if (flags & NVMM_X64_STATE_DRS) {
		memcpy(cpudata->drs, state->drs, sizeof(state->drs));

		/* Keep only the low 32 bits of DR6; DR7 is in the VMCS. */
		cpudata->drs[NVMM_X64_DR_DR6] &= 0xFFFFFFFF;
		vmx_vmwrite(VMCS_GUEST_DR7, cpudata->drs[NVMM_X64_DR_DR7]);
	}

	if (flags & NVMM_X64_STATE_MSRS) {
		/* These MSRs are handled via the guest MSR load/store list. */
		cpudata->gmsr[VMX_MSRLIST_STAR].val =
		    state->msrs[NVMM_X64_MSR_STAR];
		cpudata->gmsr[VMX_MSRLIST_LSTAR].val =
		    state->msrs[NVMM_X64_MSR_LSTAR];
		cpudata->gmsr[VMX_MSRLIST_CSTAR].val =
		    state->msrs[NVMM_X64_MSR_CSTAR];
		cpudata->gmsr[VMX_MSRLIST_SFMASK].val =
		    state->msrs[NVMM_X64_MSR_SFMASK];
		cpudata->gmsr[VMX_MSRLIST_KERNELGSBASE].val =
		    state->msrs[NVMM_X64_MSR_KERNELGSBASE];

		/* These MSRs have dedicated VMCS fields. */
		vmx_vmwrite(VMCS_GUEST_IA32_EFER,
		    state->msrs[NVMM_X64_MSR_EFER]);
		vmx_vmwrite(VMCS_GUEST_IA32_PAT,
		    state->msrs[NVMM_X64_MSR_PAT]);
		vmx_vmwrite(VMCS_GUEST_IA32_SYSENTER_CS,
		    state->msrs[NVMM_X64_MSR_SYSENTER_CS]);
		vmx_vmwrite(VMCS_GUEST_IA32_SYSENTER_ESP,
		    state->msrs[NVMM_X64_MSR_SYSENTER_ESP]);
		vmx_vmwrite(VMCS_GUEST_IA32_SYSENTER_EIP,
		    state->msrs[NVMM_X64_MSR_SYSENTER_EIP]);

		/* The TSC value is applied lazily, before the next VMENTRY. */
		cpudata->gtsc = state->msrs[NVMM_X64_MSR_TSC];
		cpudata->gtsc_want_update = true;

		/* ENTRY_CTLS_LONG_MODE must match EFER_LMA. */
		vmx_vmread(VMCS_ENTRY_CTLS, &ctls1);
		if (state->msrs[NVMM_X64_MSR_EFER] & EFER_LMA) {
			ctls1 |= ENTRY_CTLS_LONG_MODE;
		} else {
			ctls1 &= ~ENTRY_CTLS_LONG_MODE;
		}
		vmx_vmwrite(VMCS_ENTRY_CTLS, ctls1);
	}

	if (flags & NVMM_X64_STATE_INTR) {
		/*
		 * Clear any STI/MOVSS interrupt shadow, then re-install a
		 * MOVSS shadow if the caller requests one.
		 */
		vmx_vmread(VMCS_GUEST_INTERRUPTIBILITY, &intstate);
		intstate &= ~(INT_STATE_STI|INT_STATE_MOVSS);
		if (state->intr.int_shadow) {
			intstate |= INT_STATE_MOVSS;
		}
		vmx_vmwrite(VMCS_GUEST_INTERRUPTIBILITY, intstate);

		/* Enable/disable interrupt-window exiting. */
		if (state->intr.int_window_exiting) {
			vmx_event_waitexit_enable(vcpu, false);
		} else {
			vmx_event_waitexit_disable(vcpu, false);
		}

		/* Enable/disable NMI-window exiting. */
		if (state->intr.nmi_window_exiting) {
			vmx_event_waitexit_enable(vcpu, true);
		} else {
			vmx_event_waitexit_disable(vcpu, true);
		}
	}

	CTASSERT(sizeof(cpudata->gfpu.xsh_fxsave) == sizeof(state->fpu));
	if (flags & NVMM_X64_STATE_FPU) {
		memcpy(cpudata->gfpu.xsh_fxsave, &state->fpu,
		    sizeof(state->fpu));

		/* Sanitize MXCSR against the host-supported mask. */
		fpustate = (struct fxsave *)cpudata->gfpu.xsh_fxsave;
		fpustate->fx_mxcsr_mask &= x86_fpu_mxcsr_mask;
		fpustate->fx_mxcsr &= fpustate->fx_mxcsr_mask;

		if (vmx_xcr0_mask != 0) {
			/* Reset XSTATE_BV, to force a reload. */
			cpudata->gfpu.xsh_xstate_bv = vmx_xcr0_mask;
		}
	}

	vmx_vmcs_leave(vcpu);
}
                   2277:
/*
 * Fetch the components of the VCPU state selected by 'flags' into
 * 'data' (a struct nvmm_x64_state). Mirror of vmx_vcpu_setstate():
 * some fields are read from the VMCS, others from cpudata.
 */
static void
vmx_vcpu_getstate(struct nvmm_cpu *vcpu, void *data, uint64_t flags)
{
	struct nvmm_x64_state *state = (struct nvmm_x64_state *)data;
	struct vmx_cpudata *cpudata = vcpu->cpudata;
	uint64_t intstate;

	vmx_vmcs_enter(vcpu);

	if (flags & NVMM_X64_STATE_SEGS) {
		vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_CS);
		vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_DS);
		vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_ES);
		vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_FS);
		vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_GS);
		vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_SS);
		vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_GDT);
		vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_IDT);
		vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_LDT);
		vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_TR);
	}

	CTASSERT(sizeof(cpudata->gprs) == sizeof(state->gprs));
	if (flags & NVMM_X64_STATE_GPRS) {
		memcpy(state->gprs, cpudata->gprs, sizeof(state->gprs));

		/* RIP/RSP/RFLAGS live in the VMCS, not in cpudata->gprs. */
		vmx_vmread(VMCS_GUEST_RIP, &state->gprs[NVMM_X64_GPR_RIP]);
		vmx_vmread(VMCS_GUEST_RSP, &state->gprs[NVMM_X64_GPR_RSP]);
		vmx_vmread(VMCS_GUEST_RFLAGS, &state->gprs[NVMM_X64_GPR_RFLAGS]);
	}

	if (flags & NVMM_X64_STATE_CRS) {
		vmx_vmread(VMCS_GUEST_CR0, &state->crs[NVMM_X64_CR_CR0]);
		state->crs[NVMM_X64_CR_CR2] = cpudata->gcr2;
		vmx_vmread(VMCS_GUEST_CR3, &state->crs[NVMM_X64_CR_CR3]);
		vmx_vmread(VMCS_GUEST_CR4, &state->crs[NVMM_X64_CR_CR4]);
		state->crs[NVMM_X64_CR_CR8] = cpudata->gcr8;
		state->crs[NVMM_X64_CR_XCR0] = cpudata->gxcr0;

		/* Hide VMXE. */
		state->crs[NVMM_X64_CR_CR4] &= ~CR4_VMXE;
	}

	CTASSERT(sizeof(cpudata->drs) == sizeof(state->drs));
	if (flags & NVMM_X64_STATE_DRS) {
		memcpy(state->drs, cpudata->drs, sizeof(state->drs));

		/* DR7 is the only debug register kept in the VMCS. */
		vmx_vmread(VMCS_GUEST_DR7, &state->drs[NVMM_X64_DR_DR7]);
	}

	if (flags & NVMM_X64_STATE_MSRS) {
		/* These MSRs are handled via the guest MSR load/store list. */
		state->msrs[NVMM_X64_MSR_STAR] =
		    cpudata->gmsr[VMX_MSRLIST_STAR].val;
		state->msrs[NVMM_X64_MSR_LSTAR] =
		    cpudata->gmsr[VMX_MSRLIST_LSTAR].val;
		state->msrs[NVMM_X64_MSR_CSTAR] =
		    cpudata->gmsr[VMX_MSRLIST_CSTAR].val;
		state->msrs[NVMM_X64_MSR_SFMASK] =
		    cpudata->gmsr[VMX_MSRLIST_SFMASK].val;
		state->msrs[NVMM_X64_MSR_KERNELGSBASE] =
		    cpudata->gmsr[VMX_MSRLIST_KERNELGSBASE].val;

		/* These MSRs have dedicated VMCS fields. */
		vmx_vmread(VMCS_GUEST_IA32_EFER,
		    &state->msrs[NVMM_X64_MSR_EFER]);
		vmx_vmread(VMCS_GUEST_IA32_PAT,
		    &state->msrs[NVMM_X64_MSR_PAT]);
		vmx_vmread(VMCS_GUEST_IA32_SYSENTER_CS,
		    &state->msrs[NVMM_X64_MSR_SYSENTER_CS]);
		vmx_vmread(VMCS_GUEST_IA32_SYSENTER_ESP,
		    &state->msrs[NVMM_X64_MSR_SYSENTER_ESP]);
		vmx_vmread(VMCS_GUEST_IA32_SYSENTER_EIP,
		    &state->msrs[NVMM_X64_MSR_SYSENTER_EIP]);

		state->msrs[NVMM_X64_MSR_TSC] = cpudata->gtsc;
	}

	if (flags & NVMM_X64_STATE_INTR) {
		/* Either STI or MOVSS blocking is reported as int_shadow. */
		vmx_vmread(VMCS_GUEST_INTERRUPTIBILITY, &intstate);
		state->intr.int_shadow =
		    (intstate & (INT_STATE_STI|INT_STATE_MOVSS)) != 0;
		state->intr.int_window_exiting = cpudata->int_window_exit;
		state->intr.nmi_window_exiting = cpudata->nmi_window_exit;
		state->intr.evt_pending = cpudata->evt_pending;
	}

	CTASSERT(sizeof(cpudata->gfpu.xsh_fxsave) == sizeof(state->fpu));
	if (flags & NVMM_X64_STATE_FPU) {
		memcpy(&state->fpu, cpudata->gfpu.xsh_fxsave,
		    sizeof(state->fpu));
	}

	vmx_vmcs_leave(vcpu);
}
                   2371:
1.12      maxv     2372: /* -------------------------------------------------------------------------- */
                   2373:
1.1       maxv     2374: static void
1.12      maxv     2375: vmx_asid_alloc(struct nvmm_cpu *vcpu)
1.1       maxv     2376: {
1.12      maxv     2377:        struct vmx_cpudata *cpudata = vcpu->cpudata;
                   2378:        size_t i, oct, bit;
                   2379:
                   2380:        mutex_enter(&vmx_asidlock);
                   2381:
                   2382:        for (i = 0; i < vmx_maxasid; i++) {
                   2383:                oct = i / 8;
                   2384:                bit = i % 8;
                   2385:
                   2386:                if (vmx_asidmap[oct] & __BIT(bit)) {
                   2387:                        continue;
                   2388:                }
                   2389:
                   2390:                cpudata->asid = i;
1.1       maxv     2391:
1.12      maxv     2392:                vmx_asidmap[oct] |= __BIT(bit);
                   2393:                vmx_vmwrite(VMCS_VPID, i);
                   2394:                mutex_exit(&vmx_asidlock);
                   2395:                return;
1.1       maxv     2396:        }
                   2397:
1.12      maxv     2398:        mutex_exit(&vmx_asidlock);
                   2399:
                   2400:        panic("%s: impossible", __func__);
1.1       maxv     2401: }
                   2402:
1.12      maxv     2403: static void
                   2404: vmx_asid_free(struct nvmm_cpu *vcpu)
1.1       maxv     2405: {
1.12      maxv     2406:        size_t oct, bit;
                   2407:        uint64_t asid;
1.1       maxv     2408:
1.12      maxv     2409:        vmx_vmread(VMCS_VPID, &asid);
1.1       maxv     2410:
1.12      maxv     2411:        oct = asid / 8;
                   2412:        bit = asid % 8;
1.1       maxv     2413:
1.12      maxv     2414:        mutex_enter(&vmx_asidlock);
                   2415:        vmx_asidmap[oct] &= ~__BIT(bit);
                   2416:        mutex_exit(&vmx_asidlock);
1.1       maxv     2417: }
                   2418:
/*
 * Initialize a freshly created VCPU: build its VMCS from scratch
 * (controls, MSR bitmap and lists, CR masks, host state, VPID, EPTP),
 * initialize the software-tracked MSR/FPU state, and install the
 * architectural RESET state.
 */
static void
vmx_vcpu_init(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
{
	struct vmx_cpudata *cpudata = vcpu->cpudata;
	struct vmcs *vmcs = cpudata->vmcs;
	struct msr_entry *gmsr = cpudata->gmsr;
	extern uint8_t vmx_resume_rip;
	uint64_t rev, eptp;

	rev = vmx_get_revision();

	/* The VMCS header must carry the CPU's revision identifier. */
	memset(vmcs, 0, VMCS_SIZE);
	vmcs->ident = __SHIFTIN(rev, VMCS_IDENT_REVISION);
	vmcs->abort = 0;

	vmx_vmcs_enter(vcpu);

	/* No link pointer. */
	vmx_vmwrite(VMCS_LINK_POINTER, 0xFFFFFFFFFFFFFFFF);

	/* Install the CTLSs. */
	vmx_vmwrite(VMCS_PINBASED_CTLS, vmx_pinbased_ctls);
	vmx_vmwrite(VMCS_PROCBASED_CTLS, vmx_procbased_ctls);
	vmx_vmwrite(VMCS_PROCBASED_CTLS2, vmx_procbased_ctls2);
	vmx_vmwrite(VMCS_ENTRY_CTLS, vmx_entry_ctls);
	vmx_vmwrite(VMCS_EXIT_CTLS, vmx_exit_ctls);

	/* Allow direct access to certain MSRs. */
	memset(cpudata->msrbm, 0xFF, MSRBM_SIZE);
	vmx_vcpu_msr_allow(cpudata->msrbm, MSR_EFER, true, true);
	vmx_vcpu_msr_allow(cpudata->msrbm, MSR_STAR, true, true);
	vmx_vcpu_msr_allow(cpudata->msrbm, MSR_LSTAR, true, true);
	vmx_vcpu_msr_allow(cpudata->msrbm, MSR_CSTAR, true, true);
	vmx_vcpu_msr_allow(cpudata->msrbm, MSR_SFMASK, true, true);
	vmx_vcpu_msr_allow(cpudata->msrbm, MSR_KERNELGSBASE, true, true);
	vmx_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_CS, true, true);
	vmx_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_ESP, true, true);
	vmx_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_EIP, true, true);
	vmx_vcpu_msr_allow(cpudata->msrbm, MSR_FSBASE, true, true);
	vmx_vcpu_msr_allow(cpudata->msrbm, MSR_GSBASE, true, true);
	vmx_vcpu_msr_allow(cpudata->msrbm, MSR_TSC, true, false);
	vmx_vcpu_msr_allow(cpudata->msrbm, MSR_IA32_ARCH_CAPABILITIES,
	    true, false);
	vmx_vmwrite(VMCS_MSR_BITMAP, (uint64_t)cpudata->msrbm_pa);

	/*
	 * List of Guest MSRs loaded on VMENTRY, saved on VMEXIT. This
	 * includes the L1D_FLUSH MSR, to mitigate L1TF.
	 */
	gmsr[VMX_MSRLIST_STAR].msr = MSR_STAR;
	gmsr[VMX_MSRLIST_STAR].val = 0;
	gmsr[VMX_MSRLIST_LSTAR].msr = MSR_LSTAR;
	gmsr[VMX_MSRLIST_LSTAR].val = 0;
	gmsr[VMX_MSRLIST_CSTAR].msr = MSR_CSTAR;
	gmsr[VMX_MSRLIST_CSTAR].val = 0;
	gmsr[VMX_MSRLIST_SFMASK].msr = MSR_SFMASK;
	gmsr[VMX_MSRLIST_SFMASK].val = 0;
	gmsr[VMX_MSRLIST_KERNELGSBASE].msr = MSR_KERNELGSBASE;
	gmsr[VMX_MSRLIST_KERNELGSBASE].val = 0;
	gmsr[VMX_MSRLIST_L1DFLUSH].msr = MSR_IA32_FLUSH_CMD;
	gmsr[VMX_MSRLIST_L1DFLUSH].val = IA32_FLUSH_CMD_L1D_FLUSH;
	vmx_vmwrite(VMCS_ENTRY_MSR_LOAD_ADDRESS, cpudata->gmsr_pa);
	vmx_vmwrite(VMCS_EXIT_MSR_STORE_ADDRESS, cpudata->gmsr_pa);
	vmx_vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, vmx_msrlist_entry_nmsr);
	vmx_vmwrite(VMCS_EXIT_MSR_STORE_COUNT, VMX_MSRLIST_EXIT_NMSR);

	/* Force CR0_NW and CR0_CD to zero, CR0_ET to one. */
	vmx_vmwrite(VMCS_CR0_MASK, CR0_NW|CR0_CD|CR0_ET);
	vmx_vmwrite(VMCS_CR0_SHADOW, CR0_ET);

	/* Force CR4_VMXE to zero. */
	vmx_vmwrite(VMCS_CR4_MASK, CR4_VMXE);

	/* Set the Host state for resuming. */
	vmx_vmwrite(VMCS_HOST_RIP, (uint64_t)&vmx_resume_rip);
	vmx_vmwrite(VMCS_HOST_CS_SELECTOR, GSEL(GCODE_SEL, SEL_KPL));
	vmx_vmwrite(VMCS_HOST_SS_SELECTOR, GSEL(GDATA_SEL, SEL_KPL));
	vmx_vmwrite(VMCS_HOST_DS_SELECTOR, GSEL(GDATA_SEL, SEL_KPL));
	vmx_vmwrite(VMCS_HOST_ES_SELECTOR, GSEL(GDATA_SEL, SEL_KPL));
	vmx_vmwrite(VMCS_HOST_FS_SELECTOR, 0);
	vmx_vmwrite(VMCS_HOST_GS_SELECTOR, 0);
	vmx_vmwrite(VMCS_HOST_IA32_SYSENTER_CS, 0);
	vmx_vmwrite(VMCS_HOST_IA32_SYSENTER_ESP, 0);
	vmx_vmwrite(VMCS_HOST_IA32_SYSENTER_EIP, 0);
	vmx_vmwrite(VMCS_HOST_IDTR_BASE, (uint64_t)idt);
	vmx_vmwrite(VMCS_HOST_IA32_PAT, rdmsr(MSR_CR_PAT));
	vmx_vmwrite(VMCS_HOST_IA32_EFER, rdmsr(MSR_EFER));
	vmx_vmwrite(VMCS_HOST_CR0, rcr0());

	/* Generate ASID. */
	vmx_asid_alloc(vcpu);

	/* Enable Extended Paging, 4-Level. */
	eptp =
	    __SHIFTIN(vmx_eptp_type, EPTP_TYPE) |
	    __SHIFTIN(4-1, EPTP_WALKLEN) |
	    (pmap_ept_has_ad ? EPTP_FLAGS_AD : 0) |
	    mach->vm->vm_map.pmap->pm_pdirpa[0];
	vmx_vmwrite(VMCS_EPTP, eptp);

	/*
	 * Init IA32_MISC_ENABLE: start from the host value, but hide the
	 * features the guest must not use.
	 */
	cpudata->gmsr_misc_enable = rdmsr(MSR_MISC_ENABLE);
	cpudata->gmsr_misc_enable &=
	    ~(IA32_MISC_PERFMON_EN|IA32_MISC_EISST_EN|IA32_MISC_MWAIT_EN);
	cpudata->gmsr_misc_enable |=
	    (IA32_MISC_BTS_UNAVAIL|IA32_MISC_PEBS_UNAVAIL);

	/* Init XSAVE header. */
	cpudata->gfpu.xsh_xstate_bv = vmx_xcr0_mask;
	cpudata->gfpu.xsh_xcomp_bv = 0;

	/* These MSRs are static. */
	cpudata->star = rdmsr(MSR_STAR);
	cpudata->cstar = rdmsr(MSR_CSTAR);
	cpudata->sfmask = rdmsr(MSR_SFMASK);

	/* Install the RESET state. */
	vmx_vcpu_setstate(vcpu, &nvmm_x86_reset_state, NVMM_X64_STATE_ALL);

	vmx_vmcs_leave(vcpu);
}
                   2540:
1.12      maxv     2541: static int
                   2542: vmx_vcpu_create(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
1.1       maxv     2543: {
1.12      maxv     2544:        struct vmx_cpudata *cpudata;
                   2545:        int error;
1.1       maxv     2546:
1.12      maxv     2547:        /* Allocate the VMX cpudata. */
                   2548:        cpudata = (struct vmx_cpudata *)uvm_km_alloc(kernel_map,
                   2549:            roundup(sizeof(*cpudata), PAGE_SIZE), 0,
                   2550:            UVM_KMF_WIRED|UVM_KMF_ZERO);
                   2551:        vcpu->cpudata = cpudata;
1.1       maxv     2552:
1.12      maxv     2553:        /* VMCS */
                   2554:        error = vmx_memalloc(&cpudata->vmcs_pa, (vaddr_t *)&cpudata->vmcs,
                   2555:            VMCS_NPAGES);
                   2556:        if (error)
                   2557:                goto error;
1.1       maxv     2558:
1.12      maxv     2559:        /* MSR Bitmap */
                   2560:        error = vmx_memalloc(&cpudata->msrbm_pa, (vaddr_t *)&cpudata->msrbm,
                   2561:            MSRBM_NPAGES);
                   2562:        if (error)
                   2563:                goto error;
1.1       maxv     2564:
1.12      maxv     2565:        /* Guest MSR List */
                   2566:        error = vmx_memalloc(&cpudata->gmsr_pa, (vaddr_t *)&cpudata->gmsr, 1);
                   2567:        if (error)
                   2568:                goto error;
1.1       maxv     2569:
1.12      maxv     2570:        kcpuset_create(&cpudata->htlb_want_flush, true);
1.1       maxv     2571:
1.12      maxv     2572:        /* Init the VCPU info. */
                   2573:        vmx_vcpu_init(mach, vcpu);
1.1       maxv     2574:
1.12      maxv     2575:        return 0;
1.1       maxv     2576:
1.12      maxv     2577: error:
                   2578:        if (cpudata->vmcs_pa) {
                   2579:                vmx_memfree(cpudata->vmcs_pa, (vaddr_t)cpudata->vmcs,
                   2580:                    VMCS_NPAGES);
                   2581:        }
                   2582:        if (cpudata->msrbm_pa) {
                   2583:                vmx_memfree(cpudata->msrbm_pa, (vaddr_t)cpudata->msrbm,
                   2584:                    MSRBM_NPAGES);
                   2585:        }
                   2586:        if (cpudata->gmsr_pa) {
                   2587:                vmx_memfree(cpudata->gmsr_pa, (vaddr_t)cpudata->gmsr, 1);
1.1       maxv     2588:        }
                   2589:
1.12      maxv     2590:        kmem_free(cpudata, sizeof(*cpudata));
                   2591:        return error;
                   2592: }
1.1       maxv     2593:
/*
 * Destroy a VCPU: release its ASID, destroy its VMCS, and free every
 * per-VCPU resource allocated by vmx_vcpu_create().
 */
static void
vmx_vcpu_destroy(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
{
	struct vmx_cpudata *cpudata = vcpu->cpudata;

	/* The VMCS must be loaded to read the VPID and to clear it. */
	vmx_vmcs_enter(vcpu);
	vmx_asid_free(vcpu);
	vmx_vmcs_destroy(vcpu);

	kcpuset_destroy(cpudata->htlb_want_flush);

	vmx_memfree(cpudata->vmcs_pa, (vaddr_t)cpudata->vmcs, VMCS_NPAGES);
	vmx_memfree(cpudata->msrbm_pa, (vaddr_t)cpudata->msrbm, MSRBM_NPAGES);
	vmx_memfree(cpudata->gmsr_pa, (vaddr_t)cpudata->gmsr, 1);
	/* cpudata itself came from uvm_km_alloc(), wired. */
	uvm_km_free(kernel_map, (vaddr_t)cpudata,
	    roundup(sizeof(*cpudata), PAGE_SIZE), UVM_KMF_WIRED);
}
                   2611:
                   2612: /* -------------------------------------------------------------------------- */
                   2613:
/*
 * pmap callback, installed in vmx_machine_create(). Invoked when the
 * machine's EPT mappings change: bump the machine-wide host-TLB
 * generation so every VCPU notices the staleness.
 */
static void
vmx_tlb_flush(struct pmap *pm)
{
	struct nvmm_machine *mach = pm->pm_data;
	struct vmx_machdata *machdata = mach->machdata;

	atomic_inc_64(&machdata->mach_htlb_gen);

	/* Generates IPIs, which cause #VMEXITs. */
	pmap_tlb_shootdown(pmap_kernel(), -1, PG_G, TLBSHOOT_UPDATE);
}
                   2625:
/*
 * Create a machine: turn its pmap into an EPT pmap, hook up the TLB
 * flush callback, and allocate the per-machine metadata.
 */
static void
vmx_machine_create(struct nvmm_machine *mach)
{
	struct pmap *pmap = mach->vm->vm_map.pmap;
	struct vmx_machdata *machdata;

	/* Convert to EPT. */
	pmap_ept_transform(pmap);

	/* Fill in pmap info. */
	pmap->pm_data = (void *)mach;
	pmap->pm_tlb_flush = vmx_tlb_flush;

	machdata = kmem_zalloc(sizeof(struct vmx_machdata), KM_SLEEP);
	mach->machdata = machdata;

	/* Start with an hTLB flush everywhere. */
	machdata->mach_htlb_gen = 1;
}
                   2645:
                   2646: static void
                   2647: vmx_machine_destroy(struct nvmm_machine *mach)
                   2648: {
                   2649:        struct vmx_machdata *machdata = mach->machdata;
                   2650:
                   2651:        kmem_free(machdata, sizeof(struct vmx_machdata));
                   2652: }
                   2653:
                   2654: static int
                   2655: vmx_machine_configure(struct nvmm_machine *mach, uint64_t op, void *data)
                   2656: {
                   2657:        struct nvmm_x86_conf_cpuid *cpuid = data;
                   2658:        struct vmx_machdata *machdata = (struct vmx_machdata *)mach->machdata;
                   2659:        size_t i;
                   2660:
                   2661:        if (__predict_false(op != NVMM_X86_CONF_CPUID)) {
                   2662:                return EINVAL;
                   2663:        }
                   2664:
                   2665:        if (__predict_false((cpuid->set.eax & cpuid->del.eax) ||
                   2666:            (cpuid->set.ebx & cpuid->del.ebx) ||
                   2667:            (cpuid->set.ecx & cpuid->del.ecx) ||
                   2668:            (cpuid->set.edx & cpuid->del.edx))) {
                   2669:                return EINVAL;
                   2670:        }
                   2671:
                   2672:        /* If already here, replace. */
                   2673:        for (i = 0; i < VMX_NCPUIDS; i++) {
                   2674:                if (!machdata->cpuidpresent[i]) {
                   2675:                        continue;
                   2676:                }
                   2677:                if (machdata->cpuid[i].leaf == cpuid->leaf) {
                   2678:                        memcpy(&machdata->cpuid[i], cpuid,
                   2679:                            sizeof(struct nvmm_x86_conf_cpuid));
                   2680:                        return 0;
                   2681:                }
                   2682:        }
                   2683:
                   2684:        /* Not here, insert. */
                   2685:        for (i = 0; i < VMX_NCPUIDS; i++) {
                   2686:                if (!machdata->cpuidpresent[i]) {
                   2687:                        machdata->cpuidpresent[i] = true;
                   2688:                        memcpy(&machdata->cpuid[i], cpuid,
                   2689:                            sizeof(struct nvmm_x86_conf_cpuid));
                   2690:                        return 0;
                   2691:                }
                   2692:        }
                   2693:
                   2694:        return ENOBUFS;
                   2695: }
                   2696:
                   2697: /* -------------------------------------------------------------------------- */
                   2698:
/*
 * Compute the value of a VMCS control field, reconciling the CPU's
 * capability MSRs with our own mandatory settings.
 *
 * msr_ctls      default capability MSR for this control field
 * msr_true_ctls TRUE capability MSR, consulted when the CPU has one
 * set_one       bits we require to be 1
 * set_zero      bits we require to be 0
 * res           resulting control value, updated in place
 *
 * Returns 0 on success, -1 if a required setting is not supported.
 */
static int
vmx_init_ctls(uint64_t msr_ctls, uint64_t msr_true_ctls,
    uint64_t set_one, uint64_t set_zero, uint64_t *res)
{
	uint64_t basic, val, true_val;
	bool one_allowed, zero_allowed, has_true;
	size_t i;

	/* The TRUE_CTLS MSRs exist only if announced in IA32_VMX_BASIC. */
	basic = rdmsr(MSR_IA32_VMX_BASIC);
	has_true = (basic & IA32_VMX_BASIC_TRUE_CTLS) != 0;

	val = rdmsr(msr_ctls);
	if (has_true) {
		true_val = rdmsr(msr_true_ctls);
	} else {
		true_val = val;
	}

	/* Bits 63:32 of the MSR = allowed-1 settings, bits 31:0 = allowed-0. */
#define ONE_ALLOWED(msrval, bitoff) \
	((msrval & __BIT(32 + bitoff)) != 0)
#define ZERO_ALLOWED(msrval, bitoff) \
	((msrval & __BIT(bitoff)) == 0)

	for (i = 0; i < 32; i++) {
		one_allowed = ONE_ALLOWED(true_val, i);
		zero_allowed = ZERO_ALLOWED(true_val, i);

		if (zero_allowed && !one_allowed) {
			/* Only zero is allowed: fail if we need a one. */
			if (set_one & __BIT(i))
				return -1;
			*res &= ~__BIT(i);
		} else if (one_allowed && !zero_allowed) {
			/* Only one is allowed: fail if we need a zero. */
			if (set_zero & __BIT(i))
				return -1;
			*res |= __BIT(i);
		} else {
			/* Flexible bit: apply our constraints, else default. */
			if (set_zero & __BIT(i)) {
				*res &= ~__BIT(i);
			} else if (set_one & __BIT(i)) {
				*res |= __BIT(i);
			} else if (!has_true) {
				/* No TRUE MSR: default a don't-care to zero. */
				*res &= ~__BIT(i);
			} else if (ZERO_ALLOWED(val, i)) {
				/* Prefer the non-TRUE MSR's default of zero. */
				*res &= ~__BIT(i);
			} else if (ONE_ALLOWED(val, i)) {
				*res |= __BIT(i);
			} else {
				return -1;
			}
		}
	}

	return 0;
}
                   2753:
                   2754: static bool
                   2755: vmx_ident(void)
                   2756: {
                   2757:        uint64_t msr;
                   2758:        int ret;
                   2759:
                   2760:        if (!(cpu_feature[1] & CPUID2_VMX)) {
                   2761:                return false;
                   2762:        }
                   2763:
                   2764:        msr = rdmsr(MSR_IA32_FEATURE_CONTROL);
                   2765:        if ((msr & IA32_FEATURE_CONTROL_LOCK) == 0) {
                   2766:                return false;
                   2767:        }
                   2768:
                   2769:        msr = rdmsr(MSR_IA32_VMX_BASIC);
                   2770:        if ((msr & IA32_VMX_BASIC_IO_REPORT) == 0) {
                   2771:                return false;
                   2772:        }
                   2773:        if (__SHIFTOUT(msr, IA32_VMX_BASIC_MEM_TYPE) != MEM_TYPE_WB) {
                   2774:                return false;
                   2775:        }
                   2776:
                   2777:        /* PG and PE are reported, even if Unrestricted Guests is supported. */
                   2778:        vmx_cr0_fixed0 = rdmsr(MSR_IA32_VMX_CR0_FIXED0) & ~(CR0_PG|CR0_PE);
                   2779:        vmx_cr0_fixed1 = rdmsr(MSR_IA32_VMX_CR0_FIXED1) | (CR0_PG|CR0_PE);
                   2780:        ret = vmx_check_cr(rcr0(), vmx_cr0_fixed0, vmx_cr0_fixed1);
                   2781:        if (ret == -1) {
                   2782:                return false;
                   2783:        }
                   2784:
                   2785:        vmx_cr4_fixed0 = rdmsr(MSR_IA32_VMX_CR4_FIXED0);
                   2786:        vmx_cr4_fixed1 = rdmsr(MSR_IA32_VMX_CR4_FIXED1);
                   2787:        ret = vmx_check_cr(rcr4() | CR4_VMXE, vmx_cr4_fixed0, vmx_cr4_fixed1);
                   2788:        if (ret == -1) {
                   2789:                return false;
                   2790:        }
                   2791:
                   2792:        /* Init the CTLSs right now, and check for errors. */
                   2793:        ret = vmx_init_ctls(
                   2794:            MSR_IA32_VMX_PINBASED_CTLS, MSR_IA32_VMX_TRUE_PINBASED_CTLS,
                   2795:            VMX_PINBASED_CTLS_ONE, VMX_PINBASED_CTLS_ZERO,
                   2796:            &vmx_pinbased_ctls);
                   2797:        if (ret == -1) {
                   2798:                return false;
                   2799:        }
                   2800:        ret = vmx_init_ctls(
                   2801:            MSR_IA32_VMX_PROCBASED_CTLS, MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
                   2802:            VMX_PROCBASED_CTLS_ONE, VMX_PROCBASED_CTLS_ZERO,
                   2803:            &vmx_procbased_ctls);
                   2804:        if (ret == -1) {
                   2805:                return false;
                   2806:        }
                   2807:        ret = vmx_init_ctls(
                   2808:            MSR_IA32_VMX_PROCBASED_CTLS2, MSR_IA32_VMX_PROCBASED_CTLS2,
                   2809:            VMX_PROCBASED_CTLS2_ONE, VMX_PROCBASED_CTLS2_ZERO,
                   2810:            &vmx_procbased_ctls2);
                   2811:        if (ret == -1) {
                   2812:                return false;
                   2813:        }
                   2814:        ret = vmx_init_ctls(
                   2815:            MSR_IA32_VMX_ENTRY_CTLS, MSR_IA32_VMX_TRUE_ENTRY_CTLS,
                   2816:            VMX_ENTRY_CTLS_ONE, VMX_ENTRY_CTLS_ZERO,
                   2817:            &vmx_entry_ctls);
                   2818:        if (ret == -1) {
                   2819:                return false;
                   2820:        }
                   2821:        ret = vmx_init_ctls(
                   2822:            MSR_IA32_VMX_EXIT_CTLS, MSR_IA32_VMX_TRUE_EXIT_CTLS,
                   2823:            VMX_EXIT_CTLS_ONE, VMX_EXIT_CTLS_ZERO,
                   2824:            &vmx_exit_ctls);
                   2825:        if (ret == -1) {
                   2826:                return false;
                   2827:        }
                   2828:
1.10      maxv     2829:        msr = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP);
                   2830:        if ((msr & IA32_VMX_EPT_VPID_WALKLENGTH_4) == 0) {
                   2831:                return false;
                   2832:        }
                   2833:        if ((msr & IA32_VMX_EPT_VPID_INVEPT) == 0) {
                   2834:                return false;
                   2835:        }
                   2836:        if ((msr & IA32_VMX_EPT_VPID_INVVPID) == 0) {
                   2837:                return false;
                   2838:        }
1.13      maxv     2839:        if ((msr & IA32_VMX_EPT_VPID_FLAGS_AD) != 0) {
                   2840:                pmap_ept_has_ad = true;
                   2841:        } else {
                   2842:                pmap_ept_has_ad = false;
1.10      maxv     2843:        }
                   2844:        if (!(msr & IA32_VMX_EPT_VPID_UC) && !(msr & IA32_VMX_EPT_VPID_WB)) {
                   2845:                return false;
                   2846:        }
                   2847:
1.1       maxv     2848:        return true;
                   2849: }
                   2850:
                   2851: static void
1.12      maxv     2852: vmx_init_asid(uint32_t maxasid)
                   2853: {
                   2854:        size_t allocsz;
                   2855:
                   2856:        mutex_init(&vmx_asidlock, MUTEX_DEFAULT, IPL_NONE);
                   2857:
                   2858:        vmx_maxasid = maxasid;
                   2859:        allocsz = roundup(maxasid, 8) / 8;
                   2860:        vmx_asidmap = kmem_zalloc(allocsz, KM_SLEEP);
                   2861:
                   2862:        /* ASID 0 is reserved for the host. */
                   2863:        vmx_asidmap[0] |= __BIT(0);
                   2864: }
                   2865:
                   2866: static void
1.1       maxv     2867: vmx_change_cpu(void *arg1, void *arg2)
                   2868: {
                   2869:        struct cpu_info *ci = curcpu();
                   2870:        bool enable = (bool)arg1;
                   2871:        uint64_t cr4;
                   2872:
                   2873:        if (!enable) {
                   2874:                vmx_vmxoff();
                   2875:        }
                   2876:
                   2877:        cr4 = rcr4();
                   2878:        if (enable) {
                   2879:                cr4 |= CR4_VMXE;
                   2880:        } else {
                   2881:                cr4 &= ~CR4_VMXE;
                   2882:        }
                   2883:        lcr4(cr4);
                   2884:
                   2885:        if (enable) {
                   2886:                vmx_vmxon(&vmxoncpu[cpu_index(ci)].pa);
                   2887:        }
                   2888: }
                   2889:
                   2890: static void
                   2891: vmx_init_l1tf(void)
                   2892: {
                   2893:        u_int descs[4];
                   2894:        uint64_t msr;
                   2895:
                   2896:        if (cpuid_level < 7) {
                   2897:                return;
                   2898:        }
                   2899:
                   2900:        x86_cpuid(7, descs);
                   2901:
                   2902:        if (descs[3] & CPUID_SEF_ARCH_CAP) {
                   2903:                msr = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
                   2904:                if (msr & IA32_ARCH_SKIP_L1DFL_VMENTRY) {
                   2905:                        /* No mitigation needed. */
                   2906:                        return;
                   2907:                }
                   2908:        }
                   2909:
                   2910:        if (descs[3] & CPUID_SEF_L1D_FLUSH) {
                   2911:                /* Enable hardware mitigation. */
                   2912:                vmx_msrlist_entry_nmsr += 1;
                   2913:        }
                   2914: }
                   2915:
                   2916: static void
                   2917: vmx_init(void)
                   2918: {
                   2919:        CPU_INFO_ITERATOR cii;
                   2920:        struct cpu_info *ci;
                   2921:        uint64_t xc, msr;
                   2922:        struct vmxon *vmxon;
                   2923:        uint32_t revision;
                   2924:        paddr_t pa;
                   2925:        vaddr_t va;
                   2926:        int error;
                   2927:
                   2928:        /* Init the ASID bitmap (VPID). */
                   2929:        vmx_init_asid(VPID_MAX);
                   2930:
                   2931:        /* Init the XCR0 mask. */
                   2932:        vmx_xcr0_mask = VMX_XCR0_MASK_DEFAULT & x86_xsave_features;
                   2933:
                   2934:        /* Init the TLB flush op, the EPT flush op and the EPTP type. */
                   2935:        msr = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP);
                   2936:        if ((msr & IA32_VMX_EPT_VPID_INVVPID_CONTEXT) != 0) {
                   2937:                vmx_tlb_flush_op = VMX_INVVPID_CONTEXT;
                   2938:        } else {
                   2939:                vmx_tlb_flush_op = VMX_INVVPID_ALL;
                   2940:        }
                   2941:        if ((msr & IA32_VMX_EPT_VPID_INVEPT_CONTEXT) != 0) {
                   2942:                vmx_ept_flush_op = VMX_INVEPT_CONTEXT;
                   2943:        } else {
                   2944:                vmx_ept_flush_op = VMX_INVEPT_ALL;
                   2945:        }
                   2946:        if ((msr & IA32_VMX_EPT_VPID_WB) != 0) {
                   2947:                vmx_eptp_type = EPTP_TYPE_WB;
                   2948:        } else {
                   2949:                vmx_eptp_type = EPTP_TYPE_UC;
                   2950:        }
                   2951:
                   2952:        /* Init the L1TF mitigation. */
                   2953:        vmx_init_l1tf();
                   2954:
                   2955:        memset(vmxoncpu, 0, sizeof(vmxoncpu));
                   2956:        revision = vmx_get_revision();
                   2957:
                   2958:        for (CPU_INFO_FOREACH(cii, ci)) {
                   2959:                error = vmx_memalloc(&pa, &va, 1);
                   2960:                if (error) {
                   2961:                        panic("%s: out of memory", __func__);
                   2962:                }
                   2963:                vmxoncpu[cpu_index(ci)].pa = pa;
                   2964:                vmxoncpu[cpu_index(ci)].va = va;
                   2965:
                   2966:                vmxon = (struct vmxon *)vmxoncpu[cpu_index(ci)].va;
                   2967:                vmxon->ident = __SHIFTIN(revision, VMXON_IDENT_REVISION);
                   2968:        }
                   2969:
                   2970:        xc = xc_broadcast(0, vmx_change_cpu, (void *)true, NULL);
                   2971:        xc_wait(xc);
                   2972: }
                   2973:
                   2974: static void
                   2975: vmx_fini_asid(void)
                   2976: {
                   2977:        size_t allocsz;
                   2978:
                   2979:        allocsz = roundup(vmx_maxasid, 8) / 8;
                   2980:        kmem_free(vmx_asidmap, allocsz);
                   2981:
                   2982:        mutex_destroy(&vmx_asidlock);
                   2983: }
                   2984:
                   2985: static void
                   2986: vmx_fini(void)
                   2987: {
                   2988:        uint64_t xc;
                   2989:        size_t i;
                   2990:
                   2991:        xc = xc_broadcast(0, vmx_change_cpu, (void *)false, NULL);
                   2992:        xc_wait(xc);
                   2993:
                   2994:        for (i = 0; i < MAXCPUS; i++) {
                   2995:                if (vmxoncpu[i].pa != 0)
                   2996:                        vmx_memfree(vmxoncpu[i].pa, vmxoncpu[i].va, 1);
                   2997:        }
                   2998:
                   2999:        vmx_fini_asid();
                   3000: }
                   3001:
                   3002: static void
                   3003: vmx_capability(struct nvmm_capability *cap)
                   3004: {
                   3005:        cap->u.x86.xcr0_mask = vmx_xcr0_mask;
                   3006:        cap->u.x86.mxcsr_mask = x86_fpu_mxcsr_mask;
                   3007:        cap->u.x86.conf_cpuid_maxops = VMX_NCPUIDS;
                   3008: }
                   3009:
/*
 * The Intel-VMX backend of NVMM. The machine-independent NVMM core
 * dispatches all machine and VCPU operations through this table.
 */
const struct nvmm_impl nvmm_x86_vmx = {
	.ident = vmx_ident,
	.init = vmx_init,
	.fini = vmx_fini,
	.capability = vmx_capability,
	.conf_max = NVMM_X86_NCONF,
	.conf_sizes = vmx_conf_sizes,
	.state_size = sizeof(struct nvmm_x64_state),
	.machine_create = vmx_machine_create,
	.machine_destroy = vmx_machine_destroy,
	.machine_configure = vmx_machine_configure,
	.vcpu_create = vmx_vcpu_create,
	.vcpu_destroy = vmx_vcpu_destroy,
	.vcpu_setstate = vmx_vcpu_setstate,
	.vcpu_getstate = vmx_vcpu_getstate,
	.vcpu_inject = vmx_vcpu_inject,
	.vcpu_run = vmx_vcpu_run
};

CVSweb <webmaster@jp.NetBSD.org>