/*-
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/_iovec.h>

#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>

/*
 * Various functions in this file use 0 to denote success and VMEXIT_ABORT
 * or VMEXIT_RESTART to denote failure. This assumes that the VMEXIT_xyz
 * macros expand to non-zero values. Enforce this with a compile-time
 * assertion.
 */
CTASSERT(VMEXIT_ABORT != 0);
CTASSERT(VMEXIT_RESTART != 0);

/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
CTASSERT(sizeof(struct tss32) == 104);

#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)
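
/*
 * For example, a selector value of 0x1f has index 3, TI = 1 (LDT) and
 * RPL = 3; its descriptor occupies bytes 0x18 through 0x1f of the LDT,
 * so SEL_START(0x1f) == 0x18 and SEL_LIMIT(0x1f) == 0x1f.  TSS_BUSY()
 * tests the busy bit (bit 1) of a system-segment type: it is true for
 * SDT_SYS386BSY (11) and false for SDT_SYS386TSS (9).
 */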

GETREG(struct vmctx *ctx, int vcpu, int reg)
	error = vm_get_register(ctx, vcpu, reg, &val);

SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
	error = vm_set_register(ctx, vcpu, reg, val);

static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff; /* page granularity */
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);	/* byte granularity */
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;
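
	/*
	 * The access word assembled above follows the layout expected by
	 * 'struct seg_desc' and the VT-x segment access-rights format:
	 * type in bits 3:0, S in bit 4, DPL in bits 6:5, P in bit 7,
	 * AVL (sd_xx) in bit 12, D/B in bit 14 and G in bit 15.  Note that
	 * sd_type in 'struct user_segment_descriptor' is five bits wide and
	 * already includes the S bit.
	 */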

/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	error = vm_inject_exception2(ctx, vcpu, vector, sel);
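
	/*
	 * For example, a GDT selector of 0x28 (index 5) with 'ext' set is
	 * delivered with an error code of 0x29: the selector's index and TI
	 * bit with the IDT bit clear and the EXT bit set.
	 */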

desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
	uint32_t limit, access;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))

	if (limit < SEL_LIMIT(sel))

desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread)
	uint32_t limit, access;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_gla2gpa(ctx, vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov));
		vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
		vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));

desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc)
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, true));

desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc)
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, false));

read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc)
	struct vm_guest_paging sup_paging;

	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
		return (VMEXIT_RESTART);

	sup_paging = ts->paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc);
		return (VMEXIT_ABORT);
		return (VMEXIT_RESTART);

code_desc(int sd_type)
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);

stack_desc(int sd_type)
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);

data_desc(int sd_type)
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);

ldt_desc(int sd_type)
	return (sd_type == SDT_SYSLDT);
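
/*
 * The 5-bit sd_type values tested above include the S bit (0x10) along with
 * the 4-bit segment type.  code_desc() therefore checks for S = 1 with the
 * executable bit (0x8) set; stack_desc() requires a writable, non-executable
 * data segment (ignoring the expand-down and accessed bits); data_desc()
 * accepts any data segment as well as a readable code segment, which may
 * also be loaded into DS/ES/FS/GS.
 */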

validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc)
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	ldtseg = codeseg = stackseg = dataseg = false;
	case VM_REG_GUEST_LDTR:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:

	/* Get the segment selector */
	sel = GETREG(ctx, vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (VMEXIT_RESTART);

	/* Descriptor table limit check */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (VMEXIT_RESTART);

	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (VMEXIT_RESTART);
		seg_desc->access = 0x10000;	/* unusable */

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd);
		return (VMEXIT_ABORT);
		return (VMEXIT_RESTART);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (VMEXIT_RESTART);

	/* Segment must be marked present */
		sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
		return (VMEXIT_RESTART);

	cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (VMEXIT_RESTART);

	conforming = (usd.sd_type & 0x4) ? true : false;
	if ((conforming && (cpl < dpl)) ||
	    (!conforming && (cpl != dpl))) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (VMEXIT_RESTART);

	/*
	 * A data segment is always non-conforming except when its
	 * descriptor is a readable, conforming code segment.
	 */
	if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)

	if (!conforming && (rpl > dpl || cpl > dpl)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (VMEXIT_RESTART);

	*seg_desc = usd_to_seg_desc(&usd);

tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
	/* General purpose registers */
	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);

	tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;

	/* Copy updated old TSS into guest memory */
	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));

update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
	error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);

tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov)
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;

	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;

	eflags = tss->tss_eflags;

	SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);

	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
			maxphyaddr = (1UL << 36) - 1;
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					error = vm_inject_exception2(ctx, vcpu,
					return (VMEXIT_RESTART);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
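
			/*
			 * Presumably the PDPTE registers are refreshed here
			 * because, with nested paging, the processor takes
			 * the guest's PDPTEs from the VMCS guest-state area
			 * on VM entry rather than re-reading them through
			 * CR3; since this CR3 change is emulated rather than
			 * executed by the guest, the PDPTEs must be validated
			 * and written back by hand.
			 */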
		SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;

	SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc);
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);

push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode)
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;

	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))

	esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
		return (VMEXIT_RESTART);

	if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		error = vm_inject_exception2(ctx, vcpu, IDT_AC, 1);
		return (VMEXIT_RESTART);

	error = vm_gla2gpa(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
	assert(error == 0 || error == 1 || error == -1);
		return ((error == 1) ? VMEXIT_RESTART : VMEXIT_ABORT);

	vm_copyout(ctx, vcpu, &errcode, iov, bytes);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
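
	/*
	 * As with a hardware-delivered fault through a task gate, the guest
	 * stack pointer is left pointing at the error code just pushed onto
	 * the new task's stack.
	 */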

vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, minlimit, nt_type, ot_type, vcpu;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		return (VMEXIT_RESTART);

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
		return (VMEXIT_RESTART);

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)

	assert(minlimit > 0);
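
	/*
	 * Note that 'minlimit' is a descriptor limit, i.e. one less than the
	 * TSS size in bytes: 0x67 (103) for a 104-byte 32-bit TSS and 0x2b
	 * (43) for a 44-byte 16-bit TSS.
	 */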
	if (nt.limit < minlimit) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		return (VMEXIT_RESTART);

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		return (VMEXIT_RESTART);

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
		return (VMEXIT_RESTART);

	/* Fetch the new TSS */
	error = vm_gla2gpa(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov));
		/* Restart vcpu execution to handle the page fault */
		return (VMEXIT_RESTART);
	} else if (error == -1) {
		return (VMEXIT_ABORT);
	vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
		return (VMEXIT_RESTART);

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel,

	/* Get the old TSS */
	error = vm_gla2gpa(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov));
		/* Restart vcpu execution to handle the page fault */
		return (VMEXIT_RESTART);
	} else if (error == -1) {
		fprintf(stderr, "Error copying in old TSS: %d\n", errno);
		return (VMEXIT_ABORT);
	vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);

	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * is due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		fprintf(stderr, "Task switch to 16-bit TSS not supported\n");
		return (VMEXIT_ABORT);

	/* Save processor state in old TSS */
	eip = vmexit->rip + vmexit->inst_length;
	tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,

	/* Update task register to point at the new TSS */
	SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);

	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);
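
	/*
	 * The processor sets CR0.TS on every hardware task switch so that
	 * the first x87/SIMD instruction executed by the new task raises a
	 * device-not-available fault (#NM); setting it here mirrors that
	 * behavior.
	 */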

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	vmexit->rip = newtss.tss_eip;
	vmexit->inst_length = 0;

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode);

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation":
	 * If the "virtual NMIs" control is 1, IRET removes any virtual-NMI blocking.
	 * This unblocking of virtual-NMI occurs even if IRET causes a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 */

	/*
	 * XXX if the original task switch was triggered by a hardware
	 * exception then do we generate a double-fault if we encounter
	 * an exception during the task switch?
	 */

	/*
	 * XXX should we inject a debug exception if the 'T' bit is 1?
	 */
	return (VMEXIT_RESTART);