If you haven't read them yet, read the previous parts first:
Hand-Writing an Intel Virtualization Driver for Linux
Hand-Writing an Intel Virtualization Driver for Linux (Part 2)
Recap
In the previous part we got cpuid to return a spoofed VendorID, but the effect only lasted an instant: right after the check, the cpuid instruction that exits the VM was issued. In this part we continue working on keeping the VM alive.
Persistence: First Attempt
The most direct approach is a delay: add a 5-second sleep before exiting. Run it, and the system throws an error, with the offending code located at arch/x86/kernel/fpu/xstate.h:197.
Open the corresponding source and you can see the warning is driven by err, whose final value is set inside XSTATE_XSAVE, so the "cut here" splat above is an XSAVE error.
The explanation for this exception can be found in the manual:
In other words, a bit in the Secondary Processor-Based execution controls governs this behavior: if "enable XSAVES/XRSTORS" is left at 0, XSAVES in the guest faults, which is exactly what the kernel trips over. To fix it, that field has to be initialized, and, per the manual, for the secondary controls to take effect at all the "activate secondary controls" bit (bit 31) of the primary Processor-Based controls must also be set. rdtscp and invpcid have the same problem, so their corresponding enable bits need to be set as well.
The Fix
```c
// vmcsfield.h

// MSR Msr_kIa32VmxProcBasedCtls / Msr_kIa32VmxTrueProcBasedCtls (0x482 / 0x48E)
typedef union _Vmx_ProcessorBased_Controls {
    uint32_t all;
    struct {
        uint32_t reserved1 : 2;                   //!< [0:1]
        uint32_t interrupt_window_exiting : 1;    //!< [2]
        uint32_t use_tsc_offseting : 1;           //!< [3]
        uint32_t reserved2 : 3;                   //!< [4:6]
        uint32_t hlt_exiting : 1;                 //!< [7]
        uint32_t reserved3 : 1;                   //!< [8]
        uint32_t invlpg_exiting : 1;              //!< [9]
        uint32_t mwait_exiting : 1;               //!< [10]
        uint32_t rdpmc_exiting : 1;               //!< [11]
        uint32_t rdtsc_exiting : 1;               //!< [12]
        uint32_t reserved4 : 2;                   //!< [13:14]
        uint32_t cr3_load_exiting : 1;            //!< [15]
        uint32_t cr3_store_exiting : 1;           //!< [16]
        uint32_t reserved5 : 2;                   //!< [17:18]
        uint32_t cr8_load_exiting : 1;            //!< [19]
        uint32_t cr8_store_exiting : 1;           //!< [20]
        uint32_t use_tpr_shadow : 1;              //!< [21]
        uint32_t nmi_window_exiting : 1;          //!< [22]
        uint32_t mov_dr_exiting : 1;              //!< [23]
        uint32_t unconditional_io_exiting : 1;    //!< [24]
        uint32_t use_io_bitmaps : 1;              //!< [25]
        uint32_t reserved6 : 1;                   //!< [26]
        uint32_t monitor_trap_flag : 1;           //!< [27]
        uint32_t use_msr_bitmaps : 1;             //!< [28]
        uint32_t monitor_exiting : 1;             //!< [29]
        uint32_t pause_exiting : 1;               //!< [30]
        uint32_t activate_secondary_control : 1;  //!< [31]
    } fields;
} __attribute__((packed)) Vmx_ProcessorBased_Controls, *PVmx_ProcessorBased_Controls;

/// MSR Msr_kIa32VmxProcBasedCtls2 (0x48B)
typedef union _Vmx_SecondaryProcessorBased_Controls {
    uint32_t all;
    struct {
        uint32_t virtualize_apic_accesses : 1;            //!< [0]
        uint32_t enable_ept : 1;                          //!< [1]
        uint32_t descriptor_table_exiting : 1;            //!< [2]
        uint32_t enable_rdtscp : 1;                       //!< [3]
        uint32_t virtualize_x2apic_mode : 1;              //!< [4]
        uint32_t enable_vpid : 1;                         //!< [5]
        uint32_t wbinvd_exiting : 1;                      //!< [6]
        uint32_t unrestricted_guest : 1;                  //!< [7]
        uint32_t apic_register_virtualization : 1;        //!< [8]
        uint32_t virtual_interrupt_delivery : 1;          //!< [9]
        uint32_t pause_loop_exiting : 1;                  //!< [10]
        uint32_t rdrand_exiting : 1;                      //!< [11]
        uint32_t enable_invpcid : 1;                      //!< [12]
        uint32_t enable_vm_functions : 1;                 //!< [13]
        uint32_t vmcs_shadowing : 1;                      //!< [14]
        uint32_t reserved1 : 1;                           //!< [15]
        uint32_t rdseed_exiting : 1;                      //!< [16]
        uint32_t reserved2 : 1;                           //!< [17]
        uint32_t ept_violation_ve : 1;                    //!< [18]
        uint32_t reserved3 : 1;                           //!< [19]
        uint32_t enable_xsaves_xstors : 1;                //!< [20]
        uint32_t reserved4 : 1;                           //!< [21]
        uint32_t mode_based_execute_control_for_ept : 1;  //!< [22]
        uint32_t reserved5 : 2;                           //!< [23:24]
        uint32_t use_tsc_scaling : 1;                     //!< [25]
    } fields;
} __attribute__((packed)) Vmx_SecondaryProcessorBased_Controls, *PVmx_SecondaryProcessorBased_Controls;
```
```c
// whisper_linux.c
...
Vmx_ProcessorBased_Controls StructVMProcessorControls = {0};
StructVMProcessorControls.fields.activate_secondary_control = 1;
Asm_vmxWrite(VmcsField_kCpuBasedVmExecControl,
             WhisperAdjustControlValue((l_bool_useTrueMsr) ? Msr_kIa32VmxTrueProcBasedCtls
                                                           : Msr_kIa32VmxProcBasedCtls,
                                       StructVMProcessorControls.all));
...
Vmx_SecondaryProcessorBased_Controls StructSecondaryProcessorBasedControl = {0};
StructSecondaryProcessorBasedControl.fields.enable_rdtscp = 1;
StructSecondaryProcessorBasedControl.fields.enable_invpcid = 1;
StructSecondaryProcessorBasedControl.fields.enable_xsaves_xstors = 1;
Asm_vmxWrite(VmcsField_kSecondaryVmExecControl,
             WhisperAdjustControlValue(Msr_kIa32VmxProcBasedCtls2,
                                       StructSecondaryProcessorBasedControl.all));
```
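WhisperAdjustControlValue comes from the earlier parts of the series. For readers joining here, a typical adjust helper looks roughly like the sketch below (an assumption about its shape, not necessarily the project's exact code): the capability MSR's low 32 bits are the allowed-0 ("must be 1") settings, the high 32 bits the allowed-1 ("may be 1") settings, and the requested value is clamped against both.

```c
#include <linux/types.h>
#include <asm/msr.h>

/* Sketch: clamp a requested VMX execution-control value against its capability MSR. */
static u32 AdjustControlValueSketch(u32 capability_msr, u32 requested)
{
    u64 cap = __rdmsr(capability_msr);
    u32 allowed0 = (u32)cap;          /* low 32 bits: bits that must be 1 */
    u32 allowed1 = (u32)(cap >> 32);  /* high 32 bits: bits that may be 1 */

    requested &= allowed1;            /* drop bits the CPU refuses       */
    requested |= allowed0;            /* force bits the CPU requires     */
    return requested;
}
```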
After recompiling, the error is gone.
Refactoring into Functions
Before going further, the code in whisper_init needs some tidying: pull the per-CPU start and stop actions out into separate functions.
```c
// whisper_linux.c
...
bool StartWhisper(void)
{
    unsigned int eax = 0x1, ebx = 0, ecx = 0, edx = 0;
    __cpuid(&eax, &ebx, &ecx, &edx);
    unsigned long long val = __rdmsr(Msr_kIa32FeatureControl);
    if (isSupported) {
        val |= 1;
        __wrmsr(Msr_kIa32FeatureControl, val, val >> 32);
        InitCrX();
        unsigned int vendorID[4] = {};
        eax = 0;
        vendorID[3] = 0;
        __cpuid(&eax, &vendorID[0], &vendorID[2], &vendorID[1]);
        printk(KERN_INFO "VendorID[before]: %s\n", vendorID);
        isSupported = Asm_init((void *)InitializeVM);
        if (isSupported) {
            printk(KERN_INFO "Asm_init run success\n");
            eax = 0;
            __cpuid(&eax, &vendorID[0], &vendorID[2], &vendorID[1]);
            printk(KERN_INFO "VendorID[after]: %s\n", vendorID);
        } else {
            printk(KERN_INFO "Asm_init run failed\n");
        }
    }
    if (!isSupported) {
        printk(KERN_INFO "%s\n", "unsupported");
    }
    return isSupported;
}

void StopWhisper(void)
{
    // trigger exit.
    unsigned int eax = 1;
    unsigned int vendorID[4] = {};
    __cpuid(&eax, &vendorID[0], &vendorID[2], &vendorID[1]);
}

int whisper_init(void)
{
    // turn on run
    StartWhisper();
    // Sleep
    ssleep(5);
    // stop run
    StopWhisper();
    return 0;
}
```
Persistence: Second Attempt
So far the feature only persists on a single CPU core (and only for a few seconds at that), so this section extends it to every core. To walk over all online CPUs and run the code on each of them, the for_each_online_cpu macro and the work_on_cpu function are used.
```c
// whisper_linux.c
int whisper_init(void)
{
    ...
    // turn on run
    int i;
    bool bReturn = true;
    for_each_online_cpu(i) {
        if (work_on_cpu(i, (void *)StartWhisper, 0)) {
        } else {
            bReturn = false;
            break;
        }
    }
    // Sleep
    ssleep(5);
    // stop run
    for_each_online_cpu(i) {
        if (work_on_cpu(i, (void *)StopWhisper, 0)) {
        } else {
            bReturn = false;
            break;
        }
    }
    ...
}
```
After running it, an error shows up in the log.
Running it several times and watching the errors, only one CPU ever reports the problem, which suggests the other CPUs have already exited the VM. So add a log line to the exit handler:
```c
// whisper_handler.c
bool Handler(Guest_Context *guest_context)
{
    ...
    printk(KERN_INFO "reason:0x%llx stop\n", exitReason);
    return false;
}
```
It turns out the VM exits mid-run because MSR write/read events are not handled, so those events need to be taken over. Before doing that, the remaining cpuid leaves also need handling, and the leaf that triggers shutdown is changed to the multi-character constant 'whis': cpuid is called very frequently at runtime, so a private leaf avoids colliding with normal functionality.
```c
// whisper_handler.c
...
bool Handler(Guest_Context *guest_context)
{
    uint64_t exitReason = Asm_VmRead(VmcsField_kVmExitReason);
    if (exitReason == VmxExitReason_kCpuid) {
        ...   // leaf 0: the VendorID handling from the previous part
        } else if (guest_context->stack->gp_regs.ax != 'whis') {
            // pass every other leaf straight through to the real cpuid
            __cpuid((unsigned int *)&guest_context->stack->gp_regs.ax,
                    (unsigned int *)&guest_context->stack->gp_regs.bx,
                    (unsigned int *)&guest_context->stack->gp_regs.cx,
                    (unsigned int *)&guest_context->stack->gp_regs.dx);
            guest_context->stack->result_reg.rip =
                guest_context->stack->result_reg.rip + Asm_VmRead(VmcsField_kVmExitInstructionLen);
            return true;
        } else {
            printk(KERN_INFO "cpuid stop 0x%llx\n", guest_context->stack->gp_regs.ax);
            return false;
        }
    }
    printk(KERN_INFO "reason:0x%llx stop\n", exitReason);
    return false;
}
```
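One consequence of switching the shutdown leaf to 'whis' (presumably; the article does not show this step explicitly) is that the StopWhisper above can no longer trigger the exit with leaf 1 and needs the matching change, along these lines:

```c
// whisper_linux.c -- assumed follow-up change, not shown in the original text
void StopWhisper(void)
{
    // trigger exit with the private leaf instead of leaf 1.
    unsigned int eax = 'whis';
    unsigned int regs[4] = {};
    __cpuid(&eax, &regs[0], &regs[2], &regs[1]);
}
```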
For MSR write/read events, a few special MSRs need special handling, e.g. Msr_kIa32SysenterCs, Msr_kIa32SysenterEsp and friends: these are mirrored by guest-state fields in the VMCS, so reads and writes have to be redirected to those fields. Every other MSR is simply written to or read from the real register.
```c
// whisper_handler.c
...
} else if (exitReason == VmxExitReason_kMsrWrite) {
    bool transfer_to_vmcs = false;
    bool l_bool_tryRun = false;
    uint64_t vmcs_field = 0;
    switch (guest_context->stack->gp_regs.cx) {
    case Msr_kIa32SysenterCs:
        vmcs_field = VmcsField_kGuestSysenterCs;
        transfer_to_vmcs = true; l_bool_tryRun = true;
        break;
    case Msr_kIa32SysenterEsp:
        vmcs_field = VmcsField_kGuestSysenterEsp;
        transfer_to_vmcs = true; l_bool_tryRun = true;
        break;
    case Msr_kIa32SysenterEip:
        vmcs_field = VmcsField_kGuestSysenterEip;
        transfer_to_vmcs = true; l_bool_tryRun = true;
        break;
    case Msr_kIa32Debugctl:
        vmcs_field = VmcsField_kGuestIa32Debugctl;
        transfer_to_vmcs = true; l_bool_tryRun = true;
        break;
    case Msr_kIa32GsBase:
        vmcs_field = VmcsField_kGuestGsBase;
        transfer_to_vmcs = true; l_bool_tryRun = true;
        break;
    case Msr_kIa32FsBase:
        vmcs_field = VmcsField_kGuestFsBase;
        transfer_to_vmcs = true; l_bool_tryRun = true;
        break;
    default:
        break;
    }
    uint64_t val = (guest_context->stack->gp_regs.ax & 0xFFFFFFFF) |
                   ((guest_context->stack->gp_regs.dx & 0xFFFFFFFF) << 32);
    if (transfer_to_vmcs) {
        // MSRs mirrored by guest-state fields go into the VMCS.
        uint64_t writeresult = Asm_vmxWrite(vmcs_field, val);
    } else {
        // everything else is written straight to the real MSR.
        __wrmsr(guest_context->stack->gp_regs.cx,
                guest_context->stack->gp_regs.ax,
                guest_context->stack->gp_regs.dx);
        l_bool_tryRun = true;
    }
    // continue if write successfully!
    if (l_bool_tryRun) {
        guest_context->stack->result_reg.rip =
            guest_context->stack->result_reg.rip + Asm_VmRead(VmcsField_kVmExitInstructionLen);
        return true;
    }
} else if (exitReason == VmxExitReason_kMsrRead) {
    bool transfer_to_vmcs = false;
    bool l_bool_tryRun = false;
    uint64_t vmcs_field = 0;
    switch (guest_context->stack->gp_regs.cx) {
    case Msr_kIa32SysenterCs:
        vmcs_field = VmcsField_kGuestSysenterCs;
        transfer_to_vmcs = true; l_bool_tryRun = true;
        break;
    case Msr_kIa32SysenterEsp:
        vmcs_field = VmcsField_kGuestSysenterEsp;
        transfer_to_vmcs = true; l_bool_tryRun = true;
        break;
    case Msr_kIa32SysenterEip:
        vmcs_field = VmcsField_kGuestSysenterEip;
        transfer_to_vmcs = true; l_bool_tryRun = true;
        break;
    case Msr_kIa32Debugctl:
        vmcs_field = VmcsField_kGuestIa32Debugctl;
        transfer_to_vmcs = true; l_bool_tryRun = true;
        break;
    case Msr_kIa32GsBase:
        vmcs_field = VmcsField_kGuestGsBase;
        transfer_to_vmcs = true; l_bool_tryRun = true;
        break;
    case Msr_kIa32FsBase:
        vmcs_field = VmcsField_kGuestFsBase;
        transfer_to_vmcs = true; l_bool_tryRun = true;
        break;
    default:
        break;
    }
    uint64_t val = (guest_context->stack->gp_regs.ax & 0xFFFFFFFF) |
                   ((guest_context->stack->gp_regs.dx & 0xFFFFFFFF) << 32);
    if (transfer_to_vmcs) {
        uint64_t readresult = Asm_VmRead(vmcs_field);
        guest_context->stack->gp_regs.ax = readresult & 0xFFFFFFFF;
        readresult >>= 32;
        guest_context->stack->gp_regs.dx = readresult & 0xFFFFFFFF;
    } else {
        uint64_t readresult = __rdmsr(guest_context->stack->gp_regs.cx);
        guest_context->stack->gp_regs.ax = readresult & 0xFFFFFFFF;
        readresult >>= 32;
        guest_context->stack->gp_regs.dx = readresult & 0xFFFFFFFF;
        l_bool_tryRun = true;
    }
    // continue if read successfully!
    if (l_bool_tryRun) {
        guest_context->stack->result_reg.rip =
            guest_context->stack->result_reg.rip + Asm_VmRead(VmcsField_kVmExitInstructionLen);
        return true;
    }
}
...
```
After re-running make and loading the driver again, dmesg no longer shows the previous error; instead it now reports a kernel paging error.
Analyzing the Workqueue Scheduling Code
The driver uses work_on_cpu to run code on each CPU, so to track down the error above the scheduling source needs a closer look. work_on_cpu is a wrapper around work_on_cpu_key:
```c
long work_on_cpu_key(int cpu, long (*fn)(void *), void *arg,
                     struct lock_class_key *key);

/*
 * A new key is defined for each caller to make sure the work
 * associated with the function doesn't share its locking class.
 */
#define work_on_cpu(_cpu, _fn, _arg)                    \
({                                                      \
    static struct lock_class_key __key;                 \
    work_on_cpu_key(_cpu, _fn, _arg, &__key);           \
})
```
```c
/**
 * work_on_cpu_key - run a function in thread context on a particular cpu
 * @cpu: the cpu to run on
 * @fn: the function to run
 * @arg: the function arg
 * @key: The lock class key for lock debugging purposes
 *
 * It is up to the caller to ensure that the cpu doesn't go offline.
 * The caller must not hold any locks which would prevent @fn from completing.
 *
 * Return: The value @fn returns.
 */
long work_on_cpu_key(int cpu, long (*fn)(void *),
                     void *arg, struct lock_class_key *key)
{
    struct work_for_cpu wfc = { .fn = fn, .arg = arg };

    INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);
    schedule_work_on(cpu, &wfc.work);
    flush_work(&wfc.work);
    destroy_work_on_stack(&wfc.work);
    return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu_key);
```
Inside work_on_cpu_key, a work item is initialized first and then scheduled via schedule_work_on.
```c
/**
 * schedule_work_on - put work task on a specific cpu
 * @cpu: cpu to put the work task on
 * @work: job to be done
 *
 * This puts a job on a specific cpu
 */
static inline bool schedule_work_on(int cpu, struct work_struct *work)
{
    return queue_work_on(cpu, system_wq, work);
}
```
```c
/**
 * queue_work_on - queue work on specific cpu
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @work: work to queue
 *
 * We queue the work to a specific CPU, the caller must ensure it
 * can't go away. Callers that fail to ensure that the specified
 * CPU cannot go away will execute on a randomly chosen CPU.
 * But note well that callers specifying a CPU that never has been
 * online will get a splat.
 *
 * Return: %false if @work was already on a queue, %true otherwise.
 */
bool queue_work_on(int cpu, struct workqueue_struct *wq,
                   struct work_struct *work)
{
    bool ret = false;
    unsigned long irq_flags;

    local_irq_save(irq_flags);

    if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) &&
        !clear_pending_if_disabled(work)) {
        __queue_work(cpu, wq, work);
        ret = true;
    }

    local_irq_restore(irq_flags);
    return ret;
}
EXPORT_SYMBOL(queue_work_on);
```
```c
static void __queue_work(int cpu, struct workqueue_struct *wq,
                         struct work_struct *work)
{
    struct pool_workqueue *pwq;
    struct worker_pool *last_pool, *pool;
    unsigned int work_flags;
    unsigned int req_cpu = cpu;

    /*
     * While a work item is PENDING && off queue, a task trying to
     * steal the PENDING will busy-loop waiting for it to either get
     * queued or lose PENDING. Grabbing PENDING and queueing should
     * happen with IRQ disabled.
     */
    lockdep_assert_irqs_disabled();

    /*
     * For a draining wq, only works from the same workqueue are
     * allowed. The __WQ_DESTROYING helps to spot the issue that
     * queues a new work item to a wq after destroy_workqueue(wq).
     */
    if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
                 WARN_ON_ONCE(!is_chained_work(wq))))
        return;
    rcu_read_lock();
retry:
    /* pwq which will be used unless @work is executing elsewhere */
    if (req_cpu == WORK_CPU_UNBOUND) {
        if (wq->flags & WQ_UNBOUND)
            cpu = wq_select_unbound_cpu(raw_smp_processor_id());
        else
            cpu = raw_smp_processor_id();
    }

    pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu));
    pool = pwq->pool;

    /*
     * If @work was previously on a different pool, it might still be
     * running there, in which case the work needs to be queued on that
     * pool to guarantee non-reentrancy.
     *
     * For ordered workqueue, work items must be queued on the newest pwq
     * for accurate order management. Guaranteed order also guarantees
     * non-reentrancy. See the comments above unplug_oldest_pwq().
     */
    last_pool = get_work_pool(work);
    if (last_pool && last_pool != pool && !(wq->flags & __WQ_ORDERED)) {
        struct worker *worker;

        raw_spin_lock(&last_pool->lock);

        worker = find_worker_executing_work(last_pool, work);

        if (worker && worker->current_pwq->wq == wq) {
            pwq = worker->current_pwq;
            pool = pwq->pool;
            WARN_ON_ONCE(pool != last_pool);
        } else {
            /* meh... not running there, queue here */
            raw_spin_unlock(&last_pool->lock);
            raw_spin_lock(&pool->lock);
        }
    } else {
        raw_spin_lock(&pool->lock);
    }

    /*
     * pwq is determined and locked. For unbound pools, we could have raced
     * with pwq release and it could already be dead. If its refcnt is zero,
     * repeat pwq selection. Note that unbound pwqs never die without
     * another pwq replacing it in cpu_pwq or while work items are executing
     * on it, so the retrying is guaranteed to make forward-progress.
     */
    if (unlikely(!pwq->refcnt)) {
        if (wq->flags & WQ_UNBOUND) {
            raw_spin_unlock(&pool->lock);
            cpu_relax();
            goto retry;
        }
        /* oops */
        WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
                  wq->name, cpu);
    }

    /* pwq determined, queue */
    trace_workqueue_queue_work(req_cpu, pwq, work);

    if (WARN_ON(!list_empty(&work->entry)))
        goto out;

    pwq->nr_in_flight[pwq->work_color]++;
    work_flags = work_color_to_flags(pwq->work_color);

    /*
     * Limit the number of concurrently active work items to max_active.
     * @work must also queue behind existing inactive work items to maintain
     * ordering when max_active changes. See wq_adjust_max_active().
     */
    if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {
        if (list_empty(&pool->worklist))
            pool->watchdog_ts = jiffies;

        trace_workqueue_activate_work(work);
        insert_work(pwq, work, &pool->worklist, work_flags);
        kick_pool(pool);
    } else {
        work_flags |= WORK_STRUCT_INACTIVE;
        insert_work(pwq, work, &pwq->inactive_works, work_flags);
    }

out:
    raw_spin_unlock(&pool->lock);
    rcu_read_unlock();
}
```
```c
/**
 * insert_work - insert a work into a pool
 * @pwq: pwq @work belongs to
 * @work: work to insert
 * @head: insertion point
 * @extra_flags: extra WORK_STRUCT_* flags to set
 *
 * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
 * work_struct flags.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
                        struct list_head *head, unsigned int extra_flags)
{
    debug_work_activate(work);

    /* record the work call stack in order to print it in KASAN reports */
    kasan_record_aux_stack_noalloc(work);

    /* we own @work, set data and link */
    set_work_pwq(work, pwq, extra_flags);
    list_add_tail(&work->entry, head);
    get_pwq(pwq);
}
```
schedule_work_on calls queue_work_on, which calls __queue_work, which finally reaches insert_work: the work item is set up and appended via list_add_tail to the end of the target CPU's worklist.
The execution path is worker_thread -> process_scheduled_works -> process_one_work (the source is not reproduced here; look it up if interested), and worker_thread is the main function of a worker thread created by create_worker. In other words, work_on_cpu does not migrate the current thread onto the target CPU; it queues the work item on that CPU's queue and waits for the corresponding worker_thread to execute it, with flush_work simply blocking until the work item completes.
From this it follows that the kernel error is caused by a wrong CR3: because the per-CPU VM host is initialized through work_on_cpu, whenever the VM exits into the host, the CR3 in use belongs to that worker thread. So when the VM is shut down, the guest's CR3 has to be restored (in theory all the CRx registers should be restored, but nobody normally changes the others, so they are ignored here).
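A quick way to convince yourself of this (a debugging aside, not part of the driver) is to print the executing task and CR3 from inside StartWhisper; when called through work_on_cpu it reports a kworker task rather than the insmod process:

```c
#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <asm/special_insns.h>

/* Debug sketch: show which task context the per-CPU start code actually runs in. */
static void DumpContextSketch(void)
{
    printk(KERN_INFO "[whisper] cpu %d, task '%s', cr3 0x%lx\n",
           smp_processor_id(), current->comm, __read_cr3());
}
```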
Fixing the Return Path
First, add a 64-bit cr3 field to the return stack structure:
```c
// processdata.h
...
typedef struct _Whisper_Result_Registers {
    uint64_t rip;
    uint64_t cs;
    uint64_t eflags;
    uint64_t rsp;
    uint64_t ss;
    uint64_t cr3;
} __attribute__((packed)) Whisper_Result_Registers, *PWhisper_Result_Registers;
...
```
Then enlarge the space reserved on the stack in the assembly code and restore the guest's CR3 on the way out:
```asm
// asm_function_x64.S
...
Asm_VmmEntryPoint:
    subq $0x38, %rsp        # "movaps needs 16 bytes aligned" --huoji (key08.com)
    ...
call_vmOff:
    xchgq 0x28(%rsp), %rax
    movq %rax, %cr3
    movq 0x28(%rsp), %rax
    iretq
    ...
```
Running it again, the driver no longer errors out, but vmtoolsd now hits a segmentation fault.
Fixing the vmtoolsd Error
The log shows that the VM has exited, and the exit reason is 0x12, so that event needs to be handled. Looking it up in the table, 0x12 is VMCALL. For now the instruction is treated as unavailable: set CF to 1 and ZF to 0 in the guest's EFLAGS and return, signalling that the vmcall failed.
```c
// whisper_handler.c
...
bool Handler(Guest_Context *guest_context)
{
    ...
    } else if (exitReason == VmxExitReason_kVmcall) {
        guest_context->flag_reg.fields.cf = 1;
        guest_context->flag_reg.fields.zf = 0;
        return true;
    }
    ...
}

bool WhisperExitHandler(Whisper_Call_Stack *stack)
{
    ...
    if (!guest_context.vm_continue) {
    } else {
        Asm_vmxWrite(VmcsField_kGuestRip, stack->result_reg.rip);
        Asm_vmxWrite(VmcsField_kGuestRflags, guest_context.flag_reg.all);
        guest_context.stack->result_reg.rip =
            guest_context.stack->result_reg.rip + Asm_VmRead(VmcsField_kVmExitInstructionLen);
    }
    ...
}
```
After running again, vmtoolsd no longer complains.
To keep the VM alive a little longer, change the ssleep in whisper_linux.c to 20 so it exits after 20 seconds. Compile and run: the system keeps running normally, keyboard input is fine... wait a moment, why has the mouse stopped moving?
Fixing the Mouse
This problem (presumably) only affects Linux running inside a VMware VM. To solve it, look at the Linux source in drivers/input/mouse/vmmouse.c: Linux uses the vmcall instruction to drive the accelerated mouse inside the VM, so vmcall cannot simply be banned outright as hoped; a few special commands have to be forwarded.
```c
#define VMWARE_CMD_GETVERSION          10
#define VMWARE_CMD_GETHZ               45
#define VMWARE_CMD_GETVCPU_INFO        68
#define VMWARE_CMD_STEALCLOCK          91
#define VMWARE_CMD_ABSPOINTER_DATA     39
#define VMWARE_CMD_ABSPOINTER_STATUS   40
#define VMWARE_CMD_ABSPOINTER_COMMAND  41
#define VMWARE_CMD_ABSPOINTER_RESTRICT 86
```
From the source it can be seen that VMware's vmcall passes its parameters in rax, rbx, rcx, rdx, rsi, rdi, so an assembly helper is needed to load the arguments, issue the call, and hand the results back:
```asm
// asm_function_x64.S
...
Asm_VmCallForVmware:
    Asm_pushad
    movq 9*8(%rdi), %rsi
    movq 13*8(%rdi), %rdx
    movq 14*8(%rdi), %rcx
    movq 12*8(%rdi), %rbx
    movq 15*8(%rdi), %rax
    movq 8*8(%rdi), %rdi
    vmcall
    movq 8*8(%rsp), %r8
    movq %rsi, 9*8(%r8)
    movq %rdx, 13*8(%r8)
    movq %rcx, 14*8(%r8)
    movq %rbx, 12*8(%r8)
    movq %rax, 15*8(%r8)
    movq %rdi, 8*8(%r8)
    Asm_popad
    ret
...
```
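The 8*8 through 15*8 offsets above index into the saved general-purpose register frame. Assuming a HyperPlatform-style pushaq layout (a guess for illustration only; check the project's actual gp_regs definition), the slots would line up like this:

```c
/* Hypothetical layout consistent with the offsets used in Asm_VmCallForVmware. */
typedef struct _Gp_Registers_Sketch {
    uint64_t r15, r14, r13, r12, r11, r10, r9, r8;  /* offsets 0*8 .. 7*8 */
    uint64_t di;                                    /* 8*8  */
    uint64_t si;                                    /* 9*8  */
    uint64_t bp;                                    /* 10*8 */
    uint64_t sp;                                    /* 11*8 */
    uint64_t bx;                                    /* 12*8 */
    uint64_t dx;                                    /* 13*8 */
    uint64_t cx;                                    /* 14*8 */
    uint64_t ax;                                    /* 15*8 */
} Gp_Registers_Sketch;
```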
```c
// asm_function_x64.h
...
void Asm_VmCallForVmware(void *regstack);
...
```
```c
// whisper_handler.c
...
} else if (exitReason == VmxExitReason_kVmcall) {
    if (guest_context->stack->gp_regs.cx == VMWARE_CMD_ABSPOINTER_DATA ||
        guest_context->stack->gp_regs.cx == VMWARE_CMD_ABSPOINTER_STATUS ||
        guest_context->stack->gp_regs.cx == VMWARE_CMD_ABSPOINTER_COMMAND ||
        guest_context->stack->gp_regs.cx == VMWARE_CMD_ABSPOINTER_RESTRICT ||
        guest_context->stack->gp_regs.cx == VMWARE_CMD_STEALCLOCK ||
        guest_context->stack->gp_regs.cx == VMWARE_CMD_GETVCPU_INFO ||
        guest_context->stack->gp_regs.cx == VMWARE_CMD_GETHZ ||
        guest_context->stack->gp_regs.cx == VMWARE_CMD_GETVERSION) {
        Asm_VmCallForVmware(&(guest_context->stack->gp_regs));
    } else {
        guest_context->flag_reg.fields.cf = 1;
        guest_context->flag_reg.fields.zf = 0;
    }
    guest_context->stack->result_reg.rip =
        guest_context->stack->result_reg.rip + Asm_VmRead(VmcsField_kVmExitInstructionLen);
    return true;
}
...
```
Compile and run again: the mouse moves normally now. Run any command, dmesg for example, and yet another error appears. What is going on this time?
Fixing the dmesg Error
In fact, once the driver is running, every newly launched ring-3 process has this problem, because practically every process links against glibc. In glibc's source, sysdeps/x86/cpu-features.c validates the CPU's VendorID during process initialization; only GenuineIntel, AuthenticAMD, HygonGenuine, CentaurHauls and "  Shanghai  " are accepted (the surrounding spaces in the last one must not be dropped). Any other VendorID triggers the CPU ISA level error.
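For reference, cpuid leaf 0 returns the 12-byte vendor string packed into EBX, EDX, ECX in that byte order. A small stand-alone snippet shows where the constants used in the fix below come from:

```c
#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
    /* EBX holds bytes 0-3, EDX bytes 4-7, ECX bytes 8-11 of the vendor string. */
    const char *vendor = "AuthenticAMD";
    uint32_t ebx, edx, ecx;
    memcpy(&ebx, vendor + 0, 4);
    memcpy(&edx, vendor + 4, 4);
    memcpy(&ecx, vendor + 8, 4);
    printf("ebx=0x%08x edx=0x%08x ecx=0x%08x\n", ebx, edx, ecx);
    /* On little-endian x86 this prints ebx=0x68747541 edx=0x69746e65 ecx=0x444d4163. */
    return 0;
}
```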
Once the cause is known, the fix is easy: change the spoofed cpuid VendorID from FakeIntel to AuthenticAMD.
```c
// whisper_handler.c
...
bool Handler(Guest_Context *guest_context)
{
    uint64_t exitReason = Asm_VmRead(VmcsField_kVmExitReason);
    if (exitReason == VmxExitReason_kCpuid) {
        if (guest_context->stack->gp_regs.ax == 0) {
            guest_context->stack->gp_regs.bx = 0x68747541;   // "Auth"
            guest_context->stack->gp_regs.dx = 0x69746e65;   // "enti"
            guest_context->stack->gp_regs.cx = 0x444d4163;   // "cAMD"
            ...
        }
    }
}
...
```
Compile and load again, then run dmesg: the command now works, the mouse still moves, and the VendorID has been changed to AuthenticAMD.
Persistence: Third Attempt
Everything is in place now except the last piece. First, restructure the code so that the VM is started when the driver is loaded and exited when the driver is unloaded.
```c
// whisper_linux.c
...
int whisper_init(void)
{
    // turn on run
    int i;
    bool bReturn = true;
    for_each_online_cpu(i) {
        if (work_on_cpu(i, (void *)StartWhisper, 0)) {
        } else {
            bReturn = false;
            break;
        }
    }
    printk(KERN_INFO "%s\n", "[whisper]start");
    return 0;
}

void whisper_exit(void)
{
    int i;
    for_each_online_cpu(i) {
        if (work_on_cpu(i, (void *)StopWhisper, 0)) {
        } else {
            break;
        }
        printk(KERN_INFO "Stop %d\n", i);
    }
    printk(KERN_INFO "%s\n", "[whisper]bye");
}
...
```
After rebuilding, the virtual machine crashes a few seconds after startup. How does this happen, and how can it be solved? In fact the seed of this problem was planted back in the workqueue analysis section: when the host VM is set up, the code uses the current thread's CR3 (in Linux every task_struct, i.e. every thread, has its own CR3), and the corresponding worker_thread may get destroyed, taking that CR3 with it. So the driver has to build its own CR3, following the kernel's own page-table creation code.
```c
// arch/um/kernel/mem.c
...
pgd_t *pgd_alloc(struct mm_struct *mm)
{
    pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);

    if (pgd) {
        memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
        memcpy(pgd + USER_PTRS_PER_PGD,
               swapper_pg_dir + USER_PTRS_PER_PGD,
               (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
    }
    return pgd;
}
...
```
As the code shows, after allocating a page the kernel copies the kernel-address half of swapper_pg_dir into it; in other words, the kernel-address half of every kernel CR3 is identical. The driver cannot reach swapper_pg_dir directly, but it can copy the current thread's page tables wholesale, which works as long as user-space addresses are never touched. (Zeroing the user half would also work, but user space still must not be accessed, and USER_PTRS_PER_PGD varies with the address layout, making it awkward to tell where user space ends, so the user half is simply kept; just never touch it.)
```c
// processdata.h
...
typedef union {
    uint64_t all;
    struct {
        uint64_t reserve : 3;
        uint64_t writethrough : 1;
        uint64_t cachedisable : 1;
        uint64_t reserve2 : 7;
        uint64_t pfn : 36;
        uint64_t reserve3 : 16;
    };
} _cr3;
```
```c
// whisper_linux.c
...
_cr3 g_cr3_newInstance = {0};
pgd_t *g_pgdt_instance = 0;

uint64_t buildCr3(void)
{
    if (g_cr3_newInstance.all) {
        return g_cr3_newInstance.all;
    } else {
        g_pgdt_instance = (pgd_t *)__get_free_page(GFP_KERNEL);
        printk(KERN_INFO "[Whisper]current 0x%llx\n", current);
        printk(KERN_INFO "[Whisper]current->mm 0x%llx\n", current->mm);
        printk(KERN_INFO "[Whisper]current->mm->pgd 0x%llx\n", current->mm->pgd);
        {
            memset(g_pgdt_instance, 0, PAGE_SIZE);
            memcpy(g_pgdt_instance, current->mm->pgd, PAGE_SIZE);
            uint64_t newcr3 = (uint64_t)virt_to_phys(g_pgdt_instance) >> PAGE_SHIFT;
            g_cr3_newInstance.all = __read_cr3();
            g_cr3_newInstance.pfn = newcr3;
        }
        printk(KERN_INFO "[Whisper]cr3NewUnion 0x%llx\n", g_cr3_newInstance.all);
        return g_cr3_newInstance.all;
    }
}

bool SetupVMCS(void *guest_rsp, void *guest_rip, void *host_rip)
{
    ...
    Asm_vmxWrite(VmcsField_kHostCr3, buildCr3());
}
...
int whisper_init(void)
{
    ...
    buildCr3();
    ...
}
...
```
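One bit of housekeeping the article does not show (an assumption on my part): the page grabbed by __get_free_page in buildCr3 is never returned. Something along these lines could be called from whisper_exit once every CPU has left VMX operation:

```c
// whisper_linux.c -- assumed cleanup helper, not part of the original text
void ReleaseCr3(void)
{
    if (g_pgdt_instance) {
        free_page((unsigned long)g_pgdt_instance);
        g_pgdt_instance = NULL;
        g_cr3_newInstance.all = 0;
    }
}
```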
After compiling and loading again, the VM now persists for long stretches of time.
Project Code
https://github.com/CrazyHarb/VmxProject
References:
https://github.com/torvalds/linux
https://github.com/tandasat/HyperPlatform
https://cloud.tencent.com/developer/article/2144036
https://www.gnu.org/software/libc
Originally published on the WeChat official account 冲鸭安全: INTEL CPU Virtualization Research & Exploration (Part 3)