Linux内核学习笔记

admin 2023年1月19日23:11:12评论10 views字数 44316阅读147分43秒阅读模式

Linux内核学习笔记

本文为看雪论坛优秀文章

看雪论坛作者ID:e*16 a


以下是基于linux0.11的代码。




内核的五大结构

Linux内核学习笔记




中断工作流程

1、ARM回忆


(1)做CPU工作模式的转化

(2)进行寄存器的拷贝与压栈

(3)设置中断向量表

(4)保存正常运行的函数返回地址

(5)跳转到对应的中断服务函数上运行

(6)进行模式的复原及寄存器的复原

(7)跳转回正常工作的函数地址继续运行


2、linux中中断的工作流程

(1)将所有寄存器值入栈

(2)将异常码入栈(中断号)

(3)将当前函数的返回地址入栈

(4)调用中断函数

(5)返回地址出栈

(6)寄存器值出栈

3、中断源码


中断前后的处理 中断的执行

硬件中断的处理过程 asm.s trap.c

软件及系统调用的处理过程 system_call.s fork.c/signal.c/exit.c/sys.c

① asm.s代码及trap.c分析 (OPENING)

② system_call.s代码及fork.c/signal.c/exit.c/sys.c分析


(1) fork.c

在system_call.s中有fork系统调用的入口_sys_fork:先call _find_empty_process寻找空闲的任务槽,然后call _copy_process复制进程。

# _sys_fork: entry point of the fork system call (from system_call.s).
# find_empty_process() returns a free task slot number in %eax, or a
# negative error code; on error we return it straight to the caller.
# Otherwise five longs (gs, esi, edi, ebp, eax) are pushed; they become the
# leading arguments (nr, ebp, edi, esi, gs) of copy_process().
.align 2
_sys_fork:
    call _find_empty_process
    testl %eax,%eax
    js 1f                   # negative result: no free slot, return the error
    push %gs
    pushl %esi
    pushl %edi
    pushl %ebp
    pushl %eax              # nr = slot returned by find_empty_process
    call _copy_process
    addl $20,%esp           # drop the five arguments pushed above
1:  ret


#include <errno.h>#include <linux/sched.h>#include <linux/kernel.h>#include <asm/segment.h>#include <asm/system.h> extern void write_verify(unsigned long address); long last_pid=0; void verify_area(void * addr,int size){    unsigned long start;     start = (unsigned long) addr;    size += start & 0xfff;    start &= 0xfffff000;    start += get_base(current->ldt[2]);    while (size>0) {        size -= 4096;        write_verify(start);        start += 4096;    }} int copy_mem(int nr,struct task_struct * p){    unsigned long old_data_base,new_data_base,data_limit;    unsigned long old_code_base,new_code_base,code_limit;     code_limit=get_limit(0x0f);    data_limit=get_limit(0x17);    old_code_base = get_base(current->ldt[1]);    old_data_base = get_base(current->ldt[2]);    if (old_data_base != old_code_base)        panic("We don't support separate I&D");    if (data_limit < code_limit)        panic("Bad data_limit");    new_data_base = new_code_base = nr * 0x4000000;    p->start_code = new_code_base;    set_base(p->ldt[1],new_code_base);    set_base(p->ldt[2],new_data_base);    if (copy_page_tables(old_data_base,new_data_base,data_limit)) {        free_page_tables(new_data_base,data_limit);        return -ENOMEM;    }    return 0;} /* *  Ok, this is the main fork-routine. It copies the system process * information (task[nr]) and sets up the necessary registers. It * also copies the data segment in it's entirety. */int copy_process(int nr,long ebp,long edi,long esi,long gs,long none,        long ebx,long ecx,long edx,        long fs,long es,long ds,        long eip,long cs,long eflags,long esp,long ss){    struct task_struct *p;   //创建子进程的task_struct结构体    int i;    struct file *f;     p = (struct task_struct *) get_free_page();    if (!p)        return -EAGAIN;    task[nr] = p;   //将子进程存到task链表中    *p = *current;    /* NOTE! 
this doesn't copy the supervisor stack */    //下面开始设置结构体内容    p->state = TASK_UNINTERRUPTIBLE;    p->pid = last_pid;    p->father = current->pid;    p->counter = p->priority;    p->signal = 0;    p->alarm = 0;    p->leader = 0;        /* process leadership doesn't inherit */    p->utime = p->stime = 0;    p->cutime = p->cstime = 0;    p->start_time = jiffies;    p->tss.back_link = 0;    p->tss.esp0 = PAGE_SIZE + (long) p;    p->tss.ss0 = 0x10;    p->tss.eip = eip;    p->tss.eflags = eflags;    p->tss.eax = 0;    p->tss.ecx = ecx;    p->tss.edx = edx;    p->tss.ebx = ebx;    p->tss.esp = esp;    p->tss.ebp = ebp;    p->tss.esi = esi;    p->tss.edi = edi;    p->tss.es = es & 0xffff;    p->tss.cs = cs & 0xffff;    p->tss.ss = ss & 0xffff;    p->tss.ds = ds & 0xffff;    p->tss.fs = fs & 0xffff;    p->tss.gs = gs & 0xffff;    p->tss.ldt = _LDT(nr);    p->tss.trace_bitmap = 0x80000000;    if (last_task_used_math == current)        __asm__("clts ; fnsave %0"::"m" (p->tss.i387));  //如果父进程用了协处理器,需要在tss段进行设置    if (copy_mem(nr,p)) {  //内存拷贝        task[nr] = NULL;        free_page((long) p);        return -EAGAIN;    }    for (i=0; i<NR_OPEN;i++)        if (f=p->filp[i])            f->f_count++;    if (current->pwd)        current->pwd->i_count++;    if (current->root)        current->root->i_count++;    if (current->executable)        current->executable->i_count++;    set_tss_desc(gdt+(nr<<1)+FIRST_TSS_ENTRY,&(p->tss));    set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY,&(p->ldt));    p->state = TASK_RUNNING;    /* do this last, just in case */    return last_pid; } int find_empty_process(void){    int i;     repeat:        if ((++last_pid)<0) last_pid=1;        for(i=0 ; i<NR_TASKS ; i++)            if (task[i] && task[i]->pid == last_pid) goto repeat;    for(i=1 ; i<NR_TASKS ; i++)        if (!task[i])              return i;    return -EAGAIN;}

① 在task数组中找一个空闲的进程槽位存放新进程

② 创建一个task_struct

③ 设置task_struct


(2)signal.c


这里只是进行一个简单的分析,详细分析请见第五章。

#include <linux/sched.h>#include <linux/kernel.h>#include <asm/segment.h> #include <signal.h> volatile void do_exit(int error_code); int sys_sgetmask(){    return current->blocked;} int sys_ssetmask(int newmask){    int old=current->blocked;     current->blocked = newmask & ~(1<<(SIGKILL-1));    return old;} static inline void save_old(char * from,char * to){    int i;     verify_area(to, sizeof(struct sigaction));    for (i=0 ; i< sizeof(struct sigaction) ; i++) {        put_fs_byte(*from,to);        from++;        to++;    }} static inline void get_new(char * from,char * to){    int i;     for (i=0 ; i< sizeof(struct sigaction) ; i++)        *(to++) = get_fs_byte(from++);} int sys_signal(int signum, long handler, long restorer){    struct sigaction tmp;     if (signum<1 || signum>32 || signum==SIGKILL) //判断信号值是否合法        return -1;    tmp.sa_handler = (void (*)(int)) handler;    tmp.sa_mask = 0;    tmp.sa_flags = SA_ONESHOT | SA_NOMASK;    tmp.sa_restorer = (void (*)(void)) restorer;   //设置sigaction结构体    handler = (long) current->sigaction[signum-1].sa_handler;    current->sigaction[signum-1] = tmp; //将当前进程对应的信号结构体改为新分配的结构体    return handler; //返回处理函数} int sys_sigaction(int signum, const struct sigaction * action,    struct sigaction * oldaction){    struct sigaction tmp;     if (signum<1 || signum>32 || signum==SIGKILL)        return -1;    tmp = current->sigaction[signum-1];    get_new((char *) action,        (char *) (signum-1+current->sigaction));    if (oldaction)        save_old((char *) &tmp,(char *) oldaction);    if (current->sigaction[signum-1].sa_flags & SA_NOMASK)        current->sigaction[signum-1].sa_mask = 0;    else        current->sigaction[signum-1].sa_mask |= (1<<(signum-1));    return 0;} void do_signal(long signr,long eax, long ebx, long ecx, long edx,    long fs, long es, long ds,    long eip, long cs, long eflags,    unsigned long * esp, long ss){    unsigned long sa_handler;    long old_eip=eip;    struct sigaction * sa = 
current->sigaction + signr - 1;    int longs;    unsigned long * tmp_esp;     sa_handler = (unsigned long) sa->sa_handler;    if (sa_handler==1)        return;    if (!sa_handler) {        if (signr==SIGCHLD)            return;        else            do_exit(1<<(signr-1));    }    if (sa->sa_flags & SA_ONESHOT)        sa->sa_handler = NULL;    *(&eip) = sa_handler;    longs = (sa->sa_flags & SA_NOMASK)?7:8;    *(&esp) -= longs;    verify_area(esp,longs*4);    tmp_esp=esp;    put_fs_long((long) sa->sa_restorer,tmp_esp++);    put_fs_long(signr,tmp_esp++);    if (!(sa->sa_flags & SA_NOMASK))        put_fs_long(current->blocked,tmp_esp++);    put_fs_long(eax,tmp_esp++);    put_fs_long(ecx,tmp_esp++);    put_fs_long(edx,tmp_esp++);    put_fs_long(eflags,tmp_esp++);    put_fs_long(old_eip,tmp_esp++);    current->blocked |= sa->sa_mask;}


/* Excerpt from the signal header (listing marker: Line 12). */
#define SIGHUP       1      /* hang-up of the controlling terminal or process */
#define SIGINT       2      /* keyboard interrupt */
#define SIGQUIT      3      /* keyboard quit */
#define SIGILL       4      /* illegal instruction */
#define SIGTRAP      5      /* trace / breakpoint */
#define SIGABRT      6      /* abnormal termination */
#define SIGIOT       6      /* abnormal termination (same number as SIGABRT) */
#define SIGUNUSED    7      /* unused */
#define SIGFPE       8      /* coprocessor error */
#define SIGKILL      9      /* terminate the process */
#define SIGUSR1     10      /* user signal 1 */
#define SIGSEGV     11      /* invalid memory reference */
#define SIGUSR2     12      /* user signal 2 */
#define SIGPIPE     13      /* write to a pipe whose read end is closed */
#define SIGALRM     14      /* timer alarm */
#define SIGTERM     15      /* process termination */
#define SIGSTKFLT   16      /* stack fault */
#define SIGCHLD     17      /* child status changed */
#define SIGCONT     18      /* resume a stopped process */
#define SIGSTOP     19      /* stop the process */
#define SIGTSTP     20      /* stop request issued from the tty */
#define SIGTTIN     21      /* background process requests input */
#define SIGTTOU     22      /* background process requests output */

/* (listing marker: Line 37) flags for sigaction.sa_flags */
#define SA_NOCLDSTOP    1           /* no SIGCHLD handling when a child is stopped */
#define SA_NOMASK   0x40000000      /* allow the signal to arrive again inside its own handler */
#define SA_ONESHOT  0x80000000      /* restore the default handler once the handler has been called */

/* (listing marker: Line 45) special handler values */
#define SIG_DFL     ((void (*)(int))0)  /* default handling */
#define SIG_IGN     ((void (*)(int))1)  /* ignore the signal */

typedef unsigned int sigset_t;

/* Per-signal action record. */
struct sigaction {
    void (*sa_handler)(int);    /* signal handler */
    sigset_t sa_mask;           /* signals to block while the handler runs */
    int sa_flags;               /* combination of the SA_* flags above */
    void (*sa_restorer)(void);  /* restorer function, supplied by libc */
};


(3)exit.c

#include <errno.h>#include <signal.h>#include <sys/wait.h> #include <linux/sched.h>#include <linux/kernel.h>#include <linux/tty.h>#include <asm/segment.h> int sys_pause(void);int sys_close(int fd); void release(struct task_struct * p)  //释放进程p{    int i;     if (!p)        return;    for (i=1 ; i<NR_TASKS ; i++)        if (task[i]==p) {            task[i]=NULL;            free_page((long)p);  //释放内存页            schedule();   //之后重新进行进程调度            return;        }    panic("trying to release non-existent task");} static inline int send_sig(long sig,struct task_struct * p,int priv){    if (!p || sig<1 || sig>32)        return -EINVAL;    if (priv || (current->euid==p->euid) || suser())        p->signal |= (1<<(sig-1));   //给p进程发送信号    else        return -EPERM;    return 0;} static void kill_session(void)  //关闭对话函数{    struct task_struct **p = NR_TASKS + task; //获得task数组最后一个任务     while (--p > &FIRST_TASK) {  //从最后一个向前遍历        if (*p && (*p)->session == current->session) //如果遍历到当前的任务            (*p)->signal |= 1<<(SIGHUP-1);     //则将SIGHUP挂断信号发送给当前任务    }} /* * XXX need to check permissions needed to send signals to process * groups, etc. etc.  kill() permissions semantics are tricky! 
*/int sys_kill(int pid,int sig)  //linux命令kill不是杀死的意思,是向某进程发送任何信号{    struct task_struct **p = NR_TASKS + task;  //指向最后    int err, retval = 0; // 注: 每个进程组都有一个组长进程,组长进程的进程号等于进程组ID     if (!pid) while (--p > &FIRST_TASK) {  //如果pid为0,进入循环        if (*p && (*p)->pgrp == current->pid) //向进程组的所有成员发送信号            if (err=send_sig(sig,*p,1))                retval = err;    }    else if (pid>0) while (--p > &FIRST_TASK) { //如果pid大于0        if (*p && (*p)->pid == pid)  //仅向pid进程发送信号            if (err=send_sig(sig,*p,0))                retval = err;    }    else if (pid == -1) while (--p > &FIRST_TASK) //如果pid=-1        if (err = send_sig(sig,*p,0)) //向除0号进程外的进程发送信号            retval = err;    else while (--p > &FIRST_TASK) //如果pid<-1        if (*p && (*p)->pgrp == -pid) //向进程组号为-pid的进程组发送信号            if (err = send_sig(sig,*p,0))                retval = err;    return retval;} static void tell_father(int pid)  //传入参数为父进程的pid{    int i;     if (pid)        for (i=0;i<NR_TASKS;i++) {            if (!task[i])                continue;            if (task[i]->pid != pid)                continue;            task[i]->signal |= (1<<(SIGCHLD-1)); //SIGCHLD=17            return;        }/* if we don't find any fathers, we just release ourselves *//* This is not really OK. 
Must change it to make father 1 */    printk("BAD BAD - no father foundnr");    release(current);  //释放子进程} int do_exit(long code){    int i;                                                                //#define LDT_NUL 0                                                                //#define LDT_CODE 1                                                                //#define LDT_DATA 2    free_page_tables(get_base(current->ldt[1]),get_limit(0x0f)); //释放当前进程的CODE段所占用的内存页    free_page_tables(get_base(current->ldt[2]),get_limit(0x17));    for (i=0 ; i<NR_TASKS ; i++)  //从前向后遍历        if (task[i] && task[i]->father == current->pid) { //若当前进程就是某个进程的父进程时;            task[i]->father = 1; //就让1号进程作为某个进程的父进程(因为current这个进程将会exit)            if (task[i]->state == TASK_ZOMBIE) //若某进程是僵死状态                /* assumption task[1] is always init */                (void) send_sig(SIGCHLD, task[1], 1); //给1号进程发送信号        }    for (i=0 ; i<NR_OPEN ; i++)        if (current->filp[i]) //关闭当前进程打开的所有文件            sys_close(i);    iput(current->pwd);  //把当前进程的路径放回i节点并置空    current->pwd=NULL;    iput(current->root);    current->root=NULL;    iput(current->executable);    current->executable=NULL;    if (current->leader && current->tty >= 0) //若当前进程是进程组的头头,并且拥有tty终端        tty_table[current->tty].pgrp = 0; //释放该终端    if (last_task_used_math == current)        last_task_used_math = NULL;    if (current->leader)        kill_session(); //关闭session    current->state = TASK_ZOMBIE; //设置成僵死状态    current->exit_code = code;    tell_father(current->father); //向当前进程的父进程发送 SIGCHLD 信号    schedule();    return (-1);    /* just to suppress warnings */} int sys_exit(int error_code){    return do_exit((error_code&0xff)<<8);} int sys_waitpid(pid_t pid,unsigned long * stat_addr, int options){    int flag, code;    struct task_struct ** p;     verify_area(stat_addr,4);repeat:    flag=0;    for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) {        if (!*p || *p == current) //若该项为空或者该项是当前进程,则跳过            
continue;        if ((*p)->father != current->pid) //若该项的父进程不是当前进程,则跳过            continue;        if (pid>0) {  //若pid>0            if ((*p)->pid != pid) //若该项的pid不是waitpid传进来的pid参数,则跳过                continue;        } else if (!pid) {    //若pid=0,            if ((*p)->pgrp != current->pgrp) //若当前项不在当前进程组,则跳过                continue;        } else if (pid != -1) { //若pid<-1            if ((*p)->pgrp != -pid) //若当前项不在-pid的进程组,则跳过                continue;        }        switch ((*p)->state) {  //若pid=-1,则直接来到switch;判断所选进程p的状态            case TASK_STOPPED: //若是停止状态                if (!(options & WUNTRACED))   //                    continue;                put_fs_long(0x7f,stat_addr);                return (*p)->pid;            case TASK_ZOMBIE:                current->cutime += (*p)->utime;                current->cstime += (*p)->stime;                flag = (*p)->pid;                code = (*p)->exit_code;                release(*p);                put_fs_long(code,stat_addr);                return flag;            default:   //p是睡眠或运行状态,设置flag为1                flag=1;                continue;        }    }    if (flag) {        if (options & WNOHANG)  //WNOHANG 表示若没有子进程处于退出或终止态就返回            return 0;        current->state=TASK_INTERRUPTIBLE; //否则将当前进程的状态置为可中断睡眠态        schedule();        if (!(current->signal &= ~(1<<(SIGCHLD-1))))            goto repeat;        else            return -EINTR;    }    return -ECHILD;}


do_exit()


① 释放进程的代码段和数据段占用的内存。


② 关闭进程打开的所有文件,对当前目录和i节点进行同步(文件操作)。


③ 如果当前要销毁的进程有子进程,就让1号进程作为新的父进程。


④ 如果当前进程是一个会话头进程,则会终止会话中的所有进程。


⑤ 改变当前进程的运行状态,变成TASK_ZOMBIE(僵死)状态,并且向其父进程发送SIGCHLD信号,说明自己要死了。


sys_waitpid()


① 父进程在运行子进程时一般都会运行wait waitpid这两个函数,用来父进程等待子进程终止。


② 当父进程收到SIGCHLD信号时,父进程会终止僵死状态的子进程。


③ 父进程会把子进程的运行时间累加到自己的运行时间上。


④ 把对应子进程的进程描述结构体进行释放,置空数组空槽。




进程


1.内核进程初始化与创建


每创建一个进程就对应着一个task_struct结构体。

/* Per-task descriptor; one exists for every task. */
struct task_struct {
/* these are hardcoded - don't touch */
    long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
    long counter;
    long priority;
    long signal;
    struct sigaction sigaction[32];
    long blocked;    /* bitmap of masked signals */
/* various fields */
    int exit_code;
    unsigned long start_code,end_code,end_data,brk,start_stack;
    long pid,father,pgrp,session,leader;
    unsigned short uid,euid,suid;
    unsigned short gid,egid,sgid;
    long alarm;
    long utime,stime,cutime,cstime,start_time;
    unsigned short used_math;
/* file system info */
    int tty;        /* -1 if no tty, so it must be signed */
    unsigned short umask;
    struct m_inode * pwd;
    struct m_inode * root;
    struct m_inode * executable;
    unsigned long close_on_exec;
    struct file * filp[NR_OPEN];
/* ldt for this task 0 - zero 1 - cs 2 - ds&ss */
    struct desc_struct ldt[3];
/* tss for this task: holds the task's saved register values */
    struct tss_struct tss;
};


(1)0号和1号进程的创建

Linux在初始化的过程中会进行0号进程的创建。


注:分析0.11的main函数

void main(void)        /* This really IS void, no error here. */{            /* The startup routine assumes (well, ...) this *//* * Interrupts are still disabled. Do necessary setups, then * enable them */     ROOT_DEV = ORIG_ROOT_DEV;     drive_info = DRIVE_INFO;    memory_end = (1<<20) + (EXT_MEM_K<<10);    memory_end &= 0xfffff000;    if (memory_end > 16*1024*1024)        memory_end = 16*1024*1024;    if (memory_end > 12*1024*1024)        buffer_memory_end = 4*1024*1024;    else if (memory_end > 6*1024*1024)        buffer_memory_end = 2*1024*1024;    else        buffer_memory_end = 1*1024*1024;    main_memory_start = buffer_memory_end;#ifdef RAMDISK    main_memory_start += rd_init(main_memory_start, RAMDISK*1024);#endif    mem_init(main_memory_start,memory_end);    trap_init();    blk_dev_init();    chr_dev_init();    tty_init();    time_init();    sched_init();    buffer_init(buffer_memory_end);    hd_init();    floppy_init();    sti();    move_to_user_mode(); //切换到用户态    if (!fork()) {        /* 创建0号进程 */          init();    }    for(;;) pause();}


内核(此时作为0号进程运行)要先切换到用户态,之后再fork生成1号进程(init进程)。

/*
 * Switch to user mode with a hand-built interrupt-return frame.
 * Five values are pushed in the order ss (0x17), esp (the current stack
 * pointer), eflags, cs (0x0f) and eip (local label 1); iret then pops
 * them, so execution continues at label 1, where ds, es, fs and gs are
 * loaded with 0x17.
 */
#define move_to_user_mode() \
__asm__ ("movl %%esp,%%eax\n\t" \
    "pushl $0x17\n\t" \
    "pushl %%eax\n\t" \
    "pushfl\n\t" \
    "pushl $0x0f\n\t" \
    "pushl $1f\n\t" \
    "iret\n" \
    "1:\tmovl $0x17,%%eax\n\t" \
    "movw %%ax,%%ds\n\t" \
    "movw %%ax,%%es\n\t" \
    "movw %%ax,%%fs\n\t" \
    "movw %%ax,%%gs" \
    :::"ax")


iret是从中断返回的指令。执行iret时,之前5个push压入的数据会依次出栈,按出栈顺序分别赋给eip,cs,eflags,esp,ss(与压栈顺序ss,esp,eflags,cs,eip相反)。

fork生成1号进程之后,1号进程会执行init()进行初始化,进一步分析如下:

static char * argv_rc[] = { "/bin/sh", NULL };static char * envp_rc[] = { "HOME=/", NULL }; static char * argv[] = { "-/bin/sh",NULL };static char * envp[] = { "HOME=/usr/root", NULL }; void init(void){    int pid,i;     setup((void *) &drive_info);    (void) open("/dev/tty0",O_RDWR,0); //tty0设备是标准输入控制台,句柄为0    (void) dup(0);    (void) dup(0);    printf("%d buffers = %d bytes buffer spacenr",NR_BUFFERS,        NR_BUFFERS*BLOCK_SIZE);    printf("Free mem: %d bytesnr",memory_end-main_memory_start);    if (!(pid=fork())) {  //对于被创建的子进程,返回值为0,所以if里面的语句是在子进程中执行,并打开rc文件并用获得的shell在/执行rc里的命令        close(0);  //关闭标准输入,所有进程共用文件描述符        if (open("/etc/rc",O_RDONLY,0))            _exit(1);        execve("/bin/sh",argv_rc,envp_rc);        _exit(2);    }    if (pid>0)  //fork后对于父进程来说,返回的是子进程的进程号,即if语句内是父进程要执行的代码        while (pid != wait(&i))  //等待子进程退出               /* nothing */;    while (1) {   //如果执行到了这里,就说明子进程已经创建完成退出或者终止,下面是再创建一个子进程,        if ((pid=fork())<0) {            printf("Fork failed in initrn");            continue;        }        if (!pid) {  //创建成功            close(0);close(1);close(2);            setsid();            (void) open("/dev/tty0",O_RDWR,0);            (void) dup(0);            (void) dup(0);            _exit(execve("/bin/sh",argv,envp));        }        while (1)            if (pid == wait(&i))                break;        printf("nrchild %d died with code %04xnr",pid,i);        sync();    }    _exit(0);    /* NOTE! _exit, not exit() */}


① 1号进程(init)打开/dev/tty0作为标准输入,并通过两次dup(0)得到标准输出和标准错误句柄

② 1号进程再fork出一个子进程(2号进程):子进程关闭标准输入后打开"/etc/rc"文件作为标准输入,然后执行/bin/sh来运行rc中的命令

③ 如果1号进程创建失败,会换一种方式再次创建

④ 之后就是进行pause()暂停状态,系统等待运行下一步

####

2、普通进程的创建(WORKING)


众所周知每创建一个进程都会创建一个相对应的task_struct结构体,task结构体里就有代表该进程唯一的PID。


3、进程的调度与切换


以下是sched.c的源码。

#include <linux/sched.h>#include <linux/kernel.h>#include <linux/sys.h>#include <linux/fdreg.h>#include <asm/system.h>#include <asm/io.h>#include <asm/segment.h> #include <signal.h> #define _S(nr) (1<<((nr)-1))#define _BLOCKABLE (~(_S(SIGKILL) | _S(SIGSTOP))) void show_task(int nr,struct task_struct * p)  //nr就是pid{    int i,j = 4096-sizeof(struct task_struct);     printk("%d: pid=%d, state=%d, ",nr,p->pid,p->state); //打印pid与state    i=0;    while (i<j && !((char *)(p+1))[i])        i++;    printk("%d (of %d) chars free in kernel stacknr",i,j); //打印栈} void show_stat(void){    int i;     for (i=0;i<NR_TASKS;i++)        if (task[i])            show_task(i,task[i]);} #define LATCH (1193180/HZ) extern void mem_use(void); extern int timer_interrupt(void);extern int system_call(void); union task_union {    struct task_struct task;    char stack[PAGE_SIZE];}; static union task_union init_task = {INIT_TASK,}; long volatile jiffies=0;long startup_time=0;struct task_struct *current = &(init_task.task);struct task_struct *last_task_used_math = NULL; struct task_struct * task[NR_TASKS] = {&(init_task.task), }; long user_stack [ PAGE_SIZE>>2 ] ; struct {    long * a;    short b;    } stack_start = { & user_stack [PAGE_SIZE>>2] , 0x10 };/* *  'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task */void math_state_restore()  //进程切换时完成协处理器中寄存器的切换{    if (last_task_used_math == current)        return;    __asm__("fwait");    if (last_task_used_math) {        __asm__("fnsave %0"::"m" (last_task_used_math->tss.i387));    }    last_task_used_math=current;    if (current->used_math) {        __asm__("frstor %0"::"m" (current->tss.i387));    } else {        __asm__("fninit"::);        current->used_math=1;    }} /* *  'schedule()' is the scheduler function. This is GOOD CODE! 
There * probably won't be any reason to change this, as it should work well * in all circumstances (ie gives IO-bound processes good response etc). * The one thing you might take a look at is the signal-handler code here. * *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other * tasks can run. It can not be killed, and it cannot sleep. The 'state' * information in task[0] is never used. */void schedule(void) {    int i,next,c;    struct task_struct ** p; /* check alarm, wake up any interruptible tasks that have got a signal */ /*#define TASK_RUNNING        0     只有state是0时,该进程才会被运行,或进入就绪队列#define TASK_INTERRUPTIBLE    1     可中断睡眠状态   可以被信号中断,变成running状态#define TASK_UNINTERRUPTIBLE    2 不可中断睡眠状态 只能被wakeup函数唤醒,变成running状态#define TASK_ZOMBIE        3         僵死状态        进程停止运行,但是其task_struct未被清空#define TASK_STOPPED        4     暂停状态  */    for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) //从后往前遍历        if (*p) { //若进程存在            if ((*p)->alarm && (*p)->alarm < jiffies) { //若alarm不为空且小于jiffies(此处是0)                    (*p)->signal |= (1<<(SIGALRM-1));                    (*p)->alarm = 0;                }            if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&  //进程不理会某些信号;并且进程是可中断睡眠状态            (*p)->state==TASK_INTERRUPTIBLE)                (*p)->state=TASK_RUNNING;        } /* this is the scheduler proper: */     while (1) {  //进行counter的比较,来决定进程的调用        c = -1;        next = 0;        i = NR_TASKS;        p = &task[NR_TASKS];        while (--i) {            if (!*--p)                continue;            if ((*p)->state == TASK_RUNNING && (*p)->counter > c)                c = (*p)->counter, next = i;  //遍历之后,会将counter的最大值赋给c,并且next存着最大counter的pid        }        if (c) break;        for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)            if (*p)                (*p)->counter = ((*p)->counter >> 1) +    //counter = counter/2 + priority                        (*p)->priority;    }    switch_to(next); 
//进程切换}/*这部分代码的目的是在所有就绪状态的任务进程中筛选出counter值最大的进程ID。之后如果counter值不为0则进入调度这个进程执行,如果counter值为0,则说明所有就绪状态的进程的时间片都已用完,需要重新调整所有进程的时间片。*/  /*#define switch_to(n) {struct {long a,b;} __tmp; __asm__("cmpl %%ecx,_currentnt"     "je 1fnt"     "movw %%dx,%1nt"     "xchgl %%ecx,_currentnt"     "ljmp %0nt"     "cmpl %%ecx,_last_task_used_mathnt"     "jne 1fnt"     "cltsn"     "1:"     ::"m" (*&__tmp.a),"m" (*&__tmp.b),     "d" (_TSS(n)),"c" ((long) task[n])); }*/     int sys_pause(void){    current->state = TASK_INTERRUPTIBLE;    schedule();    return 0;} void sleep_on(struct task_struct **p) //当p进程想访问cpu的某个资源,但是该资源被占用;{    struct task_struct *tmp;     if (!p)        return;    if (current == &(init_task.task)) //如果当前进程为0号进程时,就返回,不能sleep        panic("task[0] trying to sleep");    tmp = *p;    *p = current; //将p赋为当前进程    current->state = TASK_UNINTERRUPTIBLE;    schedule();    if (tmp)        tmp->state=0;} void interruptible_sleep_on(struct task_struct **p){    struct task_struct *tmp;     if (!p)        return;    if (current == &(init_task.task))        panic("task[0] trying to sleep");    tmp=*p;    *p=current;repeat:    current->state = TASK_INTERRUPTIBLE;    schedule();    if (*p && *p != current) {        (**p).state=0;        goto repeat;    }    *p=NULL;    if (tmp)        tmp->state=0;} void wake_up(struct task_struct **p){    if (p && *p) {        (**p).state=0;        *p=NULL;    }} /* * OK, here are some floppy things that shouldn't be in the kernel * proper. They are here because the floppy needs a timer, and this * was the easiest way of doing it. 
*/static struct task_struct * wait_motor[4] = {NULL,NULL,NULL,NULL};static int  mon_timer[4]={0,0,0,0};static int moff_timer[4]={0,0,0,0};unsigned char current_DOR = 0x0C; int ticks_to_floppy_on(unsigned int nr){    extern unsigned char selected;    unsigned char mask = 0x10 << nr;     if (nr>3)        panic("floppy_on: nr>3");    moff_timer[nr]=10000;        /* 100 s = very big :-) */    cli();                /* use floppy_off to turn it off */    mask |= current_DOR;    if (!selected) {        mask &= 0xFC;        mask |= nr;    }    if (mask != current_DOR) {        outb(mask,FD_DOR);        if ((mask ^ current_DOR) & 0xf0)            mon_timer[nr] = HZ/2;        else if (mon_timer[nr] < 2)            mon_timer[nr] = 2;        current_DOR = mask;    }    sti();    return mon_timer[nr];} void floppy_on(unsigned int nr){    cli();    while (ticks_to_floppy_on(nr))        sleep_on(nr+wait_motor);    sti();} void floppy_off(unsigned int nr){    moff_timer[nr]=3*HZ;} void do_floppy_timer(void){    int i;    unsigned char mask = 0x10;     for (i=0 ; i<4 ; i++,mask <<= 1) {        if (!(mask & current_DOR))            continue;        if (mon_timer[i]) {            if (!--mon_timer[i])                wake_up(i+wait_motor);        } else if (!moff_timer[i]) {            current_DOR &= ~mask;            outb(current_DOR,FD_DOR);        } else            moff_timer[i]--;    }} #define TIME_REQUESTS 64 static struct timer_list {    long jiffies;    void (*fn)();    struct timer_list * next;} timer_list[TIME_REQUESTS], * next_timer = NULL; void add_timer(long jiffies, void (*fn)(void)){    struct timer_list * p;     if (!fn)        return;    cli();    if (jiffies <= 0)        (fn)();    else {        for (p = timer_list ; p < timer_list + TIME_REQUESTS ; p++)            if (!p->fn)                break;        if (p >= timer_list + TIME_REQUESTS)            panic("No more time requests free");        p->fn = fn;        p->jiffies = jiffies;        p->next = next_timer;     
   next_timer = p;        while (p->next && p->next->jiffies < p->jiffies) {            p->jiffies -= p->next->jiffies;            fn = p->fn;            p->fn = p->next->fn;            p->next->fn = fn;            jiffies = p->jiffies;            p->jiffies = p->next->jiffies;            p->next->jiffies = jiffies;            p = p->next;        }    }    sti();} void do_timer(long cpl){    extern int beepcount;    extern void sysbeepstop(void);     if (beepcount)        if (!--beepcount)            sysbeepstop();     if (cpl)        current->utime++;    else        current->stime++;     if (next_timer) {        next_timer->jiffies--;        while (next_timer && next_timer->jiffies <= 0) {            void (*fn)(void);             fn = next_timer->fn;            next_timer->fn = NULL;            next_timer = next_timer->next;            (fn)();        }    }    if (current_DOR & 0xf0)        do_floppy_timer();    if ((--current->counter)>0) return;    current->counter=0;    if (!cpl) return;    schedule();} int sys_alarm(long seconds){    int old = current->alarm;     if (old)        old = (old - jiffies) / HZ;    current->alarm = (seconds>0)?(jiffies+HZ*seconds):0;    return (old);} int sys_getpid(void){    return current->pid;} int sys_getppid(void){    return current->father;} int sys_getuid(void){    return current->uid;} int sys_geteuid(void){    return current->euid;} int sys_getgid(void){    return current->gid;} int sys_getegid(void){    return current->egid;} int sys_nice(long increment){    if (current->priority-increment>0)        current->priority -= increment;    return 0;} void sched_init(void){    int i;    struct desc_struct * p;     if (sizeof(struct sigaction) != 16)        panic("Struct sigaction MUST be 16 bytes");    set_tss_desc(gdt+FIRST_TSS_ENTRY,&(init_task.task.tss));    set_ldt_desc(gdt+FIRST_LDT_ENTRY,&(init_task.task.ldt));    p = gdt+2+FIRST_TSS_ENTRY;    for(i=1;i<NR_TASKS;i++) {        task[i] = NULL;        p->a=p->b=0;        p++;  
      p->a=p->b=0;        p++;    }/* Clear NT, so that we won't have troubles with that later on */    __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl");    ltr(0);    lldt(0);    outb_p(0x36,0x43);        /* binary, mode 3, LSB/MSB, ch 0 */    outb_p(LATCH & 0xff , 0x40);    /* LSB */    outb(LATCH >> 8 , 0x40);    /* MSB */    set_intr_gate(0x20,&timer_interrupt);    outb(inb_p(0x21)&~0x01,0x21);    set_system_gate(0x80,&system_call);}


4、进程的销毁


进程的销毁就是前文exit.c中do_exit()等函数所做的操作。


5、进程间的通信(WORKING)


(1)进程,线程

创建一个进程之后,就会对应一个task_struct结构体,fork之后,会进行写时复制(Copy-On-Write),也就是说子进程和父进程的内容大部分是一致的。


问:一个进程多个线程的调度方式和一个进程一个线程时的调度方式有什么区别?

答:没有区别,内核中线程和进程都需要do_fork来实现,所以没有区别。





操作系统的引导与启动


1、BIOS/Bootloader:


PC机上电后从BIOS的入口地址(0xFFFF0)开始执行,BIOS进行一系列的硬件初始化和参数设置,然后把磁盘第一个扇区中的bootsect加载到内存中的固定地址(0x7C00)运行;bootsect随后把自己移动到0x90000处继续执行。

2、bootsect.s(WORKING)


磁盘引导块程序,在磁盘的第一个扇区中的程序(0磁道,0磁头,1扇区)。


作用:首先将后续的setup.s代码从磁盘中加载到紧接着bootsect.s的地方,在显示屏上显示loading system ,再将操作系统加载到0x10000,最后转到setup.s运行。

! SYS_SIZE is the number of clicks (16 bytes) to be loaded.
! 0x3000 is 0x30000 bytes = 196kB, more than enough for current
! versions of linux
!
SYSSIZE = 0x3000
!
!	bootsect.s		(C) 1991 Linus Torvalds
!
! bootsect.s is loaded at 0x7c00 by the bios-startup routines, and moves
! iself out of the way to address 0x90000, and jumps there.
!
! It then loads 'setup' directly after itself (0x90200), and the system
! at 0x10000, using BIOS interrupts.
!
! NOTE! currently system is at most 8*65536 bytes long. This should be no
! problem, even in the future. I want to keep it simple. This 512 kB
! kernel size should be enough, especially as this doesn't contain the
! buffer cache as in minix
!
! The loader has been made as simple as possible, and continuos
! read errors will result in a unbreakable loop. Reboot by hand. It
! loads pretty fast by getting whole sectors at a time whenever possible.

.globl begtext, begdata, begbss, endtext, enddata, endbss
.text
begtext:
.data
begdata:
.bss
begbss:
.text

SETUPLEN = 4				! nr of setup-sectors
BOOTSEG  = 0x07c0			! original address of boot-sector
INITSEG  = 0x9000			! we move boot here - out of the way
SETUPSEG = 0x9020			! setup starts here
SYSSEG   = 0x1000			! system loaded at 0x10000 (65536).
ENDSEG   = SYSSEG + SYSSIZE		! where to stop loading

! ROOT_DEV:	0x000 - same type of floppy as boot.
!		0x301 - first partition on first drive etc
ROOT_DEV = 0x306

entry start
start:
! copy 256 words (this boot sector) from BOOTSEG:0 to INITSEG:0
	mov	ax,#BOOTSEG
	mov	ds,ax
	mov	ax,#INITSEG
	mov	es,ax
	mov	cx,#256
	sub	si,si
	sub	di,di
	rep
	movw
	jmpi	go,INITSEG
go:	mov	ax,cs
	mov	ds,ax
	mov	es,ax
! put stack at 0x9ff00.
	mov	ss,ax
	mov	sp,#0xFF00		! arbitrary value >>512

! load the setup-sectors directly after the bootblock.
! Note that 'es' is already set up.

load_setup:
	mov	dx,#0x0000		! drive 0, head 0
	mov	cx,#0x0002		! sector 2, track 0
	mov	bx,#0x0200		! address = 512, in INITSEG
	mov	ax,#0x0200+SETUPLEN	! service 2, nr of sectors
	int	0x13			! read it
	jnc	ok_load_setup		! ok - continue
	mov	dx,#0x0000
	mov	ax,#0x0000		! reset the diskette
	int	0x13
	j	load_setup

ok_load_setup:

! Get disk drive parameters, specifically nr of sectors/track

	mov	dl,#0x00
	mov	ax,#0x0800		! AH=8 is get drive parameters
	int	0x13
	mov	ch,#0x00
	seg cs
	mov	sectors,cx
	mov	ax,#INITSEG
	mov	es,ax

! Print some inane message

	mov	ah,#0x03		! read cursor pos
	xor	bh,bh
	int	0x10

	mov	cx,#24
	mov	bx,#0x0007		! page 0, attribute 7 (normal)
	mov	bp,#msg1
	mov	ax,#0x1301		! write string, move cursor
	int	0x10

! ok, we've written the message, now
! we want to load the system (at 0x10000)

	mov	ax,#SYSSEG
	mov	es,ax		! segment of 0x010000
	call	read_it
	call	kill_motor

! After that we check which root-device to use. If the device is
! defined (!= 0), nothing is done and the given device is used.
! Otherwise, either /dev/PS0 (2,28) or /dev/at0 (2,8), depending
! on the number of sectors that the BIOS reports currently.

	seg cs
	mov	ax,root_dev
	cmp	ax,#0
	jne	root_defined
	seg cs
	mov	bx,sectors
	mov	ax,#0x0208		! /dev/ps0 - 1.2Mb
	cmp	bx,#15
	je	root_defined
	mov	ax,#0x021c		! /dev/PS0 - 1.44Mb
	cmp	bx,#18
	je	root_defined
undef_root:
	jmp undef_root
root_defined:
	seg cs
	mov	root_dev,ax

! after that (everyting loaded), we jump to
! the setup-routine loaded directly after
! the bootblock:

	jmpi	0,SETUPSEG

! This routine loads the system at address 0x10000, making sure
! no 64kB boundaries are crossed. We try to load it as fast as
! possible, loading whole tracks whenever we can.
!
! in:	es - starting address segment (normally 0x1000)
!
sread:	.word 1+SETUPLEN	! sectors read of current track
head:	.word 0			! current head
track:	.word 0			! current track

read_it:
	mov ax,es
	test ax,#0x0fff
die:	jne die			! es must be at 64kB boundary
	xor bx,bx		! bx is starting address within segment
rp_read:
	mov ax,es
	cmp ax,#ENDSEG		! have we loaded all yet?
	jb ok1_read
	ret
ok1_read:
	seg cs
	mov ax,sectors
	sub ax,sread
	mov cx,ax
	shl cx,#9
	add cx,bx
	jnc ok2_read
	je ok2_read
	xor ax,ax
	sub ax,bx
	shr ax,#9
ok2_read:
	call read_track
	mov cx,ax
	add ax,sread
	seg cs
	cmp ax,sectors
	jne ok3_read
	mov ax,#1
	sub ax,head
	jne ok4_read
	inc track
ok4_read:
	mov head,ax
	xor ax,ax
ok3_read:
	mov sread,ax
	shl cx,#9
	add bx,cx
	jnc rp_read
	mov ax,es
	add ax,#0x1000
	mov es,ax
	xor bx,bx
	jmp rp_read

read_track:
	push ax
	push bx
	push cx
	push dx
	mov dx,track
	mov cx,sread
	inc cx
	mov ch,dl
	mov dx,head
	mov dh,dl
	mov dl,#0
	and dx,#0x0100
	mov ah,#2
	int 0x13
	jc bad_rt
	pop dx
	pop cx
	pop bx
	pop ax
	ret
bad_rt:	mov ax,#0
	mov dx,#0
	int 0x13
	pop dx
	pop cx
	pop bx
	pop ax
	jmp read_track

/*
 * This procedure turns off the floppy drive motor, so
 * that we enter the kernel in a known state, and
 * don't have to worry about it later.
 */
kill_motor:
	push dx
	mov dx,#0x3f2
	mov al,#0
	outb
	pop dx
	ret

sectors:
	.word 0

msg1:
	.byte 13,10
	.ascii "Loading system ..."
	.byte 13,10,13,10

.org 508
root_dev:
	.word ROOT_DEV
boot_flag:
	.word 0xAA55

.text
endtext:
.data
enddata:
.bss
endbss:


3、setup.s(WORKING)


通过BIOS中断获取内存、显示、硬盘等系统参数并保存到0x90000处,把system模块搬移到内存0地址处,加载临时的IDT(中断描述符表)和GDT(全局描述符表)(此阶段尚未设置LDT),开启A20地址线,重新设置中断控制芯片(8259A),进入保护模式运行;最后跳转到head.s运行。

!
!	setup.s		(C) 1991 Linus Torvalds
!
! setup.s is responsible for getting the system data from the BIOS,
! and putting them into the appropriate places in system memory.
! both setup.s and system has been loaded by the bootblock.
!
! This code asks the bios for memory/disk/other parameters, and
! puts them in a "safe" place: 0x90000-0x901FF, ie where the
! boot-block used to be. It is then up to the protected mode
! system to read them from there before the area is overwritten
! for buffer-blocks.
!
! Reviewer summary of what the code below stores (offsets relative to INITSEG):
!   [0]     dx after int 0x10 / ah=0x03 (cursor position)
!   [2]     ax after int 0x15 / ah=0x88 (extended memory, kB)
!   [4],[6] bx, ax after int 0x10 / ah=0x0f (video data)
!   [8],[10],[12]  ax, bx, cx after int 0x10 / ah=0x12, bl=0x10
!   0x80    16 bytes copied from the far pointer at 4*0x41 (hd0)
!   0x90    16 bytes copied from the far pointer at 4*0x46 (hd1),
!           zero-filled again if the hd1 check falls into no_disk1
!

! NOTE! These had better be the same as in bootsect.s!

INITSEG  = 0x9000    ! we move boot here - out of the way
SYSSEG   = 0x1000    ! system loaded at 0x10000 (65536).
SETUPSEG = 0x9020    ! this is the current segment

.globl begtext, begdata, begbss, endtext, enddata, endbss
.text
begtext:
.data
begdata:
.bss
begbss:
.text

entry start
start:

! ok, the read went well so we get current cursor position and save it for
! posterity.

    mov    ax,#INITSEG    ! this is done in bootsect already, but...
    mov    ds,ax
    mov    ah,#0x03    ! read cursor pos
    xor    bh,bh
    int    0x10        ! save it in known place, con_init fetches
    mov    [0],dx        ! it from 0x90000.

! Get memory size (extended mem, kB)

    mov    ah,#0x88
    int    0x15
    mov    [2],ax

! Get video-card data:

    mov    ah,#0x0f
    int    0x10
    mov    [4],bx        ! bh = display page
    mov    [6],ax        ! al = video mode, ah = window width

! check for EGA/VGA and some config parameters

    mov    ah,#0x12
    mov    bl,#0x10
    int    0x10
    mov    [8],ax
    mov    [10],bx
    mov    [12],cx

! Get hd0 data

    mov    ax,#0x0000
    mov    ds,ax
    lds    si,[4*0x41]
    mov    ax,#INITSEG
    mov    es,ax
    mov    di,#0x0080
    mov    cx,#0x10
    rep
    movsb

! Get hd1 data

    mov    ax,#0x0000
    mov    ds,ax
    lds    si,[4*0x46]
    mov    ax,#INITSEG
    mov    es,ax
    mov    di,#0x0090
    mov    cx,#0x10
    rep
    movsb

! Check that there IS a hd1 :-)

    mov    ax,#0x01500
    mov    dl,#0x81
    int    0x13
    jc    no_disk1
    cmp    ah,#3
    je    is_disk1
no_disk1:
    mov    ax,#INITSEG
    mov    es,ax
    mov    di,#0x0090
    mov    cx,#0x10
    mov    ax,#0x00
    rep
    stosb
is_disk1:

! now we want to move to protected mode ...

    cli            ! no interrupts allowed !

! first we move the system to it's rightful place
! (each pass copies 0x8000 words from segment ax to segment ax-0x1000,
!  for ax = 0x1000 .. 0x8000; the loop ends when ax reaches 0x9000)

    mov    ax,#0x0000
    cld            ! 'direction'=0, movs moves forward
do_move:
    mov    es,ax        ! destination segment
    add    ax,#0x1000
    cmp    ax,#0x9000
    jz    end_move
    mov    ds,ax        ! source segment
    sub    di,di
    sub    si,si
    mov     cx,#0x8000
    rep
    movsw
    jmp    do_move

! then we load the segment descriptors

end_move:
    mov    ax,#SETUPSEG    ! right, forgot this at first. didn't work :-)
    mov    ds,ax
    lidt    idt_48        ! load idt with 0,0
    lgdt    gdt_48        ! load gdt with whatever appropriate

! that was painless, now we enable A20

    call    empty_8042
    mov    al,#0xD1        ! command write
    out    #0x64,al
    call    empty_8042
    mov    al,#0xDF        ! A20 on
    out    #0x60,al
    call    empty_8042

! well, that went ok, I hope. Now we have to reprogram the interrupts :-(
! we put them right after the intel-reserved hardware interrupts, at
! int 0x20-0x2F. There they won't mess up anything. Sadly IBM really
! messed this up with the original PC, and they haven't been able to
! rectify it afterwards. Thus the bios puts interrupts at 0x08-0x0f,
! which is used for the internal hardware interrupts as well. We just
! have to reprogram the 8259's, and it isn't fun.

    mov    al,#0x11        ! initialization sequence
    out    #0x20,al        ! send it to 8259A-1
    .word    0x00eb,0x00eb        ! jmp $+2, jmp $+2
    out    #0xA0,al        ! and to 8259A-2
    .word    0x00eb,0x00eb
    mov    al,#0x20        ! start of hardware int's (0x20)
    out    #0x21,al
    .word    0x00eb,0x00eb
    mov    al,#0x28        ! start of hardware int's 2 (0x28)
    out    #0xA1,al
    .word    0x00eb,0x00eb
    mov    al,#0x04        ! 8259-1 is master
    out    #0x21,al
    .word    0x00eb,0x00eb
    mov    al,#0x02        ! 8259-2 is slave
    out    #0xA1,al
    .word    0x00eb,0x00eb
    mov    al,#0x01        ! 8086 mode for both
    out    #0x21,al
    .word    0x00eb,0x00eb
    out    #0xA1,al
    .word    0x00eb,0x00eb
    mov    al,#0xFF        ! mask off all interrupts for now
    out    #0x21,al
    .word    0x00eb,0x00eb
    out    #0xA1,al

! well, that certainly wasn't fun :-(. Hopefully it works, and we don't
! need no steenking BIOS anyway (except for the initial loading :-).
! The BIOS-routine wants lots of unnecessary data, and it's less
! "interesting" anyway. This is how REAL programmers do it.
!
! Well, now's the time to actually move into protected mode. To make
! things as simple as possible, we do no register set-up or anything,
! we let the gnu-compiled 32-bit programs do that. We just jump to
! absolute address 0x00000, in 32-bit protected mode.

    mov    ax,#0x0001    ! protected mode (PE) bit
    lmsw    ax        ! This is it!
    jmpi    0,8        ! jmp offset 0 of segment 8 (cs)

! This routine checks that the keyboard command queue is empty
! No timeout is used - if this hangs there is something wrong with
! the machine, and we probably couldn't proceed anyway.
empty_8042:
    .word    0x00eb,0x00eb
    in    al,#0x64    ! 8042 status port
    test    al,#2        ! is input buffer full?
    jnz    empty_8042    ! yes - loop
    ret

gdt:
    .word    0,0,0,0        ! dummy

    .word    0x07FF        ! 8Mb - limit=2047 (2048*4096=8Mb)
    .word    0x0000        ! base address=0
    .word    0x9A00        ! code read/exec
    .word    0x00C0        ! granularity=4096, 386

    .word    0x07FF        ! 8Mb - limit=2047 (2048*4096=8Mb)
    .word    0x0000        ! base address=0
    .word    0x9200        ! data read/write
    .word    0x00C0        ! granularity=4096, 386

idt_48:
    .word    0            ! idt limit=0
    .word    0,0            ! idt base=0L

gdt_48:
    .word    0x800        ! gdt limit=2048, 256 GDT entries
    .word    512+gdt,0x9    ! gdt base = 0X9xxxx

.text
endtext:
.data
enddata:
.bss
endbss:


注:GDT,LDT,IDT表是什么?


GDT(global descriptor table),全局段描述符表。表中每个段描述符为64 bit(8字节),这些描述符整齐地排列在内存中某一位置,而该位置的基地址以及表的界限(limit,决定有效描述符的个数)就存放在GDTR中,GDTR是特殊的寄存器。GDT在系统内只存在一个。


LDT(local descriptor table),局部段描述符表,LDT在系统内可存在多个,每个任务最多只能拥有一个LDT,另外,每一个LDT自身作为一个段存在,它们的段描述符被放在GDT中。

IDT(interrupt descriptor table),中断描述符表,IDT记录了0~255的中断号码和中断服务函数的关系。当发生中断的时候,通过中断号码去执行中断服务函数。


GDT可以被放在内存的任何位置,那么当程序员通过段寄存器来引用一个段描述符时,CPU必须知道GDT的入口,也就是基地址放在哪里,所以Intel的设计者们提供了一个寄存器GDTR用来存放GDT的入口地址,程序员将GDT设定在内存中某个位置之后,可以通过LGDT指令将GDT的入口地址装入此寄存器,从此以后,CPU就根据此寄存器中的内容作为GDT的入口来访问GDT了。


IA-32为LDT的入口地址也提供了一个寄存器LDTR,因为在任何时刻只能有一个任务在运行,所以LDT寄存器全局也只需要有一个。如果一个任务拥有自身的LDT,那么当它需要引用自身的LDT时,它需要通过LLDT指令将其LDT的段描述符装入此寄存器。LLDT指令与LGDT指令不同的是,LGDT指令的操作数是一个32-bit的内存地址,这个内存地址处存放的是一个32-bit的GDT入口地址,以及16-bit的GDT Limit。而LLDT指令的操作数是一个16-bit的选择子,这个选择子的主要内容是:被装入的LDT的段描述符在GDT中的索引值。


4、head.s(WORKING)

加载内核运行时的各数据段寄存器,重新设置中断描述符表,开启内核正常运行时的协处理器等资源;设置内存管理的分页机制,跳转到main.c运行。

/*
 *  head.s contains the 32-bit startup code.
 *
 * NOTE!!! Startup happens at absolute address 0x00000000, which is also where
 * the page directory will exist. The startup code will be overwritten by
 * the page directory.
 */
.text
.globl _idt,_gdt,_pg_dir,_tmp_floppy_area
_pg_dir:
startup_32:
    movl $0x10,%eax
    mov %ax,%ds
    mov %ax,%es
    mov %ax,%fs
    mov %ax,%gs             /* the four moves above load ds/es/fs/gs with selector 0x10 */
    lss _stack_start,%esp   /* load ss:esp from the far pointer stored at _stack_start.
                             * NOTE(review): _stack_start is defined outside this listing;
                             * the article states it yields ss=0x10 and
                             * esp=&user_stack[PAGE_SIZE>>2] - confirm against its definition. */
    call setup_idt          /* build and load the idt, then the gdt */
    call setup_gdt
    movl $0x10,%eax        # reload all the segment registers
    mov %ax,%ds        # after changing gdt. CS was already
    mov %ax,%es        # reloaded in 'setup_gdt'
    mov %ax,%fs
    mov %ax,%gs
    lss _stack_start,%esp
    xorl %eax,%eax
1:    incl %eax        # check that A20 really IS enabled
    movl %eax,0x000000    # loop forever if it isn't
    cmpl %eax,0x100000
    je 1b
/*
 * NOTE! 486 should set bit 16, to check for write-protect in supervisor
 * mode. Then it would be unnecessary with the "verify_area()"-calls.
 * 486 users probably want to set the NE (#5) bit also, so as to use
 * int 16 for math errors.
 */
    movl %cr0,%eax        # check math chip
    andl $0x80000011,%eax    # Save PG,PE,ET
/* "orl $0x10020,%eax" here for 486 might be good */
    orl $2,%eax        # set MP
    movl %eax,%cr0
    call check_x87
    jmp after_page_tables

/*
 * We depend on ET to be correct. This checks for 287/387.
 */
check_x87:
    fninit
    fstsw %ax
    cmpb $0,%al
    je 1f            /* no coprocessor: have to set bits */
    movl %cr0,%eax
    xorl $6,%eax        /* reset MP, set EM */
    movl %eax,%cr0
    ret
.align 2
1:    .byte 0xDB,0xE4        /* fsetpm for 287, ignored by 387 */
    ret

/*
 *  setup_idt
 *
 *  sets up a idt with 256 entries pointing to
 *  ignore_int, interrupt gates. It then loads
 *  idt. Everything that wants to install itself
 *  in the idt-table may do so themselves. Interrupts
 *  are enabled elsewhere, when we can be relatively
 *  sure everything is ok. This routine will be over-
 *  written by the page tables.
 */
setup_idt:
    lea ignore_int,%edx     /* edx = address of ignore_int */
    movl $0x00080000,%eax   /* high 16 bits of eax = 0x0008 (the selector) */
    movw %dx,%ax        /* selector = 0x0008 = cs; low 16 bits of eax = low 16 bits of the ignore_int address */
    movw $0x8E00,%dx    /* interrupt gate - dpl=0, present */

    lea _idt,%edi           /* edi = start of the idt table */
    mov $256,%ecx           /* one pass per idt entry */
rp_sidt:
    movl %eax,(%edi)        /* first 4 bytes of the 8-byte entry */
    movl %edx,4(%edi)       /* second 4 bytes of the entry */
    addl $8,%edi
    dec %ecx
    jne rp_sidt
    lidt idt_descr
    ret

/*
 *  setup_gdt
 *
 *  This routines sets up a new gdt and loads it.
 *  Only two entries are currently built, the same
 *  ones that were built in init.s. The routine
 *  is VERY complicated at two whole lines, so this
 *  rather long comment is certainly needed :-).
 *  This routine will beoverwritten by the page tables.
 */
setup_gdt:
    lgdt gdt_descr
    ret

/*
 * I put the kernel page tables right after the page directory,
 * using 4 of them to span 16 Mb of physical memory. People with
 * more than 16MB will have to expand this.
 */
.org 0x1000
pg0:

.org 0x2000
pg1:

.org 0x3000
pg2:

.org 0x4000
pg3:

.org 0x5000
/*
 * tmp_floppy_area is used by the floppy-driver when DMA cannot
 * reach to a buffer-block. It needs to be aligned, so that it isn't
 * on a 64kB border.
 */
_tmp_floppy_area:
    .fill 1024,1,0

after_page_tables:
    pushl $0        # These are the parameters to main :-)
    pushl $0
    pushl $0
    pushl $L6        # return address for main, if it decides to.
    pushl $_main
    jmp setup_paging
L6:
    jmp L6            # main should never return here, but
                # just in case, we know what happens.

/* This is the default interrupt "handler" :-) */
/* NOTE(review): the string below probably lost its escape backslashes when the
 * article was extracted (a trailing "\n\r" would be expected) - confirm against
 * the upstream file before relying on this listing. */
int_msg:
    .asciz "Unknown interruptnr"
.align 2
ignore_int:
    pushl %eax
    pushl %ecx
    pushl %edx
    push %ds
    push %es
    push %fs
    movl $0x10,%eax
    mov %ax,%ds
    mov %ax,%es
    mov %ax,%fs
    pushl $int_msg
    call _printk
    popl %eax
    pop %fs
    pop %es
    pop %ds
    popl %edx
    popl %ecx
    popl %eax
    iret


/*
 * Setup_paging
 *
 * This routine sets up paging by setting the page bit
 * in cr0. The page tables are set up, identity-mapping
 * the first 16MB. The pager assumes that no illegal
 * addresses are produced (ie >4Mb on a 4Mb machine).
 *
 * NOTE! Although all physical memory should be identity
 * mapped by this routine, only the kernel page functions
 * use the >1Mb addresses directly. All "normal" functions
 * use just the lower 1Mb, or the local data space, which
 * will be mapped to some other place - mm keeps track of
 * that.
 *
 * For those with more memory than 16 Mb - tough luck. I've
 * not got it, why should you :-) The source is here. Change
 * it. (Seriously - it shouldn't be too difficult. Mostly
 * change some constants etc. I left it at 16Mb, as my machine
 * even cannot be extended past that (ok, but it was cheap :-)
 * I've tried to show which constants to change by having
 * some kind of marker at them (search for "16Mb"), but I
 * won't guarantee that's all :-( )
 */
.align 2
setup_paging:
    movl $1024*5,%ecx        /* 5 pages - pg_dir+4 page tables */
    xorl %eax,%eax
    xorl %edi,%edi            /* pg_dir is at 0x000 */
    cld;rep;stosl
    movl $pg0+7,_pg_dir        /* set present bit/user r/w */
    movl $pg1+7,_pg_dir+4        /*  --------- " " --------- */
    movl $pg2+7,_pg_dir+8        /*  --------- " " --------- */
    movl $pg3+7,_pg_dir+12        /*  --------- " " --------- */
    movl $pg3+4092,%edi
    movl $0xfff007,%eax        /*  16Mb - 4096 + 7 (r/w user,p) */
    std
1:    stosl            /* fill pages backwards - more efficient :-) */
    subl $0x1000,%eax
    jge 1b
    xorl %eax,%eax        /* pg_dir is at 0x0000 */
    movl %eax,%cr3        /* cr3 - page directory start */
    movl %cr0,%eax
    orl $0x80000000,%eax
    movl %eax,%cr0        /* set paging (PG) bit */
    ret            /* this also flushes prefetch-queue */

.align 2
.word 0
idt_descr:
    .word 256*8-1        # idt contains 256 entries
    .long _idt
.align 2
.word 0
gdt_descr:
    .word 256*8-1        # so does gdt (not that that's any
    .long _gdt        # magic number, but it works for me :^)

    .align 3
_idt:    .fill 256,8,0        # idt is uninitialized

_gdt:    .quad 0x0000000000000000    /* NULL descriptor */
    .quad 0x00c09a0000000fff    /* 16Mb */
    .quad 0x00c0920000000fff    /* 16Mb */
    .quad 0x0000000000000000    /* TEMPORARY - don't use */
    .fill 252,8,0


5、main.c(WORKING)

/*
 * Kernel entry point in C: sizes memory, runs the subsystem initialisers in a
 * fixed order, enables interrupts, drops to user mode and forks; the child
 * calls init() while the original task loops on pause().
 */
void main(void)        /* This really IS void, no error here. */
{            /* The startup routine assumes (well, ...) this */
/*
 * Interrupts are still disabled. Do necessary setups, then
 * enable them
 */
    ROOT_DEV = ORIG_ROOT_DEV;
    drive_info = DRIVE_INFO;
    /* 1 MB plus the extended-memory size (EXT_MEM_K is shifted from kB to bytes) */
    memory_end = (1<<20) + (EXT_MEM_K<<10);
    memory_end &= 0xfffff000;           /* round down to a 4 kB boundary */
    if (memory_end > 16*1024*1024)      /* never use more than 16 MB */
        memory_end = 16*1024*1024;
    /* the end of the buffer area scales with total memory: 4 MB, 2 MB or 1 MB */
    if (memory_end > 12*1024*1024)
        buffer_memory_end = 4*1024*1024;
    else if (memory_end > 6*1024*1024)
        buffer_memory_end = 2*1024*1024;
    else
        buffer_memory_end = 1*1024*1024;
    main_memory_start = buffer_memory_end;
#ifdef RAMDISK
    /* the ramdisk is placed at the start of main memory, whose start moves up by rd_init()'s return value */
    main_memory_start += rd_init(main_memory_start, RAMDISK*1024);
#endif
    mem_init(main_memory_start,memory_end);
    trap_init();
    blk_dev_init();
    chr_dev_init();
    tty_init();
    time_init();
    sched_init();
    buffer_init(buffer_memory_end);
    hd_init();
    floppy_init();
    sti();                  /* setups done - enable interrupts */
    move_to_user_mode();
    if (!fork()) {        /* we count on this going ok */
        init();            /* child path; init() is analysed in section 三.1 of this article */
    }
/*
 *   NOTE!!   For any other task 'pause()' would mean we have to get a
 * signal to awaken, but task0 is the sole exception (see 'schedule()')
 * as task 0 gets activated at every idle moment (when no other tasks
 * can run). For task0 'pause()' just means we go check if some other
 * task can run, and if not we return here.
 */
    for(;;) pause();
}





信号概述


内核的信号(signal)机制是很重要的,关于信号的定义在/include/signal.h文件内,比如运行一个elf文件可能会出现段错误(SIGSEGV),玩pwn的同学应该很熟悉。在system_call.s中存在call do_signal,而do_signal定义在/kernel/signal.c内。

硬件来源:信号由硬件驱动产生

软件来源:系统提供了些API,例如kill命令

当进程收到信号时,会有三种场景;

忽略:忽略信号

执行:执行每个信号所对应的操作

执行自定操作:用户自定义的操作

① 在系统中什么是信号,都有什么信号?

② 在系统接收到信号后,是如何进行处理的?

③ 信号作用。


1、signal.h

/*
 * signal.h - signal numbers, sigaction flags, the sigaction structure and
 * the prototypes of the signal-related library calls.
 */
#ifndef _SIGNAL_H
#define _SIGNAL_H

#include <sys/types.h>

typedef int sig_atomic_t;
typedef unsigned int sigset_t;        /* 32 bits */

#define _NSIG             32
#define NSIG        _NSIG

#define SIGHUP         1  /* hang-up of the controlling terminal or process */
#define SIGINT         2  /* interrupt from the keyboard */
#define SIGQUIT         3  /* quit from the keyboard */
#define SIGILL         4  /* illegal instruction */
#define SIGTRAP         5  /* trace / breakpoint */
#define SIGABRT         6  /* abnormal termination */
#define SIGIOT         6  /* abnormal termination (same number as SIGABRT) */
#define SIGUNUSED     7  /* unused */
#define SIGFPE         8  /* coprocessor error */
#define SIGKILL         9  /* terminate the process */
#define SIGUSR1        10  /* user signal 1 */
#define SIGSEGV        11  /* segmentation fault */
#define SIGUSR2        12  /* user signal 2 */
#define SIGPIPE        13  /* pipe write error: all readers closed */
#define SIGALRM        14  /* timer alarm */
#define SIGTERM        15  /* process termination */
#define SIGSTKFLT    16  /* stack fault */
#define SIGCHLD        17  /* child process changed state */
#define SIGCONT        18  /* resume a stopped process */
#define SIGSTOP        19  /* stop the process */
#define SIGTSTP        20  /* stop signal issued from the tty */
#define SIGTTIN        21  /* background process requests input */
#define SIGTTOU        22  /* background process requests output */

/* Ok, I haven't implemented sigactions, but trying to keep headers POSIX */
#define SA_NOCLDSTOP    1       /* do not handle SIGCHLD when a child is stopped */
#define SA_NOMASK    0x40000000  /* do not block the signal while its own handler runs */
#define SA_ONESHOT    0x80000000  /* reset to the default handler once the handler has been invoked */

#define SIG_BLOCK          0    /* for blocking signals */
#define SIG_UNBLOCK        1    /* for unblocking signals */
#define SIG_SETMASK        2    /* for setting the signal mask */

#define SIG_DFL        ((void (*)(int))0)    /* default signal handling */
#define SIG_IGN        ((void (*)(int))1)    /* ignore signal */

/* Per-signal handling description. */
struct sigaction {
    void (*sa_handler)(int);       /* action for the signal; may be SIG_DFL or SIG_IGN above */
    sigset_t sa_mask;              /* signals to be blocked while the handler runs */
    int sa_flags;                  /* presumably a combination of the SA_* flags above - TODO confirm */
    void (*sa_restorer)(void);     /* pointer to the restore function */
};

void (*signal(int _sig, void (*_func)(int)))(int);
int raise(int sig);
int kill(pid_t pid, int sig);
int sigaddset(sigset_t *mask, int signo);
int sigdelset(sigset_t *mask, int signo);
int sigemptyset(sigset_t *mask);
int sigfillset(sigset_t *mask);
int sigismember(sigset_t *mask, int signo); /* 1 - is, 0 - not, -1 error */
int sigpending(sigset_t *set);
int sigprocmask(int how, sigset_t *set, sigset_t *oldset);
int sigsuspend(sigset_t *sigmask);
int sigaction(int sig, struct sigaction *act, struct sigaction *oldact);

#endif /* _SIGNAL_H */


2、signal.c

#include <linux/sched.h>
#include <linux/kernel.h>
#include <asm/segment.h>

#include <signal.h>

volatile void do_exit(int error_code);

/* Return the current task's blocked-signal mask. */
int sys_sgetmask()
{
    return current->blocked;
}

/*
 * Install newmask as the current task's blocked mask and return the old one.
 * The SIGKILL bit is always cleared, so SIGKILL cannot be blocked this way.
 */
int sys_ssetmask(int newmask)
{
    int old=current->blocked;

    current->blocked = newmask & ~(1<<(SIGKILL-1));
    return old;
}

/*
 * Copy one struct sigaction byte by byte from `from` to `to`.
 * `to` is first checked with verify_area() and written with put_fs_byte(),
 * i.e. it is presumably a user-space address reached through fs.
 */
static inline void save_old(char * from,char * to)
{
    int i;

    verify_area(to, sizeof(struct sigaction));
    for (i=0 ; i< sizeof(struct sigaction) ; i++) {
        put_fs_byte(*from,to);
        from++;
        to++;
    }
}

/*
 * Copy one struct sigaction byte by byte from `from` (read with
 * get_fs_byte(), presumably a user-space address) into `to`.
 */
static inline void get_new(char * from,char * to)
{
    int i;

    for (i=0 ; i< sizeof(struct sigaction) ; i++)
        *(to++) = get_fs_byte(from++);
}

/*
 * signal() system call.
 *   signum   - signal number (1..32, SIGKILL rejected)
 *   handler  - new handler, as a function pointer cast to long
 *   restorer - restore function pointer, as a long
 * Installs a one-shot, non-masking action and returns the previous handler,
 * or -1 for an invalid signal number.
 */
int sys_signal(int signum, long handler, long restorer)
{
    struct sigaction tmp;

    if (signum<1 || signum>32 || signum==SIGKILL)
        return -1;
    tmp.sa_handler = (void (*)(int)) handler;          /* build the new action */
    tmp.sa_mask = 0;
    tmp.sa_flags = SA_ONESHOT | SA_NOMASK;
    tmp.sa_restorer = (void (*)(void)) restorer;
    handler = (long) current->sigaction[signum-1].sa_handler;  /* remember the old handler */
    current->sigaction[signum-1] = tmp;
    return handler;
}

/*
 * sigaction() system call: install *action for signum and, if oldaction is
 * non-NULL, hand the previous action back through it.
 * Returns 0, or -1 for an invalid signal number.
 */
int sys_sigaction(int signum, const struct sigaction * action,
    struct sigaction * oldaction)
{
    struct sigaction tmp;

    if (signum<1 || signum>32 || signum==SIGKILL)   /* out-of-range number or SIGKILL: refuse */
        return -1;
    tmp = current->sigaction[signum-1];  /* keep a copy of the action being replaced */
    get_new((char *) action,
        (char *) (signum-1+current->sigaction)); /* overwrite the task's slot with the new action */
    if (oldaction)
        save_old((char *) &tmp,(char *) oldaction);  /* copy the saved old action (tmp) out to oldaction */
    if (current->sigaction[signum-1].sa_flags & SA_NOMASK)  /* signal may recur inside its own handler: clear the mask */
        current->sigaction[signum-1].sa_mask = 0;
    else                                                    /* otherwise also block this signal itself */
        current->sigaction[signum-1].sa_mask |= (1<<(signum-1));
    return 0;
}

/*
 * Deliver signal signr to the current task. The remaining parameters are the
 * saved register values; eip and esp are modified in place (through
 * *(&eip) / *(&esp)) so that the return path resumes at the handler with a
 * prepared stack frame.
 *
 * Frame written at the new esp, lowest address first:
 *   sa_restorer, signr, [blocked - only when SA_NOMASK is not set],
 *   eax, ecx, edx, eflags, old eip
 * which is 7 longs with SA_NOMASK and 8 without. ebx, fs, es, ds, cs and ss
 * are not written to the frame.
 */
void do_signal(long signr,long eax, long ebx, long ecx, long edx,
    long fs, long es, long ds,
    long eip, long cs, long eflags,
    unsigned long * esp, long ss)
{
    unsigned long sa_handler;
    long old_eip=eip;  /* remember the interrupted eip */
    struct sigaction * sa = current->sigaction + signr - 1; /* action registered for signal signr */
    int longs;
    unsigned long * tmp_esp;

    sa_handler = (unsigned long) sa->sa_handler; /* handler as an integer */
    if (sa_handler==1)   /* 1 == SIG_IGN: nothing to do */
        return;
    if (!sa_handler) {   /* 0 == SIG_DFL: default handling */
        if (signr==SIGCHLD)   /* default for SIGCHLD is to ignore it */
            return;
        else
            do_exit(1<<(signr-1));   /* default for everything else: terminate the task */
    }
    if (sa->sa_flags & SA_ONESHOT)  /* one-shot: fall back to the default handler next time */
        sa->sa_handler = NULL;
    *(&eip) = sa_handler;      /* replace the saved eip with the handler address */
    longs = (sa->sa_flags & SA_NOMASK)?7:8;  /* frame size in longs (see header comment) */
    *(&esp) -= longs;   /* make room for the frame below the saved esp */
    verify_area(esp,longs*4);
    tmp_esp=esp;   /* write cursor, starting at the new esp */
    put_fs_long((long) sa->sa_restorer,tmp_esp++); /* restorer address goes first */
    put_fs_long(signr,tmp_esp++);   /* then the signal number */
    if (!(sa->sa_flags & SA_NOMASK))
        put_fs_long(current->blocked,tmp_esp++);   /* the blocked mask as it was before this delivery */
    put_fs_long(eax,tmp_esp++);       /* saved registers follow */
    put_fs_long(ecx,tmp_esp++);
    put_fs_long(edx,tmp_esp++);
    put_fs_long(eflags,tmp_esp++);
    put_fs_long(old_eip,tmp_esp++);
    current->blocked |= sa->sa_mask; /* block the action's mask from now on */
}


3、sa_restorer

/* Restore routine used when no blocked mask was saved */
sig_restore:
    addl $4,%esp        /* discard signr */
    popl %eax        /* put the system-call return value back into eax */
    popl %ecx        /* restore ecx and edx */
    popl %edx
    popfl            /* restore eflags */
    ret

/* Restore routine used when a blocked mask was saved */
masksig_restore:
    addl $4,%esp        /* discard signr */
    call ssetmask        /* set the signal mask from the value now on top of the stack */
    addl $4,%esp        /* discard the mask */
    popl %eax
    popl %ecx
    popl %edx
    popfl
    ret





文件系统

顾名思义就是文件所组成的一个系统,linux下所谓“一切皆文件”,所以文件系统在内核中占了很大比重。

Linux启动过程:

① PCB上电后先由uboot初始化板子,然后将linux内核迁移到内存中运行;
② 由linux内核进行初始化操作,挂载第一个应用程序即根文件系统(linuxrc);
③ 根文件系统提供磁盘管理服务(glibc,设备节点,配置文件,应用程序 shell命令)。


1、文件系统概述

文件系统主要包括四个部分:高速缓冲区管理,文件底层操作,文件数据访问,文件高层访问控制。


(1)文件系统底层函数


① bitmap.c

程序包括对i节点位图和逻辑块位图进行释放和占用处理函数。操作i节点位图的函数是free_inode()和new_inode(),操作逻辑块位图的函数是free_block()和new_block()。


② truncate.c

程序包括将数据文件长度截断为0的函数truncate(),它将i节点指定的设备上的文件长度截为0,并释放文件数据占用的设备逻辑块。

③ inode.c

程序包括分配i节点函数iget()和放回对内存i节点存取函数iput()以及根据i节点信息取文件数据块在设备上对应的逻辑块号函数bmap()。


④ namei.c

程序主要包括函数namei(),该函数使用iget(),iput(),bmap()将给定的文件路径名映射到其i节点。

⑤ super.c

程序专门用于处理文件系统超级块,包括函数get_super(),put_super()和free_super()等,还包括几个文件系统加载/卸载处理函数和系统调用,如sys_mount()等。


(2)文件中数据的访问操作


① block_dev.c

程序中的函数block_read()和block_write()用于读写块设备特殊文件的数据,所使用的参数指定要访问的设备号、起始地址和长度。

② file_dev.c

程序中的file_read()和file_write()函数是用于访问一般的文件,所使用的参数指定文件对应的i节点和文件结构。

③ pipe.c

文件中实现了管道读写函数read_pipe()和write_pipe(),另外还实现了创建无名管道的系统调用pipe()。

④ char_dev.c

系统调用使用read()和write()会调用char_dev.c中的rw_char()函数来操作。字符设备包括控制台终端,串口终端和内存字符设备。

(3)文件和目录管理系统调用

① open.c

文件用于实现与文件操作相关的系统调用,主要有文件的创建,打开和关闭,文件宿主和属性修改,文件访问权限和操作时间的修改等。

② exec.c

程序实现对二进制可执行文件和shell脚本文件的加载与执行,其中最主要的是do_execve(),它是系统中断调用(int 0x80)的功能号__NR_execve()所调用的C处理函数,也是exec()函数簇的主要实现函数。


③ fcntl.c

实现了文件控制操作的系统调用fcntl()和两个文件句柄(描述符)复制系统调用dup()和dup2(),dup2()指定了新句柄的数值,dup()则返回当前最小值的未用句柄。句柄复制操作主要用在文件的标准输入/输出重定向和管道操作方面。

④ ioctl.c

文件实现了输入/输出控制系统调用ioctl(),主要调用tty_ioctl()函数,对终端的I/O进行控制。


⑤ stat.c

文件用于实现取得文件状态信息的系统调用,stat()和fstat()。stat()是利用文件名取信息,而fstat()是利用文件句柄取信息。

2、高速缓冲区管理(buffer.c)

高速缓冲区位于内核代码与主内存区之间,在块设备与内核其他程序之间起着一个桥梁作用,除了块设备驱动程序以外,内核程序如果需要访问块设备中的数据,就需要通过高速缓冲区来进行操作。

Linux内核学习笔记


看雪ID:e*16 a

https://bbs.kanxue.com/user-home-922338.htm

*本文由看雪论坛 e*16 a 原创,转载请注明来自看雪社区

Linux内核学习笔记

# 往期推荐

1.CVE-2022-21882提权漏洞学习笔记

2.wibu证书 - 初探

3.win10 1909逆向之APIC中断和实验

4.EMET下EAF机制分析以及模拟实现

5.sql注入学习分享

6.V8 Array.prototype.concat函数出现过的issues和他们的POC们

Linux内核学习笔记

Linux内核学习笔记

球分享

Linux内核学习笔记

球点赞

Linux内核学习笔记

球在看

Linux内核学习笔记

点击“阅读原文”,了解更多!

  • 左青龙
  • 微信扫一扫
  • weinxin
  • 右白虎
  • 微信扫一扫
  • weinxin
admin
  • 本文由 发表于 2023年1月19日23:11:12
  • 转载请保留本文链接(CN-SEC中文网:感谢原作者辛苦付出):
                   Linux内核学习笔记https://cn-sec.com/archives/1522118.html

发表评论

匿名网友 填写信息