...... # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway pushl %eax # save orig_eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys syscall_call: call *sys_call_table(,%eax,4) //此处执行相应的系统调用 movl %eax,PT_EAX(%esp) # store the return value syscall_exit: LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testl $_TIF_ALLWORK_MASK, %ecx # current->work jne syscall_exit_work ......
/** * sys_getpid - return the thread group id of the current process * * Note, despite the name, this returns the tgid not the pid. The tgid and * the pid are identical unless CLONE_THREAD was specified on clone() in * which case the tgid is the same in all threads of the same group. * * This is SMP safe as current->tgid does not change. */ SYSCALL_DEFINE0(getpid) { return task_tgid_vnr(current); }
staticinlineint __must_check request_irq(unsignedint irq, irq_handler_t handler, unsignedlong flags, constchar *name, void *dev) { return request_threaded_irq(irq, handler, NULL, flags, name, dev); } intrequest_threaded_irq(unsignedint irq, irq_handler_t handler, irq_handler_t thread_fn, unsignedlong irqflags, constchar *devname, void *dev_id) { structirqaction *action; structirq_desc *desc; int retval; /* * handle_IRQ_event() always ignores IRQF_DISABLED except for * the _first_ irqaction (sigh). That can cause oopsing, but * the behavior is classified as "will not fix" so we need to * start nudging drivers away from using that idiom. */ if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) == (IRQF_SHARED|IRQF_DISABLED)) { pr_warning("IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n", irq, devname); } #ifdef CONFIG_LOCKDEP /* * Lockdep wants atomic interrupt handlers: */ irqflags |= IRQF_DISABLED; #endif /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out * which interrupt is which (messes up the interrupt freeing * logic etc). */ if ((irqflags & IRQF_SHARED) && !dev_id) return -EINVAL; desc = irq_to_desc(irq); if (!desc) return -EINVAL; if (desc->status & IRQ_NOREQUEST) return -EINVAL; if (!handler) { if (!thread_fn) return -EINVAL; handler = irq_default_primary_handler; } //分配一个irqaction action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->thread_fn = thread_fn; action->flags = irqflags; action->name = devname; action->dev_id = dev_id; chip_bus_lock(irq, desc); //将创建并初始化完在的action加入desc retval = __setup_irq(irq, desc, action); chip_bus_sync_unlock(irq, desc); if (retval) kfree(action); #ifdef CONFIG_DEBUG_SHIRQ if (irqflags & IRQF_SHARED) { /* * It's a shared IRQ -- the driver ought to be prepared for it * to happen immediately, so let's make sure.... * We disable the irq to make sure that a 'real' IRQ doesn't * run in parallel with our fake. */ unsignedlong flags; disable_irq(irq); local_irq_save(flags); handler(irq, dev_id); local_irq_restore(flags); enable_irq(irq); } #endif return retval; }
/* * A very tiny interrupt handler. It runs with IRQF_DISABLED set, * but there is possibility of conflicting with the set_rtc_mmss() * call (the rtc irq and the timer irq can easily run at the same * time in two different CPUs). So we need to serialize * accesses to the chip with the rtc_lock spinlock that each * architecture should implement in the timer code. * (See ./arch/XXXX/kernel/time.c for the set_rtc_mmss() function.) */ staticirqreturn_trtc_interrupt(int irq, void *dev_id) { /* * Can be an alarm interrupt, update complete interrupt, * or a periodic interrupt. We store the status in the * low byte and the number of interrupts received since * the last read in the remainder of rtc_irq_data. */ spin_lock(&rtc_lock); //保证`rtc_irq_data`不被`SMP`机器上其他处理器同时访问 rtc_irq_data += 0x100; rtc_irq_data &= ~0xff; if (is_hpet_enabled()) { /* * In this case it is HPET RTC interrupt handler * calling us, with the interrupt information * passed as arg1, instead of irq. */ rtc_irq_data |= (unsignedlong)irq & 0xF0; } else { rtc_irq_data |= (CMOS_READ(RTC_INTR_FLAGS) & 0xF0); } if (rtc_status & RTC_TIMER_ON) mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); spin_unlock(&rtc_lock); /* Now do the rest of the actions */ spin_lock(&rtc_task_lock); //避免`rtc_callback`出现系统情况,RTC`驱动允许注册一个回调函数在每个`RTC`中断到来时执行。 if (rtc_callback) rtc_callback->func(rtc_callback->private_data); spin_unlock(&rtc_task_lock); wake_up_interruptible(&rtc_wait); kill_fasync(&rtc_async_queue, SIGIO, POLL_IN); return IRQ_HANDLED; }
/** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number * @action: the interrupt action chain for this irq * * Handles the action chain of an irq event */ irqreturn_thandle_IRQ_event(unsignedint irq, struct irqaction *action) { irqreturn_t ret, retval = IRQ_NONE; unsignedint status = 0; //如果没有设置`IRQF_DISABLED,将CPU中断打开,应该尽量避免中断关闭情况,本地中断关闭情况下会导致中断丢失。 if (!(action->flags & IRQF_DISABLED)) local_irq_enable_in_hardirq(); do { //遍历运行中断处理程序 trace_irq_handler_entry(irq, action); ret = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, ret); switch (ret) { case IRQ_WAKE_THREAD: /* * Set result to handled so the spurious check * does not trigger. */ ret = IRQ_HANDLED; /* * Catch drivers which return WAKE_THREAD but * did not set up a thread function */ if (unlikely(!action->thread_fn)) { warn_no_thread(irq, action); break; } /* * Wake up the handler thread for this * action. In case the thread crashed and was * killed we just pretend that we handled the * interrupt. The hardirq handler above has * disabled the device interrupt, so no irq * storm is lurking. */ if (likely(!test_bit(IRQTF_DIED, &action->thread_flags))) { set_bit(IRQTF_RUNTHREAD, &action->thread_flags); wake_up_process(action->thread); } /* Fall through to add to randomness */ case IRQ_HANDLED: status |= action->flags; break; default: break; } retval |= ret; action = action->next; } while (action); if (status & IRQF_SAMPLE_RANDOM) add_interrupt_randomness(irq); local_irq_disable();//关中断 return retval; }
/* PLEASE, avoid to allocate new softirqs, if you need not _really_ high frequency threaded job scheduling. For almost all the purposes tasklets are more than enough. F.e. all serial device BHs et al. should be converted to tasklets, not to softirqs. */ enum { HI_SOFTIRQ=0, TIMER_SOFTIRQ, NET_TX_SOFTIRQ, NET_RX_SOFTIRQ, BLOCK_SOFTIRQ, BLOCK_IOPOLL_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, HRTIMER_SOFTIRQ, RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS };
第二步,编写tasklet处理程序。tasklet处理函数类型是void tasklet_handler(unsigned long data)。因为是靠软中断实现,所以tasklet不能休眠,也就是说不能在tasklet中使用信号量或者其他什么阻塞式的函数。由于tasklet运行时允许响应中断,所以必须做好预防工作,如果新加入的tasklet和中断处理程序之间共享了某些数据额的话。两个相同的tasklet绝不能同时执行,如果新加入的tasklet和其他的tasklet或者软中断共享了数据,就必须要进行适当地锁保护。
staticirqreturn_ttimer_interrupt(int irq, void *dev_id) { /* Keep nmi watchdog up to date */ inc_irq_stat(irq0_irqs); /* Optimized out for !IO_APIC and x86_64 */ if (timer_ack) { /* * Subtle, when I/O APICs are used we have to ack timer IRQ * manually to deassert NMI lines for the watchdog if run * on an 82489DX-based system. */ spin_lock(&i8259A_lock); outb(0x0c, PIC_MASTER_OCW3); /* Ack the IRQ; AEOI will end it automatically. */ inb(PIC_MASTER_POLL); spin_unlock(&i8259A_lock); } //在此处调用体系无关的时钟处理例程 global_clock_event->event_handler(global_clock_event); /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ if (MCA_bus) outb_p(inb_p(0x61)| 0x80, 0x61); return IRQ_HANDLED; }
voidupdate_process_times(int user_tick) { structtask_struct *p = current; int cpu = smp_processor_id(); /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); rcu_check_callbacks(cpu, user_tick); printk_tick(); scheduler_tick(); run_posix_cpu_timers(p); }
signedlong __sched schedule_timeout(signedlong timeout) { structtimer_listtimer; unsignedlong expire; switch (timeout) { case MAX_SCHEDULE_TIMEOUT: /* * These two special cases are useful to be comfortable * in the caller. Nothing more. We could take * MAX_SCHEDULE_TIMEOUT from one of the negative value * but I' d like to return a valid offset (>=0) to allow * the caller to do everything it want with the retval. */ schedule(); goto out; default: /* * Another bit of PARANOID. Note that the retval will be * 0 since no piece of kernel is supposed to do a check * for a negative retval of schedule_timeout() (since it * should never happens anyway). You just have the printk() * that will tell you if something is gone wrong and where. */ if (timeout < 0) { printk(KERN_ERR "schedule_timeout: wrong timeout " "value %lx\n", timeout); dump_stack(); current->state = TASK_RUNNING; goto out; } } expire = timeout + jiffies; //下一行代码设置了超时执行函数`process_timeout()。 setup_timer_on_stack(&timer, process_timeout, (unsignedlong)current); __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); //激活定时器 schedule(); //调度其他新任务 del_singleshot_timer_sync(&timer); /* Remove the timer from the object tracker */ destroy_timer_on_stack(&timer); timeout = expire - jiffies; out: return timeout < 0 ? 0 : timeout; } 当定时器超时时,process_timeout()函数被调用: staticvoidprocess_timeout(unsignedlong __data) { wake_up_process((struct task_struct *)__data); }
进程其实就是程序的执行时的实例,是处于执行期的程序。在Linux内核中,进程列表被存放在一个双向循环链表中,链表中每一项都是类型为task_struct的结构,该结构称作进程描述符,进程描述符包含一个具体进程的所有信息,这个结构就是我们在操作系统中所说的PCB(Process Control Block)。该结构定义于<include/linux/sched.h>文件中:
structtask_struct { volatilelong state; /* -1 unrunnable, 0 runnable, >0 stopped */ void *stack; atomic_t usage; unsignedint flags; /* per process flags, defined below */ unsignedint ptrace; int lock_depth; /* BKL lock depth */ ...... int prio, static_prio, normal_prio; unsignedint rt_priority; conststructsched_class *sched_class; structsched_entityse; structsched_rt_entityrt; ...... structtask_struct *parent;/* recipient of SIGCHLD, wait4() reports */ structlist_headchildren;/* list of my children */ structlist_headsibling;/* linkage in my parent's children list */ ...... };
intsys_fork(struct pt_regs *regs) { returndo_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); } longdo_fork(unsignedlong clone_flags, unsignedlong stack_start, struct pt_regs *regs, unsignedlong stack_size, int __user *parent_tidptr, int __user *child_tidptr) { structtask_struct *p; int trace = 0; long nr; ...... p = copy_process(clone_flags, stack_start, regs, stack_size, child_tidptr, NULL, trace); ...... return nr; } staticstructtask_struct *copy_process(unsignedlong clone_flags, unsignedlong stack_start, struct pt_regs *regs, unsignedlong stack_size, int __user *child_tidptr, struct pid *pid, int trace) { int retval; structtask_struct *p; int cgroup_callbacks_done = 0; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) returnERR_PTR(-EINVAL); /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. */ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) returnERR_PTR(-EINVAL); /* * Shared signal handlers imply shared VM. By way of the above, * thread groups also imply shared VM. Blocking this case allows * for various simplifications in other code. */ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) returnERR_PTR(-EINVAL); /* * Siblings of global init remain as zombies on exit since they are * not reaped by their parent (swapper). To solve this and to avoid * multi-rooted process trees, prevent global and container-inits * from creating siblings. */ if ((clone_flags & CLONE_PARENT) && current->signal->flags & SIGNAL_UNKILLABLE) returnERR_PTR(-EINVAL); retval = security_task_create(clone_flags); if (retval) goto fork_out; retval = -ENOMEM; p = dup_task_struct(current); if (!p) goto fork_out; ftrace_graph_init_task(p); rt_mutex_init_task(p); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; if (atomic_read(&p->real_cred->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && p->real_cred->user != INIT_USER) goto bad_fork_free; } retval = copy_creds(p, clone_flags); if (retval < 0) goto bad_fork_free; /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there * to stop root fork bombs. */ retval = -EAGAIN; if (nr_threads >= max_threads) goto bad_fork_cleanup_count; if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); rcu_copy_process(p); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); p->utime = cputime_zero; p->stime = cputime_zero; p->gtime = cputime_zero; p->utimescaled = cputime_zero; p->stimescaled = cputime_zero; p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; p->default_timer_slack_ns = current->timer_slack_ns; task_io_accounting_init(&p->ioac); acct_clear_integrals(p); posix_cpu_timers_init(p); p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); p->io_context = NULL; p->audit_context = NULL; cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; goto bad_fork_cleanup_cgroup; } mpol_fix_fork_child_flag(p); #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW p->hardirqs_enabled = 1; #else p->hardirqs_enabled = 0; #endif p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; p->hardirq_disable_ip = _THIS_IP_; p->hardirq_disable_event = 0; p->softirqs_enabled = 1; p->softirq_enable_ip = _THIS_IP_; p->softirq_enable_event = 0; p->softirq_disable_ip = 0; p->softirq_disable_event = 0; p->hardirq_context = 0; p->softirq_context = 0; #endif #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; p->lockdep_recursion = 0; #endif #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif p->bts = NULL; p->stack_start = stack_start; /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; if ((retval = audit_alloc(p))) goto bad_fork_cleanup_policy; /* copy all the process information */ if ((retval = copy_semundo(clone_flags, p))) goto bad_fork_cleanup_audit; if ((retval = copy_files(clone_flags, p))) goto bad_fork_cleanup_semundo; if ((retval = copy_fs(clone_flags, p))) goto bad_fork_cleanup_files; if ((retval = copy_sighand(clone_flags, p))) goto bad_fork_cleanup_fs; if ((retval = copy_signal(clone_flags, p))) goto bad_fork_cleanup_sighand; if ((retval = copy_mm(clone_flags, p))) goto bad_fork_cleanup_signal; if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_mm; if ((retval = copy_io(clone_flags, p))) goto bad_fork_cleanup_namespaces; retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_io; if (pid != &init_struct_pid) { retval = -ENOMEM; pid = alloc_pid(p->nsproxy->pid_ns); if (!pid) goto bad_fork_cleanup_io; if (clone_flags & CLONE_NEWPID) { retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); if (retval < 0) goto bad_fork_free_pid; } } p->pid = pid_nr(pid); p->tgid = p->pid; if (clone_flags & CLONE_THREAD) p->tgid = current->tgid; if (current->nsproxy != p->nsproxy) { retval = ns_cgroup_clone(p, pid); if (retval) goto bad_fork_free_pid; } p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; #ifdef CONFIG_FUTEX p->robust_list = NULL; #ifdef CONFIG_COMPAT p->compat_robust_list = NULL; #endif INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; #endif /* * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) p->sas_ss_sp = p->sas_ss_size = 0; /* * Syscall tracing should be turned off in the child regardless * of CLONE_PTRACE. */ clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif clear_all_latency_tracing(p); /* ok, now we should be set up.. */ p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); p->pdeath_signal = 0; p->exit_state = 0; /* * Ok, make it visible to the rest of the system. * We dont wake it up yet. */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible * on the tasklist. */ cgroup_fork_callbacks(p); cgroup_callbacks_done = 1; /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); /* * The task hasn't been attached yet, so its cpus_allowed mask will * not be changed, nor will its assigned CPU. * * The cpus_allowed mask of the parent may have changed after it was * copied first time - so re-copy it here, then check the child's CPU * to ensure it is on a valid CPU (and if not, just force it back to * parent's CPU). This avoids alot of nasty races. */ p->cpus_allowed = current->cpus_allowed; p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || !cpu_online(task_cpu(p)))) set_task_cpu(p, smp_processor_id()); /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; } spin_lock(¤t->sighand->siglock); /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the * fork. Restart if a signal comes in before we add the new process to * it's process group. * A fatal signal pending means that current will exit, so the new * thread can't slip out of an OOM kill (or normal SIGKILL). */ recalc_sigpending(); if (signal_pending(current)) { spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; goto bad_fork_free_pid; } if (clone_flags & CLONE_THREAD) { atomic_inc(¤t->signal->count); atomic_inc(¤t->signal->live); p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); } if (likely(p->pid)) { list_add_tail(&p->sibling, &p->real_parent->children); tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { if (clone_flags & CLONE_NEWPID) p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; tty_kref_put(p->signal->tty); p->signal->tty = tty_kref_get(current->signal->tty); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; } total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); perf_event_fork(p); return p; bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_io: put_io_context(p->io_context); bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) mmput(p->mm); bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) __cleanup_signal(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: exit_fs(p); /* blocking */ bad_fork_cleanup_files: exit_files(p); /* blocking */ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: perf_event_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); exit_creds(p); bad_fork_free: free_task(p); fork_out: returnERR_PTR(retval); }
NORET_TYPE voiddo_exit(long code) { structtask_struct *tsk = current; int group_dead; profile_task_exit(tsk); WARN_ON(atomic_read(&tsk->fs_excl)); //不可在中断上下文中使用该函数 if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); tracehook_report_exit(&code); validate_creds_for_do_exit(tsk); /* * We're taking recursive faults here in do_exit. Safest is to just * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { printk(KERN_ALERT "Fixing recursive fault but reboot is needed!\n"); //设置PF_EXITING:表示进程正在退出 tsk->flags |= PF_EXITPIDONE; set_current_state(TASK_UNINTERRUPTIBLE); schedule(); } exit_irq_thread(); exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against * an exiting task cleaning up the robust pi futexes. */ smp_mb(); spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), preempt_count()); acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); if (unlikely(tsk->audit_context)) audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); //调用__exit_mm()函数放弃进程占用的mm_struct,如果没有别的进程使用它们即没被共享,就彻底释放它们 exit_mm(tsk); if (group_dead) acct_process(); trace_sched_process_exit(tsk); exit_sem(tsk); //调用sem_exit()函数。如果进程排队等候IPC信号,它则离开队列 //分别递减文件描述符,文件系统数据等的引用计数。当引用计数的值为0时,就代表没有进程在使用这些资源,此时就释放 exit_files(tsk); exit_fs(tsk); check_stack_usage(); exit_thread(); cgroup_exit(tsk, 1); if (group_dead && tsk->signal->leader) disassociate_ctty(1); module_put(task_thread_info(tsk)->exec_domain->module); proc_exit_connector(tsk); /* * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ perf_event_exit_task(tsk); //调用exit_notify()向父进程发送信号,将子进程的父进程重新设置为线程组中的其他线程或init进程,并把进程状态设为TASK_ZOMBIE. exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif /* * Make sure we are holding no locks: */ debug_check_no_locks_held(tsk); /* * We can do this unlocked here. The futex code uses this flag * just to verify whether the pi state cleanup has been done * or not. In the worst case it loops once more. */ tsk->flags |= PF_EXITPIDONE; if (tsk->io_context) exit_io_context(); if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); validate_creds_for_do_exit(tsk); preempt_disable(); exit_rcu(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; schedule(); //调用`schedule()切换到其他进程 BUG(); /* Avoid "noreturn function does return". */ for (;;) cpu_relax(); /* For when BUG is null */ }
voidrelease_task(struct task_struct * p) { structtask_struct *leader; int zap_leader; repeat: tracehook_prepare_release_task(p); /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials */ atomic_dec(&__task_cred(p)->user->processes); proc_flush_task(p); write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); __exit_signal(p); //释放目前僵死进程所使用的所有剩余资源,并进行统计记录 zap_leader = 0; leader = p->group_leader; //如果进程是线程组最后一个进程,并且领头进程已经死掉,那么就通知僵死的领头进程的父进程 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { BUG_ON(task_detached(leader)); do_notify_parent(leader, leader->exit_signal); zap_leader = task_detached(leader); if (zap_leader) leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); p = leader; if (unlikely(zap_leader)) goto repeat; }
asmlinkage void __sched schedule(void) { structtask_struct *prev, *next; unsignedlong *switch_count; structrq *rq; int cpu; need_resched: preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_sched_qs(cpu); prev = rq->curr; switch_count = &prev->nivcsw; release_kernel_lock(prev); need_resched_nonpreemptible: schedule_debug(prev); if (sched_feat(HRTICK)) hrtick_clear(rq); spin_lock_irq(&rq->lock); update_rq_clock(rq); clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely(signal_pending_state(prev->state, prev))) prev->state = TASK_RUNNING; else deactivate_task(rq, prev, 1); switch_count = &prev->nvcsw; } pre_schedule(rq, prev); if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); put_prev_task(rq, prev); <strong>next = pick_next_task(rq); //<span><span><span style="font-size:14px;">//挑选最高优先级别的任务</span></span></span></strong> if (likely(prev != next)) { sched_info_switch(prev, next); perf_event_task_sched_out(prev, next, cpu); rq->nr_switches++; rq->curr = next; ++*switch_count; context_switch(rq, prev, next); /* unlocks the rq */ /* * the context switch might have flipped the stack from under * us, hence refresh the local variables. */ cpu = smp_processor_id(); rq = cpu_rq(cpu); } else spin_unlock_irq(&rq->lock); post_schedule(rq); if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; preempt_enable_no_resched(); if (need_resched()) goto need_resched; }
staticinlinestruct task_struct * pick_next_task(struct rq *rq) { conststructsched_class *class; structtask_struct *p; /* * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ if (likely(rq->nr_running == rq->cfs.nr_running)) { p = fair_sched_class.pick_next_task(rq); if (likely(p)) return p; } //从最高优先级类开始,遍历每一个调度类。每一个调度类都实现了pick_next_task,他会返回指向下一个可运行进程的指针,没有时返回NULL。 class = sched_class_highest; for ( ; ; ) { p = class->pick_next_task(rq); if (p) return p; /* * Will never be NULL as the idle class always * returns a non-NULL p: */ class =class->next; } }
void __sched mutex_lock(struct mutex *lock) { might_sleep(); /* * The locking fastpath is the 1->0 transition from * 'unlocked' into 'locked' state. */ __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); mutex_set_owner(lock); } void __sched mutex_unlock(struct mutex *lock) { /* * The unlocking fastpath is the 0->1 transition from 'locked' * into 'unlocked' state: */ #ifndef CONFIG_DEBUG_MUTEXES /* * When debugging is enabled we must not clear the owner before time, * the slow path will always be taken, and that clears the owner field * after verifying that it was indeed current. */ mutex_clear_owner(lock); #endif __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); }
#define preempt_enable() \ do { \ preempt_enable_no_resched(); \ barrier(); \ preempt_check_resched(); \ } while (0)
#define preempt_enable_no_resched() \ do { \ barrier(); \ dec_preempt_count(); \ } while (0)
#define dec_preempt_count() sub_preempt_count(1)
#define sub_preempt_count(val) do { preempt_count() -= (val); } while (0) #define preempt_check_resched() \ do { \ if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ preempt_schedule(); \ } while (0)
asmlinkage void __sched preempt_schedule(void) { structthread_info *ti = current_thread_info(); /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. */ if (likely(ti->preempt_count || irqs_disabled())) return; do { add_preempt_count(PREEMPT_ACTIVE); schedule(); sub_preempt_count(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity * between schedule and now. */ barrier(); } while (need_resched());
内存管理之页的分配与回收
内存管理单元(MMU)负责将管理内存,在把虚拟地址转换为物理地址的硬件的时候是按页为单位进行处理,从虚拟内存的角度来看,页就是内存管理中的最小单位。页的大小与体系结构有关,在 x86 结构中一般是4KB(32位)或者8KB(64位)。 通过 getconf 命令可以查看系统的page的大小:
structkmem_cache { /* 1) per-cpu data, touched during every alloc/free */ structarray_cache *array[NR_CPUS]; /* 2) Cache tunables. Protected by cache_chain_mutex */ unsignedint batchcount; unsignedint limit; unsignedint shared; unsignedint buffer_size; u32 reciprocal_buffer_size; /* 3) touched by every alloc & free from the backend */ unsignedint flags; /* constant flags */ unsignedint num; /* # of objs per slab */ /* 4) cache_grow/shrink */ /* order of pgs per slab (2^n) */ unsignedint gfporder; /* force GFP flags, e.g. GFP_DMA */ gfp_t gfpflags; size_t colour; /* cache colouring range */ unsignedint colour_off; /* colour offset */ structkmem_cache *slabp_cache; unsignedint slab_size; unsignedint dflags; /* dynamic flags */ /* constructor func */ void (*ctor)(void *obj); /* 5) cache creation/removal */ constchar *name; structlist_headnext; /* 6) statistics */ #ifdef CONFIG_DEBUG_SLAB unsignedlong num_active; unsignedlong num_allocations; unsignedlong high_mark; unsignedlong grown; unsignedlong reaped; unsignedlong errors; unsignedlong max_freeable; unsignedlong node_allocs; unsignedlong node_frees; unsignedlong node_overflow; atomic_t allochit; atomic_t allocmiss; atomic_t freehit; atomic_t freemiss; /* * If debugging is enabled, then the allocator can add additional * fields and/or padding to every object. buffer_size contains the total * object size including these internal fields, the following two * variables contain the offset to the user object and its size. */ int obj_offset; int obj_size; #endif/* CONFIG_DEBUG_SLAB */ /* * We put nodelists[] at the end of kmem_cache, because we want to size * this array to nr_node_ids slots instead of MAX_NUMNODES * (see kmem_cache_init()) * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache * is statically defined, so we reserve the max number of nodes. */ structkmem_list3 *nodelists[MAX_NUMNODES]; /* * Do not add fields after nodelists[] */ };
其中struct kmem_list3结构体链接slab,共享高速缓存,其定义如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/* * The slab lists for all objects. */ structkmem_list3 { structlist_headslabs_partial;/* partial list first, better asm code */ structlist_headslabs_full; structlist_headslabs_free; unsignedlong free_objects; unsignedint free_limit; unsignedint colour_next; /* Per-node cache coloring */ spinlock_t list_lock; structarray_cache *shared;/* shared per node */ structarray_cache **alien;/* on other nodes */ unsignedlong next_reap; /* updated without locking */ int free_touched; /* updated without locking */ };
/* * struct slab * * Manages the objs in a slab. Placed either at the beginning of mem allocated * for a slab, or allocated from an general cache. * Slabs are chained into three list: fully used, partial, fully free slabs. */ structslab { structlist_headlist; unsignedlong colouroff; void *s_mem; /* including colour offset */ unsignedint inuse; /* num of objs active in slab */ kmem_bufctl_tfree; unsignedshort nodeid; };
/** * kmem_cache_alloc - Allocate an object * @cachep: The cache to allocate from. * @flags: See kmalloc(). * * Allocate an object from this cache. The flags are only relevant * if the cache has no available objects. */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); trace_kmem_cache_alloc(_RET_IP_, ret, obj_size(cachep), cachep->buffer_size, flags); return ret; }
/** * kfree - free previously allocated memory * @objp: pointer returned by kmalloc. * * If @objp is NULL, no operation is performed. * * Don't free memory not originally allocated by kmalloc() * or you will run into trouble. */ voidkfree(constvoid *objp) { structkmem_cache *c; unsignedlong flags; trace_kfree(_RET_IP_, objp); if (unlikely(ZERO_OR_NULL_PTR(objp))) return; local_irq_save(flags); kfree_debugcheck(objp); c = virt_to_cache(objp); debug_check_no_locks_freed(objp, obj_size(c)); debug_check_no_obj_freed(objp, obj_size(c)); __cache_free(c, (void *)objp); local_irq_restore(flags); }
其中堆栈安排在虚拟地址空间顶部,数据段和代码段分布在虚拟地址空间底部,空洞部分就是进程运行时可以动态分布的空间,包括映射内核地址空间内容、动态申请地址空间、共享库的代码或数据等。在虚拟地址空间中,只有那些映射到物理存储空间的地址才是合法的地址空间。每一片合法的地址空间片段都对应一个独立的虚拟内存区域(VMA,virtual memory areas ),而进程的进程地址空间就是由这些内存区域组成。
Linux 采用了复杂的数据结构来跟踪进程的虚拟地址,进程地址空间使用内存描述符结构体来表示,内存描述符由mm_struct结构体表示,该结构体表示在<include/linux/mm_types.h>文件中:
structmm_struct { structvm_area_struct * mmap;/* list of VMAs */ structrb_rootmm_rb; structvm_area_struct * mmap_cache;/* last find_vma result */ unsignedlong(*get_unmapped_area)(struct file *filp, unsignedlong addr, unsignedlong len, unsignedlong pgoff, unsignedlong flags); void (*unmap_area) (struct mm_struct *mm, unsignedlong addr); unsignedlong mmap_base; /* base of mmap area */ unsignedlong task_size; /* size of task vm space */ unsignedlong cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ unsignedlong free_area_cache; /* first hole of size cached_hole_size or larger */ pgd_t * pgd; atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ int map_count; /* number of VMAs */ structrw_semaphoremmap_sem; spinlock_t page_table_lock; /* Protects page tables and some counters */ structlist_headmmlist;/* List of maybe swapped mm's. These are globally strung together off init_mm.mmlist, and are protected by mmlist_lock */ /* Special counters, in some configurations protected by the * page_table_lock, in other configurations by being atomic. */ mm_counter_t _file_rss; mm_counter_t _anon_rss; unsignedlong hiwater_rss; /* High-watermark of RSS usage */ unsignedlong hiwater_vm; /* High-water virtual memory usage */ unsignedlong total_vm, locked_vm, shared_vm, exec_vm; unsignedlong stack_vm, reserved_vm, def_flags, nr_ptes; unsignedlong start_code, end_code, start_data, end_data; unsignedlong start_brk, brk, start_stack; unsignedlong arg_start, arg_end, env_start, env_end; unsignedlong saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ structlinux_binfmt *binfmt; cpumask_t cpu_vm_mask; /* Architecture-specific MM context */ mm_context_t context; /* Swap token stuff */ /* * Last value of global fault stamp as seen by this process. * In other words, this value gives an indication of how long * it has been since this task got the token. * Look at mm/thrash.c */ unsignedint faultstamp; unsignedint token_priority; unsignedint last_interval; unsignedlong flags; /* Must use atomic bitops to access the bits */ structcore_state *core_state;/* coredumping support */ #ifdef CONFIG_AIO spinlock_t ioctx_lock; structhlist_headioctx_list; #endif #ifdef CONFIG_MM_OWNER /* * "owner" points to a task that is regarded as the canonical * user/owner of this mm. All of the following must be true in * order for it to be changed: * * current == mm->owner * current->mm != mm * new_owner->mm == mm * new_owner->alloc_lock is held */ structtask_struct *owner; #endif #ifdef CONFIG_PROC_FS /* store ref to file /proc/<pid>/exe symlink points to */ structfile *exe_file; unsignedlong num_exe_file_vmas; #endif #ifdef CONFIG_MMU_NOTIFIER structmmu_notifier_mm *mmu_notifier_mm; #endif };
/* * This struct defines a memory VMM memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory * space that has a special rule for the page-fault handlers (ie a shared * library, the executable area etc). */ structvm_area_struct { structmm_struct * vm_mm;/* The address space we belong to. */ unsignedlong vm_start; /* Our start address within vm_mm. */ unsignedlong vm_end; /* The first byte after our end address within vm_mm. */ /* linked list of VM areas per task, sorted by address */ structvm_area_struct *vm_next; pgprot_t vm_page_prot; /* Access permissions of this VMA. */ unsignedlong vm_flags; /* Flags, see mm.h. */ structrb_nodevm_rb; /* * For areas with an address space and backing store, * linkage into the address_space->i_mmap prio tree, or * linkage to the list of like vmas hanging off its node, or * linkage of vma in the address_space->i_mmap_nonlinear list. */ union { struct { structlist_headlist; void *parent; /* aligns with prio_tree_node parent */ structvm_area_struct *head; } vm_set; structraw_prio_tree_nodeprio_tree_node; } shared; /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma * list, after a COW of one of the file pages. A MAP_SHARED vma * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack * or brk vma (with NULL file) can only be in an anon_vma list. */ structlist_headanon_vma_node;/* Serialized by anon_vma->lock */ structanon_vma *anon_vma;/* Serialized by page_table_lock */ /* Function pointers to deal with this struct. */ conststructvm_operations_struct *vm_ops; /* Information about our backing store: */ unsignedlong vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE units, *not* PAGE_CACHE_SIZE */ structfile * vm_file;/* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ unsignedlong vm_truncate_count;/* truncate_count or restart_addr */ #ifndef CONFIG_MMU structvm_region *vm_region;/* NOMMU mapping region */ #endif #ifdef CONFIG_NUMA structmempolicy *vm_policy;/* NUMA policy for the VMA */ #endif };
/* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer * to the functions called when a no-page or a wp-page exception occurs. */ structvm_operations_struct { void (*open)(struct vm_area_struct * area); //指定内存区域被加载到一个地址空间时函数被调用 void (*close)(struct vm_area_struct * area); //指定内存区域从地址空间删除时函数被调用 int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); //没有出现在物理内存中的页面被访问时,页面故障处理调用该函数 /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware */ int (*access)(struct vm_area_struct *vma, unsignedlong addr, void *buf, int len, int write); #ifdef CONFIG_NUMA ...... #endif };
当可执行映像映射到进程的虚拟地址空间时,将产生一组vm_area_struct结构来描述虚拟内存区间的起始点和终止点,每个vm_area_struct结构代表可执行映像的一部分,可能是可执行代码,也可能是初始化的变量或未初始化的数据,这些都是在函数do_mmap()中来实现的。随着vm_area_struct结构的生成,这些结构所描述的虚拟内存区间上的标准操作函数也由 Linux 初始化。