GCD之数据结构及概念向

GCD系列文章之概念向。本文结合开发文档及相关书籍来介绍GCD的概念及数据结构，实现原理会在后面的文章中介绍。

背景

源码及版本号。所有源码均可在苹果开源官网下载。
源码 | 版本
-|-
libdispatch | 1008.250.7
libpthread | 330.250.2
xnu | 6153.11.26

概念

进程

进程是过去时代的产物。现代操作系统，包括OSX和iOS，都只面对线程进行调度。苹果比其他操作系统提供了更为丰富的操作多线程的API，从这个角度来看，它比其他操作系统高了几个档次。

最初，UNIX被设计为一个多进程的操作系统。进程是系统执行的基本单元，而且是执行过程中所需要的各种资源的容器，这些资源包括：虚拟内存、文件描述符以及其他各种对象。开发者编写顺序程序，从入口点main开始执行直到main函数返回，执行过程是序列化的，容易理解。

然而，这种方法很快被证明太刻板，对于需要并发执行的任务来说灵活性太低。另一个原因是大部分进程早晚都会在I/O上阻塞，I/O操作意味着进程时间片中大部分都放弃了。这对性能有很大的影响，因为进程上下文切换的开销很大。

线程

线程，作为最大化利用进程时间片的方法，应运而生：通过使用多个线程，程序的执行可以分割为表面看上去并发执行的子任务。如果一个子任务发生了阻塞，那么剩下的时间片可以分配给另一个子任务。

而CPU当时的发展却是有限的，即使是多线程的代码，一次也只能运行一个线程。进程中线程的抢占开销比多任务系统对进程抢占的开销要小。因此，从这个角度看，大部分操作系统开始将调度策略从进程转移到线程是有意义的。线程之间切换的开销比较小——只需要保存和恢复寄存器即可。而相比起来进程的切换还需要切换虚拟内存空间，其中包含很多底层的开销，例如清空cache和TLB（Translation Lookaside Buffer）。

随着多处理器架构，特别是多核处理器架构的出现，线程焕发了新的生机。突然间，两个线程可以真正地同时运行了。多核处理器更是特别适合线程，因为多个处理器核心共享同样的cache和RAM，这为线程之间的共享虚拟内存提供了基础。相比之下，多处理器架构可能会因为非一致的内存架构和cache一致性方面的原因而损失一些性能。

Mach线程定义

线程定义了Mach中最小的执行单元。线程表示的是底层的机器寄存器状态以及各种调度统计数据，线程的定义在 osfmk/kern/thread.h 中。线程从设计上提供了调度所需要的大量信息，同时又尽可能地维持最小开销。

thread

/*
 * struct thread: the per-thread record.
 *
 * Members are grouped, in declaration order, into: queue linkage, wait and
 * turnstile state, scheduler locks and options, state bits, priorities and
 * real-time parameters, timers and statistics, state saved across a
 * continuation, IPC data, task membership, ports, tracing and performance
 * counters, ledgers, machine-dependent state, policy and QoS overrides.
 *
 * Many members exist only when the surrounding #if condition is true, so the
 * size and layout of this structure depend on the build configuration.
 *
 * The #define lines interleaved with the members are preprocessor constants
 * (mostly bit values for the member declared just above them); they do not
 * add members to the structure.
 */
struct thread {
#if MACH_ASSERT
#define THREAD_MAGIC 0x1234ABCDDCBA4321ULL
    /* Ensure nothing uses &thread as a queue entry */
    uint64_t                thread_magic;
#endif /* MACH_ASSERT */

  
    /* Queue linkage: anonymous union, so these link members share the same storage */
    union {
        queue_chain_t                   runq_links;             /* run queue links */
        queue_chain_t                   wait_links;             /* wait queue links */
        struct mpsc_queue_chain         mpsc_links;             /* thread daemon mpsc links */
        struct priority_queue_entry     wait_prioq_links;       /* priority ordered waitq links */
    };

    processor_t             runq;           /* run queue assignment */

    event64_t               wait_event;     /* wait queue event */
    struct waitq           *waitq;          /* wait queue this thread is enqueued on */
    struct turnstile       *turnstile;      /* thread's turnstile, protected by primitives interlock */
    void                   *inheritor;      /* inheritor of the primitive the thread will block on */
    struct priority_queue  sched_inheritor_queue; /* Inheritor queue for kernel promotion */
    struct priority_queue  base_inheritor_queue; /* Inheritor queue for user promotion */

#if CONFIG_SCHED_CLUTCH
    /*
     * In the clutch scheduler, the threads are maintained in runqs at the clutch_bucket
     * level (clutch_bucket defines a unique thread group and scheduling bucket pair). In
     * order to determine the priority of the clutch bucket as a whole, it is necessary to
     * find the highest thread in it. The thread could be present in the clutch bucket due
     * to its base_pri or its promoted pri. This link is used to maintain that queue.
     */
    struct priority_queue_entry sched_clutchpri_link;

#endif /* CONFIG_SCHED_CLUTCH */

    /* Data updated during assert_wait/thread_wakeup */
#if __SMP__
    decl_simple_lock_data(, sched_lock);     /* scheduling lock (thread_lock()) */
    decl_simple_lock_data(, wake_lock);      /* for thread stop / wait (wake_lock()) */
#endif
    integer_t               options;                        /* options set by thread itself */
#define TH_OPT_INTMASK          0x0003          /* interrupt / abort level */
#define TH_OPT_VMPRIV           0x0004          /* may allocate reserved memory */
#define TH_OPT_SYSTEM_CRITICAL  0x0010          /* Thread must always be allowed to run - even under heavy load */
#define TH_OPT_PROC_CPULIMIT    0x0020          /* Thread has a task-wide CPU limit applied to it */
#define TH_OPT_PRVT_CPULIMIT    0x0040          /* Thread has a thread-private CPU limit applied to it */
#define TH_OPT_IDLE_THREAD      0x0080          /* Thread is a per-processor idle thread */
#define TH_OPT_GLOBAL_FORCED_IDLE       0x0100  /* Thread performs forced idle for thermal control */
#define TH_OPT_SCHED_VM_GROUP   0x0200          /* Thread belongs to special scheduler VM group */
#define TH_OPT_HONOR_QLIMIT     0x0400          /* Thread will honor qlimit while sending mach_msg, regardless of MACH_SEND_ALWAYS */
#define TH_OPT_SEND_IMPORTANCE  0x0800          /* Thread will allow importance donation from kernel rpc */
#define TH_OPT_ZONE_GC          0x1000          /* zone_gc() called on this thread */

    boolean_t                       wake_active;    /* wake event on stop */
    int                                     at_safe_point;  /* thread_abort_safely allowed */
    ast_t                           reason;                 /* why we blocked */
    uint32_t                        quantum_remaining;      /* NOTE(review): presumably the unused part of the scheduling quantum - confirm units */
    wait_result_t                   wait_result;    /* outcome of wait -
                                                    * may be examined by this thread
                                                    * WITHOUT locking */
    thread_continue_t       continuation;   /* continue here next dispatch */
    void                            *parameter;             /* continuation parameter */

    /* Data updated/used in thread_invoke */
    vm_offset_t             kernel_stack;           /* current kernel stack */
    vm_offset_t                     reserved_stack;         /* reserved kernel stack */

#if KASAN
    struct kasan_thread_data kasan_data;
#endif

#if CONFIG_KSANCOV
    void *ksancov_data;
#endif

    /* Thread state: */
    int                                     state;
/*
 *    Thread states [bits or'ed]
 */
#define TH_WAIT                 0x01                    /* queued for waiting */
#define TH_SUSP                 0x02                    /* stopped or requested to stop */
#define TH_RUN                  0x04                    /* running or on runq */
#define TH_UNINT                0x08                    /* waiting uninterruptibly */
#define TH_TERMINATE    0x10                    /* halted at termination */
#define TH_TERMINATE2   0x20                    /* added to termination queue */
#define TH_WAIT_REPORT  0x40                    /* the wait is using the sched_call,
                                             *                                  only set if TH_WAIT is also set */
#define TH_IDLE                 0x80                    /* idling processor */

    /* Scheduling information */
    sched_mode_t                    sched_mode;             /* scheduling mode */
    sched_mode_t                    saved_mode;             /* saved mode during forced mode demotion */

    /* This thread's contribution to global sched counters */
    sched_bucket_t                  th_sched_bucket;

    sfi_class_id_t                  sfi_class;              /* SFI class (XXX Updated on CSW/QE/AST) */
    sfi_class_id_t                  sfi_wait_class; /* Currently in SFI wait for this class, protected by sfi_lock */


    uint32_t                        sched_flags;            /* current flag bits */
#define TH_SFLAG_NO_SMT                 0x0001          /* On an SMT CPU, this thread must be scheduled alone */
#define TH_SFLAG_FAILSAFE               0x0002          /* fail-safe has tripped */
#define TH_SFLAG_THROTTLED              0x0004          /* throttled thread forced to timeshare mode (may be applied in addition to failsafe) */
#define TH_SFLAG_DEMOTED_MASK      (TH_SFLAG_THROTTLED | TH_SFLAG_FAILSAFE)     /* saved_mode contains previous sched_mode */

#define TH_SFLAG_PROMOTED               0x0008          /* sched pri has been promoted by kernel mutex priority promotion */
#define TH_SFLAG_ABORT                  0x0010          /* abort interruptible waits */
#define TH_SFLAG_ABORTSAFELY            0x0020          /* ... but only those at safe point */
#define TH_SFLAG_ABORTED_MASK           (TH_SFLAG_ABORT | TH_SFLAG_ABORTSAFELY)
#define TH_SFLAG_DEPRESS                0x0040          /* normal depress yield */
#define TH_SFLAG_POLLDEPRESS            0x0080          /* polled depress yield */
#define TH_SFLAG_DEPRESSED_MASK         (TH_SFLAG_DEPRESS | TH_SFLAG_POLLDEPRESS)
/* unused TH_SFLAG_PRI_UPDATE           0x0100 */
#define TH_SFLAG_EAGERPREEMPT           0x0200          /* Any preemption of this thread should be treated as if AST_URGENT applied */
#define TH_SFLAG_RW_PROMOTED            0x0400          /* promote reason: blocking with RW lock held */
#define TH_SFLAG_BASE_PRI_FROZEN        0x0800          /* (effective) base_pri is frozen */
#define TH_SFLAG_WAITQ_PROMOTED         0x1000          /* promote reason: waitq wakeup (generally for IPC receive) */


#define TH_SFLAG_EXEC_PROMOTED          0x8000          /* promote reason: thread is in an exec */

/* 'promote reasons' that request a priority floor only, not a custom priority */
#define TH_SFLAG_PROMOTE_REASON_MASK    (TH_SFLAG_RW_PROMOTED | TH_SFLAG_WAITQ_PROMOTED | TH_SFLAG_EXEC_PROMOTED)

/* Bit index of TH_SFLAG_RW_PROMOTED: (1 << 10) == 0x400 */
#define TH_SFLAG_RW_PROMOTED_BIT        (10)    /* 0x400 */

    int16_t                         sched_pri;              /* scheduled (current) priority */
    int16_t                         base_pri;               /* effective base priority (equal to req_base_pri unless TH_SFLAG_BASE_PRI_FROZEN) */
    int16_t                         req_base_pri;           /* requested base priority */
    int16_t                         max_priority;           /* copy of max base priority */
    int16_t                         task_priority;          /* copy of task base priority */
    int16_t                         promotion_priority;     /* priority thread is currently promoted to */

#if defined(CONFIG_SCHED_GRRR)
    /* The inner "#if 0" means grrr_deficit is never compiled in, even with CONFIG_SCHED_GRRR */
#if 0
    uint16_t                        grrr_deficit;           /* fixed point (1/1000th quantum) fractional deficit */
#endif
#endif

    int                             iotier_override; /* atomic operations to set, cleared on ret to user */
    os_refcnt_t                     ref_count;              /* number of references to me */

    lck_mtx_t*                      waiting_for_mutex;      /* points to mutex we're waiting for until we acquire it */

    uint32_t                        rwlock_count;   /* Number of lck_rw_t locks held by thread */

    integer_t                       importance;                     /* task-relative importance */

    /* Priority depression expiration */
    integer_t                       depress_timer_active;
    timer_call_data_t       depress_timer;
    /* real-time parameters */
    struct {                                                                /* see mach/thread_policy.h */
        uint32_t                        period;
        uint32_t                        computation;
        uint32_t                        constraint;
        boolean_t                       preemptible;
        uint64_t                        deadline;
    }                                       realtime;

    uint64_t                        last_run_time;          /* time when thread was switched away from */
    uint64_t                        last_made_runnable_time;        /* time when thread was unblocked or preempted */
    uint64_t                        last_basepri_change_time;       /* time when thread was last changed in basepri while runnable */
    uint64_t                        same_pri_latency;
/* All 64 bits set. NOTE(review): presumably a sentinel stored in the timestamps above - confirm against users */
#define THREAD_NOT_RUNNABLE (~0ULL)


#if defined(CONFIG_SCHED_MULTIQ)
    sched_group_t                   sched_group;
#endif /* defined(CONFIG_SCHED_MULTIQ) */

    /* Data used during setrun/dispatch */
    timer_data_t            system_timer;           /* system mode timer */
    processor_t                     bound_processor;        /* bound to a processor? */
    processor_t                     last_processor;         /* processor last dispatched on */
    processor_t                     chosen_processor;       /* Where we want to run this thread */

    /* Fail-safe computation since last unblock or qualifying yield */
    uint64_t                        computation_metered;
    uint64_t                        computation_epoch;
    uint64_t                        safe_release;   /* when to release fail-safe */

    /* Call out from scheduler: function pointer taking (type, thread), returning nothing */
    void                            (*sched_call)(
        int                     type,
        thread_t        thread);
#if defined(CONFIG_SCHED_PROTO)
    uint32_t                        runqueue_generation;    /* last time runqueue was drained */
#endif

    /* Statistics and timesharing calculations */
#if defined(CONFIG_SCHED_TIMESHARE_CORE)
    natural_t                       sched_stamp;    /* last scheduler tick */
    natural_t                       sched_usage;    /* timesharing cpu usage [sched] */
    natural_t                       pri_shift;              /* usage -> priority from pset */
    natural_t                       cpu_usage;              /* instrumented cpu usage [%cpu] */
    natural_t                       cpu_delta;              /* accumulated cpu_usage delta */
#endif /* CONFIG_SCHED_TIMESHARE_CORE */

    uint32_t                        c_switch;               /* total context switches */
    uint32_t                        p_switch;               /* total processor switches */
    uint32_t                        ps_switch;              /* total pset switches */

    integer_t mutex_count;  /* total count of locks held */
    /* Timing data structures */
    int                                     precise_user_kernel_time; /* precise user/kernel enabled for this thread */
    timer_data_t            user_timer;                     /* user mode timer */
    uint64_t                        user_timer_save;        /* saved user timer value */
    uint64_t                        system_timer_save;      /* saved system timer value */
    uint64_t                        vtimer_user_save;       /* saved values for vtimers */
    uint64_t                        vtimer_prof_save;
    uint64_t                        vtimer_rlim_save;
    uint64_t                        vtimer_qos_save;

    timer_data_t            ptime;                  /* time executing in P mode */
    timer_data_t            runnable_timer;         /* time the thread is runnable (including running) */

#if CONFIG_SCHED_SFI
    /* Timing for wait state */
    uint64_t                wait_sfi_begin_time;    /* start time for thread waiting in SFI */
#endif

    /* Timed wait expiration */
    timer_call_data_t       wait_timer;
    integer_t                       wait_timer_active;
    boolean_t                       wait_timer_is_set;


    /*
     * Processor/cache affinity
     * - affinity_threads links task threads with the same affinity set
     */
    affinity_set_t                  affinity_set;
    queue_chain_t                   affinity_threads;

    /*
     * Various bits of state to stash across a continuation, exclusive to the current thread block point.
     * The receive, sema and iokit variants are union members and therefore overlay one another.
     */
    union {
        struct {
            mach_msg_return_t       state;          /* receive state */
            mach_port_seqno_t       seqno;          /* seqno of recvd message */
            ipc_object_t            object;         /* object received on */
            mach_vm_address_t       msg_addr;       /* receive buffer pointer */
            mach_msg_size_t         rsize;          /* max size for recvd msg */
            mach_msg_size_t         msize;          /* actual size for recvd msg */
            mach_msg_option_t       option;         /* options for receive */
            mach_port_name_t        receiver_name;  /* the receive port name */
            struct knote            *knote;         /* knote fired for rcv */
            union {
                struct ipc_kmsg   *kmsg;        /* received message */
                struct ipc_mqueue *peekq;       /* mqueue to peek at */
                struct {
                    mach_msg_priority_t qos;        /* received message qos */
                    mach_msg_priority_t oqos;       /* override qos for message */
                } received_qos;
            };
            mach_msg_continue_t     continuation;
        } receive;
        struct {
            struct semaphore        *waitsemaphore;         /* semaphore ref */
            struct semaphore        *signalsemaphore;       /* semaphore ref */
            int                                     options;                        /* semaphore options */
            kern_return_t           result;                         /* primary result */
            mach_msg_continue_t continuation;
        } sema;
        struct {
#define THREAD_SAVE_IOKIT_TLS_COUNT     8
            void                    *tls[THREAD_SAVE_IOKIT_TLS_COUNT];
        } iokit;
    } saved;

    /* Only user threads can cause guard exceptions, only kernel threads can be thread call threads */
    union {
        /* Group and call this thread is working on behalf of */
        struct {
            struct thread_call_group * thc_group;
            struct thread_call *       thc_call;                    /* debug only, may be deallocated */
        } thc_state;

        /* Structure to save information about guard exception */
        struct {
            mach_exception_code_t           code;
            mach_exception_subcode_t        subcode;
        } guard_exc_info;
    };

    /* Kernel holds on this thread  */
    int16_t                                         suspend_count;
    /* User level suspensions */
    int16_t                                         user_stop_count;

    /* IPC data structures */
#if IMPORTANCE_INHERITANCE
    natural_t ith_assertions;                       /* assertions pending drop */
#endif
    struct ipc_kmsg_queue ith_messages;             /* messages to reap */
    mach_port_t ith_rpc_reply;                      /* reply port for kernel RPCs */

    /* Ast/Halt data structures */
    vm_offset_t                                     recover;                /* page fault recover(copyin/out) */

    queue_chain_t                           threads;                /* global list of all threads */

    /* Activation */
    queue_chain_t                   task_threads;

    /* Task membership */
    struct task                             *task;
    vm_map_t                                map;
#if DEVELOPMENT || DEBUG
    boolean_t pmap_footprint_suspended;
#endif /* DEVELOPMENT || DEBUG */

    decl_lck_mtx_data(, mutex);


    /* Pending thread ast(s) */
    ast_t                                   ast;

    /* Miscellaneous bits guarded by mutex */
    uint32_t
        active:1,                                   /* Thread is active and has not been terminated */
        started:1,                                  /* Thread has been started after creation */
        static_param:1,                             /* Disallow policy parameter changes */
        inspection:1,                               /* TRUE when task is being inspected by crash reporter */
        policy_reset:1,                             /* Disallow policy parameter changes on terminating threads */
        suspend_parked:1,                           /* thread parked in thread_suspended */
        corpse_dup:1,                               /* TRUE when thread is an inactive duplicate in a corpse */
    :0;                                             /* unnamed zero-width bit-field: no further bit-fields are packed into this unit */

    /* Ports associated with this thread */
    struct ipc_port                 *ith_self;                      /* not a right, doesn't hold ref */
    struct ipc_port                 *ith_sself;                     /* a send right */
    struct ipc_port                 *ith_special_reply_port;         /* ref to special reply port */
    struct exception_action *exc_actions;

#ifdef  MACH_BSD
    void                                    *uthread;
#endif

#if CONFIG_DTRACE
    uint16_t t_dtrace_flags;                /* DTrace thread states */
#define TH_DTRACE_EXECSUCCESS   0x01
    uint16_t t_dtrace_inprobe;          /* Executing under dtrace_probe */
    uint32_t t_dtrace_predcache;        /* DTrace per thread predicate value hint */
    int64_t t_dtrace_tracing;               /* Thread time under dtrace_probe() */
    int64_t t_dtrace_vtime;
#endif

    clock_sec_t t_page_creation_time;
    uint32_t    t_page_creation_count;
    uint32_t    t_page_creation_throttled;
#if (DEVELOPMENT || DEBUG)
    uint64_t    t_page_creation_throttled_hard;
    uint64_t    t_page_creation_throttled_soft;
#endif /* DEVELOPMENT || DEBUG */
    int         t_pagein_error;            /* for vm_fault(), holds error from vnop_pagein() */

#ifdef KPERF
/* The high 8 bits are the number of frames to sample of a user callstack. */
#define T_KPERF_CALLSTACK_DEPTH_OFFSET     (24)
#define T_KPERF_SET_CALLSTACK_DEPTH(DEPTH) (((uint32_t)(DEPTH)) << T_KPERF_CALLSTACK_DEPTH_OFFSET)
#define T_KPERF_GET_CALLSTACK_DEPTH(FLAGS) ((FLAGS) >> T_KPERF_CALLSTACK_DEPTH_OFFSET)
#endif

/* Flag bits below are defined regardless of KPERF; the kperf_flags member itself exists only under KPERF */
#define T_KPERF_AST_CALLSTACK (1U << 0) /* dump a callstack on thread's next AST */
#define T_KPERF_AST_DISPATCH  (1U << 1) /* dump a name on thread's next AST */
#define T_KPC_ALLOC           (1U << 2) /* thread needs a kpc_buf allocated */
/* only go up to T_KPERF_CALLSTACK_DEPTH_OFFSET - 1 */

#ifdef KPERF
    uint32_t kperf_flags;
    uint32_t kperf_pet_gen;  /* last generation of PET that sampled this thread*/
    uint32_t kperf_c_switch; /* last dispatch detection */
    uint32_t kperf_pet_cnt;  /* how many times a thread has been sampled by PET */
#endif

#ifdef KPC
    /* accumulated performance counters for this thread */
    uint64_t *kpc_buf;
#endif

#if HYPERVISOR
    /* hypervisor virtual CPU object associated with this thread */
    void *hv_thread_target;
#endif /* HYPERVISOR */

    uint64_t thread_id;             /*system wide unique thread-id*/

    /* Statistics accumulated per-thread and aggregated per-task */
    uint32_t                syscalls_unix;
    uint32_t                syscalls_mach;
    ledger_t                t_ledger;
    ledger_t                t_threadledger; /* per thread ledger */
    ledger_t                t_bankledger;                /* ledger to charge someone */
    uint64_t                t_deduct_bank_ledger_time;   /* cpu time to be deducted from bank ledger */
    uint64_t                t_deduct_bank_ledger_energy; /* energy to be deducted from bank ledger */

#if MONOTONIC
    struct mt_thread t_monotonic;
#endif /* MONOTONIC */

    /*** Machine-dependent state ***/
    struct machine_thread   machine;

    /* policy is protected by the thread mutex */
    struct thread_requested_policy  requested_policy;
    struct thread_effective_policy  effective_policy;

    /*
     * usynch override is protected by the task lock, eventually will be thread mutex.
     * Each node points to the next one through override_next, forming a singly linked list.
     */
    struct thread_qos_override {
        struct thread_qos_override      *override_next;
        uint32_t        override_contended_resource_count;
        int16_t         override_qos;
        int16_t         override_resource_type;
        user_addr_t     override_resource;
    } *overrides;

    uint32_t        kevent_overrides;
    uint16_t        user_promotion_basepri;
    uint16_t         kern_promotion_schedpri;
    _Atomic uint16_t kevent_ast_bits;

    io_stat_info_t                  thread_io_stats; /* per-thread I/O statistics */

#if CONFIG_EMBEDDED
    task_watch_t *  taskwatch;              /* task watch */
#endif /* CONFIG_EMBEDDED */

    uint32_t                        thread_callout_interrupt_wakeups;
    uint32_t                        thread_callout_platform_idle_wakeups;
    uint32_t                        thread_timer_wakeups_bin_1;
    uint32_t                        thread_timer_wakeups_bin_2;
    uint16_t                        thread_tag;
    /*
     * callout_* fields are only set for thread call threads whereas guard_exc_fatal is set
     * by user threads on themselves while taking a guard exception. So it's okay for them to
     * share this bitfield.
     */
    uint16_t                        callout_woken_from_icontext:1,
        callout_woken_from_platform_idle:1,
        callout_woke_thread:1,
        guard_exc_fatal:1,
        thread_bitfield_unused:12;      /* 4 flag bits + 12 unused bits = 16 bits in total */

    mach_port_name_t                ith_voucher_name;
    ipc_voucher_t                   ith_voucher;
#if CONFIG_IOSCHED
    void                            *decmp_upl;
#endif /* CONFIG_IOSCHED */

    /* work interval (if any) associated with the thread. Uses thread mutex */
    struct work_interval            *th_work_interval;

#if     SCHED_TRACE_THREAD_WAKEUPS
    uintptr_t               thread_wakeup_bt[64];
#endif
    turnstile_update_flags_t inheritor_flags; /* inheritor flags for inheritor field */
    block_hint_t    pending_block_hint;
    block_hint_t    block_hint;      /* What type of primitive last caused us to block. */
    integer_t       decompressions;  /* Per-thread decompressions counter to be added to per-task decompressions counter */
};