.. Copyright Kenneth Lee, 2019-2020
:Authors: Kenneth Lee
:Version: 1.0
Strong and Weak in Unit Testing
I recently looked at the unit tests of a project and came away deeply dissatisfied: the unit-test framework was built too well.
A unit-test framework must not be built too well. The protagonist of a unit test is the code under test, not the framework; build the framework too well and the effectiveness of the tests suffers badly.
It is like building a house: if the scaffolding is put up more beautifully than the house itself, can the house possibly turn out well?
Let us look at some details. Say you are testing a Linux kernel module that includes <linux/sched.h> and uses the struct task_struct {} defined there. How should you stub it? Some people will grab a Linux release and copy the entire structure in::
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
* For reasons of header soup (see current_thread_info()), this
* must be the first element of task_struct.
*/
struct thread_info thread_info;
#endif
/* -1 unrunnable, 0 runnable, >0 stopped: */
volatile long state;
/*
* This begins the randomizable portion of task_struct. Only
* scheduling-critical items should be added above here.
*/
randomized_struct_fields_start
void *stack;
atomic_t usage;
/* Per task flags (PF_*), defined further below: */
unsigned int flags;
unsigned int ptrace;
#ifdef CONFIG_SMP
struct llist_node wake_entry;
int on_cpu;
#ifdef CONFIG_THREAD_INFO_IN_TASK
/* Current CPU: */
unsigned int cpu;
#endif
unsigned int wakee_flips;
unsigned long wakee_flip_decay_ts;
struct task_struct *last_wakee;
/*
* recent_used_cpu is initially set as the last CPU used by a task
* that wakes affine another task. Waker/wakee relationships can
* push tasks around a CPU where each wakeup moves to the next one.
* Tracking a recently used CPU allows a quick search for a recently
* used CPU that may be idle.
*/
int recent_used_cpu;
int wake_cpu;
#endif
int on_rq;
int prio;
int static_prio;
int normal_prio;
unsigned int rt_priority;
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
struct sched_dl_entity dl;
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* List of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
#endif
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif
unsigned int policy;
int nr_cpus_allowed;
cpumask_t cpus_allowed;
#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
union rcu_special rcu_read_unlock_special;
struct list_head rcu_node_entry;
struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
unsigned long rcu_tasks_nvcsw;
u8 rcu_tasks_holdout;
u8 rcu_tasks_idx;
int rcu_tasks_idle_cpu;
struct list_head rcu_tasks_holdout_list;
#endif /* #ifdef CONFIG_TASKS_RCU */
struct sched_info sched_info;
struct list_head tasks;
#ifdef CONFIG_SMP
struct plist_node pushable_tasks;
struct rb_node pushable_dl_tasks;
#endif
struct mm_struct *mm;
struct mm_struct *active_mm;
/* Per-thread vma caching: */
struct vmacache vmacache;
#ifdef SPLIT_RSS_COUNTING
struct task_rss_stat rss_stat;
#endif
int exit_state;
int exit_code;
int exit_signal;
/* The signal sent when the parent dies: */
int pdeath_signal;
/* JOBCTL_*, siglock protected: */
unsigned long jobctl;
/* Used for emulating ABI behavior of previous Linux versions: */
unsigned int personality;
/* Scheduler bits, serialized by scheduler locks: */
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
unsigned sched_remote_wakeup:1;
#ifdef CONFIG_PSI
unsigned sched_psi_wake_requeue:1;
#endif
/* Force alignment to the next boundary: */
unsigned :0;
/* Unserialized, strictly 'current' */
/* Bit to tell LSMs we're in execve(): */
unsigned in_execve:1;
unsigned in_iowait:1;
#ifndef TIF_RESTORE_SIGMASK
unsigned restore_sigmask:1;
#endif
#ifdef CONFIG_MEMCG
unsigned in_user_fault:1;
#endif
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
#endif
#ifdef CONFIG_CGROUPS
/* disallow userland-initiated cgroup migration */
unsigned no_cgroup_migration:1;
#endif
#ifdef CONFIG_BLK_CGROUP
/* to be used once the psi infrastructure lands upstream. */
unsigned use_memdelay:1;
#endif
/*
* May usercopy functions fault on kernel addresses?
* This is not just a single bit because this can potentially nest.
*/
unsigned int kernel_uaccess_faults_ok;
unsigned long atomic_flags; /* Flags requiring atomic access. */
struct restart_block restart_block;
pid_t pid;
pid_t tgid;
#ifdef CONFIG_STACKPROTECTOR
/* Canary value for the -fstack-protector GCC feature: */
unsigned long stack_canary;
#endif
/*
* Pointers to the (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
* p->real_parent->pid)
*/
/* Real parent process: */
struct task_struct __rcu *real_parent;
/* Recipient of SIGCHLD, wait4() reports: */
struct task_struct __rcu *parent;
/*
* Children/sibling form the list of natural children:
*/
struct list_head children;
struct list_head sibling;
struct task_struct *group_leader;
/*
* 'ptraced' is the list of tasks this task is using ptrace() on.
*
* This includes both natural children and PTRACE_ATTACH targets.
* 'ptrace_entry' is this task's link on the p->parent->ptraced list.
*/
struct list_head ptraced;
struct list_head ptrace_entry;
/* PID/PID hash table linkage. */
struct pid *thread_pid;
struct hlist_node pid_links[PIDTYPE_MAX];
struct list_head thread_group;
struct list_head thread_node;
struct completion *vfork_done;
/* CLONE_CHILD_SETTID: */
int __user *set_child_tid;
/* CLONE_CHILD_CLEARTID: */
int __user *clear_child_tid;
u64 utime;
u64 stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
u64 utimescaled;
u64 stimescaled;
#endif
u64 gtime;
struct prev_cputime prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
struct vtime vtime;
#endif
#ifdef CONFIG_NO_HZ_FULL
atomic_t tick_dep_mask;
#endif
/* Context switch counts: */
unsigned long nvcsw;
unsigned long nivcsw;
/* Monotonic time in nsecs: */
u64 start_time;
/* Boot based time in nsecs: */
u64 real_start_time;
/* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
unsigned long min_flt;
unsigned long maj_flt;
#ifdef CONFIG_POSIX_TIMERS
struct task_cputime cputime_expires;
struct list_head cpu_timers[3];
#endif
/* Process credentials: */
/* Tracer's credentials at attach: */
const struct cred __rcu *ptracer_cred;
/* Objective and real subjective task credentials (COW): */
const struct cred __rcu *real_cred;
/* Effective (overridable) subjective task credentials (COW): */
const struct cred __rcu *cred;
/*
* executable name, excluding path.
*
* - normally initialized setup_new_exec()
* - access it with [gs]et_task_comm()
* - lock it with task_lock()
*/
char comm[TASK_COMM_LEN];
struct nameidata *nameidata;
#ifdef CONFIG_SYSVIPC
struct sysv_sem sysvsem;
struct sysv_shm sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
unsigned long last_switch_count;
unsigned long last_switch_time;
#endif
/* Filesystem information: */
struct fs_struct *fs;
/* Open file information: */
struct files_struct *files;
/* Namespaces: */
struct nsproxy *nsproxy;
/* Signal handlers: */
struct signal_struct *signal;
struct sighand_struct *sighand;
sigset_t blocked;
sigset_t real_blocked;
/* Restored if set_restore_sigmask() was used: */
sigset_t saved_sigmask;
struct sigpending pending;
unsigned long sas_ss_sp;
size_t sas_ss_size;
unsigned int sas_ss_flags;
struct callback_head *task_works;
struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
kuid_t loginuid;
unsigned int sessionid;
#endif
struct seccomp seccomp;
/* Thread group tracking: */
u32 parent_exec_id;
u32 self_exec_id;
/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
spinlock_t alloc_lock;
/* Protection of the PI data structures: */
raw_spinlock_t pi_lock;
struct wake_q_node wake_q;
#ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task: */
struct rb_root_cached pi_waiters;
/* Updated under owner's pi_lock and rq lock */
struct task_struct *pi_top_task;
/* Deadlock detection and priority inheritance handling: */
struct rt_mutex_waiter *pi_blocked_on;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
/* Mutex deadlock detection: */
struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
unsigned int irq_events;
unsigned long hardirq_enable_ip;
unsigned long hardirq_disable_ip;
unsigned int hardirq_enable_event;
unsigned int hardirq_disable_event;
int hardirqs_enabled;
int hardirq_context;
unsigned long softirq_disable_ip;
unsigned long softirq_enable_ip;
unsigned int softirq_disable_event;
unsigned int softirq_enable_event;
int softirqs_enabled;
int softirq_context;
#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL
u64 curr_chain_key;
int lockdep_depth;
unsigned int lockdep_recursion;
struct held_lock held_locks[MAX_LOCK_DEPTH];
#endif
#ifdef CONFIG_UBSAN
unsigned int in_ubsan;
#endif
/* Journalling filesystem info: */
void *journal_info;
/* Stacked block device info: */
struct bio_list *bio_list;
#ifdef CONFIG_BLOCK
/* Stack plugging: */
struct blk_plug *plug;
#endif
/* VM state: */
struct reclaim_state *reclaim_state;
struct backing_dev_info *backing_dev_info;
struct io_context *io_context;
/* Ptrace state: */
unsigned long ptrace_message;
kernel_siginfo_t *last_siginfo;
struct task_io_accounting ioac;
#ifdef CONFIG_PSI
/* Pressure stall state */
unsigned int psi_flags;
#endif
#ifdef CONFIG_TASK_XACCT
/* Accumulated RSS usage: */
u64 acct_rss_mem1;
/* Accumulated virtual memory usage: */
u64 acct_vm_mem1;
/* stime + utime since last update: */
u64 acct_timexpd;
#endif
#ifdef CONFIG_CPUSETS
/* Protected by ->alloc_lock: */
nodemask_t mems_allowed;
/* Seqence number to catch updates: */
seqcount_t mems_allowed_seq;
int cpuset_mem_spread_rotor;
int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock: */
struct css_set __rcu *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock: */
struct list_head cg_list;
#endif
#ifdef CONFIG_INTEL_RDT
u32 closid;
u32 rmid;
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;
#endif
#ifdef CONFIG_PERF_EVENTS
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
unsigned long preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
/* Protected by alloc_lock: */
struct mempolicy *mempolicy;
short il_prev;
short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
int numa_scan_seq;
unsigned int numa_scan_period;
unsigned int numa_scan_period_max;
int numa_preferred_nid;
unsigned long numa_migrate_retry;
/* Migration stamp: */
u64 node_stamp;
u64 last_task_numa_placement;
u64 last_sum_exec_runtime;
struct callback_head numa_work;
struct numa_group *numa_group;
/*
* numa_faults is an array split into four regions:
* faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
* in this precise order.
*
* faults_memory: Exponential decaying average of faults on a per-node
* basis. Scheduling placement decisions are made based on these
* counts. The values remain static for the duration of a PTE scan.
* faults_cpu: Track the nodes the process was running on when a NUMA
* hinting fault was incurred.
* faults_memory_buffer and faults_cpu_buffer: Record faults per node
* during the current scan window. When the scan completes, the counts
* in faults_memory and faults_cpu decay and these values are copied.
*/
unsigned long *numa_faults;
unsigned long total_numa_faults;
/*
* numa_faults_locality tracks if faults recorded during the last
* scan window were remote/local or failed to migrate. The task scan
* period is adapted based on the locality of the faults with different
* weights depending on whether they were shared or private faults
*/
unsigned long numa_faults_locality[3];
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_RSEQ
struct rseq __user *rseq;
u32 rseq_len;
u32 rseq_sig;
/*
* RmW on rseq_event_mask must be performed atomically
* with respect to preemption.
*/
unsigned long rseq_event_mask;
#endif
struct tlbflush_unmap_batch tlb_ubc;
struct rcu_head rcu;
/* Cache last used pipe for splice(): */
struct pipe_inode_info *splice_pipe;
struct page_frag task_frag;
#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info *delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
unsigned int fail_nth;
#endif
/*
* When (nr_dirtied >= nr_dirtied_pause), it's time to call
* balance_dirty_pages() for a dirty throttling pause:
*/
int nr_dirtied;
int nr_dirtied_pause;
/* Start of a write-and-pause period: */
unsigned long dirty_paused_when;
#ifdef CONFIG_LATENCYTOP
int latency_record_count;
struct latency_record latency_record[LT_SAVECOUNT];
#endif
/*
* Time slack values; these are used to round up poll() and
* select() etc timeout values. These are in nanoseconds.
*/
u64 timer_slack_ns;
u64 default_timer_slack_ns;
#ifdef CONFIG_KASAN
unsigned int kasan_depth;
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* Index of current stored address in ret_stack: */
int curr_ret_stack;
/* Stack of return addresses for return function tracing: */
struct ftrace_ret_stack *ret_stack;
/* Timestamp for last schedule: */
unsigned long long ftrace_timestamp;
/*
* Number of functions that haven't been traced
* because of depth overrun:
*/
atomic_t trace_overrun;
/* Pause tracing: */
atomic_t tracing_graph_pause;
#endif
#ifdef CONFIG_TRACING
/* State flags for use by tracers: */
unsigned long trace;
/* Bitmask and counter of trace recursion: */
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
#ifdef CONFIG_KCOV
/* Coverage collection mode enabled for this task (0 if disabled): */
unsigned int kcov_mode;
/* Size of the kcov_area: */
unsigned int kcov_size;
/* Buffer for coverage collection: */
void *kcov_area;
/* KCOV descriptor wired with this task or NULL: */
struct kcov *kcov;
#endif
#ifdef CONFIG_MEMCG
struct mem_cgroup *memcg_in_oom;
gfp_t memcg_oom_gfp_mask;
int memcg_oom_order;
/* Number of pages to reclaim on returning to userland: */
unsigned int memcg_nr_pages_over_high;
/* Used by memcontrol for targeted memcg charge: */
struct mem_cgroup *active_memcg;
#endif
#ifdef CONFIG_BLK_CGROUP
struct request_queue *throttle_queue;
#endif
#ifdef CONFIG_UPROBES
struct uprobe_task *utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
unsigned int sequential_io;
unsigned int sequential_io_avg;
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
unsigned long task_state_change;
#endif
int pagefault_disabled;
#ifdef CONFIG_MMU
struct task_struct *oom_reaper_list;
#endif
#ifdef CONFIG_VMAP_STACK
struct vm_struct *stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
/* A live task holds one reference: */
atomic_t stack_refcount;
#endif
#ifdef CONFIG_LIVEPATCH
int patch_state;
#endif
#ifdef CONFIG_SECURITY
/* Used by LSM modules for access restriction: */
void *security;
#endif
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
unsigned long lowest_stack;
unsigned long prev_lowest_stack;
#endif
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
*/
randomized_struct_fields_end
/* CPU-specific state of this task: */
struct thread_struct thread;
/*
* WARNING: on x86, 'thread_struct' contains a variable-sized
* structure. It *MUST* be at the end of 'task_struct'.
*
* Do not put anything below here!
*/
};
Long enough? Not even close.
This structure in turn uses struct thread_info, so next you pull in the definition of thread_info as well; thread_info uses yet another structure, so you pull that definition in too? Madness.
Where is the need for all this complexity? Defining it like this is enough::
struct task_struct {};
Then, for each member you actually use, add an int member to it and you are done. Stubbing is stubbing: what you drive in is a stub, not the real thing; what you put up is the scaffolding, not the house. When the investment in the scaffolding exceeds that in the house, the building of the house is bound to suffer.
Make your stubs too elaborate and many test cases simply cannot go in. (Note 1)
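For illustration, the stub header on the test side could look like the sketch below; the member names are assumptions made up for the example, each one added only because the code under test happens to touch it::

/* test stub standing in for <linux/sched.h> -- only what the tested code needs */
struct task_struct {
	int pid;	/* hypothetical: added because the tested code reads it */
	int state;	/* hypothetical: the tested code only compares it with 0 */
};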
Another example. Say your program calls remap_pfn_range(); how should that stub be built? Some people build it like this::
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
		    unsigned long pfn, unsigned long size, pgprot_t prot)
{
	return 0;
}
You really have too much time on your hands. You have not even put a single test case into it; why make it this elaborate? This will do::
#define remap_pfn_range(...) 0
And how should kmalloc be stubbed? Obviously you do not need this::
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
	return malloc(size);
}
This will do::
char testcase101_mem[1024];
extern int testcase;	/* the current test-case number, set by the test driver */

void *kmalloc(size_t size, gfp_t flags)
{
	if (testcase == 101)
		return testcase101_mem;
	return NULL;
}
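A test case then drives the stub from the outside. A minimal sketch, assuming a hypothetical module entry point my_create_queue() that allocates with kmalloc() and returns -ENOMEM on failure::

#include <assert.h>
#include <errno.h>

int testcase;	/* the switch read by the kmalloc stub above */

void test_kmalloc(void)
{
	testcase = 0;				/* kmalloc stub returns NULL */
	assert(my_create_queue() == -ENOMEM);	/* the error path must be taken */

	testcase = 101;				/* kmalloc stub returns the buffer */
	assert(my_create_queue() == 0);		/* the normal path must succeed */
}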
This is what stubbing means. What you are testing is your own program, and all your focus should go to your own program:
Under all kinds of extreme conditions, is anything missed? Do resources leak? Are the computed results correct? And so on. This logic is the core; the core is not your test stubs. Take the following program as an example::
static inline int my_iommu_map(struct queue *q,
			       struct qfile_region *qfr)
{
	struct device *dev = q->myhw->pdev;
	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
	int i, j, ret;

	if (!domain)
		return -ENODEV;

	for (i = 0; i < qfr->nr_pages; i++) {
		get_page(qfr->pages[i]);
		ret = iommu_map(domain, qfr->iova + i * PAGE_SIZE,
				page_to_phys(qfr->pages[i]),
				PAGE_SIZE, qfr->prot | q->uacce->prot);
		if (ret) {
			dev_err(dev, "iommu_map page %i fail %d\n", i, ret);
			goto err_with_map_pages;
		}
	}

	return 0;

err_with_map_pages:
	for (j = i - 1; j >= 0; j--) {
		iommu_unmap(domain, qfr->iova + j * PAGE_SIZE, PAGE_SIZE);
		put_page(qfr->pages[j]);
	}
	return ret;
}
You see, the focus of the unit test should be: when nr_pages is 0, 1, or 10, is this flow correct? If iommu_map() fails on the first, a middle, or the last iteration, is every page that was mapped unmapped again, and is no page left holding a reference taken by get_page()?
(In fact this flow does contain a bug. It is hard to spot with the naked eye, yet a unit test can find it with ease.)
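As a sketch of what such a test might look like: counting stubs for the external calls, plus a configurable failure point for iommu_map(). The counters, map_fail_at and the test body are test-harness inventions; struct queue, struct qfile_region and the other types come from the module's stub headers, and the remaining stubs (iommu_get_domain_for_dev(), page_to_phys(), dev_err(), ...) are set up in the same spirit. Run against the code above, the last assertion is exactly where the leaked get_page() mentioned in the previous paragraph shows up::

#include <assert.h>
#include <errno.h>

static int map_fail_at = -1;	/* on which iommu_map() call to fail; -1 = never */
static int map_calls, unmap_calls, get_calls, put_calls;

int iommu_map(struct iommu_domain *domain, unsigned long iova,
	      phys_addr_t paddr, size_t size, int prot)
{
	return (map_calls++ == map_fail_at) ? -ENOMEM : 0;
}

size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size)
{
	unmap_calls++;
	return size;
}

void get_page(struct page *page) { get_calls++; }
void put_page(struct page *page) { put_calls++; }

/* One case of several: fail on the very first map. q and qfr are prepared
 * by the test driver, e.g. with qfr->nr_pages == 10. */
static void test_fail_on_first_map(struct queue *q, struct qfile_region *qfr)
{
	map_fail_at = 0;
	assert(my_iommu_map(q, qfr) != 0);
	assert(unmap_calls == map_calls - 1);	/* the failed map must not be unmapped */
	assert(get_calls == put_calls);		/* every get_page() must be balanced */
}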
As for how page_to_phys() is implemented, that is none of your business. Whether that other system is implemented correctly, or whether your expectations and understanding of it are correct, is a matter for integration testing, not unit testing.
When writing code I put great emphasis on the "semantics" of every statement, but in unit testing we should forget "semantics" entirely and see only concepts such as "loop", "assignment", and "exit". Here dev_err is not printing and put_page() is not freeing; they are merely "waypoints" along the flow. You are not trying to simulate real error logging or real memory release; you are verifying that the flow you designed actually follows the path you intended.
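A waypoint stub can be as small as this sketch; err_branch_hit is a test-harness invention, and the stub deliberately prints nothing::

static int err_branch_hit;

/* dev_err() as a waypoint: record that the error branch was reached, nothing more */
#define dev_err(dev, fmt, ...)	do { err_branch_hit++; } while (0)

A test that forces iommu_map() to fail can then simply assert that err_branch_hit equals 1, i.e. that the flow passed through the waypoint it was supposed to pass through.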
That is all I have to say about unit testing itself; to close, a few words on architecture and philosophy. Many people believe that being "strict" and doing everything to "perfection" can never be wrong. Here is a vivid example of why that view is mistaken. The Way of Heaven presses down what is high and lifts up what is low. Getting things done takes thinking; it calls for being "on target", not for being "extreme".
And without a definite purpose, there is no such thing as "on target".
So we must watch out for two common pitfalls in design. The first: not defining the purpose, not considering the purpose, only "working hard".
The second: chasing surface polish, like the test framework I mentioned earlier. It does not start out that way, but after it evolves for a while, or after it is handed over to a separate team, that team wants to demonstrate its "capability" and its "performance" and cannot resist showing off. For that team, this is understandable. But for the architect, letting a non-protagonist become "good-looking" and "strong" means the main business "loses attention and focus" and becomes "weak". The shift between strong and weak can happen very fast.
A nine-story terrace rises from heaped earth. At the heaped-earth stage it should look like heaped earth; if it already looks like a lofty terrace at that stage, the terrace will never get built.
Note 1: On whether to mock all the header files yourself
Based on this article and a real example, I ran a training session inside the department, and quite a few people there disagreed with not using the full set of Linux headers for the stubs.
First of all: if you can guarantee that you stay focused and verify every extreme flow, then how you build the stubs is not the key issue; that is not where the emphasis lies. My concern is that once the header files are built too well, you lose the ability to inject data through the structures and the definitions. If in your situation all of that can still be solved, I have no objection.
Second, many people may hold the misconception that stubbing this way means a huge amount of work. Try it: in my experience, every module's external interface is limited, so the workload is actually quite limited.
Finally, a bit of mysticism. In university I read many of Huang Yi's wuxia novels. The supreme masters in them often do one thing before a great battle: carve themselves a spear, or forge themselves a sword, so that going into the battle they know their weapon in every little detail.
I personally feel that stubbing every external interface you use yourself serves the same purpose. The point of unit testing is, in the end, to raise your confidence in your own code. Ten or twenty thousand lines of code and dozens of functions become a module that is thrown, as a black box, onto a rack or into a BIOS; you run it and data comes out. Do you know what happened inside? Did every flow that should run actually run? Was every boundary actually hit?
UT is a spear-carving process, and knowing exactly how your own module defines each structure it uses is part of that carving. Do it once and you will see why it is necessary.
That is my experience.
Addendum, 2019-05-10:
This document was last edited in December 2018. After I wrote it, the team concerned did not accept my advice and carried on doing UT the original way. I left them one sentence: playing it this way, you will not last a year. Now, less than half a year later, all their test cases are dead... This has proved my point, but it does not make me the least bit happy, because those resources were, to a degree, also mine. Cognition is a cost; that is all there is to it.