自从 Linux 4.3 开始,在 Linode 上使用 PaX/grsecurity 时,内核会在被 pv-grub 执行后立即崩溃。由于崩溃发生在启动的极早期,没有任何可以用来调试的日志,同时公司也不是盖子开的,也没有办法得到母机上有意义的调试信息。这导致了盖子的 VPS 内核从去年 12 月开始被锁定在 4.2.7。由于不知什么时候产生了 Linode 东京机房会在 2016 年 6 月从 Xen 迁移到 KVM 的错觉,也没有花精力去尝试调试这个问题。
然而今年 Linode 周年庆时硬件全部翻倍,惟独东京机房除外。而根据官方最新的说法,新机房乐观估计要第四季度上线。解决内核问题就不得不提上了盖子的日程,首先是手工修复了不少 CVE 高危漏洞,随后又祭出 diff 折腾半天,内核始终会在启动后立刻死亡。而由于 grsecurity 并不提供 git 源,所以 git bisect 也是不可能的,唯一可用的工具只有 Linux 4.2.7 / 补丁文件,与 Linux 4.3.3 / 补丁文件。
在阅读代码差异时,一个很大的挑战是如何区分上游内核的修改与下游 PaX/grsecurity 补丁的修改。直接比较补丁文件会导致代码上下文丢失,让代码的意图不可理解。最后盖子打算编写一个工具,自动比较并去除在上游中出现的代码段,以便仅仅对 PaX/grsecurity 的代码进行比较,就连名字都想好了,就叫 metadiff,但一直没有动手。
直到上个月和 Shawn 聊天时,提到了自己装个 Xen 也不是不可行;于是周六终于动手在 VirtualBox 虚拟机里装了个 Debian + Xen,又在 Xen 里启动了一个虚拟机,果然很快就得到了内核崩溃的 traceback。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
rip: ffffffff8100b2b0 pmu_msr_read+0x10
flags: 00000282 i s nz
rsp: ffffffff81aeff30
rax: 8000000000000000 rcx: 0000000000000001 rdx: ffffffff81aeffcc
rbx: 00000000c0000080 rsi: ffffffff81aeffa0 rdi: 00000000c0000080
rbp: ffffffff81aeffa0 r8: 0000000000000001 r9: 00000000ffffffff
r10: ffffffff81cf9000 r11: 0000000000000000 r12: ffffffff81aeffcc
r13: ffffffff81aeffc4 r14: ffffffff81aeffc0 r15: 6f73b764afec1c9d
cs: e033 ss: e02b ds: 0000 es: 0000
fs: 0000 @ 0000000000000000
gs: 0000 @ 0000000000000000/0000000000000000
Code (instr addr ffffffff8100b2b0)
00 00 00 00 00 41 54 49 89 d4 55 48 89 f5 53 89 fb 48 83 ec 10 <65> 48 8b 04 25 28 00 00 00 48 89
Stack:
0000000000000001 0000000000000000 0000000000000000 ffffffff8100b2b0
000000010000e030 0000000000010082 ffffffff81aeff70 000000000000e02b
0000000000000000 0000000000000000 00000000c0000080 ffffffff81aeffcc
ffffffff81aeffc8 ffffffff810041c8 ffffffff81aeffc8 ffffffff81aeffcc
Call Trace:
[<ffffffff8100b2b0>] pmu_msr_read+0x10 <--
[<ffffffff8100b2b0>] pmu_msr_read+0x10
[<ffffffff810041c8>] xen_read_msr_safe+0x18
[<ffffffff81be93eb>] xen_start_kernel+0x1b9
|
哦?可见内核在进入 xen_start_kernel 后不久就崩溃了,这是 /* First C function to be called on Xen boot */,在如此早期就崩溃,什么错误日志都看不到也就不奇怪了。来看看 xen_read_msr_safe 和 pmu_msr_read 在 4.2 和 4.3 之间有什么改变:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
--- ../../4.2.7/linux-4.2.7/arch/x86/xen/enlighten.c 2016-09-11 00:44:12.010022936 +0800
+++ arch/x86/xen/enlighten.c 2015-12-15 13:41:43.000000000 +0800
@@ -1030,6 +1034,9 @@ static u64 xen_read_msr_safe(unsigned in
{
u64 val;
+ if (pmu_msr_read(msr, &val, err))
+ return val;
+
val = native_read_msr_safe(msr, err);
switch (msr) {
case MSR_IA32_APICBASE:
@@ -1074,9 +1081,11 @@ static int xen_write_msr_safe(unsigned i
/* Fast syscall setup is all done in hypercalls, so
these are all ignored. Stub them out here to stop
Xen console noise. */
+ break;
default:
- ret = native_write_msr_safe(msr, low, high);
+ if (!pmu_msr_write(msr, low, high, &ret))
+ ret = native_write_msr_safe(msr, low, high);
}
return ret;
|
可见 pmu_msr_read 完全是个新东西,使用 git blame 继续追查。
xen/PMU: Initialization code for Xen PMU 65d0cf0be79feebeb19e7626fd3ed41ae73f642d
xen/PMU: Describe vendor-specific PMU registers e27b72df01109c689062caeba1defa013b759e0e
xen/PMU: Intercept PMU-related MSR and APIC accesses 6b08cd6328c58a2ae190c5ee03a2ffcab5ef828e
xen/PMU: PMU emulation code bf6dfb154d935725c9a2005033ca33017b9df439
发现 PMU 是 Xen 在 4.3 进入主线内核的新特性,于是解决方法就很简单了,把 bf6dfb 和 6b08cd 都撤销就好,接下来的事情就让 PaX Team 和 spender 去追查吧。最后的补丁是:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
|
diff -uprN linux-4.7.3-hardened/arch/x86/xen/apic.c linux-4.7.3-hardened.good/arch/x86/xen/apic.c
--- linux-4.7.3-hardened/arch/x86/xen/apic.c 2016-07-24 19:23:50.000000000 +0000
+++ linux-4.7.3-hardened.good/arch/x86/xen/apic.c 2016-09-10 20:05:21.450647009 +0000
@@ -7,7 +7,6 @@
#include <xen/xen.h>
#include <xen/interface/physdev.h>
#include "xen-ops.h"
-#include "pmu.h"
#include "smp.h"
static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
@@ -73,10 +72,8 @@ static u32 xen_apic_read(u32 reg)
static void xen_apic_write(u32 reg, u32 val)
{
- if (reg == APIC_LVTPC) {
- ( void )pmu_apic_update(reg);
+ if (reg == APIC_LVTPC)
return ;
- }
/* Warn to see if there's any stray references */
WARN(1, "register: %x, value: %x\n" , reg, val);
diff -uprN linux-4.7.3-hardened/arch/x86/xen/enlighten.c linux-4.7.3-hardened.good/arch/x86/xen/enlighten.c
--- linux-4.7.3-hardened/arch/x86/xen/enlighten.c 2016-09-10 19:59:29.237313676 +0000
+++ linux-4.7.3-hardened.good/arch/x86/xen/enlighten.c 2016-09-10 20:06:49.683980342 +0000
@@ -1031,9 +1031,6 @@ static u64 xen_read_msr_safe(unsigned in
{
u64 val;
- if (pmu_msr_read(msr, &val, err))
- return val;
-
val = native_read_msr_safe(msr, err);
switch (msr) {
case MSR_IA32_APICBASE:
@@ -1081,13 +1078,17 @@ static int xen_write_msr_safe(unsigned i
break ;
default :
- if (!pmu_msr_write(msr, low, high, &ret))
- ret = native_write_msr_safe(msr, low, high);
+ ret = native_write_msr_safe(msr, low, high);
}
return ret;
}
+unsigned long long xen_read_pmc( int counter)
+{
+ return 0;
+}
+
static u64 xen_read_msr(unsigned int msr)
{
/*
diff -uprN linux-4.7.3-hardened/arch/x86/xen/pmu.c linux-4.7.3-hardened.good/arch/x86/xen/pmu.c
--- linux-4.7.3-hardened/arch/x86/xen/pmu.c 2016-07-24 19:23:50.000000000 +0000
+++ linux-4.7.3-hardened.good/arch/x86/xen/pmu.c 2016-09-10 20:05:21.450647009 +0000
@@ -13,20 +13,11 @@
/* x86_pmu.handle_irq definition */
#include "../events/perf_event.h"
-#define XENPMU_IRQ_PROCESSING 1
- struct xenpmu {
- /* Shared page between hypervisor and domain */
- struct xen_pmu_data *xenpmu_data;
- uint8_t flags;
-};
- static DEFINE_PER_CPU( struct xenpmu, xenpmu_shared);
-#define get_xenpmu_data() (this_cpu_ptr(&xenpmu_shared)->xenpmu_data)
-#define get_xenpmu_flags() (this_cpu_ptr(&xenpmu_shared)->flags)
-
- /* Macro for computing address of a PMU MSR bank */
-#define field_offset(ctxt, field) (( void *)(( uintptr_t )ctxt + \
- ( uintptr_t )ctxt->field))
+ /* Shared page between hypervisor and domain */
+ static DEFINE_PER_CPU( struct xen_pmu_data *, xenpmu_shared);
+#define get_xenpmu_data() per_cpu(xenpmu_shared, smp_processor_id())
+
/* AMD PMU */
#define F15H_NUM_COUNTERS 6
@@ -60,8 +51,6 @@ static __read_mostly int amd_num_counter
/* Alias registers (0x4c1) for full-width writes to PMCs */
#define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0))
-#define INTEL_PMC_TYPE_SHIFT 30
-
static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters;
@@ -178,232 +167,6 @@ static int is_intel_pmu_msr(u32 msr_inde
}
}
- static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type,
- int index, bool is_read)
-{
- uint64_t *reg = NULL;
- struct xen_pmu_intel_ctxt *ctxt;
- uint64_t *fix_counters;
- struct xen_pmu_cntr_pair *arch_cntr_pair;
- struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
- uint8_t xenpmu_flags = get_xenpmu_flags();
-
-
- if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
- return false ;
-
- ctxt = &xenpmu_data->pmu.c.intel;
-
- switch (msr) {
- case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- reg = &ctxt->global_ovf_ctrl;
- break ;
- case MSR_CORE_PERF_GLOBAL_STATUS:
- reg = &ctxt->global_status;
- break ;
- case MSR_CORE_PERF_GLOBAL_CTRL:
- reg = &ctxt->global_ctrl;
- break ;
- case MSR_CORE_PERF_FIXED_CTR_CTRL:
- reg = &ctxt->fixed_ctrl;
- break ;
- default :
- switch (type) {
- case MSR_TYPE_COUNTER:
- fix_counters = field_offset(ctxt, fixed_counters);
- reg = &fix_counters[index];
- break ;
- case MSR_TYPE_ARCH_COUNTER:
- arch_cntr_pair = field_offset(ctxt, arch_counters);
- reg = &arch_cntr_pair[index].counter;
- break ;
- case MSR_TYPE_ARCH_CTRL:
- arch_cntr_pair = field_offset(ctxt, arch_counters);
- reg = &arch_cntr_pair[index].control;
- break ;
- default :
- return false ;
- }
- }
-
- if (reg) {
- if (is_read)
- *val = *reg;
- else {
- *reg = *val;
-
- if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL)
- ctxt->global_status &= (~(*val));
- }
- return true ;
- }
-
- return false ;
-}
-
- static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
-{
- uint64_t *reg = NULL;
- int i, off = 0;
- struct xen_pmu_amd_ctxt *ctxt;
- uint64_t *counter_regs, *ctrl_regs;
- struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
- uint8_t xenpmu_flags = get_xenpmu_flags();
-
- if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
- return false ;
-
- if (k7_counters_mirrored &&
- ((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3)))
- msr = get_fam15h_addr(msr);
-
- ctxt = &xenpmu_data->pmu.c.amd;
- for (i = 0; i < amd_num_counters; i++) {
- if (msr == amd_ctrls_base + off) {
- ctrl_regs = field_offset(ctxt, ctrls);
- reg = &ctrl_regs[i];
- break ;
- } else if (msr == amd_counters_base + off) {
- counter_regs = field_offset(ctxt, counters);
- reg = &counter_regs[i];
- break ;
- }
- off += amd_msr_step;
- }
-
- if (reg) {
- if (is_read)
- *val = *reg;
- else
- *reg = *val;
-
- return true ;
- }
- return false ;
-}
-
- bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err)
-{
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
- if (is_amd_pmu_msr(msr)) {
- if (!xen_amd_pmu_emulate(msr, val, 1))
- *val = native_read_msr_safe(msr, err);
- return true ;
- }
- } else {
- int type, index;
-
- if (is_intel_pmu_msr(msr, &type, &index)) {
- if (!xen_intel_pmu_emulate(msr, val, type, index, 1))
- *val = native_read_msr_safe(msr, err);
- return true ;
- }
- }
-
- return false ;
-}
-
- bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err)
-{
- uint64_t val = ((uint64_t)high << 32) | low;
-
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
- if (is_amd_pmu_msr(msr)) {
- if (!xen_amd_pmu_emulate(msr, &val, 0))
- *err = native_write_msr_safe(msr, low, high);
- return true ;
- }
- } else {
- int type, index;
-
- if (is_intel_pmu_msr(msr, &type, &index)) {
- if (!xen_intel_pmu_emulate(msr, &val, type, index, 0))
- *err = native_write_msr_safe(msr, low, high);
- return true ;
- }
- }
-
- return false ;
-}
-
- static unsigned long long xen_amd_read_pmc( int counter)
-{
- struct xen_pmu_amd_ctxt *ctxt;
- uint64_t *counter_regs;
- struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
- uint8_t xenpmu_flags = get_xenpmu_flags();
-
- if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
- uint32_t msr;
- int err;
-
- msr = amd_counters_base + (counter * amd_msr_step);
- return native_read_msr_safe(msr, &err);
- }
-
- ctxt = &xenpmu_data->pmu.c.amd;
- counter_regs = field_offset(ctxt, counters);
- return counter_regs[counter];
-}
-
- static unsigned long long xen_intel_read_pmc( int counter)
-{
- struct xen_pmu_intel_ctxt *ctxt;
- uint64_t *fixed_counters;
- struct xen_pmu_cntr_pair *arch_cntr_pair;
- struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
- uint8_t xenpmu_flags = get_xenpmu_flags();
-
- if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
- uint32_t msr;
- int err;
-
- if (counter & (1 << INTEL_PMC_TYPE_SHIFT))
- msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff);
- else
- msr = MSR_IA32_PERFCTR0 + counter;
-
- return native_read_msr_safe(msr, &err);
- }
-
- ctxt = &xenpmu_data->pmu.c.intel;
- if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) {
- fixed_counters = field_offset(ctxt, fixed_counters);
- return fixed_counters[counter & 0xffff];
- }
-
- arch_cntr_pair = field_offset(ctxt, arch_counters);
- return arch_cntr_pair[counter].counter;
-}
-
-unsigned long long xen_read_pmc( int counter)
-{
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
- return xen_amd_read_pmc(counter);
- else
- return xen_intel_read_pmc(counter);
-}
-
- int pmu_apic_update(uint32_t val)
-{
- int ret;
- struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
-
- if (!xenpmu_data) {
- pr_warn_once( "%s: pmudata not initialized\n" , __func__);
- return -EINVAL;
- }
-
- xenpmu_data->pmu.l.lapic_lvtpc = val;
-
- if (get_xenpmu_flags() & XENPMU_IRQ_PROCESSING)
- return 0;
-
- ret = HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL);
-
- return ret;
-}
-
/* perf callbacks */
static int xen_is_in_guest( void )
{
@@ -476,37 +239,26 @@ static void xen_convert_regs( const struc
irqreturn_t xen_pmu_irq_handler( int irq, void *dev_id)
{
- int err, ret = IRQ_NONE;
+ int ret = IRQ_NONE;
struct pt_regs regs;
const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
- uint8_t xenpmu_flags = get_xenpmu_flags();
if (!xenpmu_data) {
pr_warn_once( "%s: pmudata not initialized\n" , __func__);
return ret;
}
- this_cpu_ptr(&xenpmu_shared)->flags =
- xenpmu_flags | XENPMU_IRQ_PROCESSING;
xen_convert_regs(&xenpmu_data->pmu.r.regs, &regs,
xenpmu_data->pmu.pmu_flags);
if (x86_pmu.handle_irq(&regs))
ret = IRQ_HANDLED;
- /* Write out cached context to HW */
- err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL);
- this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags;
- if (err) {
- pr_warn_once( "%s: failed hypercall, err: %d\n" , __func__, err);
- return IRQ_NONE;
- }
-
return ret;
}
bool is_xen_pmu( int cpu)
{
- return (get_xenpmu_data() != NULL);
+ return (per_cpu(xenpmu_shared, cpu) != NULL);
}
void xen_pmu_init( int cpu)
@@ -536,8 +288,7 @@ void xen_pmu_init( int cpu)
if (err)
goto fail;
- per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data;
- per_cpu(xenpmu_shared, cpu).flags = 0;
+ per_cpu(xenpmu_shared, cpu) = xenpmu_data;
if (cpu == 0) {
perf_register_guest_info_callbacks(&xen_guest_cbs);
@@ -565,6 +316,6 @@ void xen_pmu_finish( int cpu)
( void )HYPERVISOR_xenpmu_op(XENPMU_finish, &xp);
- free_pages((unsigned long )per_cpu(xenpmu_shared, cpu).xenpmu_data, 0);
- per_cpu(xenpmu_shared, cpu).xenpmu_data = NULL;
+ free_pages((unsigned long )per_cpu(xenpmu_shared, cpu), 0);
+ per_cpu(xenpmu_shared, cpu) = NULL;
}
diff -uprN linux-4.7.3-hardened/arch/x86/xen/pmu.h linux-4.7.3-hardened.good/arch/x86/xen/pmu.h
--- linux-4.7.3-hardened/arch/x86/xen/pmu.h 2016-07-24 19:23:50.000000000 +0000
+++ linux-4.7.3-hardened.good/arch/x86/xen/pmu.h 2016-09-10 20:05:21.453980342 +0000
@@ -7,9 +7,5 @@ irqreturn_t xen_pmu_irq_handler( int irq,
void xen_pmu_init( int cpu);
void xen_pmu_finish( int cpu);
bool is_xen_pmu( int cpu);
- bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
- bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
- int pmu_apic_update(uint32_t reg);
-unsigned long long xen_read_pmc( int counter);
#endif /* __XEN_PMU_H */
|
打好补丁再编译内核,被智子锁定版本的内核果然升级成功了。
1
2
|
$ uname -r
4.7.3-hardened
|
更新:官方已在 grsecurity-3.1-4.7.4-201609152234.patch 中修复问题,不再需要此 workaround。
原文链接:https://tomli.blog/archives/2016/09/2160.html