1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
|
# Commit a5eaac9245f4f382a3cd0e9710e9d1cba7db20e4
# Date 2020-09-07 11:32:34 +0100
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Andrew Cooper <andrew.cooper3@citrix.com>
x86/pv: Fix consistency of 64bit segment bases
The comments in save_segments(), _toggle_guest_pt() and write_cr() are false.
The %fs and %gs bases can be updated at any time by the guest.
As a consequence, Xen's fs_base/etc tracking state is always stale when the
vcpu is in context, and must not be used to complete MSR_{FS,GS}_BASE reads, etc.
In particular, a sequence such as:
wrmsr(MSR_FS_BASE, 0x1ull << 32);
write_fs(__USER_DS);
base = rdmsr(MSR_FS_BASE);
will return the stale base, not the new base. This may cause guest a guest
kernel's context switching of userspace to malfunction.
Therefore:
* Update save_segments(), _toggle_guest_pt() and read_msr() to always read
the segment bases from hardware.
* Update write_cr(), write_msr() and do_set_segment_base() to not not waste
time caching data which is instantly going to become stale again.
* Provide comments explaining when the tracking state is and isn't stale.
This bug has been present for 14 years, but several bugfixes since have built
on and extended the original flawed logic.
Fixes: ba9adb737ba ("Apply stricter checking to RDMSR/WRMSR emulations.")
Fixes: c42494acb2f ("x86: fix FS/GS base handling when using the fsgsbase feature")
Fixed: eccc170053e ("x86/pv: Don't have %cr4.fsgsbase active behind a guest kernels back")
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1546,6 +1546,16 @@ static void load_segments(struct vcpu *n
}
}
+/*
+ * Record all guest segment state. The guest can load segment selectors
+ * without trapping, which will also alter the 64bit FS/GS bases. Arbitrary
+ * changes to bases can also be made with the WR{FS,GS}BASE instructions, when
+ * enabled.
+ *
+ * Guests however cannot use SWAPGS, so there is no mechanism to modify the
+ * inactive GS base behind Xen's back. Therefore, Xen's copy of the inactive
+ * GS base is still accurate, and doesn't need reading back from hardware.
+ */
static void save_segments(struct vcpu *v)
{
struct cpu_user_regs *regs = &v->arch.user_regs;
@@ -1556,14 +1566,15 @@ static void save_segments(struct vcpu *v
regs->fs = read_sreg(fs);
regs->gs = read_sreg(gs);
- /* %fs/%gs bases can only be stale if WR{FS,GS}BASE are usable. */
- if ( (read_cr4() & X86_CR4_FSGSBASE) && !is_pv_32bit_vcpu(v) )
+ if ( !is_pv_32bit_vcpu(v) )
{
- v->arch.pv.fs_base = __rdfsbase();
+ unsigned long gs_base = rdgsbase();
+
+ v->arch.pv.fs_base = rdfsbase();
if ( v->arch.flags & TF_kernel_mode )
- v->arch.pv.gs_base_kernel = __rdgsbase();
+ v->arch.pv.gs_base_kernel = gs_base;
else
- v->arch.pv.gs_base_user = __rdgsbase();
+ v->arch.pv.gs_base_user = gs_base;
}
if ( regs->ds )
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -408,16 +408,19 @@ static void _toggle_guest_pt(struct vcpu
void toggle_guest_mode(struct vcpu *v)
{
+ unsigned long gs_base;
+
ASSERT(!is_pv_32bit_vcpu(v));
- /* %fs/%gs bases can only be stale if WR{FS,GS}BASE are usable. */
- if ( read_cr4() & X86_CR4_FSGSBASE )
- {
- if ( v->arch.flags & TF_kernel_mode )
- v->arch.pv.gs_base_kernel = __rdgsbase();
- else
- v->arch.pv.gs_base_user = __rdgsbase();
- }
+ /*
+ * Update the cached value of the GS base about to become inactive, as a
+ * subsequent context switch won't bother re-reading it.
+ */
+ gs_base = rdgsbase();
+ if ( v->arch.flags & TF_kernel_mode )
+ v->arch.pv.gs_base_kernel = gs_base;
+ else
+ v->arch.pv.gs_base_user = gs_base;
asm volatile ( "swapgs" );
_toggle_guest_pt(v);
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -779,17 +779,6 @@ static int write_cr(unsigned int reg, un
}
case 4: /* Write CR4 */
- /*
- * If this write will disable FSGSBASE, refresh Xen's idea of the
- * guest bases now that they can no longer change.
- */
- if ( (curr->arch.pv.ctrlreg[4] & X86_CR4_FSGSBASE) &&
- !(val & X86_CR4_FSGSBASE) )
- {
- curr->arch.pv.fs_base = __rdfsbase();
- curr->arch.pv.gs_base_kernel = __rdgsbase();
- }
-
curr->arch.pv.ctrlreg[4] = pv_fixup_guest_cr4(curr, val);
write_cr4(pv_make_cr4(curr));
ctxt_switch_levelling(curr);
@@ -838,15 +827,13 @@ static int read_msr(unsigned int reg, ui
case MSR_FS_BASE:
if ( is_pv_32bit_domain(currd) )
break;
- *val = (read_cr4() & X86_CR4_FSGSBASE) ? __rdfsbase()
- : curr->arch.pv.fs_base;
+ *val = rdfsbase();
return X86EMUL_OKAY;
case MSR_GS_BASE:
if ( is_pv_32bit_domain(currd) )
break;
- *val = (read_cr4() & X86_CR4_FSGSBASE) ? __rdgsbase()
- : curr->arch.pv.gs_base_kernel;
+ *val = rdgsbase();
return X86EMUL_OKAY;
case MSR_SHADOW_GS_BASE:
@@ -975,14 +962,12 @@ static int write_msr(unsigned int reg, u
if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
break;
wrfsbase(val);
- curr->arch.pv.fs_base = val;
return X86EMUL_OKAY;
case MSR_GS_BASE:
if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
break;
wrgsbase(val);
- curr->arch.pv.gs_base_kernel = val;
return X86EMUL_OKAY;
case MSR_SHADOW_GS_BASE:
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -1027,10 +1027,7 @@ long do_set_segment_base(unsigned int wh
{
case SEGBASE_FS:
if ( is_canonical_address(base) )
- {
wrfsbase(base);
- v->arch.pv.fs_base = base;
- }
else
ret = -EINVAL;
break;
@@ -1047,10 +1044,7 @@ long do_set_segment_base(unsigned int wh
case SEGBASE_GS_KERNEL:
if ( is_canonical_address(base) )
- {
wrgsbase(base);
- v->arch.pv.gs_base_kernel = base;
- }
else
ret = -EINVAL;
break;
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -505,7 +505,24 @@ struct pv_vcpu
bool_t syscall32_disables_events;
bool_t sysenter_disables_events;
- /* Segment base addresses. */
+ /*
+ * 64bit segment bases.
+ *
+ * FS and the active GS are always stale when the vCPU is in context, as
+ * the guest can change them behind Xen's back with MOV SREG, or
+ * WR{FS,GS}BASE on capable hardware.
+ *
+ * The inactive GS base is never stale, as guests can't use SWAPGS to
+ * access it - all modification is performed by Xen either directly
+ * (hypercall, #GP emulation), or indirectly (toggle_guest_mode()).
+ *
+ * The vCPU context switch path is optimised based on this fact, so any
+ * path updating or swapping the inactive base must update the cached
+ * value as well.
+ *
+ * Which GS base is active and inactive depends on whether the vCPU is in
+ * user or kernel context.
+ */
unsigned long fs_base;
unsigned long gs_base_kernel;
unsigned long gs_base_user;
|