References: bnc#548852
# HG changeset patch
# User Keir Fraser <keir.fraser@citrix.com>
# Date 1257252028 0
# Node ID 7a206c6f216aeb6cb7b7e4deb90f2f84ce1e2ed8
# Parent 450f2ddd2dc6688bb3c5be3636944b33b3b1bb9c
x86: improve reporting through XENMEM_machine_memory_map

Since Dom0 derives machine address ranges usable for assigning PCI
device resources from the output of this sub-hypercall, Xen should
make sure it properly reports all ranges not suitable for this
purpose (as either reserved or unusable):
- RAM regions excluded via a command-line option
- memory regions used by Xen itself (LAPIC, IOAPICs)

While the latter should generally already be excluded by the
BIOS-provided E820 table, this apparently isn't always the case, at
least for IOAPICs, and with Linux having been changed to account for
this it seems to make sense to also do so in Xen.

Generally the HPET range should also be excluded here, but since it
isn't being reflected in Dom0's iomem_caps (and can't be, as it's a
sub-page range) I wasn't sure whether adding explicit code for doing
so would be reasonable.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Parts taken from -unstable c/s 19114.
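
As an illustration of the reporting scheme, here is a small
stand-alone C model (hypothetical code, not part of the patch:
emit() stands in for __copy_to_guest_offset(), and for simplicity
every hole between E820 entries is reported as reserved, whereas the
mm.c hunk below only reports sub-ranges actually present in Dom0's
iomem_caps, via rangeset_report_ranges(), and works in page-frame
units):

    #include <stdint.h>
    #include <stdio.h>

    #define E820_RAM      1
    #define E820_RESERVED 2

    struct e820entry { uint64_t addr, size; uint32_t type; };

    /* Stand-in for __copy_to_guest_offset(): just print one entry. */
    static void emit(uint64_t addr, uint64_t size, uint32_t type)
    {
        printf("%016llx-%016llx type %u\n",
               (unsigned long long)addr,
               (unsigned long long)(addr + size),
               (unsigned)type);
    }

    /*
     * Report the map, additionally reporting every hole between
     * entries as E820_RESERVED. (Simplification: the real handler
     * only reports holes, or parts of holes, that Dom0's
     * iomem_caps rangeset contains.)
     */
    static void report_map(const struct e820entry *map, unsigned int nr)
    {
        uint64_t prev_end = 0;
        unsigned int i;

        for ( i = 0; i < nr; i++ )
        {
            if ( map[i].addr > prev_end )
                emit(prev_end, map[i].addr - prev_end, E820_RESERVED);
            emit(map[i].addr, map[i].size, map[i].type);
            prev_end = map[i].addr + map[i].size;
        }
    }

    int main(void)
    {
        static const struct e820entry map[] = {
            { 0x00000000ULL, 0x0009f000ULL, E820_RAM },
            { 0x00100000ULL, 0x7f000000ULL, E820_RAM },
        };

        report_map(map, sizeof(map) / sizeof(map[0]));
        return 0;
    }

Running it prints the two RAM entries with a reserved entry covering
the 0x9f000-0x100000 hole between them.
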
# HG changeset patch
# User Keir Fraser <keir.fraser@citrix.com>
# Date 1257797143 0
# Node ID 9b393d47b22ae261ac45d503d9370567b8db88b4
# Parent 8a91056bea81b5caa8fbeea59094fbdb8d318d56
e820: fix e820_change_range_type()
In the following case e820_change_range_type() would report success
when it had actually failed: [s, e] lies in the middle of [rs, re]
while e820->nr_map + 1 >= ARRAY_SIZE(e820->map). This patch fixes
that.

Signed-off-by: Xiao Guangrong <ericxiao.gr@gmail.com>
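
Concretely: carving an interior [s, e) out of a single [rs, re)
entry leaves three entries behind, [rs, s) and [e, re) with the
original type plus [s, e) with the new type, so two free slots are
needed rather than one. The corrected capacity check (quoted from
the e820.c hunk below) is therefore:

    if ( (e820->nr_map + 2) > ARRAY_SIZE(e820->map) )
        goto overflow;   /* return 0 instead of claiming success */
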
--- a/xen/arch/x86/e820.c
+++ b/xen/arch/x86/e820.c
@@ -324,7 +324,8 @@ static void __init clip_to_limit(uint64_
for ( i = 0; i < e820.nr_map; i++ )
{
- if ( (e820.map[i].addr + e820.map[i].size) <= limit )
+ if ( (e820.map[i].type != E820_RAM) ||
+ ((e820.map[i].addr + e820.map[i].size) <= limit) )
continue;
if ( warnmsg )
{
@@ -333,7 +334,13 @@ static void __init clip_to_limit(uint64_
}
printk("Truncating memory map to %lukB\n",
(unsigned long)(limit >> 10));
- if ( e820.map[i].addr >= limit )
+ if ( e820_change_range_type(&e820, max(e820.map[i].addr, limit),
+ e820.map[i].addr + e820.map[i].size,
+ E820_RAM, E820_UNUSABLE) )
+ {
+ i = 0;
+ }
+ else if ( e820.map[i].addr >= limit )
{
e820.nr_map = i;
}
@@ -391,8 +398,9 @@ static void __init machine_specific_memo
reserve_dmi_region();
}
-/* Reserve RAM area (@s,@e) in the specified e820 map. */
-int __init reserve_e820_ram(struct e820map *e820, uint64_t s, uint64_t e)
+int __init e820_change_range_type(
+ struct e820map *e820, uint64_t s, uint64_t e,
+ uint32_t orig_type, uint32_t new_type)
{
uint64_t rs = 0, re = 0;
int i;
@@ -406,55 +414,79 @@ int __init reserve_e820_ram(struct e820m
break;
}
- if ( (i == e820->nr_map) || (e820->map[i].type != E820_RAM) )
+ if ( (i == e820->nr_map) || (e820->map[i].type != orig_type) )
return 0;
if ( (s == rs) && (e == re) )
{
- /* Complete excision. */
- memmove(&e820->map[i], &e820->map[i+1],
- (e820->nr_map-i-1) * sizeof(e820->map[0]));
- e820->nr_map--;
- }
- else if ( s == rs )
- {
- /* Truncate start. */
- e820->map[i].addr += e - s;
- e820->map[i].size -= e - s;
- }
- else if ( e == re )
- {
- /* Truncate end. */
- e820->map[i].size -= e - s;
+ e820->map[i].type = new_type;
}
- else if ( e820->nr_map < ARRAY_SIZE(e820->map) )
+ else if ( (s == rs) || (e == re) )
{
- /* Split in two. */
+ if ( (e820->nr_map + 1) > ARRAY_SIZE(e820->map) )
+ goto overflow;
+
memmove(&e820->map[i+1], &e820->map[i],
(e820->nr_map-i) * sizeof(e820->map[0]));
e820->nr_map++;
- e820->map[i].size = s - rs;
- i++;
- e820->map[i].addr = e;
- e820->map[i].size = re - e;
- }
- else
- {
- /* e820map is at maximum size. We have to leak some space. */
- if ( (s - rs) > (re - e) )
+
+ if ( s == rs )
{
- printk("e820 overflow: leaking RAM %"PRIx64"-%"PRIx64"\n", e, re);
- e820->map[i].size = s - rs;
+ e820->map[i].size = e - s;
+ e820->map[i].type = new_type;
+ e820->map[i+1].addr = e;
+ e820->map[i+1].size = re - e;
}
else
{
- printk("e820 overflow: leaking RAM %"PRIx64"-%"PRIx64"\n", rs, s);
- e820->map[i].addr = e;
- e820->map[i].size = re - e;
+ e820->map[i].size = s - rs;
+ e820->map[i+1].addr = s;
+ e820->map[i+1].size = e - s;
+ e820->map[i+1].type = new_type;
}
}
+ else
+ {
+ if ( (e820->nr_map + 2) > ARRAY_SIZE(e820->map) )
+ goto overflow;
+
+ memmove(&e820->map[i+2], &e820->map[i],
+ (e820->nr_map-i) * sizeof(e820->map[0]));
+ e820->nr_map += 2;
+
+ e820->map[i].size = s - rs;
+ e820->map[i+1].addr = s;
+ e820->map[i+1].size = e - s;
+ e820->map[i+1].type = new_type;
+ e820->map[i+2].addr = e;
+ e820->map[i+2].size = re - e;
+ }
+
+ /* Finally, look for any opportunities to merge adjacent e820 entries. */
+ for ( i = 0; i < (e820->nr_map - 1); i++ )
+ {
+ if ( (e820->map[i].type != e820->map[i+1].type) ||
+ ((e820->map[i].addr + e820->map[i].size) != e820->map[i+1].addr) )
+ continue;
+ e820->map[i].size += e820->map[i+1].size;
+ memmove(&e820->map[i+1], &e820->map[i+2],
+ (e820->nr_map-i-2) * sizeof(e820->map[0]));
+ e820->nr_map--;
+ i--;
+ }
return 1;
+
+ overflow:
+ printk("Overflow in e820 while reserving region %"PRIx64"-%"PRIx64"\n",
+ s, e);
+ return 0;
+}
+
+/* Set E820_RAM area (@s,@e) as RESERVED in specified e820 map. */
+int __init reserve_e820_ram(struct e820map *e820, uint64_t s, uint64_t e)
+{
+ return e820_change_range_type(e820, s, e, E820_RAM, E820_RESERVED);
}
unsigned long __init init_e820(
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -3789,6 +3789,37 @@ long do_update_descriptor(u64 pa, u64 de
typedef struct e820entry e820entry_t;
DEFINE_XEN_GUEST_HANDLE(e820entry_t);
+struct memory_map_context
+{
+ unsigned int n;
+ unsigned long s;
+ struct xen_memory_map map;
+};
+
+static int handle_iomem_range(unsigned long s, unsigned long e, void *p)
+{
+ struct memory_map_context *ctxt = p;
+
+ if ( s > ctxt->s )
+ {
+ e820entry_t ent;
+ XEN_GUEST_HANDLE(e820entry_t) buffer;
+
+ if ( ctxt->n + 1 >= ctxt->map.nr_entries )
+ return -EINVAL;
+ ent.addr = (uint64_t)ctxt->s << PAGE_SHIFT;
+ ent.size = (uint64_t)(s - ctxt->s) << PAGE_SHIFT;
+ ent.type = E820_RESERVED;
+ buffer = guest_handle_cast(ctxt->map.buffer, e820entry_t);
+ if ( __copy_to_guest_offset(buffer, ctxt->n, &ent, 1) < 0 )
+ return -EFAULT;
+ ctxt->n++;
+ }
+ ctxt->s = e + 1;
+
+ return 0;
+}
+
long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
{
int rc;
@@ -3924,9 +3955,9 @@ long arch_memory_op(int op, XEN_GUEST_HA
case XENMEM_machine_memory_map:
{
- struct xen_memory_map memmap;
+ struct memory_map_context ctxt;
XEN_GUEST_HANDLE(e820entry_t) buffer;
- int count;
+ unsigned int i;
int rc;
if ( !IS_PRIV(current->domain) )
@@ -3936,20 +3967,49 @@ long arch_memory_op(int op, XEN_GUEST_HA
if ( rc )
return rc;
- if ( copy_from_guest(&memmap, arg, 1) )
+ if ( copy_from_guest(&ctxt.map, arg, 1) )
return -EFAULT;
- if ( memmap.nr_entries < e820.nr_map + 1 )
+ if ( ctxt.map.nr_entries < e820.nr_map + 1 )
return -EINVAL;
- buffer = guest_handle_cast(memmap.buffer, e820entry_t);
-
- count = min((unsigned int)e820.nr_map, memmap.nr_entries);
- if ( copy_to_guest(buffer, e820.map, count) < 0 )
+ buffer = guest_handle_cast(ctxt.map.buffer, e820entry_t);
+ if ( !guest_handle_okay(buffer, ctxt.map.nr_entries) )
return -EFAULT;
- memmap.nr_entries = count;
+ for ( i = 0, ctxt.n = 0, ctxt.s = 0; i < e820.nr_map; ++i, ++ctxt.n )
+ {
+ unsigned long s = PFN_DOWN(e820.map[i].addr);
+
+ if ( s )
+ {
+ rc = rangeset_report_ranges(current->domain->iomem_caps,
+ ctxt.s, s - 1,
+ handle_iomem_range, &ctxt);
+ if ( !rc )
+ rc = handle_iomem_range(s, s, &ctxt);
+ if ( rc )
+ return rc;
+ }
+ if ( ctxt.map.nr_entries <= ctxt.n + (e820.nr_map - i) )
+ return -EINVAL;
+ if ( __copy_to_guest_offset(buffer, ctxt.n, e820.map + i, 1) < 0 )
+ return -EFAULT;
+ ctxt.s = PFN_UP(e820.map[i].addr + e820.map[i].size);
+ }
+
+ if ( ctxt.s )
+ {
+ rc = rangeset_report_ranges(current->domain->iomem_caps, ctxt.s,
+ ~0UL, handle_iomem_range, &ctxt);
+ if ( !rc && ctxt.s )
+ rc = handle_iomem_range(~0UL, ~0UL, &ctxt);
+ if ( rc )
+ return rc;
+ }
+
+ ctxt.map.nr_entries = ctxt.n;
- if ( copy_to_guest(arg, &memmap, 1) )
+ if ( copy_to_guest(arg, &ctxt.map, 1) )
return -EFAULT;
return 0;
--- a/xen/common/rangeset.c
+++ b/xen/common/rangeset.c
@@ -242,6 +242,24 @@ int rangeset_contains_range(
return contains;
}
+int rangeset_report_ranges(
+ struct rangeset *r, unsigned long s, unsigned long e,
+ int (*cb)(unsigned long s, unsigned long e, void *), void *ctxt)
+{
+ struct range *x;
+ int rc = 0;
+
+ spin_lock(&r->lock);
+
+ for ( x = find_range(r, s); x && (x->s <= e) && !rc; x = next_range(r, x) )
+ if ( x->e >= s )
+ rc = cb(max(x->s, s), min(x->e, e), ctxt);
+
+ spin_unlock(&r->lock);
+
+ return rc;
+}
+
int rangeset_add_singleton(
struct rangeset *r, unsigned long s)
{
--- a/xen/include/asm-x86/e820.h
+++ b/xen/include/asm-x86/e820.h
@@ -24,6 +24,9 @@ struct e820map {
};
extern int reserve_e820_ram(struct e820map *e820, uint64_t s, uint64_t e);
+extern int e820_change_range_type(
+ struct e820map *e820, uint64_t s, uint64_t e,
+ uint32_t orig_type, uint32_t new_type);
extern unsigned long init_e820(const char *, struct e820entry *, int *);
extern struct e820map e820;
--- a/xen/include/xen/rangeset.h
+++ b/xen/include/xen/rangeset.h
@@ -53,6 +53,9 @@ int __must_check rangeset_remove_range(
struct rangeset *r, unsigned long s, unsigned long e);
int __must_check rangeset_contains_range(
struct rangeset *r, unsigned long s, unsigned long e);
+int rangeset_report_ranges(
+ struct rangeset *r, unsigned long s, unsigned long e,
+ int (*cb)(unsigned long s, unsigned long e, void *), void *ctxt);
/* Add/remove/query a single number. */
int __must_check rangeset_add_singleton(