File linux-2.6-x86_64-memory-hotplug.patch of Package kernel
diff -uNr linux-2.6.17.i386.orig/arch/ia64/mm/numa.c linux-2.6.17.i386/arch/ia64/mm/numa.c
--- linux-2.6.17.i386.orig/arch/ia64/mm/numa.c 2006-09-19 16:55:05.000000000 -0400
+++ linux-2.6.17.i386/arch/ia64/mm/numa.c 2006-09-19 17:05:09.000000000 -0400
@@ -16,6 +16,7 @@
#include <linux/node.h>
#include <linux/init.h>
#include <linux/bootmem.h>
+#include <linux/module.h>
#include <asm/mmzone.h>
#include <asm/numa.h>
@@ -69,4 +70,21 @@
return 0;
}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * SRAT information is kept in node_memblk[], so it remains available
+ * when memory is hot-added.  Report the node that paddr_to_nid() gives
+ * for addr, substituting node 0 when that lookup returns a negative
+ * value.
+ */
+int memory_add_physaddr_to_nid(u64 addr)
+{
+	int node = paddr_to_nid(addr);
+
+	return (node < 0) ? 0 : node;
+}
+
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
#endif
diff -uNr linux-2.6.17.i386.orig/arch/x86_64/Kconfig linux-2.6.17.i386/arch/x86_64/Kconfig
--- linux-2.6.17.i386.orig/arch/x86_64/Kconfig 2006-09-19 16:55:15.000000000 -0400
+++ linux-2.6.17.i386/arch/x86_64/Kconfig 2006-09-19 16:55:43.000000000 -0400
@@ -379,6 +379,10 @@
source "mm/Kconfig"
+config MEMORY_HOTPLUG_RESERVE
+ def_bool y
+ depends on (MEMORY_HOTPLUG && DISCONTIGMEM)
+
config HAVE_ARCH_EARLY_PFN_TO_NID
def_bool y
depends on NUMA
diff -uNr linux-2.6.17.i386.orig/arch/x86_64/mm/init.c linux-2.6.17.i386/arch/x86_64/mm/init.c
--- linux-2.6.17.i386.orig/arch/x86_64/mm/init.c 2006-09-19 16:55:15.000000000 -0400
+++ linux-2.6.17.i386/arch/x86_64/mm/init.c 2006-09-19 16:55:43.000000000 -0400
@@ -251,12 +251,13 @@
}
static void __meminit
-phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
+phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
- int i;
+ int i = pmd_index(address);
- for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) {
+ for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
unsigned long entry;
+ pmd_t *pmd = pmd_page + pmd_index(address);
if (address >= end) {
if (!after_bootmem)
@@ -264,6 +265,11 @@
set_pmd(pmd, __pmd(0));
break;
}
+
+ if (pmd_val(*pmd)) {
+ printk (KERN_ERR "%s trying to trample pte entry \
+ %lx@%lx\n",__func__,pmd_val(*pmd),address);
+ }
entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
entry &= __supported_pte_mask;
set_pmd(pmd, __pmd(entry));
@@ -273,45 +279,41 @@
static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
- pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
-
- if (pmd_none(*pmd)) {
- spin_lock(&init_mm.page_table_lock);
- phys_pmd_init(pmd, address, end);
- spin_unlock(&init_mm.page_table_lock);
- __flush_tlb_all();
- }
+ pmd_t *pmd = pmd_offset(pud,0); /* offset 0: phys_pmd_init() applies pmd_index(address) itself */
+ spin_lock(&init_mm.page_table_lock); /* held across the pmd update */
+ phys_pmd_init(pmd, address, end);
+ spin_unlock(&init_mm.page_table_lock);
+ __flush_tlb_all(); /* now flushed on every call; the old pmd_none() test is gone */
}
-static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
- long i = pud_index(address);
+ int i = pud_index(addr);
- pud = pud + i;
- if (after_bootmem && pud_val(*pud)) {
- phys_pmd_update(pud, address, end);
- return;
- }
-
- for (; i < PTRS_PER_PUD; pud++, i++) {
+ for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
int map;
- unsigned long paddr, pmd_phys;
+ unsigned long pmd_phys;
+ pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
- paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
- if (paddr >= end)
+ if (addr >= end)
break;
- if (!after_bootmem && !e820_any_mapped(paddr, paddr+PUD_SIZE, 0)) {
+ if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) {
set_pud(pud, __pud(0));
continue;
}
+ if (pud_val(*pud)) {
+ phys_pmd_update(pud, addr, end);
+ continue;
+ }
+
pmd = alloc_low_page(&map, &pmd_phys);
spin_lock(&init_mm.page_table_lock);
set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
- phys_pmd_init(pmd, paddr, end);
+ phys_pmd_init(pmd, addr, end);
spin_unlock(&init_mm.page_table_lock);
unmap_low_page(map);
}
@@ -540,19 +542,6 @@
#ifdef CONFIG_MEMORY_HOTPLUG
/*
- * XXX: memory_add_physaddr_to_nid() is to find node id from physical address
- * via probe interface of sysfs. If acpi notifies hot-add event, then it
- * can tell node id by searching dsdt. But, probe interface doesn't have
- * node id. So, return 0 as node id at this time.
- */
-#ifdef CONFIG_NUMA
-int memory_add_physaddr_to_nid(u64 start)
-{
- return 0;
-}
-#endif
-
-/*
* Memory is added always to NORMAL zone. This means you will never get
* additional DMA/DMA32 memory.
*/
@@ -583,6 +572,14 @@
}
EXPORT_SYMBOL_GPL(remove_memory);
+#if defined(CONFIG_NUMA) && !defined(CONFIG_ACPI_NUMA)
+/* NUMA without ACPI NUMA: no per-address node lookup is done here, so
+ * every hot-added address is reported as node 0. */
+int memory_add_physaddr_to_nid(u64 start)
+{
+	return 0;
+}
+#endif /* CONFIG_NUMA && !CONFIG_ACPI_NUMA */
#else /* CONFIG_MEMORY_HOTPLUG */
/*
* Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
diff -uNr linux-2.6.17.i386.orig/arch/x86_64/mm/srat.c linux-2.6.17.i386/arch/x86_64/mm/srat.c
--- linux-2.6.17.i386.orig/arch/x86_64/mm/srat.c 2006-09-19 16:55:15.000000000 -0400
+++ linux-2.6.17.i386/arch/x86_64/mm/srat.c 2006-09-19 16:55:43.000000000 -0400
@@ -21,22 +21,13 @@
#include <asm/numa.h>
#include <asm/e820.h>
-#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
- defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
- && !defined(CONFIG_MEMORY_HOTPLUG)
-#define RESERVE_HOTADD 1
-#endif
-
static struct acpi_table_slit *acpi_slit;
static nodemask_t nodes_parsed __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
-static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
+static struct bootnode nodes_add[MAX_NUMNODES];
static int found_add_area __initdata;
int hotadd_percent __initdata = 0;
-#ifndef RESERVE_HOTADD
-#define hotadd_percent 0 /* Ignore all settings */
-#endif
/* Too small nodes confuse the VM badly. Usually they result
from BIOS bugs. */
@@ -157,7 +148,7 @@
pxm, pa->apic_id, node);
}
-#ifdef RESERVE_HOTADD
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE /* Kconfig symbol: MEMORY_HOTPLUG_RESERVE */
/*
* Protect against too large hotadd areas that would fill up memory.
*/
@@ -200,15 +191,37 @@
return 1;
}
+static int update_end_of_memory(unsigned long end)
+{
+	unsigned long area_end_pfn = end >> PAGE_SHIFT;
+	found_add_area = 1;	/* a hot-add area has been seen */
+	end_pfn = area_end_pfn > end_pfn ? area_end_pfn : end_pfn;
+	return 1;		/* this variant always returns 1 */
+}
+
+static inline int save_add_info(void) /* gate for keeping SRAT hot-pluggable ranges */
+{
+ return hotadd_percent > 0;
+}
+#else
+int update_end_of_memory(unsigned long end) {return 0;}
+static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+static inline int save_add_info(void) {return 1;}
+#else
+static inline int save_add_info(void) {return 0;}
+#endif
+#endif
/*
- * It is fine to add this area to the nodes data it will be used later
+ * Update nodes_add and decide whether to include the add area in the zone.
+ * Both SPARSE and RESERVE need the nodes_add information.
* This code supports one contigious hot add area per node.
*/
static int reserve_hotadd(int node, unsigned long start, unsigned long end)
{
unsigned long s_pfn = start >> PAGE_SHIFT;
unsigned long e_pfn = end >> PAGE_SHIFT;
- int changed = 0;
+ int ret = 0, changed = 0;
struct bootnode *nd = &nodes_add[node];
/* I had some trouble with strange memory hotadd regions breaking
@@ -235,7 +248,6 @@
/* Looks good */
- found_add_area = 1;
if (nd->start == nd->end) {
nd->start = start;
nd->end = end;
@@ -253,14 +265,12 @@
printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
}
- if ((nd->end >> PAGE_SHIFT) > end_pfn)
- end_pfn = nd->end >> PAGE_SHIFT;
+ ret = update_end_of_memory(nd->end);
if (changed)
printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
return 0;
}
-#endif
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
@@ -279,7 +289,7 @@
}
if (ma->flags.enabled == 0)
return;
- if (ma->flags.hot_pluggable && hotadd_percent == 0)
+ if (ma->flags.hot_pluggable && !save_add_info())
return;
start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
@@ -318,15 +328,13 @@
printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
nd->start, nd->end);
-#ifdef RESERVE_HOTADD
- if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
+ if (ma->flags.hot_pluggable && !reserve_hotadd(node, start, end) < 0) {
/* Ignore hotadd region. Undo damage */
printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
*nd = oldnode;
if ((nd->start | nd->end) == 0)
node_clear(node, nodes_parsed);
}
-#endif
}
/* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -342,7 +350,6 @@
unsigned long e = nodes[i].end >> PAGE_SHIFT;
pxmram += e - s;
pxmram -= e820_hole_size(s, e);
- pxmram -= nodes_add[i].end - nodes_add[i].start;
if ((long)pxmram < 0)
pxmram = 0;
}
@@ -450,3 +457,16 @@
}
EXPORT_SYMBOL(__node_distance);
+
+/* Node whose nodes_add[] range holds start (last match wins), else 0. */
+int memory_add_physaddr_to_nid(u64 start)
+{
+	int node, nid = 0;
+
+	for_each_node(node)
+		if (start >= nodes_add[node].start && start < nodes_add[node].end)
+			nid = node;
+	return nid;
+}
+
+EXPORT_SYMBOL(memory_add_physaddr_to_nid);
diff -uNr linux-2.6.17.i386.orig/drivers/acpi/acpi_memhotplug.c linux-2.6.17.i386/drivers/acpi/acpi_memhotplug.c
--- linux-2.6.17.i386.orig/drivers/acpi/acpi_memhotplug.c 2006-09-19 16:54:47.000000000 -0400
+++ linux-2.6.17.i386/drivers/acpi/acpi_memhotplug.c 2006-09-19 17:05:22.000000000 -0400
@@ -238,6 +238,8 @@
num_enabled++;
continue;
}
+ if (node < 0)
+ node = memory_add_physaddr_to_nid(info->start_addr);
result = add_memory(node, info->start_addr, info->length);
if (result)
continue;
diff -uNr linux-2.6.17.i386.orig/drivers/acpi/motherboard.c linux-2.6.17.i386/drivers/acpi/motherboard.c
--- linux-2.6.17.i386.orig/drivers/acpi/motherboard.c 2006-09-19 16:54:47.000000000 -0400
+++ linux-2.6.17.i386/drivers/acpi/motherboard.c 2006-09-19 16:55:43.000000000 -0400
@@ -87,6 +87,7 @@
}
} else {
/* Memory mapped IO? */
+ return -EINVAL;
}
if (requested_res)
@@ -96,11 +97,16 @@
static int acpi_motherboard_add(struct acpi_device *device)
{
+ acpi_status status; /* outcome of the _CRS walk below */
if (!device)
return -EINVAL;
- acpi_walk_resources(device->handle, METHOD_NAME__CRS,
+
+ status = acpi_walk_resources(device->handle, METHOD_NAME__CRS,
acpi_reserve_io_ranges, NULL);
+ if (ACPI_FAILURE(status)) /* the walk's result used to be discarded */
+ return -ENODEV;
+
return 0;
}
--- linux-2.6.18.noarch.orig/mm/Kconfig 2006-09-26 10:37:54.000000000 -0400
+++ linux-2.6.18.noarch/mm/Kconfig 2006-09-26 11:02:31.000000000 -0400
@@ -115,12 +115,15 @@
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
- depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG
+ depends on SPARSEMEM && HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
depends on (IA64 || X86 || PPC64)
-comment "Memory hotplug is currently incompatible with Software Suspend"
+comment "Memory hotplug is not guaranteed to work with Software Suspend"
depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
+config MEMORY_HOTPLUG_SPARSE
+ def_bool y
+ depends on SPARSEMEM && MEMORY_HOTPLUG
# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
# space can be handled with less contention: split it at this NR_CPUS.
Date: Fri, 29 Sep 2006 22:24:13 -0400
From: Konrad Rzeszutek <konradr@redhat.com>
Subject: [RHEL5 PATCH] RHBZ 208445 - NetLabel hot-add memory conflict pre-beta2 kernel x86_64
RHBZ#:
------
https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=208445
Description:
------------
Extra checking of the pre-Beta2 kernel with hot-add memory has
demonstrated some major bugs in the mainline and RHEL5 kernels. Just
two lines, and the box crashes after hot-add memory is done.
Not sure how to classify this - perhaps as a bug-after-feature follow-on.
Please provide ACKs - only two lines are changed, but they are
in common code paths.
RHEL Version Found:
------------------
RHEL5 pre Beta2 (2.6.18-1-2702)
Upstream Status:
----------------
This is fresh from the bakery. Being posted on LKML soon.
Test Status:
------------
Tested on IBM xSeries 2-node x460 in Beaverton. Testing of this
will be done in Westford on Monday with various memory
configurations.
Proposed Patch:
---------------
This patch is based on 2.6.18 (RHEL5 pre-Beta2) kernel.
diff -urN linux-2.6.18.x86_64/arch/x86_64/mm/srat.c linux-2.6.18.x86_64-works/arch/x86_64/mm/srat.c
--- linux-2.6.18.x86_64/arch/x86_64/mm/srat.c 2006-09-27 12:48:42.000000000 -0700
+++ linux-2.6.18.x86_64-works/arch/x86_64/mm/srat.c 2006-09-29 16:54:09.000000000 -0700
@@ -204,7 +204,7 @@
return hotadd_percent > 0;
}
#else
-int update_end_of_memory(unsigned long end) {return 0;}
+int update_end_of_memory(unsigned long end) {return -1;}
static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static inline int save_add_info(void) {return 1;}
@@ -269,7 +269,7 @@
if (changed)
printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
- return 0;
+ return ret;
}
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
@@ -328,7 +328,7 @@
printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
nd->start, nd->end);
- if (ma->flags.hot_pluggable && !reserve_hotadd(node, start, end) < 0) {
+ if (ma->flags.hot_pluggable && (reserve_hotadd(node, start, end) < 0)) {
/* Ignore hotadd region. Undo damage */
printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
*nd = oldnode;
--
Konrad Rzeszutek 1-(978)-392-3903 or 1-(617)-693-1718
IBM on-site partner.