File mcelog-0.7-newcpus-1.diff of Package mcelog

From: Andi Kleen <ak@linux.intel.com>
Subject: mcelog decoding support for Intel Tigerton

Backport of the Tigerton/Dunnington/Nehalem changes from mcelog git:
git://git.kernel.org/pub/scm/utils/cpu/mce/mcelog.git

The Tigerton support required adding Core2 support, they are all
lumped together. I also added "P6OLD" because that was in the mainline
mcelog git changes and would have been difficult to separate.
The differences to core2 are very minimal (just a few different events).
The actual decoder is all table driven.

In the original git this was done as individual changes, but I lumped
it all together in the backport.

While it adds quite a lot of new code, there are not many changes to generic
code.  Most of the new code is only used on the new CPUs.

diff -x '*~' -urpN mcelog-0.7/bitfield.c mcelog-0.7-newcpus//bitfield.c
--- mcelog-0.7/bitfield.c	1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//bitfield.c	2008-09-26 20:28:29.000000000 +0200
@@ -0,0 +1,61 @@
+#include <string.h>
+#include <stdio.h>
+#include "mcelog.h"
+#include "bitfield.h"
+
+char *reserved_3bits[8];	/* name tables with no names: every entry stays NULL, */
+char *reserved_1bit[2];		/* so decode_bitfield() prints a non-zero reserved */
+char *reserved_2bits[4];	/* field in its raw <bit:value> form */
+
+static u64 bitmask(u64 i)
+{
+	u64 mask;
+	for (mask = 1; mask < i; mask = (mask << 1) | 1)
+		;	/* widen the all-ones mask until it is at least i */
+	return mask;
+}
+
+/* Print the name of each set field of status; fields ends at a NULL str entry. */
+void decode_bitfield(u64 status, struct field *fields)
+{
+	struct field *f;
+	int linelen = 0;
+	char *delim = "";
+	char buf[60];	/* function scope: s may point here until it is printed */
+
+	for (f = fields; f->str; f++) {
+		u64 v = (status >> f->start_bit) & bitmask(f->stringlen - 1);
+		char *s = v < f->stringlen ? f->str[v] : NULL;
+		if (!s) {
+			if (v == 0)
+				continue;
+			/* No name for this value: print it raw as <bit:hex>. */
+			snprintf(buf, sizeof buf, "<%u:%Lx>", f->start_bit, v);
+			s = buf;
+		}
+		int len = strlen(s);
+		if (linelen + len > 75) {	/* start a new output line */
+			delim = "\n";
+			linelen = 0;
+		}
+		Wprintf("%s%s", delim, s);
+		delim = " ";
+		linelen += len + 1;
+	}
+	if (linelen > 0)
+		Wprintf("\n");
+}
+
+/* Print each non-zero field (bits start..end, both inclusive) as "name: value". */
+void decode_numfield(u64 status, struct numfield *fields)
+{
+	struct numfield *f;
+	for (f = fields; f->name; f++) {
+		u64 v = (status >> f->start) & ((1ULL << (f->end - f->start + 1)) - 1);
+		if (v > 0) {
+			char fmt[30];
+			snprintf(fmt, 30, "%%s: %s\n", f->fmt ? f->fmt : "%Lu");
+			Wprintf(fmt, f->name, v);
+		}
+	}
+}
diff -x '*~' -urpN mcelog-0.7/bitfield.h mcelog-0.7-newcpus//bitfield.h
--- mcelog-0.7/bitfield.h	1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//bitfield.h	2008-09-26 20:28:29.000000000 +0200
@@ -0,0 +1,27 @@
+/* Generic bitfield decoder */
+
+struct field {
+	int start_bit;	/* lowest bit of the field in the decoded value */
+	char **str;	/* names indexed by field value; a NULL entry has no name */
+	int stringlen;	/* number of entries in str; also sets the field's mask */
+};
+
+struct numfield { 
+	int start, end;	/* bit range, as passed to NUMBER()/HEXNUMBER() */
+	char *name;	/* label printed before the value; NULL ends a table */
+	char *fmt;	/* printf conversion for the value; "%Lu" is used if NULL */
+};
+
+#define FIELD(start_bit, name) { start_bit, name, NELE(name) }
+#define SBITFIELD(start_bit, string) { start_bit, ((char * [2]) { NULL, string }), 2 }
+
+#define NUMBER(start, end, name) { start, end, name, "%Lu" }
+#define HEXNUMBER(start, end, name) { start, end, name, "%Lx" }
+
+void decode_bitfield(u64 status, struct field *fields);
+void decode_numfield(u64 status, struct numfield *fields);
+
+extern char *reserved_3bits[8];
+extern char *reserved_1bit[2];
+extern char *reserved_2bits[4];
+
diff -x '*~' -urpN mcelog-0.7/core2.c mcelog-0.7-newcpus//core2.c
--- mcelog-0.7/core2.c	1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//core2.c	2008-09-26 20:21:18.000000000 +0200
@@ -0,0 +1,105 @@
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include "mcelog.h"
+#include "core2.h"
+#include "bitfield.h"
+
+/* Decode P6 family (Core2) model specific errors. 
+   The generic errors are decoded in p4.c */
+
+/* [19..24] */
+static char *bus_queue_req_type[] = {	/* NOTE(review): 31 entries decode 5 bits only; [19..24] is 6 — confirm */
+	[0] = "BQ_DCU_READ_TYPE",
+	[2] = "BQ_IFU_DEMAND_TYPE",
+	[3] = "BQ_IFU_DEMAND_NC_TYPE",
+	[4] = "BQ_DCU_RFO_TYPE",
+	[5] = "BQ_DCU_RFO_LOCK_TYPE",
+	[6] = "BQ_DCU_ITOM_TYPE",
+	[8] = "BQ_DCU_WB_TYPE",
+	[10] = "BQ_DCU_WCEVICT_TYPE",
+	[11] = "BQ_DCU_WCLINE_TYPE",
+	[12] = "BQ_DCU_BTM_TYPE",
+	[13] = "BQ_DCU_INTACK_TYPE",
+	[14] = "BQ_DCU_INVALL2_TYPE",
+	[15] = "BQ_DCU_FLUSHL2_TYPE",
+	[16] = "BQ_DCU_PART_RD_TYPE",
+	[18] = "BQ_DCU_PART_WR_TYPE",
+	[20] = "BQ_DCU_SPEC_CYC_TYPE",
+	[24] = "BQ_DCU_IO_RD_TYPE",
+	[25] = "BQ_DCU_IO_WR_TYPE",
+	[28] = "BQ_DCU_LOCK_RD_TYPE",
+	[29] = "BQ_DCU_LOCK_WR_TYPE",
+	[30] = "BQ_DCU_SPLOCK_RD_TYPE",
+};
+
+/* [25..27] */
+static char *bus_queue_error_type[] = {
+	[0] = "BQ_ERR_HARD_TYPE",
+	[1] = "BQ_ERR_DOUBLE_TYPE",
+	[2] = "BQ_ERR_AERR2_TYPE",
+	[4] = "BQ_ERR_SINGLE_TYPE",
+	[5] = "BQ_ERR_AERR1_TYPE",
+};
+
+static struct field p6_shared_status[] = {	/* bits decoded for both P6OLD and Core2 */
+	FIELD(16, reserved_3bits),
+	FIELD(19, bus_queue_req_type),
+	FIELD(25, bus_queue_error_type),
+	/* [28..29]: model specific, see p6old_status and core2_status */
+	SBITFIELD(30, "internal BINIT"),
+	SBITFIELD(36, "received parity error on response transaction"),
+	SBITFIELD(38, "timeout BINIT (ROB timeout)."
+		  " No micro-instruction retired for some time"),
+	FIELD(39, reserved_3bits),
+	SBITFIELD(42, "bus transaction received hard error response"),
+	SBITFIELD(43, "failure that caused IERR"),
+	/* The following are reserved for Core in the SDM. Let's keep them here anyways*/
+	SBITFIELD(44, "two failing bus transactions with address parity error (AERR)"),
+	SBITFIELD(45, "uncorrectable ECC error"),
+	SBITFIELD(46, "correctable ECC error"),
+	/* [47..54]: ECC syndrome */
+	FIELD(55, reserved_2bits),
+	{},
+};
+
+static struct field p6old_status[] = {	/* bits decoded only by p6old_decode_model() */
+	SBITFIELD(28, "FRC error"),
+	SBITFIELD(29, "BERR on this CPU"),
+	FIELD(31, reserved_1bit),
+	FIELD(32, reserved_3bits),
+	SBITFIELD(35, "BINIT received from external bus"),
+	SBITFIELD(37, "Received hard error response on split transaction (Bus BINIT)"),
+	{}
+};
+
+static struct field core2_status[] = {
+	SBITFIELD(28, "MCE driven"),
+	SBITFIELD(29, "MCE is observed"),
+	SBITFIELD(31, "BINIT observed"),
+	FIELD(32, reserved_2bits),
+	SBITFIELD(34, "PIC or FSB data parity error"),
+	FIELD(35, reserved_1bit),
+	SBITFIELD(37, "FSB address parity error detected"),
+	{}
+};
+
+static struct numfield p6old_status_numbers[] = { 
+	HEXNUMBER(47, 54, "ECC syndrome"),
+	{}
+};
+
+void core2_decode_model(u64 status)
+{	
+	decode_bitfield(status, p6_shared_status);	/* bits shared with older P6 */
+	decode_bitfield(status, core2_status);	/* Core2-only bits */
+	/* The ECC syndrome is normally reserved here, but let's parse it anyways: */
+	decode_numfield(status, p6old_status_numbers);
+}
+
+void p6old_decode_model(u64 status)
+{
+	decode_bitfield(status, p6_shared_status);	/* bits shared with Core2 */
+	decode_bitfield(status, p6old_status);	/* bits only the old P6 table names */
+	decode_numfield(status, p6old_status_numbers);	/* ECC syndrome */
+}
diff -x '*~' -urpN mcelog-0.7/core2.h mcelog-0.7-newcpus//core2.h
--- mcelog-0.7/core2.h	1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//core2.h	2008-09-26 20:21:18.000000000 +0200
@@ -0,0 +1,2 @@
+void core2_decode_model(u64 status);
+void p6old_decode_model(u64 status);
diff -x '*~' -urpN mcelog-0.7/dunnington.c mcelog-0.7-newcpus//dunnington.c
--- mcelog-0.7/dunnington.c	1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//dunnington.c	2008-09-26 20:24:24.000000000 +0200
@@ -0,0 +1,123 @@
+/* Copyright (c) 2008 by Intel Corp.
+   Decode Intel Xeon Processor 7400 Model (Dunnington) specific MCEs
+
+   mcelog is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public
+   License as published by the Free Software Foundation; version
+   2.
+
+   mcelog is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should find a copy of v2 of the GNU General Public License somewhere
+   on your Linux system; if not, write to the Free Software Foundation,
+   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+   Author:
+	Andi Kleen
+*/
+
+/* other files 
+
+mcelog.h CPU_DUNNINGTON
+mcelog.c: cputype name
+intel.h CASE_INTEL_CPUS
+intel.c model == 0x1d CPU_DUNNINGTON
+p4.c: if (cpu == CPU_DUNNINGTON) dunnington_decode_model(log->status);
+      add to CORE2 cases
+
+*/
+
+#include <stddef.h>
+#include "mcelog.h"
+#include "bitfield.h"
+#include "dunnington.h"
+
+/* Follows Intel IA32 SDM 3b Appendix E.2.1 ++ */
+
+static struct field dunnington_bus_status[] = {
+	SBITFIELD(16, "Parity error detected during FSB request phase"),
+	FIELD(17, reserved_3bits),
+	SBITFIELD(20, "Hard Failure response received for a local transaction"),
+	SBITFIELD(21, "Parity error on FSB response field detected"),
+	SBITFIELD(22, "Parity data error on inbound data detected"),
+	FIELD(23, reserved_3bits),
+	FIELD(25, reserved_3bits),
+	FIELD(28, reserved_3bits),
+	FIELD(31, reserved_1bit),
+	{}
+};
+
+static char *dnt_front_error[0xf] = {
+	[0x1] = "Inclusion error from core 0",
+	[0x2] = "Inclusion error from core 1",
+	[0x3] = "Write Exclusive error from core 0",
+	[0x4] = "Write Exclusive error from core 1",
+	[0x5] = "Inclusion error from FSB",
+	[0x6] = "SNP stall error from FSB",
+	[0x7] = "Write stall error from FSB",
+	[0x8] = "FSB Arbiter Timeout error",
+	[0xA] = "Inclusion error from core 2",
+	[0xB] = "Write exclusive error from core 2",
+};
+
+static char *dnt_int_error[0xf] = {
+	[0x2] = "Internal timeout error",
+	[0x3] = "Internal timeout error",
+	[0x4] = "Intel Cache Safe Technology Queue full error\n"
+	        "or disabled ways in a set overflow",
+	[0x5] = "Quiet cycle timeout error (correctable)",
+};
+
+struct field dnt_int_status[] = {
+	FIELD(8, dnt_int_error),
+	{}
+};
+
+struct field dnt_front_status[] = {
+	FIELD(0, dnt_front_error),
+	{}
+};
+
+struct field dnt_cecc[] = {
+	SBITFIELD(1, "Correctable ECC event on outgoing core 0 data"),
+	SBITFIELD(2, "Correctable ECC event on outgoing core 1 data"),
+	SBITFIELD(3, "Correctable ECC event on outgoing core 3 data"),
+	{}
+};
+
+struct field dnt_uecc[] = {
+	SBITFIELD(1, "Uncorrectable ECC event on outgoing core 0 data"),
+	SBITFIELD(2, "Uncorrectable ECC event on outgoing core 1 data"),
+	SBITFIELD(3, "Uncorrectable ECC event on outgoing core 3 data"),
+	{}
+};
+
+static void dunnington_decode_bus(u64 status)
+{
+	decode_bitfield(status, dunnington_bus_status);	/* table covers bits 16..31 */
+}
+
+static void dunnington_decode_internal(u64 status)
+{
+	u32 mca = (status >> 16) & 0xffff;	/* model specific code; the tables index into it */
+	if ((mca & 0xfff0) == 0)		/* only bits 0..3 may be set */
+		decode_bitfield(mca, dnt_front_status);
+	else if ((mca & 0xf0ff) == 0)		/* only bits 8..11 may be set */
+		decode_bitfield(mca, dnt_int_status);
+	else if ((mca & 0xfff0) == 0xc000)
+		decode_bitfield(mca, dnt_cecc);
+	else if ((mca & 0xfff0) == 0xe000)
+		decode_bitfield(mca, dnt_uecc);
+}
+
+void dunnington_decode_model(u64 status)
+{
+	switch (status & 0xffff) {	/* low 16 bits select the decoder */
+	case 0xe0f: dunnington_decode_bus(status); break;
+	case 1 << 10: dunnington_decode_internal(status); break;
+	}
+}
+
diff -x '*~' -urpN mcelog-0.7/dunnington.h mcelog-0.7-newcpus//dunnington.h
--- mcelog-0.7/dunnington.h	1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//dunnington.h	2008-09-26 20:24:24.000000000 +0200
@@ -0,0 +1,2 @@
+void dunnington_decode_model(u64 status);
+
diff -x '*~' -urpN mcelog-0.7/intel.c mcelog-0.7-newcpus//intel.c
--- mcelog-0.7/intel.c	1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//intel.c	2008-09-26 20:32:52.000000000 +0200
@@ -0,0 +1,22 @@
+#include "mcelog.h"
+#include "intel.h"
+#include <stdio.h>
+
+enum cputype select_intel_cputype(int family, int model)
+{
+	if (family == 15)	/* every family 15 model is decoded as P4 */
+		return CPU_P4;
+	if (family == 6 && model < 0xf)
+		return CPU_P6OLD;
+	if (family == 6) {
+		switch (model) {
+		case 0xf: case 0x17: return CPU_CORE2;	/* Merom/Penryn */
+		case 0x1d: return CPU_DUNNINGTON;
+		case 0x1a: return CPU_NEHALEM;
+		default: break;
+		}
+	}
+	/* Unknown CPU: warn, but family 6 still gets the old P6 decoder. */
+	fprintf(stderr, "Unknown Intel CPU type family %x model %x\n", family, model);
+	return family == 6 ? CPU_P6OLD : CPU_GENERIC;
+}
diff -x '*~' -urpN mcelog-0.7/intel.h mcelog-0.7-newcpus//intel.h
--- mcelog-0.7/intel.h	1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//intel.h	2008-09-26 20:32:00.000000000 +0200
@@ -0,0 +1,9 @@
+enum cputype select_intel_cputype(int family, int model);
+
+#define CASE_INTEL_CPUS \
+	case CPU_P6OLD: \
+	case CPU_CORE2: \
+	case CPU_NEHALEM: \
+	case CPU_DUNNINGTON: \
+	case CPU_P4
+
diff -x '*~' -urpN mcelog-0.7/Makefile mcelog-0.7-newcpus//Makefile
--- mcelog-0.7/Makefile	2006-05-03 08:55:54.000000000 +0200
+++ mcelog-0.7-newcpus//Makefile	2008-09-26 21:07:21.000000000 +0200
@@ -5,7 +5,8 @@ all: mcelog
 
 .PHONY: install clean
 
-mcelog: p4.o k8.o mcelog.o dmi.o
+mcelog: p4.o k8.o mcelog.o dmi.o core2.o dunnington.o nehalem.o \
+	bitfield.o intel.o
 
 p4.o: p4.c mcelog.h p4.h
 k8.o: k8.c mcelog.h k8.h
@@ -18,7 +19,8 @@ install: mcelog.c
 	echo "call mcelog regularly from your crontab"
 
 clean:
-	rm -f mcelog mcelog.o k8.o p4.o dmi.o dmi
+	rm -f mcelog mcelog.o k8.o p4.o dmi.o dmi core2.o dunnington.o \
+		nehalem.o bitfield.o intel.o
 
 dmi:	dmi.c
 	gcc -o dmi ${CFLAGS} -DSTANDALONE dmi.c ${LDFLAGS}
diff -x '*~' -urpN mcelog-0.7/mcelog.8 mcelog-0.7-newcpus//mcelog.8
--- mcelog-0.7/mcelog.8	2006-05-03 08:55:54.000000000 +0200
+++ mcelog-0.7-newcpus//mcelog.8	2008-09-26 20:42:44.000000000 +0200
@@ -2,9 +2,9 @@
 .SH NAME
 mcelog \- Print machine check log from x86-64 kernel.
 .SH SYNOPSIS
-mcelog [\-\-syslog] [\-\-k8|\-\-p4|\-\-generic] [\-\-ignorenodev] [\-\-dmi] [\-\-filter] [device]
+mcelog [\-\-syslog] [\-\-k8|\-\-p4|\-\-generic|...] [\-\-ignorenodev] [\-\-dmi] [\-\-filter] [device]
 .br
-mcelog [\-\-k8|\-\-p4|\-\-generic] \-\-ascii
+mcelog [\-\-k8|\-\-p4|\-\-generic|...] \-\-ascii
 .SH DESCRIPTION
 Linux x86-64 kernels since 2.6.4 don't print recoverable machine check errors
 to the kernel log anymore. Instead they are saved into a special 
@@ -18,13 +18,21 @@ When the
 .B \-\-syslog
 option is specified redirect output to system log.
 
+
 When
 .B \-\-k8
 is specified assume the events are for a AMD Opteron or Athlon 64 or Athlon
 FX CPU. 
 With
 .B \-\-p4
-is specified assume the events are for a Intel Pentium 4 or Intel Xeon.
+is specified assume the events are for a Intel Pentium 4 or Intel (older) Xeon.
+With 
+.B \-\-core2
+assume the events are for an Intel Core2 CPU or Intel Xeon 3000, 3200, 5100, 5300, 7300 
+series. When
+.B \-\-intel-cpu=family,model
+is specified the events are decoded for the Intel CPU with the given family
+number and model number (both can be found in /proc/cpuinfo).
 When 
 .B \-\-generic 
 all the fields are dumped without CPU specific decoding.
diff -x '*~' -urpN mcelog-0.7/mcelog.c mcelog-0.7-newcpus//mcelog.c
--- mcelog-0.7/mcelog.c	2006-05-03 08:55:54.000000000 +0200
+++ mcelog-0.7-newcpus//mcelog.c	2008-09-26 20:45:50.000000000 +0200
@@ -31,12 +31,10 @@
 #include "k8.h"
 #include "p4.h"
 #include "dmi.h"
+#include "intel.h"
 
-enum {
-	CPU_GENERIC,
-	CPU_K8,
-	CPU_P4
-} cpu = CPU_GENERIC;	
+
+enum cputype cpu = CPU_GENERIC;	
 
 char *logfn = "/dev/mcelog";
 
@@ -62,8 +60,8 @@ char *bankname(unsigned bank)
 	switch (cpu) { 
 	case CPU_K8:
 		return k8_bank_name(bank);
-	case CPU_P4:
-		return p4_bank_name(bank);
+	CASE_INTEL_CPUS:
+		return intel_bank_name(bank);
 	/* add banks of other cpu types here */
 	default:
 		sprintf(numeric, "BANK %d", bank); 
@@ -98,7 +96,7 @@ int mce_filter(struct mce *m)
 	case CPU_K8:
 		return mce_filter_k8(m);
 		/* add more buggy CPUs here */
-	case CPU_P4:
+	CASE_INTEL_CPUS:
 		/* No bugs known */
 		return 1;
 	default:
@@ -134,8 +132,8 @@ void dump_mce(struct mce *m)
 	case CPU_K8:
 		decode_k8_mc(m); 
 		break;
-	case CPU_P4:
-		decode_p4_mc(m);
+	CASE_INTEL_CPUS:
+		decode_intel_mc(m, cpu);
 		break;
 	/* add handlers for other CPUs here */
 	default:
@@ -153,23 +151,27 @@ void check_cpu(void)
 	if (f != NULL) { 
 		int found = 0; 
 		int family; 
+		int model;
 		char vendor[64];
 		char *line = NULL;
 		size_t linelen = 0; 
-		while (getdelim(&line, &linelen, '\n', f) > 0 && found < 2) { 
+		while (getdelim(&line, &linelen, '\n', f) > 0 && found < 3) { 
 			if (sscanf(line, "vendor_id : %63[^\n]", vendor) == 1) 
 				found++; 
 			if (sscanf(line, "cpu family : %d", &family) == 1)
 				found++;
+			if (sscanf(line, "model : %d", &model) == 1)
+				found++;
 		} 
-		if (found == 2) {
+		if (found == 3) {
 			if (!strcmp(vendor,"AuthenticAMD") && family == 15)
 				cpu = CPU_K8;
-			if (!strcmp(vendor,"GenuineIntel") && family == 15)
-				cpu = CPU_P4;
+			if (!strcmp(vendor,"GenuineIntel"))
+				cpu = select_intel_cputype(family, model);
 			/* Add checks for other CPUs here */	
 		} else {
-			fprintf(stderr, "mcelog: warning: Cannot parse /proc/cpuinfo\n"); 
+			fprintf(stderr, 
+			"mcelog: warning: Cannot parse /proc/cpuinfo\n"); 
 		} 
 		fclose(f);
 		free(line);
@@ -303,9 +305,11 @@ void usage(void)
 {
 	fprintf(stderr, 
 		"Usage:\n"
-		"  mcelog [--k8|--p4|--generic] [--ignorenodev] [--dmi] [--syslog] [--filter] [mcelogdevice]\n"
+		"  mcelog options [--ignorenodev] [--dmi] [--syslog] [--filter] [mcelogdevice]\n"
 		"Decode machine check error records from kernel\n"
-		"  mcelog [--k8|--p4|--generic] [--dmi] --ascii < log\n"
+		"  mcelog options [--dmi] --ascii < log\n"
+		"Options:\n"
+		"--p4|--k8|--core2|--generic|--intel-cpu=family,model Set CPU type to decode\n"
 		"Decode machine check ASCII output from kernel logs\n");
 	exit(1);
 }
@@ -318,6 +322,17 @@ int modifier(char *s)
 		cpu = CPU_P4;
 	} else if (!strcmp(s, "--generic")) { 
 		cpu = CPU_GENERIC;
+	} else if (!strcmp(s, "--core2")) { 
+		cpu = CPU_CORE2;
+	} else if (!strncmp(s, "--intel-cpu=", 12)) { 
+		unsigned fam, mod;
+		if (sscanf(s + 12, "%i,%i", &fam, &mod) != 2)
+			usage();
+		cpu = select_intel_cputype(fam, mod);
+		if (cpu == CPU_GENERIC) {
+			fprintf(stderr, "Unknown Intel CPU\n");
+			usage();
+		}
 	} else if (!strcmp(s, "--ignorenodev")) { 
 		ignore_nodev = 1;
 	} else if (!strcmp(s,"--filter")) { 
diff -x '*~' -urpN mcelog-0.7/mcelog.h mcelog-0.7-newcpus//mcelog.h
--- mcelog-0.7/mcelog.h	2006-05-03 08:55:54.000000000 +0200
+++ mcelog-0.7-newcpus//mcelog.h	2008-09-26 20:28:19.000000000 +0200
@@ -61,3 +61,13 @@ struct mce {
 #endif
 
 void Wprintf(char *fmt, ...) PRINTFLIKE;
+
+enum cputype {		/* selects which CPU specific decoder is used */
+	CPU_GENERIC,	/* no CPU specific decoding */
+	CPU_K8,
+	CPU_P4,
+	CPU_NEHALEM,
+	CPU_DUNNINGTON,
+	CPU_P6OLD,
+	CPU_CORE2,
+};
diff -x '*~' -urpN mcelog-0.7/nehalem.c mcelog-0.7-newcpus//nehalem.c
--- mcelog-0.7/nehalem.c	1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//nehalem.c	2008-09-26 20:24:20.000000000 +0200
@@ -0,0 +1,163 @@
+/* Copyright (C) 2008 Intel Corporation
+   Decode Intel Nehalem specific machine check errors.
+
+   mcelog is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public
+   License as published by the Free Software Foundation; version
+   2.
+
+   mcelog is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should find a copy of v2 of the GNU General Public License somewhere
+   on your Linux system; if not, write to the Free Software Foundation, 
+   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 
+
+   Author: Andi Kleen 
+*/
+
+/* other files 
+
+mcelog.h CPU_NEHALEM
+intel.h CASE_INTEL_CPUS
+intel.c model == 0x1a CPU_NEHALEM
+p4.c: if (cpu == CPU_NEHALEM) nehalem_decode_model(log->status, log->misc);
+      if (test_prefix(status, 7)) decode_memory_controller(log->status);
+mcelog.c/p4.c:  syslog/trigger for memory controller
+      cputype_name
+*/
+
+#include <string.h>
+#include <stdio.h>
+#include "mcelog.h"
+#include "nehalem.h"
+#include "core2.h"
+#include "bitfield.h"
+
+/* See IA32 SDM Vol3B Appendix E.3.2 ff */
+
+/* MC1_STATUS error */
+static struct field qpi_status[] = {
+	SBITFIELD(16, "QPI header had bad parity"),
+	SBITFIELD(17, "QPI Data packet had bad parity"),
+	SBITFIELD(18, "Number of QPI retries exceeded"),
+	SBITFIELD(19, "Received QPI data packet that was poisoned by sender"),
+	SBITFIELD(20, "QPI reserved 20"),
+	SBITFIELD(21, "QPI reserved 21"),
+	SBITFIELD(22, "QPI received unsupported message encoding"),
+	SBITFIELD(23, "QPI credit type is not supported"),
+	SBITFIELD(24, "Sender sent too many QPI flits to the receiver"),
+	SBITFIELD(25, "QPI Sender sent a failed response to receiver"),
+	SBITFIELD(26, "Clock jitter detected in internal QPI clocking"),
+	{}
+}; 
+
+static struct field qpi_misc[] = {
+	SBITFIELD(14, "QPI misc reserved 14"),
+	SBITFIELD(15, "QPI misc reserved 15"),
+	SBITFIELD(24, "QPI Interleave/Head Indication Bit (IIB)"),
+	{}
+};
+
+static struct numfield qpi_numbers[] = {
+	HEXNUMBER(0, 7, "QPI class and opcode of packet with error"),
+	HEXNUMBER(8, 13, "QPI Request Transaction ID"),
+	NUMBER(16, 18, "QPI Requestor/Home Node ID (RHNID)"),
+	HEXNUMBER(19, 23, "QPI miscreserved 19-23"),
+	{} };	/* empty entry: decode_numfield() stops at the NULL name */
+
+static struct field memory_controller_status[] = {
+	SBITFIELD(16, "Memory read ECC error"),
+	SBITFIELD(17, "Memory ECC error occurred during scrub"),
+	SBITFIELD(18, "Memory write parity error"),
+	SBITFIELD(19, "Memory error in half of redundant memory"),
+	SBITFIELD(20, "Memory reserved 20"),
+	SBITFIELD(21, "Memory access out of range"),
+	SBITFIELD(22, "Memory internal RTID invalid"), 
+	SBITFIELD(23, "Memory address parity error"),
+	SBITFIELD(24, "Memory byte enable parity error"),
+	{}
+};
+
+static struct numfield memory_controller_numbers[] = {
+	HEXNUMBER(0, 7, "Memory transaction Tracker ID (RTId)"),
+	HEXNUMBER(8, 15, "Memory MISC reserved 8..15"),
+	NUMBER(16, 17, "Memory DIMM ID of error"),
+	NUMBER(18, 19, "Memory channel ID of error"),
+	HEXNUMBER(32, 63, "Memory ECC syndrome"),	/* NOTE(review): overlaps the 38..52 and 53..56 entries below; */
+	HEXNUMBER(25, 37, "Memory MISC reserved 25..37"),	/* looks like fields of two registers share this table — confirm */
+	NUMBER(38, 52, "Memory corrected error count (CORE_ERR_CNT)"),
+	HEXNUMBER(53, 56, "Memory MISC reserved 53..56"),
+	{}
+};
+
+static char *internal_errors[] = {
+	[0x0]  = "No Error",
+	[0x3]  = "Reset firmware did not complete",
+	[0x8]  = "Received an invalid CMPD",
+	[0xa]  = "Invalid Power Management Request",
+	[0xd]  = "Invalid S-state transition",
+	[0x11] = "VID controller does not match POC controller selected",
+	[0x1a] = "MSID from POC does not match CPU MSID",
+};
+
+static struct field internal_error_status[] = {
+	FIELD(24, internal_errors),
+	{}
+};
+
+static struct numfield internal_error_numbers[] = { 
+	HEXNUMBER(16, 23, "Internal machine check status reserved 16..23"),
+	HEXNUMBER(32, 56, "Internal machine check status reserved 32..56"),
+	{},
+};
+
+/* Generic architectural memory controller encoding */
+
+static char *mmm_mnemonic[] = { 
+	"GEN", "RD", "WR", "AC", "MS", "RES5", "RES6", "RES7" 
+};
+static char *mmm_desc[] = { 
+	"Generic undefined request",
+	"Memory read error",
+	"Memory write error",
+	"Address/Command error",
+	"Memory scrubbing error",
+	"Reserved 5",
+	"Reserved 6",
+	"Reserved 7"
+};
+
+void decode_memory_controller(u32 status)
+{
+	/* Low nibble is the channel (0xf: not specified), bits 4..6 the request type. */
+	unsigned chan = status & 0xf;
+	unsigned mmm = (status >> 4) & 7;
+	char channel[30] = "unspecified";
+
+	if (chan != 0xf)
+		sprintf(channel, "%u", chan);
+	Wprintf("MEMORY CONTROLLER %s_CHANNEL%s_ERR\n", mmm_mnemonic[mmm], channel);
+	Wprintf("Transaction: %s\n", mmm_desc[mmm]);
+	Wprintf("Channel: %s\n", channel);
+}
+
+/* Run the Core2 decoder, then the Nehalem tables chosen by the MCA code (bits 0..15). */
+void nehalem_decode_model(u64 status, u64 misc)
+{
+	u32 mca = status & 0xffff;
+	core2_decode_model(status);
+	if ((mca >> 11) == 1) { 	/* bus and interconnect QPI */
+		decode_bitfield(status, qpi_status);
+		decode_numfield(misc, qpi_numbers);	/* like qpi_misc, these are MISC fields */
+		decode_bitfield(misc, qpi_misc);
+	} else if (mca == 0x0001) { /* internal unspecified */
+		decode_bitfield(status, internal_error_status);
+		decode_numfield(status, internal_error_numbers);
+	} else if ((mca >> 7) == 1) { /* memory controller: 0000 0000 1MMM CCCC */
+		decode_bitfield(status, memory_controller_status);
+		decode_numfield(status, memory_controller_numbers); /* NOTE(review): table also lists MISC fields — confirm register */
+	}
+}
diff -x '*~' -urpN mcelog-0.7/nehalem.h mcelog-0.7-newcpus//nehalem.h
--- mcelog-0.7/nehalem.h	1970-01-01 01:00:00.000000000 +0100
+++ mcelog-0.7-newcpus//nehalem.h	2008-09-26 20:24:20.000000000 +0200
@@ -0,0 +1,2 @@
+void nehalem_decode_model(u64 status, u64 misc);
+void decode_memory_controller(u32 status);
diff -x '*~' -urpN mcelog-0.7/p4.c mcelog-0.7-newcpus//p4.c
--- mcelog-0.7/p4.c	2006-05-03 08:55:54.000000000 +0200
+++ mcelog-0.7-newcpus//p4.c	2008-09-26 20:34:41.000000000 +0200
@@ -1,7 +1,6 @@
 /* Copyright (c) 2005 by Intel Corp.
 
-   Decode IA32/x86-64 machine check for Pentium 4, Intel Xeon
-   or EM64T.
+   Decode Intel machine check (generic and P4 specific)
 
    mcelog is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public
@@ -19,12 +18,17 @@
    
    Authors:
         Racing Guo <racing.guo@intel.com>
+	Andi Kleen
 */
-
+  
 #include <stdio.h>
 #include "mcelog.h"
+#include "p4.h"
+#include "core2.h"
+#include "nehalem.h"
+#include "dunnington.h"
 
-/* decode mce for P4/Xeon family */
+/* decode mce for P4/Xeon and Core2 family */
 
 static inline int test_prefix(int nr, __u32 value)
 {
@@ -73,13 +77,12 @@ static char* get_RRRR_str(__u8 rrrr)
 	}
 
 	return "UNKNOWN";
-	
 }
 
 static char* get_PP_str(__u8 pp)
 {
 	static char* PP[] = {
-		"Originated-request",
+		"Local-CPU-originated-request",
 		"Responed-to-request",
 		"Observed-error-as-third-party",
 		"Generic"
@@ -112,7 +115,7 @@ static char* get_II_str(__u8 i)
 	return II[i];
 }
 
-static int decode_mca(__u32 mca, char *buf, int len)
+static void decode_mca(__u32 mca)
 {
 #define TLB_LL_MASK      0x3  /*bit 0, bit 1*/
 #define TLB_LL_SHIFT     0x0
@@ -137,64 +140,59 @@ static int decode_mca(__u32 mca, char *b
 #define BUS_PP_MASK      0x600 /*bit 9, bit 10*/
 #define BUS_PP_SHIFT     0x9
 
-	mca = mca & 0xFFFF;
+	static char *msg[] = {
+		[0] = "No Error",
+		[1] = "Unclassified",
+		[2] = "Microcode ROM parity error",
+		[3] = "External error",
+		[4] = "FRC error",
+	};
+
+	if (mca & (1UL << 12)) {
+		Wprintf("corrected filtering (some unreported errors in same region)\n");
+		mca &= ~(1UL << 12);
+	}
 
-	switch(mca) {
-	case 0x0:
-		return snprintf(buf, len, "%s", "No Error");
-		break;
-	case 0x1:
-		return snprintf(buf, len, "%s", "Unclassified");
-		break;
-	case 0x2:
-		return snprintf(buf, len, "%s", "Microcode ROM Parity Error");
-		break;
-	case 0x3:
-		return snprintf(buf, len, "%s", "External Error");
-		break;
-	case 0x4:
-		return snprintf(buf, len, "%s", "FRC Error");
-		break;
-	default:
-		break;
+	if (mca < NELE(msg)) {
+		Wprintf("%s\n", msg[mca]); 
+		return;
 	}
 
-	if (test_prefix(4, mca)) {
-		return snprintf(buf, len, "%s TLB %s Error",
+	if ((mca >> 2) == 3) { 
+		Wprintf("%s Generic memory hierarchy error\n", get_LL_str(mca & 3));
+	} else if (test_prefix(4, mca)) {
+		Wprintf("%s TLB %s Error\n",
 				get_TT_str((mca & TLB_TT_MASK) >> TLB_TT_SHIFT),
 				get_LL_str((mca & TLB_LL_MASK) >> 
 					    TLB_LL_SHIFT));
-	}
-	if (test_prefix(8, mca)) {
-		return snprintf(buf, len, "%s CACHE %s %s Error", 
+	} else if (test_prefix(8, mca)) {
+		Wprintf("%s CACHE %s %s Error\n", 
 				get_TT_str((mca & CACHE_TT_MASK) >> 
 					    CACHE_TT_SHIFT),
 				get_LL_str((mca & CACHE_LL_MASK) >> 
 					    CACHE_LL_SHIFT),
 				get_RRRR_str((mca & CACHE_RRRR_MASK) >> 
 					      CACHE_RRRR_SHIFT));
-	}
-	if (test_prefix(10, mca)) {
+	} else if (test_prefix(10, mca)) {
 		if (mca == 0x400)
-			return snprintf(buf, len, "Internal Timer error");
+			Wprintf("Internal Timer error\n");
 		else
-			return snprintf(buf, len, 
-					"Internal unclassified errors");
-	}
-	if (test_prefix(11, mca)) {
-
-		return snprintf(buf, len, "BUS %s %s %s %s %s Error",
+			Wprintf("Internal unclassified error: %x\n", mca & 0xffff);
+	} else if (test_prefix(11, mca)) {
+		Wprintf("BUS %s %s %s %s %s Error\n",
 				get_LL_str((mca & BUS_LL_MASK) >> BUS_LL_SHIFT),
 				get_PP_str((mca & BUS_PP_MASK) >> BUS_PP_SHIFT),
 				get_RRRR_str((mca & BUS_RRRR_MASK) >> 
 					      BUS_RRRR_SHIFT),
 				get_II_str((mca & BUS_II_MASK) >> BUS_II_SHIFT),
 				get_T_str((mca & BUS_T_MASK) >> BUS_T_SHIFT));
-	}
-	return snprintf(buf, len, "Unknown Error");
+	} else if (test_prefix(7, mca)) {
+		decode_memory_controller(mca);
+	} else 
+		Wprintf("Unknown Error %x\n", mca);
 }
 
-static void decode_model(__u32 model)
+static void p4_decode_model(__u32 model)
 {
 	static struct {
 		int value;
@@ -219,17 +217,27 @@ static void decode_model(__u32 model)
 	Wprintf("\n");
 }
 
-static void decode_mci(__u64 status)
+static void decode_tracking(u64 track, int cpu)
 {
-#define BUF_LEN 200
-	char buf[BUF_LEN];
-	__u32 mca;
+	static char *msg[] = { 
+		[1] = "green", 
+		[2] = "yellow\n"
+"Large number of corrected errors. System operating, but you should\n"
+"schedule it for service within a few weeks",
+		[3] ="res3" };
+	if (track) {
+		Wprintf("Threshold based error status: %s\n", msg[track]);
+		if (track == 2)
+			Wprintf(
+    "CPU %d has large number of corrected errors. Consider replacement", cpu);
+	}
+}
 
+static void decode_mci(__u64 status, int cpu)
+{
 	Wprintf("MCi status:\n");
-	if (!(status & MCI_STATUS_VAL)) {
-		Wprintf("Invalid log\n");
-		return;
-	}
+	if (!(status & MCI_STATUS_VAL))
+		Wprintf("Machine check not valid\n");
 
 	if (status & MCI_STATUS_OVER)
 		Wprintf("Error overflow\n");
@@ -249,15 +257,9 @@ static void decode_mci(__u64 status)
 	if (status & MCI_STATUS_PCC)
 		Wprintf("Processor context corrupt\n");
 
-	mca = status & 0xFFFFL;
-	decode_mca(mca, buf, BUF_LEN);
-	Wprintf("MCA:%s\n", buf);	
-	
-	if (test_prefix(11, mca)) {
-		__u32 model;
-		model = (status & 0xFFFF0000L);
-		decode_model(model);
-	}
+	decode_tracking((status >> 54) & 3, cpu);
+	Wprintf("MCA: ");
+	decode_mca(status & 0xffffL);
 }
 
 static void decode_mcg(__u64 mcgstatus)
@@ -272,13 +274,36 @@ static void decode_mcg(__u64 mcgstatus)
 	Wprintf("\n");
 }
 
-void decode_p4_mc(struct mce *log)
+void decode_intel_mc(struct mce *log, int cputype)
 {
+	int cpu = log->cpu;
+
 	decode_mcg(log->mcgstatus);
-	decode_mci(log->status);
+	decode_mci(log->status, cpu);
+
+	if (test_prefix(11, (log->status & 0xffffL))) {
+		switch (cputype) {
+		case CPU_P6OLD:
+			p6old_decode_model(log->status);
+			break;
+		case CPU_DUNNINGTON:
+		case CPU_CORE2:
+			core2_decode_model(log->status);
+			break;
+		case CPU_P4:
+			p4_decode_model(log->status & 0xffff0000L);
+			break;
+		case CPU_NEHALEM:
+			nehalem_decode_model(log->status, log->misc);
+			break;
+		}
+	}
+
+	if (cputype == CPU_DUNNINGTON)
+		dunnington_decode_model(log->status);
 }
 
-char *p4_bank_name(int num)
+char *intel_bank_name(int num)
 {
 	static char bname[64];
 	sprintf(bname, "BANK %d", num);
diff -x '*~' -urpN mcelog-0.7/p4.h mcelog-0.7-newcpus//p4.h
--- mcelog-0.7/p4.h	2006-05-03 08:55:54.000000000 +0200
+++ mcelog-0.7-newcpus//p4.h	2008-09-26 20:35:46.000000000 +0200
@@ -1,2 +1,2 @@
-char *p4_bank_name(int num);
-void decode_p4_mc(struct mce* mce);
+char *intel_bank_name(int num);
+void decode_intel_mc(struct mce *log, int cpu);
openSUSE Build Service is sponsored by