Discussion:
Locate your faulty DIMM with Pike
Peter Bortas
2015-01-16 10:50:23 UTC
Permalink
There is too much devel-near code on this list and not enough
user-level hacks and questions. So I shall make an effort to post
trivial hacks here when I remember.

This hack identifies which DIMM needs to be replaced due to
reported ECC errors. It does so by crudely parsing the mcelog and the
DMI table, and it does it because mcelog cannot parse it itself for
these CPUs/motherboards yet.

Example output:

# ./locate_dimm.pike n173
Defaulting to finding DIMM for last logged error.
Last error on address 0x514a30400
handle: 0x53
DIMM_C1

# ./locate_dimm.pike n173 0x856fa4570
Locating DIMM for address 0x856fa4570
handle: 0x5b
DIMM_E1


#!/usr/bin/env pike

//! Print the command line help and terminate the script with exit
//! status 1.
//!
//! @param command
//!   Name the script was started as; substituted for the %s in the
//!   help text.
void usage(string command)
{
  // The node name is mandatory while the address may be left out, so
  // the synopsis marks them with <> and [] respectively.
  string help = "Usage: %s <node name> [memory address]\n\n"
    "If no address is specified the last entry in the mcelog "
    "will be used.\n"
    "address is interpreted as a hex value if prefixed with \"0x\"\n";

  exit(1, help, command);
}

//! Run a command on a remote host through ssh.
//!
//! @param host
//!   Host name passed to ssh.
//! @param rcmd
//!   Command line passed to ssh for execution on @[host].
//!
//! @returns
//!   The stdout output collected by Process.run().
//!
//! If the process reports a non-zero exit code the script terminates
//! with that same code after printing a diagnostic.
string ssh_run(string host, string rcmd)
{
  mapping res = Process.run( ({ "ssh", host, rcmd }) );
  if(res->exitcode) {
    // The format holds four %O directives, in order: the command, the
    // host, the captured stdout and the captured stderr.
    exit(res->exitcode,
         "Failed to run %O on %O\nstdout: %O\nstderr: %O\n",
         rcmd, host, res->stdout, res->stderr);
  }
  return res->stdout;
}

//! Fetch the address of the most recently logged memory error on a node.
//!
//! Runs a pipeline on the node that takes the lines of /var/log/mcelog
//! containing "ADDR", drops those containing "register valid", prints
//! the fourth field of each and keeps only the last one.
//!
//! @param node
//!   Host to query through ssh_run().
//!
//! @returns
//!   The selected field parsed as a hexadecimal number.
//!
//! Terminates the script with exit status 1 if no hexadecimal value
//! could be parsed, e.g. because the log holds no matching entry.
int get_last_error(string node)
{
  string lasterror =
    ssh_run(node, "grep ADDR /var/log/mcelog | grep "
            "-v 'register valid' | awk '{print $4}' | tail -1") - "\n";

  int error_addr;
  // Without this check an empty result would be reported as address 0
  // and then resolved to a DIMM as if it were a real error.
  if(sscanf(lasterror, "%x", error_addr) != 1)
    exit(1, "No error address found in /var/log/mcelog on %O\n", node);
  return error_addr;
}

//! Search a dmidecode dump for the record whose address range contains
//! the given address and return its "Physical Device Handle".
//!
//! @param dmidump
//!   Complete text output of dmidecode.
//! @param addr
//!   Address to look up.
//!
//! @returns
//!   The handle read from the "Physical Device Handle" line of the
//!   first record whose "Starting Address".."Ending Address" range
//!   (both ends inclusive) contains @[addr].
//!
//! Terminates the script with exit status 1 if no record matches.
int get_physmem_handle(string dmidump, int addr)
{
  // These two only serve as sscanf targets for the header line.
  int record_handle, record_type;
  int range_first = -1, range_last = -1, device_handle;

  foreach(dmidump/"\n", string row) {
    if(sscanf(row, "Handle 0x%x, DMI type %d,%*s", record_handle,
              record_type) == 3) {
      // A new record begins: forget whatever the previous one provided.
      range_first = -1;
      range_last = -1;
      device_handle = 0;
      continue;
    }

    // Each call only assigns its variable when the row has that field.
    sscanf(row, "\tStarting Address: 0x%x", range_first);
    sscanf(row, "\tEnding Address: 0x%x", range_last);
    sscanf(row, "\tPhysical Device Handle: 0x%x", device_handle);

    // Only test the range once all three fields of the record are known.
    if(range_first > -1 && range_last > -1 && device_handle &&
       range_first <= addr && addr <= range_last)
      return device_handle;
  }

  exit(1, "Unable to locate physmem handle for address 0x%x\n", addr);
}

//! Look up the "Locator" string of the DMI type 17 record that carries
//! the given handle.
//!
//! @param dmidump
//!   Complete text output of dmidecode.
//! @param physmem_handle
//!   Handle of the record to find.
//!
//! @returns
//!   The text following "Locator: " in the matching record.
//!
//! Terminates the script with exit status 1 if no such record exists.
string get_dimm_name(string dmidump, int physmem_handle)
{
  int current_handle, current_type;
  string locator;

  foreach(dmidump/"\n", string row) {
    // Header rows update the current record; on any other row the
    // values from the latest header stay in effect.
    sscanf(row, "Handle 0x%x, DMI type %d,%*s", current_handle, current_type);

    if(current_type != 17)
      continue;
    // The leading tab keeps "Bank Locator:" rows from matching.
    if(sscanf(row, "\tLocator: %s", locator) != 1)
      continue;
    if(current_handle == physmem_handle)
      return locator;
  }

  exit(1, "Unable to locate DIMM for handle 0x%x (%d)\n",
       physmem_handle, physmem_handle);
}

//! Resolve an address on a node to the name of the DIMM backing it.
//!
//! Fetches the dmidecode output from the node once and uses it both to
//! map the address to a handle and to map that handle to a DIMM name.
//! The handle is written to stderr on the way.
//!
//! @param node
//!   Host to run dmidecode on through ssh_run().
//! @param addr
//!   Address to resolve.
//!
//! @returns
//!   The name returned by get_dimm_name().
string get_dimm_for_addr(string node, int addr)
{
  string dmi_text = ssh_run(node, "dmidecode");

  int device_handle = get_physmem_handle(dmi_text, addr);
  werror("handle: 0x%x\n", device_handle);

  string dimm = get_dimm_name(dmi_text, device_handle);
  return dimm;
}

//! Script entry point.
//!
//! Expects a node name and optionally an address. Without an address
//! the last error logged on the node is used. Progress messages go to
//! stderr; the resulting DIMM name is written to stdout.
//!
//! @param argc
//!   Number of entries in @[argv].
//! @param argv
//!   Script name, node name and optionally the address.
void main(int argc, array argv)
{
  // Surplus arguments are rejected as well: otherwise an explicitly
  // given address would silently be ignored in favour of the last
  // logged error.
  if( argc < 2 || argc > 3 || argv[1] == "-h" || argv[1] == "--help" )
    usage(argv[0]);

  string node = argv[1];

  int address;
  if(argc == 3) {
    string tmp = argv[2];
    // Hexadecimal when prefixed with "0x", otherwise a plain int cast.
    if(sscanf(tmp, "0x%x", address) != 1)
      address = (int)tmp;
    werror("Locating DIMM for address 0x%x\n", address);
  } else {
    werror("Defaulting to finding DIMM for last logged error.\n");
    address = get_last_error(node);
    werror("Last error on address 0x%x\n", address);
  }

  write("%s\n", get_dimm_for_addr(node, address));
}
--
Peter Bortas
Loading...