Commit 32a7757a authored by Neil Horman's avatar Neil Horman

Complete rework of how we detect and classify irqs

irqbalance has been broken for a long time.  Its ability to properly detect msi
irqs and to correctly identify interrupt types (net vs. storage vs. other, etc),
has been based on some tenuous string comparison logic that was easily broken by
administrative name changes for interfaces.  I've recently submitted this patch:
https://lkml.org/lkml/2011/9/19/176
which lets us use sysfs exclusively for finding device interrupts, which in
turn lets us definitively identify irq types (legacy pci vs. msi), as well as
properly classify them using the pci device class value.

Additionally, this patch rips out the code that attempts to bias interrupt
count volumes using network statistics, since there's no sane way to be certain a
single network interrupt is responsible for the number of packets received on a
given interface.  Workload computation is now done solely on irq count.  This
may change in the future, adding /proc/stat irq and softirq time to the biasing
mechanism.

Note that without the above kernel change, this doesn't work right.  Irqbalance
still contains a self check in which it identifies MSI interrupts in
/proc/interrupts.  If it sees MSI irqs in /proc/interrupts, but none in sysfs,
then it will issue a loud warning about irqs being misclassified until the
kernel is updated.
Signed-off-by: 's avatarNeil Horman <nhorman@tuxdriver.com>
parent 565db19e
......@@ -30,7 +30,7 @@ noinst_HEADERS = bitmap.h constants.h cpumask.h irqbalance.h non-atomic.h \
types.h
sbin_PROGRAMS = irqbalance
irqbalance_SOURCES = activate.c bitmap.c classify.c cputree.c irqbalance.c \
irqlist.c network.c numa.c placement.c powermode.c procinterrupts.c
irqlist.c numa.c placement.c powermode.c procinterrupts.c
dist_man_MANS = irqbalance.1
CONFIG_CLEAN_FILES = debug*.list config/*
......
......@@ -2,6 +2,9 @@
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <dirent.h>
#include <assert.h>
#include "irqbalance.h"
#include "types.h"
......@@ -24,112 +27,333 @@ int map_class_to_level[7] =
int class_counts[7];
#define MAX_CLASS 0x12
/*
* Class codes lifted from pci spec, appendix D.
* and mapped to irqbalance types here
*
* The table is positional: entry N is the irqbalance class used for PCI
* base class N (the value left after the sysfs "class" attribute is shifted
* right by 16 bits).  Base classes >= MAX_CLASS are not looked up here.
* NOTE(review): the device-class names below follow the usual PCI base-class
* list -- confirm against the spec before relying on them.
*/
static short class_codes[MAX_CLASS] = {
IRQ_OTHER,	/* 0x00: unclassified / pre-2.0 device */
IRQ_SCSI,	/* 0x01: mass storage controller */
IRQ_ETH,	/* 0x02: network controller */
IRQ_OTHER,	/* 0x03: display controller */
IRQ_OTHER,	/* 0x04: multimedia device */
IRQ_OTHER,	/* 0x05: memory controller */
IRQ_LEGACY,	/* 0x06: bridge device */
IRQ_OTHER,	/* 0x07: simple communication controller */
IRQ_OTHER,	/* 0x08: base system peripheral */
IRQ_LEGACY,	/* 0x09: input device */
IRQ_OTHER,	/* 0x0a: docking station */
IRQ_OTHER,	/* 0x0b: processor */
IRQ_LEGACY,	/* 0x0c: serial bus controller */
IRQ_ETH,	/* 0x0d: wireless controller */
IRQ_SCSI,	/* 0x0e: intelligent I/O controller */
IRQ_OTHER,	/* 0x0f: satellite communication controller */
IRQ_OTHER,	/* 0x10: encryption/decryption controller */
IRQ_OTHER,	/* 0x11: data acquisition / signal processing controller */
};
NOTE NOTE although that this file has a hard-coded list of modules, something missing is not
a big deal; the types are also set based on PCI class information when available.
*/
static GList *interrupts_db;
/*
Based on the original irqbalance code which is:
Copyright (C) 2003 Red Hat, Inc. All rights reserved.
Usage and distribution of this file are subject to the Gnu General Public License Version 2
that can be found at http://www.gnu.org/licenses/gpl.txt and the COPYING file as
distributed together with this file is included herein by reference.
Author: Arjan van de Ven <arjanv@redhat.com>
*/
static char *legacy_modules[] = {
"PS/2",
"serial",
"i8042",
"acpi",
"floppy",
"parport",
"keyboard",
"usb-ohci",
"usb-uhci",
"uhci_hcd",
"ohci_hcd",
"ehci_hcd",
"EMU10K1",
0
#define SYSDEV_DIR "/sys/bus/pci/devices"
/*
* Storage for a single irq property value.  Which member is valid is
* recorded separately in irq_property.itype.
*/
union property {
int int_val;		/* used when the property is INT_TYPE */
cpumask_t mask_val;	/* used when the property is CPUMASK_TYPE */
};
static char *timer_modules[] = {
"rtc",
"timer",
0
/*
* Tag describing which member of union property a given irq_property holds.
* The find_irq_*_prop() accessors assert on this tag before reading a value.
*/
enum irq_type {
INT_TYPE = 0,	/* value lives in int_val */
CPUMASK_TYPE,	/* value lives in mask_val */
};
static char *storage_modules[] = {
"aic7xxx",
"aic79xx",
"ide",
"cciss",
"cpqarray",
"qla2",
"megaraid",
"fusion",
"libata",
"ohci1394",
"sym53c8xx",
0
/*
* One tagged property of an irq: itype says which union member of
* iproperty is meaningful.
*/
struct irq_property {
enum irq_type itype;
union property iproperty;
};
/* Shorthand so callers can write prop.iint_val / prop.imask_val directly. */
#define iint_val iproperty.int_val
#define imask_val iproperty.mask_val
static char *ethernet_modules[] = {
"eth",
"e100",
"eepro100",
"orinoco_cs",
"wvlan_cs",
"3c5",
"HiSax",
"skge",
"sky2",
0
/*
* Database record for one interrupt: its number plus a property slot for
* each enum irq_prop index (IRQ_CLASS, IRQ_TYPE, IRQ_NUMA, IRQ_LCPU_MASK).
* Records are kept in the interrupts_db list and looked up by irq number.
*/
struct irq_info {
int irq;
struct irq_property property[IRQ_MAX_PROPERTY];
};
/*
 * Tags every property slot of a freshly allocated irq_info with the kind of
 * value it will carry.  Only the type tags are written; the values and the
 * irq number are left for the caller to fill in.
 */
static void init_new_irq(struct irq_info *new)
{
	struct irq_property *slots = new->property;

	/* class, type and numa node are plain integers ... */
	slots[IRQ_CLASS].itype = INT_TYPE;
	slots[IRQ_TYPE].itype = INT_TYPE;
	slots[IRQ_NUMA].itype = INT_TYPE;
	/* ... while the local cpu set is a cpumask */
	slots[IRQ_LCPU_MASK].itype = CPUMASK_TYPE;
}
/*
 * Comparison callback handed to g_list_find_custom(): both arguments are
 * struct irq_info pointers, and the result is the difference of their irq
 * numbers (zero when they refer to the same irq).
 */
static gint compare_ints(gconstpointer a, gconstpointer b)
{
	const struct irq_info *lhs = a;
	const struct irq_info *rhs = b;
	gint delta = lhs->irq - rhs->irq;

	return delta;
}
/*
 * Element destructor passed to g_list_free_full(): releases one
 * malloc()ed irq_info record.
 */
static void free_int(gpointer data)
{
	struct irq_info *record = data;

	free(record);
}
/*
* Inserts an irq_info struct into the intterupts_db list
* devpath points to the device directory in sysfs for the
* related device
*/
static struct irq_info *add_one_irq_to_db(const char *devpath, int irq)
{
int class = 0;
int rc;
struct irq_info *new, find;
int numa_node;
char *path = alloca(strlen(devpath) + 64);
FILE *fd;
char *lcpu_mask;
GList *entry;
/*
* First check to make sure this isn't a duplicate entry
*/
find.irq = irq;
entry = g_list_find_custom(interrupts_db, &find, compare_ints);
if (entry) {
if (debug_mode)
printf("DROPPING DUPLICATE ENTRY FOR IRQ %d on path %s\n", irq, devpath);
return NULL;
}
new = malloc(sizeof(struct irq_info));
if (!new)
return NULL;
init_new_irq(new);
new->irq = irq;
new->property[IRQ_CLASS].iint_val = IRQ_OTHER;
interrupts_db = g_list_append(interrupts_db, new);
sprintf(path, "%s/class", devpath);
fd = fopen(path, "r");
if (!fd) {
perror("Can't open class file: ");
goto get_numa_node;
}
rc = fscanf(fd, "%x", &class);
fclose(fd);
if (!rc)
goto get_numa_node;
int find_class(struct interrupt *irq, char *moduletext)
/*
* Restrict search to major class code
*/
class >>= 16;
if (class >= MAX_CLASS)
goto get_numa_node;
new->property[IRQ_CLASS].iint_val = class_codes[class];
get_numa_node:
numa_node = -1;
sprintf(path, "%s/numa_node", devpath);
fd = fopen(path, "r");
if (!fd)
goto assign_node;
rc = fscanf(fd, "%d", &numa_node);
fclose(fd);
assign_node:
new->property[IRQ_NUMA].iint_val = numa_node;
sprintf(path, "%s/local_cpus", devpath);
fd = fopen(path, "r");
if (!fd) {
cpus_setall(new->property[IRQ_LCPU_MASK].imask_val);
goto out;
}
lcpu_mask = NULL;
rc = fscanf(fd, "%as", &lcpu_mask);
fclose(fd);
if (!lcpu_mask) {
cpus_setall(new->property[IRQ_LCPU_MASK].imask_val);
} else {
cpumask_parse_user(lcpu_mask, strlen(lcpu_mask),
new->property[IRQ_LCPU_MASK].imask_val);
}
out:
if (debug_mode)
printf("Adding IRQ %d to database\n", irq);
return new;
}
/*
 * Figures out which interrupt(s) relate to the device we're looking at in
 * dirname (an entry name under SYSDEV_DIR) and adds them to the database.
 *
 * If the device has an msi_irqs directory, every entry whose name parses
 * to a non-zero number is added as IRQ_TYPE_MSIX and the legacy "irq"
 * attribute is not consulted.  Otherwise the single number in the "irq"
 * attribute is added as IRQ_TYPE_LEGACY.
 */
static void build_one_dev_entry(const char *dirname)
{
	size_t pathlen = strlen(SYSDEV_DIR) + strlen(dirname) + 128;
	char *devpath = alloca(pathlen);
	char *path = alloca(pathlen);
	struct dirent *entry;
	struct irq_info *new;
	DIR *msidir;
	FILE *fd;
	int irqnum;

	snprintf(devpath, pathlen, "%s/%s", SYSDEV_DIR, dirname);

	snprintf(path, pathlen, "%s/msi_irqs", devpath);
	msidir = opendir(path);
	if (msidir) {
		while ((entry = readdir(msidir)) != NULL) {
			/* names that do not parse to a number ("." and "..") yield 0 */
			irqnum = strtol(entry->d_name, NULL, 10);
			if (!irqnum)
				continue;
			new = add_one_irq_to_db(devpath, irqnum);
			if (new)
				new->property[IRQ_TYPE].iint_val = IRQ_TYPE_MSIX;
		}
		closedir(msidir);
		return;
	}

	snprintf(path, pathlen, "%s/irq", devpath);
	fd = fopen(path, "r");
	if (!fd)
		return;

	/*
	 * Require a successful conversion so irqnum is never read
	 * uninitialised; no pci device has irq 0
	 */
	if (fscanf(fd, "%d", &irqnum) == 1 && irqnum) {
		new = add_one_irq_to_db(devpath, irqnum);
		if (new)
			new->property[IRQ_TYPE].iint_val = IRQ_TYPE_LEGACY;
	}

	fclose(fd);
}
/*
 * Throws away the current interrupt database and rebuilds it by walking
 * every device directory under SYSDEV_DIR.  If SYSDEV_DIR cannot be opened
 * the database is simply left empty.
 */
void rebuild_irq_db(void)
{
	DIR *devdir;
	struct dirent *entry;

	g_list_free_full(interrupts_db, free_int);
	/*
	 * The list head is stale once the nodes are freed; reset it so the
	 * appends below start a fresh list instead of touching freed memory.
	 */
	interrupts_db = NULL;

	devdir = opendir(SYSDEV_DIR);
	if (!devdir)
		return;

	while ((entry = readdir(devdir)) != NULL)
		build_one_dev_entry(entry->d_name);

	closedir(devdir);
}
static GList *add_misc_irq(int irq)
{
struct irq_info *new, find;
new = malloc(sizeof(struct irq_info));
if (!new)
return NULL;
init_new_irq(new);
new->irq = irq;
new->property[IRQ_TYPE].iint_val = IRQ_TYPE_LEGACY;
new->property[IRQ_CLASS].iint_val = IRQ_OTHER;
new->property[IRQ_NUMA].iint_val = -1;
interrupts_db = g_list_append(interrupts_db, new);
find.irq = irq;
return g_list_find_custom(interrupts_db, &find, compare_ints);
}
int find_irq_integer_prop(int irq, enum irq_prop prop)
{
GList *entry;
struct irq_info find, *result;
find.irq = irq;
entry = g_list_find_custom(interrupts_db, &find, compare_ints);
if (!entry) {
if (debug_mode)
printf("No entry for irq %d in the irq database, adding default entry\n", irq);
entry = add_misc_irq(irq);
}
result = entry->data;
assert(result->property[prop].itype == INT_TYPE);
return result->property[prop].iint_val;
}
/*
 * Returns the cpumask property prop of irq.  An irq that is not in the
 * database yet gets a default record via add_misc_irq().
 *
 * prop must name a CPUMASK_TYPE property (asserted).  If a default record
 * is needed but cannot be allocated, a mask with every cpu set is returned
 * instead of dereferencing NULL.
 */
cpumask_t find_irq_cpumask_prop(int irq, enum irq_prop prop)
{
	GList *entry;
	struct irq_info find, *result;

	find.irq = irq;
	entry = g_list_find_custom(interrupts_db, &find, compare_ints);
	if (!entry) {
		if (debug_mode)
			printf("No entry for irq %d in the irq database, adding default entry\n", irq);
		entry = add_misc_irq(irq);
	}

	if (!entry) {
		/* out of memory: fall back to "any cpu" */
		cpumask_t all;

		cpus_setall(all);
		return all;
	}

	result = entry->data;
	assert(result->property[prop].itype == CPUMASK_TYPE);
	return result->property[prop].imask_val;
}
/*
 * Iterator over the interrupt database.
 *
 * Pass -1 to get the first irq number in the database, or a previously
 * returned irq number to get the one stored after it.  Returns -1 when the
 * database is empty, when irq is not in the database, or when irq was the
 * last entry.
 */
int get_next_irq(int irq)
{
	GList *entry;
	struct irq_info *irqp, find;

	if (irq == -1) {
		entry = g_list_first(interrupts_db);
	} else {
		find.irq = irq;
		entry = g_list_find_custom(interrupts_db, &find, compare_ints);
		if (entry)
			entry = g_list_next(entry);
	}

	/* covers the empty database as well as running off the end */
	if (!entry)
		return -1;

	irqp = entry->data;
	return irqp->irq;
}
......@@ -24,6 +24,8 @@
#include <stdlib.h>
#include <malloc.h>
#include <sys/time.h>
#include <syslog.h>
#ifdef HAVE_LIBCAP_NG
#include <cap-ng.h>
#endif
......@@ -79,6 +81,8 @@ int main(int argc, char** argv)
}
rebuild_irq_db();
parse_cpu_tree();
......@@ -93,6 +97,8 @@ int main(int argc, char** argv)
if (daemon(0,0))
exit(EXIT_FAILURE);
openlog(argv[0], 0, LOG_DAEMON);
#ifdef HAVE_LIBCAP_NG
// Drop capabilities
capng_clear(CAPNG_SELECT_BOTH);
......@@ -109,7 +115,7 @@ int main(int argc, char** argv)
sort_irq_list();
if (debug_mode)
dump_workloads();
while (1) {
sleep_approx(SLEEP_INTERVAL);
if (debug_mode)
......@@ -133,8 +139,6 @@ int main(int argc, char** argv)
parse_cpu_tree();
}
/* deal with NAPI */
account_for_nic_stats();
calculate_workload();
/* to cope with dynamic configurations we scan for new numa information
......
......@@ -29,12 +29,14 @@ extern GList *interrupts;
extern void parse_cpu_tree(void);
extern void clear_work_stats(void);
extern void parse_proc_interrupts(void);
extern void rebuild_irq_db(void);
extern void set_interrupt_count(int number, uint64_t count);
extern void set_msi_interrupt_numa(int number, char *devname);
extern void add_interrupt_count(int number, uint64_t count, int type);
extern int find_class(struct interrupt *irq, char *string);
extern int get_next_irq(int irq);
extern int find_irq_integer_prop(int irq, enum irq_prop prop);
extern cpumask_t find_irq_cpumask_prop(int irq, enum irq_prop prop);
extern void add_interrupt_numa(int number, cpumask_t mask, int node_num, int type);
int dev_to_node(char *devname);
void calculate_workload(void);
void reset_counts(void);
......
......@@ -117,7 +117,7 @@ static void investigate(struct interrupt *irq, int number)
} else if (strcmp(entry->d_name,"affinity_hint")==0) {
get_affinity_hint(irq, number);
} else {
irq->class = find_class(irq, entry->d_name);
irq->class = find_irq_integer_prop(irq->number, IRQ_CLASS);
}
} while (entry);
......@@ -155,7 +155,7 @@ void set_msi_interrupt_numa(int number, char *devname)
struct interrupt *irq;
int node;
node = dev_to_node(devname);
node = find_irq_integer_prop(number, IRQ_NUMA);
if (node < 0)
return;
......@@ -210,34 +210,6 @@ void set_interrupt_count(int number, uint64_t count)
interrupts = g_list_append(interrupts, irq);
}
/*
* Add extra irqs to a specific irq metadata structure;
* if no such metadata exists, do nothing at all
*/
void add_interrupt_count(int number, uint64_t count, int type)
{
GList *item;
struct interrupt *irq;
if (!count)
return;
item = g_list_first(interrupts);
while (item) {
irq = item->data;
item = g_list_next(item);
if (irq->number == number) {
irq->extra += count;
if (irq->class < type && irq->balance_level != BALANCE_NONE) {
irq->class = type;
irq->balance_level = map_class_to_level[irq->class];
}
return;
}
}
}
/*
* Set the numa affinity mask for a specific interrupt if there
* is metadata for the interrupt; do nothing if no such data
......
/*
* Copyright (C) 2006, Intel Corporation
*
* This file is part of irqbalance
*
* This program file is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; version 2 of the License.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program in a file named COPYING; if not, write to the
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301 USA
*/
/*
* Due to NAPI, the actual number of interrupts for a network NIC is usually low
* even though the amount of work is high; this file is there to compensate for this
* by adding actual package counts to the calculated amount of work of interrupts
*/
#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <linux/types.h>
/* some distros (Debian / SLES) ship a totally broken ethtool.h */
/* work around the breakage some */
#define u32 __u32
#define u16 __u16
#define u8 __u8
#define u64 __u64
#include <linux/ethtool.h>
#undef u8
#undef u16
#undef u32
#undef u64
#include <glib.h>
#include <net/if.h>
#include <linux/sockios.h>
#include <string.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include "irqbalance.h"
/*
* Per-interface bookkeeping record; records are kept in the `nics` list
* and looked up by interface name.
*/
struct nic {
char ethname[64];	/* interface name, used as the lookup key in find_nic() */
int irq;		/* irq as reported by dev_to_irq(); 0 when it could not be determined */
uint64_t prev_pkt;	/* NOTE(review): presumably the packet count seen on the previous pass -- not used in the visible code */
int counter;		/* number of find_nic() hits; drives the periodic irq refresh */
};
static GList *nics;
/*
 * Asks the driver of network interface devname (via the ETHTOOL_GDRVINFO
 * ioctl) for its bus_info string and copies it, NUL-terminated, into
 * busname.
 *
 * busname must point to at least 64 bytes, as all callers in this file
 * provide.  Returns 0 on success and -1 if the name does not fit an ifreq,
 * the socket cannot be created, or the ioctl fails.
 */
static int dev_to_bus(char *devname, char *busname)
{
	struct ethtool_drvinfo driver;
	struct ifreq ifr;
	size_t namelen;
	int sock, ret;

	/* an over-long name would overflow ifr_name, so refuse it up front */
	namelen = strlen(devname);
	if (namelen >= sizeof(ifr.ifr_name))
		return -1;

	memset(&ifr, 0, sizeof(ifr));
	memset(&driver, 0, sizeof(driver));
	memcpy(ifr.ifr_name, devname, namelen + 1);
	driver.cmd = ETHTOOL_GDRVINFO;
	ifr.ifr_data = (void *)&driver;

	sock = socket(AF_INET, SOCK_DGRAM, 0);
	if (sock < 0)
		return -1;

	ret = ioctl(sock, SIOCETHTOOL, &ifr);
	close(sock);
	if (ret < 0)
		return -1;

	/*
	 * Bound the read by the size of bus_info and the write by the
	 * caller's buffer; snprintf always terminates the result.
	 */
	snprintf(busname, 64, "%.*s", (int)sizeof(driver.bus_info), driver.bus_info);
	return 0;
}
/*
 * Looks up the irq of network interface devname by resolving it to a PCI
 * bus address with dev_to_bus() and reading
 * /sys/bus/pci/devices/<bus address>/irq.
 *
 * Returns the irq number, or 0 when the device cannot be resolved or the
 * attribute cannot be opened or read.
 */
static int dev_to_irq(char *devname)
{
	char busname[64] = { 0 };
	char buffer[PATH_MAX];
	char *line = NULL;
	size_t size = 0;
	FILE *file;
	int val = 0;

	if (dev_to_bus(devname, busname))
		return 0;

	snprintf(buffer, sizeof(buffer), "/sys/bus/pci/devices/%s/irq", busname);
	file = fopen(buffer, "r");
	if (!file)
		return 0;

	/* getline reports failure as -1, never 0; parse only real data */
	if (getline(&line, &size, file) > 0)
		val = strtoul(line, NULL, 10);

	fclose(file);
	free(line);
	return val;
}
/*
 * Looks up the numa node of network interface devname by resolving it to a
 * PCI bus address with dev_to_bus() and reading
 * /sys/bus/pci/devices/<bus address>/numa_node.
 *
 * Returns the node number as found in sysfs, or -1 when the device cannot
 * be resolved or the attribute cannot be opened or read.
 */
int dev_to_node(char *devname)
{
	char busname[64] = { 0 };
	char buffer[PATH_MAX];
	char *line = NULL;
	size_t size = 0;
	FILE *file;
	int node = -1;

	if (dev_to_bus(devname, busname))
		return -1;

	snprintf(buffer, sizeof(buffer), "/sys/bus/pci/devices/%s/numa_node", busname);
	file = fopen(buffer, "r");
	if (!file)
		return -1;

	/*
	 * getline reports failure as -1, never 0.  The value is parsed as a
	 * signed number so a "-1" in the attribute is read back as -1.
	 */
	if (getline(&line, &size, file) > 0)
		node = strtol(line, NULL, 10);

	fclose(file);
	free(line);
	return node;
}
/*
 * Allocates a zeroed nic record for interface `name`, resolves its irq with
 * dev_to_irq() and appends the record to the `nics` list.
 *
 * Returns the new record, or NULL if allocation fails or `name` does not
 * fit in nic->ethname (storing a truncated name would make later lookups
 * by the full name miss).
 */
static struct nic *new_nic(char *name)
{
	struct nic *nic;
	size_t namelen = strlen(name);

	if (namelen >= sizeof(nic->ethname))
		return NULL;

	nic = calloc(1, sizeof(*nic));
	if (!nic)
		return NULL;

	memcpy(nic->ethname, name, namelen + 1);
	nic->irq = dev_to_irq(name);
	nics = g_list_append(nics, nic);
	return nic;
}
/*
 * Returns the nic record for interface `name`, creating one with new_nic()
 * when the interface has not been seen before.  Each hit on an existing
 * record bumps its counter, and every NIC_REFRESH_INTERVAL hits the cached
 * irq is looked up again.
 */
static struct nic *find_nic(char *name)
{
	GList *cursor;

	for (cursor = g_list_first(nics); cursor; cursor = g_list_next(cursor)) {
		struct nic *candidate = cursor->data;

		if (strcmp(candidate->ethname, name) != 0)
			continue;

		candidate->counter++;
		/*
		 * ifup/ifdown can leave the cached irq stale, so re-read it
		 * from time to time
		 */
		if ((candidate->counter % NIC_REFRESH_INTERVAL) == 0)
			candidate->irq = dev_to_irq(candidate->ethname);
		return candidate;
	}

	return new_nic(name);
}
void account_for_nic_stats(void)
{
struct nic *nic;
FILE *file;
char *line = NULL;
size_t size = 0;
file = fopen("/proc/net/dev", "r");
if (!file)
return;
/* first two lines are headers */
if (getline(&line, &size, file)==0) {
free(line);
return;