Subject: Provide CPU load measurement based on idle time
From: Carsten Emde <C.Emde@osadl.org>
Date: Sun, 22 Apr 2012 15:19:46 +0100

The standard method to measure CPU load has a number of problems (for
details refer to Documentation/cpu-load.txt). This patch adds an
additional CPU load measuring method that is based on idle time
processing. The data are available for every CPU in /proc/cpuload/cpuN.
The counters can be reset by writing anything to /proc/cpuload/resetall
for all CPUs and to /proc/cpuload/cpuN/reset for a particular CPU,
respectively. The load value represents the average load since the
most recent reset. It may take up to a second after a reset until the
load data reach their final precision.

Signed-off-by: Carsten Emde <C.Emde@osadl.org>

---
 include/linux/sched.h  |    8 +
 init/Kconfig           |   14 ++
 kernel/Makefile        |    1 +
 kernel/sched.c         |   24 ++++
 kernel/sched_cpuload.c |  254 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 301 insertions(+)

Index: linux-3.2.36-rt54/include/linux/sched.h
===================================================================
--- linux-3.2.36-rt54.orig/include/linux/sched.h
+++ linux-3.2.36-rt54/include/linux/sched.h
@ linux-3.2.36-rt54/include/linux/sched.h:2864 @ static inline unsigned long rlimit_max(u
 #endif /* __KERNEL__ */
 
 #endif
+
+#ifdef CONFIG_IDLETIME_CPULOAD
+DECLARE_PER_CPU(unsigned long long, idlestart);
+DECLARE_PER_CPU(unsigned long long, idlestop);
+DECLARE_PER_CPU(unsigned long long, idletime);
+DECLARE_PER_CPU(unsigned long long, runtime);
+DECLARE_PER_CPU(raw_spinlock_t, cpuload_lock);
+#endif
Index: linux-3.2.36-rt54/init/Kconfig
===================================================================
--- linux-3.2.36-rt54.orig/init/Kconfig
+++ linux-3.2.36-rt54/init/Kconfig
@ linux-3.2.36-rt54/init/Kconfig:305 @ config FHANDLE
 	  get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2)
 	  syscalls.
 
+config IDLETIME_CPULOAD
+	bool "Provide CPU load measurement based on idle time"
+	help
+	  The standard method to measure CPU load has a number of problems (for
+	  details refer to Documentation/cpu-load.txt). If you say Y here,
+	  additional CPU load data will be provided that are based on idle
+	  time. The data are available for every CPU in /proc/cpuload/cpuN. The
+	  counters can be reset by writing anything to /proc/cpuload/resetall
+	  for all CPUs and to /proc/cpuload/cpuN/reset for a particular CPU,
+	  respectively. The load value represents the average load since the
+	  most recent reset. Please note that a certain, though small,
+	  performance penalty cannot be avoided when this additional CPU load
+	  calculation is enabled.
+
 config TASKSTATS
 	bool "Export task/process statistics through netlink (EXPERIMENTAL)"
 	depends on NET
Index: linux-3.2.36-rt54/kernel/Makefile
===================================================================
--- linux-3.2.36-rt54.orig/kernel/Makefile
+++ linux-3.2.36-rt54/kernel/Makefile
@ linux-3.2.36-rt54/kernel/Makefile:110 @ obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_CPU_PM) += cpu_pm.o
+obj-$(CONFIG_IDLETIME_CPULOAD) += sched_cpuload.o
 obj-$(CONFIG_PERF_EVENTS) += events/
Index: linux-3.2.36-rt54/kernel/sched.c
===================================================================
--- linux-3.2.36-rt54.orig/kernel/sched.c
+++ linux-3.2.36-rt54/kernel/sched.c
@ linux-3.2.36-rt54/kernel/sched.c:3164 @ prepare_task_switch(struct rq *rq, struc
 	prepare_lock_switch(rq, next);
 	prepare_arch_switch(next);
 	trace_sched_switch(prev, next);
+
+#ifdef CONFIG_IDLETIME_CPULOAD
+	if (next->pid == 0) {
+		int cpu = raw_smp_processor_id();
+
+		raw_spin_lock(&per_cpu(cpuload_lock, cpu));
+		per_cpu(idlestart, cpu) = cpu_clock(cpu);
+		if (per_cpu(idlestop, cpu)) {
+			per_cpu(runtime, cpu) +=
+			    per_cpu(idlestart, cpu) - per_cpu(idlestop, cpu);
+		}
+		raw_spin_unlock(&per_cpu(cpuload_lock, cpu));
+	} else if (prev->pid == 0) {
+		int cpu = raw_smp_processor_id();
+
+		raw_spin_lock(&per_cpu(cpuload_lock, cpu));
+		per_cpu(idlestop, cpu) = cpu_clock(cpu);
+		if (per_cpu(idlestart, cpu)) {
+			per_cpu(idletime, cpu) +=
+			    per_cpu(idlestop, cpu) - per_cpu(idlestart, cpu);
+		}
+		raw_spin_unlock(&per_cpu(cpuload_lock, cpu));
+	}
+#endif
 }
 
 /**
Index: linux-3.2.36-rt54/kernel/sched_cpuload.c
===================================================================
--- /dev/null
+++ linux-3.2.36-rt54/kernel/sched_cpuload.c
@ linux-3.2.36-rt54/kernel/sched_cpuload.c:4 @
+/*
+    cpuload.c: calculate CPU load data that are derived from the
+               idle time
+
+    Copyright (C) 2012 Carsten Emde <C.Emde@osadl.org>
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License
+    as published by the Free Software Foundation; either version 2
+    of the License, or (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
+#include <linux/sched.h>
+#include <linux/proc_fs.h>
+#include <linux/stringify.h>
+
+/*
+ * Total precision of percentage output
+ * If PRECISION is 5, for example, output will be "NNN.NN%"
+ */
+
+#define PRECISION 5
+#if PRECISION < 3
+#error PRECISION must not be smaller than 3
+#endif
+
+#if PRECISION == 3
+#define PRECFACTOR ((int)1E2)
+#elif PRECISION == 4
+#define PRECFACTOR ((int)1E3)
+#elif PRECISION == 5
+#define PRECFACTOR ((int)1E4)
+#elif PRECISION == 6
+#define PRECFACTOR ((int)1E5)
+#elif PRECISION == 7
+#define PRECFACTOR ((int)1E6)
+#else
+#error PRECISION must not be larger than 7
+#endif
+
+#define PRECFORMAT "%0"__stringify(PRECISION)"llu"
+#define MAXINTPRECISION 3
+#define DECPRECISION (PRECISION-MAXINTPRECISION)
+
+enum cpuload_action_index {
+	CPULOAD_IDLETIME,
+	CPULOAD_RUNTIME,
+	CPULOAD_LOAD,
+	CPULOAD_RESET,
+	CPULOAD_RESETALL,
+};
+
+DEFINE_PER_CPU(unsigned long long, idlestart);
+DEFINE_PER_CPU(unsigned long long, idlestop);
+DEFINE_PER_CPU(unsigned long long, idletime);
+DEFINE_PER_CPU(unsigned long long, runtime);
+DEFINE_PER_CPU(raw_spinlock_t, cpuload_lock) =
+	__RAW_SPIN_LOCK_UNLOCKED(cpuload_lock);
+
+struct cpuload_data {
+	int cpu;
+	int action;
+};
+
+static DEFINE_PER_CPU(struct cpuload_data, cpuload_data_idletime);
+static DEFINE_PER_CPU(struct cpuload_data, cpuload_data_runtime);
+static DEFINE_PER_CPU(struct cpuload_data, cpuload_data_load);
+static DEFINE_PER_CPU(struct cpuload_data, cpuload_data_reset);
+static struct cpuload_data cpuload_data_resetall = {
+	.cpu = 0,
+	.action = CPULOAD_RESETALL,
+};
+
+static int show_cpuload(char *buf, char **start, off_t off,
+			int count, int *eof, void *data)
+{
+	int r;
+	struct cpuload_data *cpuload_data = (struct cpuload_data *) data;
+	int cpu = cpuload_data->cpu;
+	unsigned long long now = cpu_clock(cpu);
+
+	raw_spin_lock(&per_cpu(cpuload_lock, cpu));
+
+	/* Update counters */
+	if (per_cpu(idlestart, cpu) > per_cpu(idlestop, cpu)) {
+		/* CPU is idle */
+		per_cpu(idletime, cpu) += now - per_cpu(idlestart, cpu);
+		per_cpu(idlestart, cpu) = now;
+	} else {
+		/* CPU is running */
+		per_cpu(runtime, cpu) += now - per_cpu(idlestop, cpu);
+		per_cpu(idlestop, cpu) = now;
+	}
+
+	switch (cpuload_data->action) {
+	case CPULOAD_IDLETIME:
+		r = snprintf(buf, count, "%llu\n",
+			     per_cpu(idletime, cpu));
+		raw_spin_unlock(&per_cpu(cpuload_lock, cpu));
+		break;
+
+	case CPULOAD_RUNTIME:
+		r = snprintf(buf, count, "%llu\n",
+			     per_cpu(runtime, cpu));
+		raw_spin_unlock(&per_cpu(cpuload_lock, cpu));
+		break;
+
+	case CPULOAD_LOAD: {
+		char str[8], *firstdigit;
+		int intdigits;
+		unsigned long long idletime1, runtime1, alltime;
+
+		idletime1 = per_cpu(idletime, cpu);
+		runtime1 = per_cpu(runtime, cpu);
+		raw_spin_unlock(&per_cpu(cpuload_lock, cpu));
+
+		alltime = idletime1 + runtime1;
+
+		if (alltime) {
+			/* Fake precision, if measurement was just started */
+			if (alltime < PRECFACTOR) {
+				runtime1 *= PRECFACTOR;
+				alltime *= PRECFACTOR;
+			}
+
+			/* Format examples: 100.00%, 1.66%, 0.12%, 0.00% */
+			snprintf(str, sizeof(str), PRECFORMAT,
+			    div64_u64(runtime1, div_u64(alltime, PRECFACTOR)));
+
+			firstdigit = str;
+			while (*firstdigit == '0' && *firstdigit != '\0')
+				firstdigit++;
+
+			if (strlen(firstdigit) < DECPRECISION+1)
+				firstdigit = str + MAXINTPRECISION-1;
+
+			intdigits = strlen(firstdigit) - DECPRECISION;
+			strncpy(buf, firstdigit, intdigits);
+			buf[intdigits] = '.';
+			strcpy(buf + intdigits + 1, firstdigit + intdigits);
+			strcat(buf, "%\n");
+		} else
+			strcpy(buf, "n.a.\n");
+
+		r = strlen(buf);
+		break;
+	}
+
+	default:
+		raw_spin_unlock(&per_cpu(cpuload_lock, cpu));
+		r = 0;
+		break;
+	}
+
+	*eof = 1;
+	return r;
+}
+
+static inline void reset_cpuload1(int cpu)
+{
+	per_cpu(idletime, cpu) = per_cpu(runtime, cpu) = 0;
+}
+
+static int reset_cpuload(struct file *file, const char __user *buffer,
+			 unsigned long count, void *data)
+{
+	struct cpuload_data *cpuload_data = (struct cpuload_data *) data;
+	int cpu = cpuload_data->cpu;
+
+	switch (cpuload_data->action) {
+	case CPULOAD_RESET:
+		reset_cpuload1(cpu);
+		break;
+
+	case CPULOAD_RESETALL:
+		for_each_online_cpu(cpu)
+			reset_cpuload1(cpu);
+		break;
+	}
+	return count;
+}
+
+static int __init proc_cpuload_init(void)
+{
+	int cpu;
+	struct proc_dir_entry *root_cpuload_dir, *entry;
+
+	root_cpuload_dir = proc_mkdir("cpuload", NULL);
+	if (!root_cpuload_dir)
+		return 0;
+
+	entry = create_proc_entry("resetall", S_IWUGO, root_cpuload_dir);
+	if (entry) {
+		entry->write_proc = reset_cpuload;
+		entry->data = (void *) &cpuload_data_resetall;
+	}
+
+	for_each_possible_cpu(cpu) {
+		char name[32];
+		struct proc_dir_entry *cpuload_cpudir;
+
+		snprintf(name, sizeof(name), "cpu%d", cpu);
+		cpuload_cpudir = proc_mkdir(name, root_cpuload_dir);
+		if (!cpuload_cpudir)
+			return 0;
+
+		per_cpu(cpuload_data_idletime, cpu).cpu = cpu;
+		per_cpu(cpuload_data_idletime, cpu).action = CPULOAD_IDLETIME;
+		entry = create_proc_entry("idletime", S_IRUGO, cpuload_cpudir);
+		if (entry) {
+			entry->read_proc = show_cpuload;
+			entry->data = (void *)
+			    &per_cpu(cpuload_data_idletime, cpu);
+		}
+
+		per_cpu(cpuload_data_runtime, cpu).cpu = cpu;
+		per_cpu(cpuload_data_runtime, cpu).action = CPULOAD_RUNTIME;
+		entry = create_proc_entry("runtime", S_IRUGO, cpuload_cpudir);
+		if (entry) {
+			entry->read_proc = show_cpuload;
+			entry->data = (void *)
+			    &per_cpu(cpuload_data_runtime, cpu);
+		}
+
+		per_cpu(cpuload_data_load, cpu).cpu = cpu;
+		per_cpu(cpuload_data_load, cpu).action = CPULOAD_LOAD;
+		entry = create_proc_entry("load", S_IRUGO, cpuload_cpudir);
+		if (entry) {
+			entry->read_proc = show_cpuload;
+			entry->data = (void *)
+			    &per_cpu(cpuload_data_load, cpu);
+		}
+
+		per_cpu(cpuload_data_reset, cpu).cpu = cpu;
+		per_cpu(cpuload_data_reset, cpu).action = CPULOAD_RESET;
+		entry = create_proc_entry("reset", S_IWUGO, cpuload_cpudir);
+		if (entry) {
+			entry->write_proc = reset_cpuload;
+			entry->data = (void *)
+			    &per_cpu(cpuload_data_reset, cpu);
+		}
+	}
+	return 0;
+}
+
+module_init(proc_cpuload_init);
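
A worked example of the fixed-point formatting in the CPULOAD_LOAD case
above, derived directly from the code: with the default PRECISION of 5,
PRECFACTOR is 10^4 and PRECFORMAT is "%05llu". For runtime1 = 1,000,000 ns
of busy time and idletime1 = 59,000,000 ns of idle time, alltime is
60,000,000, so div64_u64(1000000, div_u64(60000000, 10000)) =
div64_u64(1000000, 6000) = 166, which snprintf() renders as "00166";
stripping the leading zeros and inserting the decimal point DECPRECISION = 2
digits from the right yields "1.66%". A fully loaded CPU yields 10000 and is
printed as "100.00%". If the counters were reset so recently that alltime is
still below PRECFACTOR, both runtime1 and alltime are scaled up by PRECFACTOR
first so that the division still produces PRECISION significant digits.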
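
For illustration, a minimal user-space reader of the new interface might look
as follows. This is only a sketch, not part of the patch: it assumes the patch
is applied, CONFIG_IDLETIME_CPULOAD=y, and that CPU 0 exists; the helper
read_ull() is made up for the example. idletime and runtime are reported in
nanoseconds (cpu_clock() units), and the load file already contains the
formatted ratio runtime / (runtime + idletime).

/*
 * Illustrative sketch only -- not part of the patch.
 * Reads the per-CPU files created by proc_cpuload_init() for cpu0
 * and then resets the counters of that CPU.
 */
#include <stdio.h>
#include <stdlib.h>

/* hypothetical helper for this example */
static unsigned long long read_ull(const char *path)
{
	FILE *f = fopen(path, "r");
	unsigned long long val = 0;

	if (!f) {
		perror(path);
		exit(EXIT_FAILURE);
	}
	if (fscanf(f, "%llu", &val) != 1)
		val = 0;
	fclose(f);
	return val;
}

int main(void)
{
	/* nanoseconds accumulated since the most recent reset */
	unsigned long long idle = read_ull("/proc/cpuload/cpu0/idletime");
	unsigned long long run = read_ull("/proc/cpuload/cpu0/runtime");
	char load[16] = "n.a.\n";
	FILE *f;

	/* "load" is runtime / (runtime + idletime), formatted by the kernel */
	f = fopen("/proc/cpuload/cpu0/load", "r");
	if (f) {
		if (!fgets(load, sizeof(load), f))
			load[0] = '\0';
		fclose(f);
	}
	printf("cpu0: idletime=%llu ns runtime=%llu ns load=%s",
	       idle, run, load);

	/* writing anything to "reset" restarts the measurement for this CPU */
	f = fopen("/proc/cpuload/cpu0/reset", "w");
	if (f) {
		fputs("1\n", f);
		fclose(f);
	}
	return 0;
}

Writing to /proc/cpuload/resetall instead of the per-CPU reset file works the
same way but clears the counters of all online CPUs.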