Subject: Provide CPU load measurement based on idle time
From: Carsten Emde <C.Emde@osadl.org>
Date: Sun, 22 Apr 2012 15:19:46 +0100

The standard method to measure CPU load has a number of problems (for
details refer to Documentation/cpu-load.txt). This patch adds an
additional CPU load measuring method that is based on idle time
processing. The data are available for every CPU in /proc/cpuload/cpuN.
The counters can be reset by writing anything to /proc/cpuload/resetall
for all CPUs and to /proc/cpuload/cpuN/reset for a particular CPU,
respectively. The load value represents the average load since the most
recent reset. It may take up to a second after reset until the load data
reach their final precision.

Signed-off-by: Carsten Emde <C.Emde@osadl.org>

---
 include/linux/sched.h  |    8 +
 init/Kconfig           |   14 ++
 kernel/Makefile        |    1 
 kernel/sched.c         |   24 ++++
 kernel/sched_cpuload.c |  254 +++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 301 insertions(+)

Index: linux-3.2.36-rt54/include/linux/sched.h
===================================================================
@ linux-3.2.36-rt54/include/linux/sched.h:2864 @ static inline unsigned long rlimit_max(u
 #endif /* __KERNEL__ */
 
 #endif
+
+#ifdef CONFIG_IDLETIME_CPULOAD
+/*
+ * Per-CPU state of the idle-time based CPU load measurement. The variables
+ * are defined in kernel/sched_cpuload.c; a header must only declare them,
+ * so use DECLARE_PER_CPU() instead of "extern DEFINE_PER_CPU()", which
+ * stops working as soon as DEFINE_PER_CPU() expands to more than a single
+ * declaration.
+ *
+ *   idlestart - cpu_clock() stamp taken when the CPU switched to idle
+ *   idlestop  - cpu_clock() stamp taken when the CPU switched away from idle
+ *   idletime  - accumulated idle time
+ *   runtime   - accumulated non-idle time
+ *   cpuload_lock - protects the four values above
+ *
+ * NOTE(review): this block follows the "#endif" lines that close the
+ * header, so it is presumably outside the include guard - confirm that
+ * this placement is intended.
+ */
+DECLARE_PER_CPU(unsigned long long, idlestart);
+DECLARE_PER_CPU(unsigned long long, idlestop);
+DECLARE_PER_CPU(unsigned long long, idletime);
+DECLARE_PER_CPU(unsigned long long, runtime);
+DECLARE_PER_CPU(raw_spinlock_t, cpuload_lock);
+#endif
Index: linux-3.2.36-rt54/init/Kconfig
===================================================================
--- linux-3.2.36-rt54.orig/init/Kconfig
+++ linux-3.2.36-rt54/init/Kconfig
@ linux-3.2.36-rt54/include/linux/sched.h:305 @ config FHANDLE
 	  get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2)
 	  syscalls.
 
+config IDLETIME_CPULOAD
+	bool "Provide CPU load measurement based on idle time"
+	depends on PROC_FS
+	help
+	  The standard method to measure CPU load has a number of problems (for
+	  details refer to Documentation/cpu-load.txt). If you say Y here,
+	  additional CPU load data will be provided that are based on idle
+	  time. The data are available for every CPU in the files idletime,
+	  runtime and load of the directory /proc/cpuload/cpuN. The counters
+	  can be reset by writing anything to /proc/cpuload/resetall for all
+	  CPUs and to /proc/cpuload/cpuN/reset for a particular CPU,
+	  respectively. The load value represents the average load since the
+	  most recent reset. Please note that a certain, though small,
+	  performance penalty cannot be avoided when this additional CPU load
+	  calculation is enabled.
+
 config TASKSTATS
 	bool "Export task/process statistics through netlink (EXPERIMENTAL)"
 	depends on NET
Index: linux-3.2.36-rt54/kernel/Makefile
===================================================================
--- linux-3.2.36-rt54.orig/kernel/Makefile
+++ linux-3.2.36-rt54/kernel/Makefile
@ linux-3.2.36-rt54/include/linux/sched.h:110 @ obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_CPU_PM) += cpu_pm.o
+obj-$(CONFIG_IDLETIME_CPULOAD) += sched_cpuload.o
 
 obj-$(CONFIG_PERF_EVENTS) += events/
 
Index: linux-3.2.36-rt54/kernel/sched.c
===================================================================
--- linux-3.2.36-rt54.orig/kernel/sched.c
+++ linux-3.2.36-rt54/kernel/sched.c
@ linux-3.2.36-rt54/include/linux/sched.h:3164 @ prepare_task_switch(struct rq *rq, struc
 	prepare_lock_switch(rq, next);
 	prepare_arch_switch(next);
 	trace_sched_switch(prev, next);
+
+#ifdef CONFIG_IDLETIME_CPULOAD
+	/*
+	 * Idle-time based CPU load accounting. The task with pid 0 is
+	 * treated as this CPU's idle task: switching to it ends a "running"
+	 * interval, switching away from it ends an "idle" interval. All
+	 * per-CPU values touched here are protected by cpuload_lock.
+	 */
+	if (next->pid == 0) {
+		int cpu = raw_smp_processor_id();
+
+		raw_spin_lock(&per_cpu(cpuload_lock, cpu));
+		/* Stamp the start of the idle interval */
+		per_cpu(idlestart, cpu) = cpu_clock(cpu);
+		/* Skip the accounting while idlestop has never been set (0) */
+		if (per_cpu(idlestop, cpu)) {
+			/* Time since the last idle exit was spent running */
+			per_cpu(runtime, cpu) +=
+			    per_cpu(idlestart, cpu) - per_cpu(idlestop, cpu);
+		}
+		raw_spin_unlock(&per_cpu(cpuload_lock, cpu));
+	} else if (prev->pid == 0) {
+		int cpu = raw_smp_processor_id();
+
+		raw_spin_lock(&per_cpu(cpuload_lock, cpu));
+		/* Stamp the end of the idle interval */
+		per_cpu(idlestop, cpu) = cpu_clock(cpu);
+		/* Skip the accounting while idlestart has never been set (0) */
+		if (per_cpu(idlestart, cpu)) {
+			/* Time since the last idle entry was spent idle */
+			per_cpu(idletime, cpu) +=
+			    per_cpu(idlestop, cpu) - per_cpu(idlestart, cpu);
+		}
+		raw_spin_unlock(&per_cpu(cpuload_lock, cpu));
+	}
+#endif
 }
 
 /**
Index: linux-3.2.36-rt54/kernel/sched_cpuload.c
===================================================================
--- /dev/null
+++ linux-3.2.36-rt54/kernel/sched_cpuload.c
@ linux-3.2.36-rt54/include/linux/sched.h:4 @
+/*
+   sched_cpuload.c: calculate CPU load data that are derived from the
+	      idle time
+
+   Copyright (C) 2012 Carsten Emde <C.Emde@osadl.org>
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; either version 2
+   of the License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
+*/
+
+#include <linux/sched.h>
+#include <linux/proc_fs.h>
+#include <linux/stringify.h>
+
+/*
+ * Total number of digits of the percentage output.
+ * If PRECISION is 5, for example, output will be "NNN.NN%", i.e. up to
+ * MAXINTPRECISION integer digits followed by DECPRECISION decimals.
+ */
+
+#define PRECISION 5
+#if PRECISION < 3
+#error PRECISION must not be smaller than 3
+#endif
+
+/*
+ * PRECFACTOR is 10^(PRECISION-1): a load of 100% scaled by PRECFACTOR
+ * yields the digits "100" followed by DECPRECISION zeroes. Plain integer
+ * constants are used so that no floating point literal is needed.
+ */
+#if PRECISION == 3
+#define PRECFACTOR 100
+#elif PRECISION == 4
+#define PRECFACTOR 1000
+#elif PRECISION == 5
+#define PRECFACTOR 10000
+#elif PRECISION == 6
+#define PRECFACTOR 100000
+#elif PRECISION == 7
+#define PRECFACTOR 1000000
+#else
+#error PRECISION must not be larger than 7
+#endif
+
+/* Zero-padded conversion that always prints PRECISION digits */
+#define PRECFORMAT "%0"__stringify(PRECISION)"llu"
+/* Digits in front of the decimal point ("100") */
+#define MAXINTPRECISION 3
+/* Digits behind the decimal point */
+#define DECPRECISION (PRECISION-MAXINTPRECISION)
+
+/*
+ * Operation bound to a /proc/cpuload entry; stored in the action member
+ * of the entry's struct cpuload_data.
+ */
+enum cpuload_action_index {
+	CPULOAD_IDLETIME = 0,	/* read: accumulated idle time */
+	CPULOAD_RUNTIME  = 1,	/* read: accumulated non-idle time */
+	CPULOAD_LOAD     = 2,	/* read: load as a percentage */
+	CPULOAD_RESET    = 3,	/* write: clear the counters of one CPU */
+	CPULOAD_RESETALL = 4,	/* write: clear the counters of all CPUs */
+};
+
+/*
+ * Per-CPU accounting state, updated from the context switch path and from
+ * the /proc handlers below. idlestart and idlestop hold cpu_clock() stamps,
+ * idletime and runtime are the accumulated intervals.
+ */
+DEFINE_PER_CPU(unsigned long long, idlestart);
+DEFINE_PER_CPU(unsigned long long, idlestop);
+DEFINE_PER_CPU(unsigned long long, idletime);
+DEFINE_PER_CPU(unsigned long long, runtime);
+/*
+ * The lock is taken on every switch to or from the idle task, which may
+ * happen before any initcall runs, so it must be initialized statically
+ * rather than rely on zero-filled memory being a valid unlocked lock.
+ */
+DEFINE_PER_CPU(raw_spinlock_t, cpuload_lock) =
+	__RAW_SPIN_LOCK_UNLOCKED(cpuload_lock);
+
+/* Private data attached to a /proc/cpuload entry */
+struct cpuload_data {
+	int cpu;	/* CPU the entry refers to */
+	int action;	/* one of enum cpuload_action_index */
+};
+
+/* One descriptor per CPU for each of the per-CPU files */
+static DEFINE_PER_CPU(struct cpuload_data, cpuload_data_idletime);
+static DEFINE_PER_CPU(struct cpuload_data, cpuload_data_runtime);
+static DEFINE_PER_CPU(struct cpuload_data, cpuload_data_load);
+static DEFINE_PER_CPU(struct cpuload_data, cpuload_data_reset);
+
+/* Descriptor of the global "resetall" file; its cpu member stays 0 */
+static struct cpuload_data cpuload_data_resetall = {
+	.action = CPULOAD_RESETALL,
+};
+
+/*
+ * Scale the ratio run/all to an integer in units of 1/PRECFACTOR.
+ * The caller guarantees all != 0.
+ */
+static unsigned long long cpuload_scaled_load(unsigned long long run,
+					      unsigned long long all)
+{
+	/* Exact result as long as run * PRECFACTOR fits into 64 bits */
+	if (run <= ULLONG_MAX / PRECFACTOR)
+		return div64_u64(run * PRECFACTOR, all);
+
+	/*
+	 * Huge counters: scale the divisor down instead. all is at least as
+	 * large as run here, so the reduced divisor cannot become zero and
+	 * the truncation is far below the printed resolution.
+	 */
+	return div64_u64(run, div_u64(all, PRECFACTOR));
+}
+
+/*
+ * read_proc handler of the idletime, runtime and load files.
+ *
+ * buf:   output buffer provided by the proc core
+ * start: unused
+ * off:   unused
+ * count: size of buf
+ * eof:   always set to 1, the whole content is produced in one call
+ * data:  struct cpuload_data selecting the CPU and the value to show
+ *
+ * The interval in progress is folded into the counters first, so that the
+ * values are current as of this call. Returns the number of bytes stored
+ * in buf.
+ */
+static int show_cpuload(char *buf, char **start, off_t off,
+			int count, int *eof, void *data)
+{
+	struct cpuload_data *cpuload_data = data;
+	int cpu = cpuload_data->cpu;
+	unsigned long long now, idletime1, runtime1, alltime, scaled, whole;
+	u32 frac;
+	int r;
+
+	raw_spin_lock(&per_cpu(cpuload_lock, cpu));
+
+	/*
+	 * Read the clock under the lock: a stamp taken earlier could be
+	 * older than idlestart/idlestop written by a context switch in
+	 * between, and the subtraction below would wrap around.
+	 */
+	now = cpu_clock(cpu);
+
+	/* Update counters */
+	if (per_cpu(idlestart, cpu) > per_cpu(idlestop, cpu)) {
+		/* CPU is idle */
+		per_cpu(idletime, cpu) += now - per_cpu(idlestart, cpu);
+		per_cpu(idlestart, cpu) = now;
+	} else {
+		/* CPU is running */
+		per_cpu(runtime, cpu) += now - per_cpu(idlestop, cpu);
+		per_cpu(idlestop, cpu) = now;
+	}
+
+	/* Take a snapshot; formatting is done without holding the lock */
+	idletime1 = per_cpu(idletime, cpu);
+	runtime1 = per_cpu(runtime, cpu);
+
+	raw_spin_unlock(&per_cpu(cpuload_lock, cpu));
+
+	switch (cpuload_data->action) {
+	case CPULOAD_IDLETIME:
+		r = scnprintf(buf, count, "%llu\n", idletime1);
+		break;
+
+	case CPULOAD_RUNTIME:
+		r = scnprintf(buf, count, "%llu\n", runtime1);
+		break;
+
+	case CPULOAD_LOAD:
+		alltime = idletime1 + runtime1;
+		if (!alltime) {
+			r = scnprintf(buf, count, "n.a.\n");
+			break;
+		}
+
+		/* Format examples: 100.00%, 1.66%, 0.12%, 0.00% */
+		scaled = cpuload_scaled_load(runtime1, alltime);
+		/* PRECFACTOR / 100 is 10^DECPRECISION */
+		whole = div_u64_rem(scaled, PRECFACTOR / 100, &frac);
+		if (DECPRECISION)
+			r = scnprintf(buf, count, "%llu.%0*u%%\n",
+			    whole, DECPRECISION, frac);
+		else
+			r = scnprintf(buf, count, "%llu%%\n", whole);
+		break;
+
+	default:
+		r = 0;
+		break;
+	}
+
+	*eof = 1;
+	return r;
+}
+
+/*
+ * Clear the accumulated idle and run time of one CPU.
+ *
+ * The counters are also modified from the context switch path and from
+ * show_cpuload(), so they must only be touched with cpuload_lock held.
+ * The interval that is in progress is restarted at the time of the reset;
+ * otherwise the time that elapsed before the reset would still be added
+ * by the next update.
+ */
+static inline void reset_cpuload1(int cpu)
+{
+	raw_spin_lock(&per_cpu(cpuload_lock, cpu));
+
+	if (per_cpu(idlestart, cpu) > per_cpu(idlestop, cpu))
+		per_cpu(idlestart, cpu) = cpu_clock(cpu);	/* idle */
+	else
+		per_cpu(idlestop, cpu) = cpu_clock(cpu);	/* running */
+
+	per_cpu(idletime, cpu) = 0;
+	per_cpu(runtime, cpu) = 0;
+
+	raw_spin_unlock(&per_cpu(cpuload_lock, cpu));
+}
+
+/*
+ * write_proc handler of the reset and resetall files.
+ *
+ * The written bytes are never looked at; any write triggers the reset
+ * selected by the struct cpuload_data passed in data. Always reports the
+ * complete write as consumed by returning count.
+ */
+static int reset_cpuload(struct file *file, const char __user *buffer,
+			   unsigned long count, void *data)
+{
+	const struct cpuload_data *request = data;
+	int cpu;
+
+	if (request->action == CPULOAD_RESET) {
+		/* Single CPU, taken from the entry's descriptor */
+		reset_cpuload1(request->cpu);
+	} else if (request->action == CPULOAD_RESETALL) {
+		/* Every CPU that is currently online */
+		for_each_online_cpu(cpu)
+			reset_cpuload1(cpu);
+	}
+
+	return count;
+}
+
+/*
+ * Fill in the descriptor slot with cpu and action, create the proc entry
+ * name below dir and hook up the given handler (rd or wr, the other one
+ * is NULL) together with the descriptor. A failed creation is ignored.
+ */
+static void __init cpuload_add_entry(const char *name, mode_t mode,
+				     struct proc_dir_entry *dir,
+				     read_proc_t *rd, write_proc_t *wr,
+				     struct cpuload_data *slot,
+				     int cpu, int action)
+{
+	struct proc_dir_entry *pde;
+
+	slot->cpu = cpu;
+	slot->action = action;
+
+	pde = create_proc_entry(name, mode, dir);
+	if (pde == NULL)
+		return;
+
+	if (rd != NULL)
+		pde->read_proc = rd;
+	if (wr != NULL)
+		pde->write_proc = wr;
+	pde->data = (void *) slot;
+}
+
+/*
+ * Build the /proc/cpuload tree: the global "resetall" file and, for every
+ * possible CPU, a directory cpuN holding idletime, runtime, load and
+ * reset. Setup stops silently when a directory cannot be created; the
+ * function returns 0 in all cases.
+ */
+static int __init proc_cpuload_init(void)
+{
+	struct proc_dir_entry *topdir, *pde;
+	int cpu;
+
+	topdir = proc_mkdir("cpuload", NULL);
+	if (topdir == NULL)
+		return 0;
+
+	/* The resetall descriptor is initialized statically */
+	pde = create_proc_entry("resetall", S_IWUGO, topdir);
+	if (pde != NULL) {
+		pde->write_proc = reset_cpuload;
+		pde->data = (void *) &cpuload_data_resetall;
+	}
+
+	for_each_possible_cpu(cpu) {
+		char dirname[32];
+		struct proc_dir_entry *cpudir;
+
+		snprintf(dirname, sizeof(dirname), "cpu%d", cpu);
+		cpudir = proc_mkdir(dirname, topdir);
+		if (cpudir == NULL)
+			return 0;
+
+		cpuload_add_entry("idletime", S_IRUGO, cpudir,
+				  show_cpuload, NULL,
+				  &per_cpu(cpuload_data_idletime, cpu),
+				  cpu, CPULOAD_IDLETIME);
+		cpuload_add_entry("runtime", S_IRUGO, cpudir,
+				  show_cpuload, NULL,
+				  &per_cpu(cpuload_data_runtime, cpu),
+				  cpu, CPULOAD_RUNTIME);
+		cpuload_add_entry("load", S_IRUGO, cpudir,
+				  show_cpuload, NULL,
+				  &per_cpu(cpuload_data_load, cpu),
+				  cpu, CPULOAD_LOAD);
+		cpuload_add_entry("reset", S_IWUGO, cpudir,
+				  NULL, reset_cpuload,
+				  &per_cpu(cpuload_data_reset, cpu),
+				  cpu, CPULOAD_RESET);
+	}
+
+	return 0;
+}
+
+module_init(proc_cpuload_init);