[PATCH] schedule_on_each_cpu(): reduce kmalloc() size

schedule_on_each_cpu() presently does a single large kmalloc() of
NR_CPUS * sizeof(struct work_struct) - 96 kbytes on a 1024-CPU 64-bit kernel.

Rework it so that we do one 8192-byte allocation (alloc_percpu()'s internal
NR_CPUS-sized pointer array) and then a pile of tiny per-CPU ones, via
alloc_percpu().  This has a much higher chance of success (100% in the
current VM).
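
For illustration, the per-cpu allocation pattern being switched to looks
roughly like this (struct foo and foo_demo() are made-up names, not part of
this patch):

	#include <linux/percpu.h>
	#include <linux/cpumask.h>
	#include <linux/errno.h>

	struct foo {
		int counter;
	};

	static int foo_demo(void)
	{
		struct foo *objs;
		int cpu;

		/* one small object per possible CPU, not one NR_CPUS-sized kmalloc() */
		objs = alloc_percpu(struct foo);
		if (!objs)
			return -ENOMEM;

		for_each_online_cpu(cpu)
			per_cpu_ptr(objs, cpu)->counter = cpu;	/* that CPU's instance */

		free_percpu(objs);
		return 0;
	}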

This also reduces the memory requirement from NR_CPUS*n to
num_possible_cpus()*n, where n is sizeof(struct work_struct).
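
For example, with the ~96-byte work_struct implied by the numbers above, an
NR_CPUS=1024 kernel booted on a hypothetical 4-CPU box goes from ~96 kbytes
per call to the single 8192-byte allocation plus 4 * ~96 = ~384 bytes of
work_structs.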

Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 740c5ab..f869aff 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -428,22 +428,34 @@
 	return ret;
 }
 
-int schedule_on_each_cpu(void (*func) (void *info), void *info)
+/**
+ * schedule_on_each_cpu - call a function on each online CPU from keventd
+ * @func: the function to call
+ * @info: a pointer to pass to func()
+ *
+ * Returns zero on success.
+ * Returns -ve errno on failure.
+ *
+ * Appears to be racy against CPU hotplug.
+ *
+ * schedule_on_each_cpu() is very slow.
+ */
+int schedule_on_each_cpu(void (*func)(void *info), void *info)
 {
 	int cpu;
-	struct work_struct *work;
+	struct work_struct *works;
 
-	work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL);
-
-	if (!work)
+	works = alloc_percpu(struct work_struct);
+	if (!works)
 		return -ENOMEM;
+
 	for_each_online_cpu(cpu) {
-		INIT_WORK(work + cpu, func, info);
+		INIT_WORK(per_cpu_ptr(works, cpu), func, info);
 		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
-				work + cpu);
+				per_cpu_ptr(works, cpu));
 	}
 	flush_workqueue(keventd_wq);
-	kfree(work);
+	free_percpu(works);
 	return 0;
 }