s390/fpu: allocate 'struct fpu' with the task_struct

Analogous to git commit 0c8c0f03e3a292e031596484275c14cf39c0ab7a
"x86/fpu, sched: Dynamically allocate 'struct fpu'",
move the struct fpu to the end of the struct thread_struct,
set CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT and add the
setup_task_size() function to calculate the correct size
for the task struct.

For the performance_defconfig this increases the size of
struct task_struct from 7424 bytes to 7936 bytes (MACHINE_HAS_VX==1)
or 7552 bytes (MACHINE_HAS_VX==0). The dynamic allocation of the
struct fpu is removed. The slab cache uses an 8KB block for the
task struct in all cases, there is enough room for the struct fpu.
For MACHINE_HAS_VX==1 each task now needs 512 bytes less memory.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index bf24ab1..212f34b 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -107,6 +107,7 @@
 	select ARCH_SUPPORTS_NUMA_BALANCING
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
+	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	select ARCH_WANTS_PROT_NUMA_PROT_NONE
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select BUILDTIME_EXTABLE_SORT
diff --git a/arch/s390/include/asm/fpu/types.h b/arch/s390/include/asm/fpu/types.h
index 14a8b0c..fe937c9 100644
--- a/arch/s390/include/asm/fpu/types.h
+++ b/arch/s390/include/asm/fpu/types.h
@@ -11,11 +11,13 @@
 #include <asm/sigcontext.h>
 
 struct fpu {
-	__u32 fpc;			/* Floating-point control */
+	__u32 fpc;		/* Floating-point control */
+	void *regs;		/* Pointer to the current save area */
 	union {
-		void *regs;
-		freg_t *fprs;		/* Floating-point register save area */
-		__vector128 *vxrs;	/* Vector register save area */
+		/* Floating-point register save area */
+		freg_t fprs[__NUM_FPRS];
+		/* Vector register save area */
+		__vector128 vxrs[__NUM_VXRS];
 	};
 };
 
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index d6fd22e..332f4f7 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -105,7 +105,6 @@
  * Thread structure
  */
 struct thread_struct {
-	struct fpu fpu;			/* FP and VX register save area */
 	unsigned int  acrs[NUM_ACRS];
         unsigned long ksp;              /* kernel stack pointer             */
 	mm_segment_t mm_segment;
@@ -120,6 +119,11 @@
 	/* cpu runtime instrumentation */
 	struct runtime_instr_cb *ri_cb;
 	unsigned char trap_tdb[256];	/* Transaction abort diagnose block */
+	/*
+	 * Warning: 'fpu' is dynamically-sized. It *MUST* be at
+	 * the end.
+	 */
+	struct fpu fpu;			/* FP and VX register save area */
 };
 
 /* Flag to disable transactions. */
@@ -155,10 +159,9 @@
 
 #define ARCH_MIN_TASKALIGN	8
 
-extern __vector128 init_task_fpu_regs[__NUM_VXRS];
 #define INIT_THREAD {							\
 	.ksp = sizeof(init_stack) + (unsigned long) &init_stack,	\
-	.fpu.regs = (void *)&init_task_fpu_regs,			\
+	.fpu.regs = (void *) init_task.thread.fpu.fprs,			\
 }
 
 /*
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 2bba7df..adb346b 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -37,9 +37,6 @@
 
 asmlinkage void ret_from_fork(void) asm ("ret_from_fork");
 
-/* FPU save area for the init task */
-__vector128 init_task_fpu_regs[__NUM_VXRS] __init_task_data;
-
 /*
  * Return saved PC of a blocked thread. used in kernel/sched.
  * resume in entry.S does not create a new stack frame, it
@@ -85,35 +82,19 @@
 
 void arch_release_task_struct(struct task_struct *tsk)
 {
-	/* Free either the floating-point or the vector register save area */
-	kfree(tsk->thread.fpu.regs);
 }
 
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-	size_t fpu_regs_size;
-
-	*dst = *src;
-
-	/*
-	 * If the vector extension is available, it is enabled for all tasks,
-	 * and, thus, the FPU register save area must be allocated accordingly.
-	 */
-	fpu_regs_size = MACHINE_HAS_VX ? sizeof(__vector128) * __NUM_VXRS
-				       : sizeof(freg_t) * __NUM_FPRS;
-	dst->thread.fpu.regs = kzalloc(fpu_regs_size, GFP_KERNEL|__GFP_REPEAT);
-	if (!dst->thread.fpu.regs)
-		return -ENOMEM;
-
 	/*
 	 * Save the floating-point or vector register state of the current
 	 * task and set the CIF_FPU flag to lazy restore the FPU register
 	 * state when returning to user space.
 	 */
 	save_fpu_regs();
-	dst->thread.fpu.fpc = current->thread.fpu.fpc;
-	memcpy(dst->thread.fpu.regs, current->thread.fpu.regs, fpu_regs_size);
 
+	memcpy(dst, src, arch_task_struct_size);
+	dst->thread.fpu.regs = dst->thread.fpu.fprs;
 	return 0;
 }
 
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index d3f9688..f319391 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -809,6 +809,22 @@
 }
 
 /*
+ * Find the correct size for the task_struct. This depends on
+ * the size of the struct fpu at the end of the thread_struct
+ * which is embedded in the task_struct.
+ */
+static void __init setup_task_size(void)
+{
+	int task_size = sizeof(struct task_struct);
+
+	if (!MACHINE_HAS_VX) {
+		task_size -= sizeof(__vector128) * __NUM_VXRS;
+		task_size += sizeof(freg_t) * __NUM_FPRS;
+	}
+	arch_task_struct_size = task_size;
+}
+
+/*
  * Setup function called from init/main.c just after the banner
  * was printed.
  */
@@ -846,6 +862,7 @@
 
 	os_info_init();
 	setup_ipl();
+	setup_task_size();
 
 	/* Do some memory reservations *before* memory is added to memblock */
 	reserve_memory_end();