[CRYPTO] Use standard byte order macros wherever possible

A lot of crypto code needs to read/write 32-bit/64-bit words in a
specific byte order.  Many of them open code this by reading/writing one
byte at a time.  This patch converts all the applicable usages over
to use the standard byte order macros.

This is based on a previous patch by Denis Vlasenko.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
diff --git a/arch/i386/crypto/aes.c b/arch/i386/crypto/aes.c
index 88ee85c3..1deb9ff 100644
--- a/arch/i386/crypto/aes.c
+++ b/arch/i386/crypto/aes.c
@@ -36,6 +36,8 @@
  * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
  *
  */
+
+#include <asm/byteorder.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
@@ -59,7 +61,6 @@
 };
 
 #define WPOLY 0x011b
-#define u32_in(x) le32_to_cpup((const __le32 *)(x))
 #define bytes2word(b0, b1, b2, b3)  \
 	(((u32)(b3) << 24) | ((u32)(b2) << 16) | ((u32)(b1) << 8) | (b0))
 
@@ -393,13 +394,14 @@
 	int i;
 	u32 ss[8];
 	struct aes_ctx *ctx = ctx_arg;
+	const __le32 *key = (const __le32 *)in_key;
 
 	/* encryption schedule */
 	
-	ctx->ekey[0] = ss[0] = u32_in(in_key);
-	ctx->ekey[1] = ss[1] = u32_in(in_key + 4);
-	ctx->ekey[2] = ss[2] = u32_in(in_key + 8);
-	ctx->ekey[3] = ss[3] = u32_in(in_key + 12);
+	ctx->ekey[0] = ss[0] = le32_to_cpu(key[0]);
+	ctx->ekey[1] = ss[1] = le32_to_cpu(key[1]);
+	ctx->ekey[2] = ss[2] = le32_to_cpu(key[2]);
+	ctx->ekey[3] = ss[3] = le32_to_cpu(key[3]);
 
 	switch(key_len) {
 	case 16:
@@ -410,8 +412,8 @@
 		break;
 		
 	case 24:
-		ctx->ekey[4] = ss[4] = u32_in(in_key + 16);
-		ctx->ekey[5] = ss[5] = u32_in(in_key + 20);
+		ctx->ekey[4] = ss[4] = le32_to_cpu(key[4]);
+		ctx->ekey[5] = ss[5] = le32_to_cpu(key[5]);
 		for (i = 0; i < 7; i++)
 			ke6(ctx->ekey, i);
 		kel6(ctx->ekey, 7); 
@@ -419,10 +421,10 @@
 		break;
 
 	case 32:
-		ctx->ekey[4] = ss[4] = u32_in(in_key + 16);
-		ctx->ekey[5] = ss[5] = u32_in(in_key + 20);
-		ctx->ekey[6] = ss[6] = u32_in(in_key + 24);
-		ctx->ekey[7] = ss[7] = u32_in(in_key + 28);
+		ctx->ekey[4] = ss[4] = le32_to_cpu(key[4]);
+		ctx->ekey[5] = ss[5] = le32_to_cpu(key[5]);
+		ctx->ekey[6] = ss[6] = le32_to_cpu(key[6]);
+		ctx->ekey[7] = ss[7] = le32_to_cpu(key[7]);
 		for (i = 0; i < 6; i++)
 			ke8(ctx->ekey, i);
 		kel8(ctx->ekey, 6);
@@ -436,10 +438,10 @@
 	
 	/* decryption schedule */
 	
-	ctx->dkey[0] = ss[0] = u32_in(in_key);
-	ctx->dkey[1] = ss[1] = u32_in(in_key + 4);
-	ctx->dkey[2] = ss[2] = u32_in(in_key + 8);
-	ctx->dkey[3] = ss[3] = u32_in(in_key + 12);
+	ctx->dkey[0] = ss[0] = le32_to_cpu(key[0]);
+	ctx->dkey[1] = ss[1] = le32_to_cpu(key[1]);
+	ctx->dkey[2] = ss[2] = le32_to_cpu(key[2]);
+	ctx->dkey[3] = ss[3] = le32_to_cpu(key[3]);
 
 	switch (key_len) {
 	case 16:
@@ -450,8 +452,8 @@
 		break;
 		
 	case 24:
-		ctx->dkey[4] = ff(ss[4] = u32_in(in_key + 16));
-		ctx->dkey[5] = ff(ss[5] = u32_in(in_key + 20));
+		ctx->dkey[4] = ff(ss[4] = le32_to_cpu(key[4]));
+		ctx->dkey[5] = ff(ss[5] = le32_to_cpu(key[5]));
 		kdf6(ctx->dkey, 0);
 		for (i = 1; i < 7; i++)
 			kd6(ctx->dkey, i);
@@ -459,10 +461,10 @@
 		break;
 
 	case 32:
-		ctx->dkey[4] = ff(ss[4] = u32_in(in_key + 16));
-		ctx->dkey[5] = ff(ss[5] = u32_in(in_key + 20));
-		ctx->dkey[6] = ff(ss[6] = u32_in(in_key + 24));
-		ctx->dkey[7] = ff(ss[7] = u32_in(in_key + 28));
+		ctx->dkey[4] = ff(ss[4] = le32_to_cpu(key[4]));
+		ctx->dkey[5] = ff(ss[5] = le32_to_cpu(key[5]));
+		ctx->dkey[6] = ff(ss[6] = le32_to_cpu(key[6]));
+		ctx->dkey[7] = ff(ss[7] = le32_to_cpu(key[7]));
 		kdf8(ctx->dkey, 0);
 		for (i = 1; i < 6; i++)
 			kd8(ctx->dkey, i);
diff --git a/arch/x86_64/crypto/aes.c b/arch/x86_64/crypto/aes.c
index acfdaa2..19996854 100644
--- a/arch/x86_64/crypto/aes.c
+++ b/arch/x86_64/crypto/aes.c
@@ -74,8 +74,6 @@
 	return x >> (n << 3);
 }
 
-#define u32_in(x) le32_to_cpu(*(const __le32 *)(x))
-
 struct aes_ctx
 {
 	u32 key_length;
@@ -234,6 +232,7 @@
 		       u32 *flags)
 {
 	struct aes_ctx *ctx = ctx_arg;
+	const __le32 *key = (const __le32 *)in_key;
 	u32 i, j, t, u, v, w;
 
 	if (key_len != 16 && key_len != 24 && key_len != 32) {
@@ -243,10 +242,10 @@
 
 	ctx->key_length = key_len;
 
-	D_KEY[key_len + 24] = E_KEY[0] = u32_in(in_key);
-	D_KEY[key_len + 25] = E_KEY[1] = u32_in(in_key + 4);
-	D_KEY[key_len + 26] = E_KEY[2] = u32_in(in_key + 8);
-	D_KEY[key_len + 27] = E_KEY[3] = u32_in(in_key + 12);
+	D_KEY[key_len + 24] = E_KEY[0] = le32_to_cpu(key[0]);
+	D_KEY[key_len + 25] = E_KEY[1] = le32_to_cpu(key[1]);
+	D_KEY[key_len + 26] = E_KEY[2] = le32_to_cpu(key[2]);
+	D_KEY[key_len + 27] = E_KEY[3] = le32_to_cpu(key[3]);
 
 	switch (key_len) {
 	case 16:
@@ -256,17 +255,17 @@
 		break;
 
 	case 24:
-		E_KEY[4] = u32_in(in_key + 16);
-		t = E_KEY[5] = u32_in(in_key + 20);
+		E_KEY[4] = le32_to_cpu(key[4]);
+		t = E_KEY[5] = le32_to_cpu(key[5]);
 		for (i = 0; i < 8; ++i)
 			loop6 (i);
 		break;
 
 	case 32:
-		E_KEY[4] = u32_in(in_key + 16);
-		E_KEY[5] = u32_in(in_key + 20);
-		E_KEY[6] = u32_in(in_key + 24);
-		t = E_KEY[7] = u32_in(in_key + 28);
+		E_KEY[4] = le32_to_cpu(key[4]);
+		E_KEY[5] = le32_to_cpu(key[5]);
+		E_KEY[6] = le32_to_cpu(key[6]);
+		t = E_KEY[7] = le32_to_cpu(key[7]);
 		for (i = 0; i < 7; ++i)
 			loop8(i);
 		break;
diff --git a/crypto/aes.c b/crypto/aes.c
index 5df9288..35a11de 100644
--- a/crypto/aes.c
+++ b/crypto/aes.c
@@ -73,9 +73,6 @@
 	return x >> (n << 3);
 }
 
-#define u32_in(x) le32_to_cpu(*(const u32 *)(x))
-#define u32_out(to, from) (*(u32 *)(to) = cpu_to_le32(from))
-
 struct aes_ctx {
 	int key_length;
 	u32 E[60];
@@ -256,6 +253,7 @@
 aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len, u32 *flags)
 {
 	struct aes_ctx *ctx = ctx_arg;
+	const __le32 *key = (const __le32 *)in_key;
 	u32 i, t, u, v, w;
 
 	if (key_len != 16 && key_len != 24 && key_len != 32) {
@@ -265,10 +263,10 @@
 
 	ctx->key_length = key_len;
 
-	E_KEY[0] = u32_in (in_key);
-	E_KEY[1] = u32_in (in_key + 4);
-	E_KEY[2] = u32_in (in_key + 8);
-	E_KEY[3] = u32_in (in_key + 12);
+	E_KEY[0] = le32_to_cpu(key[0]);
+	E_KEY[1] = le32_to_cpu(key[1]);
+	E_KEY[2] = le32_to_cpu(key[2]);
+	E_KEY[3] = le32_to_cpu(key[3]);
 
 	switch (key_len) {
 	case 16:
@@ -278,17 +276,17 @@
 		break;
 
 	case 24:
-		E_KEY[4] = u32_in (in_key + 16);
-		t = E_KEY[5] = u32_in (in_key + 20);
+		E_KEY[4] = le32_to_cpu(key[4]);
+		t = E_KEY[5] = le32_to_cpu(key[5]);
 		for (i = 0; i < 8; ++i)
 			loop6 (i);
 		break;
 
 	case 32:
-		E_KEY[4] = u32_in (in_key + 16);
-		E_KEY[5] = u32_in (in_key + 20);
-		E_KEY[6] = u32_in (in_key + 24);
-		t = E_KEY[7] = u32_in (in_key + 28);
+		E_KEY[4] = le32_to_cpu(key[4]);
+		E_KEY[5] = le32_to_cpu(key[5]);
+		E_KEY[6] = le32_to_cpu(key[6]);
+		t = E_KEY[7] = le32_to_cpu(key[7]);
 		for (i = 0; i < 7; ++i)
 			loop8 (i);
 		break;
@@ -324,13 +322,15 @@
 static void aes_encrypt(void *ctx_arg, u8 *out, const u8 *in)
 {
 	const struct aes_ctx *ctx = ctx_arg;
+	const __le32 *src = (const __le32 *)in;
+	__le32 *dst = (__le32 *)out;
 	u32 b0[4], b1[4];
 	const u32 *kp = E_KEY + 4;
 
-	b0[0] = u32_in (in) ^ E_KEY[0];
-	b0[1] = u32_in (in + 4) ^ E_KEY[1];
-	b0[2] = u32_in (in + 8) ^ E_KEY[2];
-	b0[3] = u32_in (in + 12) ^ E_KEY[3];
+	b0[0] = le32_to_cpu(src[0]) ^ E_KEY[0];
+	b0[1] = le32_to_cpu(src[1]) ^ E_KEY[1];
+	b0[2] = le32_to_cpu(src[2]) ^ E_KEY[2];
+	b0[3] = le32_to_cpu(src[3]) ^ E_KEY[3];
 
 	if (ctx->key_length > 24) {
 		f_nround (b1, b0, kp);
@@ -353,10 +353,10 @@
 	f_nround (b1, b0, kp);
 	f_lround (b0, b1, kp);
 
-	u32_out (out, b0[0]);
-	u32_out (out + 4, b0[1]);
-	u32_out (out + 8, b0[2]);
-	u32_out (out + 12, b0[3]);
+	dst[0] = cpu_to_le32(b0[0]);
+	dst[1] = cpu_to_le32(b0[1]);
+	dst[2] = cpu_to_le32(b0[2]);
+	dst[3] = cpu_to_le32(b0[3]);
 }
 
 /* decrypt a block of text */
@@ -377,14 +377,16 @@
 static void aes_decrypt(void *ctx_arg, u8 *out, const u8 *in)
 {
 	const struct aes_ctx *ctx = ctx_arg;
+	const __le32 *src = (const __le32 *)in;
+	__le32 *dst = (__le32 *)out;
 	u32 b0[4], b1[4];
 	const int key_len = ctx->key_length;
 	const u32 *kp = D_KEY + key_len + 20;
 
-	b0[0] = u32_in (in) ^ E_KEY[key_len + 24];
-	b0[1] = u32_in (in + 4) ^ E_KEY[key_len + 25];
-	b0[2] = u32_in (in + 8) ^ E_KEY[key_len + 26];
-	b0[3] = u32_in (in + 12) ^ E_KEY[key_len + 27];
+	b0[0] = le32_to_cpu(src[0]) ^ E_KEY[key_len + 24];
+	b0[1] = le32_to_cpu(src[1]) ^ E_KEY[key_len + 25];
+	b0[2] = le32_to_cpu(src[2]) ^ E_KEY[key_len + 26];
+	b0[3] = le32_to_cpu(src[3]) ^ E_KEY[key_len + 27];
 
 	if (key_len > 24) {
 		i_nround (b1, b0, kp);
@@ -407,10 +409,10 @@
 	i_nround (b1, b0, kp);
 	i_lround (b0, b1, kp);
 
-	u32_out (out, b0[0]);
-	u32_out (out + 4, b0[1]);
-	u32_out (out + 8, b0[2]);
-	u32_out (out + 12, b0[3]);
+	dst[0] = cpu_to_le32(b0[0]);
+	dst[1] = cpu_to_le32(b0[1]);
+	dst[2] = cpu_to_le32(b0[2]);
+	dst[3] = cpu_to_le32(b0[3]);
 }
 
 
diff --git a/crypto/anubis.c b/crypto/anubis.c
index 3925eb0..94c4b1f 100644
--- a/crypto/anubis.c
+++ b/crypto/anubis.c
@@ -32,8 +32,10 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <asm/byteorder.h>
 #include <asm/scatterlist.h>
 #include <linux/crypto.h>
+#include <linux/types.h>
 
 #define ANUBIS_MIN_KEY_SIZE	16
 #define ANUBIS_MAX_KEY_SIZE	40
@@ -461,8 +463,8 @@
 static int anubis_setkey(void *ctx_arg, const u8 *in_key,
 			 unsigned int key_len, u32 *flags)
 {
-
-	int N, R, i, pos, r;
+	const __be32 *key = (const __be32 *)in_key;
+	int N, R, i, r;
 	u32 kappa[ANUBIS_MAX_N];
 	u32 inter[ANUBIS_MAX_N];
 
@@ -483,13 +485,8 @@
 	ctx->R = R = 8 + N;
 
 	/* * map cipher key to initial key state (mu): */
-		for (i = 0, pos = 0; i < N; i++, pos += 4) {
-		kappa[i] =
-			(in_key[pos    ] << 24) ^
-			(in_key[pos + 1] << 16) ^
-			(in_key[pos + 2] <<  8) ^
-			(in_key[pos + 3]      );
-	}
+	for (i = 0; i < N; i++)
+		kappa[i] = be32_to_cpu(key[i]);
 
 	/*
 	 * generate R + 1 round keys:
@@ -578,7 +575,9 @@
 static void anubis_crypt(u32 roundKey[ANUBIS_MAX_ROUNDS + 1][4],
 		u8 *ciphertext, const u8 *plaintext, const int R)
 {
-	int i, pos, r;
+	const __be32 *src = (const __be32 *)plaintext;
+	__be32 *dst = (__be32 *)ciphertext;
+	int i, r;
 	u32 state[4];
 	u32 inter[4];
 
@@ -586,14 +585,8 @@
 	 * map plaintext block to cipher state (mu)
 	 * and add initial round key (sigma[K^0]):
 	 */
-	for (i = 0, pos = 0; i < 4; i++, pos += 4) {
-		state[i] =
-			(plaintext[pos    ] << 24) ^
-			(plaintext[pos + 1] << 16) ^
-			(plaintext[pos + 2] <<  8) ^
-			(plaintext[pos + 3]      ) ^
-			roundKey[0][i];
-	}
+	for (i = 0; i < 4; i++)
+		state[i] = be32_to_cpu(src[i]) ^ roundKey[0][i];
 
 	/*
 	 * R - 1 full rounds:
@@ -663,13 +656,8 @@
 	 * map cipher state to ciphertext block (mu^{-1}):
 	 */
 
-	for (i = 0, pos = 0; i < 4; i++, pos += 4) {
-		u32 w = inter[i];
-		ciphertext[pos    ] = (u8)(w >> 24);
-		ciphertext[pos + 1] = (u8)(w >> 16);
-		ciphertext[pos + 2] = (u8)(w >>  8);
-		ciphertext[pos + 3] = (u8)(w      );
-	}
+	for (i = 0; i < 4; i++)
+		dst[i] = cpu_to_be32(inter[i]);
 }
 
 static void anubis_encrypt(void *ctx_arg, u8 *dst, const u8 *src)
diff --git a/crypto/blowfish.c b/crypto/blowfish.c
index a8b29d5..99fc459 100644
--- a/crypto/blowfish.c
+++ b/crypto/blowfish.c
@@ -19,8 +19,10 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <asm/byteorder.h>
 #include <asm/scatterlist.h>
 #include <linux/crypto.h>
+#include <linux/types.h>
 
 #define BF_BLOCK_SIZE 8
 #define BF_MIN_KEY_SIZE 4
diff --git a/crypto/cast5.c b/crypto/cast5.c
index bc42f42..282641c 100644
--- a/crypto/cast5.c
+++ b/crypto/cast5.c
@@ -21,11 +21,13 @@
 */
 
 
+#include <asm/byteorder.h>
 #include <linux/init.h>
 #include <linux/crypto.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/types.h>
 
 #define CAST5_BLOCK_SIZE 8
 #define CAST5_MIN_KEY_SIZE 5
@@ -578,6 +580,8 @@
 static void cast5_encrypt(void *ctx, u8 * outbuf, const u8 * inbuf)
 {
 	struct cast5_ctx *c = (struct cast5_ctx *) ctx;
+	const __be32 *src = (const __be32 *)inbuf;
+	__be32 *dst = (__be32 *)outbuf;
 	u32 l, r, t;
 	u32 I;			/* used by the Fx macros */
 	u32 *Km;
@@ -589,8 +593,8 @@
 	/* (L0,R0) <-- (m1...m64).  (Split the plaintext into left and
 	 * right 32-bit halves L0 = m1...m32 and R0 = m33...m64.)
 	 */
-	l = inbuf[0] << 24 | inbuf[1] << 16 | inbuf[2] << 8 | inbuf[3];
-	r = inbuf[4] << 24 | inbuf[5] << 16 | inbuf[6] << 8 | inbuf[7];
+	l = be32_to_cpu(src[0]);
+	r = be32_to_cpu(src[1]);
 
 	/* (16 rounds) for i from 1 to 16, compute Li and Ri as follows:
 	 *  Li = Ri-1;
@@ -634,19 +638,15 @@
 
 	/* c1...c64 <-- (R16,L16).  (Exchange final blocks L16, R16 and
 	 *  concatenate to form the ciphertext.) */
-	outbuf[0] = (r >> 24) & 0xff;
-	outbuf[1] = (r >> 16) & 0xff;
-	outbuf[2] = (r >> 8) & 0xff;
-	outbuf[3] = r & 0xff;
-	outbuf[4] = (l >> 24) & 0xff;
-	outbuf[5] = (l >> 16) & 0xff;
-	outbuf[6] = (l >> 8) & 0xff;
-	outbuf[7] = l & 0xff;
+	dst[0] = cpu_to_be32(r);
+	dst[1] = cpu_to_be32(l);
 }
 
 static void cast5_decrypt(void *ctx, u8 * outbuf, const u8 * inbuf)
 {
 	struct cast5_ctx *c = (struct cast5_ctx *) ctx;
+	const __be32 *src = (const __be32 *)inbuf;
+	__be32 *dst = (__be32 *)outbuf;
 	u32 l, r, t;
 	u32 I;
 	u32 *Km;
@@ -655,8 +655,8 @@
 	Km = c->Km;
 	Kr = c->Kr;
 
-	l = inbuf[0] << 24 | inbuf[1] << 16 | inbuf[2] << 8 | inbuf[3];
-	r = inbuf[4] << 24 | inbuf[5] << 16 | inbuf[6] << 8 | inbuf[7];
+	l = be32_to_cpu(src[0]);
+	r = be32_to_cpu(src[1]);
 
 	if (!(c->rr)) {
 		t = l; l = r; r = t ^ F1(r, Km[15], Kr[15]);
@@ -690,14 +690,8 @@
 		t = l; l = r; r = t ^ F1(r, Km[0], Kr[0]);
 	}
 
-	outbuf[0] = (r >> 24) & 0xff;
-	outbuf[1] = (r >> 16) & 0xff;
-	outbuf[2] = (r >> 8) & 0xff;
-	outbuf[3] = r & 0xff;
-	outbuf[4] = (l >> 24) & 0xff;
-	outbuf[5] = (l >> 16) & 0xff;
-	outbuf[6] = (l >> 8) & 0xff;
-	outbuf[7] = l & 0xff;
+	dst[0] = cpu_to_be32(r);
+	dst[1] = cpu_to_be32(l);
 }
 
 static void key_schedule(u32 * x, u32 * z, u32 * k)
@@ -782,7 +776,7 @@
 	u32 x[4];
 	u32 z[4];
 	u32 k[16];
-	u8 p_key[16];
+	__be32 p_key[4];
 	struct cast5_ctx *c = (struct cast5_ctx *) ctx;
 	
 	if (key_len < 5 || key_len > 16) {
@@ -796,12 +790,10 @@
 	memcpy(p_key, key, key_len);
 
 
-	x[0] = p_key[0] << 24 | p_key[1] << 16 | p_key[2] << 8 | p_key[3];
-	x[1] = p_key[4] << 24 | p_key[5] << 16 | p_key[6] << 8 | p_key[7];
-	x[2] =
-	    p_key[8] << 24 | p_key[9] << 16 | p_key[10] << 8 | p_key[11];
-	x[3] =
-	    p_key[12] << 24 | p_key[13] << 16 | p_key[14] << 8 | p_key[15];
+	x[0] = be32_to_cpu(p_key[0]);
+	x[1] = be32_to_cpu(p_key[1]);
+	x[2] = be32_to_cpu(p_key[2]);
+	x[3] = be32_to_cpu(p_key[3]);
 
 	key_schedule(x, z, k);
 	for (i = 0; i < 16; i++)
diff --git a/crypto/cast6.c b/crypto/cast6.c
index 3eb0810..d317fff 100644
--- a/crypto/cast6.c
+++ b/crypto/cast6.c
@@ -18,11 +18,13 @@
  */
 
 
+#include <asm/byteorder.h>
 #include <linux/init.h>
 #include <linux/crypto.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/types.h>
 
 #define CAST6_BLOCK_SIZE 16
 #define CAST6_MIN_KEY_SIZE 16
@@ -384,7 +386,7 @@
 {
 	int i;
 	u32 key[8];
-	u8 p_key[32]; /* padded key */
+	__be32 p_key[8]; /* padded key */
 	struct cast6_ctx *c = (struct cast6_ctx *) ctx;
 
 	if (key_len < 16 || key_len > 32 || key_len % 4 != 0) {
@@ -395,14 +397,14 @@
 	memset (p_key, 0, 32);
 	memcpy (p_key, in_key, key_len);
 	
-	key[0] = p_key[0] << 24 | p_key[1] << 16 | p_key[2] << 8 | p_key[3];		/* A */
-	key[1] = p_key[4] << 24 | p_key[5] << 16 | p_key[6] << 8 | p_key[7];		/* B */
-	key[2] = p_key[8] << 24 | p_key[9] << 16 | p_key[10] << 8 | p_key[11];		/* C */
-	key[3] = p_key[12] << 24 | p_key[13] << 16 | p_key[14] << 8 | p_key[15];	/* D */
-	key[4] = p_key[16] << 24 | p_key[17] << 16 | p_key[18] << 8 | p_key[19];	/* E */
-	key[5] = p_key[20] << 24 | p_key[21] << 16 | p_key[22] << 8 | p_key[23];	/* F */
-	key[6] = p_key[24] << 24 | p_key[25] << 16 | p_key[26] << 8 | p_key[27];	/* G */
-	key[7] = p_key[28] << 24 | p_key[29] << 16 | p_key[30] << 8 | p_key[31];	/* H */
+	key[0] = be32_to_cpu(p_key[0]);		/* A */
+	key[1] = be32_to_cpu(p_key[1]);		/* B */
+	key[2] = be32_to_cpu(p_key[2]);		/* C */
+	key[3] = be32_to_cpu(p_key[3]);		/* D */
+	key[4] = be32_to_cpu(p_key[4]);		/* E */
+	key[5] = be32_to_cpu(p_key[5]);		/* F */
+	key[6] = be32_to_cpu(p_key[6]);		/* G */
+	key[7] = be32_to_cpu(p_key[7]);		/* H */
 	
 
 
@@ -444,14 +446,16 @@
 
 static void cast6_encrypt (void * ctx, u8 * outbuf, const u8 * inbuf) {
 	struct cast6_ctx * c = (struct cast6_ctx *)ctx;
+	const __be32 *src = (const __be32 *)inbuf;
+	__be32 *dst = (__be32 *)outbuf;
 	u32 block[4];
 	u32 * Km; 
 	u8 * Kr;
 
-	block[0] = inbuf[0] << 24 | inbuf[1] << 16 | inbuf[2] << 8 | inbuf[3];
-	block[1] = inbuf[4] << 24 | inbuf[5] << 16 | inbuf[6] << 8 | inbuf[7];
-	block[2] = inbuf[8] << 24 | inbuf[9] << 16 | inbuf[10] << 8 | inbuf[11];
-	block[3] = inbuf[12] << 24 | inbuf[13] << 16 | inbuf[14] << 8 | inbuf[15];
+	block[0] = be32_to_cpu(src[0]);
+	block[1] = be32_to_cpu(src[1]);
+	block[2] = be32_to_cpu(src[2]);
+	block[3] = be32_to_cpu(src[3]);
 
 	Km = c->Km[0]; Kr = c->Kr[0]; Q (block, Kr, Km);
 	Km = c->Km[1]; Kr = c->Kr[1]; Q (block, Kr, Km);
@@ -465,35 +469,25 @@
 	Km = c->Km[9]; Kr = c->Kr[9]; QBAR (block, Kr, Km);
 	Km = c->Km[10]; Kr = c->Kr[10]; QBAR (block, Kr, Km);
 	Km = c->Km[11]; Kr = c->Kr[11]; QBAR (block, Kr, Km);
-	
-	outbuf[0] = (block[0] >> 24) & 0xff;
-	outbuf[1] = (block[0] >> 16) & 0xff;
-	outbuf[2] = (block[0] >> 8) & 0xff;
-	outbuf[3] = block[0] & 0xff;
-	outbuf[4] = (block[1] >> 24) & 0xff;
-	outbuf[5] = (block[1] >> 16) & 0xff;
-	outbuf[6] = (block[1] >> 8) & 0xff;
-	outbuf[7] = block[1] & 0xff;
-	outbuf[8] = (block[2] >> 24) & 0xff;
-	outbuf[9] = (block[2] >> 16) & 0xff;
-	outbuf[10] = (block[2] >> 8) & 0xff;
-	outbuf[11] = block[2] & 0xff;
-	outbuf[12] = (block[3] >> 24) & 0xff;
-	outbuf[13] = (block[3] >> 16) & 0xff;
-	outbuf[14] = (block[3] >> 8) & 0xff;
-	outbuf[15] = block[3] & 0xff;	
+
+	dst[0] = cpu_to_be32(block[0]);
+	dst[1] = cpu_to_be32(block[1]);
+	dst[2] = cpu_to_be32(block[2]);
+	dst[3] = cpu_to_be32(block[3]);
 }	
 
 static void cast6_decrypt (void * ctx, u8 * outbuf, const u8 * inbuf) {
 	struct cast6_ctx * c = (struct cast6_ctx *)ctx;
+	const __be32 *src = (const __be32 *)inbuf;
+	__be32 *dst = (__be32 *)outbuf;
 	u32 block[4];
 	u32 * Km; 
 	u8 * Kr;
 
-	block[0] = inbuf[0] << 24 | inbuf[1] << 16 | inbuf[2] << 8 | inbuf[3];
-	block[1] = inbuf[4] << 24 | inbuf[5] << 16 | inbuf[6] << 8 | inbuf[7];
-	block[2] = inbuf[8] << 24 | inbuf[9] << 16 | inbuf[10] << 8 | inbuf[11];
-	block[3] = inbuf[12] << 24 | inbuf[13] << 16 | inbuf[14] << 8 | inbuf[15];
+	block[0] = be32_to_cpu(src[0]);
+	block[1] = be32_to_cpu(src[1]);
+	block[2] = be32_to_cpu(src[2]);
+	block[3] = be32_to_cpu(src[3]);
 
 	Km = c->Km[11]; Kr = c->Kr[11]; Q (block, Kr, Km);
 	Km = c->Km[10]; Kr = c->Kr[10]; Q (block, Kr, Km);
@@ -508,22 +502,10 @@
 	Km = c->Km[1]; Kr = c->Kr[1]; QBAR (block, Kr, Km);
 	Km = c->Km[0]; Kr = c->Kr[0]; QBAR (block, Kr, Km);
 	
-	outbuf[0] = (block[0] >> 24) & 0xff;
-	outbuf[1] = (block[0] >> 16) & 0xff;
-	outbuf[2] = (block[0] >> 8) & 0xff;
-	outbuf[3] = block[0] & 0xff;
-	outbuf[4] = (block[1] >> 24) & 0xff;
-	outbuf[5] = (block[1] >> 16) & 0xff;
-	outbuf[6] = (block[1] >> 8) & 0xff;
-	outbuf[7] = block[1] & 0xff;
-	outbuf[8] = (block[2] >> 24) & 0xff;
-	outbuf[9] = (block[2] >> 16) & 0xff;
-	outbuf[10] = (block[2] >> 8) & 0xff;
-	outbuf[11] = block[2] & 0xff;
-	outbuf[12] = (block[3] >> 24) & 0xff;
-	outbuf[13] = (block[3] >> 16) & 0xff;
-	outbuf[14] = (block[3] >> 8) & 0xff;
-	outbuf[15] = block[3] & 0xff;	
+	dst[0] = cpu_to_be32(block[0]);
+	dst[1] = cpu_to_be32(block[1]);
+	dst[2] = cpu_to_be32(block[2]);
+	dst[3] = cpu_to_be32(block[3]);
 }	
 
 static struct crypto_alg alg = {
diff --git a/crypto/crc32c.c b/crypto/crc32c.c
index 256956c..9533624 100644
--- a/crypto/crc32c.c
+++ b/crypto/crc32c.c
@@ -16,6 +16,7 @@
 #include <linux/string.h>
 #include <linux/crypto.h>
 #include <linux/crc32c.h>
+#include <linux/types.h>
 #include <asm/byteorder.h>
 
 #define CHKSUM_BLOCK_SIZE	32
diff --git a/crypto/des.c b/crypto/des.c
index a3c863d..dae4298 100644
--- a/crypto/des.c
+++ b/crypto/des.c
@@ -12,11 +12,13 @@
  *
  */
 
+#include <asm/byteorder.h>
 #include <linux/bitops.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/crypto.h>
+#include <linux/types.h>
 
 #define DES_KEY_SIZE		8
 #define DES_EXPKEY_WORDS	32
diff --git a/crypto/khazad.c b/crypto/khazad.c
index 738cb0d..6809210 100644
--- a/crypto/khazad.c
+++ b/crypto/khazad.c
@@ -22,8 +22,10 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <asm/byteorder.h>
 #include <asm/scatterlist.h>
 #include <linux/crypto.h>
+#include <linux/types.h>
 
 #define KHAZAD_KEY_SIZE		16
 #define KHAZAD_BLOCK_SIZE	8
@@ -755,8 +757,8 @@
 static int khazad_setkey(void *ctx_arg, const u8 *in_key,
                        unsigned int key_len, u32 *flags)
 {
-
 	struct khazad_ctx *ctx = ctx_arg;
+	const __be64 *key = (const __be64 *)in_key;
 	int r;
 	const u64 *S = T7;
 	u64 K2, K1;
@@ -767,22 +769,8 @@
 		return -EINVAL;
 	}
 
-	K2 = ((u64)in_key[ 0] << 56) ^
-	     ((u64)in_key[ 1] << 48) ^
-	     ((u64)in_key[ 2] << 40) ^
-	     ((u64)in_key[ 3] << 32) ^
-	     ((u64)in_key[ 4] << 24) ^
-	     ((u64)in_key[ 5] << 16) ^
-	     ((u64)in_key[ 6] <<  8) ^
-	     ((u64)in_key[ 7]      );
-	K1 = ((u64)in_key[ 8] << 56) ^
-	     ((u64)in_key[ 9] << 48) ^
-	     ((u64)in_key[10] << 40) ^
-	     ((u64)in_key[11] << 32) ^
-	     ((u64)in_key[12] << 24) ^
-	     ((u64)in_key[13] << 16) ^
-	     ((u64)in_key[14] <<  8) ^
-	     ((u64)in_key[15]      );
+	K2 = be64_to_cpu(key[0]);
+	K1 = be64_to_cpu(key[1]);
 
 	/* setup the encrypt key */
 	for (r = 0; r <= KHAZAD_ROUNDS; r++) {
@@ -820,19 +808,12 @@
 static void khazad_crypt(const u64 roundKey[KHAZAD_ROUNDS + 1],
 		u8 *ciphertext, const u8 *plaintext)
 {
-
+	const __be64 *src = (const __be64 *)plaintext;
+	__be64 *dst = (__be64 *)ciphertext;
 	int r;
 	u64 state;
 
-	state = ((u64)plaintext[0] << 56) ^
-		((u64)plaintext[1] << 48) ^
-		((u64)plaintext[2] << 40) ^
-		((u64)plaintext[3] << 32) ^
-		((u64)plaintext[4] << 24) ^
-		((u64)plaintext[5] << 16) ^
-		((u64)plaintext[6] <<  8) ^
-		((u64)plaintext[7]      ) ^
-		roundKey[0];
+	state = be64_to_cpu(*src) ^ roundKey[0];
 
 	for (r = 1; r < KHAZAD_ROUNDS; r++) {
 		state = T0[(int)(state >> 56)       ] ^
@@ -856,15 +837,7 @@
 		(T7[(int)(state      ) & 0xff] & 0x00000000000000ffULL) ^
 		roundKey[KHAZAD_ROUNDS];
 
-	ciphertext[0] = (u8)(state >> 56);
-	ciphertext[1] = (u8)(state >> 48);
-	ciphertext[2] = (u8)(state >> 40);
-	ciphertext[3] = (u8)(state >> 32);
-	ciphertext[4] = (u8)(state >> 24);
-	ciphertext[5] = (u8)(state >> 16);
-	ciphertext[6] = (u8)(state >>  8);
-	ciphertext[7] = (u8)(state      );
-
+	*dst = cpu_to_be64(state);
 }
 
 static void khazad_encrypt(void *ctx_arg, u8 *dst, const u8 *src)
diff --git a/crypto/md4.c b/crypto/md4.c
index bef6a9e..a2d6df5 100644
--- a/crypto/md4.c
+++ b/crypto/md4.c
@@ -24,6 +24,7 @@
 #include <linux/crypto.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
+#include <linux/types.h>
 #include <asm/byteorder.h>
 
 #define MD4_DIGEST_SIZE		16
diff --git a/crypto/md5.c b/crypto/md5.c
index 1ed45f9..7f041ae 100644
--- a/crypto/md5.c
+++ b/crypto/md5.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/crypto.h>
+#include <linux/types.h>
 #include <asm/byteorder.h>
 
 #define MD5_DIGEST_SIZE		16
diff --git a/crypto/michael_mic.c b/crypto/michael_mic.c
index a470bcb..4f6ab23 100644
--- a/crypto/michael_mic.c
+++ b/crypto/michael_mic.c
@@ -10,10 +10,12 @@
  * published by the Free Software Foundation.
  */
 
+#include <asm/byteorder.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/crypto.h>
+#include <linux/types.h>
 
 
 struct michael_mic_ctx {
@@ -43,21 +45,6 @@
 } while (0)
 
 
-static inline u32 get_le32(const u8 *p)
-{
-	return p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24);
-}
-
-
-static inline void put_le32(u8 *p, u32 v)
-{
-	p[0] = v;
-	p[1] = v >> 8;
-	p[2] = v >> 16;
-	p[3] = v >> 24;
-}
-
-
 static void michael_init(void *ctx)
 {
 	struct michael_mic_ctx *mctx = ctx;
@@ -68,6 +55,7 @@
 static void michael_update(void *ctx, const u8 *data, unsigned int len)
 {
 	struct michael_mic_ctx *mctx = ctx;
+	const __le32 *src;
 
 	if (mctx->pending_len) {
 		int flen = 4 - mctx->pending_len;
@@ -81,21 +69,23 @@
 		if (mctx->pending_len < 4)
 			return;
 
-		mctx->l ^= get_le32(mctx->pending);
+		src = (const __le32 *)mctx->pending;
+		mctx->l ^= le32_to_cpup(src);
 		michael_block(mctx->l, mctx->r);
 		mctx->pending_len = 0;
 	}
 
+	src = (const __le32 *)data;
+
 	while (len >= 4) {
-		mctx->l ^= get_le32(data);
+		mctx->l ^= le32_to_cpup(src++);
 		michael_block(mctx->l, mctx->r);
-		data += 4;
 		len -= 4;
 	}
 
 	if (len > 0) {
 		mctx->pending_len = len;
-		memcpy(mctx->pending, data, len);
+		memcpy(mctx->pending, src, len);
 	}
 }
 
@@ -104,6 +94,7 @@
 {
 	struct michael_mic_ctx *mctx = ctx;
 	u8 *data = mctx->pending;
+	__le32 *dst = (__le32 *)out;
 
 	/* Last block and padding (0x5a, 4..7 x 0) */
 	switch (mctx->pending_len) {
@@ -125,8 +116,8 @@
 	/* l ^= 0; */
 	michael_block(mctx->l, mctx->r);
 
-	put_le32(out, mctx->l);
-	put_le32(out + 4, mctx->r);
+	dst[0] = cpu_to_le32(mctx->l);
+	dst[1] = cpu_to_le32(mctx->r);
 }
 
 
@@ -134,13 +125,16 @@
 			  u32 *flags)
 {
 	struct michael_mic_ctx *mctx = ctx;
+	const __le32 *data = (const __le32 *)key;
+
 	if (keylen != 8) {
 		if (flags)
 			*flags = CRYPTO_TFM_RES_BAD_KEY_LEN;
 		return -EINVAL;
 	}
-	mctx->l = get_le32(key);
-	mctx->r = get_le32(key + 4);
+
+	mctx->l = le32_to_cpu(data[0]);
+	mctx->r = le32_to_cpu(data[1]);
 	return 0;
 }
 
diff --git a/crypto/serpent.c b/crypto/serpent.c
index 3cf2c50..a950ff8 100644
--- a/crypto/serpent.c
+++ b/crypto/serpent.c
@@ -20,6 +20,7 @@
 #include <linux/errno.h>
 #include <asm/byteorder.h>
 #include <linux/crypto.h>
+#include <linux/types.h>
 
 /* Key is padded to the maximum of 256 bits before round key generation.
  * Any key length <= 256 bits (32 bytes) is allowed by the algorithm.
diff --git a/crypto/sha1.c b/crypto/sha1.c
index 4016f3b..c686e78 100644
--- a/crypto/sha1.c
+++ b/crypto/sha1.c
@@ -21,6 +21,7 @@
 #include <linux/mm.h>
 #include <linux/crypto.h>
 #include <linux/cryptohash.h>
+#include <linux/types.h>
 #include <asm/scatterlist.h>
 #include <asm/byteorder.h>
 
@@ -72,20 +73,12 @@
 static void sha1_final(void* ctx, u8 *out)
 {
 	struct sha1_ctx *sctx = ctx;
-	u32 i, j, index, padlen;
-	u64 t;
-	u8 bits[8] = { 0, };
+	__be32 *dst = (__be32 *)out;
+	u32 i, index, padlen;
+	__be64 bits;
 	static const u8 padding[64] = { 0x80, };
 
-	t = sctx->count;
-	bits[7] = 0xff & t; t>>=8;
-	bits[6] = 0xff & t; t>>=8;
-	bits[5] = 0xff & t; t>>=8;
-	bits[4] = 0xff & t; t>>=8;
-	bits[3] = 0xff & t; t>>=8;
-	bits[2] = 0xff & t; t>>=8;
-	bits[1] = 0xff & t; t>>=8;
-	bits[0] = 0xff & t;
+	bits = cpu_to_be64(sctx->count);
 
 	/* Pad out to 56 mod 64 */
 	index = (sctx->count >> 3) & 0x3f;
@@ -93,16 +86,11 @@
 	sha1_update(sctx, padding, padlen);
 
 	/* Append length */
-	sha1_update(sctx, bits, sizeof bits); 
+	sha1_update(sctx, (const u8 *)&bits, sizeof(bits));
 
 	/* Store state in digest */
-	for (i = j = 0; i < 5; i++, j += 4) {
-		u32 t2 = sctx->state[i];
-		out[j+3] = t2 & 0xff; t2>>=8;
-		out[j+2] = t2 & 0xff; t2>>=8;
-		out[j+1] = t2 & 0xff; t2>>=8;
-		out[j  ] = t2 & 0xff;
-	}
+	for (i = 0; i < 5; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
 
 	/* Wipe context */
 	memset(sctx, 0, sizeof *sctx);
diff --git a/crypto/sha256.c b/crypto/sha256.c
index c78da50..9d5ef67 100644
--- a/crypto/sha256.c
+++ b/crypto/sha256.c
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/crypto.h>
+#include <linux/types.h>
 #include <asm/scatterlist.h>
 #include <asm/byteorder.h>
 
@@ -279,22 +280,15 @@
 static void sha256_final(void* ctx, u8 *out)
 {
 	struct sha256_ctx *sctx = ctx;
-	u8 bits[8];
-	unsigned int index, pad_len, t;
-	int i, j;
+	__be32 *dst = (__be32 *)out;
+	__be32 bits[2];
+	unsigned int index, pad_len;
+	int i;
 	static const u8 padding[64] = { 0x80, };
 
 	/* Save number of bits */
-	t = sctx->count[0];
-	bits[7] = t; t >>= 8;
-	bits[6] = t; t >>= 8;
-	bits[5] = t; t >>= 8;
-	bits[4] = t;
-	t = sctx->count[1];
-	bits[3] = t; t >>= 8;
-	bits[2] = t; t >>= 8;
-	bits[1] = t; t >>= 8;
-	bits[0] = t;
+	bits[1] = cpu_to_be32(sctx->count[0]);
+	bits[0] = cpu_to_be32(sctx->count[1]);
 
 	/* Pad out to 56 mod 64. */
 	index = (sctx->count[0] >> 3) & 0x3f;
@@ -302,16 +296,11 @@
 	sha256_update(sctx, padding, pad_len);
 
 	/* Append length (before padding) */
-	sha256_update(sctx, bits, 8);
+	sha256_update(sctx, (const u8 *)bits, sizeof(bits));
 
 	/* Store state in digest */
-	for (i = j = 0; i < 8; i++, j += 4) {
-		t = sctx->state[i];
-		out[j+3] = t; t >>= 8;
-		out[j+2] = t; t >>= 8;
-		out[j+1] = t; t >>= 8;
-		out[j  ] = t;
-	}
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
 
 	/* Zeroize sensitive information. */
 	memset(sctx, 0, sizeof(*sctx));
diff --git a/crypto/sha512.c b/crypto/sha512.c
index c663438..3e6e939 100644
--- a/crypto/sha512.c
+++ b/crypto/sha512.c
@@ -17,6 +17,7 @@
 #include <linux/mm.h>
 #include <linux/init.h>
 #include <linux/crypto.h>
+#include <linux/types.h>
 
 #include <asm/scatterlist.h>
 #include <asm/byteorder.h>
@@ -235,39 +236,17 @@
 sha512_final(void *ctx, u8 *hash)
 {
         struct sha512_ctx *sctx = ctx;
-	
         static u8 padding[128] = { 0x80, };
-
-        u32 t;
-	u64 t2;
-        u8 bits[128];
+	__be64 *dst = (__be64 *)hash;
+	__be32 bits[4];
 	unsigned int index, pad_len;
-	int i, j;
-
-        index = pad_len = t = i = j = 0;
-        t2 = 0;
+	int i;
 
 	/* Save number of bits */
-	t = sctx->count[0];
-	bits[15] = t; t>>=8;
-	bits[14] = t; t>>=8;
-	bits[13] = t; t>>=8;
-	bits[12] = t; 
-	t = sctx->count[1];
-	bits[11] = t; t>>=8;
-	bits[10] = t; t>>=8;
-	bits[9 ] = t; t>>=8;
-	bits[8 ] = t; 
-	t = sctx->count[2];
-	bits[7 ] = t; t>>=8;
-	bits[6 ] = t; t>>=8;
-	bits[5 ] = t; t>>=8;
-	bits[4 ] = t; 
-	t = sctx->count[3];
-	bits[3 ] = t; t>>=8;
-	bits[2 ] = t; t>>=8;
-	bits[1 ] = t; t>>=8;
-	bits[0 ] = t; 
+	bits[3] = cpu_to_be32(sctx->count[0]);
+	bits[2] = cpu_to_be32(sctx->count[1]);
+	bits[1] = cpu_to_be32(sctx->count[2]);
+	bits[0] = cpu_to_be32(sctx->count[3]);
 
 	/* Pad out to 112 mod 128. */
 	index = (sctx->count[0] >> 3) & 0x7f;
@@ -275,21 +254,12 @@
 	sha512_update(sctx, padding, pad_len);
 
 	/* Append length (before padding) */
-	sha512_update(sctx, bits, 16);
+	sha512_update(sctx, (const u8 *)bits, sizeof(bits));
 
 	/* Store state in digest */
-	for (i = j = 0; i < 8; i++, j += 8) {
-		t2 = sctx->state[i];
-		hash[j+7] = (char)t2 & 0xff; t2>>=8;
-		hash[j+6] = (char)t2 & 0xff; t2>>=8;
-		hash[j+5] = (char)t2 & 0xff; t2>>=8;
-		hash[j+4] = (char)t2 & 0xff; t2>>=8;
-		hash[j+3] = (char)t2 & 0xff; t2>>=8;
-		hash[j+2] = (char)t2 & 0xff; t2>>=8;
-		hash[j+1] = (char)t2 & 0xff; t2>>=8;
-		hash[j  ] = (char)t2 & 0xff;
-	}
-	
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be64(sctx->state[i]);
+
 	/* Zeroize sensitive information. */
 	memset(sctx, 0, sizeof(struct sha512_ctx));
 }
diff --git a/crypto/tea.c b/crypto/tea.c
index 5924efd..e0077c7 100644
--- a/crypto/tea.c
+++ b/crypto/tea.c
@@ -22,8 +22,10 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <asm/byteorder.h>
 #include <asm/scatterlist.h>
 #include <linux/crypto.h>
+#include <linux/types.h>
 
 #define TEA_KEY_SIZE		16
 #define TEA_BLOCK_SIZE		8
@@ -35,9 +37,6 @@
 #define XTEA_ROUNDS		32
 #define XTEA_DELTA		0x9e3779b9
 
-#define u32_in(x) le32_to_cpu(*(const __le32 *)(x))
-#define u32_out(to, from) (*(__le32 *)(to) = cpu_to_le32(from))
-
 struct tea_ctx {
 	u32 KEY[4];
 };
@@ -49,8 +48,8 @@
 static int tea_setkey(void *ctx_arg, const u8 *in_key,
                        unsigned int key_len, u32 *flags)
 { 
-
 	struct tea_ctx *ctx = ctx_arg;
+	const __le32 *key = (const __le32 *)in_key;
 	
 	if (key_len != 16)
 	{
@@ -58,10 +57,10 @@
 		return -EINVAL;
 	}
 
-	ctx->KEY[0] = u32_in (in_key);
-	ctx->KEY[1] = u32_in (in_key + 4);
-	ctx->KEY[2] = u32_in (in_key + 8);
-	ctx->KEY[3] = u32_in (in_key + 12);
+	ctx->KEY[0] = le32_to_cpu(key[0]);
+	ctx->KEY[1] = le32_to_cpu(key[1]);
+	ctx->KEY[2] = le32_to_cpu(key[2]);
+	ctx->KEY[3] = le32_to_cpu(key[3]);
 
 	return 0; 
 
@@ -73,9 +72,11 @@
 	u32 k0, k1, k2, k3;
 
 	struct tea_ctx *ctx = ctx_arg;
+	const __le32 *in = (const __le32 *)src;
+	__le32 *out = (__le32 *)dst;
 
-	y = u32_in (src);
-	z = u32_in (src + 4);
+	y = le32_to_cpu(in[0]);
+	z = le32_to_cpu(in[1]);
 
 	k0 = ctx->KEY[0];
 	k1 = ctx->KEY[1];
@@ -90,19 +91,20 @@
 		z += ((y << 4) + k2) ^ (y + sum) ^ ((y >> 5) + k3);
 	}
 	
-	u32_out (dst, y);
-	u32_out (dst + 4, z);
+	out[0] = cpu_to_le32(y);
+	out[1] = cpu_to_le32(z);
 }
 
 static void tea_decrypt(void *ctx_arg, u8 *dst, const u8 *src)
 { 
 	u32 y, z, n, sum;
 	u32 k0, k1, k2, k3;
-
 	struct tea_ctx *ctx = ctx_arg;
+	const __le32 *in = (const __le32 *)src;
+	__le32 *out = (__le32 *)dst;
 
-	y = u32_in (src);
-	z = u32_in (src + 4);
+	y = le32_to_cpu(in[0]);
+	z = le32_to_cpu(in[1]);
 
 	k0 = ctx->KEY[0];
 	k1 = ctx->KEY[1];
@@ -119,16 +121,15 @@
 		sum -= TEA_DELTA;
 	}
 	
-	u32_out (dst, y);
-	u32_out (dst + 4, z);
-
+	out[0] = cpu_to_le32(y);
+	out[1] = cpu_to_le32(z);
 }
 
 static int xtea_setkey(void *ctx_arg, const u8 *in_key,
                        unsigned int key_len, u32 *flags)
 { 
-
 	struct xtea_ctx *ctx = ctx_arg;
+	const __le32 *key = (const __le32 *)in_key;
 	
 	if (key_len != 16)
 	{
@@ -136,10 +137,10 @@
 		return -EINVAL;
 	}
 
-	ctx->KEY[0] = u32_in (in_key);
-	ctx->KEY[1] = u32_in (in_key + 4);
-	ctx->KEY[2] = u32_in (in_key + 8);
-	ctx->KEY[3] = u32_in (in_key + 12);
+	ctx->KEY[0] = le32_to_cpu(key[0]);
+	ctx->KEY[1] = le32_to_cpu(key[1]);
+	ctx->KEY[2] = le32_to_cpu(key[2]);
+	ctx->KEY[3] = le32_to_cpu(key[3]);
 
 	return 0; 
 
@@ -147,14 +148,15 @@
 
 static void xtea_encrypt(void *ctx_arg, u8 *dst, const u8 *src)
 { 
-
 	u32 y, z, sum = 0;
 	u32 limit = XTEA_DELTA * XTEA_ROUNDS;
 
 	struct xtea_ctx *ctx = ctx_arg;
+	const __le32 *in = (const __le32 *)src;
+	__le32 *out = (__le32 *)dst;
 
-	y = u32_in (src);
-	z = u32_in (src + 4);
+	y = le32_to_cpu(in[0]);
+	z = le32_to_cpu(in[1]);
 
 	while (sum != limit) {
 		y += ((z << 4 ^ z >> 5) + z) ^ (sum + ctx->KEY[sum&3]); 
@@ -162,19 +164,19 @@
 		z += ((y << 4 ^ y >> 5) + y) ^ (sum + ctx->KEY[sum>>11 &3]); 
 	}
 	
-	u32_out (dst, y);
-	u32_out (dst + 4, z);
-
+	out[0] = cpu_to_le32(y);
+	out[1] = cpu_to_le32(z);
 }
 
 static void xtea_decrypt(void *ctx_arg, u8 *dst, const u8 *src)
 { 
-
 	u32 y, z, sum;
 	struct tea_ctx *ctx = ctx_arg;
+	const __le32 *in = (const __le32 *)src;
+	__le32 *out = (__le32 *)dst;
 
-	y = u32_in (src);
-	z = u32_in (src + 4);
+	y = le32_to_cpu(in[0]);
+	z = le32_to_cpu(in[1]);
 
 	sum = XTEA_DELTA * XTEA_ROUNDS;
 
@@ -184,22 +186,22 @@
 		y -= ((z << 4 ^ z >> 5) + z) ^ (sum + ctx->KEY[sum & 3]);
 	}
 	
-	u32_out (dst, y);
-	u32_out (dst + 4, z);
-
+	out[0] = cpu_to_le32(y);
+	out[1] = cpu_to_le32(z);
 }
 
 
 static void xeta_encrypt(void *ctx_arg, u8 *dst, const u8 *src)
 { 
-
 	u32 y, z, sum = 0;
 	u32 limit = XTEA_DELTA * XTEA_ROUNDS;
 
 	struct xtea_ctx *ctx = ctx_arg;
+	const __le32 *in = (const __le32 *)src;
+	__le32 *out = (__le32 *)dst;
 
-	y = u32_in (src);
-	z = u32_in (src + 4);
+	y = le32_to_cpu(in[0]);
+	z = le32_to_cpu(in[1]);
 
 	while (sum != limit) {
 		y += (z << 4 ^ z >> 5) + (z ^ sum) + ctx->KEY[sum&3];
@@ -207,19 +209,19 @@
 		z += (y << 4 ^ y >> 5) + (y ^ sum) + ctx->KEY[sum>>11 &3];
 	}
 	
-	u32_out (dst, y);
-	u32_out (dst + 4, z);
-
+	out[0] = cpu_to_le32(y);
+	out[1] = cpu_to_le32(z);
 }
 
 static void xeta_decrypt(void *ctx_arg, u8 *dst, const u8 *src)
 { 
-
 	u32 y, z, sum;
 	struct tea_ctx *ctx = ctx_arg;
+	const __le32 *in = (const __le32 *)src;
+	__le32 *out = (__le32 *)dst;
 
-	y = u32_in (src);
-	z = u32_in (src + 4);
+	y = le32_to_cpu(in[0]);
+	z = le32_to_cpu(in[1]);
 
 	sum = XTEA_DELTA * XTEA_ROUNDS;
 
@@ -229,9 +231,8 @@
 		y -= (z << 4 ^ z >> 5) + (z ^ sum) + ctx->KEY[sum & 3];
 	}
 	
-	u32_out (dst, y);
-	u32_out (dst + 4, z);
-
+	out[0] = cpu_to_le32(y);
+	out[1] = cpu_to_le32(z);
 }
 
 static struct crypto_alg tea_alg = {
diff --git a/crypto/tgr192.c b/crypto/tgr192.c
index f0a45cf..2d8e44f 100644
--- a/crypto/tgr192.c
+++ b/crypto/tgr192.c
@@ -24,8 +24,10 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <asm/byteorder.h>
 #include <asm/scatterlist.h>
 #include <linux/crypto.h>
+#include <linux/types.h>
 
 #define TGR192_DIGEST_SIZE 24
 #define TGR160_DIGEST_SIZE 20
@@ -467,18 +469,10 @@
 	u64 a, b, c, aa, bb, cc;
 	u64 x[8];
 	int i;
-	const u8 *ptr = data;
+	const __le64 *ptr = (const __le64 *)data;
 
-	for (i = 0; i < 8; i++, ptr += 8) {
-		x[i] = (((u64)ptr[7] ) << 56) ^
-		(((u64)ptr[6] & 0xffL) << 48) ^
-		(((u64)ptr[5] & 0xffL) << 40) ^
-		(((u64)ptr[4] & 0xffL) << 32) ^
-		(((u64)ptr[3] & 0xffL) << 24) ^
-		(((u64)ptr[2] & 0xffL) << 16) ^
-		(((u64)ptr[1] & 0xffL) <<  8) ^
-		(((u64)ptr[0] & 0xffL)      );
-	}
+	for (i = 0; i < 8; i++)
+		x[i] = le64_to_cpu(ptr[i]);
 
 	/* save */
 	a = aa = tctx->a;
@@ -558,9 +552,10 @@
 static void tgr192_final(void *ctx, u8 * out)
 {
 	struct tgr192_ctx *tctx = ctx;
+	__be64 *dst = (__be64 *)out;
+	__be64 *be64p;
+	__le32 *le32p;
 	u32 t, msb, lsb;
-	u8 *p;
-	int i, j;
 
 	tgr192_update(tctx, NULL, 0); /* flush */ ;
 
@@ -594,41 +589,16 @@
 		memset(tctx->hash, 0, 56);    /* fill next block with zeroes */
 	}
 	/* append the 64 bit count */
-	tctx->hash[56] = lsb;
-	tctx->hash[57] = lsb >> 8;
-	tctx->hash[58] = lsb >> 16;
-	tctx->hash[59] = lsb >> 24;
-	tctx->hash[60] = msb;
-	tctx->hash[61] = msb >> 8;
-	tctx->hash[62] = msb >> 16;
-	tctx->hash[63] = msb >> 24;
+	le32p = (__le32 *)&tctx->hash[56];
+	le32p[0] = cpu_to_le32(lsb);
+	le32p[1] = cpu_to_le32(msb);
+
 	tgr192_transform(tctx, tctx->hash);
 
-	p = tctx->hash;
-	*p++ = tctx->a >> 56; *p++ = tctx->a >> 48; *p++ = tctx->a >> 40;
-	*p++ = tctx->a >> 32; *p++ = tctx->a >> 24; *p++ = tctx->a >> 16;
-	*p++ = tctx->a >>  8; *p++ = tctx->a;\
-	*p++ = tctx->b >> 56; *p++ = tctx->b >> 48; *p++ = tctx->b >> 40;
-	*p++ = tctx->b >> 32; *p++ = tctx->b >> 24; *p++ = tctx->b >> 16;
-	*p++ = tctx->b >>  8; *p++ = tctx->b;
-	*p++ = tctx->c >> 56; *p++ = tctx->c >> 48; *p++ = tctx->c >> 40;
-	*p++ = tctx->c >> 32; *p++ = tctx->c >> 24; *p++ = tctx->c >> 16;
-	*p++ = tctx->c >>  8; *p++ = tctx->c;
-
-
-	/* unpack the hash */
-	j = 7;
-	for (i = 0; i < 8; i++) {
-		out[j--] = (tctx->a >> 8 * i) & 0xff;
-	}
-	j = 15;
-	for (i = 0; i < 8; i++) {
-		out[j--] = (tctx->b >> 8 * i) & 0xff;
-	}
-	j = 23;
-	for (i = 0; i < 8; i++) {
-		out[j--] = (tctx->c >> 8 * i) & 0xff;
-	}
+	be64p = (__be64 *)tctx->hash;
+	dst[0] = be64p[0] = cpu_to_be64(tctx->a);
+	dst[1] = be64p[1] = cpu_to_be64(tctx->b);
+	dst[2] = be64p[2] = cpu_to_be64(tctx->c);
 }
 
 static void tgr160_final(void *ctx, u8 * out)
diff --git a/crypto/twofish.c b/crypto/twofish.c
index 4efff8c..b501d5a 100644
--- a/crypto/twofish.c
+++ b/crypto/twofish.c
@@ -37,6 +37,8 @@
  * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
  * Third Edition.
  */
+
+#include <asm/byteorder.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/types.h>
@@ -621,13 +623,11 @@
  * whitening subkey number m. */
 
 #define INPACK(n, x, m) \
-   x = in[4 * (n)] ^ (in[4 * (n) + 1] << 8) \
-     ^ (in[4 * (n) + 2] << 16) ^ (in[4 * (n) + 3] << 24) ^ ctx->w[m]
+   x = le32_to_cpu(src[n]) ^ ctx->w[m]
 
 #define OUTUNPACK(n, x, m) \
    x ^= ctx->w[m]; \
-   out[4 * (n)] = x; out[4 * (n) + 1] = x >> 8; \
-   out[4 * (n) + 2] = x >> 16; out[4 * (n) + 3] = x >> 24
+   dst[n] = cpu_to_le32(x)
 
 #define TF_MIN_KEY_SIZE 16
 #define TF_MAX_KEY_SIZE 32
@@ -804,6 +804,8 @@
 static void twofish_encrypt(void *cx, u8 *out, const u8 *in)
 {
 	struct twofish_ctx *ctx = cx;
+	const __le32 *src = (const __le32 *)in;
+	__le32 *dst = (__le32 *)out;
 
 	/* The four 32-bit chunks of the text. */
 	u32 a, b, c, d;
@@ -839,6 +841,8 @@
 static void twofish_decrypt(void *cx, u8 *out, const u8 *in)
 {
 	struct twofish_ctx *ctx = cx;
+	const __le32 *src = (const __le32 *)in;
+	__le32 *dst = (__le32 *)out;
   
 	/* The four 32-bit chunks of the text. */
 	u32 a, b, c, d;
diff --git a/crypto/wp512.c b/crypto/wp512.c
index fd6e20e..b226a12 100644
--- a/crypto/wp512.c
+++ b/crypto/wp512.c
@@ -22,8 +22,10 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <asm/byteorder.h>
 #include <asm/scatterlist.h>
 #include <linux/crypto.h>
+#include <linux/types.h>
 
 #define WP512_DIGEST_SIZE 64
 #define WP384_DIGEST_SIZE 48
@@ -778,19 +780,10 @@
 	u64 block[8];    /* mu(buffer) */
 	u64 state[8];    /* the cipher state */
 	u64 L[8];
-	u8 *buffer = wctx->buffer;
+	const __be64 *buffer = (const __be64 *)wctx->buffer;
 
-	for (i = 0; i < 8; i++, buffer += 8) {
-		block[i] =
-		(((u64)buffer[0]        ) << 56) ^
-		(((u64)buffer[1] & 0xffL) << 48) ^
-		(((u64)buffer[2] & 0xffL) << 40) ^
-		(((u64)buffer[3] & 0xffL) << 32) ^
-		(((u64)buffer[4] & 0xffL) << 24) ^
-		(((u64)buffer[5] & 0xffL) << 16) ^
-		(((u64)buffer[6] & 0xffL) <<  8) ^
-		(((u64)buffer[7] & 0xffL)      );
-	}
+	for (i = 0; i < 8; i++)
+		block[i] = be64_to_cpu(buffer[i]);
 
 	state[0] = block[0] ^ (K[0] = wctx->hash[0]);
 	state[1] = block[1] ^ (K[1] = wctx->hash[1]);
@@ -1069,7 +1062,7 @@
    	u8 *bitLength   = wctx->bitLength;
    	int bufferBits  = wctx->bufferBits;
    	int bufferPos   = wctx->bufferPos;
-   	u8 *digest      = out;
+	__be64 *digest  = (__be64 *)out;
 
    	buffer[bufferPos] |= 0x80U >> (bufferBits & 7);
    	bufferPos++;
@@ -1088,17 +1081,8 @@
    	memcpy(&buffer[WP512_BLOCK_SIZE - WP512_LENGTHBYTES],
 		   bitLength, WP512_LENGTHBYTES);
    	wp512_process_buffer(wctx);
-   	for (i = 0; i < WP512_DIGEST_SIZE/8; i++) {
-		digest[0] = (u8)(wctx->hash[i] >> 56);
-		digest[1] = (u8)(wctx->hash[i] >> 48);
-		digest[2] = (u8)(wctx->hash[i] >> 40);
-		digest[3] = (u8)(wctx->hash[i] >> 32);
-		digest[4] = (u8)(wctx->hash[i] >> 24);
-		digest[5] = (u8)(wctx->hash[i] >> 16);
-		digest[6] = (u8)(wctx->hash[i] >>  8);
-		digest[7] = (u8)(wctx->hash[i]      );
-		digest += 8;
-   	}
+	for (i = 0; i < WP512_DIGEST_SIZE/8; i++)
+		digest[i] = cpu_to_be64(wctx->hash[i]);
    	wctx->bufferBits   = bufferBits;
    	wctx->bufferPos    = bufferPos;
 }
diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c
index 71407c5..963e03d 100644
--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -99,9 +99,6 @@
 	return x >> (n << 3);
 }
 
-#define uint32_t_in(x) le32_to_cpu(*(const uint32_t *)(x))
-#define uint32_t_out(to, from) (*(uint32_t *)(to) = cpu_to_le32(from))
-
 #define E_KEY ctx->E
 #define D_KEY ctx->D
 
@@ -294,6 +291,7 @@
 aes_set_key(void *ctx_arg, const uint8_t *in_key, unsigned int key_len, uint32_t *flags)
 {
 	struct aes_ctx *ctx = aes_ctx(ctx_arg);
+	const __le32 *key = (const __le32 *)in_key;
 	uint32_t i, t, u, v, w;
 	uint32_t P[AES_EXTENDED_KEY_SIZE];
 	uint32_t rounds;
@@ -313,10 +311,10 @@
 	ctx->E = ctx->e_data;
 	ctx->D = ctx->e_data;
 
-	E_KEY[0] = uint32_t_in (in_key);
-	E_KEY[1] = uint32_t_in (in_key + 4);
-	E_KEY[2] = uint32_t_in (in_key + 8);
-	E_KEY[3] = uint32_t_in (in_key + 12);
+	E_KEY[0] = le32_to_cpu(key[0]);
+	E_KEY[1] = le32_to_cpu(key[1]);
+	E_KEY[2] = le32_to_cpu(key[2]);
+	E_KEY[3] = le32_to_cpu(key[3]);
 
 	/* Prepare control words. */
 	memset(&ctx->cword, 0, sizeof(ctx->cword));
@@ -343,17 +341,17 @@
 		break;
 
 	case 24:
-		E_KEY[4] = uint32_t_in (in_key + 16);
-		t = E_KEY[5] = uint32_t_in (in_key + 20);
+		E_KEY[4] = le32_to_cpu(key[4]);
+		t = E_KEY[5] = le32_to_cpu(key[5]);
 		for (i = 0; i < 8; ++i)
 			loop6 (i);
 		break;
 
 	case 32:
-		E_KEY[4] = uint32_t_in (in_key + 16);
-		E_KEY[5] = uint32_t_in (in_key + 20);
-		E_KEY[6] = uint32_t_in (in_key + 24);
-		t = E_KEY[7] = uint32_t_in (in_key + 28);
+		E_KEY[4] = le32_to_cpu(key[4]);
+		E_KEY[5] = le32_to_cpu(key[5]);
+		E_KEY[6] = le32_to_cpu(key[6]);
+		t = E_KEY[7] = le32_to_cpu(key[7]);
 		for (i = 0; i < 7; ++i)
 			loop8 (i);
 		break;