Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto updates from Herbert Xu:
 "Here is the crypto update for 4.9:

  API:
   - The crypto engine code now supports hashes.

  Algorithms:
   - Allow keys >= 2048 bits in FIPS mode for RSA.

  Drivers:
   - Memory overwrite fix for vmx ghash.
   - Add support for building ARM sha1-neon in Thumb2 mode.
   - Reenable ARM ghash-ce code by adding import/export.
   - Reenable img-hash by adding import/export.
   - Add support for multiple cores in omap-aes.
   - Add little-endian support for sha1-powerpc.
   - Add Cavium HWRNG driver for ThunderX SoC"

* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (137 commits)
  crypto: caam - treat SGT address pointer as u64
  crypto: ccp - Make syslog errors human-readable
  crypto: ccp - clean up data structure
  crypto: vmx - Ensure ghash-generic is enabled
  crypto: testmgr - add guard to dst buffer for ahash_export
  crypto: caam - Unmap region obtained by of_iomap
  crypto: sha1-powerpc - little-endian support
  crypto: gcm - Fix IV buffer size in crypto_gcm_setkey
  crypto: vmx - Fix memory corruption caused by p8_ghash
  crypto: ghash-generic - move common definitions to a new header file
  crypto: caam - fix sg dump
  hwrng: omap - Only fail if pm_runtime_get_sync returns < 0
  crypto: omap-sham - shrink the internal buffer size
  crypto: omap-sham - add support for export/import
  crypto: omap-sham - convert driver logic to use sgs for data xmit
  crypto: omap-sham - change the DMA threshold value to a define
  crypto: omap-sham - add support functions for sg based data handling
  crypto: omap-sham - rename sgl to sgl_tmp for deprecation
  crypto: omap-sham - align algorithms on word offset
  crypto: omap-sham - add context export/import stubs
  ...
diff --git a/Documentation/DocBook/crypto-API.tmpl b/Documentation/DocBook/crypto-API.tmpl
index fb2a152..088b79c 100644
--- a/Documentation/DocBook/crypto-API.tmpl
+++ b/Documentation/DocBook/crypto-API.tmpl
@@ -797,7 +797,8 @@
      include/linux/crypto.h and their definition can be seen below.
      The former function registers a single transformation, while
      the latter works on an array of transformation descriptions.
-     The latter is useful when registering transformations in bulk.
+     The latter is useful when registering transformations in bulk,
+     for example when a driver implements multiple transformations.
     </para>
 
     <programlisting>
@@ -822,18 +823,31 @@
     </para>
 
     <para>
-     The bulk registration / unregistration functions require
-     that struct crypto_alg is an array of count size. These
-     functions simply loop over that array and register /
-     unregister each individual algorithm. If an error occurs,
-     the loop is terminated at the offending algorithm definition.
-     That means, the algorithms prior to the offending algorithm
-     are successfully registered. Note, the caller has no way of
-     knowing which cipher implementations have successfully
-     registered. If this is important to know, the caller should
-     loop through the different implementations using the single
-     instance *_alg functions for each individual implementation.
+     The bulk registration/unregistration functions
+     register/unregister each transformation in the given array of
+     length count.  They handle errors as follows:
     </para>
+    <itemizedlist>
+     <listitem>
+      <para>
+       crypto_register_algs() succeeds if and only if it
+       successfully registers all the given transformations. If an
+       error occurs partway through, then it rolls back successful
+       registrations before returning the error code. Note that if
+       a driver needs to handle registration errors for individual
+       transformations, then it will need to use the non-bulk
+       function crypto_register_alg() instead.
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       crypto_unregister_algs() tries to unregister all the given
+       transformations, continuing on error. It logs errors and
+       always returns zero.
+      </para>
+     </listitem>
+    </itemizedlist>
+
    </sect1>
 
    <sect1><title>Single-Block Symmetric Ciphers [CIPHER]</title>
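The rollback semantics documented above make bulk registration all-or-nothing, which simplifies driver init paths. A minimal sketch of a module using the bulk interface (the mydrv names and the contents of algs[] are hypothetical):

#include <linux/crypto.h>
#include <linux/module.h>

static struct crypto_alg algs[] = {
	/* the driver's transformation descriptions would go here */
};

static int __init mydrv_init(void)
{
	/*
	 * All or nothing: on a failure partway through,
	 * crypto_register_algs() unwinds the entries it already
	 * registered and returns the error code.
	 */
	return crypto_register_algs(algs, ARRAY_SIZE(algs));
}

static void __exit mydrv_exit(void)
{
	/* Logs per-entry errors, if any, and always returns zero. */
	crypto_unregister_algs(algs, ARRAY_SIZE(algs));
}

module_init(mydrv_init);
module_exit(mydrv_exit);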
diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c
index 1568cb5..7546b3c 100644
--- a/arch/arm/crypto/ghash-ce-glue.c
+++ b/arch/arm/crypto/ghash-ce-glue.c
@@ -138,7 +138,7 @@
 	.setkey			= ghash_setkey,
 	.descsize		= sizeof(struct ghash_desc_ctx),
 	.base			= {
-		.cra_name	= "ghash",
+		.cra_name	= "__ghash",
 		.cra_driver_name = "__driver-ghash-ce",
 		.cra_priority	= 0,
 		.cra_flags	= CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_INTERNAL,
@@ -220,6 +220,27 @@
 	}
 }
 
+static int ghash_async_import(struct ahash_request *req, const void *in)
+{
+	struct ahash_request *cryptd_req = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+
+	desc->tfm = cryptd_ahash_child(ctx->cryptd_tfm);
+	desc->flags = req->base.flags;
+
+	return crypto_shash_import(desc, in);
+}
+
+static int ghash_async_export(struct ahash_request *req, void *out)
+{
+	struct ahash_request *cryptd_req = ahash_request_ctx(req);
+	struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+
+	return crypto_shash_export(desc, out);
+}
+
 static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
 			      unsigned int keylen)
 {
@@ -268,7 +289,10 @@
 	.final			= ghash_async_final,
 	.setkey			= ghash_async_setkey,
 	.digest			= ghash_async_digest,
+	.import			= ghash_async_import,
+	.export			= ghash_async_export,
 	.halg.digestsize	= GHASH_DIGEST_SIZE,
+	.halg.statesize		= sizeof(struct ghash_desc_ctx),
 	.halg.base		= {
 		.cra_name	= "ghash",
 		.cra_driver_name = "ghash-ce",
diff --git a/arch/arm/crypto/sha1-armv7-neon.S b/arch/arm/crypto/sha1-armv7-neon.S
index dcd01f3..2468fad 100644
--- a/arch/arm/crypto/sha1-armv7-neon.S
+++ b/arch/arm/crypto/sha1-armv7-neon.S
@@ -12,7 +12,6 @@
 #include <asm/assembler.h>
 
 .syntax unified
-.code   32
 .fpu neon
 
 .text
diff --git a/arch/powerpc/crypto/sha1-powerpc-asm.S b/arch/powerpc/crypto/sha1-powerpc-asm.S
index 125e165..82ddc9b 100644
--- a/arch/powerpc/crypto/sha1-powerpc-asm.S
+++ b/arch/powerpc/crypto/sha1-powerpc-asm.S
@@ -7,6 +7,15 @@
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 
+#ifdef __BIG_ENDIAN__
+#define LWZ(rt, d, ra)	\
+	lwz	rt,d(ra)
+#else
+#define LWZ(rt, d, ra)	\
+	li	rt,d;	\
+	lwbrx	rt,rt,ra
+#endif
+
 /*
  * We roll the registers for T, A, B, C, D, E around on each
  * iteration; T on iteration t is A on iteration t+1, and so on.
@@ -23,7 +32,7 @@
 #define W(t)	(((t)%16)+16)
 
 #define LOADW(t)				\
-	lwz	W(t),(t)*4(r4)
+	LWZ(W(t),(t)*4,r4)
 
 #define STEPD0_LOAD(t)				\
 	andc	r0,RD(t),RB(t);		\
@@ -33,7 +42,7 @@
 	add	r0,RE(t),r15;			\
 	add	RT(t),RT(t),r6;		\
 	add	r14,r0,W(t);			\
-	lwz	W((t)+4),((t)+4)*4(r4);	\
+	LWZ(W((t)+4),((t)+4)*4,r4);	\
 	rotlwi	RB(t),RB(t),30;			\
 	add	RT(t),RT(t),r14
 
diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c
index 68a5cea..2d8466f 100644
--- a/crypto/algif_hash.c
+++ b/crypto/algif_hash.c
@@ -39,6 +39,37 @@
 	bool has_key;
 };
 
+static int hash_alloc_result(struct sock *sk, struct hash_ctx *ctx)
+{
+	unsigned ds;
+
+	if (ctx->result)
+		return 0;
+
+	ds = crypto_ahash_digestsize(crypto_ahash_reqtfm(&ctx->req));
+
+	ctx->result = sock_kmalloc(sk, ds, GFP_KERNEL);
+	if (!ctx->result)
+		return -ENOMEM;
+
+	memset(ctx->result, 0, ds);
+
+	return 0;
+}
+
+static void hash_free_result(struct sock *sk, struct hash_ctx *ctx)
+{
+	unsigned ds;
+
+	if (!ctx->result)
+		return;
+
+	ds = crypto_ahash_digestsize(crypto_ahash_reqtfm(&ctx->req));
+
+	sock_kzfree_s(sk, ctx->result, ds);
+	ctx->result = NULL;
+}
+
 static int hash_sendmsg(struct socket *sock, struct msghdr *msg,
 			size_t ignored)
 {
@@ -54,6 +85,9 @@
 
 	lock_sock(sk);
 	if (!ctx->more) {
+		if ((msg->msg_flags & MSG_MORE))
+			hash_free_result(sk, ctx);
+
 		err = af_alg_wait_for_completion(crypto_ahash_init(&ctx->req),
 						&ctx->completion);
 		if (err)
@@ -90,6 +124,10 @@
 
 	ctx->more = msg->msg_flags & MSG_MORE;
 	if (!ctx->more) {
+		err = hash_alloc_result(sk, ctx);
+		if (err)
+			goto unlock;
+
 		ahash_request_set_crypt(&ctx->req, NULL, ctx->result, 0);
 		err = af_alg_wait_for_completion(crypto_ahash_final(&ctx->req),
 						 &ctx->completion);
@@ -116,6 +154,13 @@
 	sg_init_table(ctx->sgl.sg, 1);
 	sg_set_page(ctx->sgl.sg, page, size, offset);
 
+	if (!(flags & MSG_MORE)) {
+		err = hash_alloc_result(sk, ctx);
+		if (err)
+			goto unlock;
+	} else if (!ctx->more)
+		hash_free_result(sk, ctx);
+
 	ahash_request_set_crypt(&ctx->req, ctx->sgl.sg, ctx->result, size);
 
 	if (!(flags & MSG_MORE)) {
@@ -153,6 +198,7 @@
 	struct alg_sock *ask = alg_sk(sk);
 	struct hash_ctx *ctx = ask->private;
 	unsigned ds = crypto_ahash_digestsize(crypto_ahash_reqtfm(&ctx->req));
+	bool result;
 	int err;
 
 	if (len > ds)
@@ -161,17 +207,29 @@
 		msg->msg_flags |= MSG_TRUNC;
 
 	lock_sock(sk);
+	result = ctx->result;
+	err = hash_alloc_result(sk, ctx);
+	if (err)
+		goto unlock;
+
+	ahash_request_set_crypt(&ctx->req, NULL, ctx->result, 0);
+
 	if (ctx->more) {
 		ctx->more = 0;
-		ahash_request_set_crypt(&ctx->req, NULL, ctx->result, 0);
 		err = af_alg_wait_for_completion(crypto_ahash_final(&ctx->req),
 						 &ctx->completion);
 		if (err)
 			goto unlock;
+	} else if (!result) {
+		err = af_alg_wait_for_completion(
+				crypto_ahash_digest(&ctx->req),
+				&ctx->completion);
 	}
 
 	err = memcpy_to_msg(msg, ctx->result, len);
 
+	hash_free_result(sk, ctx);
+
 unlock:
 	release_sock(sk);
 
@@ -394,8 +452,7 @@
 	struct alg_sock *ask = alg_sk(sk);
 	struct hash_ctx *ctx = ask->private;
 
-	sock_kzfree_s(sk, ctx->result,
-		      crypto_ahash_digestsize(crypto_ahash_reqtfm(&ctx->req)));
+	hash_free_result(sk, ctx);
 	sock_kfree_s(sk, ctx, ctx->len);
 	af_alg_release_parent(sk);
 }
@@ -407,20 +464,12 @@
 	struct algif_hash_tfm *tfm = private;
 	struct crypto_ahash *hash = tfm->hash;
 	unsigned len = sizeof(*ctx) + crypto_ahash_reqsize(hash);
-	unsigned ds = crypto_ahash_digestsize(hash);
 
 	ctx = sock_kmalloc(sk, len, GFP_KERNEL);
 	if (!ctx)
 		return -ENOMEM;
 
-	ctx->result = sock_kmalloc(sk, ds, GFP_KERNEL);
-	if (!ctx->result) {
-		sock_kfree_s(sk, ctx, len);
-		return -ENOMEM;
-	}
-
-	memset(ctx->result, 0, ds);
-
+	ctx->result = NULL;
 	ctx->len = len;
 	ctx->more = 0;
 	af_alg_init_completion(&ctx->completion);
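With this change the result buffer exists only while a digest is actually pending, instead of for the lifetime of every accepted socket. For reference, the userspace view of such a socket (a minimal sketch, error checking omitted):

#include <linux/if_alg.h>
#include <sys/socket.h>
#include <unistd.h>

int sha1_digest_example(void)
{
	struct sockaddr_alg sa = {
		.salg_family = AF_ALG,
		.salg_type   = "hash",
		.salg_name   = "sha1",
	};
	unsigned char digest[20];
	int tfmfd, opfd;

	tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
	bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa));
	opfd = accept(tfmfd, NULL, 0);

	/* No MSG_MORE: this send makes the kernel run the final hash, */
	send(opfd, "abc", 3, 0);
	/* and the read copies the digest out of ctx->result. */
	read(opfd, digest, sizeof(digest));

	close(opfd);
	close(tfmfd);
	return 0;
}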
diff --git a/crypto/crct10dif_generic.c b/crypto/crct10dif_generic.c
index c1229614..8e94e29 100644
--- a/crypto/crct10dif_generic.c
+++ b/crypto/crct10dif_generic.c
@@ -107,10 +107,7 @@
 
 static int __init crct10dif_mod_init(void)
 {
-	int ret;
-
-	ret = crypto_register_shash(&alg);
-	return ret;
+	return crypto_register_shash(&alg);
 }
 
 static void __exit crct10dif_mod_fini(void)
diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c
index a55c82d..bfb92ac 100644
--- a/crypto/crypto_engine.c
+++ b/crypto/crypto_engine.c
@@ -14,13 +14,12 @@
 
 #include <linux/err.h>
 #include <linux/delay.h>
+#include <crypto/engine.h>
+#include <crypto/internal/hash.h>
 #include "internal.h"
 
 #define CRYPTO_ENGINE_MAX_QLEN 10
 
-void crypto_finalize_request(struct crypto_engine *engine,
-			     struct ablkcipher_request *req, int err);
-
 /**
  * crypto_pump_requests - dequeue one request from engine queue to process
  * @engine: the hardware engine
@@ -34,10 +33,11 @@
 				 bool in_kthread)
 {
 	struct crypto_async_request *async_req, *backlog;
-	struct ablkcipher_request *req;
+	struct ahash_request *hreq;
+	struct ablkcipher_request *breq;
 	unsigned long flags;
 	bool was_busy = false;
-	int ret;
+	int ret, rtype;
 
 	spin_lock_irqsave(&engine->queue_lock, flags);
 
@@ -82,9 +82,7 @@
 	if (!async_req)
 		goto out;
 
-	req = ablkcipher_request_cast(async_req);
-
-	engine->cur_req = req;
+	engine->cur_req = async_req;
 	if (backlog)
 		backlog->complete(backlog, -EINPROGRESS);
 
@@ -95,6 +93,7 @@
 
 	spin_unlock_irqrestore(&engine->queue_lock, flags);
 
+	rtype = crypto_tfm_alg_type(engine->cur_req->tfm);
 	/* Until here we get the request need to be encrypted successfully */
 	if (!was_busy && engine->prepare_crypt_hardware) {
 		ret = engine->prepare_crypt_hardware(engine);
@@ -104,24 +103,55 @@
 		}
 	}
 
-	if (engine->prepare_request) {
-		ret = engine->prepare_request(engine, engine->cur_req);
+	switch (rtype) {
+	case CRYPTO_ALG_TYPE_AHASH:
+		hreq = ahash_request_cast(engine->cur_req);
+		if (engine->prepare_hash_request) {
+			ret = engine->prepare_hash_request(engine, hreq);
+			if (ret) {
+				pr_err("failed to prepare request: %d\n", ret);
+				goto req_err;
+			}
+			engine->cur_req_prepared = true;
+		}
+		ret = engine->hash_one_request(engine, hreq);
 		if (ret) {
-			pr_err("failed to prepare request: %d\n", ret);
+			pr_err("failed to hash one request from queue\n");
 			goto req_err;
 		}
-		engine->cur_req_prepared = true;
+		return;
+	case CRYPTO_ALG_TYPE_ABLKCIPHER:
+		breq = ablkcipher_request_cast(engine->cur_req);
+		if (engine->prepare_cipher_request) {
+			ret = engine->prepare_cipher_request(engine, breq);
+			if (ret) {
+				pr_err("failed to prepare request: %d\n", ret);
+				goto req_err;
+			}
+			engine->cur_req_prepared = true;
+		}
+		ret = engine->cipher_one_request(engine, breq);
+		if (ret) {
+			pr_err("failed to cipher one request from queue\n");
+			goto req_err;
+		}
+		return;
+	default:
+		pr_err("failed to prepare request of unknown type\n");
+		return;
 	}
 
-	ret = engine->crypt_one_request(engine, engine->cur_req);
-	if (ret) {
-		pr_err("failed to crypt one request from queue\n");
-		goto req_err;
-	}
-	return;
-
 req_err:
-	crypto_finalize_request(engine, engine->cur_req, ret);
+	switch (rtype) {
+	case CRYPTO_ALG_TYPE_AHASH:
+		hreq = ahash_request_cast(engine->cur_req);
+		crypto_finalize_hash_request(engine, hreq, ret);
+		break;
+	case CRYPTO_ALG_TYPE_ABLKCIPHER:
+		breq = ablkcipher_request_cast(engine->cur_req);
+		crypto_finalize_cipher_request(engine, breq, ret);
+		break;
+	}
 	return;
 
 out:
@@ -137,12 +167,14 @@
 }
 
 /**
- * crypto_transfer_request - transfer the new request into the engine queue
+ * crypto_transfer_cipher_request - transfer the new request into the
+ * engine queue
  * @engine: the hardware engine
  * @req: the request need to be listed into the engine queue
  */
-int crypto_transfer_request(struct crypto_engine *engine,
-			    struct ablkcipher_request *req, bool need_pump)
+int crypto_transfer_cipher_request(struct crypto_engine *engine,
+				   struct ablkcipher_request *req,
+				   bool need_pump)
 {
 	unsigned long flags;
 	int ret;
@@ -162,46 +194,88 @@
 	spin_unlock_irqrestore(&engine->queue_lock, flags);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(crypto_transfer_request);
+EXPORT_SYMBOL_GPL(crypto_transfer_cipher_request);
 
 /**
- * crypto_transfer_request_to_engine - transfer one request to list into the
- * engine queue
+ * crypto_transfer_cipher_request_to_engine - transfer one request into
+ * the engine queue
  * @engine: the hardware engine
  * @req: the request need to be listed into the engine queue
  */
-int crypto_transfer_request_to_engine(struct crypto_engine *engine,
-				      struct ablkcipher_request *req)
+int crypto_transfer_cipher_request_to_engine(struct crypto_engine *engine,
+					     struct ablkcipher_request *req)
 {
-	return crypto_transfer_request(engine, req, true);
+	return crypto_transfer_cipher_request(engine, req, true);
 }
-EXPORT_SYMBOL_GPL(crypto_transfer_request_to_engine);
+EXPORT_SYMBOL_GPL(crypto_transfer_cipher_request_to_engine);
 
 /**
- * crypto_finalize_request - finalize one request if the request is done
+ * crypto_transfer_hash_request - transfer the new request into the
+ * engine queue
+ * @engine: the hardware engine
+ * @req: the request that needs to be listed into the engine queue
+ */
+int crypto_transfer_hash_request(struct crypto_engine *engine,
+				 struct ahash_request *req, bool need_pump)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&engine->queue_lock, flags);
+
+	if (!engine->running) {
+		spin_unlock_irqrestore(&engine->queue_lock, flags);
+		return -ESHUTDOWN;
+	}
+
+	ret = ahash_enqueue_request(&engine->queue, req);
+
+	if (!engine->busy && need_pump)
+		queue_kthread_work(&engine->kworker, &engine->pump_requests);
+
+	spin_unlock_irqrestore(&engine->queue_lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(crypto_transfer_hash_request);
+
+/**
+ * crypto_transfer_hash_request_to_engine - transfer one request into
+ * the engine queue
+ * @engine: the hardware engine
+ * @req: the request that needs to be listed into the engine queue
+ */
+int crypto_transfer_hash_request_to_engine(struct crypto_engine *engine,
+					   struct ahash_request *req)
+{
+	return crypto_transfer_hash_request(engine, req, true);
+}
+EXPORT_SYMBOL_GPL(crypto_transfer_hash_request_to_engine);
+
+/**
+ * crypto_finalize_cipher_request - finalize one request if the request is done
  * @engine: the hardware engine
  * @req: the request need to be finalized
  * @err: error number
  */
-void crypto_finalize_request(struct crypto_engine *engine,
-			     struct ablkcipher_request *req, int err)
+void crypto_finalize_cipher_request(struct crypto_engine *engine,
+				    struct ablkcipher_request *req, int err)
 {
 	unsigned long flags;
 	bool finalize_cur_req = false;
 	int ret;
 
 	spin_lock_irqsave(&engine->queue_lock, flags);
-	if (engine->cur_req == req)
+	if (engine->cur_req == &req->base)
 		finalize_cur_req = true;
 	spin_unlock_irqrestore(&engine->queue_lock, flags);
 
 	if (finalize_cur_req) {
-		if (engine->cur_req_prepared && engine->unprepare_request) {
-			ret = engine->unprepare_request(engine, req);
+		if (engine->cur_req_prepared &&
+		    engine->unprepare_cipher_request) {
+			ret = engine->unprepare_cipher_request(engine, req);
 			if (ret)
 				pr_err("failed to unprepare request\n");
 		}
-
 		spin_lock_irqsave(&engine->queue_lock, flags);
 		engine->cur_req = NULL;
 		engine->cur_req_prepared = false;
@@ -212,7 +286,44 @@
 
 	queue_kthread_work(&engine->kworker, &engine->pump_requests);
 }
-EXPORT_SYMBOL_GPL(crypto_finalize_request);
+EXPORT_SYMBOL_GPL(crypto_finalize_cipher_request);
+
+/**
+ * crypto_finalize_hash_request - finalize one request if the request is done
+ * @engine: the hardware engine
+ * @req: the request that needs to be finalized
+ * @err: error number
+ */
+void crypto_finalize_hash_request(struct crypto_engine *engine,
+				  struct ahash_request *req, int err)
+{
+	unsigned long flags;
+	bool finalize_cur_req = false;
+	int ret;
+
+	spin_lock_irqsave(&engine->queue_lock, flags);
+	if (engine->cur_req == &req->base)
+		finalize_cur_req = true;
+	spin_unlock_irqrestore(&engine->queue_lock, flags);
+
+	if (finalize_cur_req) {
+		if (engine->cur_req_prepared &&
+		    engine->unprepare_hash_request) {
+			ret = engine->unprepare_hash_request(engine, req);
+			if (ret)
+				pr_err("failed to unprepare request\n");
+		}
+		spin_lock_irqsave(&engine->queue_lock, flags);
+		engine->cur_req = NULL;
+		engine->cur_req_prepared = false;
+		spin_unlock_irqrestore(&engine->queue_lock, flags);
+	}
+
+	req->base.complete(&req->base, err);
+
+	queue_kthread_work(&engine->kworker, &engine->pump_requests);
+}
+EXPORT_SYMBOL_GPL(crypto_finalize_hash_request);
 
 /**
  * crypto_engine_start - start the hardware engine
@@ -249,7 +360,7 @@
 int crypto_engine_stop(struct crypto_engine *engine)
 {
 	unsigned long flags;
-	unsigned limit = 500;
+	unsigned int limit = 500;
 	int ret = 0;
 
 	spin_lock_irqsave(&engine->queue_lock, flags);
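With both request types in place, a driver that wants the engine to drive its hash queue fills in the hash callbacks and funnels requests through the new transfer helper. A sketch under assumptions (the mydrv names are hypothetical, and a real driver usually finalizes from its completion interrupt rather than inline):

#include <crypto/engine.h>
#include <crypto/internal/hash.h>

struct mydrv {
	struct device *dev;
	struct crypto_engine *engine;
};

static int mydrv_prepare_hash_request(struct crypto_engine *engine,
				      struct ahash_request *req)
{
	return 0;	/* e.g. map DMA buffers for the request here */
}

static int mydrv_hash_one_request(struct crypto_engine *engine,
				  struct ahash_request *req)
{
	int err = 0;

	/* ... program the hardware with req here ... */

	/* Hand the request back so the engine can pump the next one. */
	crypto_finalize_hash_request(engine, req, err);
	return 0;
}

static int mydrv_engine_setup(struct mydrv *dd)
{
	dd->engine = crypto_engine_alloc_init(dd->dev, true);
	if (!dd->engine)
		return -ENOMEM;

	dd->engine->prepare_hash_request = mydrv_prepare_hash_request;
	dd->engine->hash_one_request = mydrv_hash_one_request;

	return crypto_engine_start(dd->engine);
}

/* Called from the driver's ahash entry points: */
static int mydrv_enqueue(struct mydrv *dd, struct ahash_request *req)
{
	return crypto_transfer_hash_request_to_engine(dd->engine, req);
}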
diff --git a/crypto/drbg.c b/crypto/drbg.c
index f752da3..fb33f7d 100644
--- a/crypto/drbg.c
+++ b/crypto/drbg.c
@@ -1178,12 +1178,16 @@
 		goto err;
 
 	drbg->Vbuf = kmalloc(drbg_statelen(drbg) + ret, GFP_KERNEL);
-	if (!drbg->Vbuf)
+	if (!drbg->Vbuf) {
+		ret = -ENOMEM;
 		goto fini;
+	}
 	drbg->V = PTR_ALIGN(drbg->Vbuf, ret + 1);
 	drbg->Cbuf = kmalloc(drbg_statelen(drbg) + ret, GFP_KERNEL);
-	if (!drbg->Cbuf)
+	if (!drbg->Cbuf) {
+		ret = -ENOMEM;
 		goto fini;
+	}
 	drbg->C = PTR_ALIGN(drbg->Cbuf, ret + 1);
 	/* scratchpad is only generated for CTR and Hash */
 	if (drbg->core->flags & DRBG_HMAC)
@@ -1199,8 +1203,10 @@
 
 	if (0 < sb_size) {
 		drbg->scratchpadbuf = kzalloc(sb_size + ret, GFP_KERNEL);
-		if (!drbg->scratchpadbuf)
+		if (!drbg->scratchpadbuf) {
+			ret = -ENOMEM;
 			goto fini;
+		}
 		drbg->scratchpad = PTR_ALIGN(drbg->scratchpadbuf, ret + 1);
 	}
 
@@ -1917,6 +1923,8 @@
 		return -ENOMEM;
 
 	mutex_init(&drbg->drbg_mutex);
+	drbg->core = &drbg_cores[coreref];
+	drbg->reseed_threshold = drbg_max_requests(drbg);
 
 	/*
 	 * if the following tests fail, it is likely that there is a buffer
@@ -1926,12 +1934,6 @@
 	 * grave bug.
 	 */
 
-	/* get a valid instance of DRBG for following tests */
-	ret = drbg_instantiate(drbg, NULL, coreref, pr);
-	if (ret) {
-		rc = ret;
-		goto outbuf;
-	}
 	max_addtllen = drbg_max_addtl(drbg);
 	max_request_bytes = drbg_max_request_bytes(drbg);
 	drbg_string_fill(&addtl, buf, max_addtllen + 1);
@@ -1941,10 +1943,9 @@
 	/* overflow max_bits */
 	len = drbg_generate(drbg, buf, (max_request_bytes + 1), NULL);
 	BUG_ON(0 < len);
-	drbg_uninstantiate(drbg);
 
 	/* overflow max addtllen with personalization string */
-	ret = drbg_instantiate(drbg, &addtl, coreref, pr);
+	ret = drbg_seed(drbg, &addtl, false);
 	BUG_ON(0 == ret);
 	/* all tests passed */
 	rc = 0;
@@ -1952,9 +1953,7 @@
 	pr_devel("DRBG: Sanity tests for failure code paths successfully "
 		 "completed\n");
 
-	drbg_uninstantiate(drbg);
-outbuf:
-	kzfree(drbg);
+	kfree(drbg);
 	return rc;
 }
 
@@ -2006,7 +2005,7 @@
 {
 	unsigned int i = 0; /* pointer to drbg_algs */
 	unsigned int j = 0; /* pointer to drbg_cores */
-	int ret = -EFAULT;
+	int ret;
 
 	ret = drbg_healthcheck_sanity();
 	if (ret)
@@ -2016,7 +2015,7 @@
 		pr_info("DRBG: Cannot register all DRBG types"
 			"(slots needed: %zu, slots available: %zu)\n",
 			ARRAY_SIZE(drbg_cores) * 2, ARRAY_SIZE(drbg_algs));
-		return ret;
+		return -EFAULT;
 	}
 
 	/*
diff --git a/crypto/gcm.c b/crypto/gcm.c
index 70a892e8..f624ac9 100644
--- a/crypto/gcm.c
+++ b/crypto/gcm.c
@@ -117,7 +117,7 @@
 	struct crypto_skcipher *ctr = ctx->ctr;
 	struct {
 		be128 hash;
-		u8 iv[8];
+		u8 iv[16];
 
 		struct crypto_gcm_setkey_result result;
 
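The root cause of the iv[8] overflow: the scratch IV feeds ctr(aes), whose IV is the full 16-byte AES counter block, so crypto_skcipher_encrypt() read and wrote 8 bytes past the field. A simplified sketch of the setkey step that touches it (condensed from crypto_gcm_setkey, not the exact upstream sequence):

/* Derive the GHASH key H = E(K, 0^128) by CTR-encrypting a zero block. */
memset(data->iv, 0, 16);	/* ctr(aes) consumes a 16-byte IV */
sg_init_one(data->sg, &data->hash, sizeof(data->hash));
skcipher_request_set_crypt(&data->req, data->sg, data->sg,
			   sizeof(data->hash), data->iv);
err = crypto_skcipher_encrypt(&data->req);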
diff --git a/crypto/ghash-generic.c b/crypto/ghash-generic.c
index bac7099..12ad3e3 100644
--- a/crypto/ghash-generic.c
+++ b/crypto/ghash-generic.c
@@ -14,24 +14,13 @@
 
 #include <crypto/algapi.h>
 #include <crypto/gf128mul.h>
+#include <crypto/ghash.h>
 #include <crypto/internal/hash.h>
 #include <linux/crypto.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 
-#define GHASH_BLOCK_SIZE	16
-#define GHASH_DIGEST_SIZE	16
-
-struct ghash_ctx {
-	struct gf128mul_4k *gf128;
-};
-
-struct ghash_desc_ctx {
-	u8 buffer[GHASH_BLOCK_SIZE];
-	u32 bytes;
-};
-
 static int ghash_init(struct shash_desc *desc)
 {
 	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
diff --git a/crypto/mcryptd.c b/crypto/mcryptd.c
index 86fb59b..94ee44a 100644
--- a/crypto/mcryptd.c
+++ b/crypto/mcryptd.c
@@ -612,12 +612,7 @@
 
 int ahash_mcryptd_digest(struct ahash_request *desc)
 {
-	int err;
-
-	err = crypto_ahash_init(desc) ?:
-	      ahash_mcryptd_finup(desc);
-
-	return err;
+	return crypto_ahash_init(desc) ?: ahash_mcryptd_finup(desc);
 }
 
 int ahash_mcryptd_update(struct ahash_request *desc)
diff --git a/crypto/rsa_helper.c b/crypto/rsa_helper.c
index 4df6451..0b66dc8 100644
--- a/crypto/rsa_helper.c
+++ b/crypto/rsa_helper.c
@@ -35,8 +35,8 @@
 			n_sz--;
 		}
 
-		/* In FIPS mode only allow key size 2K & 3K */
-		if (n_sz != 256 && n_sz != 384) {
+		/* In FIPS mode only allow key size 2K and higher */
+		if (n_sz < 256) {
 			pr_err("RSA: key size not allowed in FIPS mode\n");
 			return -EINVAL;
 		}
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 5c9d5a5..62dffa0 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -209,16 +209,19 @@
 	char *state;
 	struct ahash_request *req;
 	int statesize, ret = -EINVAL;
+	const char guard[] = { 0x00, 0xba, 0xad, 0x00 };
 
 	req = *preq;
 	statesize = crypto_ahash_statesize(
 			crypto_ahash_reqtfm(req));
-	state = kmalloc(statesize, GFP_KERNEL);
+	state = kmalloc(statesize + sizeof(guard), GFP_KERNEL);
 	if (!state) {
 		pr_err("alt: hash: Failed to alloc state for %s\n", algo);
 		goto out_nostate;
 	}
+	memcpy(state + statesize, guard, sizeof(guard));
 	ret = crypto_ahash_export(req, state);
+	WARN_ON(memcmp(state + statesize, guard, sizeof(guard)));
 	if (ret) {
 		pr_err("alt: hash: Failed to export() for %s\n", algo);
 		goto out;
@@ -665,7 +668,7 @@
 		memcpy(key, template[i].key, template[i].klen);
 
 		ret = crypto_aead_setkey(tfm, key, template[i].klen);
-		if (!ret == template[i].fail) {
+		if (template[i].fail == !ret) {
 			pr_err("alg: aead%s: setkey failed on test %d for %s: flags=%x\n",
 			       d, j, algo, crypto_aead_get_flags(tfm));
 			goto out;
@@ -770,7 +773,7 @@
 		memcpy(key, template[i].key, template[i].klen);
 
 		ret = crypto_aead_setkey(tfm, key, template[i].klen);
-		if (!ret == template[i].fail) {
+		if (template[i].fail == !ret) {
 			pr_err("alg: aead%s: setkey failed on chunk test %d for %s: flags=%x\n",
 			       d, j, algo, crypto_aead_get_flags(tfm));
 			goto out;
@@ -1008,6 +1011,9 @@
 		if (template[i].np)
 			continue;
 
+		if (fips_enabled && template[i].fips_skip)
+			continue;
+
 		j++;
 
 		ret = -EINVAL;
@@ -1023,7 +1029,7 @@
 
 		ret = crypto_cipher_setkey(tfm, template[i].key,
 					   template[i].klen);
-		if (!ret == template[i].fail) {
+		if (template[i].fail == !ret) {
 			printk(KERN_ERR "alg: cipher: setkey failed "
 			       "on test %d for %s: flags=%x\n", j,
 			       algo, crypto_cipher_get_flags(tfm));
@@ -1112,6 +1118,9 @@
 		if (template[i].np && !template[i].also_non_np)
 			continue;
 
+		if (fips_enabled && template[i].fips_skip)
+			continue;
+
 		if (template[i].iv)
 			memcpy(iv, template[i].iv, ivsize);
 		else
@@ -1133,7 +1142,7 @@
 
 		ret = crypto_skcipher_setkey(tfm, template[i].key,
 					     template[i].klen);
-		if (!ret == template[i].fail) {
+		if (template[i].fail == !ret) {
 			pr_err("alg: skcipher%s: setkey failed on test %d for %s: flags=%x\n",
 			       d, j, algo, crypto_skcipher_get_flags(tfm));
 			goto out;
@@ -1198,6 +1207,9 @@
 		if (!template[i].np)
 			continue;
 
+		if (fips_enabled && template[i].fips_skip)
+			continue;
+
 		if (template[i].iv)
 			memcpy(iv, template[i].iv, ivsize);
 		else
@@ -1211,7 +1223,7 @@
 
 		ret = crypto_skcipher_setkey(tfm, template[i].key,
 					     template[i].klen);
-		if (!ret == template[i].fail) {
+		if (template[i].fail == !ret) {
 			pr_err("alg: skcipher%s: setkey failed on chunk test %d for %s: flags=%x\n",
 			       d, j, algo, crypto_skcipher_get_flags(tfm));
 			goto out;
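The guard added here is a general technique for catching drivers whose ->statesize understates what export() writes: over-allocate, stamp a known pattern just past the logical end, and assert the pattern survived the call. Distilled (names as in the hunk above):

const char guard[] = { 0x00, 0xba, 0xad, 0x00 };
char *state = kmalloc(statesize + sizeof(guard), GFP_KERNEL);

memcpy(state + statesize, guard, sizeof(guard));	/* stamp past the end */
ret = crypto_ahash_export(req, state);			/* the call under test */
WARN_ON(memcmp(state + statesize, guard, sizeof(guard)));	/* any overrun trips this */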
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index acb6bbf..e64a4ef 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -59,6 +59,7 @@
  * @tap:	How to distribute data in @np SGs
  * @also_non_np: 	if set to 1, the test will be also done without
  * 			splitting data in @np SGs
+ * @fips_skip:	Skip the test vector in FIPS mode
  */
 
 struct cipher_testvec {
@@ -75,6 +76,7 @@
 	unsigned char klen;
 	unsigned short ilen;
 	unsigned short rlen;
+	bool fips_skip;
 };
 
 struct aead_testvec {
@@ -18224,6 +18226,7 @@
 			  "\x00\x00\x00\x00\x00\x00\x00\x00"
 			  "\x00\x00\x00\x00\x00\x00\x00\x00",
 		.klen   = 32,
+		.fips_skip = 1,
 		.iv     = "\x00\x00\x00\x00\x00\x00\x00\x00"
 			  "\x00\x00\x00\x00\x00\x00\x00\x00",
 		.input  = "\x00\x00\x00\x00\x00\x00\x00\x00"
@@ -18566,6 +18569,7 @@
 			  "\x00\x00\x00\x00\x00\x00\x00\x00"
 			  "\x00\x00\x00\x00\x00\x00\x00\x00",
 		.klen   = 32,
+		.fips_skip = 1,
 		.iv     = "\x00\x00\x00\x00\x00\x00\x00\x00"
 			  "\x00\x00\x00\x00\x00\x00\x00\x00",
 		.input = "\x91\x7c\xf6\x9e\xbd\x68\xb2\xec"
diff --git a/crypto/xor.c b/crypto/xor.c
index 35d6b3a..263af9f 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -24,6 +24,10 @@
 #include <linux/preempt.h>
 #include <asm/xor.h>
 
+#ifndef XOR_SELECT_TEMPLATE
+#define XOR_SELECT_TEMPLATE(x) (x)
+#endif
+
 /* The xor routines to use.  */
 static struct xor_block_template *active_template;
 
@@ -109,6 +113,15 @@
 	void *b1, *b2;
 	struct xor_block_template *f, *fastest;
 
+	fastest = XOR_SELECT_TEMPLATE(NULL);
+
+	if (fastest) {
+		printk(KERN_INFO "xor: automatically using best "
+				 "checksumming function   %-10s\n",
+		       fastest->name);
+		goto out;
+	}
+
 	/*
 	 * Note: Since the memory is not actually used for _anything_ but to
 	 * test the XOR speed, we don't really want kmemcheck to warn about
@@ -126,36 +139,22 @@
 	 * all the possible functions, just test the best one
 	 */
 
-	fastest = NULL;
-
-#ifdef XOR_SELECT_TEMPLATE
-		fastest = XOR_SELECT_TEMPLATE(fastest);
-#endif
-
 #define xor_speed(templ)	do_xor_speed((templ), b1, b2)
 
-	if (fastest) {
-		printk(KERN_INFO "xor: automatically using best "
-				 "checksumming function:\n");
-		xor_speed(fastest);
-		goto out;
-	} else {
-		printk(KERN_INFO "xor: measuring software checksum speed\n");
-		XOR_TRY_TEMPLATES;
-		fastest = template_list;
-		for (f = fastest; f; f = f->next)
-			if (f->speed > fastest->speed)
-				fastest = f;
-	}
+	printk(KERN_INFO "xor: measuring software checksum speed\n");
+	XOR_TRY_TEMPLATES;
+	fastest = template_list;
+	for (f = fastest; f; f = f->next)
+		if (f->speed > fastest->speed)
+			fastest = f;
 
 	printk(KERN_INFO "xor: using function: %s (%d.%03d MB/sec)\n",
 	       fastest->name, fastest->speed / 1000, fastest->speed % 1000);
 
 #undef xor_speed
 
- out:
 	free_pages((unsigned long)b1, 2);
-
+out:
 	active_template = fastest;
 	return 0;
 }
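Giving XOR_SELECT_TEMPLATE a no-op default is what lets calibrate_xor_blocks() shed its #ifdef: architectures that know the best template return it from the macro, while everyone else gets back the NULL they passed in and falls through to benchmarking. The idiom in isolation (a generic sketch):

/* An arch header may override this; the default is the identity. */
#ifndef XOR_SELECT_TEMPLATE
#define XOR_SELECT_TEMPLATE(x) (x)
#endif

fastest = XOR_SELECT_TEMPLATE(NULL);
if (fastest) {
	/* the architecture chose for us; skip the speed tests */
} else {
	/* benchmark every template and keep the fastest */
}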
diff --git a/crypto/xts.c b/crypto/xts.c
index 26ba583..305343f 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -5,7 +5,7 @@
  *
  * Copyright (c) 2007 Rik Snel <rsnel@cube.dyndns.org>
  *
- * Based om ecb.c
+ * Based on ecb.c
  * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
  *
  * This program is free software; you can redistribute it and/or modify it
diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig
index 8c0770b..200dab5 100644
--- a/drivers/char/hw_random/Kconfig
+++ b/drivers/char/hw_random/Kconfig
@@ -410,6 +410,19 @@
 
 	  If unsure, say Y.
 
+config HW_RANDOM_CAVIUM
+       tristate "Cavium ThunderX Random Number Generator support"
+       depends on HW_RANDOM && PCI && (ARM64 || (COMPILE_TEST && 64BIT))
+       default HW_RANDOM
+       ---help---
+         This driver provides kernel-side support for the Random Number
+         Generator hardware found on Cavium SoCs.
+
+         To compile this driver as a module, choose M here: the
+         module will be called cavium_rng.
+
+         If unsure, say Y.
+
 endif # HW_RANDOM
 
 config UML_RANDOM
diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile
index 04bb0b0..5f52b1e 100644
--- a/drivers/char/hw_random/Makefile
+++ b/drivers/char/hw_random/Makefile
@@ -35,3 +35,4 @@
 obj-$(CONFIG_HW_RANDOM_STM32) += stm32-rng.o
 obj-$(CONFIG_HW_RANDOM_PIC32) += pic32-rng.o
 obj-$(CONFIG_HW_RANDOM_MESON) += meson-rng.o
+obj-$(CONFIG_HW_RANDOM_CAVIUM) += cavium-rng.o cavium-rng-vf.o
diff --git a/drivers/char/hw_random/amd-rng.c b/drivers/char/hw_random/amd-rng.c
index 48f6a83..4a99ac7 100644
--- a/drivers/char/hw_random/amd-rng.c
+++ b/drivers/char/hw_random/amd-rng.c
@@ -24,16 +24,18 @@
  * warranty of any kind, whether express or implied.
  */
 
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/hw_random.h>
 #include <linux/delay.h>
-#include <asm/io.h>
+#include <linux/hw_random.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
 
+#define DRV_NAME "AMD768-HWRNG"
 
-#define PFX	KBUILD_MODNAME ": "
-
+#define RNGDATA		0x00
+#define RNGDONE		0x04
+#define PMBASE_OFFSET	0xF0
+#define PMBASE_SIZE	8
 
 /*
  * Data for PCI driver interface
@@ -50,72 +52,84 @@
 };
 MODULE_DEVICE_TABLE(pci, pci_tbl);
 
-static struct pci_dev *amd_pdev;
+struct amd768_priv {
+	void __iomem *iobase;
+	struct pci_dev *pcidev;
+};
 
-
-static int amd_rng_data_present(struct hwrng *rng, int wait)
+static int amd_rng_read(struct hwrng *rng, void *buf, size_t max, bool wait)
 {
-	u32 pmbase = (u32)rng->priv;
-	int data, i;
+	u32 *data = buf;
+	struct amd768_priv *priv = (struct amd768_priv *)rng->priv;
+	size_t read = 0;
+	/* We will wait at most once per read */
+	int timeout = max / 4 + 1;
 
-	for (i = 0; i < 20; i++) {
-		data = !!(inl(pmbase + 0xF4) & 1);
-		if (data || !wait)
-			break;
-		udelay(10);
+	/*
+	 * RNG data is available when RNGDONE is set to 1
+	 * New random numbers are generated approximately 128 microseconds
+	 * after RNGDATA is read
+	 */
+	while (read < max) {
+		if (ioread32(priv->iobase + RNGDONE) == 0) {
+			if (wait) {
+				/* Delay given by datasheet */
+				usleep_range(128, 196);
+				if (timeout-- == 0)
+					return read;
+			} else {
+				return 0;
+			}
+		} else {
+			*data = ioread32(priv->iobase + RNGDATA);
+			data++;
+			read += 4;
+		}
 	}
-	return data;
-}
 
-static int amd_rng_data_read(struct hwrng *rng, u32 *data)
-{
-	u32 pmbase = (u32)rng->priv;
-
-	*data = inl(pmbase + 0xF0);
-
-	return 4;
+	return read;
 }
 
 static int amd_rng_init(struct hwrng *rng)
 {
+	struct amd768_priv *priv = (struct amd768_priv *)rng->priv;
 	u8 rnen;
 
-	pci_read_config_byte(amd_pdev, 0x40, &rnen);
-	rnen |= (1 << 7);	/* RNG on */
-	pci_write_config_byte(amd_pdev, 0x40, rnen);
+	pci_read_config_byte(priv->pcidev, 0x40, &rnen);
+	rnen |= BIT(7);	/* RNG on */
+	pci_write_config_byte(priv->pcidev, 0x40, rnen);
 
-	pci_read_config_byte(amd_pdev, 0x41, &rnen);
-	rnen |= (1 << 7);	/* PMIO enable */
-	pci_write_config_byte(amd_pdev, 0x41, rnen);
+	pci_read_config_byte(priv->pcidev, 0x41, &rnen);
+	rnen |= BIT(7);	/* PMIO enable */
+	pci_write_config_byte(priv->pcidev, 0x41, rnen);
 
 	return 0;
 }
 
 static void amd_rng_cleanup(struct hwrng *rng)
 {
+	struct amd768_priv *priv = (struct amd768_priv *)rng->priv;
 	u8 rnen;
 
-	pci_read_config_byte(amd_pdev, 0x40, &rnen);
-	rnen &= ~(1 << 7);	/* RNG off */
-	pci_write_config_byte(amd_pdev, 0x40, rnen);
+	pci_read_config_byte(priv->pcidev, 0x40, &rnen);
+	rnen &= ~BIT(7);	/* RNG off */
+	pci_write_config_byte(priv->pcidev, 0x40, rnen);
 }
 
-
 static struct hwrng amd_rng = {
 	.name		= "amd",
 	.init		= amd_rng_init,
 	.cleanup	= amd_rng_cleanup,
-	.data_present	= amd_rng_data_present,
-	.data_read	= amd_rng_data_read,
+	.read		= amd_rng_read,
 };
 
-
 static int __init mod_init(void)
 {
 	int err = -ENODEV;
 	struct pci_dev *pdev = NULL;
 	const struct pci_device_id *ent;
 	u32 pmbase;
+	struct amd768_priv *priv;
 
 	for_each_pci_dev(pdev) {
 		ent = pci_match_id(pci_tbl, pdev);
@@ -123,42 +137,44 @@
 			goto found;
 	}
 	/* Device not found. */
-	goto out;
+	return -ENODEV;
 
 found:
 	err = pci_read_config_dword(pdev, 0x58, &pmbase);
 	if (err)
-		goto out;
-	err = -EIO;
+		return err;
+
 	pmbase &= 0x0000FF00;
 	if (pmbase == 0)
-		goto out;
-	if (!request_region(pmbase + 0xF0, 8, "AMD HWRNG")) {
-		dev_err(&pdev->dev, "AMD HWRNG region 0x%x already in use!\n",
-			pmbase + 0xF0);
-		err = -EBUSY;
-		goto out;
-	}
-	amd_rng.priv = (unsigned long)pmbase;
-	amd_pdev = pdev;
+		return -EIO;
 
-	pr_info("AMD768 RNG detected\n");
-	err = hwrng_register(&amd_rng);
-	if (err) {
-		pr_err(PFX "RNG registering failed (%d)\n",
-		       err);
-		release_region(pmbase + 0xF0, 8);
-		goto out;
+	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	if (!devm_request_region(&pdev->dev, pmbase + PMBASE_OFFSET,
+				PMBASE_SIZE, DRV_NAME)) {
+		dev_err(&pdev->dev, DRV_NAME " region 0x%x already in use!\n",
+			pmbase + 0xF0);
+		return -EBUSY;
 	}
-out:
-	return err;
+
+	priv->iobase = devm_ioport_map(&pdev->dev, pmbase + PMBASE_OFFSET,
+			PMBASE_SIZE);
+	if (!priv->iobase) {
+		pr_err(DRV_NAME ": Cannot map ioport\n");
+		return -ENOMEM;
+	}
+
+	amd_rng.priv = (unsigned long)priv;
+	priv->pcidev = pdev;
+
+	pr_info(DRV_NAME " detected\n");
+	return devm_hwrng_register(&pdev->dev, &amd_rng);
 }
 
 static void __exit mod_exit(void)
 {
-	u32 pmbase = (unsigned long)amd_rng.priv;
-	release_region(pmbase + 0xF0, 8);
-	hwrng_unregister(&amd_rng);
 }
 
 module_init(mod_init);
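The amd-rng rewrite (and the omap3-rom-rng change further down) moves from the legacy data_present/data_read pair to the single .read hook, which returns how many bytes it produced and lets the core do the buffering. Its general shape, as a schematic sketch not tied to one device (the my_hw_* helpers are hypothetical):

#include <linux/delay.h>
#include <linux/hw_random.h>

static bool my_hw_data_ready(void);	/* hypothetical status poll */
static u32 my_hw_read_word(void);	/* hypothetical FIFO read */

static int my_rng_read(struct hwrng *rng, void *buf, size_t max, bool wait)
{
	size_t read = 0;
	int tries = 20;		/* bound the wait, as amd_rng_read() does */

	while (read + 4 <= max) {
		if (!my_hw_data_ready()) {
			if (!wait || !tries--)
				break;
			usleep_range(100, 200);	/* let the hardware refill */
			continue;
		}
		*(u32 *)(buf + read) = my_hw_read_word();
		read += 4;
	}
	return read;	/* bytes produced; may legitimately be zero */
}

static struct hwrng my_rng = {
	.name = "my-rng",
	.read = my_rng_read,
};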
diff --git a/drivers/char/hw_random/bcm2835-rng.c b/drivers/char/hw_random/bcm2835-rng.c
index af21492..574211a 100644
--- a/drivers/char/hw_random/bcm2835-rng.c
+++ b/drivers/char/hw_random/bcm2835-rng.c
@@ -92,9 +92,10 @@
 	bcm2835_rng_ops.priv = (unsigned long)rng_base;
 
 	rng_id = of_match_node(bcm2835_rng_of_match, np);
-	if (!rng_id)
+	if (!rng_id) {
+		iounmap(rng_base);
 		return -EINVAL;
-
+	}
 	/* Check for rng init function, execute it */
 	rng_setup = rng_id->data;
 	if (rng_setup)
diff --git a/drivers/char/hw_random/cavium-rng-vf.c b/drivers/char/hw_random/cavium-rng-vf.c
new file mode 100644
index 0000000..066ae0e
--- /dev/null
+++ b/drivers/char/hw_random/cavium-rng-vf.c
@@ -0,0 +1,99 @@
+/*
+ * Hardware Random Number Generator support for Cavium, Inc.
+ * Thunder processor family.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2016 Cavium, Inc.
+ */
+
+#include <linux/hw_random.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+
+struct cavium_rng {
+	struct hwrng ops;
+	void __iomem *result;
+};
+
+/* Read data from the RNG unit */
+static int cavium_rng_read(struct hwrng *rng, void *dat, size_t max, bool wait)
+{
+	struct cavium_rng *p = container_of(rng, struct cavium_rng, ops);
+	unsigned int size = max;
+
+	while (size >= 8) {
+		*((u64 *)dat) = readq(p->result);
+		size -= 8;
+		dat += 8;
+	}
+	while (size > 0) {
+		*((u8 *)dat) = readb(p->result);
+		size--;
+		dat++;
+	}
+	return max;
+}
+
+/* Map Cavium RNG to an HWRNG object */
+static int cavium_rng_probe_vf(struct pci_dev *pdev,
+			       const struct pci_device_id *id)
+{
+	struct cavium_rng *rng;
+	int ret;
+
+	rng = devm_kzalloc(&pdev->dev, sizeof(*rng), GFP_KERNEL);
+	if (!rng)
+		return -ENOMEM;
+
+	/* Map the RNG result */
+	rng->result = pcim_iomap(pdev, 0, 0);
+	if (!rng->result) {
+		dev_err(&pdev->dev, "Error iomap failed retrieving result.\n");
+		return -ENOMEM;
+	}
+
+	rng->ops.name    = "cavium rng";
+	rng->ops.read    = cavium_rng_read;
+	rng->ops.quality = 1000;
+
+	pci_set_drvdata(pdev, rng);
+
+	ret = hwrng_register(&rng->ops);
+	if (ret) {
+		dev_err(&pdev->dev, "Error registering device as HWRNG.\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+/* Remove the VF */
+void cavium_rng_remove_vf(struct pci_dev *pdev)
+{
+	struct cavium_rng *rng;
+
+	rng = pci_get_drvdata(pdev);
+	hwrng_unregister(&rng->ops);
+}
+
+static const struct pci_device_id cavium_rng_vf_id_table[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, 0xa033), 0, 0, 0},
+	{0,},
+};
+MODULE_DEVICE_TABLE(pci, cavium_rng_vf_id_table);
+
+static struct pci_driver cavium_rng_vf_driver = {
+	.name		= "cavium_rng_vf",
+	.id_table	= cavium_rng_vf_id_table,
+	.probe		= cavium_rng_probe_vf,
+	.remove		= cavium_rng_remove_vf,
+};
+module_pci_driver(cavium_rng_vf_driver);
+
+MODULE_AUTHOR("Omer Khaliq <okhaliq@caviumnetworks.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/char/hw_random/cavium-rng.c b/drivers/char/hw_random/cavium-rng.c
new file mode 100644
index 0000000..a944e0a
--- /dev/null
+++ b/drivers/char/hw_random/cavium-rng.c
@@ -0,0 +1,94 @@
+/*
+ * Hardware Random Number Generator support for Cavium Inc.
+ * Thunder processor family.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2016 Cavium, Inc.
+ */
+
+#include <linux/hw_random.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+
+#define THUNDERX_RNM_ENT_EN     0x1
+#define THUNDERX_RNM_RNG_EN     0x2
+
+struct cavium_rng_pf {
+	void __iomem *control_status;
+};
+
+/* Enable the RNG hardware and activate the VF */
+static int cavium_rng_probe(struct pci_dev *pdev,
+			const struct pci_device_id *id)
+{
+	struct cavium_rng_pf *rng;
+	int iov_err;
+
+	rng = devm_kzalloc(&pdev->dev, sizeof(*rng), GFP_KERNEL);
+	if (!rng)
+		return -ENOMEM;
+
+	/* Map the RNG control */
+	rng->control_status = pcim_iomap(pdev, 0, 0);
+	if (!rng->control_status) {
+		dev_err(&pdev->dev,
+			"Error iomap failed retrieving control_status.\n");
+		return -ENOMEM;
+	}
+
+	/* Enable the RNG hardware and entropy source */
+	writeq(THUNDERX_RNM_RNG_EN | THUNDERX_RNM_ENT_EN,
+		rng->control_status);
+
+	pci_set_drvdata(pdev, rng);
+
+	/* Enable the Cavium RNG as a VF */
+	iov_err = pci_enable_sriov(pdev, 1);
+	if (iov_err != 0) {
+		/* Disable the RNG hardware and entropy source */
+		writeq(0, rng->control_status);
+		dev_err(&pdev->dev,
+			"Error initializing RNG virtual function,(%i).\n",
+			iov_err);
+		return iov_err;
+	}
+
+	return 0;
+}
+
+/* Disable VF and RNG Hardware */
+void cavium_rng_remove(struct pci_dev *pdev)
+{
+	struct cavium_rng_pf *rng;
+
+	rng = pci_get_drvdata(pdev);
+
+	/* Remove the VF */
+	pci_disable_sriov(pdev);
+
+	/* Disable the RNG hardware and entropy source */
+	writeq(0, rng->control_status);
+}
+
+static const struct pci_device_id cavium_rng_pf_id_table[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, 0xa018), 0, 0, 0}, /* Thunder RNM */
+	{0,},
+};
+
+MODULE_DEVICE_TABLE(pci, cavium_rng_pf_id_table);
+
+static struct pci_driver cavium_rng_pf_driver = {
+	.name		= "cavium_rng_pf",
+	.id_table	= cavium_rng_pf_id_table,
+	.probe		= cavium_rng_probe,
+	.remove		= cavium_rng_remove,
+};
+
+module_pci_driver(cavium_rng_pf_driver);
+MODULE_AUTHOR("Omer Khaliq <okhaliq@caviumnetworks.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 9203f2d..4827945 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -449,22 +449,6 @@
 		goto out;
 
 	mutex_lock(&rng_mutex);
-
-	/* kmalloc makes this safe for virt_to_page() in virtio_rng.c */
-	err = -ENOMEM;
-	if (!rng_buffer) {
-		rng_buffer = kmalloc(rng_buffer_size(), GFP_KERNEL);
-		if (!rng_buffer)
-			goto out_unlock;
-	}
-	if (!rng_fillbuf) {
-		rng_fillbuf = kmalloc(rng_buffer_size(), GFP_KERNEL);
-		if (!rng_fillbuf) {
-			kfree(rng_buffer);
-			goto out_unlock;
-		}
-	}
-
 	/* Must not register two RNGs with the same name. */
 	err = -EEXIST;
 	list_for_each_entry(tmp, &rng_list, list) {
@@ -573,7 +557,26 @@
 
 static int __init hwrng_modinit(void)
 {
-	return register_miscdev();
+	int ret = -ENOMEM;
+
+	/* kmalloc makes this safe for virt_to_page() in virtio_rng.c */
+	rng_buffer = kmalloc(rng_buffer_size(), GFP_KERNEL);
+	if (!rng_buffer)
+		return -ENOMEM;
+
+	rng_fillbuf = kmalloc(rng_buffer_size(), GFP_KERNEL);
+	if (!rng_fillbuf) {
+		kfree(rng_buffer);
+		return -ENOMEM;
+	}
+
+	ret = register_miscdev();
+	if (ret) {
+		kfree(rng_fillbuf);
+		kfree(rng_buffer);
+	}
+
+	return ret;
 }
 
 static void __exit hwrng_modexit(void)
diff --git a/drivers/char/hw_random/geode-rng.c b/drivers/char/hw_random/geode-rng.c
index 0d0579f..e7a2459 100644
--- a/drivers/char/hw_random/geode-rng.c
+++ b/drivers/char/hw_random/geode-rng.c
@@ -24,15 +24,12 @@
  * warranty of any kind, whether express or implied.
  */
 
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/hw_random.h>
 #include <linux/delay.h>
-#include <asm/io.h>
-
-
-#define PFX	KBUILD_MODNAME ": "
+#include <linux/hw_random.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
 
 #define GEODE_RNG_DATA_REG   0x50
 #define GEODE_RNG_STATUS_REG 0x54
@@ -85,7 +82,6 @@
 
 static int __init mod_init(void)
 {
-	int err = -ENODEV;
 	struct pci_dev *pdev = NULL;
 	const struct pci_device_id *ent;
 	void __iomem *mem;
@@ -93,43 +89,27 @@
 
 	for_each_pci_dev(pdev) {
 		ent = pci_match_id(pci_tbl, pdev);
-		if (ent)
-			goto found;
+		if (ent) {
+			rng_base = pci_resource_start(pdev, 0);
+			if (rng_base == 0)
+				return -ENODEV;
+
+			mem = devm_ioremap(&pdev->dev, rng_base, 0x58);
+			if (!mem)
+				return -ENOMEM;
+			geode_rng.priv = (unsigned long)mem;
+
+			pr_info("AMD Geode RNG detected\n");
+			return devm_hwrng_register(&pdev->dev, &geode_rng);
+		}
 	}
+
 	/* Device not found. */
-	goto out;
-
-found:
-	rng_base = pci_resource_start(pdev, 0);
-	if (rng_base == 0)
-		goto out;
-	err = -ENOMEM;
-	mem = ioremap(rng_base, 0x58);
-	if (!mem)
-		goto out;
-	geode_rng.priv = (unsigned long)mem;
-
-	pr_info("AMD Geode RNG detected\n");
-	err = hwrng_register(&geode_rng);
-	if (err) {
-		pr_err(PFX "RNG registering failed (%d)\n",
-		       err);
-		goto err_unmap;
-	}
-out:
-	return err;
-
-err_unmap:
-	iounmap(mem);
-	goto out;
+	return -ENODEV;
 }
 
 static void __exit mod_exit(void)
 {
-	void __iomem *mem = (void __iomem *)geode_rng.priv;
-
-	hwrng_unregister(&geode_rng);
-	iounmap(mem);
 }
 
 module_init(mod_init);
diff --git a/drivers/char/hw_random/meson-rng.c b/drivers/char/hw_random/meson-rng.c
index 0cfd81b..58bef39 100644
--- a/drivers/char/hw_random/meson-rng.c
+++ b/drivers/char/hw_random/meson-rng.c
@@ -76,9 +76,6 @@
 	struct meson_rng_data *data =
 			container_of(rng, struct meson_rng_data, rng);
 
-	if (max < sizeof(u32))
-		return 0;
-
 	*(u32 *)buf = readl_relaxed(data->base + RNG_DATA);
 
 	return sizeof(u32);
diff --git a/drivers/char/hw_random/omap-rng.c b/drivers/char/hw_random/omap-rng.c
index 01d4be2..f5c26a5 100644
--- a/drivers/char/hw_random/omap-rng.c
+++ b/drivers/char/hw_random/omap-rng.c
@@ -385,7 +385,7 @@
 
 	pm_runtime_enable(&pdev->dev);
 	ret = pm_runtime_get_sync(&pdev->dev);
-	if (ret) {
+	if (ret < 0) {
 		dev_err(&pdev->dev, "Failed to runtime_get device: %d\n", ret);
 		pm_runtime_put_noidle(&pdev->dev);
 		goto err_ioremap;
@@ -443,7 +443,7 @@
 	int ret;
 
 	ret = pm_runtime_get_sync(dev);
-	if (ret) {
+	if (ret < 0) {
 		dev_err(dev, "Failed to runtime_get device: %d\n", ret);
 		pm_runtime_put_noidle(dev);
 		return ret;
diff --git a/drivers/char/hw_random/omap3-rom-rng.c b/drivers/char/hw_random/omap3-rom-rng.c
index 8da14f1..37a58d7 100644
--- a/drivers/char/hw_random/omap3-rom-rng.c
+++ b/drivers/char/hw_random/omap3-rom-rng.c
@@ -71,12 +71,7 @@
 	return 0;
 }
 
-static int omap3_rom_rng_data_present(struct hwrng *rng, int wait)
-{
-	return 1;
-}
-
-static int omap3_rom_rng_data_read(struct hwrng *rng, u32 *data)
+static int omap3_rom_rng_read(struct hwrng *rng, void *data, size_t max, bool w)
 {
 	int r;
 
@@ -88,8 +83,7 @@
 
 static struct hwrng omap3_rom_rng_ops = {
 	.name		= "omap3-rom",
-	.data_present	= omap3_rom_rng_data_present,
-	.data_read	= omap3_rom_rng_data_read,
+	.read		= omap3_rom_rng_read,
 };
 
 static int omap3_rom_rng_probe(struct platform_device *pdev)
diff --git a/drivers/char/hw_random/pasemi-rng.c b/drivers/char/hw_random/pasemi-rng.c
index c19e23d..545df48 100644
--- a/drivers/char/hw_random/pasemi-rng.c
+++ b/drivers/char/hw_random/pasemi-rng.c
@@ -95,42 +95,20 @@
 	.data_read	= pasemi_rng_data_read,
 };
 
-static int rng_probe(struct platform_device *ofdev)
+static int rng_probe(struct platform_device *pdev)
 {
 	void __iomem *rng_regs;
-	struct device_node *rng_np = ofdev->dev.of_node;
-	struct resource res;
-	int err = 0;
+	struct resource *res;
 
-	err = of_address_to_resource(rng_np, 0, &res);
-	if (err)
-		return -ENODEV;
-
-	rng_regs = ioremap(res.start, 0x100);
-
-	if (!rng_regs)
-		return -ENOMEM;
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	rng_regs = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(rng_regs))
+		return PTR_ERR(rng_regs);
 
 	pasemi_rng.priv = (unsigned long)rng_regs;
 
 	pr_info("Registering PA Semi RNG\n");
-
-	err = hwrng_register(&pasemi_rng);
-
-	if (err)
-		iounmap(rng_regs);
-
-	return err;
-}
-
-static int rng_remove(struct platform_device *dev)
-{
-	void __iomem *rng_regs = (void __iomem *)pasemi_rng.priv;
-
-	hwrng_unregister(&pasemi_rng);
-	iounmap(rng_regs);
-
-	return 0;
+	return devm_hwrng_register(&pdev->dev, &pasemi_rng);
 }
 
 static const struct of_device_id rng_match[] = {
@@ -146,7 +124,6 @@
 		.of_match_table = rng_match,
 	},
 	.probe		= rng_probe,
-	.remove		= rng_remove,
 };
 
 module_platform_driver(rng_driver);
diff --git a/drivers/char/hw_random/pic32-rng.c b/drivers/char/hw_random/pic32-rng.c
index 108897b..11dc9b7c 100644
--- a/drivers/char/hw_random/pic32-rng.c
+++ b/drivers/char/hw_random/pic32-rng.c
@@ -143,7 +143,6 @@
 	.remove		= pic32_rng_remove,
 	.driver		= {
 		.name	= "pic32-rng",
-		.owner	= THIS_MODULE,
 		.of_match_table = of_match_ptr(pic32_rng_of_match),
 	},
 };
diff --git a/drivers/char/hw_random/st-rng.c b/drivers/char/hw_random/st-rng.c
index 1d35363..938ec10 100644
--- a/drivers/char/hw_random/st-rng.c
+++ b/drivers/char/hw_random/st-rng.c
@@ -54,9 +54,6 @@
 	u32 status;
 	int i;
 
-	if (max < sizeof(u16))
-		return -EINVAL;
-
 	/* Wait until FIFO is full - max 4uS*/
 	for (i = 0; i < ST_RNG_FILL_FIFO_TIMEOUT; i++) {
 		status = readl_relaxed(ddata->base + ST_RNG_STATUS_REG);
@@ -111,6 +108,7 @@
 	ret = hwrng_register(&ddata->ops);
 	if (ret) {
 		dev_err(&pdev->dev, "Failed to register HW RNG\n");
+		clk_disable_unprepare(clk);
 		return ret;
 	}
 
diff --git a/drivers/char/hw_random/tx4939-rng.c b/drivers/char/hw_random/tx4939-rng.c
index a7b6949..1093583 100644
--- a/drivers/char/hw_random/tx4939-rng.c
+++ b/drivers/char/hw_random/tx4939-rng.c
@@ -144,22 +144,13 @@
 	}
 
 	platform_set_drvdata(dev, rngdev);
-	return hwrng_register(&rngdev->rng);
-}
-
-static int __exit tx4939_rng_remove(struct platform_device *dev)
-{
-	struct tx4939_rng *rngdev = platform_get_drvdata(dev);
-
-	hwrng_unregister(&rngdev->rng);
-	return 0;
+	return devm_hwrng_register(&dev->dev, &rngdev->rng);
 }
 
 static struct platform_driver tx4939_rng_driver = {
 	.driver		= {
 		.name	= "tx4939-rng",
 	},
-	.remove = tx4939_rng_remove,
 };
 
 module_platform_driver_probe(tx4939_rng_driver, tx4939_rng_probe);
diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 9b035b7..4d2b81f 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -318,6 +318,9 @@
 	select CRYPTO_AES
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_ENGINE
+	select CRYPTO_CBC
+	select CRYPTO_ECB
+	select CRYPTO_CTR
 	help
 	  OMAP processors have AES module accelerator. Select this if you
 	  want to use the OMAP module for AES algorithms.
diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index b304421..156aad1 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -111,6 +111,42 @@
 #else
 #define debug(format, arg...)
 #endif
+
+#ifdef DEBUG
+#include <linux/highmem.h>
+
+static void dbg_dump_sg(const char *level, const char *prefix_str,
+			int prefix_type, int rowsize, int groupsize,
+			struct scatterlist *sg, size_t tlen, bool ascii,
+			bool may_sleep)
+{
+	struct scatterlist *it;
+	void *it_page;
+	size_t len;
+	void *buf;
+
+	for (it = sg; it != NULL && tlen > 0; it = sg_next(it)) {
+		/*
+		 * make sure the scatterlist's page
+		 * has a valid virtual memory mapping
+		 */
+		it_page = kmap_atomic(sg_page(it));
+		if (unlikely(!it_page)) {
+			printk(KERN_ERR "dbg_dump_sg: kmap failed\n");
+			return;
+		}
+
+		buf = it_page + it->offset;
+		len = min(tlen, it->length);
+		print_hex_dump(level, prefix_str, prefix_type, rowsize,
+			       groupsize, buf, len, ascii);
+		tlen -= len;
+
+		kunmap_atomic(it_page);
+	}
+}
+#endif
+
 static struct list_head alg_list;
 
 struct caam_alg_entry {
@@ -227,8 +263,9 @@
 	if (is_rfc3686) {
 		nonce = (u32 *)((void *)ctx->key + ctx->split_key_pad_len +
 			       enckeylen);
-		append_load_imm_u32(desc, *nonce, LDST_CLASS_IND_CCB |
-				    LDST_SRCDST_BYTE_OUTFIFO | LDST_IMM);
+		append_load_as_imm(desc, nonce, CTR_RFC3686_NONCE_SIZE,
+				   LDST_CLASS_IND_CCB |
+				   LDST_SRCDST_BYTE_OUTFIFO | LDST_IMM);
 		append_move(desc,
 			    MOVE_SRC_OUTFIFO |
 			    MOVE_DEST_CLASS1CTX |
@@ -500,11 +537,10 @@
 
 	/* Load Counter into CONTEXT1 reg */
 	if (is_rfc3686)
-		append_load_imm_u32(desc, be32_to_cpu(1), LDST_IMM |
-				    LDST_CLASS_1_CCB |
-				    LDST_SRCDST_BYTE_CONTEXT |
-				    ((ctx1_iv_off + CTR_RFC3686_IV_SIZE) <<
-				     LDST_OFFSET_SHIFT));
+		append_load_imm_be32(desc, 1, LDST_IMM | LDST_CLASS_1_CCB |
+				     LDST_SRCDST_BYTE_CONTEXT |
+				     ((ctx1_iv_off + CTR_RFC3686_IV_SIZE) <<
+				      LDST_OFFSET_SHIFT));
 
 	/* Class 1 operation */
 	append_operation(desc, ctx->class1_alg_type |
@@ -578,11 +614,10 @@
 
 	/* Load Counter into CONTEXT1 reg */
 	if (is_rfc3686)
-		append_load_imm_u32(desc, be32_to_cpu(1), LDST_IMM |
-				    LDST_CLASS_1_CCB |
-				    LDST_SRCDST_BYTE_CONTEXT |
-				    ((ctx1_iv_off + CTR_RFC3686_IV_SIZE) <<
-				     LDST_OFFSET_SHIFT));
+		append_load_imm_be32(desc, 1, LDST_IMM | LDST_CLASS_1_CCB |
+				     LDST_SRCDST_BYTE_CONTEXT |
+				     ((ctx1_iv_off + CTR_RFC3686_IV_SIZE) <<
+				      LDST_OFFSET_SHIFT));
 
 	/* Choose operation */
 	if (ctr_mode)
@@ -683,11 +718,10 @@
 
 	/* Load Counter into CONTEXT1 reg */
 	if (is_rfc3686)
-		append_load_imm_u32(desc, be32_to_cpu(1), LDST_IMM |
-				    LDST_CLASS_1_CCB |
-				    LDST_SRCDST_BYTE_CONTEXT |
-				    ((ctx1_iv_off + CTR_RFC3686_IV_SIZE) <<
-				     LDST_OFFSET_SHIFT));
+		append_load_imm_be32(desc, 1, LDST_IMM | LDST_CLASS_1_CCB |
+				     LDST_SRCDST_BYTE_CONTEXT |
+				     ((ctx1_iv_off + CTR_RFC3686_IV_SIZE) <<
+				      LDST_OFFSET_SHIFT));
 
 	/* Class 1 operation */
 	append_operation(desc, ctx->class1_alg_type |
@@ -1478,7 +1512,7 @@
 	int ret = 0;
 	u32 *key_jump_cmd;
 	u32 *desc;
-	u32 *nonce;
+	u8 *nonce;
 	u32 geniv;
 	u32 ctx1_iv_off = 0;
 	const bool ctr_mode = ((ctx->class1_alg_type & OP_ALG_AAI_MASK) ==
@@ -1531,9 +1565,10 @@
 
 	/* Load nonce into CONTEXT1 reg */
 	if (is_rfc3686) {
-		nonce = (u32 *)(key + keylen);
-		append_load_imm_u32(desc, *nonce, LDST_CLASS_IND_CCB |
-				    LDST_SRCDST_BYTE_OUTFIFO | LDST_IMM);
+		nonce = (u8 *)key + keylen;
+		append_load_as_imm(desc, nonce, CTR_RFC3686_NONCE_SIZE,
+				   LDST_CLASS_IND_CCB |
+				   LDST_SRCDST_BYTE_OUTFIFO | LDST_IMM);
 		append_move(desc, MOVE_WAITCOMP |
 			    MOVE_SRC_OUTFIFO |
 			    MOVE_DEST_CLASS1CTX |
@@ -1549,11 +1584,10 @@
 
 	/* Load counter into CONTEXT1 reg */
 	if (is_rfc3686)
-		append_load_imm_u32(desc, be32_to_cpu(1), LDST_IMM |
-				    LDST_CLASS_1_CCB |
-				    LDST_SRCDST_BYTE_CONTEXT |
-				    ((ctx1_iv_off + CTR_RFC3686_IV_SIZE) <<
-				     LDST_OFFSET_SHIFT));
+		append_load_imm_be32(desc, 1, LDST_IMM | LDST_CLASS_1_CCB |
+				     LDST_SRCDST_BYTE_CONTEXT |
+				     ((ctx1_iv_off + CTR_RFC3686_IV_SIZE) <<
+				      LDST_OFFSET_SHIFT));
 
 	/* Load operation */
 	append_operation(desc, ctx->class1_alg_type |
@@ -1590,9 +1624,10 @@
 
 	/* Load nonce into CONTEXT1 reg */
 	if (is_rfc3686) {
-		nonce = (u32 *)(key + keylen);
-		append_load_imm_u32(desc, *nonce, LDST_CLASS_IND_CCB |
-				    LDST_SRCDST_BYTE_OUTFIFO | LDST_IMM);
+		nonce = (u8 *)key + keylen;
+		append_load_as_imm(desc, nonce, CTR_RFC3686_NONCE_SIZE,
+				   LDST_CLASS_IND_CCB |
+				   LDST_SRCDST_BYTE_OUTFIFO | LDST_IMM);
 		append_move(desc, MOVE_WAITCOMP |
 			    MOVE_SRC_OUTFIFO |
 			    MOVE_DEST_CLASS1CTX |
@@ -1608,11 +1643,10 @@
 
 	/* Load counter into CONTEXT1 reg */
 	if (is_rfc3686)
-		append_load_imm_u32(desc, be32_to_cpu(1), LDST_IMM |
-				    LDST_CLASS_1_CCB |
-				    LDST_SRCDST_BYTE_CONTEXT |
-				    ((ctx1_iv_off + CTR_RFC3686_IV_SIZE) <<
-				     LDST_OFFSET_SHIFT));
+		append_load_imm_be32(desc, 1, LDST_IMM | LDST_CLASS_1_CCB |
+				     LDST_SRCDST_BYTE_CONTEXT |
+				     ((ctx1_iv_off + CTR_RFC3686_IV_SIZE) <<
+				      LDST_OFFSET_SHIFT));
 
 	/* Choose operation */
 	if (ctr_mode)
@@ -1653,9 +1687,10 @@
 
 	/* Load Nonce into CONTEXT1 reg */
 	if (is_rfc3686) {
-		nonce = (u32 *)(key + keylen);
-		append_load_imm_u32(desc, *nonce, LDST_CLASS_IND_CCB |
-				    LDST_SRCDST_BYTE_OUTFIFO | LDST_IMM);
+		nonce = (u8 *)key + keylen;
+		append_load_as_imm(desc, nonce, CTR_RFC3686_NONCE_SIZE,
+				   LDST_CLASS_IND_CCB |
+				   LDST_SRCDST_BYTE_OUTFIFO | LDST_IMM);
 		append_move(desc, MOVE_WAITCOMP |
 			    MOVE_SRC_OUTFIFO |
 			    MOVE_DEST_CLASS1CTX |
@@ -1685,11 +1720,10 @@
 
 	/* Load Counter into CONTEXT1 reg */
 	if (is_rfc3686)
-		append_load_imm_u32(desc, (u32)1, LDST_IMM |
-				    LDST_CLASS_1_CCB |
-				    LDST_SRCDST_BYTE_CONTEXT |
-				    ((ctx1_iv_off + CTR_RFC3686_IV_SIZE) <<
-				     LDST_OFFSET_SHIFT));
+		append_load_imm_be32(desc, 1, LDST_IMM | LDST_CLASS_1_CCB |
+				     LDST_SRCDST_BYTE_CONTEXT |
+				     ((ctx1_iv_off + CTR_RFC3686_IV_SIZE) <<
+				      LDST_OFFSET_SHIFT));
 
 	if (ctx1_iv_off)
 		append_jump(desc, JUMP_JSL | JUMP_TEST_ALL | JUMP_COND_NCP |
@@ -1995,9 +2029,9 @@
 	print_hex_dump(KERN_ERR, "dstiv  @"__stringify(__LINE__)": ",
 		       DUMP_PREFIX_ADDRESS, 16, 4, req->info,
 		       edesc->src_nents > 1 ? 100 : ivsize, 1);
-	print_hex_dump(KERN_ERR, "dst    @"__stringify(__LINE__)": ",
-		       DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(req->src),
-		       edesc->dst_nents > 1 ? 100 : req->nbytes, 1);
+	dbg_dump_sg(KERN_ERR, "dst    @"__stringify(__LINE__)": ",
+		    DUMP_PREFIX_ADDRESS, 16, 4, req->dst,
+		    edesc->dst_nents > 1 ? 100 : req->nbytes, 1, true);
 #endif
 
 	ablkcipher_unmap(jrdev, edesc, req);
@@ -2027,9 +2061,9 @@
 	print_hex_dump(KERN_ERR, "dstiv  @"__stringify(__LINE__)": ",
 		       DUMP_PREFIX_ADDRESS, 16, 4, req->info,
 		       ivsize, 1);
-	print_hex_dump(KERN_ERR, "dst    @"__stringify(__LINE__)": ",
-		       DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(req->src),
-		       edesc->dst_nents > 1 ? 100 : req->nbytes, 1);
+	dbg_dump_sg(KERN_ERR, "dst    @"__stringify(__LINE__)": ",
+		    DUMP_PREFIX_ADDRESS, 16, 4, req->dst,
+		    edesc->dst_nents > 1 ? 100 : req->nbytes, 1, true);
 #endif
 
 	ablkcipher_unmap(jrdev, edesc, req);
@@ -2184,12 +2218,15 @@
 	int len, sec4_sg_index = 0;
 
 #ifdef DEBUG
+	bool may_sleep = ((req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
+					      CRYPTO_TFM_REQ_MAY_SLEEP)) != 0);
 	print_hex_dump(KERN_ERR, "presciv@"__stringify(__LINE__)": ",
 		       DUMP_PREFIX_ADDRESS, 16, 4, req->info,
 		       ivsize, 1);
-	print_hex_dump(KERN_ERR, "src    @"__stringify(__LINE__)": ",
-		       DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(req->src),
-		       edesc->src_nents ? 100 : req->nbytes, 1);
+	printk(KERN_ERR "asked=%d, nbytes=%d\n",
+	       (int)edesc->src_nents ? 100 : req->nbytes, req->nbytes);
+	dbg_dump_sg(KERN_ERR, "src    @"__stringify(__LINE__)": ",
+		    DUMP_PREFIX_ADDRESS, 16, 4, req->src,
+		    edesc->src_nents ? 100 : req->nbytes, 1, may_sleep);
 #endif
 
 	len = desc_len(sh_desc);
@@ -2241,12 +2278,14 @@
 	int len, sec4_sg_index = 0;
 
 #ifdef DEBUG
+	bool may_sleep = ((req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
+					      CRYPTO_TFM_REQ_MAY_SLEEP)) != 0);
 	print_hex_dump(KERN_ERR, "presciv@" __stringify(__LINE__) ": ",
 		       DUMP_PREFIX_ADDRESS, 16, 4, req->info,
 		       ivsize, 1);
-	print_hex_dump(KERN_ERR, "src    @" __stringify(__LINE__) ": ",
-		       DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(req->src),
-		       edesc->src_nents ? 100 : req->nbytes, 1);
+	dbg_dump_sg(KERN_ERR, "src    @" __stringify(__LINE__) ": ",
+		    DUMP_PREFIX_ADDRESS, 16, 4, req->src,
+		    edesc->src_nents ? 100 : req->nbytes, 1, may_sleep);
 #endif
 
 	len = desc_len(sh_desc);
@@ -2516,18 +2555,20 @@
 	u32 *desc;
 	int ret = 0;
 
+#ifdef DEBUG
+	bool may_sleep = ((req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
+					      CRYPTO_TFM_REQ_MAY_SLEEP)) != 0);
+	dbg_dump_sg(KERN_ERR, "dec src@"__stringify(__LINE__)": ",
+		    DUMP_PREFIX_ADDRESS, 16, 4, req->src,
+		    req->assoclen + req->cryptlen, 1, may_sleep);
+#endif
+
 	/* allocate extended descriptor */
 	edesc = aead_edesc_alloc(req, AUTHENC_DESC_JOB_IO_LEN,
 				 &all_contig, false);
 	if (IS_ERR(edesc))
 		return PTR_ERR(edesc);
 
-#ifdef DEBUG
-	print_hex_dump(KERN_ERR, "dec src@"__stringify(__LINE__)": ",
-		       DUMP_PREFIX_ADDRESS, 16, 4, sg_virt(req->src),
-		       req->assoclen + req->cryptlen, 1);
-#endif
-
 	/* Create and submit job descriptor*/
 	init_authenc_job(req, edesc, all_contig, false);
 #ifdef DEBUG
diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c
index 36365b3..660dc20 100644
--- a/drivers/crypto/caam/caamhash.c
+++ b/drivers/crypto/caam/caamhash.c
@@ -99,17 +99,17 @@
 
 /* ahash per-session context */
 struct caam_hash_ctx {
-	struct device *jrdev;
-	u32 sh_desc_update[DESC_HASH_MAX_USED_LEN];
-	u32 sh_desc_update_first[DESC_HASH_MAX_USED_LEN];
-	u32 sh_desc_fin[DESC_HASH_MAX_USED_LEN];
-	u32 sh_desc_digest[DESC_HASH_MAX_USED_LEN];
-	u32 sh_desc_finup[DESC_HASH_MAX_USED_LEN];
-	dma_addr_t sh_desc_update_dma;
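+	/*
+	 * Assumed rationale for the ____cacheline_aligned annotations
+	 * below: each shared descriptor is DMA'd to the device on its
+	 * own, so giving every buffer its own cacheline(s) keeps cache
+	 * maintenance on one of them from touching its neighbours.
+	 */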
+	u32 sh_desc_update[DESC_HASH_MAX_USED_LEN] ____cacheline_aligned;
+	u32 sh_desc_update_first[DESC_HASH_MAX_USED_LEN] ____cacheline_aligned;
+	u32 sh_desc_fin[DESC_HASH_MAX_USED_LEN] ____cacheline_aligned;
+	u32 sh_desc_digest[DESC_HASH_MAX_USED_LEN] ____cacheline_aligned;
+	u32 sh_desc_finup[DESC_HASH_MAX_USED_LEN] ____cacheline_aligned;
+	dma_addr_t sh_desc_update_dma ____cacheline_aligned;
 	dma_addr_t sh_desc_update_first_dma;
 	dma_addr_t sh_desc_fin_dma;
 	dma_addr_t sh_desc_digest_dma;
 	dma_addr_t sh_desc_finup_dma;
+	struct device *jrdev;
 	u32 alg_type;
 	u32 alg_op;
 	u8 key[CAAM_MAX_HASH_KEY_SIZE];
@@ -187,15 +187,6 @@
 	return buf_dma;
 }
 
-/* Map req->src and put it in link table */
-static inline void src_map_to_sec4_sg(struct device *jrdev,
-				      struct scatterlist *src, int src_nents,
-				      struct sec4_sg_entry *sec4_sg)
-{
-	dma_map_sg(jrdev, src, src_nents, DMA_TO_DEVICE);
-	sg_to_sec4_sg_last(src, src_nents, sec4_sg, 0);
-}
-
 /*
  * Only put buffer in link table if it contains data, which is possible,
  * since a buffer has previously been used, and needs to be unmapped,
@@ -449,7 +440,7 @@
 	u32 *desc;
 	struct split_key_result result;
 	dma_addr_t src_dma, dst_dma;
-	int ret = 0;
+	int ret;
 
 	desc = kmalloc(CAAM_CMD_SZ * 8 + CAAM_PTR_SZ * 2, GFP_KERNEL | GFP_DMA);
 	if (!desc) {
@@ -526,7 +517,7 @@
 	struct device *jrdev = ctx->jrdev;
 	int blocksize = crypto_tfm_alg_blocksize(&ahash->base);
 	int digestsize = crypto_ahash_digestsize(ahash);
-	int ret = 0;
+	int ret;
 	u8 *hashed_key = NULL;
 
 #ifdef DEBUG
@@ -534,14 +525,15 @@
 #endif
 
 	if (keylen > blocksize) {
-		hashed_key = kmalloc(sizeof(u8) * digestsize, GFP_KERNEL |
-				     GFP_DMA);
+		hashed_key = kmalloc_array(digestsize,
+					   sizeof(*hashed_key),
+					   GFP_KERNEL | GFP_DMA);
 		if (!hashed_key)
 			return -ENOMEM;
 		ret = hash_digest_key(ctx, key, &keylen, hashed_key,
 				      digestsize);
 		if (ret)
-			goto badkey;
+			goto bad_free_key;
 		key = hashed_key;
 	}
 
@@ -559,14 +551,14 @@
 
 	ret = gen_split_hash_key(ctx, key, keylen);
 	if (ret)
-		goto badkey;
+		goto bad_free_key;
 
 	ctx->key_dma = dma_map_single(jrdev, ctx->key, ctx->split_key_pad_len,
 				      DMA_TO_DEVICE);
 	if (dma_mapping_error(jrdev, ctx->key_dma)) {
 		dev_err(jrdev, "unable to map key i/o memory\n");
 		ret = -ENOMEM;
-		goto map_err;
+		goto error_free_key;
 	}
 #ifdef DEBUG
 	print_hex_dump(KERN_ERR, "ctx.key@"__stringify(__LINE__)": ",
@@ -579,11 +571,10 @@
 		dma_unmap_single(jrdev, ctx->key_dma, ctx->split_key_pad_len,
 				 DMA_TO_DEVICE);
 	}
-
-map_err:
+ error_free_key:
 	kfree(hashed_key);
 	return ret;
-badkey:
+ bad_free_key:
 	kfree(hashed_key);
 	crypto_ahash_set_flags(ahash, CRYPTO_TFM_RES_BAD_KEY_LEN);
 	return -EINVAL;
@@ -595,16 +586,16 @@
  * @sec4_sg_dma: physical mapped address of h/w link table
  * @src_nents: number of segments in input scatterlist
  * @sec4_sg_bytes: length of dma mapped sec4_sg space
- * @sec4_sg: pointer to h/w link table
  * @hw_desc: the h/w job descriptor followed by any referenced link tables
+ * @sec4_sg: h/w link table
  */
 struct ahash_edesc {
 	dma_addr_t dst_dma;
 	dma_addr_t sec4_sg_dma;
 	int src_nents;
 	int sec4_sg_bytes;
-	struct sec4_sg_entry *sec4_sg;
-	u32 hw_desc[0];
+	u32 hw_desc[DESC_JOB_IO_LEN / sizeof(u32)] ____cacheline_aligned;
+	struct sec4_sg_entry sec4_sg[0];
 };
 
 static inline void ahash_unmap(struct device *dev,
@@ -774,6 +765,65 @@
 	req->base.complete(&req->base, err);
 }
 
+/*
+ * Allocate an extended descriptor, which contains the hardware descriptor
+ * and space for a hardware scatter table containing sg_num entries.
+ */
+static struct ahash_edesc *ahash_edesc_alloc(struct caam_hash_ctx *ctx,
+					     int sg_num, u32 *sh_desc,
+					     dma_addr_t sh_desc_dma,
+					     gfp_t flags)
+{
+	struct ahash_edesc *edesc;
+	unsigned int sg_size = sg_num * sizeof(struct sec4_sg_entry);
+
+	edesc = kzalloc(sizeof(*edesc) + sg_size, GFP_DMA | flags);
+	if (!edesc) {
+		dev_err(ctx->jrdev, "could not allocate extended descriptor\n");
+		return NULL;
+	}
+
+	init_job_desc_shared(edesc->hw_desc, sh_desc_dma, desc_len(sh_desc),
+			     HDR_SHARE_DEFER | HDR_REVERSE);
+
+	return edesc;
+}
+
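+/*
+ * Attach the request source to the job descriptor: either directly, when
+ * there is a single mapped segment and no leading entries, or through a
+ * DMA-mapped SEC4 scatter/gather table with room for first_sg entries
+ * (previous context/buffer) ahead of the source segments.
+ */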
+static int ahash_edesc_add_src(struct caam_hash_ctx *ctx,
+			       struct ahash_edesc *edesc,
+			       struct ahash_request *req, int nents,
+			       unsigned int first_sg,
+			       unsigned int first_bytes, size_t to_hash)
+{
+	dma_addr_t src_dma;
+	u32 options;
+
+	if (nents > 1 || first_sg) {
+		struct sec4_sg_entry *sg = edesc->sec4_sg;
+		unsigned int sgsize = sizeof(*sg) * (first_sg + nents);
+
+		sg_to_sec4_sg_last(req->src, nents, sg + first_sg, 0);
+
+		src_dma = dma_map_single(ctx->jrdev, sg, sgsize, DMA_TO_DEVICE);
+		if (dma_mapping_error(ctx->jrdev, src_dma)) {
+			dev_err(ctx->jrdev, "unable to map S/G table\n");
+			return -ENOMEM;
+		}
+
+		edesc->sec4_sg_bytes = sgsize;
+		edesc->sec4_sg_dma = src_dma;
+		options = LDST_SGF;
+	} else {
+		src_dma = sg_dma_address(req->src);
+		options = 0;
+	}
+
+	append_seq_in_ptr(edesc->hw_desc, src_dma, first_bytes + to_hash,
+			  options);
+
+	return 0;
+}
+
 /* submit update job descriptor */
 static int ahash_update_ctx(struct ahash_request *req)
 {
@@ -789,12 +839,10 @@
 	int *next_buflen = state->current_buf ? &state->buflen_0 :
 			   &state->buflen_1, last_buflen;
 	int in_len = *buflen + req->nbytes, to_hash;
-	u32 *sh_desc = ctx->sh_desc_update, *desc;
-	dma_addr_t ptr = ctx->sh_desc_update_dma;
-	int src_nents, sec4_sg_bytes, sec4_sg_src_index;
+	u32 *desc;
+	int src_nents, mapped_nents, sec4_sg_bytes, sec4_sg_src_index;
 	struct ahash_edesc *edesc;
 	int ret = 0;
-	int sh_len;
 
 	last_buflen = *next_buflen;
 	*next_buflen = in_len & (crypto_tfm_alg_blocksize(&ahash->base) - 1);
@@ -807,40 +855,51 @@
 			dev_err(jrdev, "Invalid number of src SG.\n");
 			return src_nents;
 		}
+
+		if (src_nents) {
+			mapped_nents = dma_map_sg(jrdev, req->src, src_nents,
+						  DMA_TO_DEVICE);
+			if (!mapped_nents) {
+				dev_err(jrdev, "unable to DMA map source\n");
+				return -ENOMEM;
+			}
+		} else {
+			mapped_nents = 0;
+		}
+
 		sec4_sg_src_index = 1 + (*buflen ? 1 : 0);
-		sec4_sg_bytes = (sec4_sg_src_index + src_nents) *
+		sec4_sg_bytes = (sec4_sg_src_index + mapped_nents) *
 				 sizeof(struct sec4_sg_entry);
 
 		/*
 		 * allocate space for base edesc and hw desc commands,
 		 * link tables
 		 */
-		edesc = kzalloc(sizeof(*edesc) + DESC_JOB_IO_LEN +
-				sec4_sg_bytes, GFP_DMA | flags);
+		edesc = ahash_edesc_alloc(ctx, sec4_sg_src_index + mapped_nents,
+					  ctx->sh_desc_update,
+					  ctx->sh_desc_update_dma, flags);
 		if (!edesc) {
-			dev_err(jrdev,
-				"could not allocate extended descriptor\n");
+			dma_unmap_sg(jrdev, req->src, src_nents, DMA_TO_DEVICE);
 			return -ENOMEM;
 		}
 
 		edesc->src_nents = src_nents;
 		edesc->sec4_sg_bytes = sec4_sg_bytes;
-		edesc->sec4_sg = (void *)edesc + sizeof(struct ahash_edesc) +
-				 DESC_JOB_IO_LEN;
 
 		ret = ctx_map_to_sec4_sg(desc, jrdev, state, ctx->ctx_len,
 					 edesc->sec4_sg, DMA_BIDIRECTIONAL);
 		if (ret)
-			return ret;
+			goto unmap_ctx;
 
 		state->buf_dma = try_buf_map_to_sec4_sg(jrdev,
 							edesc->sec4_sg + 1,
 							buf, state->buf_dma,
 							*buflen, last_buflen);
 
-		if (src_nents) {
-			src_map_to_sec4_sg(jrdev, req->src, src_nents,
-					   edesc->sec4_sg + sec4_sg_src_index);
+		if (mapped_nents) {
+			sg_to_sec4_sg_last(req->src, mapped_nents,
+					   edesc->sec4_sg + sec4_sg_src_index,
+					   0);
 			if (*next_buflen)
 				scatterwalk_map_and_copy(next_buf, req->src,
 							 to_hash - *buflen,
@@ -852,17 +911,15 @@
 
 		state->current_buf = !state->current_buf;
 
-		sh_len = desc_len(sh_desc);
 		desc = edesc->hw_desc;
-		init_job_desc_shared(desc, ptr, sh_len, HDR_SHARE_DEFER |
-				     HDR_REVERSE);
 
 		edesc->sec4_sg_dma = dma_map_single(jrdev, edesc->sec4_sg,
 						     sec4_sg_bytes,
 						     DMA_TO_DEVICE);
 		if (dma_mapping_error(jrdev, edesc->sec4_sg_dma)) {
 			dev_err(jrdev, "unable to map S/G table\n");
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto unmap_ctx;
 		}
 
 		append_seq_in_ptr(desc, edesc->sec4_sg_dma, ctx->ctx_len +
@@ -877,13 +934,10 @@
 #endif
 
 		ret = caam_jr_enqueue(jrdev, desc, ahash_done_bi, req);
-		if (!ret) {
-			ret = -EINPROGRESS;
-		} else {
-			ahash_unmap_ctx(jrdev, edesc, req, ctx->ctx_len,
-					   DMA_BIDIRECTIONAL);
-			kfree(edesc);
-		}
+		if (ret)
+			goto unmap_ctx;
+
+		ret = -EINPROGRESS;
 	} else if (*next_buflen) {
 		scatterwalk_map_and_copy(buf + *buflen, req->src, 0,
 					 req->nbytes, 0);
@@ -899,6 +953,10 @@
 #endif
 
 	return ret;
+ unmap_ctx:
+	ahash_unmap_ctx(jrdev, edesc, req, ctx->ctx_len, DMA_BIDIRECTIONAL);
+	kfree(edesc);
+	return ret;
 }
 
 static int ahash_final_ctx(struct ahash_request *req)
@@ -913,38 +971,31 @@
 	int buflen = state->current_buf ? state->buflen_1 : state->buflen_0;
 	int last_buflen = state->current_buf ? state->buflen_0 :
 			  state->buflen_1;
-	u32 *sh_desc = ctx->sh_desc_fin, *desc;
-	dma_addr_t ptr = ctx->sh_desc_fin_dma;
+	u32 *desc;
 	int sec4_sg_bytes, sec4_sg_src_index;
 	int digestsize = crypto_ahash_digestsize(ahash);
 	struct ahash_edesc *edesc;
-	int ret = 0;
-	int sh_len;
+	int ret;
 
 	sec4_sg_src_index = 1 + (buflen ? 1 : 0);
 	sec4_sg_bytes = sec4_sg_src_index * sizeof(struct sec4_sg_entry);
 
 	/* allocate space for base edesc and hw desc commands, link tables */
-	edesc = kzalloc(sizeof(*edesc) + DESC_JOB_IO_LEN + sec4_sg_bytes,
-			GFP_DMA | flags);
-	if (!edesc) {
-		dev_err(jrdev, "could not allocate extended descriptor\n");
+	edesc = ahash_edesc_alloc(ctx, sec4_sg_src_index,
+				  ctx->sh_desc_fin, ctx->sh_desc_fin_dma,
+				  flags);
+	if (!edesc)
 		return -ENOMEM;
-	}
 
-	sh_len = desc_len(sh_desc);
 	desc = edesc->hw_desc;
-	init_job_desc_shared(desc, ptr, sh_len, HDR_SHARE_DEFER | HDR_REVERSE);
 
 	edesc->sec4_sg_bytes = sec4_sg_bytes;
-	edesc->sec4_sg = (void *)edesc + sizeof(struct ahash_edesc) +
-			 DESC_JOB_IO_LEN;
 	edesc->src_nents = 0;
 
 	ret = ctx_map_to_sec4_sg(desc, jrdev, state, ctx->ctx_len,
 				 edesc->sec4_sg, DMA_TO_DEVICE);
 	if (ret)
-		return ret;
+		goto unmap_ctx;
 
 	state->buf_dma = try_buf_map_to_sec4_sg(jrdev, edesc->sec4_sg + 1,
 						buf, state->buf_dma, buflen,
@@ -956,7 +1007,8 @@
 					    sec4_sg_bytes, DMA_TO_DEVICE);
 	if (dma_mapping_error(jrdev, edesc->sec4_sg_dma)) {
 		dev_err(jrdev, "unable to map S/G table\n");
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto unmap_ctx;
 	}
 
 	append_seq_in_ptr(desc, edesc->sec4_sg_dma, ctx->ctx_len + buflen,
@@ -966,7 +1018,8 @@
 						digestsize);
 	if (dma_mapping_error(jrdev, edesc->dst_dma)) {
 		dev_err(jrdev, "unable to map dst\n");
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto unmap_ctx;
 	}
 
 #ifdef DEBUG
@@ -975,13 +1028,13 @@
 #endif
 
 	ret = caam_jr_enqueue(jrdev, desc, ahash_done_ctx_src, req);
-	if (!ret) {
-		ret = -EINPROGRESS;
-	} else {
-		ahash_unmap_ctx(jrdev, edesc, req, digestsize, DMA_FROM_DEVICE);
-		kfree(edesc);
-	}
+	if (ret)
+		goto unmap_ctx;
 
+	return -EINPROGRESS;
+ unmap_ctx:
+	ahash_unmap_ctx(jrdev, edesc, req, digestsize, DMA_FROM_DEVICE);
+	kfree(edesc);
 	return ret;
 }
 
@@ -997,68 +1050,66 @@
 	int buflen = state->current_buf ? state->buflen_1 : state->buflen_0;
 	int last_buflen = state->current_buf ? state->buflen_0 :
 			  state->buflen_1;
-	u32 *sh_desc = ctx->sh_desc_finup, *desc;
-	dma_addr_t ptr = ctx->sh_desc_finup_dma;
-	int sec4_sg_bytes, sec4_sg_src_index;
-	int src_nents;
+	u32 *desc;
+	int sec4_sg_src_index;
+	int src_nents, mapped_nents;
 	int digestsize = crypto_ahash_digestsize(ahash);
 	struct ahash_edesc *edesc;
-	int ret = 0;
-	int sh_len;
+	int ret;
 
 	src_nents = sg_nents_for_len(req->src, req->nbytes);
 	if (src_nents < 0) {
 		dev_err(jrdev, "Invalid number of src SG.\n");
 		return src_nents;
 	}
+
+	if (src_nents) {
+		mapped_nents = dma_map_sg(jrdev, req->src, src_nents,
+					  DMA_TO_DEVICE);
+		if (!mapped_nents) {
+			dev_err(jrdev, "unable to DMA map source\n");
+			return -ENOMEM;
+		}
+	} else {
+		mapped_nents = 0;
+	}
+
 	sec4_sg_src_index = 1 + (buflen ? 1 : 0);
-	sec4_sg_bytes = (sec4_sg_src_index + src_nents) *
-			 sizeof(struct sec4_sg_entry);
 
 	/* allocate space for base edesc and hw desc commands, link tables */
-	edesc = kzalloc(sizeof(*edesc) + DESC_JOB_IO_LEN + sec4_sg_bytes,
-			GFP_DMA | flags);
+	edesc = ahash_edesc_alloc(ctx, sec4_sg_src_index + mapped_nents,
+				  ctx->sh_desc_finup, ctx->sh_desc_finup_dma,
+				  flags);
 	if (!edesc) {
-		dev_err(jrdev, "could not allocate extended descriptor\n");
+		dma_unmap_sg(jrdev, req->src, src_nents, DMA_TO_DEVICE);
 		return -ENOMEM;
 	}
 
-	sh_len = desc_len(sh_desc);
 	desc = edesc->hw_desc;
-	init_job_desc_shared(desc, ptr, sh_len, HDR_SHARE_DEFER | HDR_REVERSE);
 
 	edesc->src_nents = src_nents;
-	edesc->sec4_sg_bytes = sec4_sg_bytes;
-	edesc->sec4_sg = (void *)edesc + sizeof(struct ahash_edesc) +
-			 DESC_JOB_IO_LEN;
 
 	ret = ctx_map_to_sec4_sg(desc, jrdev, state, ctx->ctx_len,
 				 edesc->sec4_sg, DMA_TO_DEVICE);
 	if (ret)
-		return ret;
+		goto unmap_ctx;
 
 	state->buf_dma = try_buf_map_to_sec4_sg(jrdev, edesc->sec4_sg + 1,
 						buf, state->buf_dma, buflen,
 						last_buflen);
 
-	src_map_to_sec4_sg(jrdev, req->src, src_nents, edesc->sec4_sg +
-			   sec4_sg_src_index);
-
-	edesc->sec4_sg_dma = dma_map_single(jrdev, edesc->sec4_sg,
-					    sec4_sg_bytes, DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, edesc->sec4_sg_dma)) {
-		dev_err(jrdev, "unable to map S/G table\n");
-		return -ENOMEM;
-	}
-
-	append_seq_in_ptr(desc, edesc->sec4_sg_dma, ctx->ctx_len +
-			       buflen + req->nbytes, LDST_SGF);
+	ret = ahash_edesc_add_src(ctx, edesc, req, mapped_nents,
+				  sec4_sg_src_index, ctx->ctx_len + buflen,
+				  req->nbytes);
+	if (ret)
+		goto unmap_ctx;
 
 	edesc->dst_dma = map_seq_out_ptr_result(desc, jrdev, req->result,
 						digestsize);
 	if (dma_mapping_error(jrdev, edesc->dst_dma)) {
 		dev_err(jrdev, "unable to map dst\n");
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto unmap_ctx;
 	}
 
 #ifdef DEBUG
@@ -1067,13 +1118,13 @@
 #endif
 
 	ret = caam_jr_enqueue(jrdev, desc, ahash_done_ctx_src, req);
-	if (!ret) {
-		ret = -EINPROGRESS;
-	} else {
-		ahash_unmap_ctx(jrdev, edesc, req, digestsize, DMA_FROM_DEVICE);
-		kfree(edesc);
-	}
+	if (ret)
+		goto unmap_ctx;
 
+	return -EINPROGRESS;
+ unmap_ctx:
+	ahash_unmap_ctx(jrdev, edesc, req, digestsize, DMA_FROM_DEVICE);
+	kfree(edesc);
 	return ret;
 }
 
@@ -1084,60 +1135,56 @@
 	struct device *jrdev = ctx->jrdev;
 	gfp_t flags = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
 		       CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
-	u32 *sh_desc = ctx->sh_desc_digest, *desc;
-	dma_addr_t ptr = ctx->sh_desc_digest_dma;
+	u32 *desc;
 	int digestsize = crypto_ahash_digestsize(ahash);
-	int src_nents, sec4_sg_bytes;
-	dma_addr_t src_dma;
+	int src_nents, mapped_nents;
 	struct ahash_edesc *edesc;
-	int ret = 0;
-	u32 options;
-	int sh_len;
+	int ret;
 
-	src_nents = sg_count(req->src, req->nbytes);
+	src_nents = sg_nents_for_len(req->src, req->nbytes);
 	if (src_nents < 0) {
 		dev_err(jrdev, "Invalid number of src SG.\n");
 		return src_nents;
 	}
-	dma_map_sg(jrdev, req->src, src_nents ? : 1, DMA_TO_DEVICE);
-	sec4_sg_bytes = src_nents * sizeof(struct sec4_sg_entry);
-
-	/* allocate space for base edesc and hw desc commands, link tables */
-	edesc = kzalloc(sizeof(*edesc) + sec4_sg_bytes + DESC_JOB_IO_LEN,
-			GFP_DMA | flags);
-	if (!edesc) {
-		dev_err(jrdev, "could not allocate extended descriptor\n");
-		return -ENOMEM;
-	}
-	edesc->sec4_sg = (void *)edesc + sizeof(struct ahash_edesc) +
-			  DESC_JOB_IO_LEN;
-	edesc->sec4_sg_bytes = sec4_sg_bytes;
-	edesc->src_nents = src_nents;
-
-	sh_len = desc_len(sh_desc);
-	desc = edesc->hw_desc;
-	init_job_desc_shared(desc, ptr, sh_len, HDR_SHARE_DEFER | HDR_REVERSE);
 
 	if (src_nents) {
-		sg_to_sec4_sg_last(req->src, src_nents, edesc->sec4_sg, 0);
-		edesc->sec4_sg_dma = dma_map_single(jrdev, edesc->sec4_sg,
-					    sec4_sg_bytes, DMA_TO_DEVICE);
-		if (dma_mapping_error(jrdev, edesc->sec4_sg_dma)) {
-			dev_err(jrdev, "unable to map S/G table\n");
+		mapped_nents = dma_map_sg(jrdev, req->src, src_nents,
+					  DMA_TO_DEVICE);
+		if (!mapped_nents) {
+			dev_err(jrdev, "unable to map source for DMA\n");
 			return -ENOMEM;
 		}
-		src_dma = edesc->sec4_sg_dma;
-		options = LDST_SGF;
 	} else {
-		src_dma = sg_dma_address(req->src);
-		options = 0;
+		mapped_nents = 0;
 	}
-	append_seq_in_ptr(desc, src_dma, req->nbytes, options);
+
+	/* allocate space for base edesc and hw desc commands, link tables */
+	edesc = ahash_edesc_alloc(ctx, mapped_nents > 1 ? mapped_nents : 0,
+				  ctx->sh_desc_digest, ctx->sh_desc_digest_dma,
+				  flags);
+	if (!edesc) {
+		dma_unmap_sg(jrdev, req->src, src_nents, DMA_TO_DEVICE);
+		return -ENOMEM;
+	}
+
+	edesc->src_nents = src_nents;
+
+	ret = ahash_edesc_add_src(ctx, edesc, req, mapped_nents, 0, 0,
+				  req->nbytes);
+	if (ret) {
+		ahash_unmap(jrdev, edesc, req, digestsize);
+		kfree(edesc);
+		return ret;
+	}
+
+	desc = edesc->hw_desc;
 
 	edesc->dst_dma = map_seq_out_ptr_result(desc, jrdev, req->result,
 						digestsize);
 	if (dma_mapping_error(jrdev, edesc->dst_dma)) {
 		dev_err(jrdev, "unable to map dst\n");
+		ahash_unmap(jrdev, edesc, req, digestsize);
+		kfree(edesc);
 		return -ENOMEM;
 	}
 
@@ -1168,29 +1215,23 @@
 		       CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
 	u8 *buf = state->current_buf ? state->buf_1 : state->buf_0;
 	int buflen = state->current_buf ? state->buflen_1 : state->buflen_0;
-	u32 *sh_desc = ctx->sh_desc_digest, *desc;
-	dma_addr_t ptr = ctx->sh_desc_digest_dma;
+	u32 *desc;
 	int digestsize = crypto_ahash_digestsize(ahash);
 	struct ahash_edesc *edesc;
-	int ret = 0;
-	int sh_len;
+	int ret;
 
 	/* allocate space for base edesc and hw desc commands, link tables */
-	edesc = kzalloc(sizeof(*edesc) + DESC_JOB_IO_LEN, GFP_DMA | flags);
-	if (!edesc) {
-		dev_err(jrdev, "could not allocate extended descriptor\n");
+	edesc = ahash_edesc_alloc(ctx, 0, ctx->sh_desc_digest,
+				  ctx->sh_desc_digest_dma, flags);
+	if (!edesc)
 		return -ENOMEM;
-	}
 
-	edesc->sec4_sg_bytes = 0;
-	sh_len = desc_len(sh_desc);
 	desc = edesc->hw_desc;
-	init_job_desc_shared(desc, ptr, sh_len, HDR_SHARE_DEFER | HDR_REVERSE);
 
 	state->buf_dma = dma_map_single(jrdev, buf, buflen, DMA_TO_DEVICE);
 	if (dma_mapping_error(jrdev, state->buf_dma)) {
 		dev_err(jrdev, "unable to map src\n");
-		return -ENOMEM;
+		goto unmap;
 	}
 
 	append_seq_in_ptr(desc, state->buf_dma, buflen, 0);
@@ -1199,7 +1240,7 @@
 						digestsize);
 	if (dma_mapping_error(jrdev, edesc->dst_dma)) {
 		dev_err(jrdev, "unable to map dst\n");
-		return -ENOMEM;
+		goto unmap;
 	}
 	edesc->src_nents = 0;
 
@@ -1217,6 +1258,11 @@
 	}
 
 	return ret;
+ unmap:
+	ahash_unmap(jrdev, edesc, req, digestsize);
+	kfree(edesc);
+	return -ENOMEM;
 }
 
 /* submit ahash update if it is the first job descriptor after update */
@@ -1234,48 +1280,58 @@
 	int *next_buflen = state->current_buf ? &state->buflen_0 :
 			   &state->buflen_1;
 	int in_len = *buflen + req->nbytes, to_hash;
-	int sec4_sg_bytes, src_nents;
+	int sec4_sg_bytes, src_nents, mapped_nents;
 	struct ahash_edesc *edesc;
-	u32 *desc, *sh_desc = ctx->sh_desc_update_first;
-	dma_addr_t ptr = ctx->sh_desc_update_first_dma;
+	u32 *desc;
 	int ret = 0;
-	int sh_len;
 
 	*next_buflen = in_len & (crypto_tfm_alg_blocksize(&ahash->base) - 1);
 	to_hash = in_len - *next_buflen;
 
 	if (to_hash) {
 		src_nents = sg_nents_for_len(req->src,
-					     req->nbytes - (*next_buflen));
+					     req->nbytes - *next_buflen);
 		if (src_nents < 0) {
 			dev_err(jrdev, "Invalid number of src SG.\n");
 			return src_nents;
 		}
-		sec4_sg_bytes = (1 + src_nents) *
+
+		if (src_nents) {
+			mapped_nents = dma_map_sg(jrdev, req->src, src_nents,
+						  DMA_TO_DEVICE);
+			if (!mapped_nents) {
+				dev_err(jrdev, "unable to DMA map source\n");
+				return -ENOMEM;
+			}
+		} else {
+			mapped_nents = 0;
+		}
+
+		sec4_sg_bytes = (1 + mapped_nents) *
 				sizeof(struct sec4_sg_entry);
 
 		/*
 		 * allocate space for base edesc and hw desc commands,
 		 * link tables
 		 */
-		edesc = kzalloc(sizeof(*edesc) + DESC_JOB_IO_LEN +
-				sec4_sg_bytes, GFP_DMA | flags);
+		edesc = ahash_edesc_alloc(ctx, 1 + mapped_nents,
+					  ctx->sh_desc_update_first,
+					  ctx->sh_desc_update_first_dma,
+					  flags);
 		if (!edesc) {
-			dev_err(jrdev,
-				"could not allocate extended descriptor\n");
+			dma_unmap_sg(jrdev, req->src, src_nents, DMA_TO_DEVICE);
 			return -ENOMEM;
 		}
 
 		edesc->src_nents = src_nents;
 		edesc->sec4_sg_bytes = sec4_sg_bytes;
-		edesc->sec4_sg = (void *)edesc + sizeof(struct ahash_edesc) +
-				 DESC_JOB_IO_LEN;
 		edesc->dst_dma = 0;
 
 		state->buf_dma = buf_map_to_sec4_sg(jrdev, edesc->sec4_sg,
 						    buf, *buflen);
-		src_map_to_sec4_sg(jrdev, req->src, src_nents,
-				   edesc->sec4_sg + 1);
+		sg_to_sec4_sg_last(req->src, mapped_nents,
+				   edesc->sec4_sg + 1, 0);
+
 		if (*next_buflen) {
 			scatterwalk_map_and_copy(next_buf, req->src,
 						 to_hash - *buflen,
@@ -1284,24 +1340,22 @@
 
 		state->current_buf = !state->current_buf;
 
-		sh_len = desc_len(sh_desc);
 		desc = edesc->hw_desc;
-		init_job_desc_shared(desc, ptr, sh_len, HDR_SHARE_DEFER |
-				     HDR_REVERSE);
 
 		edesc->sec4_sg_dma = dma_map_single(jrdev, edesc->sec4_sg,
 						    sec4_sg_bytes,
 						    DMA_TO_DEVICE);
 		if (dma_mapping_error(jrdev, edesc->sec4_sg_dma)) {
 			dev_err(jrdev, "unable to map S/G table\n");
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto unmap_ctx;
 		}
 
 		append_seq_in_ptr(desc, edesc->sec4_sg_dma, to_hash, LDST_SGF);
 
 		ret = map_seq_out_ptr_ctx(desc, jrdev, state, ctx->ctx_len);
 		if (ret)
-			return ret;
+			goto unmap_ctx;
 
 #ifdef DEBUG
 		print_hex_dump(KERN_ERR, "jobdesc@"__stringify(__LINE__)": ",
@@ -1310,16 +1364,13 @@
 #endif
 
 		ret = caam_jr_enqueue(jrdev, desc, ahash_done_ctx_dst, req);
-		if (!ret) {
-			ret = -EINPROGRESS;
-			state->update = ahash_update_ctx;
-			state->finup = ahash_finup_ctx;
-			state->final = ahash_final_ctx;
-		} else {
-			ahash_unmap_ctx(jrdev, edesc, req, ctx->ctx_len,
-					DMA_TO_DEVICE);
-			kfree(edesc);
-		}
+		if (ret)
+			goto unmap_ctx;
+
+		ret = -EINPROGRESS;
+		state->update = ahash_update_ctx;
+		state->finup = ahash_finup_ctx;
+		state->final = ahash_final_ctx;
 	} else if (*next_buflen) {
 		scatterwalk_map_and_copy(buf + *buflen, req->src, 0,
 					 req->nbytes, 0);
@@ -1335,6 +1386,10 @@
 #endif
 
 	return ret;
+ unmap_ctx:
+	ahash_unmap_ctx(jrdev, edesc, req, ctx->ctx_len, DMA_TO_DEVICE);
+	kfree(edesc);
+	return ret;
 }
 
 /* submit ahash finup if it is the first job descriptor after update */
@@ -1350,61 +1405,63 @@
 	int buflen = state->current_buf ? state->buflen_1 : state->buflen_0;
 	int last_buflen = state->current_buf ? state->buflen_0 :
 			  state->buflen_1;
-	u32 *sh_desc = ctx->sh_desc_digest, *desc;
-	dma_addr_t ptr = ctx->sh_desc_digest_dma;
-	int sec4_sg_bytes, sec4_sg_src_index, src_nents;
+	u32 *desc;
+	int sec4_sg_bytes, sec4_sg_src_index, src_nents, mapped_nents;
 	int digestsize = crypto_ahash_digestsize(ahash);
 	struct ahash_edesc *edesc;
-	int sh_len;
-	int ret = 0;
+	int ret;
 
 	src_nents = sg_nents_for_len(req->src, req->nbytes);
 	if (src_nents < 0) {
 		dev_err(jrdev, "Invalid number of src SG.\n");
 		return src_nents;
 	}
+
+	if (src_nents) {
+		mapped_nents = dma_map_sg(jrdev, req->src, src_nents,
+					  DMA_TO_DEVICE);
+		if (!mapped_nents) {
+			dev_err(jrdev, "unable to DMA map source\n");
+			return -ENOMEM;
+		}
+	} else {
+		mapped_nents = 0;
+	}
+
 	sec4_sg_src_index = 2;
-	sec4_sg_bytes = (sec4_sg_src_index + src_nents) *
+	sec4_sg_bytes = (sec4_sg_src_index + mapped_nents) *
 			 sizeof(struct sec4_sg_entry);
 
 	/* allocate space for base edesc and hw desc commands, link tables */
-	edesc = kzalloc(sizeof(*edesc) + DESC_JOB_IO_LEN + sec4_sg_bytes,
-			GFP_DMA | flags);
+	edesc = ahash_edesc_alloc(ctx, sec4_sg_src_index + mapped_nents,
+				  ctx->sh_desc_digest, ctx->sh_desc_digest_dma,
+				  flags);
 	if (!edesc) {
-		dev_err(jrdev, "could not allocate extended descriptor\n");
+		dma_unmap_sg(jrdev, req->src, src_nents, DMA_TO_DEVICE);
 		return -ENOMEM;
 	}
 
-	sh_len = desc_len(sh_desc);
 	desc = edesc->hw_desc;
-	init_job_desc_shared(desc, ptr, sh_len, HDR_SHARE_DEFER | HDR_REVERSE);
 
 	edesc->src_nents = src_nents;
 	edesc->sec4_sg_bytes = sec4_sg_bytes;
-	edesc->sec4_sg = (void *)edesc + sizeof(struct ahash_edesc) +
-			 DESC_JOB_IO_LEN;
 
 	state->buf_dma = try_buf_map_to_sec4_sg(jrdev, edesc->sec4_sg, buf,
 						state->buf_dma, buflen,
 						last_buflen);
 
-	src_map_to_sec4_sg(jrdev, req->src, src_nents, edesc->sec4_sg + 1);
-
-	edesc->sec4_sg_dma = dma_map_single(jrdev, edesc->sec4_sg,
-					    sec4_sg_bytes, DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, edesc->sec4_sg_dma)) {
+	ret = ahash_edesc_add_src(ctx, edesc, req, mapped_nents, 1, buflen,
+				  req->nbytes);
+	if (ret) {
 		dev_err(jrdev, "unable to map S/G table\n");
-		return -ENOMEM;
+		goto unmap;
 	}
 
-	append_seq_in_ptr(desc, edesc->sec4_sg_dma, buflen +
-			       req->nbytes, LDST_SGF);
-
 	edesc->dst_dma = map_seq_out_ptr_result(desc, jrdev, req->result,
 						digestsize);
 	if (dma_mapping_error(jrdev, edesc->dst_dma)) {
 		dev_err(jrdev, "unable to map dst\n");
-		return -ENOMEM;
+		goto unmap;
 	}
 
 #ifdef DEBUG
@@ -1421,6 +1478,11 @@
 	}
 
 	return ret;
+ unmap:
+	ahash_unmap(jrdev, edesc, req, digestsize);
+	kfree(edesc);
+	return -ENOMEM;
 }
 
 /* submit first update job descriptor after init */
@@ -1436,78 +1498,65 @@
 	int *next_buflen = state->current_buf ?
 		&state->buflen_1 : &state->buflen_0;
 	int to_hash;
-	u32 *sh_desc = ctx->sh_desc_update_first, *desc;
-	dma_addr_t ptr = ctx->sh_desc_update_first_dma;
-	int sec4_sg_bytes, src_nents;
-	dma_addr_t src_dma;
-	u32 options;
+	u32 *desc;
+	int src_nents, mapped_nents;
 	struct ahash_edesc *edesc;
 	int ret = 0;
-	int sh_len;
 
 	*next_buflen = req->nbytes & (crypto_tfm_alg_blocksize(&ahash->base) -
 				      1);
 	to_hash = req->nbytes - *next_buflen;
 
 	if (to_hash) {
-		src_nents = sg_count(req->src, req->nbytes - (*next_buflen));
+		src_nents = sg_nents_for_len(req->src,
+					     req->nbytes - *next_buflen);
 		if (src_nents < 0) {
 			dev_err(jrdev, "Invalid number of src SG.\n");
 			return src_nents;
 		}
-		dma_map_sg(jrdev, req->src, src_nents ? : 1, DMA_TO_DEVICE);
-		sec4_sg_bytes = src_nents * sizeof(struct sec4_sg_entry);
+
+		if (src_nents) {
+			mapped_nents = dma_map_sg(jrdev, req->src, src_nents,
+						  DMA_TO_DEVICE);
+			if (!mapped_nents) {
+				dev_err(jrdev, "unable to map source for DMA\n");
+				return -ENOMEM;
+			}
+		} else {
+			mapped_nents = 0;
+		}
 
 		/*
 		 * allocate space for base edesc and hw desc commands,
 		 * link tables
 		 */
-		edesc = kzalloc(sizeof(*edesc) + DESC_JOB_IO_LEN +
-				sec4_sg_bytes, GFP_DMA | flags);
+		edesc = ahash_edesc_alloc(ctx, mapped_nents > 1 ?
+					  mapped_nents : 0,
+					  ctx->sh_desc_update_first,
+					  ctx->sh_desc_update_first_dma,
+					  flags);
 		if (!edesc) {
-			dev_err(jrdev,
-				"could not allocate extended descriptor\n");
+			dma_unmap_sg(jrdev, req->src, src_nents, DMA_TO_DEVICE);
 			return -ENOMEM;
 		}
 
 		edesc->src_nents = src_nents;
-		edesc->sec4_sg_bytes = sec4_sg_bytes;
-		edesc->sec4_sg = (void *)edesc + sizeof(struct ahash_edesc) +
-				 DESC_JOB_IO_LEN;
 		edesc->dst_dma = 0;
 
-		if (src_nents) {
-			sg_to_sec4_sg_last(req->src, src_nents,
-					   edesc->sec4_sg, 0);
-			edesc->sec4_sg_dma = dma_map_single(jrdev,
-							    edesc->sec4_sg,
-							    sec4_sg_bytes,
-							    DMA_TO_DEVICE);
-			if (dma_mapping_error(jrdev, edesc->sec4_sg_dma)) {
-				dev_err(jrdev, "unable to map S/G table\n");
-				return -ENOMEM;
-			}
-			src_dma = edesc->sec4_sg_dma;
-			options = LDST_SGF;
-		} else {
-			src_dma = sg_dma_address(req->src);
-			options = 0;
-		}
+		ret = ahash_edesc_add_src(ctx, edesc, req, mapped_nents, 0, 0,
+					  to_hash);
+		if (ret)
+			goto unmap_ctx;
 
 		if (*next_buflen)
 			scatterwalk_map_and_copy(next_buf, req->src, to_hash,
 						 *next_buflen, 0);
 
-		sh_len = desc_len(sh_desc);
 		desc = edesc->hw_desc;
-		init_job_desc_shared(desc, ptr, sh_len, HDR_SHARE_DEFER |
-				     HDR_REVERSE);
-
-		append_seq_in_ptr(desc, src_dma, to_hash, options);
 
 		ret = map_seq_out_ptr_ctx(desc, jrdev, state, ctx->ctx_len);
 		if (ret)
-			return ret;
+			goto unmap_ctx;
 
 #ifdef DEBUG
 		print_hex_dump(KERN_ERR, "jobdesc@"__stringify(__LINE__)": ",
@@ -1515,18 +1564,14 @@
 			       desc_bytes(desc), 1);
 #endif
 
-		ret = caam_jr_enqueue(jrdev, desc, ahash_done_ctx_dst,
-				      req);
-		if (!ret) {
-			ret = -EINPROGRESS;
-			state->update = ahash_update_ctx;
-			state->finup = ahash_finup_ctx;
-			state->final = ahash_final_ctx;
-		} else {
-			ahash_unmap_ctx(jrdev, edesc, req, ctx->ctx_len,
-					DMA_TO_DEVICE);
-			kfree(edesc);
-		}
+		ret = caam_jr_enqueue(jrdev, desc, ahash_done_ctx_dst, req);
+		if (ret)
+			goto unmap_ctx;
+
+		ret = -EINPROGRESS;
+		state->update = ahash_update_ctx;
+		state->finup = ahash_finup_ctx;
+		state->final = ahash_final_ctx;
 	} else if (*next_buflen) {
 		state->update = ahash_update_no_ctx;
 		state->finup = ahash_finup_no_ctx;
@@ -1541,6 +1586,10 @@
 #endif
 
 	return ret;
+ unmap_ctx:
+	ahash_unmap_ctx(jrdev, edesc, req, ctx->ctx_len, DMA_TO_DEVICE);
+	kfree(edesc);
+	return ret;
 }
 
 static int ahash_finup_first(struct ahash_request *req)
@@ -1799,7 +1848,6 @@
 					 HASH_MSG_LEN + SHA256_DIGEST_SIZE,
 					 HASH_MSG_LEN + 64,
 					 HASH_MSG_LEN + SHA512_DIGEST_SIZE };
-	int ret = 0;
 
 	/*
 	 * Get a Job ring from Job Ring driver to ensure in-order
@@ -1819,10 +1867,7 @@
 
 	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
 				 sizeof(struct caam_hash_state));
-
-	ret = ahash_set_sh_desc(ahash);
-
-	return ret;
+	return ahash_set_sh_desc(ahash);
 }
 
 static void caam_hash_cra_exit(struct crypto_tfm *tfm)
diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c
index 0ec112e..72ff196 100644
--- a/drivers/crypto/caam/ctrl.c
+++ b/drivers/crypto/caam/ctrl.c
@@ -14,6 +14,7 @@
 #include "jr.h"
 #include "desc_constr.h"
 #include "error.h"
+#include "ctrl.h"
 
 bool caam_little_end;
 EXPORT_SYMBOL(caam_little_end);
@@ -826,6 +827,8 @@
 
 caam_remove:
 	caam_remove(pdev);
+	return ret;
+
 iounmap_ctrl:
 	iounmap(ctrl);
 disable_caam_emi_slow:
diff --git a/drivers/crypto/caam/desc.h b/drivers/crypto/caam/desc.h
index 26427c1..513b664 100644
--- a/drivers/crypto/caam/desc.h
+++ b/drivers/crypto/caam/desc.h
@@ -23,13 +23,7 @@
 #define SEC4_SG_OFFSET_MASK	0x00001fff
 
 struct sec4_sg_entry {
-#if !defined(CONFIG_ARCH_DMA_ADDR_T_64BIT) && \
-	defined(CONFIG_CRYPTO_DEV_FSL_CAAM_IMX)
-	u32 rsvd1;
-	dma_addr_t ptr;
-#else
 	u64 ptr;
-#endif /* CONFIG_CRYPTO_DEV_FSL_CAAM_IMX */
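+	/* 64-bit in all configs; i.MX word order via cpu_to_caam_dma64() */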
 	u32 len;
 	u32 bpid_offset;
 };
diff --git a/drivers/crypto/caam/desc_constr.h b/drivers/crypto/caam/desc_constr.h
index d3869b9..a8cd8a7 100644
--- a/drivers/crypto/caam/desc_constr.h
+++ b/drivers/crypto/caam/desc_constr.h
@@ -325,6 +325,23 @@
 APPEND_CMD_RAW_IMM(load, LOAD, u32);
 
 /*
+ * ee - endianness
+ * size - size of immediate type in bytes
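+ *
+ * Expands to, e.g., append_load_imm_be32(desc, immediate, options),
+ * which converts the immediate to the requested endianness before
+ * appending it to the descriptor.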
+ */
+#define APPEND_CMD_RAW_IMM2(cmd, op, ee, size) \
+static inline void append_##cmd##_imm_##ee##size(u32 *desc, \
+						   u##size immediate, \
+						   u32 options) \
+{ \
+	__##ee##size data = cpu_to_##ee##size(immediate); \
+	PRINT_POS; \
+	append_cmd(desc, CMD_##op | IMMEDIATE | options | sizeof(data)); \
+	append_data(desc, &data, sizeof(data)); \
+}
+
+APPEND_CMD_RAW_IMM2(load, LOAD, be, 32);
+
+/*
  * Append math command. Only the last part of destination and source need to
  * be specified
  */
diff --git a/drivers/crypto/caam/intern.h b/drivers/crypto/caam/intern.h
index e2bcacc..5d4c050 100644
--- a/drivers/crypto/caam/intern.h
+++ b/drivers/crypto/caam/intern.h
@@ -41,7 +41,6 @@
 	struct device		*dev;
 	int ridx;
 	struct caam_job_ring __iomem *rregs;	/* JobR's register space */
-	struct tasklet_struct irqtask;
 	int irq;			/* One per queue */
 
 	/* Number of scatterlist crypt transforms active on the JobR */
diff --git a/drivers/crypto/caam/jr.c b/drivers/crypto/caam/jr.c
index a81f551..757c27f 100644
--- a/drivers/crypto/caam/jr.c
+++ b/drivers/crypto/caam/jr.c
@@ -73,8 +73,6 @@
 
 	ret = caam_reset_hw_jr(dev);
 
-	tasklet_kill(&jrp->irqtask);
-
 	/* Release interrupt */
 	free_irq(jrp->irq, dev);
 
@@ -130,7 +128,7 @@
 
 	/*
 	 * Check the output ring for ready responses, kick
-	 * tasklet if jobs done.
+	 * the threaded irq handler if jobs are done.
 	 */
 	irqstate = rd_reg32(&jrp->rregs->jrintstatus);
 	if (!irqstate)
@@ -152,18 +150,13 @@
 	/* Have valid interrupt at this point, just ACK and trigger */
 	wr_reg32(&jrp->rregs->jrintstatus, irqstate);
 
-	preempt_disable();
-	tasklet_schedule(&jrp->irqtask);
-	preempt_enable();
-
-	return IRQ_HANDLED;
+	return IRQ_WAKE_THREAD;
 }
 
-/* Deferred service handler, run as interrupt-fired tasklet */
-static void caam_jr_dequeue(unsigned long devarg)
+static irqreturn_t caam_jr_threadirq(int irq, void *st_dev)
 {
 	int hw_idx, sw_idx, i, head, tail;
-	struct device *dev = (struct device *)devarg;
+	struct device *dev = st_dev;
 	struct caam_drv_private_jr *jrp = dev_get_drvdata(dev);
 	void (*usercall)(struct device *dev, u32 *desc, u32 status, void *arg);
 	u32 *userdesc, userstatus;
@@ -237,6 +230,8 @@
 
 	/* reenable / unmask IRQs */
 	clrsetbits_32(&jrp->rregs->rconfig_lo, JRCFG_IMSK, 0);
+
+	return IRQ_HANDLED;
 }
 
 /**
@@ -394,11 +389,10 @@
 
 	jrp = dev_get_drvdata(dev);
 
-	tasklet_init(&jrp->irqtask, caam_jr_dequeue, (unsigned long)dev);
-
 	/* Connect job ring interrupt handler. */
-	error = request_irq(jrp->irq, caam_jr_interrupt, IRQF_SHARED,
-			    dev_name(dev), dev);
+	error = request_threaded_irq(jrp->irq, caam_jr_interrupt,
+				     caam_jr_threadirq, IRQF_SHARED,
+				     dev_name(dev), dev);
 	if (error) {
 		dev_err(dev, "can't connect JobR %d interrupt (%d)\n",
 			jrp->ridx, jrp->irq);
@@ -460,7 +454,6 @@
 out_free_irq:
 	free_irq(jrp->irq, dev);
 out_kill_deq:
-	tasklet_kill(&jrp->irqtask);
 	return error;
 }
 
@@ -513,6 +506,7 @@
 	error = caam_jr_init(jrdev); /* now turn on hardware */
 	if (error) {
 		irq_dispose_mapping(jrpriv->irq);
+		iounmap(ctrl);
 		return error;
 	}
 
diff --git a/drivers/crypto/caam/regs.h b/drivers/crypto/caam/regs.h
index b3c5016..84d2f83 100644
--- a/drivers/crypto/caam/regs.h
+++ b/drivers/crypto/caam/regs.h
@@ -196,6 +196,14 @@
 #define caam_dma_to_cpu(value) caam32_to_cpu(value)
 #endif /* CONFIG_ARCH_DMA_ADDR_T_64BIT  */
 
+#ifdef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX
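+/*
+ * i.MX keeps the two 32-bit halves of a 64-bit SEC4 pointer in the
+ * opposite order to a native 64-bit store (presumably a consequence of
+ * its 32-bit DMA configuration), hence the explicit swap here.
+ */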
+#define cpu_to_caam_dma64(value) \
+		(((u64)cpu_to_caam32(lower_32_bits(value)) << 32) | \
+		 (u64)cpu_to_caam32(upper_32_bits(value)))
+#else
+#define cpu_to_caam_dma64(value) cpu_to_caam64(value)
+#endif
+
 /*
  * jr_outentry
  * Represents each entry in a JobR output ring
diff --git a/drivers/crypto/caam/sg_sw_sec4.h b/drivers/crypto/caam/sg_sw_sec4.h
index 19dc64f..41cd5a3 100644
--- a/drivers/crypto/caam/sg_sw_sec4.h
+++ b/drivers/crypto/caam/sg_sw_sec4.h
@@ -15,7 +15,7 @@
 static inline void dma_to_sec4_sg_one(struct sec4_sg_entry *sec4_sg_ptr,
 				      dma_addr_t dma, u32 len, u16 offset)
 {
-	sec4_sg_ptr->ptr = cpu_to_caam_dma(dma);
+	sec4_sg_ptr->ptr = cpu_to_caam_dma64(dma);
 	sec4_sg_ptr->len = cpu_to_caam32(len);
 	sec4_sg_ptr->bpid_offset = cpu_to_caam32(offset & SEC4_SG_OFFSET_MASK);
 #ifdef DEBUG
diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile
index ee4d274..346ceb8 100644
--- a/drivers/crypto/ccp/Makefile
+++ b/drivers/crypto/ccp/Makefile
@@ -2,6 +2,7 @@
 ccp-objs := ccp-dev.o \
 	    ccp-ops.o \
 	    ccp-dev-v3.o \
+	    ccp-dev-v5.o \
 	    ccp-platform.o \
 	    ccp-dmaengine.o
 ccp-$(CONFIG_PCI) += ccp-pci.o
diff --git a/drivers/crypto/ccp/ccp-crypto-sha.c b/drivers/crypto/ccp/ccp-crypto-sha.c
index 8f36af6..84a652b 100644
--- a/drivers/crypto/ccp/ccp-crypto-sha.c
+++ b/drivers/crypto/ccp/ccp-crypto-sha.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2013,2016 Advanced Micro Devices, Inc.
  *
  * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ * Author: Gary R Hook <gary.hook@amd.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -134,7 +135,22 @@
 	rctx->cmd.engine = CCP_ENGINE_SHA;
 	rctx->cmd.u.sha.type = rctx->type;
 	rctx->cmd.u.sha.ctx = &rctx->ctx_sg;
-	rctx->cmd.u.sha.ctx_len = sizeof(rctx->ctx);
+
+	switch (rctx->type) {
+	case CCP_SHA_TYPE_1:
+		rctx->cmd.u.sha.ctx_len = SHA1_DIGEST_SIZE;
+		break;
+	case CCP_SHA_TYPE_224:
+		rctx->cmd.u.sha.ctx_len = SHA224_DIGEST_SIZE;
+		break;
+	case CCP_SHA_TYPE_256:
+		rctx->cmd.u.sha.ctx_len = SHA256_DIGEST_SIZE;
+		break;
+	default:
+		/* Should never get here */
+		break;
+	}
+
 	rctx->cmd.u.sha.src = sg;
 	rctx->cmd.u.sha.src_len = rctx->hash_cnt;
 	rctx->cmd.u.sha.opad = ctx->u.sha.key_len ?
diff --git a/drivers/crypto/ccp/ccp-dev-v3.c b/drivers/crypto/ccp/ccp-dev-v3.c
index d7a7103..8d2dbac 100644
--- a/drivers/crypto/ccp/ccp-dev-v3.c
+++ b/drivers/crypto/ccp/ccp-dev-v3.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2013,2016 Advanced Micro Devices, Inc.
  *
  * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ * Author: Gary R Hook <gary.hook@amd.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -19,6 +20,61 @@
 
 #include "ccp-dev.h"
 
+static u32 ccp_alloc_ksb(struct ccp_cmd_queue *cmd_q, unsigned int count)
+{
+	int start;
+	struct ccp_device *ccp = cmd_q->ccp;
+
+	for (;;) {
+		mutex_lock(&ccp->sb_mutex);
+
+		start = (u32)bitmap_find_next_zero_area(ccp->sb,
+							ccp->sb_count,
+							ccp->sb_start,
+							count, 0);
+		if (start < ccp->sb_count) {
+			bitmap_set(ccp->sb, start, count);
+
+			mutex_unlock(&ccp->sb_mutex);
+			break;
+		}
+
+		ccp->sb_avail = 0;
+
+		mutex_unlock(&ccp->sb_mutex);
+
+		/* Wait for KSB entries to become available */
+		if (wait_event_interruptible(ccp->sb_queue, ccp->sb_avail))
+			return 0;
+	}
+
+	return KSB_START + start;
+}
+
+static void ccp_free_ksb(struct ccp_cmd_queue *cmd_q, unsigned int start,
+			 unsigned int count)
+{
+	struct ccp_device *ccp = cmd_q->ccp;
+
+	if (!start)
+		return;
+
+	mutex_lock(&ccp->sb_mutex);
+
+	bitmap_clear(ccp->sb, start - KSB_START, count);
+
+	ccp->sb_avail = 1;
+
+	mutex_unlock(&ccp->sb_mutex);
+
+	wake_up_interruptible_all(&ccp->sb_queue);
+}
+
+static unsigned int ccp_get_free_slots(struct ccp_cmd_queue *cmd_q)
+{
+	return CMD_Q_DEPTH(ioread32(cmd_q->reg_status));
+}
+
 static int ccp_do_cmd(struct ccp_op *op, u32 *cr, unsigned int cr_count)
 {
 	struct ccp_cmd_queue *cmd_q = op->cmd_q;
@@ -68,6 +124,9 @@
 			/* On error delete all related jobs from the queue */
 			cmd = (cmd_q->id << DEL_Q_ID_SHIFT)
 			      | op->jobid;
+			if (cmd_q->cmd_error)
+				ccp_log_error(cmd_q->ccp,
+					      cmd_q->cmd_error);
 
 			iowrite32(cmd, ccp->io_regs + DEL_CMD_Q_JOB);
 
@@ -99,10 +158,10 @@
 		| (op->u.aes.type << REQ1_AES_TYPE_SHIFT)
 		| (op->u.aes.mode << REQ1_AES_MODE_SHIFT)
 		| (op->u.aes.action << REQ1_AES_ACTION_SHIFT)
-		| (op->ksb_key << REQ1_KEY_KSB_SHIFT);
+		| (op->sb_key << REQ1_KEY_KSB_SHIFT);
 	cr[1] = op->src.u.dma.length - 1;
 	cr[2] = ccp_addr_lo(&op->src.u.dma);
-	cr[3] = (op->ksb_ctx << REQ4_KSB_SHIFT)
+	cr[3] = (op->sb_ctx << REQ4_KSB_SHIFT)
 		| (CCP_MEMTYPE_SYSTEM << REQ4_MEMTYPE_SHIFT)
 		| ccp_addr_hi(&op->src.u.dma);
 	cr[4] = ccp_addr_lo(&op->dst.u.dma);
@@ -129,10 +188,10 @@
 	cr[0] = (CCP_ENGINE_XTS_AES_128 << REQ1_ENGINE_SHIFT)
 		| (op->u.xts.action << REQ1_AES_ACTION_SHIFT)
 		| (op->u.xts.unit_size << REQ1_XTS_AES_SIZE_SHIFT)
-		| (op->ksb_key << REQ1_KEY_KSB_SHIFT);
+		| (op->sb_key << REQ1_KEY_KSB_SHIFT);
 	cr[1] = op->src.u.dma.length - 1;
 	cr[2] = ccp_addr_lo(&op->src.u.dma);
-	cr[3] = (op->ksb_ctx << REQ4_KSB_SHIFT)
+	cr[3] = (op->sb_ctx << REQ4_KSB_SHIFT)
 		| (CCP_MEMTYPE_SYSTEM << REQ4_MEMTYPE_SHIFT)
 		| ccp_addr_hi(&op->src.u.dma);
 	cr[4] = ccp_addr_lo(&op->dst.u.dma);
@@ -158,7 +217,7 @@
 		| REQ1_INIT;
 	cr[1] = op->src.u.dma.length - 1;
 	cr[2] = ccp_addr_lo(&op->src.u.dma);
-	cr[3] = (op->ksb_ctx << REQ4_KSB_SHIFT)
+	cr[3] = (op->sb_ctx << REQ4_KSB_SHIFT)
 		| (CCP_MEMTYPE_SYSTEM << REQ4_MEMTYPE_SHIFT)
 		| ccp_addr_hi(&op->src.u.dma);
 
@@ -181,11 +240,11 @@
 	/* Fill out the register contents for REQ1 through REQ6 */
 	cr[0] = (CCP_ENGINE_RSA << REQ1_ENGINE_SHIFT)
 		| (op->u.rsa.mod_size << REQ1_RSA_MOD_SIZE_SHIFT)
-		| (op->ksb_key << REQ1_KEY_KSB_SHIFT)
+		| (op->sb_key << REQ1_KEY_KSB_SHIFT)
 		| REQ1_EOM;
 	cr[1] = op->u.rsa.input_len - 1;
 	cr[2] = ccp_addr_lo(&op->src.u.dma);
-	cr[3] = (op->ksb_ctx << REQ4_KSB_SHIFT)
+	cr[3] = (op->sb_ctx << REQ4_KSB_SHIFT)
 		| (CCP_MEMTYPE_SYSTEM << REQ4_MEMTYPE_SHIFT)
 		| ccp_addr_hi(&op->src.u.dma);
 	cr[4] = ccp_addr_lo(&op->dst.u.dma);
@@ -215,10 +274,10 @@
 			| ccp_addr_hi(&op->src.u.dma);
 
 		if (op->u.passthru.bit_mod != CCP_PASSTHRU_BITWISE_NOOP)
-			cr[3] |= (op->ksb_key << REQ4_KSB_SHIFT);
+			cr[3] |= (op->sb_key << REQ4_KSB_SHIFT);
 	} else {
-		cr[2] = op->src.u.ksb * CCP_KSB_BYTES;
-		cr[3] = (CCP_MEMTYPE_KSB << REQ4_MEMTYPE_SHIFT);
+		cr[2] = op->src.u.sb * CCP_SB_BYTES;
+		cr[3] = (CCP_MEMTYPE_SB << REQ4_MEMTYPE_SHIFT);
 	}
 
 	if (op->dst.type == CCP_MEMTYPE_SYSTEM) {
@@ -226,8 +285,8 @@
 		cr[5] = (CCP_MEMTYPE_SYSTEM << REQ6_MEMTYPE_SHIFT)
 			| ccp_addr_hi(&op->dst.u.dma);
 	} else {
-		cr[4] = op->dst.u.ksb * CCP_KSB_BYTES;
-		cr[5] = (CCP_MEMTYPE_KSB << REQ6_MEMTYPE_SHIFT);
+		cr[4] = op->dst.u.sb * CCP_SB_BYTES;
+		cr[5] = (CCP_MEMTYPE_SB << REQ6_MEMTYPE_SHIFT);
 	}
 
 	if (op->eom)
@@ -256,35 +315,6 @@
 	return ccp_do_cmd(op, cr, ARRAY_SIZE(cr));
 }
 
-static int ccp_trng_read(struct hwrng *rng, void *data, size_t max, bool wait)
-{
-	struct ccp_device *ccp = container_of(rng, struct ccp_device, hwrng);
-	u32 trng_value;
-	int len = min_t(int, sizeof(trng_value), max);
-
-	/*
-	 * Locking is provided by the caller so we can update device
-	 * hwrng-related fields safely
-	 */
-	trng_value = ioread32(ccp->io_regs + TRNG_OUT_REG);
-	if (!trng_value) {
-		/* Zero is returned if not data is available or if a
-		 * bad-entropy error is present. Assume an error if
-		 * we exceed TRNG_RETRIES reads of zero.
-		 */
-		if (ccp->hwrng_retries++ > TRNG_RETRIES)
-			return -EIO;
-
-		return 0;
-	}
-
-	/* Reset the counter and save the rng value */
-	ccp->hwrng_retries = 0;
-	memcpy(data, &trng_value, len);
-
-	return len;
-}
-
 static int ccp_init(struct ccp_device *ccp)
 {
 	struct device *dev = ccp->dev;
@@ -321,9 +351,9 @@
 		cmd_q->dma_pool = dma_pool;
 
 		/* Reserve 2 KSB regions for the queue */
-		cmd_q->ksb_key = KSB_START + ccp->ksb_start++;
-		cmd_q->ksb_ctx = KSB_START + ccp->ksb_start++;
-		ccp->ksb_count -= 2;
+		cmd_q->sb_key = KSB_START + ccp->sb_start++;
+		cmd_q->sb_ctx = KSB_START + ccp->sb_start++;
+		ccp->sb_count -= 2;
 
 		/* Preset some register values and masks that are queue
 		 * number dependent
@@ -335,7 +365,7 @@
 		cmd_q->int_ok = 1 << (i * 2);
 		cmd_q->int_err = 1 << ((i * 2) + 1);
 
-		cmd_q->free_slots = CMD_Q_DEPTH(ioread32(cmd_q->reg_status));
+		cmd_q->free_slots = ccp_get_free_slots(cmd_q);
 
 		init_waitqueue_head(&cmd_q->int_queue);
 
@@ -375,9 +405,10 @@
 	}
 
 	/* Initialize the queues used to wait for KSB space and suspend */
-	init_waitqueue_head(&ccp->ksb_queue);
+	init_waitqueue_head(&ccp->sb_queue);
 	init_waitqueue_head(&ccp->suspend_queue);
 
+	dev_dbg(dev, "Starting threads...\n");
 	/* Create a kthread for each queue */
 	for (i = 0; i < ccp->cmd_q_count; i++) {
 		struct task_struct *kthread;
@@ -397,29 +428,26 @@
 		wake_up_process(kthread);
 	}
 
-	/* Register the RNG */
-	ccp->hwrng.name = ccp->rngname;
-	ccp->hwrng.read = ccp_trng_read;
-	ret = hwrng_register(&ccp->hwrng);
-	if (ret) {
-		dev_err(dev, "error registering hwrng (%d)\n", ret);
+	dev_dbg(dev, "Enabling interrupts...\n");
+	/* Enable interrupts */
+	iowrite32(qim, ccp->io_regs + IRQ_MASK_REG);
+
+	dev_dbg(dev, "Registering device...\n");
+	ccp_add_device(ccp);
+
+	ret = ccp_register_rng(ccp);
+	if (ret)
 		goto e_kthread;
-	}
 
 	/* Register the DMA engine support */
 	ret = ccp_dmaengine_register(ccp);
 	if (ret)
 		goto e_hwrng;
 
-	ccp_add_device(ccp);
-
-	/* Enable interrupts */
-	iowrite32(qim, ccp->io_regs + IRQ_MASK_REG);
-
 	return 0;
 
 e_hwrng:
-	hwrng_unregister(&ccp->hwrng);
+	ccp_unregister_rng(ccp);
 
 e_kthread:
 	for (i = 0; i < ccp->cmd_q_count; i++)
@@ -441,19 +469,14 @@
 	struct ccp_cmd *cmd;
 	unsigned int qim, i;
 
-	/* Remove this device from the list of available units first */
-	ccp_del_device(ccp);
-
 	/* Unregister the DMA engine */
 	ccp_dmaengine_unregister(ccp);
 
 	/* Unregister the RNG */
-	hwrng_unregister(&ccp->hwrng);
+	ccp_unregister_rng(ccp);
 
-	/* Stop the queue kthreads */
-	for (i = 0; i < ccp->cmd_q_count; i++)
-		if (ccp->cmd_q[i].kthread)
-			kthread_stop(ccp->cmd_q[i].kthread);
+	/* Remove this device from the list of available units */
+	ccp_del_device(ccp);
 
 	/* Build queue interrupt mask (two interrupt masks per queue) */
 	qim = 0;
@@ -472,6 +495,11 @@
 	}
 	iowrite32(qim, ccp->io_regs + IRQ_STATUS_REG);
 
+	/* Stop the queue kthreads */
+	for (i = 0; i < ccp->cmd_q_count; i++)
+		if (ccp->cmd_q[i].kthread)
+			kthread_stop(ccp->cmd_q[i].kthread);
+
 	ccp->free_irq(ccp);
 
 	for (i = 0; i < ccp->cmd_q_count; i++)
@@ -527,18 +555,24 @@
 }
 
 static const struct ccp_actions ccp3_actions = {
-	.perform_aes = ccp_perform_aes,
-	.perform_xts_aes = ccp_perform_xts_aes,
-	.perform_sha = ccp_perform_sha,
-	.perform_rsa = ccp_perform_rsa,
-	.perform_passthru = ccp_perform_passthru,
-	.perform_ecc = ccp_perform_ecc,
+	.aes = ccp_perform_aes,
+	.xts_aes = ccp_perform_xts_aes,
+	.sha = ccp_perform_sha,
+	.rsa = ccp_perform_rsa,
+	.passthru = ccp_perform_passthru,
+	.ecc = ccp_perform_ecc,
+	.sballoc = ccp_alloc_ksb,
+	.sbfree = ccp_free_ksb,
 	.init = ccp_init,
 	.destroy = ccp_destroy,
+	.get_free_slots = ccp_get_free_slots,
 	.irqhandler = ccp_irq_handler,
 };
 
-struct ccp_vdata ccpv3 = {
+const struct ccp_vdata ccpv3 = {
 	.version = CCP_VERSION(3, 0),
+	.setup = NULL,
 	.perform = &ccp3_actions,
+	.bar = 2,
+	.offset = 0x20000,
 };
diff --git a/drivers/crypto/ccp/ccp-dev-v5.c b/drivers/crypto/ccp/ccp-dev-v5.c
new file mode 100644
index 0000000..faf3cb3
--- /dev/null
+++ b/drivers/crypto/ccp/ccp-dev-v5.c
@@ -0,0 +1,1017 @@
+/*
+ * AMD Cryptographic Coprocessor (CCP) driver
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Gary R Hook <gary.hook@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/kthread.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/compiler.h>
+#include <linux/ccp.h>
+
+#include "ccp-dev.h"
+
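+/*
+ * Allocate LSB entries for a queue: try the queue's private LSB map
+ * first, then fall back to the shared pool, sleeping until entries
+ * become free.  Returns 0 if the wait is interrupted.
+ */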
+static u32 ccp_lsb_alloc(struct ccp_cmd_queue *cmd_q, unsigned int count)
+{
+	struct ccp_device *ccp;
+	int start;
+
+	/* First look at the map for the queue */
+	if (cmd_q->lsb >= 0) {
+		start = (u32)bitmap_find_next_zero_area(cmd_q->lsbmap,
+							LSB_SIZE,
+							0, count, 0);
+		if (start < LSB_SIZE) {
+			bitmap_set(cmd_q->lsbmap, start, count);
+			return start + cmd_q->lsb * LSB_SIZE;
+		}
+	}
+
+	/* No joy; try to get an entry from the shared blocks */
+	ccp = cmd_q->ccp;
+	for (;;) {
+		mutex_lock(&ccp->sb_mutex);
+
+		start = (u32)bitmap_find_next_zero_area(ccp->lsbmap,
+							MAX_LSB_CNT * LSB_SIZE,
+							0,
+							count, 0);
+		if (start < MAX_LSB_CNT * LSB_SIZE) {
+			bitmap_set(ccp->lsbmap, start, count);
+
+			mutex_unlock(&ccp->sb_mutex);
+			return start * LSB_ITEM_SIZE;
+		}
+
+		ccp->sb_avail = 0;
+
+		mutex_unlock(&ccp->sb_mutex);
+
+		/* Wait for KSB entries to become available */
+		if (wait_event_interruptible(ccp->sb_queue, ccp->sb_avail))
+			return 0;
+	}
+}
+
+static void ccp_lsb_free(struct ccp_cmd_queue *cmd_q, unsigned int start,
+			 unsigned int count)
+{
+	int lsbno = start / LSB_SIZE;
+
+	if (!start)
+		return;
+
+	if (cmd_q->lsb == lsbno) {
+		/* An entry from the private LSB */
+		bitmap_clear(cmd_q->lsbmap, start % LSB_SIZE, count);
+	} else {
+		/* From the shared LSBs */
+		struct ccp_device *ccp = cmd_q->ccp;
+
+		mutex_lock(&ccp->sb_mutex);
+		bitmap_clear(ccp->lsbmap, start, count);
+		ccp->sb_avail = 1;
+		mutex_unlock(&ccp->sb_mutex);
+		wake_up_interruptible_all(&ccp->sb_queue);
+	}
+}
+
+/* CCP version 5: Union to define the function field (cmd_reg1/dword0) */
+union ccp_function {
+	struct {
+		u16 size:7;
+		u16 encrypt:1;
+		u16 mode:5;
+		u16 type:2;
+	} aes;
+	struct {
+		u16 size:7;
+		u16 encrypt:1;
+		u16 rsvd:5;
+		u16 type:2;
+	} aes_xts;
+	struct {
+		u16 rsvd1:10;
+		u16 type:4;
+		u16 rsvd2:1;
+	} sha;
+	struct {
+		u16 mode:3;
+		u16 size:12;
+	} rsa;
+	struct {
+		u16 byteswap:2;
+		u16 bitwise:3;
+		u16 reflect:2;
+		u16 rsvd:8;
+	} pt;
+	struct  {
+		u16 rsvd:13;
+	} zlib;
+	struct {
+		u16 size:10;
+		u16 type:2;
+		u16 mode:3;
+	} ecc;
+	u16 raw;
+};
+
+#define	CCP_AES_SIZE(p)		((p)->aes.size)
+#define	CCP_AES_ENCRYPT(p)	((p)->aes.encrypt)
+#define	CCP_AES_MODE(p)		((p)->aes.mode)
+#define	CCP_AES_TYPE(p)		((p)->aes.type)
+#define	CCP_XTS_SIZE(p)		((p)->aes_xts.size)
+#define	CCP_XTS_ENCRYPT(p)	((p)->aes_xts.encrypt)
+#define	CCP_SHA_TYPE(p)		((p)->sha.type)
+#define	CCP_RSA_SIZE(p)		((p)->rsa.size)
+#define	CCP_PT_BYTESWAP(p)	((p)->pt.byteswap)
+#define	CCP_PT_BITWISE(p)	((p)->pt.bitwise)
+#define	CCP_ECC_MODE(p)		((p)->ecc.mode)
+#define	CCP_ECC_AFFINE(p)	((p)->ecc.one)
+
+/* Word 0 */
+#define CCP5_CMD_DW0(p)		((p)->dw0)
+#define CCP5_CMD_SOC(p)		(CCP5_CMD_DW0(p).soc)
+#define CCP5_CMD_IOC(p)		(CCP5_CMD_DW0(p).ioc)
+#define CCP5_CMD_INIT(p)	(CCP5_CMD_DW0(p).init)
+#define CCP5_CMD_EOM(p)		(CCP5_CMD_DW0(p).eom)
+#define CCP5_CMD_FUNCTION(p)	(CCP5_CMD_DW0(p).function)
+#define CCP5_CMD_ENGINE(p)	(CCP5_CMD_DW0(p).engine)
+#define CCP5_CMD_PROT(p)	(CCP5_CMD_DW0(p).prot)
+
+/* Word 1 */
+#define CCP5_CMD_DW1(p)		((p)->length)
+#define CCP5_CMD_LEN(p)		(CCP5_CMD_DW1(p))
+
+/* Word 2 */
+#define CCP5_CMD_DW2(p)		((p)->src_lo)
+#define CCP5_CMD_SRC_LO(p)	(CCP5_CMD_DW2(p))
+
+/* Word 3 */
+#define CCP5_CMD_DW3(p)		((p)->dw3)
+#define CCP5_CMD_SRC_MEM(p)	((p)->dw3.src_mem)
+#define CCP5_CMD_SRC_HI(p)	((p)->dw3.src_hi)
+#define CCP5_CMD_LSB_ID(p)	((p)->dw3.lsb_cxt_id)
+#define CCP5_CMD_FIX_SRC(p)	((p)->dw3.fixed)
+
+/* Words 4/5 */
+#define CCP5_CMD_DW4(p)		((p)->dw4)
+#define CCP5_CMD_DST_LO(p)	(CCP5_CMD_DW4(p).dst_lo)
+#define CCP5_CMD_DW5(p)		((p)->dw5.fields.dst_hi)
+#define CCP5_CMD_DST_HI(p)	(CCP5_CMD_DW5(p))
+#define CCP5_CMD_DST_MEM(p)	((p)->dw5.fields.dst_mem)
+#define CCP5_CMD_FIX_DST(p)	((p)->dw5.fields.fixed)
+#define CCP5_CMD_SHA_LO(p)	((p)->dw4.sha_len_lo)
+#define CCP5_CMD_SHA_HI(p)	((p)->dw5.sha_len_hi)
+
+/* Word 6/7 */
+#define CCP5_CMD_DW6(p)		((p)->key_lo)
+#define CCP5_CMD_KEY_LO(p)	(CCP5_CMD_DW6(p))
+#define CCP5_CMD_DW7(p)		((p)->dw7)
+#define CCP5_CMD_KEY_HI(p)	((p)->dw7.key_hi)
+#define CCP5_CMD_KEY_MEM(p)	((p)->dw7.key_mem)
+
+static inline u32 low_address(unsigned long addr)
+{
+	return (u64)addr & 0x0ffffffff;
+}
+
+static inline u32 high_address(unsigned long addr)
+{
+	return ((u64)addr >> 32) & 0x00000ffff;
+}
+
+static unsigned int ccp5_get_free_slots(struct ccp_cmd_queue *cmd_q)
+{
+	unsigned int head_idx, n;
+	u32 head_lo, queue_start;
+
+	queue_start = low_address(cmd_q->qdma_tail);
+	head_lo = ioread32(cmd_q->reg_head_lo);
+	head_idx = (head_lo - queue_start) / sizeof(struct ccp5_desc);
+
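+	/*
+	 * Free slots are the circular-buffer distance from the software
+	 * write index (qidx) to the hardware read head, minus the one
+	 * slot kept empty so a full ring is distinguishable from empty.
+	 */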
+	n = head_idx + COMMANDS_PER_QUEUE - cmd_q->qidx - 1;
+
+	return n % COMMANDS_PER_QUEUE; /* Always one unused spot */
+}
+
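+/*
+ * Copy one descriptor into the queue's ring buffer, advance the tail
+ * pointer and re-enable the queue; if interrupt-on-completion is set,
+ * wait for the job to finish and log any queue error.
+ */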
+static int ccp5_do_cmd(struct ccp5_desc *desc,
+		       struct ccp_cmd_queue *cmd_q)
+{
+	__le32 *mP;
+	u32 *dP;
+	u32 tail;
+	int i;
+	int ret = 0;
+
+	if (CCP5_CMD_SOC(desc)) {
+		CCP5_CMD_IOC(desc) = 1;
+		CCP5_CMD_SOC(desc) = 0;
+	}
+	mutex_lock(&cmd_q->q_mutex);
+
+	mP = (__le32 *) &cmd_q->qbase[cmd_q->qidx];
+	dP = (u32 *) desc;
+	for (i = 0; i < 8; i++)
+		mP[i] = cpu_to_le32(dP[i]); /* handle endianness */
+
+	cmd_q->qidx = (cmd_q->qidx + 1) % COMMANDS_PER_QUEUE;
+
+	/* The data used by this command must be flushed to memory */
+	wmb();
+
+	/* Write the new tail address back to the queue register */
+	tail = low_address(cmd_q->qdma_tail + cmd_q->qidx * Q_DESC_SIZE);
+	iowrite32(tail, cmd_q->reg_tail_lo);
+
+	/* Turn the queue back on using our cached control register */
+	iowrite32(cmd_q->qcontrol | CMD5_Q_RUN, cmd_q->reg_control);
+	mutex_unlock(&cmd_q->q_mutex);
+
+	if (CCP5_CMD_IOC(desc)) {
+		/* Wait for the job to complete */
+		ret = wait_event_interruptible(cmd_q->int_queue,
+					       cmd_q->int_rcvd);
+		if (ret || cmd_q->cmd_error) {
+			if (cmd_q->cmd_error)
+				ccp_log_error(cmd_q->ccp,
+					      cmd_q->cmd_error);
+			/* A version 5 device doesn't use Job IDs... */
+			if (!ret)
+				ret = -EIO;
+		}
+		cmd_q->int_rcvd = 0;
+	}
+
+	return ret;
+}
+
+static int ccp5_perform_aes(struct ccp_op *op)
+{
+	struct ccp5_desc desc;
+	union ccp_function function;
+	u32 key_addr = op->sb_key * LSB_ITEM_SIZE;
+
+	/* Zero out all the fields of the command desc */
+	memset(&desc, 0, Q_DESC_SIZE);
+
+	CCP5_CMD_ENGINE(&desc) = CCP_ENGINE_AES;
+
+	CCP5_CMD_SOC(&desc) = op->soc;
+	CCP5_CMD_IOC(&desc) = 1;
+	CCP5_CMD_INIT(&desc) = op->init;
+	CCP5_CMD_EOM(&desc) = op->eom;
+	CCP5_CMD_PROT(&desc) = 0;
+
+	function.raw = 0;
+	CCP_AES_ENCRYPT(&function) = op->u.aes.action;
+	CCP_AES_MODE(&function) = op->u.aes.mode;
+	CCP_AES_TYPE(&function) = op->u.aes.type;
+	if (op->u.aes.mode == CCP_AES_MODE_CFB)
+		CCP_AES_SIZE(&function) = 0x7f;
+
+	CCP5_CMD_FUNCTION(&desc) = function.raw;
+
+	CCP5_CMD_LEN(&desc) = op->src.u.dma.length;
+
+	CCP5_CMD_SRC_LO(&desc) = ccp_addr_lo(&op->src.u.dma);
+	CCP5_CMD_SRC_HI(&desc) = ccp_addr_hi(&op->src.u.dma);
+	CCP5_CMD_SRC_MEM(&desc) = CCP_MEMTYPE_SYSTEM;
+
+	CCP5_CMD_DST_LO(&desc) = ccp_addr_lo(&op->dst.u.dma);
+	CCP5_CMD_DST_HI(&desc) = ccp_addr_hi(&op->dst.u.dma);
+	CCP5_CMD_DST_MEM(&desc) = CCP_MEMTYPE_SYSTEM;
+
+	CCP5_CMD_KEY_LO(&desc) = lower_32_bits(key_addr);
+	CCP5_CMD_KEY_HI(&desc) = 0;
+	CCP5_CMD_KEY_MEM(&desc) = CCP_MEMTYPE_SB;
+	CCP5_CMD_LSB_ID(&desc) = op->sb_ctx;
+
+	return ccp5_do_cmd(&desc, op->cmd_q);
+}
+
+static int ccp5_perform_xts_aes(struct ccp_op *op)
+{
+	struct ccp5_desc desc;
+	union ccp_function function;
+	u32 key_addr = op->sb_key * LSB_ITEM_SIZE;
+
+	/* Zero out all the fields of the command desc */
+	memset(&desc, 0, Q_DESC_SIZE);
+
+	CCP5_CMD_ENGINE(&desc) = CCP_ENGINE_XTS_AES_128;
+
+	CCP5_CMD_SOC(&desc) = op->soc;
+	CCP5_CMD_IOC(&desc) = 1;
+	CCP5_CMD_INIT(&desc) = op->init;
+	CCP5_CMD_EOM(&desc) = op->eom;
+	CCP5_CMD_PROT(&desc) = 0;
+
+	function.raw = 0;
+	CCP_XTS_ENCRYPT(&function) = op->u.xts.action;
+	CCP_XTS_SIZE(&function) = op->u.xts.unit_size;
+	CCP5_CMD_FUNCTION(&desc) = function.raw;
+
+	CCP5_CMD_LEN(&desc) = op->src.u.dma.length;
+
+	CCP5_CMD_SRC_LO(&desc) = ccp_addr_lo(&op->src.u.dma);
+	CCP5_CMD_SRC_HI(&desc) = ccp_addr_hi(&op->src.u.dma);
+	CCP5_CMD_SRC_MEM(&desc) = CCP_MEMTYPE_SYSTEM;
+
+	CCP5_CMD_DST_LO(&desc) = ccp_addr_lo(&op->dst.u.dma);
+	CCP5_CMD_DST_HI(&desc) = ccp_addr_hi(&op->dst.u.dma);
+	CCP5_CMD_DST_MEM(&desc) = CCP_MEMTYPE_SYSTEM;
+
+	CCP5_CMD_KEY_LO(&desc) = lower_32_bits(key_addr);
+	CCP5_CMD_KEY_HI(&desc) = 0;
+	CCP5_CMD_KEY_MEM(&desc) = CCP_MEMTYPE_SB;
+	CCP5_CMD_LSB_ID(&desc) = op->sb_ctx;
+
+	return ccp5_do_cmd(&desc, op->cmd_q);
+}
+
+static int ccp5_perform_sha(struct ccp_op *op)
+{
+	struct ccp5_desc desc;
+	union ccp_function function;
+
+	/* Zero out all the fields of the command desc */
+	memset(&desc, 0, Q_DESC_SIZE);
+
+	CCP5_CMD_ENGINE(&desc) = CCP_ENGINE_SHA;
+
+	CCP5_CMD_SOC(&desc) = op->soc;
+	CCP5_CMD_IOC(&desc) = 1;
+	CCP5_CMD_INIT(&desc) = 1;
+	CCP5_CMD_EOM(&desc) = op->eom;
+	CCP5_CMD_PROT(&desc) = 0;
+
+	function.raw = 0;
+	CCP_SHA_TYPE(&function) = op->u.sha.type;
+	CCP5_CMD_FUNCTION(&desc) = function.raw;
+
+	CCP5_CMD_LEN(&desc) = op->src.u.dma.length;
+
+	CCP5_CMD_SRC_LO(&desc) = ccp_addr_lo(&op->src.u.dma);
+	CCP5_CMD_SRC_HI(&desc) = ccp_addr_hi(&op->src.u.dma);
+	CCP5_CMD_SRC_MEM(&desc) = CCP_MEMTYPE_SYSTEM;
+
+	CCP5_CMD_LSB_ID(&desc) = op->sb_ctx;
+
+	if (op->eom) {
+		CCP5_CMD_SHA_LO(&desc) = lower_32_bits(op->u.sha.msg_bits);
+		CCP5_CMD_SHA_HI(&desc) = upper_32_bits(op->u.sha.msg_bits);
+	} else {
+		CCP5_CMD_SHA_LO(&desc) = 0;
+		CCP5_CMD_SHA_HI(&desc) = 0;
+	}
+
+	return ccp5_do_cmd(&desc, op->cmd_q);
+}
+
+static int ccp5_perform_rsa(struct ccp_op *op)
+{
+	struct ccp5_desc desc;
+	union ccp_function function;
+
+	/* Zero out all the fields of the command desc */
+	memset(&desc, 0, Q_DESC_SIZE);
+
+	CCP5_CMD_ENGINE(&desc) = CCP_ENGINE_RSA;
+
+	CCP5_CMD_SOC(&desc) = op->soc;
+	CCP5_CMD_IOC(&desc) = 1;
+	CCP5_CMD_INIT(&desc) = 0;
+	CCP5_CMD_EOM(&desc) = 1;
+	CCP5_CMD_PROT(&desc) = 0;
+
+	function.raw = 0;
+	CCP_RSA_SIZE(&function) = op->u.rsa.mod_size;
+	CCP5_CMD_FUNCTION(&desc) = function.raw;
+
+	CCP5_CMD_LEN(&desc) = op->u.rsa.input_len;
+
+	/* Source is from external memory */
+	CCP5_CMD_SRC_LO(&desc) = ccp_addr_lo(&op->src.u.dma);
+	CCP5_CMD_SRC_HI(&desc) = ccp_addr_hi(&op->src.u.dma);
+	CCP5_CMD_SRC_MEM(&desc) = CCP_MEMTYPE_SYSTEM;
+
+	/* Destination is in external memory */
+	CCP5_CMD_DST_LO(&desc) = ccp_addr_lo(&op->dst.u.dma);
+	CCP5_CMD_DST_HI(&desc) = ccp_addr_hi(&op->dst.u.dma);
+	CCP5_CMD_DST_MEM(&desc) = CCP_MEMTYPE_SYSTEM;
+
+	/* Key (Exponent) is in external memory */
+	CCP5_CMD_KEY_LO(&desc) = ccp_addr_lo(&op->exp.u.dma);
+	CCP5_CMD_KEY_HI(&desc) = ccp_addr_hi(&op->exp.u.dma);
+	CCP5_CMD_KEY_MEM(&desc) = CCP_MEMTYPE_SYSTEM;
+
+	return ccp5_do_cmd(&desc, op->cmd_q);
+}
+
+static int ccp5_perform_passthru(struct ccp_op *op)
+{
+	struct ccp5_desc desc;
+	union ccp_function function;
+	struct ccp_dma_info *saddr = &op->src.u.dma;
+	struct ccp_dma_info *daddr = &op->dst.u.dma;
+
+	memset(&desc, 0, Q_DESC_SIZE);
+
+	CCP5_CMD_ENGINE(&desc) = CCP_ENGINE_PASSTHRU;
+
+	CCP5_CMD_SOC(&desc) = 0;
+	CCP5_CMD_IOC(&desc) = 1;
+	CCP5_CMD_INIT(&desc) = 0;
+	CCP5_CMD_EOM(&desc) = op->eom;
+	CCP5_CMD_PROT(&desc) = 0;
+
+	function.raw = 0;
+	CCP_PT_BYTESWAP(&function) = op->u.passthru.byte_swap;
+	CCP_PT_BITWISE(&function) = op->u.passthru.bit_mod;
+	CCP5_CMD_FUNCTION(&desc) = function.raw;
+
+	/* Take the length from whichever side is in system memory */
+	if (op->src.type == CCP_MEMTYPE_SYSTEM)
+		CCP5_CMD_LEN(&desc) = saddr->length;
+	else
+		CCP5_CMD_LEN(&desc) = daddr->length;
+
+	if (op->src.type == CCP_MEMTYPE_SYSTEM) {
+		CCP5_CMD_SRC_LO(&desc) = ccp_addr_lo(&op->src.u.dma);
+		CCP5_CMD_SRC_HI(&desc) = ccp_addr_hi(&op->src.u.dma);
+		CCP5_CMD_SRC_MEM(&desc) = CCP_MEMTYPE_SYSTEM;
+
+		if (op->u.passthru.bit_mod != CCP_PASSTHRU_BITWISE_NOOP)
+			CCP5_CMD_LSB_ID(&desc) = op->sb_key;
+	} else {
+		u32 key_addr = op->src.u.sb * CCP_SB_BYTES;
+
+		CCP5_CMD_SRC_LO(&desc) = lower_32_bits(key_addr);
+		CCP5_CMD_SRC_HI(&desc) = 0;
+		CCP5_CMD_SRC_MEM(&desc) = CCP_MEMTYPE_SB;
+	}
+
+	if (op->dst.type == CCP_MEMTYPE_SYSTEM) {
+		CCP5_CMD_DST_LO(&desc) = ccp_addr_lo(&op->dst.u.dma);
+		CCP5_CMD_DST_HI(&desc) = ccp_addr_hi(&op->dst.u.dma);
+		CCP5_CMD_DST_MEM(&desc) = CCP_MEMTYPE_SYSTEM;
+	} else {
+		u32 key_addr = op->dst.u.sb * CCP_SB_BYTES;
+
+		CCP5_CMD_DST_LO(&desc) = lower_32_bits(key_addr);
+		CCP5_CMD_DST_HI(&desc) = 0;
+		CCP5_CMD_DST_MEM(&desc) = CCP_MEMTYPE_SB;
+	}
+
+	return ccp5_do_cmd(&desc, op->cmd_q);
+}
+
+static int ccp5_perform_ecc(struct ccp_op *op)
+{
+	struct ccp5_desc desc;
+	union ccp_function function;
+
+	/* Zero out all the fields of the command desc */
+	memset(&desc, 0, Q_DESC_SIZE);
+
+	CCP5_CMD_ENGINE(&desc) = CCP_ENGINE_ECC;
+
+	CCP5_CMD_SOC(&desc) = 0;
+	CCP5_CMD_IOC(&desc) = 1;
+	CCP5_CMD_INIT(&desc) = 0;
+	CCP5_CMD_EOM(&desc) = 1;
+	CCP5_CMD_PROT(&desc) = 0;
+
+	function.raw = 0;
+	function.ecc.mode = op->u.ecc.function;
+	CCP5_CMD_FUNCTION(&desc) = function.raw;
+
+	CCP5_CMD_LEN(&desc) = op->src.u.dma.length;
+
+	CCP5_CMD_SRC_LO(&desc) = ccp_addr_lo(&op->src.u.dma);
+	CCP5_CMD_SRC_HI(&desc) = ccp_addr_hi(&op->src.u.dma);
+	CCP5_CMD_SRC_MEM(&desc) = CCP_MEMTYPE_SYSTEM;
+
+	CCP5_CMD_DST_LO(&desc) = ccp_addr_lo(&op->dst.u.dma);
+	CCP5_CMD_DST_HI(&desc) = ccp_addr_hi(&op->dst.u.dma);
+	CCP5_CMD_DST_MEM(&desc) = CCP_MEMTYPE_SYSTEM;
+
+	return ccp5_do_cmd(&desc, op->cmd_q);
+}
+
+static int ccp_find_lsb_regions(struct ccp_cmd_queue *cmd_q, u64 status)
+{
+	int q_mask = 1 << cmd_q->id;
+	int queues = 0;
+	int j;
+
+	/* Build a bit mask to know which LSBs this queue has access to.
+	 * Don't bother with segment 0 as it has special privileges.
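+	 * Each region appears in the status word as an
+	 * LSB_REGION_WIDTH-bit group with one bit per queue, so queue N
+	 * simply tests bit N of each successive group.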
+	 */
+	for (j = 1; j < MAX_LSB_CNT; j++) {
+		status >>= LSB_REGION_WIDTH;
+		if (status & q_mask)
+			bitmap_set(cmd_q->lsbmask, j, 1);
+	}
+	queues = bitmap_weight(cmd_q->lsbmask, MAX_LSB_CNT);
+	dev_info(cmd_q->ccp->dev, "Queue %d can access %d LSB regions\n",
+		 cmd_q->id, queues);
+
+	return queues ? 0 : -EINVAL;
+}
+
+
+static int ccp_find_and_assign_lsb_to_q(struct ccp_device *ccp,
+					int lsb_cnt, int n_lsbs,
+					unsigned long *lsb_pub)
+{
+	DECLARE_BITMAP(qlsb, MAX_LSB_CNT);
+	int bitno;
+	int qlsb_wgt;
+	int i;
+
+	/* For each queue:
+	 * If the count of potential LSBs available to a queue matches the
+	 * ordinal given to us in lsb_cnt:
+	 * Copy the mask of possible LSBs for this queue into "qlsb";
+	 * For each bit in qlsb, see if the corresponding bit in the
+	 * aggregation mask is set; if so, we have a match.
+	 *     If we have a match, clear the bit in the aggregation to
+	 *     mark it as no longer available.
+	 *     If there is no match, clear the bit in qlsb and keep looking.
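+	 *
+	 * For instance, a queue whose mask weight is 1 (it can reach a
+	 * single LSB) is served on the lsb_cnt == 1 pass and claims that
+	 * LSB; a queue that can reach three LSBs is handled on a later
+	 * pass and takes whichever of its candidates is still public.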
+	 */
+	for (i = 0; i < ccp->cmd_q_count; i++) {
+		struct ccp_cmd_queue *cmd_q = &ccp->cmd_q[i];
+
+		qlsb_wgt = bitmap_weight(cmd_q->lsbmask, MAX_LSB_CNT);
+
+		if (qlsb_wgt == lsb_cnt) {
+			bitmap_copy(qlsb, cmd_q->lsbmask, MAX_LSB_CNT);
+
+			bitno = find_first_bit(qlsb, MAX_LSB_CNT);
+			while (bitno < MAX_LSB_CNT) {
+				if (test_bit(bitno, lsb_pub)) {
+					/* We found an available LSB
+					 * that this queue can access
+					 */
+					cmd_q->lsb = bitno;
+					bitmap_clear(lsb_pub, bitno, 1);
+					dev_info(ccp->dev,
+						 "Queue %d gets LSB %d\n",
+						 i, bitno);
+					break;
+				}
+				bitmap_clear(qlsb, bitno, 1);
+				bitno = find_first_bit(qlsb, MAX_LSB_CNT);
+			}
+			if (bitno >= MAX_LSB_CNT)
+				return -EINVAL;
+			n_lsbs--;
+		}
+	}
+	return n_lsbs;
+}
+
+/* For each queue, from the most- to least-constrained:
+ * find an LSB that can be assigned to the queue. If there are N queues that
+ * can only use M LSBs, where N > M, fail; otherwise, every queue will get a
+ * dedicated LSB. Remaining LSB regions become a shared resource.
+ * If we have fewer LSBs than queues, all LSB regions become shared resources.
+ */
+static int ccp_assign_lsbs(struct ccp_device *ccp)
+{
+	DECLARE_BITMAP(lsb_pub, MAX_LSB_CNT);
+	DECLARE_BITMAP(qlsb, MAX_LSB_CNT);
+	int n_lsbs = 0;
+	int bitno;
+	int i, lsb_cnt;
+	int rc = 0;
+
+	bitmap_zero(lsb_pub, MAX_LSB_CNT);
+
+	/* Create an aggregate bitmap to get a total count of available LSBs */
+	for (i = 0; i < ccp->cmd_q_count; i++)
+		bitmap_or(lsb_pub,
+			  lsb_pub, ccp->cmd_q[i].lsbmask,
+			  MAX_LSB_CNT);
+
+	n_lsbs = bitmap_weight(lsb_pub, MAX_LSB_CNT);
+
+	if (n_lsbs >= ccp->cmd_q_count) {
+		/* We have enough LSBS to give every queue a private LSB.
+		 * Brute force search to start with the queues that are more
+		 * constrained in LSB choice. When an LSB is privately
+		 * assigned, it is removed from the public mask.
+		 * This is an ugly N squared algorithm with some optimization.
+		 */
+		for (lsb_cnt = 1;
+		     n_lsbs && (lsb_cnt <= MAX_LSB_CNT);
+		     lsb_cnt++) {
+			rc = ccp_find_and_assign_lsb_to_q(ccp, lsb_cnt, n_lsbs,
+							  lsb_pub);
+			if (rc < 0)
+				return -EINVAL;
+			n_lsbs = rc;
+		}
+	}
+
+	rc = 0;
+	/* What's left of the LSBs, according to the public mask, now become
+	 * shared. Any zero bits in the lsb_pub mask represent an LSB region
+	 * that can't be used as a shared resource, so mark the LSB slots for
+	 * them as "in use".
+	 */
+	bitmap_copy(qlsb, lsb_pub, MAX_LSB_CNT);
+
+	bitno = find_first_zero_bit(qlsb, MAX_LSB_CNT);
+	while (bitno < MAX_LSB_CNT) {
+		bitmap_set(ccp->lsbmap, bitno * LSB_SIZE, LSB_SIZE);
+		bitmap_set(qlsb, bitno, 1);
+		bitno = find_first_zero_bit(qlsb, MAX_LSB_CNT);
+	}
+
+	return rc;
+}
+
+static int ccp5_init(struct ccp_device *ccp)
+{
+	struct device *dev = ccp->dev;
+	struct ccp_cmd_queue *cmd_q;
+	struct dma_pool *dma_pool;
+	char dma_pool_name[MAX_DMAPOOL_NAME_LEN];
+	unsigned int qmr, qim, i;
+	u64 status;
+	u32 status_lo, status_hi;
+	int ret;
+
+	/* Find available queues */
+	qim = 0;
+	qmr = ioread32(ccp->io_regs + Q_MASK_REG);
+	for (i = 0; i < MAX_HW_QUEUES; i++) {
+
+		if (!(qmr & (1 << i)))
+			continue;
+
+		/* Allocate a dma pool for this queue */
+		snprintf(dma_pool_name, sizeof(dma_pool_name), "%s_q%d",
+			 ccp->name, i);
+		dma_pool = dma_pool_create(dma_pool_name, dev,
+					   CCP_DMAPOOL_MAX_SIZE,
+					   CCP_DMAPOOL_ALIGN, 0);
+		if (!dma_pool) {
+			dev_err(dev, "unable to allocate dma pool\n");
+			ret = -ENOMEM;
+			goto e_pool;
+		}
+
+		cmd_q = &ccp->cmd_q[ccp->cmd_q_count];
+		ccp->cmd_q_count++;
+
+		cmd_q->ccp = ccp;
+		cmd_q->id = i;
+		cmd_q->dma_pool = dma_pool;
+		mutex_init(&cmd_q->q_mutex);
+
+		/* Page alignment satisfies our needs for N <= 128 */
+		BUILD_BUG_ON(COMMANDS_PER_QUEUE > 128);
+		cmd_q->qsize = Q_SIZE(Q_DESC_SIZE);
+		cmd_q->qbase = dma_zalloc_coherent(dev, cmd_q->qsize,
+						   &cmd_q->qbase_dma,
+						   GFP_KERNEL);
+		if (!cmd_q->qbase) {
+			dev_err(dev, "unable to allocate command queue\n");
+			ret = -ENOMEM;
+			goto e_pool;
+		}
+
+		cmd_q->qidx = 0;
+		/* Preset some register values and masks that are queue
+		 * number dependent
+		 */
+		cmd_q->reg_control = ccp->io_regs +
+				     CMD5_Q_STATUS_INCR * (i + 1);
+		cmd_q->reg_tail_lo = cmd_q->reg_control + CMD5_Q_TAIL_LO_BASE;
+		cmd_q->reg_head_lo = cmd_q->reg_control + CMD5_Q_HEAD_LO_BASE;
+		cmd_q->reg_int_enable = cmd_q->reg_control +
+					CMD5_Q_INT_ENABLE_BASE;
+		cmd_q->reg_interrupt_status = cmd_q->reg_control +
+					      CMD5_Q_INTERRUPT_STATUS_BASE;
+		cmd_q->reg_status = cmd_q->reg_control + CMD5_Q_STATUS_BASE;
+		cmd_q->reg_int_status = cmd_q->reg_control +
+					CMD5_Q_INT_STATUS_BASE;
+		cmd_q->reg_dma_status = cmd_q->reg_control +
+					CMD5_Q_DMA_STATUS_BASE;
+		cmd_q->reg_dma_read_status = cmd_q->reg_control +
+					     CMD5_Q_DMA_READ_STATUS_BASE;
+		cmd_q->reg_dma_write_status = cmd_q->reg_control +
+					      CMD5_Q_DMA_WRITE_STATUS_BASE;
+
+		init_waitqueue_head(&cmd_q->int_queue);
+
+		dev_dbg(dev, "queue #%u available\n", i);
+	}
+	if (ccp->cmd_q_count == 0) {
+		dev_notice(dev, "no command queues available\n");
+		ret = -EIO;
+		goto e_pool;
+	}
+	dev_notice(dev, "%u command queues available\n", ccp->cmd_q_count);
+
+	/* Turn off the queues and disable interrupts until ready */
+	for (i = 0; i < ccp->cmd_q_count; i++) {
+		cmd_q = &ccp->cmd_q[i];
+
+		cmd_q->qcontrol = 0; /* Start with nothing */
+		iowrite32(cmd_q->qcontrol, cmd_q->reg_control);
+
+		/* Disable the interrupts */
+		iowrite32(0x00, cmd_q->reg_int_enable);
+		ioread32(cmd_q->reg_int_status);
+		ioread32(cmd_q->reg_status);
+
+		/* Clear the interrupts */
+		iowrite32(ALL_INTERRUPTS, cmd_q->reg_interrupt_status);
+	}
+
+	dev_dbg(dev, "Requesting an IRQ...\n");
+	/* Request an irq */
+	ret = ccp->get_irq(ccp);
+	if (ret) {
+		dev_err(dev, "unable to allocate an IRQ\n");
+		goto e_pool;
+	}
+
+	/* Initialize the queue used to suspend */
+	init_waitqueue_head(&ccp->suspend_queue);
+
+	dev_dbg(dev, "Loading LSB map...\n");
+	/* Copy the private LSB mask to the public registers */
+	status_lo = ioread32(ccp->io_regs + LSB_PRIVATE_MASK_LO_OFFSET);
+	status_hi = ioread32(ccp->io_regs + LSB_PRIVATE_MASK_HI_OFFSET);
+	iowrite32(status_lo, ccp->io_regs + LSB_PUBLIC_MASK_LO_OFFSET);
+	iowrite32(status_hi, ccp->io_regs + LSB_PUBLIC_MASK_HI_OFFSET);
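+	/* The combined mask is treated as 40 bits wide: 30 bits in the
+	 * LO register and 10 in HI (cf. the 0x3FFFFFFF/0x000003FF
+	 * private-mask values in ccp5other_config()), hence the shift
+	 * by 30 rather than 32.
+	 */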
+	status = ((u64)status_hi << 30) | (u64)status_lo;
+
+	dev_dbg(dev, "Configuring virtual queues...\n");
+	/* Configure size of each virtual queue accessible to host */
+	for (i = 0; i < ccp->cmd_q_count; i++) {
+		u32 dma_addr_lo;
+		u32 dma_addr_hi;
+
+		cmd_q = &ccp->cmd_q[i];
+
+		cmd_q->qcontrol &= ~(CMD5_Q_SIZE << CMD5_Q_SHIFT);
+		cmd_q->qcontrol |= QUEUE_SIZE_VAL << CMD5_Q_SHIFT;
+
+		cmd_q->qdma_tail = cmd_q->qbase_dma;
+		dma_addr_lo = low_address(cmd_q->qdma_tail);
+		iowrite32((u32)dma_addr_lo, cmd_q->reg_tail_lo);
+		iowrite32((u32)dma_addr_lo, cmd_q->reg_head_lo);
+
+		dma_addr_hi = high_address(cmd_q->qdma_tail);
+		cmd_q->qcontrol |= (dma_addr_hi << 16);
+		iowrite32(cmd_q->qcontrol, cmd_q->reg_control);
+
+		/* Find the LSB regions accessible to the queue */
+		ccp_find_lsb_regions(cmd_q, status);
+		cmd_q->lsb = -1; /* Unassigned value */
+	}
+
+	dev_dbg(dev, "Assigning LSBs...\n");
+	ret = ccp_assign_lsbs(ccp);
+	if (ret) {
+		dev_err(dev, "Unable to assign LSBs (%d)\n", ret);
+		goto e_irq;
+	}
+
+	/* Optimization: pre-allocate LSB slots for each queue */
+	for (i = 0; i < ccp->cmd_q_count; i++) {
+		ccp->cmd_q[i].sb_key = ccp_lsb_alloc(&ccp->cmd_q[i], 2);
+		ccp->cmd_q[i].sb_ctx = ccp_lsb_alloc(&ccp->cmd_q[i], 2);
+	}
+
+	dev_dbg(dev, "Starting threads...\n");
+	/* Create a kthread for each queue */
+	for (i = 0; i < ccp->cmd_q_count; i++) {
+		struct task_struct *kthread;
+
+		cmd_q = &ccp->cmd_q[i];
+
+		kthread = kthread_create(ccp_cmd_queue_thread, cmd_q,
+					 "%s-q%u", ccp->name, cmd_q->id);
+		if (IS_ERR(kthread)) {
+			dev_err(dev, "error creating queue thread (%ld)\n",
+				PTR_ERR(kthread));
+			ret = PTR_ERR(kthread);
+			goto e_kthread;
+		}
+
+		cmd_q->kthread = kthread;
+		wake_up_process(kthread);
+	}
+
+	dev_dbg(dev, "Enabling interrupts...\n");
+	/* Enable interrupts */
+	for (i = 0; i < ccp->cmd_q_count; i++) {
+		cmd_q = &ccp->cmd_q[i];
+		iowrite32(ALL_INTERRUPTS, cmd_q->reg_int_enable);
+	}
+
+	dev_dbg(dev, "Registering device...\n");
+	/* Put this on the unit list to make it available */
+	ccp_add_device(ccp);
+
+	ret = ccp_register_rng(ccp);
+	if (ret)
+		goto e_kthread;
+
+	/* Register the DMA engine support */
+	ret = ccp_dmaengine_register(ccp);
+	if (ret)
+		goto e_hwrng;
+
+	return 0;
+
+e_hwrng:
+	ccp_unregister_rng(ccp);
+
+e_kthread:
+	for (i = 0; i < ccp->cmd_q_count; i++)
+		if (ccp->cmd_q[i].kthread)
+			kthread_stop(ccp->cmd_q[i].kthread);
+
+e_irq:
+	ccp->free_irq(ccp);
+
+e_pool:
+	for (i = 0; i < ccp->cmd_q_count; i++)
+		dma_pool_destroy(ccp->cmd_q[i].dma_pool);
+
+	return ret;
+}
+
+static void ccp5_destroy(struct ccp_device *ccp)
+{
+	struct device *dev = ccp->dev;
+	struct ccp_cmd_queue *cmd_q;
+	struct ccp_cmd *cmd;
+	unsigned int i;
+
+	/* Unregister the DMA engine */
+	ccp_dmaengine_unregister(ccp);
+
+	/* Unregister the RNG */
+	ccp_unregister_rng(ccp);
+
+	/* Remove this device from the list of available units first */
+	ccp_del_device(ccp);
+
+	/* Disable and clear interrupts */
+	for (i = 0; i < ccp->cmd_q_count; i++) {
+		cmd_q = &ccp->cmd_q[i];
+
+		/* Turn off the run bit */
+		iowrite32(cmd_q->qcontrol & ~CMD5_Q_RUN, cmd_q->reg_control);
+
+		/* Disable the interrupts */
+		iowrite32(ALL_INTERRUPTS, cmd_q->reg_interrupt_status);
+
+		/* Clear the interrupt status */
+		iowrite32(0x00, cmd_q->reg_int_enable);
+		ioread32(cmd_q->reg_int_status);
+		ioread32(cmd_q->reg_status);
+	}
+
+	/* Stop the queue kthreads */
+	for (i = 0; i < ccp->cmd_q_count; i++)
+		if (ccp->cmd_q[i].kthread)
+			kthread_stop(ccp->cmd_q[i].kthread);
+
+	ccp->free_irq(ccp);
+
+	for (i = 0; i < ccp->cmd_q_count; i++) {
+		cmd_q = &ccp->cmd_q[i];
+		dma_free_coherent(dev, cmd_q->qsize, cmd_q->qbase,
+				  cmd_q->qbase_dma);
+	}
+
+	/* Flush the cmd and backlog queue */
+	while (!list_empty(&ccp->cmd)) {
+		/* Invoke the callback directly with an error code */
+		cmd = list_first_entry(&ccp->cmd, struct ccp_cmd, entry);
+		list_del(&cmd->entry);
+		cmd->callback(cmd->data, -ENODEV);
+	}
+	while (!list_empty(&ccp->backlog)) {
+		/* Invoke the callback directly with an error code */
+		cmd = list_first_entry(&ccp->backlog, struct ccp_cmd, entry);
+		list_del(&cmd->entry);
+		cmd->callback(cmd->data, -ENODEV);
+	}
+}
+
+static irqreturn_t ccp5_irq_handler(int irq, void *data)
+{
+	struct device *dev = data;
+	struct ccp_device *ccp = dev_get_drvdata(dev);
+	u32 status;
+	unsigned int i;
+
+	for (i = 0; i < ccp->cmd_q_count; i++) {
+		struct ccp_cmd_queue *cmd_q = &ccp->cmd_q[i];
+
+		status = ioread32(cmd_q->reg_interrupt_status);
+
+		if (status) {
+			cmd_q->int_status = status;
+			cmd_q->q_status = ioread32(cmd_q->reg_status);
+			cmd_q->q_int_status = ioread32(cmd_q->reg_int_status);
+
+			/* On error, only save the first error value */
+			if ((status & INT_ERROR) && !cmd_q->cmd_error)
+				cmd_q->cmd_error = CMD_Q_ERROR(cmd_q->q_status);
+
+			cmd_q->int_rcvd = 1;
+
+			/* Acknowledge the interrupt and wake the kthread */
+			iowrite32(ALL_INTERRUPTS, cmd_q->reg_interrupt_status);
+			wake_up_interruptible(&cmd_q->int_queue);
+		}
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void ccp5_config(struct ccp_device *ccp)
+{
+	/* Public side */
+	iowrite32(0x00001249, ccp->io_regs + CMD5_REQID_CONFIG_OFFSET);
+}
+
+static void ccp5other_config(struct ccp_device *ccp)
+{
+	int i;
+	u32 rnd;
+
+	/* We own all of the queues on the NTB CCP */
+
+	iowrite32(0x00012D57, ccp->io_regs + CMD5_TRNG_CTL_OFFSET);
+	iowrite32(0x00000003, ccp->io_regs + CMD5_CONFIG_0_OFFSET);
+	for (i = 0; i < 12; i++) {
+		rnd = ioread32(ccp->io_regs + TRNG_OUT_REG);
+		iowrite32(rnd, ccp->io_regs + CMD5_AES_MASK_OFFSET);
+	}
+
+	iowrite32(0x0000001F, ccp->io_regs + CMD5_QUEUE_MASK_OFFSET);
+	iowrite32(0x00005B6D, ccp->io_regs + CMD5_QUEUE_PRIO_OFFSET);
+	iowrite32(0x00000000, ccp->io_regs + CMD5_CMD_TIMEOUT_OFFSET);
+
+	iowrite32(0x3FFFFFFF, ccp->io_regs + LSB_PRIVATE_MASK_LO_OFFSET);
+	iowrite32(0x000003FF, ccp->io_regs + LSB_PRIVATE_MASK_HI_OFFSET);
+
+	iowrite32(0x00108823, ccp->io_regs + CMD5_CLK_GATE_CTL_OFFSET);
+
+	ccp5_config(ccp);
+}
+
+/* Version 5 adds some function, but is essentially the same as v3 */
+static const struct ccp_actions ccp5_actions = {
+	.aes = ccp5_perform_aes,
+	.xts_aes = ccp5_perform_xts_aes,
+	.sha = ccp5_perform_sha,
+	.rsa = ccp5_perform_rsa,
+	.passthru = ccp5_perform_passthru,
+	.ecc = ccp5_perform_ecc,
+	.sballoc = ccp_lsb_alloc,
+	.sbfree = ccp_lsb_free,
+	.init = ccp5_init,
+	.destroy = ccp5_destroy,
+	.get_free_slots = ccp5_get_free_slots,
+	.irqhandler = ccp5_irq_handler,
+};
+
+const struct ccp_vdata ccpv5a = {
+	.version = CCP_VERSION(5, 0),
+	.setup = ccp5_config,
+	.perform = &ccp5_actions,
+	.bar = 2,
+	.offset = 0x0,
+};
+
+const struct ccp_vdata ccpv5b = {
+	.version = CCP_VERSION(5, 0),
+	.setup = ccp5other_config,
+	.perform = &ccp5_actions,
+	.bar = 2,
+	.offset = 0x0,
+};
diff --git a/drivers/crypto/ccp/ccp-dev.c b/drivers/crypto/ccp/ccp-dev.c
index 87b9f2b..cafa633 100644
--- a/drivers/crypto/ccp/ccp-dev.c
+++ b/drivers/crypto/ccp/ccp-dev.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2013,2016 Advanced Micro Devices, Inc.
  *
  * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ * Author: Gary R Hook <gary.hook@amd.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -39,6 +40,59 @@
 	struct ccp_cmd *cmd;
 };
 
+/* Human-readable error strings */
+char *ccp_error_codes[] = {
+	"",
+	"ERR 01: ILLEGAL_ENGINE",
+	"ERR 02: ILLEGAL_KEY_ID",
+	"ERR 03: ILLEGAL_FUNCTION_TYPE",
+	"ERR 04: ILLEGAL_FUNCTION_MODE",
+	"ERR 05: ILLEGAL_FUNCTION_ENCRYPT",
+	"ERR 06: ILLEGAL_FUNCTION_SIZE",
+	"ERR 07: Zlib_MISSING_INIT_EOM",
+	"ERR 08: ILLEGAL_FUNCTION_RSVD",
+	"ERR 09: ILLEGAL_BUFFER_LENGTH",
+	"ERR 10: VLSB_FAULT",
+	"ERR 11: ILLEGAL_MEM_ADDR",
+	"ERR 12: ILLEGAL_MEM_SEL",
+	"ERR 13: ILLEGAL_CONTEXT_ID",
+	"ERR 14: ILLEGAL_KEY_ADDR",
+	"ERR 15: 0xF Reserved",
+	"ERR 16: Zlib_ILLEGAL_MULTI_QUEUE",
+	"ERR 17: Zlib_ILLEGAL_JOBID_CHANGE",
+	"ERR 18: CMD_TIMEOUT",
+	"ERR 19: IDMA0_AXI_SLVERR",
+	"ERR 20: IDMA0_AXI_DECERR",
+	"ERR 21: 0x15 Reserved",
+	"ERR 22: IDMA1_AXI_SLAVE_FAULT",
+	"ERR 23: IDMA1_AIXI_DECERR",
+	"ERR 24: 0x18 Reserved",
+	"ERR 25: ZLIBVHB_AXI_SLVERR",
+	"ERR 26: ZLIBVHB_AXI_DECERR",
+	"ERR 27: 0x1B Reserved",
+	"ERR 27: ZLIB_UNEXPECTED_EOM",
+	"ERR 27: ZLIB_EXTRA_DATA",
+	"ERR 30: ZLIB_BTYPE",
+	"ERR 31: ZLIB_UNDEFINED_SYMBOL",
+	"ERR 32: ZLIB_UNDEFINED_DISTANCE_S",
+	"ERR 33: ZLIB_CODE_LENGTH_SYMBOL",
+	"ERR 34: ZLIB _VHB_ILLEGAL_FETCH",
+	"ERR 35: ZLIB_UNCOMPRESSED_LEN",
+	"ERR 36: ZLIB_LIMIT_REACHED",
+	"ERR 37: ZLIB_CHECKSUM_MISMATCH0",
+	"ERR 38: ODMA0_AXI_SLVERR",
+	"ERR 39: ODMA0_AXI_DECERR",
+	"ERR 40: 0x28 Reserved",
+	"ERR 41: ODMA1_AXI_SLVERR",
+	"ERR 42: ODMA1_AXI_DECERR",
+	"ERR 43: LSB_PARITY_ERR",
+};
+
+void ccp_log_error(struct ccp_device *d, int e)
+{
+	dev_err(d->dev, "CCP error: %s (0x%x)\n", ccp_error_codes[e], e);
+}
+
 /* List of CCPs, CCP count, read-write access lock, and access functions
  *
  * Lock structure: get ccp_unit_lock for reading whenever we need to
@@ -58,7 +112,7 @@
 
 /* Ever-increasing value to produce unique unit numbers */
 static atomic_t ccp_unit_ordinal;
-unsigned int ccp_increment_unit_ordinal(void)
+static unsigned int ccp_increment_unit_ordinal(void)
 {
 	return atomic_inc_return(&ccp_unit_ordinal);
 }
@@ -118,6 +172,29 @@
 	write_unlock_irqrestore(&ccp_unit_lock, flags);
 }
 
+
+int ccp_register_rng(struct ccp_device *ccp)
+{
+	int ret = 0;
+
+	dev_dbg(ccp->dev, "Registering RNG...\n");
+	/* Register an RNG */
+	ccp->hwrng.name = ccp->rngname;
+	ccp->hwrng.read = ccp_trng_read;
+	ret = hwrng_register(&ccp->hwrng);
+	if (ret)
+		dev_err(ccp->dev, "error registering hwrng (%d)\n", ret);
+
+	return ret;
+}
+
+void ccp_unregister_rng(struct ccp_device *ccp)
+{
+	if (ccp->hwrng.name)
+		hwrng_unregister(&ccp->hwrng);
+}
+
 static struct ccp_device *ccp_get_device(void)
 {
 	unsigned long flags;
@@ -397,9 +474,9 @@
 
 	spin_lock_init(&ccp->cmd_lock);
 	mutex_init(&ccp->req_mutex);
-	mutex_init(&ccp->ksb_mutex);
-	ccp->ksb_count = KSB_COUNT;
-	ccp->ksb_start = 0;
+	mutex_init(&ccp->sb_mutex);
+	ccp->sb_count = KSB_COUNT;
+	ccp->sb_start = 0;
 
 	ccp->ord = ccp_increment_unit_ordinal();
 	snprintf(ccp->name, MAX_CCP_NAME_LEN, "ccp-%u", ccp->ord);
@@ -408,6 +485,34 @@
 	return ccp;
 }
 
+int ccp_trng_read(struct hwrng *rng, void *data, size_t max, bool wait)
+{
+	struct ccp_device *ccp = container_of(rng, struct ccp_device, hwrng);
+	u32 trng_value;
+	int len = min_t(int, sizeof(trng_value), max);
+
+	/* Locking is provided by the caller so we can update device
+	 * hwrng-related fields safely
+	 */
+	trng_value = ioread32(ccp->io_regs + TRNG_OUT_REG);
+	if (!trng_value) {
+		/* Zero is returned if no data is available or if a
+		 * bad-entropy error is present. Assume an error if
+		 * we exceed TRNG_RETRIES reads of zero.
+		 */
+		if (ccp->hwrng_retries++ > TRNG_RETRIES)
+			return -EIO;
+
+		return 0;
+	}
+
+	/* Reset the counter and save the rng value */
+	ccp->hwrng_retries = 0;
+	memcpy(data, &trng_value, len);
+
+	return len;
+}
+
 #ifdef CONFIG_PM
 bool ccp_queues_suspended(struct ccp_device *ccp)
 {
diff --git a/drivers/crypto/ccp/ccp-dev.h b/drivers/crypto/ccp/ccp-dev.h
index bd41ffce..da5f4a6 100644
--- a/drivers/crypto/ccp/ccp-dev.h
+++ b/drivers/crypto/ccp/ccp-dev.h
@@ -4,6 +4,7 @@
  * Copyright (C) 2013,2016 Advanced Micro Devices, Inc.
  *
  * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ * Author: Gary R Hook <gary.hook@amd.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -60,7 +61,69 @@
 #define CMD_Q_ERROR(__qs)		((__qs) & 0x0000003f)
 #define CMD_Q_DEPTH(__qs)		(((__qs) >> 12) & 0x0000000f)
 
-/****** REQ0 Related Values ******/
+/* ------------------------ CCP Version 5 Specifics ------------------------ */
+#define CMD5_QUEUE_MASK_OFFSET		0x00
+#define	CMD5_QUEUE_PRIO_OFFSET		0x04
+#define CMD5_REQID_CONFIG_OFFSET	0x08
+#define	CMD5_CMD_TIMEOUT_OFFSET		0x10
+#define LSB_PUBLIC_MASK_LO_OFFSET	0x18
+#define LSB_PUBLIC_MASK_HI_OFFSET	0x1C
+#define LSB_PRIVATE_MASK_LO_OFFSET	0x20
+#define LSB_PRIVATE_MASK_HI_OFFSET	0x24
+
+#define CMD5_Q_CONTROL_BASE		0x0000
+#define CMD5_Q_TAIL_LO_BASE		0x0004
+#define CMD5_Q_HEAD_LO_BASE		0x0008
+#define CMD5_Q_INT_ENABLE_BASE		0x000C
+#define CMD5_Q_INTERRUPT_STATUS_BASE	0x0010
+
+#define CMD5_Q_STATUS_BASE		0x0100
+#define CMD5_Q_INT_STATUS_BASE		0x0104
+#define CMD5_Q_DMA_STATUS_BASE		0x0108
+#define CMD5_Q_DMA_READ_STATUS_BASE	0x010C
+#define CMD5_Q_DMA_WRITE_STATUS_BASE	0x0110
+#define CMD5_Q_ABORT_BASE		0x0114
+#define CMD5_Q_AX_CACHE_BASE		0x0118
+
+#define	CMD5_CONFIG_0_OFFSET		0x6000
+#define	CMD5_TRNG_CTL_OFFSET		0x6008
+#define	CMD5_AES_MASK_OFFSET		0x6010
+#define	CMD5_CLK_GATE_CTL_OFFSET	0x603C
+
+/* Address offset between two virtual queue registers */
+#define CMD5_Q_STATUS_INCR		0x1000
+
+/* Bit masks */
+#define CMD5_Q_RUN			0x1
+#define CMD5_Q_HALT			0x2
+#define CMD5_Q_MEM_LOCATION		0x4
+#define CMD5_Q_SIZE			0x1F
+#define CMD5_Q_SHIFT			3
+#define COMMANDS_PER_QUEUE		16
+#define QUEUE_SIZE_VAL			((ffs(COMMANDS_PER_QUEUE) - 2) & \
+					  CMD5_Q_SIZE)
+#define Q_PTR_MASK			((2 << (QUEUE_SIZE_VAL + 5)) - 1)
+#define Q_DESC_SIZE			sizeof(struct ccp5_desc)
+#define Q_SIZE(n)			(COMMANDS_PER_QUEUE*(n))
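+
+/* Worked example: with COMMANDS_PER_QUEUE = 16, ffs(16) = 5, so
+ * QUEUE_SIZE_VAL = 3 (log2 of the entry count minus one), and
+ * Q_SIZE(Q_DESC_SIZE) = 16 * 32 = 512 bytes of descriptor memory.
+ */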
+
+#define INT_COMPLETION			0x1
+#define INT_ERROR			0x2
+#define INT_QUEUE_STOPPED		0x4
+#define ALL_INTERRUPTS			(INT_COMPLETION| \
+					 INT_ERROR| \
+					 INT_QUEUE_STOPPED)
+
+#define LSB_REGION_WIDTH		5
+#define MAX_LSB_CNT			8
+
+#define LSB_SIZE			16
+#define LSB_ITEM_SIZE			32
+#define PLSB_MAP_SIZE			(LSB_SIZE)
+#define SLSB_MAP_SIZE			(MAX_LSB_CNT * LSB_SIZE)
+
+#define LSB_ENTRY_NUMBER(LSB_ADDR)	(LSB_ADDR / LSB_ITEM_SIZE)
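+
+/* Example: entry 2 of an LSB region sits at byte address
+ * 2 * LSB_ITEM_SIZE = 64, and LSB_ENTRY_NUMBER(64) recovers slot 2;
+ * each of the MAX_LSB_CNT regions holds LSB_SIZE (16) 32-byte entries.
+ */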
+
+/* ------------------------ CCP Version 3 Specifics ------------------------ */
 #define REQ0_WAIT_FOR_WRITE		0x00000004
 #define REQ0_INT_ON_COMPLETE		0x00000002
 #define REQ0_STOP_ON_COMPLETE		0x00000001
@@ -110,29 +173,30 @@
 #define KSB_START			77
 #define KSB_END				127
 #define KSB_COUNT			(KSB_END - KSB_START + 1)
-#define CCP_KSB_BITS			256
-#define CCP_KSB_BYTES			32
+#define CCP_SB_BITS			256
 
 #define CCP_JOBID_MASK			0x0000003f
 
+/* ------------------------ General CCP Defines ------------------------ */
+
 #define CCP_DMAPOOL_MAX_SIZE		64
 #define CCP_DMAPOOL_ALIGN		BIT(5)
 
 #define CCP_REVERSE_BUF_SIZE		64
 
-#define CCP_AES_KEY_KSB_COUNT		1
-#define CCP_AES_CTX_KSB_COUNT		1
+#define CCP_AES_KEY_SB_COUNT		1
+#define CCP_AES_CTX_SB_COUNT		1
 
-#define CCP_XTS_AES_KEY_KSB_COUNT	1
-#define CCP_XTS_AES_CTX_KSB_COUNT	1
+#define CCP_XTS_AES_KEY_SB_COUNT	1
+#define CCP_XTS_AES_CTX_SB_COUNT	1
 
-#define CCP_SHA_KSB_COUNT		1
+#define CCP_SHA_SB_COUNT		1
 
 #define CCP_RSA_MAX_WIDTH		4096
 
 #define CCP_PASSTHRU_BLOCKSIZE		256
 #define CCP_PASSTHRU_MASKSIZE		32
-#define CCP_PASSTHRU_KSB_COUNT		1
+#define CCP_PASSTHRU_SB_COUNT		1
 
 #define CCP_ECC_MODULUS_BYTES		48      /* 384-bits */
 #define CCP_ECC_MAX_OPERANDS		6
@@ -144,31 +208,12 @@
 #define CCP_ECC_RESULT_OFFSET		60
 #define CCP_ECC_RESULT_SUCCESS		0x0001
 
+#define CCP_SB_BYTES			32
+
 struct ccp_op;
-
-/* Structure for computation functions that are device-specific */
-struct ccp_actions {
-	int (*perform_aes)(struct ccp_op *);
-	int (*perform_xts_aes)(struct ccp_op *);
-	int (*perform_sha)(struct ccp_op *);
-	int (*perform_rsa)(struct ccp_op *);
-	int (*perform_passthru)(struct ccp_op *);
-	int (*perform_ecc)(struct ccp_op *);
-	int (*init)(struct ccp_device *);
-	void (*destroy)(struct ccp_device *);
-	irqreturn_t (*irqhandler)(int, void *);
-};
-
-/* Structure to hold CCP version-specific values */
-struct ccp_vdata {
-	unsigned int version;
-	const struct ccp_actions *perform;
-};
-
-extern struct ccp_vdata ccpv3;
-
 struct ccp_device;
 struct ccp_cmd;
+struct ccp_fns;
 
 struct ccp_dma_cmd {
 	struct list_head entry;
@@ -212,9 +257,29 @@
 	/* Queue dma pool */
 	struct dma_pool *dma_pool;
 
-	/* Queue reserved KSB regions */
-	u32 ksb_key;
-	u32 ksb_ctx;
+	/* Queue base address (not necessarily aligned) */
+	struct ccp5_desc *qbase;
+
+	/* Queue processing mutex and index of the next descriptor slot */
+	struct mutex q_mutex ____cacheline_aligned;
+	unsigned int qidx;
+
+	/* Version 5 has different requirements for queue memory */
+	unsigned int qsize;
+	dma_addr_t qbase_dma;
+	dma_addr_t qdma_tail;
+
+	/* Per-queue reserved storage block(s) */
+	u32 sb_key;
+	u32 sb_ctx;
+
+	/* Bitmap of LSBs that can be accessed by this queue */
+	DECLARE_BITMAP(lsbmask, MAX_LSB_CNT);
+	/* Private LSB that is assigned to this queue, or -1 if none */
+	unsigned int lsb;
+	/* Bitmap of allocations within the private LSB, unused otherwise */
+	DECLARE_BITMAP(lsbmap, PLSB_MAP_SIZE);
 
 	/* Queue processing thread */
 	struct task_struct *kthread;
@@ -229,8 +294,17 @@
 	u32 int_err;
 
 	/* Register addresses for queue */
+	void __iomem *reg_control;
+	void __iomem *reg_tail_lo;
+	void __iomem *reg_head_lo;
+	void __iomem *reg_int_enable;
+	void __iomem *reg_interrupt_status;
 	void __iomem *reg_status;
 	void __iomem *reg_int_status;
+	void __iomem *reg_dma_status;
+	void __iomem *reg_dma_read_status;
+	void __iomem *reg_dma_write_status;
+	u32 qcontrol; /* Cached control register */
 
 	/* Status values from job */
 	u32 int_status;
@@ -253,16 +327,14 @@
 
 	struct device *dev;
 
-	/*
-	 * Bus specific device information
+	/* Bus specific device information
 	 */
 	void *dev_specific;
 	int (*get_irq)(struct ccp_device *ccp);
 	void (*free_irq)(struct ccp_device *ccp);
 	unsigned int irq;
 
-	/*
-	 * I/O area used for device communication. The register mapping
+	/* I/O area used for device communication. The register mapping
 	 * starts at an offset into the mapped bar.
 	 *   The CMD_REQx registers and the Delete_Cmd_Queue_Job register
 	 *   need to be protected while a command queue thread is accessing
@@ -272,8 +344,7 @@
 	void __iomem *io_map;
 	void __iomem *io_regs;
 
-	/*
-	 * Master lists that all cmds are queued on. Because there can be
+	/* Master lists that all cmds are queued on. Because there can be
 	 * more than one CCP command queue that can process a cmd a separate
 	 * backlog list is needed so that the backlog completion call
 	 * completes before the cmd is available for execution.
@@ -283,47 +354,54 @@
 	struct list_head cmd;
 	struct list_head backlog;
 
-	/*
-	 * The command queues. These represent the queues available on the
+	/* The command queues. These represent the queues available on the
 	 * CCP that are available for processing cmds
 	 */
 	struct ccp_cmd_queue cmd_q[MAX_HW_QUEUES];
 	unsigned int cmd_q_count;
 
-	/*
-	 * Support for the CCP True RNG
+	/* Support for the CCP True RNG
 	 */
 	struct hwrng hwrng;
 	unsigned int hwrng_retries;
 
-	/*
-	 * Support for the CCP DMA capabilities
+	/* Support for the CCP DMA capabilities
 	 */
 	struct dma_device dma_dev;
 	struct ccp_dma_chan *ccp_dma_chan;
 	struct kmem_cache *dma_cmd_cache;
 	struct kmem_cache *dma_desc_cache;
 
-	/*
-	 * A counter used to generate job-ids for cmds submitted to the CCP
+	/* A counter used to generate job-ids for cmds submitted to the CCP
 	 */
 	atomic_t current_id ____cacheline_aligned;
 
-	/*
-	 * The CCP uses key storage blocks (KSB) to maintain context for certain
-	 * operations. To prevent multiple cmds from using the same KSB range
-	 * a command queue reserves a KSB range for the duration of the cmd.
-	 * Each queue, will however, reserve 2 KSB blocks for operations that
-	 * only require single KSB entries (eg. AES context/iv and key) in order
-	 * to avoid allocation contention.  This will reserve at most 10 KSB
-	 * entries, leaving 40 KSB entries available for dynamic allocation.
+	/* The v3 CCP uses key storage blocks (SB) to maintain context for
+	 * certain operations. To prevent multiple cmds from using the same
+	 * SB range a command queue reserves an SB range for the duration of
+	 * the cmd. Each queue will, however, reserve 2 SB blocks for
+	 * operations that only require single SB entries (eg. AES context/iv
+	 * and key) in order to avoid allocation contention.  This will reserve
+	 * at most 10 SB entries, leaving 40 SB entries available for dynamic
+	 * allocation.
+	 *
+	 * The v5 CCP Local Storage Block (LSB) is broken up into 8
+	 * memory ranges, each of which can be enabled for access by one
+	 * or more queues. Device initialization takes this into account,
+	 * and attempts to assign one region for exclusive use by each
+	 * available queue; the rest are then aggregated as "public" use.
+	 * If there are fewer regions than queues, all regions are shared
+	 * amongst all queues.
 	 */
-	struct mutex ksb_mutex ____cacheline_aligned;
-	DECLARE_BITMAP(ksb, KSB_COUNT);
-	wait_queue_head_t ksb_queue;
-	unsigned int ksb_avail;
-	unsigned int ksb_count;
-	u32 ksb_start;
+	struct mutex sb_mutex ____cacheline_aligned;
+	DECLARE_BITMAP(sb, KSB_COUNT);
+	wait_queue_head_t sb_queue;
+	unsigned int sb_avail;
+	unsigned int sb_count;
+	u32 sb_start;
+
+	/* Bitmap of shared LSBs, if any */
+	DECLARE_BITMAP(lsbmap, SLSB_MAP_SIZE);
 
 	/* Suspend support */
 	unsigned int suspending;
@@ -335,10 +413,11 @@
 
 enum ccp_memtype {
 	CCP_MEMTYPE_SYSTEM = 0,
-	CCP_MEMTYPE_KSB,
+	CCP_MEMTYPE_SB,
 	CCP_MEMTYPE_LOCAL,
 	CCP_MEMTYPE__LAST,
 };
+#define	CCP_MEMTYPE_LSB	CCP_MEMTYPE_SB
 
 struct ccp_dma_info {
 	dma_addr_t address;
@@ -379,7 +458,7 @@
 	enum ccp_memtype type;
 	union {
 		struct ccp_dma_info dma;
-		u32 ksb;
+		u32 sb;
 	} u;
 };
 
@@ -419,13 +498,14 @@
 	u32 jobid;
 	u32 ioc;
 	u32 soc;
-	u32 ksb_key;
-	u32 ksb_ctx;
+	u32 sb_key;
+	u32 sb_ctx;
 	u32 init;
 	u32 eom;
 
 	struct ccp_mem src;
 	struct ccp_mem dst;
+	struct ccp_mem exp;
 
 	union {
 		struct ccp_aes_op aes;
@@ -435,6 +515,7 @@
 		struct ccp_passthru_op passthru;
 		struct ccp_ecc_op ecc;
 	} u;
+	struct ccp_mem key;
 };
 
 static inline u32 ccp_addr_lo(struct ccp_dma_info *info)
@@ -447,6 +528,70 @@
 	return upper_32_bits(info->address + info->offset) & 0x0000ffff;
 }
 
+/**
+ * descriptor for version 5 CCP commands
+ * 8 32-bit words:
+ * word 0: function; engine; control bits
+ * word 1: length of source data
+ * word 2: low 32 bits of source pointer
+ * word 3: upper 16 bits of source pointer; source memory type
+ * word 4: low 32 bits of destination pointer
+ * word 5: upper 16 bits of destination pointer; destination memory type
+ * word 6: low 32 bits of key pointer
+ * word 7: upper 16 bits of key pointer; key memory type
+ */
+struct dword0 {
+	__le32 soc:1;
+	__le32 ioc:1;
+	__le32 rsvd1:1;
+	__le32 init:1;
+	__le32 eom:1;		/* AES/SHA only */
+	__le32 function:15;
+	__le32 engine:4;
+	__le32 prot:1;
+	__le32 rsvd2:7;
+};
+
+struct dword3 {
+	__le32 src_hi:16;
+	__le32 src_mem:2;
+	__le32 lsb_cxt_id:8;
+	__le32 rsvd1:5;
+	__le32 fixed:1;
+};
+
+union dword4 {
+	__le32 dst_lo;		/* NON-SHA	*/
+	__le32 sha_len_lo;	/* SHA		*/
+};
+
+union dword5 {
+	struct {
+		__le32 dst_hi:16;
+		__le32 dst_mem:2;
+		__le32 rsvd1:13;
+		__le32 fixed:1;
+	} fields;
+	__le32 sha_len_hi;
+};
+
+struct dword7 {
+	__le32 key_hi:16;
+	__le32 key_mem:2;
+	__le32 rsvd1:14;
+};
+
+struct ccp5_desc {
+	struct dword0 dw0;
+	__le32 length;
+	__le32 src_lo;
+	struct dword3 dw3;
+	union dword4 dw4;
+	union dword5 dw5;
+	__le32 key_lo;
+	struct dword7 dw7;
+};
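+
+/* The eight members above are assumed to pack into exactly eight
+ * 32-bit words (32 bytes, the Q_DESC_SIZE used when sizing queue
+ * memory); a BUILD_BUG_ON(sizeof(struct ccp5_desc) != 32) would
+ * catch any compiler-introduced padding.
+ */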
+
 int ccp_pci_init(void);
 void ccp_pci_exit(void);
 
@@ -456,13 +601,48 @@
 void ccp_add_device(struct ccp_device *ccp);
 void ccp_del_device(struct ccp_device *ccp);
 
+extern void ccp_log_error(struct ccp_device *, int);
+
 struct ccp_device *ccp_alloc_struct(struct device *dev);
 bool ccp_queues_suspended(struct ccp_device *ccp);
 int ccp_cmd_queue_thread(void *data);
+int ccp_trng_read(struct hwrng *rng, void *data, size_t max, bool wait);
 
 int ccp_run_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd);
 
+int ccp_register_rng(struct ccp_device *ccp);
+void ccp_unregister_rng(struct ccp_device *ccp);
 int ccp_dmaengine_register(struct ccp_device *ccp);
 void ccp_dmaengine_unregister(struct ccp_device *ccp);
 
+/* Structure for computation functions that are device-specific */
+struct ccp_actions {
+	int (*aes)(struct ccp_op *);
+	int (*xts_aes)(struct ccp_op *);
+	int (*sha)(struct ccp_op *);
+	int (*rsa)(struct ccp_op *);
+	int (*passthru)(struct ccp_op *);
+	int (*ecc)(struct ccp_op *);
+	u32 (*sballoc)(struct ccp_cmd_queue *, unsigned int);
+	void (*sbfree)(struct ccp_cmd_queue *, unsigned int,
+			       unsigned int);
+	unsigned int (*get_free_slots)(struct ccp_cmd_queue *);
+	int (*init)(struct ccp_device *);
+	void (*destroy)(struct ccp_device *);
+	irqreturn_t (*irqhandler)(int, void *);
+};
+
+/* Structure to hold CCP version-specific values */
+struct ccp_vdata {
+	const unsigned int version;
+	void (*setup)(struct ccp_device *);
+	const struct ccp_actions *perform;
+	const unsigned int bar;
+	const unsigned int offset;
+};
+
+extern const struct ccp_vdata ccpv3;
+extern const struct ccp_vdata ccpv5a;
+extern const struct ccp_vdata ccpv5b;
+
 #endif
diff --git a/drivers/crypto/ccp/ccp-dmaengine.c b/drivers/crypto/ccp/ccp-dmaengine.c
index 94f77b0..6553912 100644
--- a/drivers/crypto/ccp/ccp-dmaengine.c
+++ b/drivers/crypto/ccp/ccp-dmaengine.c
@@ -299,12 +299,10 @@
 {
 	struct ccp_dma_desc *desc;
 
-	desc = kmem_cache_alloc(chan->ccp->dma_desc_cache, GFP_NOWAIT);
+	desc = kmem_cache_zalloc(chan->ccp->dma_desc_cache, GFP_NOWAIT);
 	if (!desc)
 		return NULL;
 
-	memset(desc, 0, sizeof(*desc));
-
 	dma_async_tx_descriptor_init(&desc->tx_desc, &chan->dma_chan);
 	desc->tx_desc.flags = flags;
 	desc->tx_desc.tx_submit = ccp_tx_submit;
@@ -650,8 +648,11 @@
 	dma_desc_cache_name = devm_kasprintf(ccp->dev, GFP_KERNEL,
 					     "%s-dmaengine-desc-cache",
 					     ccp->name);
-	if (!dma_cmd_cache_name)
-		return -ENOMEM;
+	if (!dma_desc_cache_name) {
+		ret = -ENOMEM;
+		goto err_cache;
+	}
+
 	ccp->dma_desc_cache = kmem_cache_create(dma_desc_cache_name,
 						sizeof(struct ccp_dma_desc),
 						sizeof(void *),
diff --git a/drivers/crypto/ccp/ccp-ops.c b/drivers/crypto/ccp/ccp-ops.c
index ffa2891..50fae44 100644
--- a/drivers/crypto/ccp/ccp-ops.c
+++ b/drivers/crypto/ccp/ccp-ops.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2013,2016 Advanced Micro Devices, Inc.
  *
  * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ * Author: Gary R Hook <gary.hook@amd.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -20,72 +21,28 @@
 #include "ccp-dev.h"
 
 /* SHA initial context values */
-static const __be32 ccp_sha1_init[CCP_SHA_CTXSIZE / sizeof(__be32)] = {
+static const __be32 ccp_sha1_init[SHA1_DIGEST_SIZE / sizeof(__be32)] = {
 	cpu_to_be32(SHA1_H0), cpu_to_be32(SHA1_H1),
 	cpu_to_be32(SHA1_H2), cpu_to_be32(SHA1_H3),
-	cpu_to_be32(SHA1_H4), 0, 0, 0,
+	cpu_to_be32(SHA1_H4),
 };
 
-static const __be32 ccp_sha224_init[CCP_SHA_CTXSIZE / sizeof(__be32)] = {
+static const __be32 ccp_sha224_init[SHA256_DIGEST_SIZE / sizeof(__be32)] = {
 	cpu_to_be32(SHA224_H0), cpu_to_be32(SHA224_H1),
 	cpu_to_be32(SHA224_H2), cpu_to_be32(SHA224_H3),
 	cpu_to_be32(SHA224_H4), cpu_to_be32(SHA224_H5),
 	cpu_to_be32(SHA224_H6), cpu_to_be32(SHA224_H7),
 };
 
-static const __be32 ccp_sha256_init[CCP_SHA_CTXSIZE / sizeof(__be32)] = {
+static const __be32 ccp_sha256_init[SHA256_DIGEST_SIZE / sizeof(__be32)] = {
 	cpu_to_be32(SHA256_H0), cpu_to_be32(SHA256_H1),
 	cpu_to_be32(SHA256_H2), cpu_to_be32(SHA256_H3),
 	cpu_to_be32(SHA256_H4), cpu_to_be32(SHA256_H5),
 	cpu_to_be32(SHA256_H6), cpu_to_be32(SHA256_H7),
 };
 
-static u32 ccp_alloc_ksb(struct ccp_device *ccp, unsigned int count)
-{
-	int start;
-
-	for (;;) {
-		mutex_lock(&ccp->ksb_mutex);
-
-		start = (u32)bitmap_find_next_zero_area(ccp->ksb,
-							ccp->ksb_count,
-							ccp->ksb_start,
-							count, 0);
-		if (start <= ccp->ksb_count) {
-			bitmap_set(ccp->ksb, start, count);
-
-			mutex_unlock(&ccp->ksb_mutex);
-			break;
-		}
-
-		ccp->ksb_avail = 0;
-
-		mutex_unlock(&ccp->ksb_mutex);
-
-		/* Wait for KSB entries to become available */
-		if (wait_event_interruptible(ccp->ksb_queue, ccp->ksb_avail))
-			return 0;
-	}
-
-	return KSB_START + start;
-}
-
-static void ccp_free_ksb(struct ccp_device *ccp, unsigned int start,
-			 unsigned int count)
-{
-	if (!start)
-		return;
-
-	mutex_lock(&ccp->ksb_mutex);
-
-	bitmap_clear(ccp->ksb, start - KSB_START, count);
-
-	ccp->ksb_avail = 1;
-
-	mutex_unlock(&ccp->ksb_mutex);
-
-	wake_up_interruptible_all(&ccp->ksb_queue);
-}
+#define	CCP_NEW_JOBID(ccp)	(((ccp)->vdata->version == CCP_VERSION(3, 0)) ? \
+					ccp_gen_jobid(ccp) : 0)
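+
+/* A v3 device tracks commands by job id; a v5 device does not use
+ * them, so anything newer than version 3.0 is simply handed 0.
+ */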
 
 static u32 ccp_gen_jobid(struct ccp_device *ccp)
 {
@@ -231,7 +188,7 @@
 				   unsigned int len, unsigned int se_len,
 				   bool sign_extend)
 {
-	unsigned int nbytes, sg_offset, dm_offset, ksb_len, i;
+	unsigned int nbytes, sg_offset, dm_offset, sb_len, i;
 	u8 buffer[CCP_REVERSE_BUF_SIZE];
 
 	if (WARN_ON(se_len > sizeof(buffer)))
@@ -241,21 +198,21 @@
 	dm_offset = 0;
 	nbytes = len;
 	while (nbytes) {
-		ksb_len = min_t(unsigned int, nbytes, se_len);
-		sg_offset -= ksb_len;
+		sb_len = min_t(unsigned int, nbytes, se_len);
+		sg_offset -= sb_len;
 
-		scatterwalk_map_and_copy(buffer, sg, sg_offset, ksb_len, 0);
-		for (i = 0; i < ksb_len; i++)
-			wa->address[dm_offset + i] = buffer[ksb_len - i - 1];
+		scatterwalk_map_and_copy(buffer, sg, sg_offset, sb_len, 0);
+		for (i = 0; i < sb_len; i++)
+			wa->address[dm_offset + i] = buffer[sb_len - i - 1];
 
-		dm_offset += ksb_len;
-		nbytes -= ksb_len;
+		dm_offset += sb_len;
+		nbytes -= sb_len;
 
-		if ((ksb_len != se_len) && sign_extend) {
+		if ((sb_len != se_len) && sign_extend) {
 			/* Must sign-extend to nearest sign-extend length */
 			if (wa->address[dm_offset - 1] & 0x80)
 				memset(wa->address + dm_offset, 0xff,
-				       se_len - ksb_len);
+				       se_len - sb_len);
 		}
 	}
 
@@ -266,22 +223,22 @@
 				    struct scatterlist *sg,
 				    unsigned int len)
 {
-	unsigned int nbytes, sg_offset, dm_offset, ksb_len, i;
+	unsigned int nbytes, sg_offset, dm_offset, sb_len, i;
 	u8 buffer[CCP_REVERSE_BUF_SIZE];
 
 	sg_offset = 0;
 	dm_offset = len;
 	nbytes = len;
 	while (nbytes) {
-		ksb_len = min_t(unsigned int, nbytes, sizeof(buffer));
-		dm_offset -= ksb_len;
+		sb_len = min_t(unsigned int, nbytes, sizeof(buffer));
+		dm_offset -= sb_len;
 
-		for (i = 0; i < ksb_len; i++)
-			buffer[ksb_len - i - 1] = wa->address[dm_offset + i];
-		scatterwalk_map_and_copy(buffer, sg, sg_offset, ksb_len, 1);
+		for (i = 0; i < sb_len; i++)
+			buffer[sb_len - i - 1] = wa->address[dm_offset + i];
+		scatterwalk_map_and_copy(buffer, sg, sg_offset, sb_len, 1);
 
-		sg_offset += ksb_len;
-		nbytes -= ksb_len;
+		sg_offset += sb_len;
+		nbytes -= sb_len;
 	}
 }
 
@@ -449,9 +406,9 @@
 	}
 }
 
-static int ccp_copy_to_from_ksb(struct ccp_cmd_queue *cmd_q,
-				struct ccp_dm_workarea *wa, u32 jobid, u32 ksb,
-				u32 byte_swap, bool from)
+static int ccp_copy_to_from_sb(struct ccp_cmd_queue *cmd_q,
+			       struct ccp_dm_workarea *wa, u32 jobid, u32 sb,
+			       u32 byte_swap, bool from)
 {
 	struct ccp_op op;
 
@@ -463,8 +420,8 @@
 
 	if (from) {
 		op.soc = 1;
-		op.src.type = CCP_MEMTYPE_KSB;
-		op.src.u.ksb = ksb;
+		op.src.type = CCP_MEMTYPE_SB;
+		op.src.u.sb = sb;
 		op.dst.type = CCP_MEMTYPE_SYSTEM;
 		op.dst.u.dma.address = wa->dma.address;
 		op.dst.u.dma.length = wa->length;
@@ -472,27 +429,27 @@
 		op.src.type = CCP_MEMTYPE_SYSTEM;
 		op.src.u.dma.address = wa->dma.address;
 		op.src.u.dma.length = wa->length;
-		op.dst.type = CCP_MEMTYPE_KSB;
-		op.dst.u.ksb = ksb;
+		op.dst.type = CCP_MEMTYPE_SB;
+		op.dst.u.sb = sb;
 	}
 
 	op.u.passthru.byte_swap = byte_swap;
 
-	return cmd_q->ccp->vdata->perform->perform_passthru(&op);
+	return cmd_q->ccp->vdata->perform->passthru(&op);
 }
 
-static int ccp_copy_to_ksb(struct ccp_cmd_queue *cmd_q,
-			   struct ccp_dm_workarea *wa, u32 jobid, u32 ksb,
-			   u32 byte_swap)
+static int ccp_copy_to_sb(struct ccp_cmd_queue *cmd_q,
+			  struct ccp_dm_workarea *wa, u32 jobid, u32 sb,
+			  u32 byte_swap)
 {
-	return ccp_copy_to_from_ksb(cmd_q, wa, jobid, ksb, byte_swap, false);
+	return ccp_copy_to_from_sb(cmd_q, wa, jobid, sb, byte_swap, false);
 }
 
-static int ccp_copy_from_ksb(struct ccp_cmd_queue *cmd_q,
-			     struct ccp_dm_workarea *wa, u32 jobid, u32 ksb,
-			     u32 byte_swap)
+static int ccp_copy_from_sb(struct ccp_cmd_queue *cmd_q,
+			    struct ccp_dm_workarea *wa, u32 jobid, u32 sb,
+			    u32 byte_swap)
 {
-	return ccp_copy_to_from_ksb(cmd_q, wa, jobid, ksb, byte_swap, true);
+	return ccp_copy_to_from_sb(cmd_q, wa, jobid, sb, byte_swap, true);
 }
 
 static int ccp_run_aes_cmac_cmd(struct ccp_cmd_queue *cmd_q,
@@ -527,54 +484,54 @@
 			return -EINVAL;
 	}
 
-	BUILD_BUG_ON(CCP_AES_KEY_KSB_COUNT != 1);
-	BUILD_BUG_ON(CCP_AES_CTX_KSB_COUNT != 1);
+	BUILD_BUG_ON(CCP_AES_KEY_SB_COUNT != 1);
+	BUILD_BUG_ON(CCP_AES_CTX_SB_COUNT != 1);
 
 	ret = -EIO;
 	memset(&op, 0, sizeof(op));
 	op.cmd_q = cmd_q;
-	op.jobid = ccp_gen_jobid(cmd_q->ccp);
-	op.ksb_key = cmd_q->ksb_key;
-	op.ksb_ctx = cmd_q->ksb_ctx;
+	op.jobid = CCP_NEW_JOBID(cmd_q->ccp);
+	op.sb_key = cmd_q->sb_key;
+	op.sb_ctx = cmd_q->sb_ctx;
 	op.init = 1;
 	op.u.aes.type = aes->type;
 	op.u.aes.mode = aes->mode;
 	op.u.aes.action = aes->action;
 
-	/* All supported key sizes fit in a single (32-byte) KSB entry
+	/* All supported key sizes fit in a single (32-byte) SB entry
 	 * and must be in little endian format. Use the 256-bit byte
 	 * swap passthru option to convert from big endian to little
 	 * endian.
 	 */
 	ret = ccp_init_dm_workarea(&key, cmd_q,
-				   CCP_AES_KEY_KSB_COUNT * CCP_KSB_BYTES,
+				   CCP_AES_KEY_SB_COUNT * CCP_SB_BYTES,
 				   DMA_TO_DEVICE);
 	if (ret)
 		return ret;
 
-	dm_offset = CCP_KSB_BYTES - aes->key_len;
+	dm_offset = CCP_SB_BYTES - aes->key_len;
 	ccp_set_dm_area(&key, dm_offset, aes->key, 0, aes->key_len);
-	ret = ccp_copy_to_ksb(cmd_q, &key, op.jobid, op.ksb_key,
-			      CCP_PASSTHRU_BYTESWAP_256BIT);
+	ret = ccp_copy_to_sb(cmd_q, &key, op.jobid, op.sb_key,
+			     CCP_PASSTHRU_BYTESWAP_256BIT);
 	if (ret) {
 		cmd->engine_error = cmd_q->cmd_error;
 		goto e_key;
 	}
 
-	/* The AES context fits in a single (32-byte) KSB entry and
+	/* The AES context fits in a single (32-byte) SB entry and
 	 * must be in little endian format. Use the 256-bit byte swap
 	 * passthru option to convert from big endian to little endian.
 	 */
 	ret = ccp_init_dm_workarea(&ctx, cmd_q,
-				   CCP_AES_CTX_KSB_COUNT * CCP_KSB_BYTES,
+				   CCP_AES_CTX_SB_COUNT * CCP_SB_BYTES,
 				   DMA_BIDIRECTIONAL);
 	if (ret)
 		goto e_key;
 
-	dm_offset = CCP_KSB_BYTES - AES_BLOCK_SIZE;
+	dm_offset = CCP_SB_BYTES - AES_BLOCK_SIZE;
 	ccp_set_dm_area(&ctx, dm_offset, aes->iv, 0, aes->iv_len);
-	ret = ccp_copy_to_ksb(cmd_q, &ctx, op.jobid, op.ksb_ctx,
-			      CCP_PASSTHRU_BYTESWAP_256BIT);
+	ret = ccp_copy_to_sb(cmd_q, &ctx, op.jobid, op.sb_ctx,
+			     CCP_PASSTHRU_BYTESWAP_256BIT);
 	if (ret) {
 		cmd->engine_error = cmd_q->cmd_error;
 		goto e_ctx;
@@ -592,9 +549,9 @@
 			op.eom = 1;
 
 			/* Push the K1/K2 key to the CCP now */
-			ret = ccp_copy_from_ksb(cmd_q, &ctx, op.jobid,
-						op.ksb_ctx,
-						CCP_PASSTHRU_BYTESWAP_256BIT);
+			ret = ccp_copy_from_sb(cmd_q, &ctx, op.jobid,
+					       op.sb_ctx,
+					       CCP_PASSTHRU_BYTESWAP_256BIT);
 			if (ret) {
 				cmd->engine_error = cmd_q->cmd_error;
 				goto e_src;
@@ -602,15 +559,15 @@
 
 			ccp_set_dm_area(&ctx, 0, aes->cmac_key, 0,
 					aes->cmac_key_len);
-			ret = ccp_copy_to_ksb(cmd_q, &ctx, op.jobid, op.ksb_ctx,
-					      CCP_PASSTHRU_BYTESWAP_256BIT);
+			ret = ccp_copy_to_sb(cmd_q, &ctx, op.jobid, op.sb_ctx,
+					     CCP_PASSTHRU_BYTESWAP_256BIT);
 			if (ret) {
 				cmd->engine_error = cmd_q->cmd_error;
 				goto e_src;
 			}
 		}
 
-		ret = cmd_q->ccp->vdata->perform->perform_aes(&op);
+		ret = cmd_q->ccp->vdata->perform->aes(&op);
 		if (ret) {
 			cmd->engine_error = cmd_q->cmd_error;
 			goto e_src;
@@ -622,15 +579,15 @@
 	/* Retrieve the AES context - convert from LE to BE using
 	 * 32-byte (256-bit) byteswapping
 	 */
-	ret = ccp_copy_from_ksb(cmd_q, &ctx, op.jobid, op.ksb_ctx,
-				CCP_PASSTHRU_BYTESWAP_256BIT);
+	ret = ccp_copy_from_sb(cmd_q, &ctx, op.jobid, op.sb_ctx,
+			       CCP_PASSTHRU_BYTESWAP_256BIT);
 	if (ret) {
 		cmd->engine_error = cmd_q->cmd_error;
 		goto e_src;
 	}
 
 	/* ...but we only need AES_BLOCK_SIZE bytes */
-	dm_offset = CCP_KSB_BYTES - AES_BLOCK_SIZE;
+	dm_offset = CCP_SB_BYTES - AES_BLOCK_SIZE;
 	ccp_get_dm_area(&ctx, dm_offset, aes->iv, 0, aes->iv_len);
 
 e_src:
@@ -680,56 +637,56 @@
 			return -EINVAL;
 	}
 
-	BUILD_BUG_ON(CCP_AES_KEY_KSB_COUNT != 1);
-	BUILD_BUG_ON(CCP_AES_CTX_KSB_COUNT != 1);
+	BUILD_BUG_ON(CCP_AES_KEY_SB_COUNT != 1);
+	BUILD_BUG_ON(CCP_AES_CTX_SB_COUNT != 1);
 
 	ret = -EIO;
 	memset(&op, 0, sizeof(op));
 	op.cmd_q = cmd_q;
-	op.jobid = ccp_gen_jobid(cmd_q->ccp);
-	op.ksb_key = cmd_q->ksb_key;
-	op.ksb_ctx = cmd_q->ksb_ctx;
+	op.jobid = CCP_NEW_JOBID(cmd_q->ccp);
+	op.sb_key = cmd_q->sb_key;
+	op.sb_ctx = cmd_q->sb_ctx;
 	op.init = (aes->mode == CCP_AES_MODE_ECB) ? 0 : 1;
 	op.u.aes.type = aes->type;
 	op.u.aes.mode = aes->mode;
 	op.u.aes.action = aes->action;
 
-	/* All supported key sizes fit in a single (32-byte) KSB entry
+	/* All supported key sizes fit in a single (32-byte) SB entry
 	 * and must be in little endian format. Use the 256-bit byte
 	 * swap passthru option to convert from big endian to little
 	 * endian.
 	 */
 	ret = ccp_init_dm_workarea(&key, cmd_q,
-				   CCP_AES_KEY_KSB_COUNT * CCP_KSB_BYTES,
+				   CCP_AES_KEY_SB_COUNT * CCP_SB_BYTES,
 				   DMA_TO_DEVICE);
 	if (ret)
 		return ret;
 
-	dm_offset = CCP_KSB_BYTES - aes->key_len;
+	dm_offset = CCP_SB_BYTES - aes->key_len;
 	ccp_set_dm_area(&key, dm_offset, aes->key, 0, aes->key_len);
-	ret = ccp_copy_to_ksb(cmd_q, &key, op.jobid, op.ksb_key,
-			      CCP_PASSTHRU_BYTESWAP_256BIT);
+	ret = ccp_copy_to_sb(cmd_q, &key, op.jobid, op.sb_key,
+			     CCP_PASSTHRU_BYTESWAP_256BIT);
 	if (ret) {
 		cmd->engine_error = cmd_q->cmd_error;
 		goto e_key;
 	}
 
-	/* The AES context fits in a single (32-byte) KSB entry and
+	/* The AES context fits in a single (32-byte) SB entry and
 	 * must be in little endian format. Use the 256-bit byte swap
 	 * passthru option to convert from big endian to little endian.
 	 */
 	ret = ccp_init_dm_workarea(&ctx, cmd_q,
-				   CCP_AES_CTX_KSB_COUNT * CCP_KSB_BYTES,
+				   CCP_AES_CTX_SB_COUNT * CCP_SB_BYTES,
 				   DMA_BIDIRECTIONAL);
 	if (ret)
 		goto e_key;
 
 	if (aes->mode != CCP_AES_MODE_ECB) {
-		/* Load the AES context - conver to LE */
-		dm_offset = CCP_KSB_BYTES - AES_BLOCK_SIZE;
+		/* Load the AES context - convert to LE */
+		dm_offset = CCP_SB_BYTES - AES_BLOCK_SIZE;
 		ccp_set_dm_area(&ctx, dm_offset, aes->iv, 0, aes->iv_len);
-		ret = ccp_copy_to_ksb(cmd_q, &ctx, op.jobid, op.ksb_ctx,
-				      CCP_PASSTHRU_BYTESWAP_256BIT);
+		ret = ccp_copy_to_sb(cmd_q, &ctx, op.jobid, op.sb_ctx,
+				     CCP_PASSTHRU_BYTESWAP_256BIT);
 		if (ret) {
 			cmd->engine_error = cmd_q->cmd_error;
 			goto e_ctx;
@@ -772,7 +729,7 @@
 				op.soc = 1;
 		}
 
-		ret = cmd_q->ccp->vdata->perform->perform_aes(&op);
+		ret = cmd_q->ccp->vdata->perform->aes(&op);
 		if (ret) {
 			cmd->engine_error = cmd_q->cmd_error;
 			goto e_dst;
@@ -785,15 +742,15 @@
 		/* Retrieve the AES context - convert from LE to BE using
 		 * 32-byte (256-bit) byteswapping
 		 */
-		ret = ccp_copy_from_ksb(cmd_q, &ctx, op.jobid, op.ksb_ctx,
-					CCP_PASSTHRU_BYTESWAP_256BIT);
+		ret = ccp_copy_from_sb(cmd_q, &ctx, op.jobid, op.sb_ctx,
+				       CCP_PASSTHRU_BYTESWAP_256BIT);
 		if (ret) {
 			cmd->engine_error = cmd_q->cmd_error;
 			goto e_dst;
 		}
 
 		/* ...but we only need AES_BLOCK_SIZE bytes */
-		dm_offset = CCP_KSB_BYTES - AES_BLOCK_SIZE;
+		dm_offset = CCP_SB_BYTES - AES_BLOCK_SIZE;
 		ccp_get_dm_area(&ctx, dm_offset, aes->iv, 0, aes->iv_len);
 	}
 
@@ -857,53 +814,53 @@
 	if (!xts->key || !xts->iv || !xts->src || !xts->dst)
 		return -EINVAL;
 
-	BUILD_BUG_ON(CCP_XTS_AES_KEY_KSB_COUNT != 1);
-	BUILD_BUG_ON(CCP_XTS_AES_CTX_KSB_COUNT != 1);
+	BUILD_BUG_ON(CCP_XTS_AES_KEY_SB_COUNT != 1);
+	BUILD_BUG_ON(CCP_XTS_AES_CTX_SB_COUNT != 1);
 
 	ret = -EIO;
 	memset(&op, 0, sizeof(op));
 	op.cmd_q = cmd_q;
-	op.jobid = ccp_gen_jobid(cmd_q->ccp);
-	op.ksb_key = cmd_q->ksb_key;
-	op.ksb_ctx = cmd_q->ksb_ctx;
+	op.jobid = CCP_NEW_JOBID(cmd_q->ccp);
+	op.sb_key = cmd_q->sb_key;
+	op.sb_ctx = cmd_q->sb_ctx;
 	op.init = 1;
 	op.u.xts.action = xts->action;
 	op.u.xts.unit_size = xts->unit_size;
 
-	/* All supported key sizes fit in a single (32-byte) KSB entry
+	/* All supported key sizes fit in a single (32-byte) SB entry
 	 * and must be in little endian format. Use the 256-bit byte
 	 * swap passthru option to convert from big endian to little
 	 * endian.
 	 */
 	ret = ccp_init_dm_workarea(&key, cmd_q,
-				   CCP_XTS_AES_KEY_KSB_COUNT * CCP_KSB_BYTES,
+				   CCP_XTS_AES_KEY_SB_COUNT * CCP_SB_BYTES,
 				   DMA_TO_DEVICE);
 	if (ret)
 		return ret;
 
-	dm_offset = CCP_KSB_BYTES - AES_KEYSIZE_128;
+	dm_offset = CCP_SB_BYTES - AES_KEYSIZE_128;
 	ccp_set_dm_area(&key, dm_offset, xts->key, 0, xts->key_len);
 	ccp_set_dm_area(&key, 0, xts->key, dm_offset, xts->key_len);
-	ret = ccp_copy_to_ksb(cmd_q, &key, op.jobid, op.ksb_key,
-			      CCP_PASSTHRU_BYTESWAP_256BIT);
+	ret = ccp_copy_to_sb(cmd_q, &key, op.jobid, op.sb_key,
+			     CCP_PASSTHRU_BYTESWAP_256BIT);
 	if (ret) {
 		cmd->engine_error = cmd_q->cmd_error;
 		goto e_key;
 	}
 
-	/* The AES context fits in a single (32-byte) KSB entry and
+	/* The AES context fits in a single (32-byte) SB entry and
 	 * for XTS is already in little endian format so no byte swapping
 	 * is needed.
 	 */
 	ret = ccp_init_dm_workarea(&ctx, cmd_q,
-				   CCP_XTS_AES_CTX_KSB_COUNT * CCP_KSB_BYTES,
+				   CCP_XTS_AES_CTX_SB_COUNT * CCP_SB_BYTES,
 				   DMA_BIDIRECTIONAL);
 	if (ret)
 		goto e_key;
 
 	ccp_set_dm_area(&ctx, 0, xts->iv, 0, xts->iv_len);
-	ret = ccp_copy_to_ksb(cmd_q, &ctx, op.jobid, op.ksb_ctx,
-			      CCP_PASSTHRU_BYTESWAP_NOOP);
+	ret = ccp_copy_to_sb(cmd_q, &ctx, op.jobid, op.sb_ctx,
+			     CCP_PASSTHRU_BYTESWAP_NOOP);
 	if (ret) {
 		cmd->engine_error = cmd_q->cmd_error;
 		goto e_ctx;
@@ -937,7 +894,7 @@
 		if (!src.sg_wa.bytes_left)
 			op.eom = 1;
 
-		ret = cmd_q->ccp->vdata->perform->perform_xts_aes(&op);
+		ret = cmd_q->ccp->vdata->perform->xts_aes(&op);
 		if (ret) {
 			cmd->engine_error = cmd_q->cmd_error;
 			goto e_dst;
@@ -949,15 +906,15 @@
 	/* Retrieve the AES context - convert from LE to BE using
 	 * 32-byte (256-bit) byteswapping
 	 */
-	ret = ccp_copy_from_ksb(cmd_q, &ctx, op.jobid, op.ksb_ctx,
-				CCP_PASSTHRU_BYTESWAP_256BIT);
+	ret = ccp_copy_from_sb(cmd_q, &ctx, op.jobid, op.sb_ctx,
+			       CCP_PASSTHRU_BYTESWAP_256BIT);
 	if (ret) {
 		cmd->engine_error = cmd_q->cmd_error;
 		goto e_dst;
 	}
 
 	/* ...but we only need AES_BLOCK_SIZE bytes */
-	dm_offset = CCP_KSB_BYTES - AES_BLOCK_SIZE;
+	dm_offset = CCP_SB_BYTES - AES_BLOCK_SIZE;
 	ccp_get_dm_area(&ctx, dm_offset, xts->iv, 0, xts->iv_len);
 
 e_dst:
@@ -982,164 +939,228 @@
 	struct ccp_dm_workarea ctx;
 	struct ccp_data src;
 	struct ccp_op op;
+	unsigned int ioffset, ooffset;
+	unsigned int digest_size;
+	int sb_count;
+	const void *init;
+	u64 block_size;
+	int ctx_size;
 	int ret;
 
-	if (sha->ctx_len != CCP_SHA_CTXSIZE)
+	switch (sha->type) {
+	case CCP_SHA_TYPE_1:
+		if (sha->ctx_len < SHA1_DIGEST_SIZE)
+			return -EINVAL;
+		block_size = SHA1_BLOCK_SIZE;
+		break;
+	case CCP_SHA_TYPE_224:
+		if (sha->ctx_len < SHA224_DIGEST_SIZE)
+			return -EINVAL;
+		block_size = SHA224_BLOCK_SIZE;
+		break;
+	case CCP_SHA_TYPE_256:
+		if (sha->ctx_len < SHA256_DIGEST_SIZE)
+			return -EINVAL;
+		block_size = SHA256_BLOCK_SIZE;
+		break;
+	default:
 		return -EINVAL;
+	}
 
 	if (!sha->ctx)
 		return -EINVAL;
 
-	if (!sha->final && (sha->src_len & (CCP_SHA_BLOCKSIZE - 1)))
+	if (!sha->final && (sha->src_len & (block_size - 1)))
 		return -EINVAL;
 
-	if (!sha->src_len) {
-		const u8 *sha_zero;
+	/* The version 3 device can't handle zero-length input */
+	if (cmd_q->ccp->vdata->version == CCP_VERSION(3, 0)) {
 
-		/* Not final, just return */
-		if (!sha->final)
+		if (!sha->src_len) {
+			unsigned int digest_len;
+			const u8 *sha_zero;
+
+			/* Not final, just return */
+			if (!sha->final)
+				return 0;
+
+			/* CCP can't do a zero length sha operation so the
+			 * caller must buffer the data.
+			 */
+			if (sha->msg_bits)
+				return -EINVAL;
+
+			/* The CCP cannot perform zero-length sha operations
+			 * so the caller is required to buffer data for the
+			 * final operation. However, a sha operation for a
+			 * message with a total length of zero is valid so
+			 * known values are required to supply the result.
+			 */
+			switch (sha->type) {
+			case CCP_SHA_TYPE_1:
+				sha_zero = sha1_zero_message_hash;
+				digest_len = SHA1_DIGEST_SIZE;
+				break;
+			case CCP_SHA_TYPE_224:
+				sha_zero = sha224_zero_message_hash;
+				digest_len = SHA224_DIGEST_SIZE;
+				break;
+			case CCP_SHA_TYPE_256:
+				sha_zero = sha256_zero_message_hash;
+				digest_len = SHA256_DIGEST_SIZE;
+				break;
+			default:
+				return -EINVAL;
+			}
+
+			scatterwalk_map_and_copy((void *)sha_zero, sha->ctx, 0,
+						 digest_len, 1);
+
 			return 0;
-
-		/* CCP can't do a zero length sha operation so the caller
-		 * must buffer the data.
-		 */
-		if (sha->msg_bits)
-			return -EINVAL;
-
-		/* The CCP cannot perform zero-length sha operations so the
-		 * caller is required to buffer data for the final operation.
-		 * However, a sha operation for a message with a total length
-		 * of zero is valid so known values are required to supply
-		 * the result.
-		 */
-		switch (sha->type) {
-		case CCP_SHA_TYPE_1:
-			sha_zero = sha1_zero_message_hash;
-			break;
-		case CCP_SHA_TYPE_224:
-			sha_zero = sha224_zero_message_hash;
-			break;
-		case CCP_SHA_TYPE_256:
-			sha_zero = sha256_zero_message_hash;
-			break;
-		default:
-			return -EINVAL;
 		}
-
-		scatterwalk_map_and_copy((void *)sha_zero, sha->ctx, 0,
-					 sha->ctx_len, 1);
-
-		return 0;
 	}
 
-	if (!sha->src)
-		return -EINVAL;
+	/* Set variables used throughout */
+	switch (sha->type) {
+	case CCP_SHA_TYPE_1:
+		digest_size = SHA1_DIGEST_SIZE;
+		init = (void *) ccp_sha1_init;
+		ctx_size = SHA1_DIGEST_SIZE;
+		sb_count = 1;
+		if (cmd_q->ccp->vdata->version != CCP_VERSION(3, 0))
+			ooffset = ioffset = CCP_SB_BYTES - SHA1_DIGEST_SIZE;
+		else
+			ooffset = ioffset = 0;
+		break;
+	case CCP_SHA_TYPE_224:
+		digest_size = SHA224_DIGEST_SIZE;
+		init = (void *) ccp_sha224_init;
+		ctx_size = SHA256_DIGEST_SIZE;
+		sb_count = 1;
+		ioffset = 0;
+		if (cmd_q->ccp->vdata->version != CCP_VERSION(3, 0))
+			ooffset = CCP_SB_BYTES - SHA224_DIGEST_SIZE;
+		else
+			ooffset = 0;
+		break;
+	case CCP_SHA_TYPE_256:
+		digest_size = SHA256_DIGEST_SIZE;
+		init = (void *) ccp_sha256_init;
+		ctx_size = SHA256_DIGEST_SIZE;
+		sb_count = 1;
+		ooffset = ioffset = 0;
+		break;
+	default:
+		ret = -EINVAL;
+		goto e_data;
+	}
 
-	BUILD_BUG_ON(CCP_SHA_KSB_COUNT != 1);
+	/* For zero-length plaintext the src pointer is ignored;
+	 * otherwise both the pointer and the length must be valid
+	 */
+	if (sha->src_len && !sha->src)
+		return -EINVAL;
 
 	memset(&op, 0, sizeof(op));
 	op.cmd_q = cmd_q;
-	op.jobid = ccp_gen_jobid(cmd_q->ccp);
-	op.ksb_ctx = cmd_q->ksb_ctx;
+	op.jobid = CCP_NEW_JOBID(cmd_q->ccp);
+	op.sb_ctx = cmd_q->sb_ctx; /* Pre-allocated */
 	op.u.sha.type = sha->type;
 	op.u.sha.msg_bits = sha->msg_bits;
 
-	/* The SHA context fits in a single (32-byte) KSB entry and
-	 * must be in little endian format. Use the 256-bit byte swap
-	 * passthru option to convert from big endian to little endian.
-	 */
-	ret = ccp_init_dm_workarea(&ctx, cmd_q,
-				   CCP_SHA_KSB_COUNT * CCP_KSB_BYTES,
+	ret = ccp_init_dm_workarea(&ctx, cmd_q, sb_count * CCP_SB_BYTES,
 				   DMA_BIDIRECTIONAL);
 	if (ret)
 		return ret;
-
 	if (sha->first) {
-		const __be32 *init;
-
 		switch (sha->type) {
 		case CCP_SHA_TYPE_1:
-			init = ccp_sha1_init;
-			break;
 		case CCP_SHA_TYPE_224:
-			init = ccp_sha224_init;
-			break;
 		case CCP_SHA_TYPE_256:
-			init = ccp_sha256_init;
+			memcpy(ctx.address + ioffset, init, ctx_size);
 			break;
 		default:
 			ret = -EINVAL;
 			goto e_ctx;
 		}
-		memcpy(ctx.address, init, CCP_SHA_CTXSIZE);
 	} else {
-		ccp_set_dm_area(&ctx, 0, sha->ctx, 0, sha->ctx_len);
+		/* Restore the context */
+		ccp_set_dm_area(&ctx, 0, sha->ctx, 0,
+				sb_count * CCP_SB_BYTES);
 	}
 
-	ret = ccp_copy_to_ksb(cmd_q, &ctx, op.jobid, op.ksb_ctx,
-			      CCP_PASSTHRU_BYTESWAP_256BIT);
+	ret = ccp_copy_to_sb(cmd_q, &ctx, op.jobid, op.sb_ctx,
+			     CCP_PASSTHRU_BYTESWAP_256BIT);
 	if (ret) {
 		cmd->engine_error = cmd_q->cmd_error;
 		goto e_ctx;
 	}
 
-	/* Send data to the CCP SHA engine */
-	ret = ccp_init_data(&src, cmd_q, sha->src, sha->src_len,
-			    CCP_SHA_BLOCKSIZE, DMA_TO_DEVICE);
-	if (ret)
-		goto e_ctx;
+	if (sha->src) {
+		/* Send data to the CCP SHA engine; block_size is set above */
+		ret = ccp_init_data(&src, cmd_q, sha->src, sha->src_len,
+				    block_size, DMA_TO_DEVICE);
+		if (ret)
+			goto e_ctx;
 
-	while (src.sg_wa.bytes_left) {
-		ccp_prepare_data(&src, NULL, &op, CCP_SHA_BLOCKSIZE, false);
-		if (sha->final && !src.sg_wa.bytes_left)
-			op.eom = 1;
+		while (src.sg_wa.bytes_left) {
+			ccp_prepare_data(&src, NULL, &op, block_size, false);
+			if (sha->final && !src.sg_wa.bytes_left)
+				op.eom = 1;
 
-		ret = cmd_q->ccp->vdata->perform->perform_sha(&op);
+			ret = cmd_q->ccp->vdata->perform->sha(&op);
+			if (ret) {
+				cmd->engine_error = cmd_q->cmd_error;
+				goto e_data;
+			}
+
+			ccp_process_data(&src, NULL, &op);
+		}
+	} else {
+		op.eom = 1;
+		ret = cmd_q->ccp->vdata->perform->sha(&op);
 		if (ret) {
 			cmd->engine_error = cmd_q->cmd_error;
 			goto e_data;
 		}
-
-		ccp_process_data(&src, NULL, &op);
 	}
 
 	/* Retrieve the SHA context - convert from LE to BE using
 	 * 32-byte (256-bit) byteswapping
 	 */
-	ret = ccp_copy_from_ksb(cmd_q, &ctx, op.jobid, op.ksb_ctx,
-				CCP_PASSTHRU_BYTESWAP_256BIT);
+	ret = ccp_copy_from_sb(cmd_q, &ctx, op.jobid, op.sb_ctx,
+			       CCP_PASSTHRU_BYTESWAP_256BIT);
 	if (ret) {
 		cmd->engine_error = cmd_q->cmd_error;
 		goto e_data;
 	}
 
-	ccp_get_dm_area(&ctx, 0, sha->ctx, 0, sha->ctx_len);
+	if (sha->final) {
+		/* Finishing up, so get the digest */
+		switch (sha->type) {
+		case CCP_SHA_TYPE_1:
+		case CCP_SHA_TYPE_224:
+		case CCP_SHA_TYPE_256:
+			ccp_get_dm_area(&ctx, ooffset,
+					sha->ctx, 0,
+					digest_size);
+			break;
+		default:
+			ret = -EINVAL;
+			goto e_ctx;
+		}
+	} else {
+		/* Stash the context */
+		ccp_get_dm_area(&ctx, 0, sha->ctx, 0,
+				sb_count * CCP_SB_BYTES);
+	}
 
 	if (sha->final && sha->opad) {
 		/* HMAC operation, recursively perform final SHA */
 		struct ccp_cmd hmac_cmd;
 		struct scatterlist sg;
-		u64 block_size, digest_size;
 		u8 *hmac_buf;
 
-		switch (sha->type) {
-		case CCP_SHA_TYPE_1:
-			block_size = SHA1_BLOCK_SIZE;
-			digest_size = SHA1_DIGEST_SIZE;
-			break;
-		case CCP_SHA_TYPE_224:
-			block_size = SHA224_BLOCK_SIZE;
-			digest_size = SHA224_DIGEST_SIZE;
-			break;
-		case CCP_SHA_TYPE_256:
-			block_size = SHA256_BLOCK_SIZE;
-			digest_size = SHA256_DIGEST_SIZE;
-			break;
-		default:
-			ret = -EINVAL;
-			goto e_data;
-		}
-
 		if (sha->opad_len != block_size) {
 			ret = -EINVAL;
 			goto e_data;
@@ -1153,7 +1174,18 @@
 		sg_init_one(&sg, hmac_buf, block_size + digest_size);
 
 		scatterwalk_map_and_copy(hmac_buf, sha->opad, 0, block_size, 0);
-		memcpy(hmac_buf + block_size, ctx.address, digest_size);
+		switch (sha->type) {
+		case CCP_SHA_TYPE_1:
+		case CCP_SHA_TYPE_224:
+		case CCP_SHA_TYPE_256:
+			memcpy(hmac_buf + block_size,
+			       ctx.address + ooffset,
+			       digest_size);
+			break;
+		default:
+			ret = -EINVAL;
+			goto e_ctx;
+		}
 
 		memset(&hmac_cmd, 0, sizeof(hmac_cmd));
 		hmac_cmd.engine = CCP_ENGINE_SHA;
@@ -1176,7 +1208,8 @@
 	}
 
 e_data:
-	ccp_free_data(&src, cmd_q);
+	if (sha->src)
+		ccp_free_data(&src, cmd_q);
 
 e_ctx:
 	ccp_dm_free(&ctx);
@@ -1190,7 +1223,7 @@
 	struct ccp_dm_workarea exp, src;
 	struct ccp_data dst;
 	struct ccp_op op;
-	unsigned int ksb_count, i_len, o_len;
+	unsigned int sb_count, i_len, o_len;
 	int ret;
 
 	if (rsa->key_size > CCP_RSA_MAX_WIDTH)
@@ -1208,16 +1241,17 @@
 	o_len = ((rsa->key_size + 255) / 256) * 32;
 	i_len = o_len * 2;
 
-	ksb_count = o_len / CCP_KSB_BYTES;
+	sb_count = o_len / CCP_SB_BYTES;
 
 	memset(&op, 0, sizeof(op));
 	op.cmd_q = cmd_q;
 	op.jobid = ccp_gen_jobid(cmd_q->ccp);
-	op.ksb_key = ccp_alloc_ksb(cmd_q->ccp, ksb_count);
-	if (!op.ksb_key)
+	op.sb_key = cmd_q->ccp->vdata->perform->sballoc(cmd_q, sb_count);
+
+	if (!op.sb_key)
 		return -EIO;
 
-	/* The RSA exponent may span multiple (32-byte) KSB entries and must
+	/* The RSA exponent may span multiple (32-byte) SB entries and must
 	 * be in little endian format. Reverse copy each 32-byte chunk
 	 * of the exponent (En chunk to E0 chunk, E(n-1) chunk to E1 chunk)
 	 * and each byte within that chunk and do not perform any byte swap
@@ -1225,14 +1259,14 @@
 	 */
 	ret = ccp_init_dm_workarea(&exp, cmd_q, o_len, DMA_TO_DEVICE);
 	if (ret)
-		goto e_ksb;
+		goto e_sb;
 
 	ret = ccp_reverse_set_dm_area(&exp, rsa->exp, rsa->exp_len,
-				      CCP_KSB_BYTES, false);
+				      CCP_SB_BYTES, false);
 	if (ret)
 		goto e_exp;
-	ret = ccp_copy_to_ksb(cmd_q, &exp, op.jobid, op.ksb_key,
-			      CCP_PASSTHRU_BYTESWAP_NOOP);
+	ret = ccp_copy_to_sb(cmd_q, &exp, op.jobid, op.sb_key,
+			     CCP_PASSTHRU_BYTESWAP_NOOP);
 	if (ret) {
 		cmd->engine_error = cmd_q->cmd_error;
 		goto e_exp;
@@ -1247,12 +1281,12 @@
 		goto e_exp;
 
 	ret = ccp_reverse_set_dm_area(&src, rsa->mod, rsa->mod_len,
-				      CCP_KSB_BYTES, false);
+				      CCP_SB_BYTES, false);
 	if (ret)
 		goto e_src;
 	src.address += o_len;	/* Adjust the address for the copy operation */
 	ret = ccp_reverse_set_dm_area(&src, rsa->src, rsa->src_len,
-				      CCP_KSB_BYTES, false);
+				      CCP_SB_BYTES, false);
 	if (ret)
 		goto e_src;
 	src.address -= o_len;	/* Reset the address to original value */
@@ -1274,7 +1308,7 @@
 	op.u.rsa.mod_size = rsa->key_size;
 	op.u.rsa.input_len = i_len;
 
-	ret = cmd_q->ccp->vdata->perform->perform_rsa(&op);
+	ret = cmd_q->ccp->vdata->perform->rsa(&op);
 	if (ret) {
 		cmd->engine_error = cmd_q->cmd_error;
 		goto e_dst;
@@ -1291,8 +1325,8 @@
 e_exp:
 	ccp_dm_free(&exp);
 
-e_ksb:
-	ccp_free_ksb(cmd_q->ccp, op.ksb_key, ksb_count);
+e_sb:
+	cmd_q->ccp->vdata->perform->sbfree(cmd_q, op.sb_key, sb_count);
 
 	return ret;
 }
@@ -1306,7 +1340,7 @@
 	struct ccp_op op;
 	bool in_place = false;
 	unsigned int i;
-	int ret;
+	int ret = 0;
 
 	if (!pt->final && (pt->src_len & (CCP_PASSTHRU_BLOCKSIZE - 1)))
 		return -EINVAL;
@@ -1321,26 +1355,26 @@
 			return -EINVAL;
 	}
 
-	BUILD_BUG_ON(CCP_PASSTHRU_KSB_COUNT != 1);
+	BUILD_BUG_ON(CCP_PASSTHRU_SB_COUNT != 1);
 
 	memset(&op, 0, sizeof(op));
 	op.cmd_q = cmd_q;
-	op.jobid = ccp_gen_jobid(cmd_q->ccp);
+	op.jobid = CCP_NEW_JOBID(cmd_q->ccp);
 
 	if (pt->bit_mod != CCP_PASSTHRU_BITWISE_NOOP) {
 		/* Load the mask */
-		op.ksb_key = cmd_q->ksb_key;
+		op.sb_key = cmd_q->sb_key;
 
 		ret = ccp_init_dm_workarea(&mask, cmd_q,
-					   CCP_PASSTHRU_KSB_COUNT *
-					   CCP_KSB_BYTES,
+					   CCP_PASSTHRU_SB_COUNT *
+					   CCP_SB_BYTES,
 					   DMA_TO_DEVICE);
 		if (ret)
 			return ret;
 
 		ccp_set_dm_area(&mask, 0, pt->mask, 0, pt->mask_len);
-		ret = ccp_copy_to_ksb(cmd_q, &mask, op.jobid, op.ksb_key,
-				      CCP_PASSTHRU_BYTESWAP_NOOP);
+		ret = ccp_copy_to_sb(cmd_q, &mask, op.jobid, op.sb_key,
+				     CCP_PASSTHRU_BYTESWAP_NOOP);
 		if (ret) {
 			cmd->engine_error = cmd_q->cmd_error;
 			goto e_mask;
@@ -1399,7 +1433,7 @@
 		op.dst.u.dma.offset = dst.sg_wa.sg_used;
 		op.dst.u.dma.length = op.src.u.dma.length;
 
-		ret = cmd_q->ccp->vdata->perform->perform_passthru(&op);
+		ret = cmd_q->ccp->vdata->perform->passthru(&op);
 		if (ret) {
 			cmd->engine_error = cmd_q->cmd_error;
 			goto e_dst;
@@ -1448,7 +1482,7 @@
 			return -EINVAL;
 	}
 
-	BUILD_BUG_ON(CCP_PASSTHRU_KSB_COUNT != 1);
+	BUILD_BUG_ON(CCP_PASSTHRU_SB_COUNT != 1);
 
 	memset(&op, 0, sizeof(op));
 	op.cmd_q = cmd_q;
@@ -1456,13 +1490,13 @@
 
 	if (pt->bit_mod != CCP_PASSTHRU_BITWISE_NOOP) {
 		/* Load the mask */
-		op.ksb_key = cmd_q->ksb_key;
+		op.sb_key = cmd_q->sb_key;
 
 		mask.length = pt->mask_len;
 		mask.dma.address = pt->mask;
 		mask.dma.length = pt->mask_len;
 
-		ret = ccp_copy_to_ksb(cmd_q, &mask, op.jobid, op.ksb_key,
+		ret = ccp_copy_to_sb(cmd_q, &mask, op.jobid, op.sb_key,
 				     CCP_PASSTHRU_BYTESWAP_NOOP);
 		if (ret) {
 			cmd->engine_error = cmd_q->cmd_error;
@@ -1484,7 +1518,7 @@
 	op.dst.u.dma.offset = 0;
 	op.dst.u.dma.length = pt->src_len;
 
-	ret = cmd_q->ccp->vdata->perform->perform_passthru(&op);
+	ret = cmd_q->ccp->vdata->perform->passthru(&op);
 	if (ret)
 		cmd->engine_error = cmd_q->cmd_error;
 
@@ -1514,7 +1548,7 @@
 
 	memset(&op, 0, sizeof(op));
 	op.cmd_q = cmd_q;
-	op.jobid = ccp_gen_jobid(cmd_q->ccp);
+	op.jobid = CCP_NEW_JOBID(cmd_q->ccp);
 
 	/* Concatenate the modulus and the operands. Both the modulus and
 	 * the operands must be in little endian format.  Since the input
@@ -1575,7 +1609,7 @@
 
 	op.u.ecc.function = cmd->u.ecc.function;
 
-	ret = cmd_q->ccp->vdata->perform->perform_ecc(&op);
+	ret = cmd_q->ccp->vdata->perform->ecc(&op);
 	if (ret) {
 		cmd->engine_error = cmd_q->cmd_error;
 		goto e_dst;
@@ -1639,7 +1673,7 @@
 
 	memset(&op, 0, sizeof(op));
 	op.cmd_q = cmd_q;
-	op.jobid = ccp_gen_jobid(cmd_q->ccp);
+	op.jobid = CCP_NEW_JOBID(cmd_q->ccp);
 
 	/* Concatenate the modulus and the operands. Both the modulus and
 	 * the operands must be in little endian format.  Since the input
@@ -1677,7 +1711,7 @@
 		goto e_src;
 	src.address += CCP_ECC_OPERAND_SIZE;
 
-	/* Set the first point Z coordianate to 1 */
+	/* Set the first point Z coordinate to 1 */
 	*src.address = 0x01;
 	src.address += CCP_ECC_OPERAND_SIZE;
 
@@ -1696,7 +1730,7 @@
 			goto e_src;
 		src.address += CCP_ECC_OPERAND_SIZE;
 
-		/* Set the second point Z coordianate to 1 */
+		/* Set the second point Z coordinate to 1 */
 		*src.address = 0x01;
 		src.address += CCP_ECC_OPERAND_SIZE;
 	} else {
@@ -1739,7 +1773,7 @@
 
 	op.u.ecc.function = cmd->u.ecc.function;
 
-	ret = cmd_q->ccp->vdata->perform->perform_ecc(&op);
+	ret = cmd_q->ccp->vdata->perform->ecc(&op);
 	if (ret) {
 		cmd->engine_error = cmd_q->cmd_error;
 		goto e_dst;
@@ -1810,7 +1844,7 @@
 	cmd->engine_error = 0;
 	cmd_q->cmd_error = 0;
 	cmd_q->int_rcvd = 0;
-	cmd_q->free_slots = CMD_Q_DEPTH(ioread32(cmd_q->reg_status));
+	cmd_q->free_slots = cmd_q->ccp->vdata->perform->get_free_slots(cmd_q);
 
 	switch (cmd->engine) {
 	case CCP_ENGINE_AES:
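
Throughout ccp-ops.c the KSB (key storage block) naming becomes the more
general SB (storage block), and every hardware operation is now dispatched
through a per-version action table instead of a perform_* helper.  A
minimal sketch of such a table, with member names taken from the calls
above but a layout that is only an assumption about the real ccp-dev.h
definition:

struct ccp_actions_sketch {
	int (*aes)(struct ccp_op *op);
	int (*xts_aes)(struct ccp_op *op);
	int (*sha)(struct ccp_op *op);
	int (*rsa)(struct ccp_op *op);
	int (*passthru)(struct ccp_op *op);
	int (*ecc)(struct ccp_op *op);
	u32 (*sballoc)(struct ccp_cmd_queue *cmd_q, unsigned int count);
	void (*sbfree)(struct ccp_cmd_queue *cmd_q, unsigned int start,
		       unsigned int count);
	unsigned int (*get_free_slots)(struct ccp_cmd_queue *cmd_q);
	int (*init)(struct ccp_device *ccp);
};

Dispatching through cmd_q->ccp->vdata->perform lets different device
generations share all of the command marshalling logic while differing
only in how the hardware is driven.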
diff --git a/drivers/crypto/ccp/ccp-pci.c b/drivers/crypto/ccp/ccp-pci.c
index 0bf262e..28a9996 100644
--- a/drivers/crypto/ccp/ccp-pci.c
+++ b/drivers/crypto/ccp/ccp-pci.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2013,2016 Advanced Micro Devices, Inc.
  *
  * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ * Author: Gary R Hook <gary.hook@amd.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -25,9 +26,6 @@
 
 #include "ccp-dev.h"
 
-#define IO_BAR				2
-#define IO_OFFSET			0x20000
-
 #define MSIX_VECTORS			2
 
 struct ccp_msix {
@@ -143,10 +141,11 @@
 			free_irq(ccp_pci->msix[ccp_pci->msix_count].vector,
 				 dev);
 		pci_disable_msix(pdev);
-	} else {
+	} else if (ccp->irq) {
 		free_irq(ccp->irq, dev);
 		pci_disable_msi(pdev);
 	}
+	ccp->irq = 0;
 }
 
 static int ccp_find_mmio_area(struct ccp_device *ccp)
@@ -156,10 +155,11 @@
 	resource_size_t io_len;
 	unsigned long io_flags;
 
-	io_flags = pci_resource_flags(pdev, IO_BAR);
-	io_len = pci_resource_len(pdev, IO_BAR);
-	if ((io_flags & IORESOURCE_MEM) && (io_len >= (IO_OFFSET + 0x800)))
-		return IO_BAR;
+	io_flags = pci_resource_flags(pdev, ccp->vdata->bar);
+	io_len = pci_resource_len(pdev, ccp->vdata->bar);
+	if ((io_flags & IORESOURCE_MEM) &&
+	    (io_len >= (ccp->vdata->offset + 0x800)))
+		return ccp->vdata->bar;
 
 	return -EIO;
 }
@@ -216,7 +216,7 @@
 		dev_err(dev, "pci_iomap failed\n");
 		goto e_device;
 	}
-	ccp->io_regs = ccp->io_map + IO_OFFSET;
+	ccp->io_regs = ccp->io_map + ccp->vdata->offset;
 
 	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(48));
 	if (ret) {
@@ -230,6 +230,9 @@
 
 	dev_set_drvdata(dev, ccp);
 
+	if (ccp->vdata->setup)
+		ccp->vdata->setup(ccp);
+
 	ret = ccp->vdata->perform->init(ccp);
 	if (ret)
 		goto e_iomap;
@@ -322,6 +325,8 @@
 
 static const struct pci_device_id ccp_pci_table[] = {
 	{ PCI_VDEVICE(AMD, 0x1537), (kernel_ulong_t)&ccpv3 },
+	{ PCI_VDEVICE(AMD, 0x1456), (kernel_ulong_t)&ccpv5a },
+	{ PCI_VDEVICE(AMD, 0x1468), (kernel_ulong_t)&ccpv5b },
 	/* Last entry must be zero */
 	{ 0, }
 };
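
The new device IDs above bind each part to version-specific data, and
ccp_find_mmio_area() and the iomap setup now take the BAR number and
register offset from that data instead of the removed IO_BAR/IO_OFFSET
constants.  A sketch of the shape implied by the uses in this file (the
actual definition lives in ccp-dev.h and may differ):

struct ccp_vdata_sketch {
	const unsigned int version;		/* e.g. CCP_VERSION(3, 0) */
	void (*setup)(struct ccp_device *ccp);	/* optional, may be NULL */
	const struct ccp_actions_sketch *perform;
	const unsigned int bar;			/* BAR holding the MMIO regs */
	const unsigned int offset;		/* register block offset */
};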
diff --git a/drivers/crypto/hifn_795x.c b/drivers/crypto/hifn_795x.c
index eee2c7e..e09d405 100644
--- a/drivers/crypto/hifn_795x.c
+++ b/drivers/crypto/hifn_795x.c
@@ -636,20 +636,12 @@
 
 static inline u32 hifn_read_0(struct hifn_device *dev, u32 reg)
 {
-	u32 ret;
-
-	ret = readl(dev->bar[0] + reg);
-
-	return ret;
+	return readl(dev->bar[0] + reg);
 }
 
 static inline u32 hifn_read_1(struct hifn_device *dev, u32 reg)
 {
-	u32 ret;
-
-	ret = readl(dev->bar[1] + reg);
-
-	return ret;
+	return readl(dev->bar[1] + reg);
 }
 
 static inline void hifn_write_0(struct hifn_device *dev, u32 reg, u32 val)
diff --git a/drivers/crypto/img-hash.c b/drivers/crypto/img-hash.c
index 68e8aa9..a2e77b8 100644
--- a/drivers/crypto/img-hash.c
+++ b/drivers/crypto/img-hash.c
@@ -71,6 +71,7 @@
 #define DRIVER_FLAGS_MD5		BIT(21)
 
 #define IMG_HASH_QUEUE_LENGTH		20
+#define IMG_HASH_DMA_BURST		4
 #define IMG_HASH_DMA_THRESHOLD		64
 
 #ifdef __LITTLE_ENDIAN
@@ -102,8 +103,10 @@
 	unsigned long		op;
 
 	size_t			bufcnt;
-	u8 buffer[0] __aligned(sizeof(u32));
 	struct ahash_request	fallback_req;
+
+	/* Zero length buffer must remain last member of struct */
+	u8 buffer[0] __aligned(sizeof(u32));
 };
 
 struct img_hash_ctx {
@@ -340,7 +343,7 @@
 	dma_conf.direction = DMA_MEM_TO_DEV;
 	dma_conf.dst_addr = hdev->bus_addr;
 	dma_conf.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
-	dma_conf.dst_maxburst = 16;
+	dma_conf.dst_maxburst = IMG_HASH_DMA_BURST;
 	dma_conf.device_fc = false;
 
 	err = dmaengine_slave_config(hdev->dma_lch,  &dma_conf);
@@ -361,7 +364,7 @@
 	size_t nbytes, bleft, wsend, len, tbc;
 	struct scatterlist tsg;
 
-	if (!ctx->sg)
+	if (!hdev->req || !ctx->sg)
 		return;
 
 	addr = sg_virt(ctx->sg);
@@ -587,6 +590,32 @@
 	return crypto_ahash_finup(&rctx->fallback_req);
 }
 
+static int img_hash_import(struct ahash_request *req, const void *in)
+{
+	struct img_hash_request_ctx *rctx = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct img_hash_ctx *ctx = crypto_ahash_ctx(tfm);
+
+	ahash_request_set_tfm(&rctx->fallback_req, ctx->fallback);
+	rctx->fallback_req.base.flags = req->base.flags
+		& CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	return crypto_ahash_import(&rctx->fallback_req, in);
+}
+
+static int img_hash_export(struct ahash_request *req, void *out)
+{
+	struct img_hash_request_ctx *rctx = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct img_hash_ctx *ctx = crypto_ahash_ctx(tfm);
+
+	ahash_request_set_tfm(&rctx->fallback_req, ctx->fallback);
+	rctx->fallback_req.base.flags = req->base.flags
+		& CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	return crypto_ahash_export(&rctx->fallback_req, out);
+}
+
 static int img_hash_digest(struct ahash_request *req)
 {
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
@@ -643,10 +672,9 @@
 	return err;
 }
 
-static int img_hash_cra_init(struct crypto_tfm *tfm)
+static int img_hash_cra_init(struct crypto_tfm *tfm, const char *alg_name)
 {
 	struct img_hash_ctx *ctx = crypto_tfm_ctx(tfm);
-	const char *alg_name = crypto_tfm_alg_name(tfm);
 	int err = -ENOMEM;
 
 	ctx->fallback = crypto_alloc_ahash(alg_name, 0,
@@ -658,6 +686,7 @@
 	}
 	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
 				 sizeof(struct img_hash_request_ctx) +
+				 crypto_ahash_reqsize(ctx->fallback) +
 				 IMG_HASH_DMA_THRESHOLD);
 
 	return 0;
@@ -666,6 +695,26 @@
 	return err;
 }
 
+static int img_hash_cra_md5_init(struct crypto_tfm *tfm)
+{
+	return img_hash_cra_init(tfm, "md5-generic");
+}
+
+static int img_hash_cra_sha1_init(struct crypto_tfm *tfm)
+{
+	return img_hash_cra_init(tfm, "sha1-generic");
+}
+
+static int img_hash_cra_sha224_init(struct crypto_tfm *tfm)
+{
+	return img_hash_cra_init(tfm, "sha224-generic");
+}
+
+static int img_hash_cra_sha256_init(struct crypto_tfm *tfm)
+{
+	return img_hash_cra_init(tfm, "sha256-generic");
+}
+
 static void img_hash_cra_exit(struct crypto_tfm *tfm)
 {
 	struct img_hash_ctx *tctx = crypto_tfm_ctx(tfm);
@@ -711,9 +760,12 @@
 		.update = img_hash_update,
 		.final = img_hash_final,
 		.finup = img_hash_finup,
+		.export = img_hash_export,
+		.import = img_hash_import,
 		.digest = img_hash_digest,
 		.halg = {
 			.digestsize = MD5_DIGEST_SIZE,
+			.statesize = sizeof(struct md5_state),
 			.base = {
 				.cra_name = "md5",
 				.cra_driver_name = "img-md5",
@@ -723,7 +775,7 @@
 				CRYPTO_ALG_NEED_FALLBACK,
 				.cra_blocksize = MD5_HMAC_BLOCK_SIZE,
 				.cra_ctxsize = sizeof(struct img_hash_ctx),
-				.cra_init = img_hash_cra_init,
+				.cra_init = img_hash_cra_md5_init,
 				.cra_exit = img_hash_cra_exit,
 				.cra_module = THIS_MODULE,
 			}
@@ -734,9 +786,12 @@
 		.update = img_hash_update,
 		.final = img_hash_final,
 		.finup = img_hash_finup,
+		.export = img_hash_export,
+		.import = img_hash_import,
 		.digest = img_hash_digest,
 		.halg = {
 			.digestsize = SHA1_DIGEST_SIZE,
+			.statesize = sizeof(struct sha1_state),
 			.base = {
 				.cra_name = "sha1",
 				.cra_driver_name = "img-sha1",
@@ -746,7 +801,7 @@
 				CRYPTO_ALG_NEED_FALLBACK,
 				.cra_blocksize = SHA1_BLOCK_SIZE,
 				.cra_ctxsize = sizeof(struct img_hash_ctx),
-				.cra_init = img_hash_cra_init,
+				.cra_init = img_hash_cra_sha1_init,
 				.cra_exit = img_hash_cra_exit,
 				.cra_module = THIS_MODULE,
 			}
@@ -757,9 +812,12 @@
 		.update = img_hash_update,
 		.final = img_hash_final,
 		.finup = img_hash_finup,
+		.export = img_hash_export,
+		.import = img_hash_import,
 		.digest = img_hash_digest,
 		.halg = {
 			.digestsize = SHA224_DIGEST_SIZE,
+			.statesize = sizeof(struct sha256_state),
 			.base = {
 				.cra_name = "sha224",
 				.cra_driver_name = "img-sha224",
@@ -769,7 +827,7 @@
 				CRYPTO_ALG_NEED_FALLBACK,
 				.cra_blocksize = SHA224_BLOCK_SIZE,
 				.cra_ctxsize = sizeof(struct img_hash_ctx),
-				.cra_init = img_hash_cra_init,
+				.cra_init = img_hash_cra_sha224_init,
 				.cra_exit = img_hash_cra_exit,
 				.cra_module = THIS_MODULE,
 			}
@@ -780,9 +838,12 @@
 		.update = img_hash_update,
 		.final = img_hash_final,
 		.finup = img_hash_finup,
+		.export = img_hash_export,
+		.import = img_hash_import,
 		.digest = img_hash_digest,
 		.halg = {
 			.digestsize = SHA256_DIGEST_SIZE,
+			.statesize = sizeof(struct sha256_state),
 			.base = {
 				.cra_name = "sha256",
 				.cra_driver_name = "img-sha256",
@@ -792,7 +853,7 @@
 				CRYPTO_ALG_NEED_FALLBACK,
 				.cra_blocksize = SHA256_BLOCK_SIZE,
 				.cra_ctxsize = sizeof(struct img_hash_ctx),
-				.cra_init = img_hash_cra_init,
+				.cra_init = img_hash_cra_sha256_init,
 				.cra_exit = img_hash_cra_exit,
 				.cra_module = THIS_MODULE,
 			}
@@ -971,7 +1032,7 @@
 	err = img_register_algs(hdev);
 	if (err)
 		goto err_algs;
-	dev_dbg(dev, "Img MD5/SHA1/SHA224/SHA256 Hardware accelerator initialized\n");
+	dev_info(dev, "Img MD5/SHA1/SHA224/SHA256 Hardware accelerator initialized\n");
 
 	return 0;
 
@@ -1013,11 +1074,38 @@
 	return 0;
 }
 
+#ifdef CONFIG_PM_SLEEP
+static int img_hash_suspend(struct device *dev)
+{
+	struct img_hash_dev *hdev = dev_get_drvdata(dev);
+
+	clk_disable_unprepare(hdev->hash_clk);
+	clk_disable_unprepare(hdev->sys_clk);
+
+	return 0;
+}
+
+static int img_hash_resume(struct device *dev)
+{
+	struct img_hash_dev *hdev = dev_get_drvdata(dev);
+
+	clk_prepare_enable(hdev->hash_clk);
+	clk_prepare_enable(hdev->sys_clk);
+
+	return 0;
+}
+#endif /* CONFIG_PM_SLEEP */
+
+static const struct dev_pm_ops img_hash_pm_ops = {
+	SET_SYSTEM_SLEEP_PM_OPS(img_hash_suspend, img_hash_resume)
+};
+
 static struct platform_driver img_hash_driver = {
 	.probe		= img_hash_probe,
 	.remove		= img_hash_remove,
 	.driver		= {
 		.name	= "img-hash-accelerator",
+		.pm	= &img_hash_pm_ops,
 		.of_match_table	= of_match_ptr(img_hash_match),
 	}
 };
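
Two details make the img-hash export/import support above safe: the
request context must reserve room for the fallback's own request context
(the crypto_ahash_reqsize(ctx->fallback) term added in
img_hash_cra_init()), and each algorithm's statesize must match the state
struct the software fallback exports.  A minimal sketch of the delegation
pattern, with "my_drv_" names standing in for the driver's:

struct my_drv_reqctx {
	/* driver-private fields would go here */
	struct ahash_request fallback_req;
	/* the fallback's request context extends past this struct, which
	 * is why cra_init must add crypto_ahash_reqsize(fallback) to the
	 * reqsize it registers
	 */
};

static int my_drv_export(struct ahash_request *req, void *out)
{
	struct my_drv_reqctx *rctx = ahash_request_ctx(req);

	/* assumes fallback_req was bound to the fallback tfm beforehand;
	 * the whole intermediate state round-trips through software
	 */
	return crypto_ahash_export(&rctx->fallback_req, out);
}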
diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c
index 2296934..7868765 100644
--- a/drivers/crypto/ixp4xx_crypto.c
+++ b/drivers/crypto/ixp4xx_crypto.c
@@ -447,9 +447,8 @@
 
 	if (!npe_running(npe_c)) {
 		ret = npe_load_firmware(npe_c, npe_name(npe_c), dev);
-		if (ret) {
-			return ret;
-		}
+		if (ret)
+			goto npe_release;
 		if (npe_recv_message(npe_c, msg, "STATUS_MSG"))
 			goto npe_error;
 	} else {
@@ -473,7 +472,8 @@
 	default:
 		printk(KERN_ERR "Firmware of %s lacks crypto support\n",
 			npe_name(npe_c));
-		return -ENODEV;
+		ret = -ENODEV;
+		goto npe_release;
 	}
 	/* buffer_pool will also sometimes be used to store the hmac,
 	 * so ensure it is large enough
@@ -512,6 +512,7 @@
 err:
 	dma_pool_destroy(ctx_pool);
 	dma_pool_destroy(buffer_pool);
+npe_release:
 	npe_release(npe_c);
 	return ret;
 }
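
The ixp4xx change is purely an error-path fix: a firmware-load failure or
a firmware without crypto support used to return while still holding the
NPE acquired earlier, and the new npe_release label now unwinds it.  A
generic sketch of the unwind pattern, with illustrative names:

int acquire_a(void), acquire_b(void), acquire_c(void);
void release_a(void), release_b(void);

static int setup_resources(void)
{
	int ret;

	ret = acquire_a();	/* cf. taking the NPE */
	if (ret)
		return ret;

	ret = acquire_b();	/* cf. loading the firmware */
	if (ret)
		goto err_a;

	ret = acquire_c();	/* cf. creating the DMA pools */
	if (ret)
		goto err_b;

	return 0;

err_b:
	release_b();
err_a:
	release_a();		/* cf. npe_release() */
	return ret;
}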
diff --git a/drivers/crypto/marvell/cesa.c b/drivers/crypto/marvell/cesa.c
index d64af86..37dadb2 100644
--- a/drivers/crypto/marvell/cesa.c
+++ b/drivers/crypto/marvell/cesa.c
@@ -166,6 +166,7 @@
 			if (!req)
 				break;
 
+			ctx = crypto_tfm_ctx(req->tfm);
 			mv_cesa_complete_req(ctx, req, 0);
 		}
 	}
diff --git a/drivers/crypto/marvell/hash.c b/drivers/crypto/marvell/hash.c
index 82e0f4e6..9f28468 100644
--- a/drivers/crypto/marvell/hash.c
+++ b/drivers/crypto/marvell/hash.c
@@ -374,7 +374,7 @@
 	.complete = mv_cesa_ahash_complete,
 };
 
-static int mv_cesa_ahash_init(struct ahash_request *req,
+static void mv_cesa_ahash_init(struct ahash_request *req,
 			      struct mv_cesa_op_ctx *tmpl, bool algo_le)
 {
 	struct mv_cesa_ahash_req *creq = ahash_request_ctx(req);
@@ -390,8 +390,6 @@
 	creq->op_tmpl = *tmpl;
 	creq->len = 0;
 	creq->algo_le = algo_le;
-
-	return 0;
 }
 
 static inline int mv_cesa_ahash_cra_init(struct crypto_tfm *tfm)
@@ -405,15 +403,16 @@
 	return 0;
 }
 
-static int mv_cesa_ahash_cache_req(struct ahash_request *req, bool *cached)
+static bool mv_cesa_ahash_cache_req(struct ahash_request *req)
 {
 	struct mv_cesa_ahash_req *creq = ahash_request_ctx(req);
+	bool cached = false;
 
-	if (creq->cache_ptr + req->nbytes < 64 && !creq->last_req) {
-		*cached = true;
+	if (creq->cache_ptr + req->nbytes < CESA_MAX_HASH_BLOCK_SIZE && !creq->last_req) {
+		cached = true;
 
 		if (!req->nbytes)
-			return 0;
+			return cached;
 
 		sg_pcopy_to_buffer(req->src, creq->src_nents,
 				   creq->cache + creq->cache_ptr,
@@ -422,7 +421,7 @@
 		creq->cache_ptr += req->nbytes;
 	}
 
-	return 0;
+	return cached;
 }
 
 static struct mv_cesa_op_ctx *
@@ -455,7 +454,6 @@
 
 static int
 mv_cesa_ahash_dma_add_cache(struct mv_cesa_tdma_chain *chain,
-			    struct mv_cesa_ahash_dma_iter *dma_iter,
 			    struct mv_cesa_ahash_req *creq,
 			    gfp_t flags)
 {
@@ -586,7 +584,7 @@
 	 * Add the cache (left-over data from a previous block) first.
 	 * This will never overflow the SRAM size.
 	 */
-	ret = mv_cesa_ahash_dma_add_cache(&basereq->chain, &iter, creq, flags);
+	ret = mv_cesa_ahash_dma_add_cache(&basereq->chain, creq, flags);
 	if (ret)
 		goto err_free_tdma;
 
@@ -668,7 +666,6 @@
 static int mv_cesa_ahash_req_init(struct ahash_request *req, bool *cached)
 {
 	struct mv_cesa_ahash_req *creq = ahash_request_ctx(req);
-	int ret;
 
 	creq->src_nents = sg_nents_for_len(req->src, req->nbytes);
 	if (creq->src_nents < 0) {
@@ -676,17 +673,15 @@
 		return creq->src_nents;
 	}
 
-	ret = mv_cesa_ahash_cache_req(req, cached);
-	if (ret)
-		return ret;
+	*cached = mv_cesa_ahash_cache_req(req);
 
 	if (*cached)
 		return 0;
 
 	if (cesa_dev->caps->has_tdma)
-		ret = mv_cesa_ahash_dma_req_init(req);
-
-	return ret;
+		return mv_cesa_ahash_dma_req_init(req);
+	else
+		return 0;
 }
 
 static int mv_cesa_ahash_queue_req(struct ahash_request *req)
@@ -805,13 +800,14 @@
 	struct mv_cesa_op_ctx tmpl = { };
 
 	mv_cesa_set_op_cfg(&tmpl, CESA_SA_DESC_CFG_MACM_MD5);
+
+	mv_cesa_ahash_init(req, &tmpl, true);
+
 	creq->state[0] = MD5_H0;
 	creq->state[1] = MD5_H1;
 	creq->state[2] = MD5_H2;
 	creq->state[3] = MD5_H3;
 
-	mv_cesa_ahash_init(req, &tmpl, true);
-
 	return 0;
 }
 
@@ -873,14 +869,15 @@
 	struct mv_cesa_op_ctx tmpl = { };
 
 	mv_cesa_set_op_cfg(&tmpl, CESA_SA_DESC_CFG_MACM_SHA1);
+
+	mv_cesa_ahash_init(req, &tmpl, false);
+
 	creq->state[0] = SHA1_H0;
 	creq->state[1] = SHA1_H1;
 	creq->state[2] = SHA1_H2;
 	creq->state[3] = SHA1_H3;
 	creq->state[4] = SHA1_H4;
 
-	mv_cesa_ahash_init(req, &tmpl, false);
-
 	return 0;
 }
 
@@ -942,6 +939,9 @@
 	struct mv_cesa_op_ctx tmpl = { };
 
 	mv_cesa_set_op_cfg(&tmpl, CESA_SA_DESC_CFG_MACM_SHA256);
+
+	mv_cesa_ahash_init(req, &tmpl, false);
+
 	creq->state[0] = SHA256_H0;
 	creq->state[1] = SHA256_H1;
 	creq->state[2] = SHA256_H2;
@@ -951,8 +951,6 @@
 	creq->state[6] = SHA256_H6;
 	creq->state[7] = SHA256_H7;
 
-	mv_cesa_ahash_init(req, &tmpl, false);
-
 	return 0;
 }
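
Two independent cleanups meet in hash.c: mv_cesa_ahash_cache_req() now
returns its caching decision directly rather than through an out
parameter it could never fail for, and the digest init helpers seed
creq->state[] only after mv_cesa_ahash_init() has run, since that helper
resets the request context and would otherwise wipe the freshly written
state words.  The caching rule itself, isolated as a sketch:

#include <linux/types.h>

static bool should_cache(unsigned int cache_ptr, unsigned int nbytes,
			 bool last_req, unsigned int block_size)
{
	/* less than one full hash block so far and more data will follow:
	 * copy into the per-request cache instead of touching the engine
	 */
	return (cache_ptr + nbytes < block_size) && !last_req;
}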
 
diff --git a/drivers/crypto/marvell/tdma.c b/drivers/crypto/marvell/tdma.c
index 86a065b..9fd7a5f 100644
--- a/drivers/crypto/marvell/tdma.c
+++ b/drivers/crypto/marvell/tdma.c
@@ -261,6 +261,7 @@
 	tdma->op = op;
 	tdma->byte_cnt = cpu_to_le32(size | BIT(31));
 	tdma->src = cpu_to_le32(dma_handle);
+	tdma->dst = CESA_SA_CFG_SRAM_OFFSET;
 	tdma->flags = CESA_TDMA_DST_IN_SRAM | CESA_TDMA_OP;
 
 	return op;
diff --git a/drivers/crypto/mv_cesa.c b/drivers/crypto/mv_cesa.c
index e6b658f..104e9ce 100644
--- a/drivers/crypto/mv_cesa.c
+++ b/drivers/crypto/mv_cesa.c
@@ -1091,11 +1091,8 @@
 
 	cp->max_req_size = cp->sram_size - SRAM_CFG_SPACE;
 
-	if (pdev->dev.of_node)
-		irq = irq_of_parse_and_map(pdev->dev.of_node, 0);
-	else
-		irq = platform_get_irq(pdev, 0);
-	if (irq < 0 || irq == NO_IRQ) {
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0) {
 		ret = irq;
 		goto err;
 	}
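
platform_get_irq() covers both DT and legacy board-file devices, so the
removed irq_of_parse_and_map()/NO_IRQ special-casing was redundant, and a
negative return is the only failure signal to check.  The resulting
probe-time idiom as a sketch, with illustrative names:

#include <linux/platform_device.h>
#include <linux/interrupt.h>

static int my_setup_irq(struct platform_device *pdev,
			irq_handler_t handler, void *data)
{
	int irq = platform_get_irq(pdev, 0);

	if (irq < 0)
		return irq;	/* may be -EPROBE_DEFER; propagate as-is */

	return devm_request_irq(&pdev->dev, irq, handler, 0,
				dev_name(&pdev->dev), data);
}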
diff --git a/drivers/crypto/mxc-scc.c b/drivers/crypto/mxc-scc.c
index ff383ef..ee4be1b0 100644
--- a/drivers/crypto/mxc-scc.c
+++ b/drivers/crypto/mxc-scc.c
@@ -668,7 +668,9 @@
 		return PTR_ERR(scc->clk);
 	}
 
-	clk_prepare_enable(scc->clk);
+	ret = clk_prepare_enable(scc->clk);
+	if (ret)
+		return ret;
 
 	/* clear error status register */
 	writel(0x0, scc->base + SCC_SCM_ERROR_STATUS);
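
clk_prepare_enable() can fail (for example if the clock's parent cannot
run), so its return value must reach the caller; the mxc-scc fix above
does exactly that.  The idiom as a sketch, with illustrative names:

#include <linux/clk.h>
#include <linux/device.h>

static int my_enable_clk(struct device *dev, struct clk *clk)
{
	int ret = clk_prepare_enable(clk);

	if (ret)
		dev_err(dev, "failed to enable clock: %d\n", ret);

	return ret;
}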
diff --git a/drivers/crypto/omap-aes.c b/drivers/crypto/omap-aes.c
index 4ab53a60..fe32dd9 100644
--- a/drivers/crypto/omap-aes.c
+++ b/drivers/crypto/omap-aes.c
@@ -35,7 +35,8 @@
 #include <linux/interrupt.h>
 #include <crypto/scatterwalk.h>
 #include <crypto/aes.h>
-#include <crypto/algapi.h>
+#include <crypto/engine.h>
+#include <crypto/internal/skcipher.h>
 
 #define DST_MAXBURST			4
 #define DMA_MIN				(DST_MAXBURST * sizeof(u32))
@@ -85,6 +86,8 @@
 #define AES_REG_IRQ_DATA_OUT           BIT(2)
 #define DEFAULT_TIMEOUT		(5*HZ)
 
+#define DEFAULT_AUTOSUSPEND_DELAY	1000
+
 #define FLAGS_MODE_MASK		0x000f
 #define FLAGS_ENCRYPT		BIT(0)
 #define FLAGS_CBC		BIT(1)
@@ -103,6 +106,7 @@
 	int		keylen;
 	u32		key[AES_KEYSIZE_256 / sizeof(u32)];
 	unsigned long	flags;
+	struct crypto_skcipher	*fallback;
 };
 
 struct omap_aes_reqctx {
@@ -238,11 +242,19 @@
 
 static int omap_aes_hw_init(struct omap_aes_dev *dd)
 {
+	int err;
+
 	if (!(dd->flags & FLAGS_INIT)) {
 		dd->flags |= FLAGS_INIT;
 		dd->err = 0;
 	}
 
+	err = pm_runtime_get_sync(dd->dev);
+	if (err < 0) {
+		dev_err(dd->dev, "failed to get sync: %d\n", err);
+		return err;
+	}
+
 	return 0;
 }
 
@@ -319,20 +331,12 @@
 
 static struct omap_aes_dev *omap_aes_find_dev(struct omap_aes_ctx *ctx)
 {
-	struct omap_aes_dev *dd = NULL, *tmp;
+	struct omap_aes_dev *dd;
 
 	spin_lock_bh(&list_lock);
-	if (!ctx->dd) {
-		list_for_each_entry(tmp, &dev_list, list) {
-			/* FIXME: take fist available aes core */
-			dd = tmp;
-			break;
-		}
-		ctx->dd = dd;
-	} else {
-		/* already found before */
-		dd = ctx->dd;
-	}
+	dd = list_first_entry(&dev_list, struct omap_aes_dev, list);
+	list_move_tail(&dd->list, &dev_list);
+	ctx->dd = dd;
 	spin_unlock_bh(&list_lock);
 
 	return dd;
@@ -519,7 +523,10 @@
 
 	pr_debug("err: %d\n", err);
 
-	crypto_finalize_request(dd->engine, req, err);
+	crypto_finalize_cipher_request(dd->engine, req, err);
+
+	pm_runtime_mark_last_busy(dd->dev);
+	pm_runtime_put_autosuspend(dd->dev);
 }
 
 static int omap_aes_crypt_dma_stop(struct omap_aes_dev *dd)
@@ -592,7 +599,7 @@
 				 struct ablkcipher_request *req)
 {
 	if (req)
-		return crypto_transfer_request_to_engine(dd->engine, req);
+		return crypto_transfer_cipher_request_to_engine(dd->engine, req);
 
 	return 0;
 }
@@ -602,7 +609,7 @@
 {
 	struct omap_aes_ctx *ctx = crypto_ablkcipher_ctx(
 			crypto_ablkcipher_reqtfm(req));
-	struct omap_aes_dev *dd = omap_aes_find_dev(ctx);
+	struct omap_aes_dev *dd = ctx->dd;
 	struct omap_aes_reqctx *rctx;
 
 	if (!dd)
@@ -648,7 +655,7 @@
 {
 	struct omap_aes_ctx *ctx = crypto_ablkcipher_ctx(
 			crypto_ablkcipher_reqtfm(req));
-	struct omap_aes_dev *dd = omap_aes_find_dev(ctx);
+	struct omap_aes_dev *dd = ctx->dd;
 
 	if (!dd)
 		return -ENODEV;
@@ -696,11 +703,29 @@
 			crypto_ablkcipher_reqtfm(req));
 	struct omap_aes_reqctx *rctx = ablkcipher_request_ctx(req);
 	struct omap_aes_dev *dd;
+	int ret;
 
 	pr_debug("nbytes: %d, enc: %d, cbc: %d\n", req->nbytes,
 		  !!(mode & FLAGS_ENCRYPT),
 		  !!(mode & FLAGS_CBC));
 
+	if (req->nbytes < 200) {
+		SKCIPHER_REQUEST_ON_STACK(subreq, ctx->fallback);
+
+		skcipher_request_set_tfm(subreq, ctx->fallback);
+		skcipher_request_set_callback(subreq, req->base.flags, NULL,
+					      NULL);
+		skcipher_request_set_crypt(subreq, req->src, req->dst,
+					   req->nbytes, req->info);
+
+		if (mode & FLAGS_ENCRYPT)
+			ret = crypto_skcipher_encrypt(subreq);
+		else
+			ret = crypto_skcipher_decrypt(subreq);
+
+		skcipher_request_zero(subreq);
+		return ret;
+	}
 	dd = omap_aes_find_dev(ctx);
 	if (!dd)
 		return -ENODEV;
@@ -716,6 +741,7 @@
 			   unsigned int keylen)
 {
 	struct omap_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	int ret;
 
 	if (keylen != AES_KEYSIZE_128 && keylen != AES_KEYSIZE_192 &&
 		   keylen != AES_KEYSIZE_256)
@@ -726,6 +752,14 @@
 	memcpy(ctx->key, key, keylen);
 	ctx->keylen = keylen;
 
+	crypto_skcipher_clear_flags(ctx->fallback, CRYPTO_TFM_REQ_MASK);
+	crypto_skcipher_set_flags(ctx->fallback, tfm->base.crt_flags &
+						 CRYPTO_TFM_REQ_MASK);
+
+	ret = crypto_skcipher_setkey(ctx->fallback, key, keylen);
+	if (ret)
+		return ret;
+
 	return 0;
 }
 
@@ -761,22 +795,16 @@
 
 static int omap_aes_cra_init(struct crypto_tfm *tfm)
 {
-	struct omap_aes_dev *dd = NULL;
-	int err;
+	const char *name = crypto_tfm_alg_name(tfm);
+	const u32 flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK;
+	struct omap_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct crypto_skcipher *blk;
 
-	/* Find AES device, currently picks the first device */
-	spin_lock_bh(&list_lock);
-	list_for_each_entry(dd, &dev_list, list) {
-		break;
-	}
-	spin_unlock_bh(&list_lock);
+	blk = crypto_alloc_skcipher(name, 0, flags);
+	if (IS_ERR(blk))
+		return PTR_ERR(blk);
 
-	err = pm_runtime_get_sync(dd->dev);
-	if (err < 0) {
-		dev_err(dd->dev, "%s: failed to get_sync(%d)\n",
-			__func__, err);
-		return err;
-	}
+	ctx->fallback = blk;
 
 	tfm->crt_ablkcipher.reqsize = sizeof(struct omap_aes_reqctx);
 
@@ -785,16 +813,12 @@
 
 static void omap_aes_cra_exit(struct crypto_tfm *tfm)
 {
-	struct omap_aes_dev *dd = NULL;
+	struct omap_aes_ctx *ctx = crypto_tfm_ctx(tfm);
 
-	/* Find AES device, currently picks the first device */
-	spin_lock_bh(&list_lock);
-	list_for_each_entry(dd, &dev_list, list) {
-		break;
-	}
-	spin_unlock_bh(&list_lock);
+	if (ctx->fallback)
+		crypto_free_skcipher(ctx->fallback);
 
-	pm_runtime_put_sync(dd->dev);
+	ctx->fallback = NULL;
 }
 
 /* ********************** ALGS ************************************ */
@@ -806,7 +830,7 @@
 	.cra_priority		= 300,
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER |
 				  CRYPTO_ALG_KERN_DRIVER_ONLY |
-				  CRYPTO_ALG_ASYNC,
+				  CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct omap_aes_ctx),
 	.cra_alignmask		= 0,
@@ -828,7 +852,7 @@
 	.cra_priority		= 300,
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER |
 				  CRYPTO_ALG_KERN_DRIVER_ONLY |
-				  CRYPTO_ALG_ASYNC,
+				  CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct omap_aes_ctx),
 	.cra_alignmask		= 0,
@@ -854,7 +878,7 @@
 	.cra_priority		= 300,
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER |
 				  CRYPTO_ALG_KERN_DRIVER_ONLY |
-				  CRYPTO_ALG_ASYNC,
+				  CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct omap_aes_ctx),
 	.cra_alignmask		= 0,
@@ -1140,6 +1164,9 @@
 	}
 	dd->phys_base = res.start;
 
+	pm_runtime_use_autosuspend(dev);
+	pm_runtime_set_autosuspend_delay(dev, DEFAULT_AUTOSUSPEND_DELAY);
+
 	pm_runtime_enable(dev);
 	err = pm_runtime_get_sync(dev);
 	if (err < 0) {
@@ -1186,6 +1213,19 @@
 	list_add_tail(&dd->list, &dev_list);
 	spin_unlock(&list_lock);
 
+	/* Initialize crypto engine */
+	dd->engine = crypto_engine_alloc_init(dev, 1);
+	if (!dd->engine) {
+		err = -ENOMEM;
+		goto err_engine;
+	}
+
+	dd->engine->prepare_cipher_request = omap_aes_prepare_req;
+	dd->engine->cipher_one_request = omap_aes_crypt_req;
+	err = crypto_engine_start(dd->engine);
+	if (err)
+		goto err_engine;
+
 	for (i = 0; i < dd->pdata->algs_info_size; i++) {
 		if (!dd->pdata->algs_info[i].registered) {
 			for (j = 0; j < dd->pdata->algs_info[i].size; j++) {
@@ -1203,26 +1243,17 @@
 		}
 	}
 
-	/* Initialize crypto engine */
-	dd->engine = crypto_engine_alloc_init(dev, 1);
-	if (!dd->engine)
-		goto err_algs;
-
-	dd->engine->prepare_request = omap_aes_prepare_req;
-	dd->engine->crypt_one_request = omap_aes_crypt_req;
-	err = crypto_engine_start(dd->engine);
-	if (err)
-		goto err_engine;
-
 	return 0;
-err_engine:
-	crypto_engine_exit(dd->engine);
 err_algs:
 	for (i = dd->pdata->algs_info_size - 1; i >= 0; i--)
 		for (j = dd->pdata->algs_info[i].registered - 1; j >= 0; j--)
 			crypto_unregister_alg(
 					&dd->pdata->algs_info[i].algs_list[j]);
 
+err_engine:
+	if (dd->engine)
+		crypto_engine_exit(dd->engine);
+
 	omap_aes_dma_cleanup(dd);
 err_irq:
 	tasklet_kill(&dd->done_task);
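
The omap-aes fallback added above allocates a software skcipher at tfm
init time (CRYPTO_ALG_NEED_FALLBACK keeps the allocator from handing this
same hardware driver back), keys it alongside the hardware context, and
routes requests below the 200-byte threshold to it, where the hardware
path's setup overhead outweighs its benefit.  The allocation side as a
sketch:

#include <crypto/internal/skcipher.h>

static struct crypto_skcipher *my_alloc_fallback(struct crypto_tfm *tfm)
{
	/* ask for any other implementation of the same algorithm */
	return crypto_alloc_skcipher(crypto_tfm_alg_name(tfm), 0,
				     CRYPTO_ALG_ASYNC |
				     CRYPTO_ALG_NEED_FALLBACK);
}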
diff --git a/drivers/crypto/omap-des.c b/drivers/crypto/omap-des.c
index 5691434..a6f6553 100644
--- a/drivers/crypto/omap-des.c
+++ b/drivers/crypto/omap-des.c
@@ -39,6 +39,7 @@
 #include <crypto/scatterwalk.h>
 #include <crypto/des.h>
 #include <crypto/algapi.h>
+#include <crypto/engine.h>
 
 #define DST_MAXBURST			2
 
@@ -506,7 +507,7 @@
 	pr_debug("err: %d\n", err);
 
 	pm_runtime_put(dd->dev);
-	crypto_finalize_request(dd->engine, req, err);
+	crypto_finalize_cipher_request(dd->engine, req, err);
 }
 
 static int omap_des_crypt_dma_stop(struct omap_des_dev *dd)
@@ -574,7 +575,7 @@
 				 struct ablkcipher_request *req)
 {
 	if (req)
-		return crypto_transfer_request_to_engine(dd->engine, req);
+		return crypto_transfer_cipher_request_to_engine(dd->engine, req);
 
 	return 0;
 }
@@ -1078,6 +1079,19 @@
 	list_add_tail(&dd->list, &dev_list);
 	spin_unlock(&list_lock);
 
+	/* Initialize des crypto engine */
+	dd->engine = crypto_engine_alloc_init(dev, 1);
+	if (!dd->engine) {
+		err = -ENOMEM;
+		goto err_engine;
+	}
+
+	dd->engine->prepare_cipher_request = omap_des_prepare_req;
+	dd->engine->cipher_one_request = omap_des_crypt_req;
+	err = crypto_engine_start(dd->engine);
+	if (err)
+		goto err_engine;
+
 	for (i = 0; i < dd->pdata->algs_info_size; i++) {
 		for (j = 0; j < dd->pdata->algs_info[i].size; j++) {
 			algp = &dd->pdata->algs_info[i].algs_list[j];
@@ -1093,27 +1107,18 @@
 		}
 	}
 
-	/* Initialize des crypto engine */
-	dd->engine = crypto_engine_alloc_init(dev, 1);
-	if (!dd->engine)
-		goto err_algs;
-
-	dd->engine->prepare_request = omap_des_prepare_req;
-	dd->engine->crypt_one_request = omap_des_crypt_req;
-	err = crypto_engine_start(dd->engine);
-	if (err)
-		goto err_engine;
-
 	return 0;
 
-err_engine:
-	crypto_engine_exit(dd->engine);
 err_algs:
 	for (i = dd->pdata->algs_info_size - 1; i >= 0; i--)
 		for (j = dd->pdata->algs_info[i].registered - 1; j >= 0; j--)
 			crypto_unregister_alg(
 					&dd->pdata->algs_info[i].algs_list[j]);
 
+err_engine:
+	if (dd->engine)
+		crypto_engine_exit(dd->engine);
+
 	omap_des_dma_cleanup(dd);
 err_irq:
 	tasklet_kill(&dd->done_task);
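
Both omap-aes and omap-des now bring the crypto engine up before
registering any algorithms, so a request can never arrive through a
registered algorithm while dd->engine is still NULL, and the error paths
unwind in the matching order.  The bring-up sequence as a sketch, with
"my_" callbacks standing in for the drivers':

#include <crypto/engine.h>

static int my_prepare_req(struct crypto_engine *engine,
			  struct ablkcipher_request *req);
static int my_do_one_req(struct crypto_engine *engine,
			 struct ablkcipher_request *req);

static int my_engine_setup(struct device *dev,
			   struct crypto_engine **enginep)
{
	struct crypto_engine *engine;
	int err;

	engine = crypto_engine_alloc_init(dev, 1);
	if (!engine)
		return -ENOMEM;

	engine->prepare_cipher_request = my_prepare_req;
	engine->cipher_one_request = my_do_one_req;

	err = crypto_engine_start(engine);
	if (err) {
		crypto_engine_exit(engine);
		return err;
	}

	*enginep = engine;
	return 0;	/* only now register the algorithms */
}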
diff --git a/drivers/crypto/omap-sham.c b/drivers/crypto/omap-sham.c
index 7fe4eef..d0b16e5 100644
--- a/drivers/crypto/omap-sham.c
+++ b/drivers/crypto/omap-sham.c
@@ -112,9 +112,10 @@
 #define FLAGS_DMA_READY		6
 #define FLAGS_AUTO_XOR		7
 #define FLAGS_BE32_SHA1		8
+#define FLAGS_SGS_COPIED	9
+#define FLAGS_SGS_ALLOCED	10
 /* context flags */
 #define FLAGS_FINUP		16
-#define FLAGS_SG		17
 
 #define FLAGS_MODE_SHIFT	18
 #define FLAGS_MODE_MASK		(SHA_REG_MODE_ALGO_MASK	<< FLAGS_MODE_SHIFT)
@@ -134,7 +135,8 @@
 #define OMAP_ALIGN_MASK		(sizeof(u32)-1)
 #define OMAP_ALIGNED		__attribute__((aligned(sizeof(u32))))
 
-#define BUFLEN			PAGE_SIZE
+#define BUFLEN			SHA512_BLOCK_SIZE
+#define OMAP_SHA_DMA_THRESHOLD	256
 
 struct omap_sham_dev;
 
@@ -147,12 +149,12 @@
 	size_t			digcnt;
 	size_t			bufcnt;
 	size_t			buflen;
-	dma_addr_t		dma_addr;
 
 	/* walk state */
 	struct scatterlist	*sg;
-	struct scatterlist	sgl;
-	unsigned int		offset;	/* offset in current sg */
+	struct scatterlist	sgl[2];
+	int			offset;	/* offset in current sg */
+	int			sg_len;
 	unsigned int		total;	/* total request */
 
 	u8			buffer[0] OMAP_ALIGNED;
@@ -223,6 +225,7 @@
 	struct dma_chan		*dma_lch;
 	struct tasklet_struct	done_task;
 	u8			polling_mode;
+	u8			xmit_buf[BUFLEN];
 
 	unsigned long		flags;
 	struct crypto_queue	queue;
@@ -510,12 +513,14 @@
 			      SHA_REG_IRQSTATUS_INPUT_RDY);
 }
 
-static int omap_sham_xmit_cpu(struct omap_sham_dev *dd, const u8 *buf,
-			      size_t length, int final)
+static int omap_sham_xmit_cpu(struct omap_sham_dev *dd, size_t length,
+			      int final)
 {
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(dd->req);
 	int count, len32, bs32, offset = 0;
-	const u32 *buffer = (const u32 *)buf;
+	const u32 *buffer;
+	int mlen;
+	struct sg_mapping_iter mi;
 
 	dev_dbg(dd->dev, "xmit_cpu: digcnt: %d, length: %d, final: %d\n",
 						ctx->digcnt, length, final);
@@ -525,6 +530,7 @@
 
 	/* should be non-zero before next lines to disable clocks later */
 	ctx->digcnt += length;
+	ctx->total -= length;
 
 	if (final)
 		set_bit(FLAGS_FINAL, &dd->flags); /* catch last interrupt */
@@ -534,16 +540,35 @@
 	len32 = DIV_ROUND_UP(length, sizeof(u32));
 	bs32 = get_block_size(ctx) / sizeof(u32);
 
+	sg_miter_start(&mi, ctx->sg, ctx->sg_len,
+		       SG_MITER_FROM_SG | SG_MITER_ATOMIC);
+
+	mlen = 0;
+
 	while (len32) {
 		if (dd->pdata->poll_irq(dd))
 			return -ETIMEDOUT;
 
-		for (count = 0; count < min(len32, bs32); count++, offset++)
+		for (count = 0; count < min(len32, bs32); count++, offset++) {
+			if (!mlen) {
+				sg_miter_next(&mi);
+				mlen = mi.length;
+				if (!mlen) {
+					pr_err("sg miter failure.\n");
+					return -EINVAL;
+				}
+				offset = 0;
+				buffer = mi.addr;
+			}
 			omap_sham_write(dd, SHA_REG_DIN(dd, count),
 					buffer[offset]);
+			mlen -= 4;
+		}
 		len32 -= min(len32, bs32);
 	}
 
+	sg_miter_stop(&mi);
+
 	return -EINPROGRESS;
 }
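
omap_sham_xmit_cpu() now walks the scatterlist with an sg_mapping_iter
and feeds the FIFO directly, rather than requiring a flat pre-copied
buffer.  The iterator pattern in isolation, assuming word-aligned chunk
lengths and with write_word_to_fifo() standing in for the register write:

#include <linux/scatterlist.h>

static void feed_words(struct scatterlist *sg, int sg_len,
		       void (*write_word_to_fifo)(u32 word))
{
	struct sg_mapping_iter mi;
	const u32 *buf;
	size_t i;

	sg_miter_start(&mi, sg, sg_len,
		       SG_MITER_FROM_SG | SG_MITER_ATOMIC);

	while (sg_miter_next(&mi)) {
		buf = mi.addr;
		/* mi.length is in bytes; the FIFO takes 32-bit words */
		for (i = 0; i < mi.length / sizeof(u32); i++)
			write_word_to_fifo(buf[i]);
	}

	sg_miter_stop(&mi);
}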
 
@@ -555,22 +580,27 @@
 	tasklet_schedule(&dd->done_task);
 }
 
-static int omap_sham_xmit_dma(struct omap_sham_dev *dd, dma_addr_t dma_addr,
-			      size_t length, int final, int is_sg)
+static int omap_sham_xmit_dma(struct omap_sham_dev *dd, size_t length,
+			      int final)
 {
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(dd->req);
 	struct dma_async_tx_descriptor *tx;
 	struct dma_slave_config cfg;
-	int len32, ret, dma_min = get_block_size(ctx);
+	int ret;
 
 	dev_dbg(dd->dev, "xmit_dma: digcnt: %d, length: %d, final: %d\n",
 						ctx->digcnt, length, final);
 
+	if (!dma_map_sg(dd->dev, ctx->sg, ctx->sg_len, DMA_TO_DEVICE)) {
+		dev_err(dd->dev, "dma_map_sg error\n");
+		return -EINVAL;
+	}
+
 	memset(&cfg, 0, sizeof(cfg));
 
 	cfg.dst_addr = dd->phys_base + SHA_REG_DIN(dd, 0);
 	cfg.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
-	cfg.dst_maxburst = dma_min / DMA_SLAVE_BUSWIDTH_4_BYTES;
+	cfg.dst_maxburst = get_block_size(ctx) / DMA_SLAVE_BUSWIDTH_4_BYTES;
 
 	ret = dmaengine_slave_config(dd->dma_lch, &cfg);
 	if (ret) {
@@ -578,30 +608,12 @@
 		return ret;
 	}
 
-	len32 = DIV_ROUND_UP(length, dma_min) * dma_min;
-
-	if (is_sg) {
-		/*
-		 * The SG entry passed in may not have the 'length' member
-		 * set correctly so use a local SG entry (sgl) with the
-		 * proper value for 'length' instead.  If this is not done,
-		 * the dmaengine may try to DMA the incorrect amount of data.
-		 */
-		sg_init_table(&ctx->sgl, 1);
-		sg_assign_page(&ctx->sgl, sg_page(ctx->sg));
-		ctx->sgl.offset = ctx->sg->offset;
-		sg_dma_len(&ctx->sgl) = len32;
-		sg_dma_address(&ctx->sgl) = sg_dma_address(ctx->sg);
-
-		tx = dmaengine_prep_slave_sg(dd->dma_lch, &ctx->sgl, 1,
-			DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
-	} else {
-		tx = dmaengine_prep_slave_single(dd->dma_lch, dma_addr, len32,
-			DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
-	}
+	tx = dmaengine_prep_slave_sg(dd->dma_lch, ctx->sg, ctx->sg_len,
+				     DMA_MEM_TO_DEV,
+				     DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 
 	if (!tx) {
-		dev_err(dd->dev, "prep_slave_sg/single() failed\n");
+		dev_err(dd->dev, "prep_slave_sg failed\n");
 		return -EINVAL;
 	}
 
@@ -611,6 +623,7 @@
 	dd->pdata->write_ctrl(dd, length, final, 1);
 
 	ctx->digcnt += length;
+	ctx->total -= length;
 
 	if (final)
 		set_bit(FLAGS_FINAL, &dd->flags); /* catch last interrupt */
@@ -625,188 +638,256 @@
 	return -EINPROGRESS;
 }
 
-static size_t omap_sham_append_buffer(struct omap_sham_reqctx *ctx,
-				const u8 *data, size_t length)
+static int omap_sham_copy_sg_lists(struct omap_sham_reqctx *ctx,
+				   struct scatterlist *sg, int bs, int new_len)
 {
-	size_t count = min(length, ctx->buflen - ctx->bufcnt);
+	int n = sg_nents(sg);
+	struct scatterlist *tmp;
+	int offset = ctx->offset;
 
-	count = min(count, ctx->total);
-	if (count <= 0)
-		return 0;
-	memcpy(ctx->buffer + ctx->bufcnt, data, count);
-	ctx->bufcnt += count;
+	if (ctx->bufcnt)
+		n++;
 
-	return count;
+	ctx->sg = kmalloc_array(n, sizeof(*sg), GFP_KERNEL);
+	if (!ctx->sg)
+		return -ENOMEM;
+
+	sg_init_table(ctx->sg, n);
+
+	tmp = ctx->sg;
+
+	ctx->sg_len = 0;
+
+	if (ctx->bufcnt) {
+		sg_set_buf(tmp, ctx->dd->xmit_buf, ctx->bufcnt);
+		tmp = sg_next(tmp);
+		ctx->sg_len++;
+	}
+
+	while (sg && new_len) {
+		int len = sg->length - offset;
+
+		if (offset) {
+			offset -= sg->length;
+			if (offset < 0)
+				offset = 0;
+		}
+
+		if (new_len < len)
+			len = new_len;
+
+		if (len > 0) {
+			new_len -= len;
+			sg_set_page(tmp, sg_page(sg), len, sg->offset);
+			if (new_len <= 0)
+				sg_mark_end(tmp);
+			tmp = sg_next(tmp);
+			ctx->sg_len++;
+		}
+
+		sg = sg_next(sg);
+	}
+
+	set_bit(FLAGS_SGS_ALLOCED, &ctx->dd->flags);
+
+	ctx->bufcnt = 0;
+
+	return 0;
 }
 
-static size_t omap_sham_append_sg(struct omap_sham_reqctx *ctx)
+static int omap_sham_copy_sgs(struct omap_sham_reqctx *ctx,
+			      struct scatterlist *sg, int bs, int new_len)
 {
-	size_t count;
-	const u8 *vaddr;
+	int pages;
+	void *buf;
+	int len;
 
-	while (ctx->sg) {
-		vaddr = kmap_atomic(sg_page(ctx->sg));
-		vaddr += ctx->sg->offset;
+	len = new_len + ctx->bufcnt;
 
-		count = omap_sham_append_buffer(ctx,
-				vaddr + ctx->offset,
-				ctx->sg->length - ctx->offset);
+	pages = get_order(ctx->total);
 
-		kunmap_atomic((void *)vaddr);
+	buf = (void *)__get_free_pages(GFP_ATOMIC, pages);
+	if (!buf) {
+		pr_err("Couldn't allocate pages for unaligned cases.\n");
+		return -ENOMEM;
+	}
 
-		if (!count)
+	if (ctx->bufcnt)
+		memcpy(buf, ctx->dd->xmit_buf, ctx->bufcnt);
+
+	scatterwalk_map_and_copy(buf + ctx->bufcnt, sg, ctx->offset,
+				 ctx->total - ctx->bufcnt, 0);
+	sg_init_table(ctx->sgl, 1);
+	sg_set_buf(ctx->sgl, buf, len);
+	ctx->sg = ctx->sgl;
+	set_bit(FLAGS_SGS_COPIED, &ctx->dd->flags);
+	ctx->sg_len = 1;
+	ctx->bufcnt = 0;
+	ctx->offset = 0;
+
+	return 0;
+}
+
+static int omap_sham_align_sgs(struct scatterlist *sg,
+			       int nbytes, int bs, bool final,
+			       struct omap_sham_reqctx *rctx)
+{
+	int n = 0;
+	bool aligned = true;
+	bool list_ok = true;
+	struct scatterlist *sg_tmp = sg;
+	int new_len;
+	int offset = rctx->offset;
+
+	if (!sg || !sg->length || !nbytes)
+		return 0;
+
+	new_len = nbytes;
+
+	if (offset)
+		list_ok = false;
+
+	if (final)
+		new_len = DIV_ROUND_UP(new_len, bs) * bs;
+	else
+		new_len = new_len / bs * bs;
+
+	while (nbytes > 0 && sg_tmp) {
+		n++;
+
+		if (offset < sg_tmp->length) {
+			if (!IS_ALIGNED(offset + sg_tmp->offset, 4)) {
+				aligned = false;
+				break;
+			}
+
+			if (!IS_ALIGNED(sg_tmp->length - offset, bs)) {
+				aligned = false;
+				break;
+			}
+		}
+
+		if (offset) {
+			offset -= sg_tmp->length;
+			if (offset < 0) {
+				nbytes += offset;
+				offset = 0;
+			}
+		} else {
+			nbytes -= sg_tmp->length;
+		}
+
+		sg_tmp = sg_next(sg_tmp);
+
+		if (nbytes < 0) {
+			list_ok = false;
 			break;
-		ctx->offset += count;
-		ctx->total -= count;
-		if (ctx->offset == ctx->sg->length) {
-			ctx->sg = sg_next(ctx->sg);
-			if (ctx->sg)
-				ctx->offset = 0;
-			else
-				ctx->total = 0;
 		}
 	}
 
+	if (!aligned)
+		return omap_sham_copy_sgs(rctx, sg, bs, new_len);
+	else if (!list_ok)
+		return omap_sham_copy_sg_lists(rctx, sg, bs, new_len);
+
+	rctx->sg_len = n;
+	rctx->sg = sg;
+
 	return 0;
 }
 
-static int omap_sham_xmit_dma_map(struct omap_sham_dev *dd,
-					struct omap_sham_reqctx *ctx,
-					size_t length, int final)
+static int omap_sham_prepare_request(struct ahash_request *req, bool update)
 {
+	struct omap_sham_reqctx *rctx = ahash_request_ctx(req);
+	int bs;
 	int ret;
+	int nbytes;
+	bool final = rctx->flags & BIT(FLAGS_FINUP);
+	int xmit_len, hash_later;
 
-	ctx->dma_addr = dma_map_single(dd->dev, ctx->buffer, ctx->buflen,
-				       DMA_TO_DEVICE);
-	if (dma_mapping_error(dd->dev, ctx->dma_addr)) {
-		dev_err(dd->dev, "dma %u bytes error\n", ctx->buflen);
-		return -EINVAL;
-	}
-
-	ctx->flags &= ~BIT(FLAGS_SG);
-
-	ret = omap_sham_xmit_dma(dd, ctx->dma_addr, length, final, 0);
-	if (ret != -EINPROGRESS)
-		dma_unmap_single(dd->dev, ctx->dma_addr, ctx->buflen,
-				 DMA_TO_DEVICE);
-
-	return ret;
-}
-
-static int omap_sham_update_dma_slow(struct omap_sham_dev *dd)
-{
-	struct omap_sham_reqctx *ctx = ahash_request_ctx(dd->req);
-	unsigned int final;
-	size_t count;
-
-	omap_sham_append_sg(ctx);
-
-	final = (ctx->flags & BIT(FLAGS_FINUP)) && !ctx->total;
-
-	dev_dbg(dd->dev, "slow: bufcnt: %u, digcnt: %d, final: %d\n",
-					 ctx->bufcnt, ctx->digcnt, final);
-
-	if (final || (ctx->bufcnt == ctx->buflen && ctx->total)) {
-		count = ctx->bufcnt;
-		ctx->bufcnt = 0;
-		return omap_sham_xmit_dma_map(dd, ctx, count, final);
-	}
-
-	return 0;
-}
-
-/* Start address alignment */
-#define SG_AA(sg)	(IS_ALIGNED(sg->offset, sizeof(u32)))
-/* SHA1 block size alignment */
-#define SG_SA(sg, bs)	(IS_ALIGNED(sg->length, bs))
-
-static int omap_sham_update_dma_start(struct omap_sham_dev *dd)
-{
-	struct omap_sham_reqctx *ctx = ahash_request_ctx(dd->req);
-	unsigned int length, final, tail;
-	struct scatterlist *sg;
-	int ret, bs;
-
-	if (!ctx->total)
+	if (!req)
 		return 0;
 
-	if (ctx->bufcnt || ctx->offset)
-		return omap_sham_update_dma_slow(dd);
+	bs = get_block_size(rctx);
 
-	/*
-	 * Don't use the sg interface when the transfer size is less
-	 * than the number of elements in a DMA frame.  Otherwise,
-	 * the dmaengine infrastructure will calculate that it needs
-	 * to transfer 0 frames which ultimately fails.
-	 */
-	if (ctx->total < get_block_size(ctx))
-		return omap_sham_update_dma_slow(dd);
+	if (update)
+		nbytes = req->nbytes;
+	else
+		nbytes = 0;
 
-	dev_dbg(dd->dev, "fast: digcnt: %d, bufcnt: %u, total: %u\n",
-			ctx->digcnt, ctx->bufcnt, ctx->total);
+	rctx->total = nbytes + rctx->bufcnt;
 
-	sg = ctx->sg;
-	bs = get_block_size(ctx);
+	if (!rctx->total)
+		return 0;
 
-	if (!SG_AA(sg))
-		return omap_sham_update_dma_slow(dd);
+	if (nbytes && (!IS_ALIGNED(rctx->bufcnt, bs))) {
+		int len = bs - rctx->bufcnt % bs;
 
-	if (!sg_is_last(sg) && !SG_SA(sg, bs))
-		/* size is not BLOCK_SIZE aligned */
-		return omap_sham_update_dma_slow(dd);
+		if (len > nbytes)
+			len = nbytes;
+		scatterwalk_map_and_copy(rctx->buffer + rctx->bufcnt, req->src,
+					 0, len, 0);
+		rctx->bufcnt += len;
+		nbytes -= len;
+		rctx->offset = len;
+	}
 
-	length = min(ctx->total, sg->length);
+	if (rctx->bufcnt)
+		memcpy(rctx->dd->xmit_buf, rctx->buffer, rctx->bufcnt);
 
-	if (sg_is_last(sg)) {
-		if (!(ctx->flags & BIT(FLAGS_FINUP))) {
-			/* not last sg must be BLOCK_SIZE aligned */
-			tail = length & (bs - 1);
-			/* without finup() we need one block to close hash */
-			if (!tail)
-				tail = bs;
-			length -= tail;
+	ret = omap_sham_align_sgs(req->src, nbytes, bs, final, rctx);
+	if (ret)
+		return ret;
+
+	xmit_len = rctx->total;
+
+	if (!IS_ALIGNED(xmit_len, bs)) {
+		if (final)
+			xmit_len = DIV_ROUND_UP(xmit_len, bs) * bs;
+		else
+			xmit_len = xmit_len / bs * bs;
+	}
+
+	hash_later = rctx->total - xmit_len;
+	if (hash_later < 0)
+		hash_later = 0;
+
+	if (rctx->bufcnt && nbytes) {
+		/* have data from previous operation and current */
+		sg_init_table(rctx->sgl, 2);
+		sg_set_buf(rctx->sgl, rctx->dd->xmit_buf, rctx->bufcnt);
+
+		sg_chain(rctx->sgl, 2, req->src);
+
+		rctx->sg = rctx->sgl;
+
+		rctx->sg_len++;
+	} else if (rctx->bufcnt) {
+		/* have buffered data only */
+		sg_init_table(rctx->sgl, 1);
+		sg_set_buf(rctx->sgl, rctx->dd->xmit_buf, xmit_len);
+
+		rctx->sg = rctx->sgl;
+
+		rctx->sg_len = 1;
+	}
+
+	if (hash_later) {
+		if (req->nbytes) {
+			scatterwalk_map_and_copy(rctx->buffer, req->src,
+						 req->nbytes - hash_later,
+						 hash_later, 0);
+		} else {
+			memcpy(rctx->buffer, rctx->buffer + xmit_len,
+			       hash_later);
 		}
+		rctx->bufcnt = hash_later;
+	} else {
+		rctx->bufcnt = 0;
 	}
 
-	if (!dma_map_sg(dd->dev, ctx->sg, 1, DMA_TO_DEVICE)) {
-		dev_err(dd->dev, "dma_map_sg  error\n");
-		return -EINVAL;
-	}
-
-	ctx->flags |= BIT(FLAGS_SG);
-
-	ctx->total -= length;
-	ctx->offset = length; /* offset where to start slow */
-
-	final = (ctx->flags & BIT(FLAGS_FINUP)) && !ctx->total;
-
-	ret = omap_sham_xmit_dma(dd, sg_dma_address(ctx->sg), length, final, 1);
-	if (ret != -EINPROGRESS)
-		dma_unmap_sg(dd->dev, ctx->sg, 1, DMA_TO_DEVICE);
-
-	return ret;
-}
-
-static int omap_sham_update_cpu(struct omap_sham_dev *dd)
-{
-	struct omap_sham_reqctx *ctx = ahash_request_ctx(dd->req);
-	int bufcnt, final;
-
-	if (!ctx->total)
-		return 0;
-
-	omap_sham_append_sg(ctx);
-
-	final = (ctx->flags & BIT(FLAGS_FINUP)) && !ctx->total;
-
-	dev_dbg(dd->dev, "cpu: bufcnt: %u, digcnt: %d, final: %d\n",
-		ctx->bufcnt, ctx->digcnt, final);
-
-	if (final || (ctx->bufcnt == ctx->buflen && ctx->total)) {
-		bufcnt = ctx->bufcnt;
-		ctx->bufcnt = 0;
-		return omap_sham_xmit_cpu(dd, ctx->buffer, bufcnt, final);
-	}
+	if (!final)
+		rctx->total = xmit_len;
 
 	return 0;
 }
@@ -815,18 +896,9 @@
 {
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(dd->req);
 
+	dma_unmap_sg(dd->dev, ctx->sg, ctx->sg_len, DMA_TO_DEVICE);
 
-	if (ctx->flags & BIT(FLAGS_SG)) {
-		dma_unmap_sg(dd->dev, ctx->sg, 1, DMA_TO_DEVICE);
-		if (ctx->sg->length == ctx->offset) {
-			ctx->sg = sg_next(ctx->sg);
-			if (ctx->sg)
-				ctx->offset = 0;
-		}
-	} else {
-		dma_unmap_single(dd->dev, ctx->dma_addr, ctx->buflen,
-				 DMA_TO_DEVICE);
-	}
+	clear_bit(FLAGS_DMA_ACTIVE, &dd->flags);
 
 	return 0;
 }
@@ -887,6 +959,8 @@
 
 	ctx->bufcnt = 0;
 	ctx->digcnt = 0;
+	ctx->total = 0;
+	ctx->offset = 0;
 	ctx->buflen = BUFLEN;
 
 	if (tctx->flags & BIT(FLAGS_HMAC)) {
@@ -909,14 +983,19 @@
 	struct ahash_request *req = dd->req;
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(req);
 	int err;
+	bool final = ctx->flags & BIT(FLAGS_FINUP);
 
 	dev_dbg(dd->dev, "update_req: total: %u, digcnt: %d, finup: %d\n",
 		 ctx->total, ctx->digcnt, (ctx->flags & BIT(FLAGS_FINUP)) != 0);
 
+	if (ctx->total < get_block_size(ctx) ||
+	    ctx->total < OMAP_SHA_DMA_THRESHOLD)
+		ctx->flags |= BIT(FLAGS_CPU);
+
 	if (ctx->flags & BIT(FLAGS_CPU))
-		err = omap_sham_update_cpu(dd);
+		err = omap_sham_xmit_cpu(dd, ctx->total, final);
 	else
-		err = omap_sham_update_dma_start(dd);
+		err = omap_sham_xmit_dma(dd, ctx->total, final);
 
 	/* wait for dma completion before we can take more data */
 	dev_dbg(dd->dev, "update: err: %d, digcnt: %d\n", err, ctx->digcnt);
@@ -930,7 +1009,7 @@
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(req);
 	int err = 0, use_dma = 1;
 
-	if ((ctx->bufcnt <= get_block_size(ctx)) || dd->polling_mode)
+	if ((ctx->total <= get_block_size(ctx)) || dd->polling_mode)
 		/*
 		 * faster to handle last block with cpu or
 		 * use cpu when dma is not present.
@@ -938,9 +1017,9 @@
 		use_dma = 0;
 
 	if (use_dma)
-		err = omap_sham_xmit_dma_map(dd, ctx, ctx->bufcnt, 1);
+		err = omap_sham_xmit_dma(dd, ctx->total, 1);
 	else
-		err = omap_sham_xmit_cpu(dd, ctx->buffer, ctx->bufcnt, 1);
+		err = omap_sham_xmit_cpu(dd, ctx->total, 1);
 
 	ctx->bufcnt = 0;
 
@@ -988,6 +1067,17 @@
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(req);
 	struct omap_sham_dev *dd = ctx->dd;
 
+	if (test_bit(FLAGS_SGS_COPIED, &dd->flags))
+		free_pages((unsigned long)sg_virt(ctx->sg),
+			   get_order(ctx->sg->length));
+
+	if (test_bit(FLAGS_SGS_ALLOCED, &dd->flags))
+		kfree(ctx->sg);
+
+	ctx->sg = NULL;
+
+	dd->flags &= ~(BIT(FLAGS_SGS_ALLOCED) | BIT(FLAGS_SGS_COPIED));
+
 	if (!err) {
 		dd->pdata->copy_hash(req, 1);
 		if (test_bit(FLAGS_FINAL, &dd->flags))
@@ -1005,9 +1095,6 @@
 
 	if (req->base.complete)
 		req->base.complete(&req->base, err);
-
-	/* handle new request */
-	tasklet_schedule(&dd->done_task);
 }
 
 static int omap_sham_handle_queue(struct omap_sham_dev *dd,
@@ -1018,6 +1105,7 @@
 	unsigned long flags;
 	int err = 0, ret = 0;
 
+retry:
 	spin_lock_irqsave(&dd->lock, flags);
 	if (req)
 		ret = ahash_enqueue_request(&dd->queue, req);
@@ -1041,6 +1129,10 @@
 	dd->req = req;
 	ctx = ahash_request_ctx(req);
 
+	err = omap_sham_prepare_request(req, ctx->op == OP_UPDATE);
+	if (err)
+		goto err1;
+
 	dev_dbg(dd->dev, "handling new req, op: %lu, nbytes: %d\n",
 						ctx->op, req->nbytes);
 
@@ -1061,11 +1153,19 @@
 		err = omap_sham_final_req(dd);
 	}
 err1:
-	if (err != -EINPROGRESS)
+	dev_dbg(dd->dev, "exit, err: %d\n", err);
+
+	if (err != -EINPROGRESS) {
 		/* done_task will not finish it, so do it here */
 		omap_sham_finish_req(req, err);
+		req = NULL;
 
-	dev_dbg(dd->dev, "exit, err: %d\n", err);
+		/*
+		 * Execute the next request immediately if there is
+		 * anything in the queue.
+		 */
+		goto retry;
+	}
 
 	return ret;
 }
@@ -1085,34 +1185,15 @@
 {
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(req);
 	struct omap_sham_dev *dd = ctx->dd;
-	int bs = get_block_size(ctx);
 
 	if (!req->nbytes)
 		return 0;
 
-	ctx->total = req->nbytes;
-	ctx->sg = req->src;
-	ctx->offset = 0;
-
-	if (ctx->flags & BIT(FLAGS_FINUP)) {
-		if ((ctx->digcnt + ctx->bufcnt + ctx->total) < 240) {
-			/*
-			* OMAP HW accel works only with buffers >= 9
-			* will switch to bypass in final()
-			* final has the same request and data
-			*/
-			omap_sham_append_sg(ctx);
-			return 0;
-		} else if ((ctx->bufcnt + ctx->total <= bs) ||
-			   dd->polling_mode) {
-			/*
-			 * faster to use CPU for short transfers or
-			 * use cpu when dma is not present.
-			 */
-			ctx->flags |= BIT(FLAGS_CPU);
-		}
-	} else if (ctx->bufcnt + ctx->total < ctx->buflen) {
-		omap_sham_append_sg(ctx);
+	if (ctx->total + req->nbytes < ctx->buflen) {
+		scatterwalk_map_and_copy(ctx->buffer + ctx->bufcnt, req->src,
+					 0, req->nbytes, 0);
+		ctx->bufcnt += req->nbytes;
+		ctx->total += req->nbytes;
 		return 0;
 	}
 
@@ -1137,9 +1218,20 @@
 {
 	struct omap_sham_ctx *tctx = crypto_tfm_ctx(req->base.tfm);
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(req);
+	int offset = 0;
+
+	/*
+	 * If we are running HMAC on hardware without automatic XOR
+	 * support, skip the ipad at the beginning of the buffer when
+	 * going for the software fallback algorithm.
+	 */
+	if (test_bit(FLAGS_HMAC, &ctx->flags) &&
+	    !test_bit(FLAGS_AUTO_XOR, &ctx->dd->flags))
+		offset = get_block_size(ctx);
 
 	return omap_sham_shash_digest(tctx->fallback, req->base.flags,
-				      ctx->buffer, ctx->bufcnt, req->result);
+				      ctx->buffer + offset,
+				      ctx->bufcnt - offset, req->result);
 }
 
 static int omap_sham_final(struct ahash_request *req)
@@ -1154,10 +1246,11 @@
 	/*
 	 * OMAP HW accel works only with buffers >= 9.
 	 * HMAC is always >= 9 because ipad == block size.
-	 * If buffersize is less than 240, we use fallback SW encoding,
-	 * as using DMA + HW in this case doesn't provide any benefit.
+	 * If the buffer size is less than DMA_THRESHOLD, we use the
+	 * SW fallback, as using DMA + HW in this case doesn't provide
+	 * any benefit.
 	 */
-	if ((ctx->digcnt + ctx->bufcnt) < 240)
+	if (!ctx->digcnt && ctx->bufcnt < OMAP_SHA_DMA_THRESHOLD)
 		return omap_sham_final_shash(req);
 	else if (ctx->bufcnt)
 		return omap_sham_enqueue(req, OP_FINAL);
@@ -1323,6 +1416,25 @@
 	}
 }
 
+static int omap_sham_export(struct ahash_request *req, void *out)
+{
+	struct omap_sham_reqctx *rctx = ahash_request_ctx(req);
+
+	memcpy(out, rctx, sizeof(*rctx) + rctx->bufcnt);
+
+	return 0;
+}
+
+static int omap_sham_import(struct ahash_request *req, const void *in)
+{
+	struct omap_sham_reqctx *rctx = ahash_request_ctx(req);
+	const struct omap_sham_reqctx *ctx_in = in;
+
+	memcpy(rctx, in, sizeof(*rctx) + ctx_in->bufcnt);
+
+	return 0;
+}
+
 static struct ahash_alg algs_sha1_md5[] = {
 {
 	.init		= omap_sham_init,
@@ -1341,7 +1453,7 @@
 						CRYPTO_ALG_NEED_FALLBACK,
 		.cra_blocksize		= SHA1_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct omap_sham_ctx),
-		.cra_alignmask		= 0,
+		.cra_alignmask		= OMAP_ALIGN_MASK,
 		.cra_module		= THIS_MODULE,
 		.cra_init		= omap_sham_cra_init,
 		.cra_exit		= omap_sham_cra_exit,
@@ -1440,7 +1552,7 @@
 						CRYPTO_ALG_NEED_FALLBACK,
 		.cra_blocksize		= SHA224_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct omap_sham_ctx),
-		.cra_alignmask		= 0,
+		.cra_alignmask		= OMAP_ALIGN_MASK,
 		.cra_module		= THIS_MODULE,
 		.cra_init		= omap_sham_cra_init,
 		.cra_exit		= omap_sham_cra_exit,
@@ -1462,7 +1574,7 @@
 						CRYPTO_ALG_NEED_FALLBACK,
 		.cra_blocksize		= SHA256_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct omap_sham_ctx),
-		.cra_alignmask		= 0,
+		.cra_alignmask		= OMAP_ALIGN_MASK,
 		.cra_module		= THIS_MODULE,
 		.cra_init		= omap_sham_cra_init,
 		.cra_exit		= omap_sham_cra_exit,
@@ -1535,7 +1647,7 @@
 						CRYPTO_ALG_NEED_FALLBACK,
 		.cra_blocksize		= SHA384_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct omap_sham_ctx),
-		.cra_alignmask		= 0,
+		.cra_alignmask		= OMAP_ALIGN_MASK,
 		.cra_module		= THIS_MODULE,
 		.cra_init		= omap_sham_cra_init,
 		.cra_exit		= omap_sham_cra_exit,
@@ -1557,7 +1669,7 @@
 						CRYPTO_ALG_NEED_FALLBACK,
 		.cra_blocksize		= SHA512_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct omap_sham_ctx),
-		.cra_alignmask		= 0,
+		.cra_alignmask		= OMAP_ALIGN_MASK,
 		.cra_module		= THIS_MODULE,
 		.cra_init		= omap_sham_cra_init,
 		.cra_exit		= omap_sham_cra_exit,
@@ -1624,12 +1736,8 @@
 	}
 
 	if (test_bit(FLAGS_CPU, &dd->flags)) {
-		if (test_and_clear_bit(FLAGS_OUTPUT_READY, &dd->flags)) {
-			/* hash or semi-hash ready */
-			err = omap_sham_update_cpu(dd);
-			if (err != -EINPROGRESS)
-				goto finish;
-		}
+		if (test_and_clear_bit(FLAGS_OUTPUT_READY, &dd->flags))
+			goto finish;
 	} else if (test_bit(FLAGS_DMA_READY, &dd->flags)) {
 		if (test_and_clear_bit(FLAGS_DMA_ACTIVE, &dd->flags)) {
 			omap_sham_update_dma_stop(dd);
@@ -1641,8 +1749,6 @@
 		if (test_and_clear_bit(FLAGS_OUTPUT_READY, &dd->flags)) {
 			/* hash or semi-hash ready */
 			clear_bit(FLAGS_DMA_READY, &dd->flags);
-			err = omap_sham_update_dma_start(dd);
-			if (err != -EINPROGRESS)
 				goto finish;
 		}
 	}
@@ -1653,6 +1759,10 @@
 	dev_dbg(dd->dev, "update done: err: %d\n", err);
 	/* finish current request */
 	omap_sham_finish_req(dd->req, err);
+
+	/* If we are not busy, process next req */
+	if (!test_bit(FLAGS_BUSY, &dd->flags))
+		omap_sham_handle_queue(dd, NULL);
 }
 
 static irqreturn_t omap_sham_irq_common(struct omap_sham_dev *dd)
@@ -1977,8 +2087,14 @@
 
 	for (i = 0; i < dd->pdata->algs_info_size; i++) {
 		for (j = 0; j < dd->pdata->algs_info[i].size; j++) {
-			err = crypto_register_ahash(
-					&dd->pdata->algs_info[i].algs_list[j]);
+			struct ahash_alg *alg;
+
+			alg = &dd->pdata->algs_info[i].algs_list[j];
+			alg->export = omap_sham_export;
+			alg->import = omap_sham_import;
+			alg->halg.statesize = sizeof(struct omap_sham_reqctx) +
+					      BUFLEN;
+			err = crypto_register_ahash(alg);
 			if (err)
 				goto err_algs;
 
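
A hedged aside on the export/import hooks added above: the exported
state is simply the request context plus any buffered bytes, sized by
halg.statesize. The sketch below shows how a caller might exercise such
hooks through the generic ahash API; the "sha256" name, the
synchronous-only allocation, and the function name are illustrative
assumptions, not code from this patch.

    #include <crypto/hash.h>
    #include <linux/scatterlist.h>
    #include <linux/slab.h>

    /*
     * Round-trip a partial hash state through crypto_ahash_export()/
     * crypto_ahash_import(). Masking CRYPTO_ALG_ASYNC requests a
     * synchronous implementation, which keeps the sketch free of
     * completion handling.
     */
    static int example_export_import(struct scatterlist *sg,
                                     unsigned int len, u8 *digest)
    {
            struct crypto_ahash *tfm;
            struct ahash_request *req = NULL;
            void *state = NULL;
            int err = -ENOMEM;

            tfm = crypto_alloc_ahash("sha256", 0, CRYPTO_ALG_ASYNC);
            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);

            req = ahash_request_alloc(tfm, GFP_KERNEL);
            state = kmalloc(crypto_ahash_statesize(tfm), GFP_KERNEL);
            if (!req || !state)
                    goto out;

            ahash_request_set_callback(req, 0, NULL, NULL);
            ahash_request_set_crypt(req, sg, digest, len);

            err = crypto_ahash_init(req);
            if (!err)
                    err = crypto_ahash_update(req);
            if (!err)
                    err = crypto_ahash_export(req, state); /* snapshot */
            if (!err)
                    err = crypto_ahash_import(req, state); /* resume */
            if (!err)
                    err = crypto_ahash_final(req);
    out:
            kfree(state);
            ahash_request_free(req);
            crypto_free_ahash(tfm);
            return err;
    }

An asynchronous transform such as the driver above would instead return
-EINPROGRESS from these calls and complete via the request callback.
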
diff --git a/drivers/crypto/qat/qat_c3xxx/adf_c3xxx_hw_data.h b/drivers/crypto/qat/qat_c3xxx/adf_c3xxx_hw_data.h
index 2f2681d..afc9a0a 100644
--- a/drivers/crypto/qat/qat_c3xxx/adf_c3xxx_hw_data.h
+++ b/drivers/crypto/qat/qat_c3xxx/adf_c3xxx_hw_data.h
@@ -55,7 +55,7 @@
 #define ADF_C3XXX_MAX_ACCELERATORS 3
 #define ADF_C3XXX_MAX_ACCELENGINES 6
 #define ADF_C3XXX_ACCELERATORS_REG_OFFSET 16
-#define ADF_C3XXX_ACCELERATORS_MASK 0x3
+#define ADF_C3XXX_ACCELERATORS_MASK 0x7
 #define ADF_C3XXX_ACCELENGINES_MASK 0x3F
 #define ADF_C3XXX_ETR_MAX_BANKS 16
 #define ADF_C3XXX_SMIAPF0_MASK_OFFSET (0x3A000 + 0x28)
diff --git a/drivers/crypto/qat/qat_common/adf_admin.c b/drivers/crypto/qat/qat_common/adf_admin.c
index ce7c462..3744b22 100644
--- a/drivers/crypto/qat/qat_common/adf_admin.c
+++ b/drivers/crypto/qat/qat_common/adf_admin.c
@@ -146,6 +146,7 @@
 	dma_addr_t phy_addr;
 	dma_addr_t const_tbl_addr;
 	void *virt_addr;
+	void *virt_tbl_addr;
 	void __iomem *mailbox_addr;
 	struct mutex lock;	/* protects adf_admin_comms struct */
 };
@@ -251,17 +252,19 @@
 		return -ENOMEM;
 	}
 
-	admin->const_tbl_addr = dma_map_single(&GET_DEV(accel_dev),
-					       (void *) const_tab, 1024,
-					       DMA_TO_DEVICE);
-
-	if (unlikely(dma_mapping_error(&GET_DEV(accel_dev),
-				       admin->const_tbl_addr))) {
+	admin->virt_tbl_addr = dma_zalloc_coherent(&GET_DEV(accel_dev),
+						   PAGE_SIZE,
+						   &admin->const_tbl_addr,
+						   GFP_KERNEL);
+	if (!admin->virt_tbl_addr) {
+		dev_err(&GET_DEV(accel_dev), "Failed to allocate const_tbl\n");
 		dma_free_coherent(&GET_DEV(accel_dev), PAGE_SIZE,
 				  admin->virt_addr, admin->phy_addr);
 		kfree(admin);
 		return -ENOMEM;
 	}
+
+	memcpy(admin->virt_tbl_addr, const_tab, sizeof(const_tab));
 	reg_val = (u64)admin->phy_addr;
 	ADF_CSR_WR(csr, ADF_DH895XCC_ADMINMSGUR_OFFSET, reg_val >> 32);
 	ADF_CSR_WR(csr, ADF_DH895XCC_ADMINMSGLR_OFFSET, reg_val);
@@ -282,9 +285,10 @@
 	if (admin->virt_addr)
 		dma_free_coherent(&GET_DEV(accel_dev), PAGE_SIZE,
 				  admin->virt_addr, admin->phy_addr);
+	if (admin->virt_tbl_addr)
+		dma_free_coherent(&GET_DEV(accel_dev), PAGE_SIZE,
+				  admin->virt_tbl_addr, admin->const_tbl_addr);
 
-	dma_unmap_single(&GET_DEV(accel_dev), admin->const_tbl_addr, 1024,
-			 DMA_TO_DEVICE);
 	mutex_destroy(&admin->lock);
 	kfree(admin);
 	accel_dev->admin = NULL;
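
The hunk above stops streaming-mapping (dma_map_single()) a table that
lives in the kernel image and instead allocates a coherent buffer and
copies the table into it; rodata is not reliably DMA-able on all
platforms. A minimal sketch of the pattern, with assumed names
(my_const_tab, my_setup_table):

    #include <linux/dma-mapping.h>
    #include <linux/string.h>

    static const u32 my_const_tab[64]; /* assumed device-visible table */

    static int my_setup_table(struct device *dev, void **virt,
                              dma_addr_t *bus)
    {
            /* Allocate zeroed, device-coherent memory ... */
            *virt = dma_zalloc_coherent(dev, PAGE_SIZE, bus, GFP_KERNEL);
            if (!*virt)
                    return -ENOMEM;

            /* ... and copy the read-only table into it. */
            memcpy(*virt, my_const_tab, sizeof(my_const_tab));
            return 0;
    }

    /* Teardown mirrors the hunk above:
     * dma_free_coherent(dev, PAGE_SIZE, virt, bus);
     */
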
diff --git a/drivers/crypto/qat/qat_common/qat_uclo.c b/drivers/crypto/qat/qat_common/qat_uclo.c
index 9b961b3..e2454d9 100644
--- a/drivers/crypto/qat/qat_common/qat_uclo.c
+++ b/drivers/crypto/qat/qat_common/qat_uclo.c
@@ -967,10 +967,6 @@
 	struct icp_qat_uclo_objhandle *obj_handle = handle->obj_handle;
 	unsigned int ae;
 
-	obj_handle->uword_buf = kcalloc(UWORD_CPYBUF_SIZE, sizeof(uint64_t),
-					GFP_KERNEL);
-	if (!obj_handle->uword_buf)
-		return -ENOMEM;
 	obj_handle->encap_uof_obj.beg_uof = obj_handle->obj_hdr->file_buff;
 	obj_handle->encap_uof_obj.obj_hdr = (struct icp_qat_uof_objhdr *)
 					     obj_handle->obj_hdr->file_buff;
@@ -982,6 +978,10 @@
 		pr_err("QAT: UOF incompatible\n");
 		return -EINVAL;
 	}
+	obj_handle->uword_buf = kcalloc(UWORD_CPYBUF_SIZE, sizeof(uint64_t),
+					GFP_KERNEL);
+	if (!obj_handle->uword_buf)
+		return -ENOMEM;
 	obj_handle->ustore_phy_size = ICP_QAT_UCLO_MAX_USTORE;
 	if (!obj_handle->obj_hdr->file_buff ||
 	    !qat_uclo_map_str_table(obj_handle->obj_hdr, ICP_QAT_UOF_STRT,
diff --git a/drivers/crypto/rockchip/rk3288_crypto.c b/drivers/crypto/rockchip/rk3288_crypto.c
index af50825..d0f80c6 100644
--- a/drivers/crypto/rockchip/rk3288_crypto.c
+++ b/drivers/crypto/rockchip/rk3288_crypto.c
@@ -304,11 +304,9 @@
 	usleep_range(10, 20);
 	reset_control_deassert(crypto_info->rst);
 
-	err = devm_add_action(dev, rk_crypto_action, crypto_info);
-	if (err) {
-		reset_control_assert(crypto_info->rst);
+	err = devm_add_action_or_reset(dev, rk_crypto_action, crypto_info);
+	if (err)
 		goto err_crypto;
-	}
 
 	spin_lock_init(&crypto_info->lock);
 
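
devm_add_action_or_reset() runs the action itself when registration
fails, which is what lets the hunk above drop the manual
reset_control_assert() on the error path. A hedged sketch of the idiom
with hypothetical names:

    #include <linux/device.h>
    #include <linux/reset.h>

    struct my_crypto {
            struct reset_control *rst;
    };

    static void my_disable(void *data)
    {
            struct my_crypto *c = data;

            reset_control_assert(c->rst);
    }

    static int my_probe_step(struct device *dev, struct my_crypto *c)
    {
            int err;

            reset_control_deassert(c->rst);

            /* On failure the helper has already run my_disable(). */
            err = devm_add_action_or_reset(dev, my_disable, c);
            if (err)
                    return err;

            return 0;
    }
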
diff --git a/drivers/crypto/sunxi-ss/sun4i-ss-cipher.c b/drivers/crypto/sunxi-ss/sun4i-ss-cipher.c
index 3830d7c..90efd10 100644
--- a/drivers/crypto/sunxi-ss/sun4i-ss-cipher.c
+++ b/drivers/crypto/sunxi-ss/sun4i-ss-cipher.c
@@ -29,7 +29,8 @@
 	u32 tx_cnt = 0;
 	u32 spaces;
 	u32 v;
-	int i, err = 0;
+	int err = 0;
+	unsigned int i;
 	unsigned int ileft = areq->nbytes;
 	unsigned int oleft = areq->nbytes;
 	unsigned int todo;
@@ -139,7 +140,8 @@
 	u32 tx_cnt = 0;
 	u32 v;
 	u32 spaces;
-	int i, err = 0;
+	int err = 0;
+	unsigned int i;
 	unsigned int ileft = areq->nbytes;
 	unsigned int oleft = areq->nbytes;
 	unsigned int todo;
diff --git a/drivers/crypto/sunxi-ss/sun4i-ss-core.c b/drivers/crypto/sunxi-ss/sun4i-ss-core.c
index 107cd2a..3ac6c6c 100644
--- a/drivers/crypto/sunxi-ss/sun4i-ss-core.c
+++ b/drivers/crypto/sunxi-ss/sun4i-ss-core.c
@@ -172,45 +172,45 @@
 },
 {       .type = CRYPTO_ALG_TYPE_ABLKCIPHER,
 	.alg.crypto = {
-			.cra_name = "cbc(des3_ede)",
-			.cra_driver_name = "cbc-des3-sun4i-ss",
-			.cra_priority = 300,
-			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
-			.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER,
-			.cra_ctxsize = sizeof(struct sun4i_req_ctx),
-			.cra_module = THIS_MODULE,
-			.cra_alignmask = 3,
-			.cra_type = &crypto_ablkcipher_type,
-			.cra_init = sun4i_ss_cipher_init,
-			.cra_u.ablkcipher = {
-				.min_keysize    = DES3_EDE_KEY_SIZE,
-				.max_keysize    = DES3_EDE_KEY_SIZE,
-				.ivsize         = DES3_EDE_BLOCK_SIZE,
-				.setkey         = sun4i_ss_des3_setkey,
-				.encrypt        = sun4i_ss_cbc_des3_encrypt,
-				.decrypt        = sun4i_ss_cbc_des3_decrypt,
+		.cra_name = "cbc(des3_ede)",
+		.cra_driver_name = "cbc-des3-sun4i-ss",
+		.cra_priority = 300,
+		.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+		.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER,
+		.cra_ctxsize = sizeof(struct sun4i_req_ctx),
+		.cra_module = THIS_MODULE,
+		.cra_alignmask = 3,
+		.cra_type = &crypto_ablkcipher_type,
+		.cra_init = sun4i_ss_cipher_init,
+		.cra_u.ablkcipher = {
+			.min_keysize    = DES3_EDE_KEY_SIZE,
+			.max_keysize    = DES3_EDE_KEY_SIZE,
+			.ivsize         = DES3_EDE_BLOCK_SIZE,
+			.setkey         = sun4i_ss_des3_setkey,
+			.encrypt        = sun4i_ss_cbc_des3_encrypt,
+			.decrypt        = sun4i_ss_cbc_des3_decrypt,
 		}
 	}
 },
 {       .type = CRYPTO_ALG_TYPE_ABLKCIPHER,
 	.alg.crypto = {
-			.cra_name = "ecb(des3_ede)",
-			.cra_driver_name = "ecb-des3-sun4i-ss",
-			.cra_priority = 300,
-			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
-			.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER,
-			.cra_ctxsize = sizeof(struct sun4i_req_ctx),
-			.cra_module = THIS_MODULE,
-			.cra_alignmask = 3,
-			.cra_type = &crypto_ablkcipher_type,
-			.cra_init = sun4i_ss_cipher_init,
-			.cra_u.ablkcipher = {
-				.min_keysize    = DES3_EDE_KEY_SIZE,
-				.max_keysize    = DES3_EDE_KEY_SIZE,
-				.ivsize         = DES3_EDE_BLOCK_SIZE,
-				.setkey         = sun4i_ss_des3_setkey,
-				.encrypt        = sun4i_ss_ecb_des3_encrypt,
-				.decrypt        = sun4i_ss_ecb_des3_decrypt,
+		.cra_name = "ecb(des3_ede)",
+		.cra_driver_name = "ecb-des3-sun4i-ss",
+		.cra_priority = 300,
+		.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+		.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER,
+		.cra_ctxsize = sizeof(struct sun4i_req_ctx),
+		.cra_module = THIS_MODULE,
+		.cra_alignmask = 3,
+		.cra_type = &crypto_ablkcipher_type,
+		.cra_init = sun4i_ss_cipher_init,
+		.cra_u.ablkcipher = {
+			.min_keysize    = DES3_EDE_KEY_SIZE,
+			.max_keysize    = DES3_EDE_KEY_SIZE,
+			.ivsize         = DES3_EDE_BLOCK_SIZE,
+			.setkey         = sun4i_ss_des3_setkey,
+			.encrypt        = sun4i_ss_ecb_des3_encrypt,
+			.decrypt        = sun4i_ss_ecb_des3_decrypt,
 		}
 	}
 },
diff --git a/drivers/crypto/sunxi-ss/sun4i-ss-hash.c b/drivers/crypto/sunxi-ss/sun4i-ss-hash.c
index ff80314..0de2f62 100644
--- a/drivers/crypto/sunxi-ss/sun4i-ss-hash.c
+++ b/drivers/crypto/sunxi-ss/sun4i-ss-hash.c
@@ -20,6 +20,15 @@
 
 int sun4i_hash_crainit(struct crypto_tfm *tfm)
 {
+	struct sun4i_tfm_ctx *op = crypto_tfm_ctx(tfm);
+	struct ahash_alg *alg = __crypto_ahash_alg(tfm->__crt_alg);
+	struct sun4i_ss_alg_template *algt;
+
+	memset(op, 0, sizeof(struct sun4i_tfm_ctx));
+
+	algt = container_of(alg, struct sun4i_ss_alg_template, alg.hash);
+	op->ss = algt->ss;
+
 	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
 				 sizeof(struct sun4i_req_ctx));
 	return 0;
@@ -32,13 +41,10 @@
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq);
 	struct ahash_alg *alg = __crypto_ahash_alg(tfm->base.__crt_alg);
 	struct sun4i_ss_alg_template *algt;
-	struct sun4i_ss_ctx *ss;
 
 	memset(op, 0, sizeof(struct sun4i_req_ctx));
 
 	algt = container_of(alg, struct sun4i_ss_alg_template, alg.hash);
-	ss = algt->ss;
-	op->ss = algt->ss;
 	op->mode = algt->mode;
 
 	return 0;
@@ -129,6 +135,9 @@
 	return 0;
 }
 
+#define SS_HASH_UPDATE 1
+#define SS_HASH_FINAL 2
+
 /*
  * sun4i_hash_update: update hash engine
  *
@@ -156,7 +165,7 @@
  * write remaining data in op->buf
  * final state op->len=56
  */
-int sun4i_hash_update(struct ahash_request *areq)
+static int sun4i_hash(struct ahash_request *areq)
 {
 	u32 v, ivmode = 0;
 	unsigned int i = 0;
@@ -167,8 +176,9 @@
 	 */
 
 	struct sun4i_req_ctx *op = ahash_request_ctx(areq);
-	struct sun4i_ss_ctx *ss = op->ss;
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq);
+	struct sun4i_tfm_ctx *tfmctx = crypto_ahash_ctx(tfm);
+	struct sun4i_ss_ctx *ss = tfmctx->ss;
 	unsigned int in_i = 0; /* advancement in the current SG */
 	unsigned int end;
 	/*
@@ -180,22 +190,30 @@
 	u32 spaces, rx_cnt = SS_RX_DEFAULT;
 	size_t copied = 0;
 	struct sg_mapping_iter mi;
+	unsigned int j = 0;
+	int zeros;
+	unsigned int index, padlen;
+	__be64 bits;
+	u32 bf[32];
+	u32 wb = 0;
+	unsigned int nwait, nbw = 0;
+	struct scatterlist *in_sg = areq->src;
 
 	dev_dbg(ss->dev, "%s %s bc=%llu len=%u mode=%x wl=%u h0=%0x",
 		__func__, crypto_tfm_alg_name(areq->base.tfm),
 		op->byte_count, areq->nbytes, op->mode,
 		op->len, op->hash[0]);
 
-	if (areq->nbytes == 0)
+	if (unlikely(areq->nbytes == 0) && (op->flags & SS_HASH_FINAL) == 0)
 		return 0;
 
 	/* protect against overflow */
-	if (areq->nbytes > UINT_MAX - op->len) {
+	if (unlikely(areq->nbytes > UINT_MAX - op->len)) {
 		dev_err(ss->dev, "Cannot process too large request\n");
 		return -EINVAL;
 	}
 
-	if (op->len + areq->nbytes < 64) {
+	if (op->len + areq->nbytes < 64 && (op->flags & SS_HASH_FINAL) == 0) {
 		/* linearize data to op->buf */
 		copied = sg_pcopy_to_buffer(areq->src, sg_nents(areq->src),
 					    op->buf + op->len, areq->nbytes, 0);
@@ -203,14 +221,6 @@
 		return 0;
 	}
 
-	end = ((areq->nbytes + op->len) / 64) * 64 - op->len;
-
-	if (end > areq->nbytes || areq->nbytes - end > 63) {
-		dev_err(ss->dev, "ERROR: Bound error %u %u\n",
-			end, areq->nbytes);
-		return -EINVAL;
-	}
-
 	spin_lock_bh(&ss->slock);
 
 	/*
@@ -225,6 +235,34 @@
 	/* Enable the device */
 	writel(op->mode | SS_ENABLED | ivmode, ss->base + SS_CTL);
 
+	if ((op->flags & SS_HASH_UPDATE) == 0)
+		goto hash_final;
+
+	/* start of handling data */
+	if ((op->flags & SS_HASH_FINAL) == 0) {
+		end = ((areq->nbytes + op->len) / 64) * 64 - op->len;
+
+		if (end > areq->nbytes || areq->nbytes - end > 63) {
+			dev_err(ss->dev, "ERROR: Bound error %u %u\n",
+				end, areq->nbytes);
+			err = -EINVAL;
+			goto release_ss;
+		}
+	} else {
+		/* Since we have the flag final, we can go up to modulo 4 */
+		end = ((areq->nbytes + op->len) / 4) * 4 - op->len;
+	}
+
+	/* TODO if SGlen % 4 and op->len == 0 then DMA */
+	i = 1;
+	while (in_sg && i == 1) {
+		if ((in_sg->length % 4) != 0)
+			i = 0;
+		in_sg = sg_next(in_sg);
+	}
+	if (i == 1 && op->len == 0)
+		dev_dbg(ss->dev, "We can DMA\n");
+
 	i = 0;
 	sg_miter_start(&mi, areq->src, sg_nents(areq->src),
 		       SG_MITER_FROM_SG | SG_MITER_ATOMIC);
@@ -285,7 +323,11 @@
 			}
 		}
 	} while (i < end);
-	/* final linear */
+
+	/*
+	 * Now we have written to the device all that we can,
+	 * store the remaining bytes in op->buf
+	 */
 	if ((areq->nbytes - i) < 64) {
 		while (i < areq->nbytes && in_i < mi.length && op->len < 64) {
 			/* how many bytes we can read from current SG */
@@ -304,13 +346,21 @@
 
 	sg_miter_stop(&mi);
 
+	/*
+	 * End of data processing.
+	 * If the final flag is set, go to the finalization part;
+	 * if not, store the partial hash.
+	 */
+	if ((op->flags & SS_HASH_FINAL) > 0)
+		goto hash_final;
+
 	writel(op->mode | SS_ENABLED | SS_DATA_END, ss->base + SS_CTL);
 	i = 0;
 	do {
 		v = readl(ss->base + SS_CTL);
 		i++;
 	} while (i < SS_TIMEOUT && (v & SS_DATA_END) > 0);
-	if (i >= SS_TIMEOUT) {
+	if (unlikely(i >= SS_TIMEOUT)) {
 		dev_err_ratelimited(ss->dev,
 				    "ERROR: hash end timeout %d>%d ctl=%x len=%u\n",
 				    i, SS_TIMEOUT, v, areq->nbytes);
@@ -318,56 +368,24 @@
 		goto release_ss;
 	}
 
-	/* get the partial hash only if something was written */
 	for (i = 0; i < crypto_ahash_digestsize(tfm) / 4; i++)
 		op->hash[i] = readl(ss->base + SS_MD0 + i * 4);
 
-release_ss:
-	writel(0, ss->base + SS_CTL);
-	spin_unlock_bh(&ss->slock);
-	return err;
-}
+	goto release_ss;
 
 /*
- * sun4i_hash_final: finalize hashing operation
+ * hash_final: finalize hashing operation
  *
  * If we have some remaining bytes, we write them.
  * Then ask the SS for finalizing the hashing operation
  *
  * I do not check RX FIFO size in this function since the size is 32
  * after each enabling and this function never writes more than 32 words.
+ * If we come from the update part, we cannot have more than
+ * 3 remaining bytes to write, and the SS is fast enough not to care.
  */
-int sun4i_hash_final(struct ahash_request *areq)
-{
-	u32 v, ivmode = 0;
-	unsigned int i;
-	unsigned int j = 0;
-	int zeros, err = 0;
-	unsigned int index, padlen;
-	__be64 bits;
-	struct sun4i_req_ctx *op = ahash_request_ctx(areq);
-	struct sun4i_ss_ctx *ss = op->ss;
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq);
-	u32 bf[32];
-	u32 wb = 0;
-	unsigned int nwait, nbw = 0;
 
-	dev_dbg(ss->dev, "%s: byte=%llu len=%u mode=%x wl=%u h=%x",
-		__func__, op->byte_count, areq->nbytes, op->mode,
-		op->len, op->hash[0]);
-
-	spin_lock_bh(&ss->slock);
-
-	/*
-	 * if we have already written something,
-	 * restore the partial hash state
-	 */
-	if (op->byte_count > 0) {
-		ivmode = SS_IV_ARBITRARY;
-		for (i = 0; i < crypto_ahash_digestsize(tfm) / 4; i++)
-			writel(op->hash[i], ss->base + SS_IV0 + i * 4);
-	}
-	writel(op->mode | SS_ENABLED | ivmode, ss->base + SS_CTL);
+hash_final:
 
 	/* write the remaining words of the wait buffer */
 	if (op->len > 0) {
@@ -428,7 +446,7 @@
 
 	/*
 	 * Wait for SS to finish the hash.
-	 * The timeout could happen only in case of bad overcloking
+	 * The timeout could happen only in case of bad overclocking
 	 * or driver bug.
 	 */
 	i = 0;
@@ -436,7 +454,7 @@
 		v = readl(ss->base + SS_CTL);
 		i++;
 	} while (i < SS_TIMEOUT && (v & SS_DATA_END) > 0);
-	if (i >= SS_TIMEOUT) {
+	if (unlikely(i >= SS_TIMEOUT)) {
 		dev_err_ratelimited(ss->dev,
 				    "ERROR: hash end timeout %d>%d ctl=%x len=%u\n",
 				    i, SS_TIMEOUT, v, areq->nbytes);
@@ -463,30 +481,41 @@
 	return err;
 }
 
+int sun4i_hash_final(struct ahash_request *areq)
+{
+	struct sun4i_req_ctx *op = ahash_request_ctx(areq);
+
+	op->flags = SS_HASH_FINAL;
+	return sun4i_hash(areq);
+}
+
+int sun4i_hash_update(struct ahash_request *areq)
+{
+	struct sun4i_req_ctx *op = ahash_request_ctx(areq);
+
+	op->flags = SS_HASH_UPDATE;
+	return sun4i_hash(areq);
+}
+
 /* sun4i_hash_finup: finalize hashing operation after an update */
 int sun4i_hash_finup(struct ahash_request *areq)
 {
-	int err;
+	struct sun4i_req_ctx *op = ahash_request_ctx(areq);
 
-	err = sun4i_hash_update(areq);
-	if (err != 0)
-		return err;
-
-	return sun4i_hash_final(areq);
+	op->flags = SS_HASH_UPDATE | SS_HASH_FINAL;
+	return sun4i_hash(areq);
 }
 
 /* combo of init/update/final functions */
 int sun4i_hash_digest(struct ahash_request *areq)
 {
 	int err;
+	struct sun4i_req_ctx *op = ahash_request_ctx(areq);
 
 	err = sun4i_hash_init(areq);
 	if (err != 0)
 		return err;
 
-	err = sun4i_hash_update(areq);
-	if (err != 0)
-		return err;
-
-	return sun4i_hash_final(areq);
+	op->flags = SS_HASH_UPDATE | SS_HASH_FINAL;
+	return sun4i_hash(areq);
 }
diff --git a/drivers/crypto/sunxi-ss/sun4i-ss.h b/drivers/crypto/sunxi-ss/sun4i-ss.h
index 8e9c05f..f04c0f8 100644
--- a/drivers/crypto/sunxi-ss/sun4i-ss.h
+++ b/drivers/crypto/sunxi-ss/sun4i-ss.h
@@ -163,7 +163,7 @@
 	u32 hash[5]; /* for storing SS_IVx register */
 	char buf[64];
 	unsigned int len;
-	struct sun4i_ss_ctx *ss;
+	int flags;
 };
 
 int sun4i_hash_crainit(struct crypto_tfm *tfm);
diff --git a/drivers/crypto/vmx/Kconfig b/drivers/crypto/vmx/Kconfig
index a83ead1..c3d524e 100644
--- a/drivers/crypto/vmx/Kconfig
+++ b/drivers/crypto/vmx/Kconfig
@@ -1,6 +1,7 @@
 config CRYPTO_DEV_VMX_ENCRYPT
 	tristate "Encryption acceleration support on P8 CPU"
 	depends on CRYPTO_DEV_VMX
+	select CRYPTO_GHASH
 	default m
 	help
 	  Support for VMX cryptographic acceleration instructions on Power8 CPU.
diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c
index 6c999cb0..27a94a1 100644
--- a/drivers/crypto/vmx/ghash.c
+++ b/drivers/crypto/vmx/ghash.c
@@ -26,16 +26,13 @@
 #include <linux/hardirq.h>
 #include <asm/switch_to.h>
 #include <crypto/aes.h>
+#include <crypto/ghash.h>
 #include <crypto/scatterwalk.h>
 #include <crypto/internal/hash.h>
 #include <crypto/b128ops.h>
 
 #define IN_INTERRUPT in_interrupt()
 
-#define GHASH_BLOCK_SIZE (16)
-#define GHASH_DIGEST_SIZE (16)
-#define GHASH_KEY_LEN (16)
-
 void gcm_init_p8(u128 htable[16], const u64 Xi[2]);
 void gcm_gmult_p8(u64 Xi[2], const u128 htable[16]);
 void gcm_ghash_p8(u64 Xi[2], const u128 htable[16],
@@ -55,16 +52,11 @@
 
 static int p8_ghash_init_tfm(struct crypto_tfm *tfm)
 {
-	const char *alg;
+	const char *alg = "ghash-generic";
 	struct crypto_shash *fallback;
 	struct crypto_shash *shash_tfm = __crypto_shash_cast(tfm);
 	struct p8_ghash_ctx *ctx = crypto_tfm_ctx(tfm);
 
-	if (!(alg = crypto_tfm_alg_name(tfm))) {
-		printk(KERN_ERR "Failed to get algorithm name.\n");
-		return -ENOENT;
-	}
-
 	fallback = crypto_alloc_shash(alg, 0, CRYPTO_ALG_NEED_FALLBACK);
 	if (IS_ERR(fallback)) {
 		printk(KERN_ERR
@@ -78,10 +70,18 @@
 	crypto_shash_set_flags(fallback,
 			       crypto_shash_get_flags((struct crypto_shash
 						       *) tfm));
-	ctx->fallback = fallback;
 
-	shash_tfm->descsize = sizeof(struct p8_ghash_desc_ctx)
-	    + crypto_shash_descsize(fallback);
+	/* Check if the descsize defined in the algorithm is still enough. */
+	if (shash_tfm->descsize < sizeof(struct p8_ghash_desc_ctx)
+	    + crypto_shash_descsize(fallback)) {
+		printk(KERN_ERR
+		       "Desc size of the fallback implementation (%s) does not match the expected value: %lu vs %u\n",
+		       alg,
+		       shash_tfm->descsize - sizeof(struct p8_ghash_desc_ctx),
+		       crypto_shash_descsize(fallback));
+		return -EINVAL;
+	}
+	ctx->fallback = fallback;
 
 	return 0;
 }
@@ -113,7 +113,7 @@
 {
 	struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(tfm));
 
-	if (keylen != GHASH_KEY_LEN)
+	if (keylen != GHASH_BLOCK_SIZE)
 		return -EINVAL;
 
 	preempt_disable();
@@ -211,7 +211,8 @@
 	.update = p8_ghash_update,
 	.final = p8_ghash_final,
 	.setkey = p8_ghash_setkey,
-	.descsize = sizeof(struct p8_ghash_desc_ctx),
+	.descsize = sizeof(struct p8_ghash_desc_ctx)
+		+ sizeof(struct ghash_desc_ctx),
 	.base = {
 		 .cra_name = "ghash",
 		 .cra_driver_name = "p8_ghash",
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index cffc1c0..c232729 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -834,6 +834,17 @@
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD,	PCI_DEVICE_ID_AMD_VIPER_7410,	quirk_amd_ioapic);
 #endif /* CONFIG_X86_IO_APIC */
 
+#if defined(CONFIG_ARM64) && defined(CONFIG_PCI_ATS)
+
+static void quirk_cavium_sriov_rnm_link(struct pci_dev *dev)
+{
+	/* Fix for improper SRIOV configuration on Cavium cn88xx RNM device */
+	if (dev->subsystem_device == 0xa118)
+		dev->sriov->link = dev->devfn;
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CAVIUM, 0xa018, quirk_cavium_sriov_rnm_link);
+#endif
+
 /*
  * Some settings of MMRBC can lead to data corruption so block changes.
  * See AMD 8131 HyperTransport PCI-X Tunnel Revision Guide
diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index 8637cdf..404e955 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -15,7 +15,6 @@
 #include <linux/crypto.h>
 #include <linux/list.h>
 #include <linux/kernel.h>
-#include <linux/kthread.h>
 #include <linux/skbuff.h>
 
 struct crypto_aead;
@@ -129,75 +128,6 @@
 	unsigned int		blocksize;
 };
 
-#define ENGINE_NAME_LEN	30
-/*
- * struct crypto_engine - crypto hardware engine
- * @name: the engine name
- * @idling: the engine is entering idle state
- * @busy: request pump is busy
- * @running: the engine is on working
- * @cur_req_prepared: current request is prepared
- * @list: link with the global crypto engine list
- * @queue_lock: spinlock to syncronise access to request queue
- * @queue: the crypto queue of the engine
- * @rt: whether this queue is set to run as a realtime task
- * @prepare_crypt_hardware: a request will soon arrive from the queue
- * so the subsystem requests the driver to prepare the hardware
- * by issuing this call
- * @unprepare_crypt_hardware: there are currently no more requests on the
- * queue so the subsystem notifies the driver that it may relax the
- * hardware by issuing this call
- * @prepare_request: do some prepare if need before handle the current request
- * @unprepare_request: undo any work done by prepare_message()
- * @crypt_one_request: do encryption for current request
- * @kworker: thread struct for request pump
- * @kworker_task: pointer to task for request pump kworker thread
- * @pump_requests: work struct for scheduling work to the request pump
- * @priv_data: the engine private data
- * @cur_req: the current request which is on processing
- */
-struct crypto_engine {
-	char			name[ENGINE_NAME_LEN];
-	bool			idling;
-	bool			busy;
-	bool			running;
-	bool			cur_req_prepared;
-
-	struct list_head	list;
-	spinlock_t		queue_lock;
-	struct crypto_queue	queue;
-
-	bool			rt;
-
-	int (*prepare_crypt_hardware)(struct crypto_engine *engine);
-	int (*unprepare_crypt_hardware)(struct crypto_engine *engine);
-
-	int (*prepare_request)(struct crypto_engine *engine,
-			       struct ablkcipher_request *req);
-	int (*unprepare_request)(struct crypto_engine *engine,
-				 struct ablkcipher_request *req);
-	int (*crypt_one_request)(struct crypto_engine *engine,
-				 struct ablkcipher_request *req);
-
-	struct kthread_worker           kworker;
-	struct task_struct              *kworker_task;
-	struct kthread_work             pump_requests;
-
-	void				*priv_data;
-	struct ablkcipher_request	*cur_req;
-};
-
-int crypto_transfer_request(struct crypto_engine *engine,
-			    struct ablkcipher_request *req, bool need_pump);
-int crypto_transfer_request_to_engine(struct crypto_engine *engine,
-				      struct ablkcipher_request *req);
-void crypto_finalize_request(struct crypto_engine *engine,
-			     struct ablkcipher_request *req, int err);
-int crypto_engine_start(struct crypto_engine *engine);
-int crypto_engine_stop(struct crypto_engine *engine);
-struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt);
-int crypto_engine_exit(struct crypto_engine *engine);
-
 extern const struct crypto_type crypto_ablkcipher_type;
 extern const struct crypto_type crypto_blkcipher_type;
 
diff --git a/include/crypto/engine.h b/include/crypto/engine.h
new file mode 100644
index 0000000..04eb5c7
--- /dev/null
+++ b/include/crypto/engine.h
@@ -0,0 +1,107 @@
+/*
+ * Crypto engine API
+ *
+ * Copyright (c) 2016 Baolin Wang <baolin.wang@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#ifndef _CRYPTO_ENGINE_H
+#define _CRYPTO_ENGINE_H
+
+#include <linux/crypto.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <crypto/algapi.h>
+#include <crypto/hash.h>
+
+#define ENGINE_NAME_LEN	30
+/*
+ * struct crypto_engine - crypto hardware engine
+ * @name: the engine name
+ * @idling: the engine is entering idle state
+ * @busy: request pump is busy
+ * @running: the engine is currently running
+ * @cur_req_prepared: current request is prepared
+ * @list: link with the global crypto engine list
+ * @queue_lock: spinlock to synchronise access to request queue
+ * @queue: the crypto queue of the engine
+ * @rt: whether this queue is set to run as a realtime task
+ * @prepare_crypt_hardware: a request will soon arrive from the queue
+ * so the subsystem requests the driver to prepare the hardware
+ * by issuing this call
+ * @unprepare_crypt_hardware: there are currently no more requests on the
+ * queue so the subsystem notifies the driver that it may relax the
+ * hardware by issuing this call
+ * @prepare_cipher_request: do any preparation needed before handling the current request
+ * @unprepare_cipher_request: undo any work done by prepare_cipher_request()
+ * @cipher_one_request: do the encryption for the current request
+ * @prepare_hash_request: do any preparation needed before handling the current request
+ * @unprepare_hash_request: undo any work done by prepare_hash_request()
+ * @hash_one_request: do the hash for the current request
+ * @kworker: thread struct for request pump
+ * @kworker_task: pointer to task for request pump kworker thread
+ * @pump_requests: work struct for scheduling work to the request pump
+ * @priv_data: the engine private data
+ * @cur_req: the current request which is being processed
+ */
+struct crypto_engine {
+	char			name[ENGINE_NAME_LEN];
+	bool			idling;
+	bool			busy;
+	bool			running;
+	bool			cur_req_prepared;
+
+	struct list_head	list;
+	spinlock_t		queue_lock;
+	struct crypto_queue	queue;
+
+	bool			rt;
+
+	int (*prepare_crypt_hardware)(struct crypto_engine *engine);
+	int (*unprepare_crypt_hardware)(struct crypto_engine *engine);
+
+	int (*prepare_cipher_request)(struct crypto_engine *engine,
+				      struct ablkcipher_request *req);
+	int (*unprepare_cipher_request)(struct crypto_engine *engine,
+					struct ablkcipher_request *req);
+	int (*prepare_hash_request)(struct crypto_engine *engine,
+				    struct ahash_request *req);
+	int (*unprepare_hash_request)(struct crypto_engine *engine,
+				      struct ahash_request *req);
+	int (*cipher_one_request)(struct crypto_engine *engine,
+				  struct ablkcipher_request *req);
+	int (*hash_one_request)(struct crypto_engine *engine,
+				struct ahash_request *req);
+
+	struct kthread_worker           kworker;
+	struct task_struct              *kworker_task;
+	struct kthread_work             pump_requests;
+
+	void				*priv_data;
+	struct crypto_async_request	*cur_req;
+};
+
+int crypto_transfer_cipher_request(struct crypto_engine *engine,
+				   struct ablkcipher_request *req,
+				   bool need_pump);
+int crypto_transfer_cipher_request_to_engine(struct crypto_engine *engine,
+					     struct ablkcipher_request *req);
+int crypto_transfer_hash_request(struct crypto_engine *engine,
+				 struct ahash_request *req, bool need_pump);
+int crypto_transfer_hash_request_to_engine(struct crypto_engine *engine,
+					   struct ahash_request *req);
+void crypto_finalize_cipher_request(struct crypto_engine *engine,
+				    struct ablkcipher_request *req, int err);
+void crypto_finalize_hash_request(struct crypto_engine *engine,
+				  struct ahash_request *req, int err);
+int crypto_engine_start(struct crypto_engine *engine);
+int crypto_engine_stop(struct crypto_engine *engine);
+struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt);
+int crypto_engine_exit(struct crypto_engine *engine);
+
+#endif /* _CRYPTO_ENGINE_H */
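
To make the new cipher/hash split concrete, here is a hedged sketch of
how a driver might wire an ahash path through the API declared above.
my_start_hash() is a hypothetical hardware hook, and a real device
would normally call crypto_finalize_hash_request() from its completion
interrupt rather than synchronously:

    #include <crypto/engine.h>

    static struct crypto_engine *my_engine;

    static int my_hash_one_request(struct crypto_engine *engine,
                                   struct ahash_request *req)
    {
            int err = my_start_hash(req); /* hypothetical HW kick-off */

            /* Synchronous device assumed: finalize immediately. */
            crypto_finalize_hash_request(engine, req, err);
            return 0;
    }

    static int my_engine_init(struct device *dev)
    {
            my_engine = crypto_engine_alloc_init(dev, true);
            if (!my_engine)
                    return -ENOMEM;

            my_engine->hash_one_request = my_hash_one_request;
            return crypto_engine_start(my_engine);
    }

    /* The algorithm's .update/.final handlers then only queue work: */
    static int my_ahash_update(struct ahash_request *req)
    {
            return crypto_transfer_hash_request_to_engine(my_engine, req);
    }
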
diff --git a/include/crypto/ghash.h b/include/crypto/ghash.h
new file mode 100644
index 0000000..2a61c9b
--- /dev/null
+++ b/include/crypto/ghash.h
@@ -0,0 +1,23 @@
+/*
+ * Common values for GHASH algorithms
+ */
+
+#ifndef __CRYPTO_GHASH_H__
+#define __CRYPTO_GHASH_H__
+
+#include <linux/types.h>
+#include <crypto/gf128mul.h>
+
+#define GHASH_BLOCK_SIZE	16
+#define GHASH_DIGEST_SIZE	16
+
+struct ghash_ctx {
+	struct gf128mul_4k *gf128;
+};
+
+struct ghash_desc_ctx {
+	u8 buffer[GHASH_BLOCK_SIZE];
+	u32 bytes;
+};
+
+#endif
diff --git a/include/linux/ccp.h b/include/linux/ccp.h
index 7c2bb27..a765333 100644
--- a/include/linux/ccp.h
+++ b/include/linux/ccp.h
@@ -238,9 +238,6 @@
 };
 
 /***** SHA engine *****/
-#define CCP_SHA_BLOCKSIZE               SHA256_BLOCK_SIZE
-#define CCP_SHA_CTXSIZE                 SHA256_DIGEST_SIZE
-
 /**
  * ccp_sha_type - type of SHA operation
  *
diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h
index 4f7d8f4..34a0dc1 100644
--- a/include/linux/hw_random.h
+++ b/include/linux/hw_random.h
@@ -29,7 +29,9 @@
  *			Returns the number of lower random bytes in "data".
  *			Must not be NULL.    *OBSOLETE*
  * @read:		New API. drivers can fill up to max bytes of data
- *			into the buffer. The buffer is aligned for any type.
+ *			into the buffer. The buffer is aligned for any type
+ *			and max is guaranteed to be at least that alignment
+ *			(either 4 or 8 depending on architecture).
  * @priv:		Private data, for use by the RNG driver.
  * @quality:		Estimation of true entropy in RNG's bitstream
  *			(per mill).
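
For illustration, a minimal .read implementation that leans on the
alignment guarantee documented above; the MMIO base and register layout
are assumptions:

    #include <linux/hw_random.h>
    #include <linux/io.h>

    static void __iomem *my_rng_base; /* assumed MMIO mapping */

    static int my_rng_read(struct hwrng *rng, void *data, size_t max,
                           bool wait)
    {
            size_t read = 0;

            /*
             * Per the comment above, data is aligned for any type and
             * max is at least 4, so one whole word always fits.
             */
            while (read + sizeof(u32) <= max) {
                    *(u32 *)(data + read) = readl(my_rng_base);
                    read += sizeof(u32);
                    if (!wait)
                            break;
            }

            return read;
    }

    static struct hwrng my_rng = {
            .name = "my-rng",
            .read = my_rng_read,
    };
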