Support the SM3 CE instructions in synchronous mode: users can accelerate synchronous SM3 tasks with the CE instructions through the init2-related functions.
This patch also: 1. Adds the build parameters and related files needed to build the isa-ce library. 2. Checks whether the platform supports the CE instructions during the algorithm driver registration process. 3. Allows the HW driver and the INSTR driver of the same algorithm to be requested at the same time. 4. Supports SM3 CE block mode and stream mode for both sm3-normal and hmac-sm3.
Signed-off-by: Zhiqi Song songzhiqi1@huawei.com --- Makefile.am | 18 +- configure.ac | 3 + drv/isa_ce_sm3.c | 401 ++++++++++++++++++++ drv/isa_ce_sm3.h | 86 +++++ drv/isa_ce_sm3_armv8.S | 765 ++++++++++++++++++++++++++++++++++++++ include/drv/arm_arch_ce.h | 199 ++++++++++ include/wd_alg.h | 43 +++ wd_alg.c | 32 +- wd_digest.c | 2 +- wd_sched.c | 2 +- wd_util.c | 87 ++++- 11 files changed, 1618 insertions(+), 20 deletions(-) create mode 100644 drv/isa_ce_sm3.c create mode 100644 drv/isa_ce_sm3.h create mode 100644 drv/isa_ce_sm3_armv8.S create mode 100644 include/drv/arm_arch_ce.h
diff --git a/Makefile.am b/Makefile.am index 25853eb..b267e9e 100644 --- a/Makefile.am +++ b/Makefile.am @@ -43,7 +43,8 @@ nobase_pkginclude_HEADERS = v1/wd.h v1/wd_cipher.h v1/wd_aead.h v1/uacce.h v1/wd lib_LTLIBRARIES=libwd.la libwd_comp.la libwd_crypto.la
uadk_driversdir=$(libdir)/uadk -uadk_drivers_LTLIBRARIES=libhisi_sec.la libhisi_hpre.la libhisi_zip.la +uadk_drivers_LTLIBRARIES=libhisi_sec.la libhisi_hpre.la libhisi_zip.la \ + libisa_ce.la
libwd_la_SOURCES=wd.c wd_mempool.c wd.h wd_alg.c wd_alg.h \ v1/wd.c v1/wd.h v1/wd_adapter.c v1/wd_adapter.h \ @@ -79,7 +80,8 @@ libwd_crypto_la_SOURCES=wd_cipher.c wd_cipher.h wd_cipher_drv.h \ wd_digest.c wd_digest.h wd_digest_drv.h \ wd_util.c wd_util.h \ wd_sched.c wd_sched.h \ - wd.c wd.h + wd.c wd.h \ + arm_arch_ce.h isa_ce_sm3.h
libhisi_sec_la_SOURCES=drv/hisi_sec.c drv/hisi_qm_udrv.c \ lib/crypto/aes.c lib/crypto/galois.c \ @@ -87,6 +89,10 @@ libhisi_sec_la_SOURCES=drv/hisi_sec.c drv/hisi_qm_udrv.c \
libhisi_hpre_la_SOURCES=drv/hisi_hpre.c drv/hisi_qm_udrv.c \ hisi_qm_udrv.h + +libisa_ce_la_SOURCES=drv/isa_ce_sm3.c drv/isa_ce_sm3_armv8.S arm_arch_ce.h \ + drv/isa_ce_sm3.h + if WD_STATIC_DRV AM_CFLAGS += -DWD_STATIC_DRV -fPIC AM_CFLAGS += -DWD_NO_LOG @@ -106,6 +112,10 @@ libhisi_sec_la_DEPENDENCIES = libwd.la libwd_crypto.la
libhisi_hpre_la_LIBADD = $(libwd_la_OBJECTS) $(libwd_crypto_la_OBJECTS) libhisi_hpre_la_DEPENDENCIES = libwd.la libwd_crypto.la + +libisa_ce_la_LIBADD = $(libwd_la_OBJECTS) $(libwd_crypto_la_OBJECTS) +libisa_ce_la_DEPENDENCIES = libwd.la libwd_crypto.la + else UADK_WD_SYMBOL= -Wl,--version-script,$(top_srcdir)/libwd.map UADK_CRYPTO_SYMBOL= -Wl,--version-script,$(top_srcdir)/libwd_crypto.map @@ -134,6 +144,10 @@ libhisi_sec_la_DEPENDENCIES= libwd.la libwd_crypto.la libhisi_hpre_la_LIBADD= -lwd -lwd_crypto libhisi_hpre_la_LDFLAGS=$(UADK_VERSION) libhisi_hpre_la_DEPENDENCIES= libwd.la libwd_crypto.la + +libisa_ce_la_LIBADD= -lwd -lwd_crypto +libisa_ce_la_LDFLAGS=$(UADK_VERSION) +libisa_ce_la_DEPENDENCIES= libwd.la libwd_crypto.la endif # WD_STATIC_DRV
pkgconfigdir = $(libdir)/pkgconfig diff --git a/configure.ac b/configure.ac index b198417..4ed111e 100644 --- a/configure.ac +++ b/configure.ac @@ -21,6 +21,9 @@ LT_INIT AC_SUBST([hardcode_into_libs], [no]) AM_PROG_CC_C_O
+# Support assembler +AM_PROG_AS + AC_ARG_ENABLE([debug-log], AS_HELP_STRING([--enable-debug-log], [enable debug logging globally]), [ AS_IF([test "x$enable_debug_log" = "xyes"], diff --git a/drv/isa_ce_sm3.c b/drv/isa_ce_sm3.c new file mode 100644 index 0000000..c20ac55 --- /dev/null +++ b/drv/isa_ce_sm3.c @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright 2011-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ +/* + * Copyright 2023 Huawei Technologies Co.,Ltd. All rights reserved. + */ + +#include <stdlib.h> +#include <sys/auxv.h> +#include <pthread.h> +#include "drv/wd_digest_drv.h" +#include "drv/isa_ce_sm3.h" +#include "wd_digest.h" +#include "wd_util.h" + +typedef void (sm3_ce_block_fn)(__u32 word_reg[SM3_STATE_WORDS], + const unsigned char *src, size_t blocks); + +static int sm3_ce_drv_init(struct wd_alg_driver *drv, void *conf); +static void sm3_ce_drv_exit(struct wd_alg_driver *drv); +static int sm3_ce_drv_send(struct wd_alg_driver *drv, handle_t ctx, void *digest_msg); +static int sm3_ce_drv_recv(struct wd_alg_driver *drv, handle_t ctx, void *digest_msg); +static int sm3_ce_get_usage(void *param); + +static struct wd_alg_driver sm3_ce_alg_driver = { + .drv_name = "isa_ce_sm3", + .alg_name = "sm3", + .calc_type = UADK_ALG_CE_INSTR, + .priority = 200, + .queue_num = 1, + .op_type_num = 1, + .fallback = 0, + .init = sm3_ce_drv_init, + .exit = sm3_ce_drv_exit, + .send = sm3_ce_drv_send, + .recv = sm3_ce_drv_recv, + .get_usage = sm3_ce_get_usage, +}; + +static void __attribute__((constructor)) sm3_ce_probe(void) +{ + int ret; + + WD_INFO("Info: register SM3 CE alg driver!\n"); + ret = wd_alg_driver_register(&sm3_ce_alg_driver); + if (ret && ret != -WD_ENODEV) + 
WD_ERR("Error: register SM3 CE failed!\n"); +} + +static void __attribute__((destructor)) sm3_ce_remove(void) +{ + wd_alg_driver_unregister(&sm3_ce_alg_driver); +} + +static int sm3_ce_get_usage(void *param) +{ + return WD_SUCCESS; +} + +static inline void sm3_ce_init(struct sm3_ce_ctx *sctx) +{ + sctx->word_reg[0] = SM3_IVA; + sctx->word_reg[1] = SM3_IVB; + sctx->word_reg[2] = SM3_IVC; + sctx->word_reg[3] = SM3_IVD; + sctx->word_reg[4] = SM3_IVE; + sctx->word_reg[5] = SM3_IVF; + sctx->word_reg[6] = SM3_IVG; + sctx->word_reg[7] = SM3_IVH; +} + +static void trans_output_result(__u8 *out_digest, __u32 *word_reg) +{ + size_t i; + + for (i = 0; i < SM3_STATE_WORDS; i++) + PUTU32_TO_U8(out_digest + i * WORD_TO_CHAR_OFFSET, word_reg[i]); +} + +static void sm3_ce_init_ex(struct sm3_ce_ctx *sctx, __u8 *iv, __u16 iv_bytes) +{ + size_t i; + + if (iv_bytes != SM3_DIGEST_SIZE) { + WD_ERR("invalid iv size: %u\n", iv_bytes); + return; + } + + for (i = 0; i < SM3_STATE_WORDS; i++) + PUTU8_TO_U32(sctx->word_reg[i], iv + i * WORD_TO_CHAR_OFFSET); +} + +static void sm3_ce_update(struct sm3_ce_ctx *sctx, const __u8 *data, + size_t data_len, sm3_ce_block_fn *block_fn) +{ + size_t remain_data_len, blk_num; + + /* Get the data num that need compute currently */ + sctx->num &= (SM3_BLOCK_SIZE - 1); + + if (sctx->num) { + remain_data_len = SM3_BLOCK_SIZE - sctx->num; + /* If data_len does not enough a block size, then leave it to final */ + if (data_len < remain_data_len) { + memcpy(sctx->block + sctx->num, data, data_len); + sctx->num += data_len; + return; + } + + memcpy(sctx->block + sctx->num, data, remain_data_len); + block_fn(sctx->word_reg, sctx->block, 1); + sctx->nblocks++; + data += remain_data_len; + data_len -= remain_data_len; + } + + /* Group the filled msg by 512-bits (64-bytes) */ + blk_num = data_len / SM3_BLOCK_SIZE; + if (blk_num) { + block_fn(sctx->word_reg, data, blk_num); + sctx->nblocks += blk_num; + data += SM3_BLOCK_SIZE * blk_num; + data_len -= SM3_BLOCK_SIZE * 
blk_num; + } + + sctx->num = data_len; + if (data_len) + memcpy(sctx->block, data, data_len); +} + +static void sm3_ce_final(struct sm3_ce_ctx *sctx, __u8 *md, + sm3_ce_block_fn *block_fn) +{ + size_t i, offset1, offset2; + __u64 nh, nl; + + sctx->num &= (SM3_BLOCK_SIZE - 1); + sctx->block[sctx->num] = SM3_PADDING_BYTE; + + if (sctx->num <= SM3_BLOCK_SIZE - BIT_TO_BLOCK_OFFSET) { + memset(sctx->block + sctx->num + 1, 0, SM3_BLOCK_SIZE - sctx->num - 9); + } else { + memset(sctx->block + sctx->num + 1, 0, SM3_BLOCK_SIZE - sctx->num - 1); + block_fn(sctx->word_reg, sctx->block, 1); + memset(sctx->block, 0, SM3_BLOCK_SIZE - 8); + } + + /* + * Put the length of the message in bits into the last + * 64-bits (penultimate two words). + */ + offset2 = SM3_BLOCK_SIZE - WORD_TO_CHAR_OFFSET * 2; + offset1 = SM3_BLOCK_SIZE - WORD_TO_CHAR_OFFSET; + nh = sctx->nblocks >> NH_OFFSET; + nl = (sctx->nblocks << BIT_TO_BLOCK_OFFSET) + (sctx->num << BIT_TO_BYTE_OFFSET); + PUTU32_TO_U8(sctx->block + offset2 , nh); + PUTU32_TO_U8(sctx->block + offset1, nl); + + block_fn(sctx->word_reg, sctx->block, 1); + for (i = 0; i < SM3_STATE_WORDS; i++) + PUTU32_TO_U8(md + i * WORD_TO_CHAR_OFFSET, sctx->word_reg[i]); +} + +static int do_sm3_ce(struct wd_digest_msg *msg, __u8 *out_digest) +{ + enum hash_block_type block_type; + struct sm3_ce_ctx sctx = {0}; + size_t data_len, iv_len; + __u8 *data, *iv; + + block_type = get_hash_block_type(msg); + data_len = msg->in_bytes; + data = msg->in; + iv_len = SM3_DIGEST_SIZE; + /* Use last output as the iv in current cycle */ + iv = msg->out; + + switch(block_type) { + case HASH_SINGLE_BLOCK: + sm3_ce_init(&sctx); + sm3_ce_update(&sctx, data, data_len, sm3_ce_block_compress); + sm3_ce_final(&sctx, out_digest, sm3_ce_block_compress); + break; + case HASH_FRIST_BLOCK: + sm3_ce_init(&sctx); + sm3_ce_update(&sctx, data, data_len, sm3_ce_block_compress); + trans_output_result(out_digest, sctx.word_reg); + break; + case HASH_MIDDLE_BLOCK: + sm3_ce_init_ex(&sctx, iv, 
iv_len); + sm3_ce_update(&sctx, data, data_len, sm3_ce_block_compress); + /* Transform the middle result without final padding */ + trans_output_result(out_digest, sctx.word_reg); + break; + case HASH_END_BLOCK: + sm3_ce_init_ex(&sctx, iv, iv_len); + sm3_ce_update(&sctx, data, data_len, sm3_ce_block_compress); + /* Put the whole message length in last 64-bits */ + sctx.nblocks = msg->long_data_len / SM3_BLOCK_SIZE; + sm3_ce_final(&sctx, out_digest, sm3_ce_block_compress); + break; + default: + WD_ERR("Invalid block type!\n"); + return -WD_EINVAL; + } + + if (msg->out_bytes < SM3_DIGEST_SIZE) + memcpy(msg->out, out_digest, msg->out_bytes); + else + memcpy(msg->out, out_digest, SM3_DIGEST_SIZE); + + memset(&sctx, 0, sizeof(struct sm3_ce_ctx)); + + return WD_SUCCESS; +} + +static void sm3_hmac_key_padding(struct hmac_sm3_ctx *hctx, + const __u8 *key, size_t key_len) +{ + size_t i; + + if (key_len <= SM3_BLOCK_SIZE) { + memcpy(hctx->key, key, key_len); + memset(hctx->key + key_len, 0, SM3_BLOCK_SIZE - key_len); + } else { + sm3_ce_init(&hctx->sctx); + sm3_ce_update(&hctx->sctx, key, key_len, sm3_ce_block_compress); + sm3_ce_final(&hctx->sctx, hctx->key, sm3_ce_block_compress); + /* Pad key to SM3_BLOCK_SIZE after hash */ + memset(hctx->key + SM3_DIGEST_SIZE, 0, + SM3_BLOCK_SIZE - SM3_DIGEST_SIZE); + } + + for (i = 0; i < SM3_BLOCK_SIZE; i++) { + hctx->key[i] ^= IPAD_DATA; + } +} + +static void sm3_ce_hmac_init(struct hmac_sm3_ctx *hctx, const __u8 *key, size_t key_len) +{ + sm3_hmac_key_padding(hctx, key, key_len); + + /* Ipadded key is the first block to hash in first cycle */ + sm3_ce_init(&hctx->sctx); + sm3_ce_update(&hctx->sctx, hctx->key, SM3_BLOCK_SIZE, sm3_ce_block_compress); +} + +static void sm3_ce_hmac_update(struct hmac_sm3_ctx *hctx, const __u8 *data, size_t data_len) +{ + sm3_ce_update(&hctx->sctx, data, data_len, sm3_ce_block_compress); +} + +static void sm3_ce_hmac_final(struct hmac_sm3_ctx *hctx, __u8 *out_hmac) +{ + __u8 digest[SM3_DIGEST_SIZE] = {0}; 
+ size_t i; + + for (i = 0; i < SM3_BLOCK_SIZE; i++) { + hctx->key[i] ^= (IPAD_DATA ^ OPAD_DATA); + } + + /* Compute the last data from update process */ + sm3_ce_final(&hctx->sctx, digest, sm3_ce_block_compress); + + /* Opadded key is the first block to hash in second cycle */ + memset(&hctx->sctx, 0, sizeof(struct sm3_ce_ctx)); + sm3_ce_init(&hctx->sctx); + sm3_ce_update(&hctx->sctx, hctx->key, SM3_BLOCK_SIZE, sm3_ce_block_compress); + + /* Compute the the first cycle result */ + sm3_ce_update(&hctx->sctx, digest, SM3_DIGEST_SIZE, sm3_ce_block_compress); + sm3_ce_final(&hctx->sctx, out_hmac, sm3_ce_block_compress); +} + +static int do_hmac_sm3_ce(struct wd_digest_msg *msg, __u8 *out_hmac) +{ + size_t data_len, key_len, iv_len; + enum hash_block_type block_type; + struct hmac_sm3_ctx hctx = {0}; + __u8 *data, *key, *iv; + + data_len = msg->in_bytes; + data = msg->in; + key = msg->key; + key_len = msg->key_bytes; + iv_len = SM3_DIGEST_SIZE; + /* Use last output as the iv in current cycle */ + iv = msg->out; + + if (!key_len) { + WD_ERR("invalid hmac key_len is 0!\n"); + return -WD_EINVAL; + } + + block_type = get_hash_block_type(msg); + switch(block_type) { + case HASH_SINGLE_BLOCK: + sm3_ce_hmac_init(&hctx, key, key_len); + sm3_ce_hmac_update(&hctx, data, data_len); + sm3_ce_hmac_final(&hctx, out_hmac); + break; + case HASH_FRIST_BLOCK: + sm3_ce_hmac_init(&hctx, key, key_len); + sm3_ce_hmac_update(&hctx, data, data_len); + trans_output_result(out_hmac, hctx.sctx.word_reg); + break; + case HASH_MIDDLE_BLOCK: + sm3_ce_init_ex(&(hctx.sctx), iv, iv_len); + sm3_ce_hmac_update(&hctx, data, data_len); + trans_output_result(out_hmac, hctx.sctx.word_reg); + break; + case HASH_END_BLOCK: + sm3_hmac_key_padding(&hctx, key, key_len); + sm3_ce_init_ex(&(hctx.sctx), iv, iv_len); + sm3_ce_hmac_update(&hctx, data, data_len); + hctx.sctx.nblocks = msg->long_data_len / SM3_BLOCK_SIZE + KEY_BLOCK_NUM; + sm3_ce_hmac_final(&hctx, out_hmac); + break; + default: + WD_ERR("Invalid block 
type!\n"); + return -WD_EINVAL; + } + + if (msg->out_bytes < SM3_DIGEST_SIZE) + memcpy(msg->out, out_hmac, msg->out_bytes); + else + memcpy(msg->out, out_hmac, SM3_DIGEST_SIZE); + + memset(&hctx, 0, sizeof(struct hmac_sm3_ctx)); + + return WD_SUCCESS; +} + +static int sm3_ce_drv_send(struct wd_alg_driver *drv, handle_t ctx, void *digest_msg) +{ + struct wd_digest_msg *msg = (struct wd_digest_msg *)digest_msg; + __u8 digest[SM3_DIGEST_SIZE] = {0}; + int ret; + + if (!msg) { + WD_ERR("invalid: digest_msg is NULL!\n"); + return -WD_EINVAL; + } + + if (msg->data_fmt == WD_SGL_BUF) { + WD_ERR("invalid: SM3 CE driver do not support sgl data format!\n"); + return -WD_EINVAL; + } + + if (msg->mode == WD_DIGEST_NORMAL) { + ret = do_sm3_ce(msg, digest); + } else if (msg->mode == WD_DIGEST_HMAC) { + ret = do_hmac_sm3_ce(msg, digest); + } else { + WD_ERR("invalid digest mode!\n"); + ret = -WD_EINVAL; + } + + return ret; +} + +static int sm3_ce_drv_recv(struct wd_alg_driver *drv, handle_t ctx, void *digest_msg) +{ + return WD_SUCCESS; +} + +static int sm3_ce_drv_init(struct wd_alg_driver *drv, void *conf) +{ + struct wd_ctx_config_internal *config = (struct wd_ctx_config_internal *)conf; + struct sm3_ce_drv_ctx *sctx = (struct sm3_ce_drv_ctx *)drv->priv; + + config->epoll_en = false; + + /* return if already inited */ + if (sctx) + return WD_SUCCESS; + sctx = malloc(sizeof(struct sm3_ce_drv_ctx)); + if (!sctx) + return -WD_EINVAL; + + memcpy(&sctx->config, config, sizeof(struct wd_ctx_config_internal)); + + return WD_SUCCESS; +} + +static void sm3_ce_drv_exit(struct wd_alg_driver *drv) +{ + struct sm3_ce_drv_ctx *sctx = (struct sm3_ce_drv_ctx *)drv->priv; + + if (!sctx) + return; + + free(sctx); + drv->priv = NULL; +} diff --git a/drv/isa_ce_sm3.h b/drv/isa_ce_sm3.h new file mode 100644 index 0000000..13edb0a --- /dev/null +++ b/drv/isa_ce_sm3.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: Apache-2.0 */ +/* Copyright 2020-2021 Huawei Technologies Co.,Ltd. All rights reserved. 
 */
+#ifndef __ISA_CE_SM3_H
+#define __ISA_CE_SM3_H
+
+#include "wd_alg_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SM3_DIGEST_SIZE 32 /* digest length in bytes */
+#define SM3_BLOCK_SIZE 64 /* message block length in bytes (512 bits) */
+#define SM3_STATE_WORDS 8 /* number of 32-bit words in the chaining state */
+#define HMAC_BLOCK_SIZE 64
+#define WORD_TO_CHAR_OFFSET 4 /* bytes per 32-bit state word */
+#define SM3_PADDING_BYTE 0x80 /* first padding byte written after the message tail */
+#define NH_OFFSET 23 /* nblocks >> 23 gives the high 32 bits of the bit length (32 - 9) */
+#define BIT_TO_BLOCK_OFFSET 9 /* nblocks << 9 gives bits; sm3_ce_final() also uses it as a byte count in its padding check */
+#define BIT_TO_BYTE_OFFSET 3 /* bytes << 3 gives bits */
+#define IPAD_DATA 0x36 /* byte XORed into the key for the inner HMAC hash */
+#define OPAD_DATA 0x5c /* byte XORed into the key for the outer HMAC hash */
+#define KEY_BLOCK_NUM 1 /* the padded-key block hashed ahead of the message in HMAC */
+
+#define SM3_IVA 0x7380166f
+#define SM3_IVB 0x4914b2b9
+#define SM3_IVC 0x172442d7
+#define SM3_IVD 0xda8a0600
+#define SM3_IVE 0xa96f30bc
+#define SM3_IVF 0x163138aa
+#define SM3_IVG 0xe38dee4d
+#define SM3_IVH 0xb0fb0e4e
+
+#define PUTU32_TO_U8(dst, src) \
+	((dst)[0] = (__u8)((src) >> 24), \
+	(dst)[1] = (__u8)((src) >> 16), \
+	(dst)[2] = (__u8)((src) >> 8), \
+	(dst)[3] = (__u8)(src))
+
+#define PUTU8_TO_U32(dst, src) \
+	((dst) = (((__u32)(src)[0]) << 24) + \
+	(((__u32)(src)[1]) << 16) + \
+	(((__u32)(src)[2]) << 8) + \
+	((__u32)(src)[3]))
+
+struct sm3_ce_ctx {
+	/*
+	 * The eight 32-bit chaining words: they start as SM3_IVA..SM3_IVH (or
+	 * as a caller-supplied intermediate state) and end as the digest words.
+	 */
+	__u32 word_reg[SM3_STATE_WORDS];
+	/*
+	 * Number of 64-byte blocks already compressed, NOT a bit count.
+	 * sm3_ce_final() turns it into the 64-bit message bit length that
+	 * ends the padding: high word 'Nh' = nblocks >> NH_OFFSET, low word
+	 * 'Nl' = (nblocks << BIT_TO_BLOCK_OFFSET) + (num << BIT_TO_BYTE_OFFSET).
+	 */
+	__u64 nblocks;
+	/*
+	 * Staging buffer for one 512-bit message block
+	 * B(i) = W0||W1||W2||...||W15, kept as bytes instead of sixteen
+	 * __u32 words; it holds the not-yet-compressed tail of the input.
+	 */
+	__u8 block[SM3_BLOCK_SIZE];
+	/* Number of bytes currently buffered in 'block', always below SM3_BLOCK_SIZE.
*/ + size_t num; +}; + +struct hmac_sm3_ctx { + struct sm3_ce_ctx sctx; + /* Save user key */ + __u8 key[SM3_BLOCK_SIZE]; +}; + +struct sm3_ce_drv_ctx { + struct wd_ctx_config_internal config; +}; + +void sm3_ce_block_compress(__u32 word_reg[SM3_STATE_WORDS], + const __u8 *src, size_t blocks); + +#ifdef __cplusplus +} +#endif + +#endif /* __ISA_CE_SM3_H */ diff --git a/drv/isa_ce_sm3_armv8.S b/drv/isa_ce_sm3_armv8.S new file mode 100644 index 0000000..3d08e2d --- /dev/null +++ b/drv/isa_ce_sm3_armv8.S @@ -0,0 +1,765 @@ +/* SPDX-License-Identifier: Apache-2.0 */ +/* + * Copyright 2011-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include "../include/drv/arm_arch_ce.h" + +.arch armv8.2-a +.text +.globl sm3_ce_block_compress +.type sm3_ce_block_compress,%function +.align 5 +sm3_ce_block_compress: + AARCH64_VALID_CALL_TARGET +/* Loads state */ + /* + * Loads multiple single-element structures from memory(X0 register) and + * writes result to two SIMD&FP registers(v5.4s and v6.4s). + */ + ld1 {v5.4s,v6.4s}, [x0] /* 4s -- 4 * 32bit */ + /* + * Reverses the order of 32-bit(type:s) elements in each doubleword of the + * vector in the src SIMD&FP register(v5), places the result into a vector + * and writes the vector to the dst SIDM&FP register(v5). + */ + rev64 v5.4s, v5.4s + rev64 v6.4s, v6.4s + /* + * Extracts the lowest vector elements from the second src SIMD&FP register, + * and highest vector elements from the first source SIMD&FP register, + * concatenates the result into a vector, and writes the vector to the + * dst SIMD&FP register vector. #8 means the numbered byte element to be extracted. 
+ * Format: ext <dst register>, <first src register>, <second src register>, <index> + * #imm: immediate data. + */ + ext v5.16b, v5.16b, v5.16b, #8 /* 16b -- 16 * 8bit */ + ext v6.16b, v6.16b, v6.16b, #8 + /* From PC-relative address adds an immediate value to form a PC-relative + * address, and writes the result to the dst register. + */ + adr x8, .Tj /* 'Tj' is the constant defined in SM3 protocol */ + /* Loads pair of register calculates an address from a base register value + * and an immediate offset, loads two 32-bit words from memory, and writes + * them to two registers. */ + ldp s16, s17, [x8] /* 'sn' is the scalar register, 'vn' is the vector register */ + +.Loop: +/* Loads input */ + /* + * Loads multipule single-element structrue to four registers. + * #64 is the immediate offset variant, it is the post-index immediate offset. + * Loads the input src data, msg to be hashed. + */ + ld1 {v0.16b,v1.16b,v2.16b,v3.16b}, [x1], #64 + /* + * Substracts an optionally-shifted immediate value from a register value, + * and writes the result to the dst register. + */ + sub w2, w2, #1 + + /* Copies the value in a src register to the dst register. 
*/ + mov v18.16b, v5.16b + mov v19.16b, v6.16b + +#ifndef __ARMEB__ + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + rev32 v2.16b, v2.16b + rev32 v3.16b, v3.16b +#endif + + ext v20.16b, v16.16b, v16.16b, #4 + /* s4 = w7 | w8 | w9 | w10 */ + ext v4.16b, v1.16b, v2.16b, #12 + /* vtmp1 = w3 | w4 | w5 | w6 */ + ext v22.16b, v0.16b, v1.16b, #12 + /* vtmp2 = w10 | w11 | w12 | w13 */ + ext v23.16b, v2.16b, v3.16b, #8 + /* sm3partw1 v4.4s, v0.4s, v3.4s */ +.inst 0xce63c004 + /* sm3partw2 v4.4s, v23.4s, v22.4s */ +.inst 0xce76c6e4 + eor v22.16b, v0.16b, v1.16b + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1a v5.4s, v23.4s, v22.4s[0] */ +.inst 0xce5682e5 + /* sm3tt2a v6.4s, v23.4s, v0.4s[0] */ +.inst 0xce408ae6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1a v5.4s, v23.4s, v22.4s[1] */ +.inst 0xce5692e5 + /* sm3tt2a v6.4s, v23.4s, v0.4s[1] */ +.inst 0xce409ae6 + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1a v5.4s, v23.4s, v22.4s[2] */ +.inst 0xce56a2e5 + /* sm3tt2a v6.4s, v23.4s, v0.4s[2] */ +.inst 0xce40aae6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1a v5.4s, v23.4s, v22.4s[3] */ +.inst 0xce56b2e5 + /* sm3tt2a v6.4s, v23.4s, v0.4s[3] */ +.inst 0xce40bae6 + /* s4 = w7 | w8 | w9 | w10 */ + ext v0.16b, v2.16b, v3.16b, #12 + /* vtmp1 = w3 | w4 | w5 | w6 */ + ext v22.16b, v1.16b, v2.16b, #12 + /* vtmp2 = w10 | w11 | w12 | w13 */ + ext v23.16b, v3.16b, v4.16b, #8 + /* sm3partw1 v0.4s, v1.4s, v4.4s */ +.inst 0xce64c020 + /* sm3partw2 v0.4s, v23.4s, v22.4s */ +.inst 0xce76c6e0 + eor v22.16b, v1.16b, v2.16b + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1a v5.4s, v23.4s, v22.4s[0] */ +.inst 0xce5682e5 
+ /* sm3tt2a v6.4s, v23.4s, v1.4s[0] */ +.inst 0xce418ae6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1a v5.4s, v23.4s, v22.4s[1] */ +.inst 0xce5692e5 + /* sm3tt2a v6.4s, v23.4s, v1.4s[1] */ +.inst 0xce419ae6 + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1a v5.4s, v23.4s, v22.4s[2] */ +.inst 0xce56a2e5 + /* sm3tt2a v6.4s, v23.4s, v1.4s[2] */ +.inst 0xce41aae6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1a v5.4s, v23.4s, v22.4s[3] */ +.inst 0xce56b2e5 + /* sm3tt2a v6.4s, v23.4s, v1.4s[3] */ +.inst 0xce41bae6 + /* s4 = w7 | w8 | w9 | w10 */ + ext v1.16b, v3.16b, v4.16b, #12 + /* vtmp1 = w3 | w4 | w5 | w6 */ + ext v22.16b, v2.16b, v3.16b, #12 + /* vtmp2 = w10 | w11 | w12 | w13 */ + ext v23.16b, v4.16b, v0.16b, #8 + /* sm3partw1 v1.4s, v2.4s, v0.4s */ +.inst 0xce60c041 + /* sm3partw2 v1.4s, v23.4s, v22.4s */ +.inst 0xce76c6e1 + eor v22.16b, v2.16b, v3.16b + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1a v5.4s, v23.4s, v22.4s[0] */ +.inst 0xce5682e5 + /* sm3tt2a v6.4s, v23.4s, v2.4s[0] */ +.inst 0xce428ae6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1a v5.4s, v23.4s, v22.4s[1] */ +.inst 0xce5692e5 + /* sm3tt2a v6.4s, v23.4s, v2.4s[1] */ +.inst 0xce429ae6 + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1a v5.4s, v23.4s, v22.4s[2] */ +.inst 0xce56a2e5 + /* sm3tt2a v6.4s, v23.4s, v2.4s[2] */ +.inst 0xce42aae6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1a v5.4s, v23.4s, v22.4s[3] */ +.inst 0xce56b2e5 + /* sm3tt2a v6.4s, v23.4s, v2.4s[3] 
*/ +.inst 0xce42bae6 + /* s4 = w7 | w8 | w9 | w10 */ + ext v2.16b, v4.16b, v0.16b, #12 + /* vtmp1 = w3 | w4 | w5 | w6 */ + ext v22.16b, v3.16b, v4.16b, #12 + /* vtmp2 = w10 | w11 | w12 | w13 */ + ext v23.16b, v0.16b, v1.16b, #8 + /* sm3partw1 v2.4s, v3.4s, v1.4s */ +.inst 0xce61c062 + /* sm3partw2 v2.4s, v23.4s, v22.4s */ +.inst 0xce76c6e2 + eor v22.16b, v3.16b, v4.16b + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1a v5.4s, v23.4s, v22.4s[0] */ +.inst 0xce5682e5 + /* sm3tt2a v6.4s, v23.4s, v3.4s[0] */ +.inst 0xce438ae6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1a v5.4s, v23.4s, v22.4s[1] */ +.inst 0xce5692e5 + /* sm3tt2a v6.4s, v23.4s, v3.4s[1] */ +.inst 0xce439ae6 + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1a v5.4s, v23.4s, v22.4s[2] */ +.inst 0xce56a2e5 + /* sm3tt2a v6.4s, v23.4s, v3.4s[2] */ +.inst 0xce43aae6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1a v5.4s, v23.4s, v22.4s[3] */ +.inst 0xce56b2e5 + /* sm3tt2a v6.4s, v23.4s, v3.4s[3] */ +.inst 0xce43bae6 + ext v20.16b, v17.16b, v17.16b, #4 + /* s4 = w7 | w8 | w9 | w10 */ + ext v3.16b, v0.16b, v1.16b, #12 + /* vtmp1 = w3 | w4 | w5 | w6 */ + ext v22.16b, v4.16b, v0.16b, #12 + /* vtmp2 = w10 | w11 | w12 | w13 */ + ext v23.16b, v1.16b, v2.16b, #8 + /* sm3partw1 v3.4s, v4.4s, v2.4s */ +.inst 0xce62c083 + /* sm3partw2 v3.4s, v23.4s, v22.4s */ +.inst 0xce76c6e3 + eor v22.16b, v4.16b, v0.16b + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ +.inst 0xce5686e5 + /* sm3tt2b v6.4s, v23.4s, v4.4s[0] */ +.inst 0xce448ee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + 
sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ +.inst 0xce5696e5 + /* sm3tt2b v6.4s, v23.4s, v4.4s[1] */ +.inst 0xce449ee6 + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ +.inst 0xce56a6e5 + /* sm3tt2b v6.4s, v23.4s, v4.4s[2] */ +.inst 0xce44aee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ +.inst 0xce56b6e5 + /* sm3tt2b v6.4s, v23.4s, v4.4s[3] */ +.inst 0xce44bee6 + /* s4 = w7 | w8 | w9 | w10 */ + ext v4.16b, v1.16b, v2.16b, #12 + /* vtmp1 = w3 | w4 | w5 | w6 */ + ext v22.16b, v0.16b, v1.16b, #12 + /* vtmp2 = w10 | w11 | w12 | w13 */ + ext v23.16b, v2.16b, v3.16b, #8 + /* sm3partw1 v4.4s, v0.4s, v3.4s */ +.inst 0xce63c004 + /* sm3partw2 v4.4s, v23.4s, v22.4s */ +.inst 0xce76c6e4 + eor v22.16b, v0.16b, v1.16b + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ +.inst 0xce5686e5 + /* sm3tt2b v6.4s, v23.4s, v0.4s[0] */ +.inst 0xce408ee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ +.inst 0xce5696e5 + /* sm3tt2b v6.4s, v23.4s, v0.4s[1] */ +.inst 0xce409ee6 + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ +.inst 0xce56a6e5 + /* sm3tt2b v6.4s, v23.4s, v0.4s[2] */ +.inst 0xce40aee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ +.inst 0xce56b6e5 + /* sm3tt2b v6.4s, v23.4s, v0.4s[3] */ +.inst 0xce40bee6 + /* s4 = w7 | w8 | w9 | w10 */ + ext v0.16b, v2.16b, v3.16b, #12 + /* vtmp1 = w3 | w4 | w5 | w6 */ + ext v22.16b, v1.16b, 
v2.16b, #12 + /* vtmp2 = w10 | w11 | w12 | w13 */ + ext v23.16b, v3.16b, v4.16b, #8 + /* sm3partw1 v0.4s, v1.4s, v4.4s */ +.inst 0xce64c020 + /* sm3partw2 v0.4s, v23.4s, v22.4s */ +.inst 0xce76c6e0 + eor v22.16b, v1.16b, v2.16b + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ +.inst 0xce5686e5 + /* sm3tt2b v6.4s, v23.4s, v1.4s[0] */ +.inst 0xce418ee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ +.inst 0xce5696e5 + /* sm3tt2b v6.4s, v23.4s, v1.4s[1] */ +.inst 0xce419ee6 + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ +.inst 0xce56a6e5 + /* sm3tt2b v6.4s, v23.4s, v1.4s[2] */ +.inst 0xce41aee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ +.inst 0xce56b6e5 + /* sm3tt2b v6.4s, v23.4s, v1.4s[3] */ +.inst 0xce41bee6 + /* s4 = w7 | w8 | w9 | w10 */ + ext v1.16b, v3.16b, v4.16b, #12 + /* vtmp1 = w3 | w4 | w5 | w6 */ + ext v22.16b, v2.16b, v3.16b, #12 + /* vtmp2 = w10 | w11 | w12 | w13 */ + ext v23.16b, v4.16b, v0.16b, #8 + /* sm3partw1 v1.4s, v2.4s, v0.4s */ +.inst 0xce60c041 + /* sm3partw2 v1.4s, v23.4s, v22.4s */ +.inst 0xce76c6e1 + eor v22.16b, v2.16b, v3.16b + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ +.inst 0xce5686e5 + /* sm3tt2b v6.4s, v23.4s, v2.4s[0] */ +.inst 0xce428ee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ +.inst 0xce5696e5 + /* sm3tt2b v6.4s, v23.4s, v2.4s[1] */ +.inst 0xce429ee6 + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s 
*/ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ +.inst 0xce56a6e5 + /* sm3tt2b v6.4s, v23.4s, v2.4s[2] */ +.inst 0xce42aee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ +.inst 0xce56b6e5 + /* sm3tt2b v6.4s, v23.4s, v2.4s[3] */ +.inst 0xce42bee6 + /* s4 = w7 | w8 | w9 | w10 */ + ext v2.16b, v4.16b, v0.16b, #12 + /* vtmp1 = w3 | w4 | w5 | w6 */ + ext v22.16b, v3.16b, v4.16b, #12 + /* vtmp2 = w10 | w11 | w12 | w13 */ + ext v23.16b, v0.16b, v1.16b, #8 + /* sm3partw1 v2.4s, v3.4s, v1.4s */ +.inst 0xce61c062 + /* sm3partw2 v2.4s, v23.4s, v22.4s */ +.inst 0xce76c6e2 + eor v22.16b, v3.16b, v4.16b + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ +.inst 0xce5686e5 + /* sm3tt2b v6.4s, v23.4s, v3.4s[0] */ +.inst 0xce438ee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ +.inst 0xce5696e5 + /* sm3tt2b v6.4s, v23.4s, v3.4s[1] */ +.inst 0xce439ee6 + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ +.inst 0xce56a6e5 + /* sm3tt2b v6.4s, v23.4s, v3.4s[2] */ +.inst 0xce43aee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ +.inst 0xce56b6e5 + /* sm3tt2b v6.4s, v23.4s, v3.4s[3] */ +.inst 0xce43bee6 + /* s4 = w7 | w8 | w9 | w10 */ + ext v3.16b, v0.16b, v1.16b, #12 + /* vtmp1 = w3 | w4 | w5 | w6 */ + ext v22.16b, v4.16b, v0.16b, #12 + /* vtmp2 = w10 | w11 | w12 | w13 */ + ext v23.16b, v1.16b, v2.16b, #8 + /* sm3partw1 v3.4s, v4.4s, v2.4s */ +.inst 0xce62c083 + /* sm3partw2 v3.4s, v23.4s, v22.4s */ +.inst 
0xce76c6e3 + eor v22.16b, v4.16b, v0.16b + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ +.inst 0xce5686e5 + /* sm3tt2b v6.4s, v23.4s, v4.4s[0] */ +.inst 0xce448ee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ +.inst 0xce5696e5 + /* sm3tt2b v6.4s, v23.4s, v4.4s[1] */ +.inst 0xce449ee6 + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ +.inst 0xce56a6e5 + /* sm3tt2b v6.4s, v23.4s, v4.4s[2] */ +.inst 0xce44aee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ +.inst 0xce56b6e5 + /* sm3tt2b v6.4s, v23.4s, v4.4s[3] */ +.inst 0xce44bee6 + /* s4 = w7 | w8 | w9 | w10 */ + ext v4.16b, v1.16b, v2.16b, #12 + /* vtmp1 = w3 | w4 | w5 | w6 */ + ext v22.16b, v0.16b, v1.16b, #12 + /* vtmp2 = w10 | w11 | w12 | w13 */ + ext v23.16b, v2.16b, v3.16b, #8 + /* sm3partw1 v4.4s, v0.4s, v3.4s */ +.inst 0xce63c004 + /* sm3partw2 v4.4s, v23.4s, v22.4s */ +.inst 0xce76c6e4 + eor v22.16b, v0.16b, v1.16b + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ +.inst 0xce5686e5 + /* sm3tt2b v6.4s, v23.4s, v0.4s[0] */ +.inst 0xce408ee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ +.inst 0xce5696e5 + /* sm3tt2b v6.4s, v23.4s, v0.4s[1] */ +.inst 0xce409ee6 + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ +.inst 0xce56a6e5 + /* sm3tt2b v6.4s, v23.4s, v0.4s[2] */ +.inst 
0xce40aee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ +.inst 0xce56b6e5 + /* sm3tt2b v6.4s, v23.4s, v0.4s[3] */ +.inst 0xce40bee6 + /* s4 = w7 | w8 | w9 | w10 */ + ext v0.16b, v2.16b, v3.16b, #12 + /* vtmp1 = w3 | w4 | w5 | w6 */ + ext v22.16b, v1.16b, v2.16b, #12 + /* vtmp2 = w10 | w11 | w12 | w13 */ + ext v23.16b, v3.16b, v4.16b, #8 + /* sm3partw1 v0.4s, v1.4s, v4.4s */ +.inst 0xce64c020 + /* sm3partw2 v0.4s, v23.4s, v22.4s */ +.inst 0xce76c6e0 + eor v22.16b, v1.16b, v2.16b + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ +.inst 0xce5686e5 + /* sm3tt2b v6.4s, v23.4s, v1.4s[0] */ +.inst 0xce418ee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ +.inst 0xce5696e5 + /* sm3tt2b v6.4s, v23.4s, v1.4s[1] */ +.inst 0xce419ee6 + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ +.inst 0xce56a6e5 + /* sm3tt2b v6.4s, v23.4s, v1.4s[2] */ +.inst 0xce41aee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ +.inst 0xce56b6e5 + /* sm3tt2b v6.4s, v23.4s, v1.4s[3] */ +.inst 0xce41bee6 + /* s4 = w7 | w8 | w9 | w10 */ + ext v1.16b, v3.16b, v4.16b, #12 + /* vtmp1 = w3 | w4 | w5 | w6 */ + ext v22.16b, v2.16b, v3.16b, #12 + /* vtmp2 = w10 | w11 | w12 | w13 */ + ext v23.16b, v4.16b, v0.16b, #8 + /* sm3partw1 v1.4s, v2.4s, v0.4s */ +.inst 0xce60c041 + /* sm3partw2 v1.4s, v23.4s, v22.4s */ +.inst 0xce76c6e1 + eor v22.16b, v2.16b, v3.16b + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, 
v23.4s, v22.4s[0] */ +.inst 0xce5686e5 + /* sm3tt2b v6.4s, v23.4s, v2.4s[0] */ +.inst 0xce428ee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ +.inst 0xce5696e5 + /* sm3tt2b v6.4s, v23.4s, v2.4s[1] */ +.inst 0xce429ee6 + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ +.inst 0xce56a6e5 + /* sm3tt2b v6.4s, v23.4s, v2.4s[2] */ +.inst 0xce42aee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ +.inst 0xce56b6e5 + /* sm3tt2b v6.4s, v23.4s, v2.4s[3] */ +.inst 0xce42bee6 + eor v22.16b, v3.16b, v4.16b + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ +.inst 0xce5686e5 + /* sm3tt2b v6.4s, v23.4s, v3.4s[0] */ +.inst 0xce438ee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ +.inst 0xce5696e5 + /* sm3tt2b v6.4s, v23.4s, v3.4s[1] */ +.inst 0xce439ee6 + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ +.inst 0xce56a6e5 + /* sm3tt2b v6.4s, v23.4s, v3.4s[2] */ +.inst 0xce43aee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ +.inst 0xce56b6e5 + /* sm3tt2b v6.4s, v23.4s, v3.4s[3] */ +.inst 0xce43bee6 + eor v22.16b, v4.16b, v0.16b + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ +.inst 0xce5686e5 + /* sm3tt2b v6.4s, v23.4s, v4.4s[0] */ +.inst 0xce448ee6 + 
/* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ +.inst 0xce5696e5 + /* sm3tt2b v6.4s, v23.4s, v4.4s[1] */ +.inst 0xce449ee6 + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ +.inst 0xce56a6e5 + /* sm3tt2b v6.4s, v23.4s, v4.4s[2] */ +.inst 0xce44aee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ +.inst 0xce56b6e5 + /* sm3tt2b v6.4s, v23.4s, v4.4s[3] */ +.inst 0xce44bee6 + eor v22.16b, v0.16b, v1.16b + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[0] */ +.inst 0xce5686e5 + /* sm3tt2b v6.4s, v23.4s, v0.4s[0] */ +.inst 0xce408ee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[1] */ +.inst 0xce5696e5 + /* sm3tt2b v6.4s, v23.4s, v0.4s[1] */ +.inst 0xce409ee6 + /* sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s */ +.inst 0xce5418b7 + shl v21.4s, v20.4s, #1 + sri v21.4s, v20.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[2] */ +.inst 0xce56a6e5 + /* sm3tt2b v6.4s, v23.4s, v0.4s[2] */ +.inst 0xce40aee6 + /* sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s */ +.inst 0xce5518b7 + shl v20.4s, v21.4s, #1 + sri v20.4s, v21.4s, #31 + /* sm3tt1b v5.4s, v23.4s, v22.4s[3] */ +.inst 0xce56b6e5 + /* sm3tt2b v6.4s, v23.4s, v0.4s[3] */ +.inst 0xce40bee6 + eor v5.16b, v5.16b, v18.16b + eor v6.16b, v6.16b, v19.16b + /* + * cbnz: compare and branch on Nonzero, compares the value in a register + * with zero, and conditionally branches to a label at a PC-relative offset + * if the comparison is not equal. + * 'w2' is the 32-bit name of the general-purpose register to be tested. 
+ * '.Loop' is the program label to be conditionally branched to. + */ + cbnz w2, .Loop + + /* save state, it is the result of one cycle */ + rev64 v5.4s, v5.4s + rev64 v6.4s, v6.4s + ext v5.16b, v5.16b, v5.16b, #8 + ext v6.16b, v6.16b, v6.16b, #8 + st1 {v5.4s,v6.4s}, [x0] + ret +.size sm3_ce_block_compress,.-sm3_ce_block_compress + +.align 3 +.Tj: +/* + * Inserts a list of 32-bit values as data into the assembly. + * In SM3 protocol: + * when 0 <= j <= 15, Tj = 0x79cc4519, + * when 16 <= j <= 63, Tj = 0x9d8a7a87. + */ +.word 0x79cc4519, 0x9d8a7a87 diff --git a/include/drv/arm_arch_ce.h b/include/drv/arm_arch_ce.h new file mode 100644 index 0000000..cad6e33 --- /dev/null +++ b/include/drv/arm_arch_ce.h @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: Apache-2.0 */ +/* + * Copyright 2011-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifndef __ARM_ARCH_CE_H +#define __ARM_ARCH_CE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#if !defined(__ARM_ARCH__) +# if defined(__CC_ARM) +# define __ARM_ARCH__ __TARGET_ARCH_ARM +# if defined(__BIG_ENDIAN) +# define __ARMEB__ +# else +# define __ARMEL__ +# endif +# elif defined(__GNUC__) +# if defined(__aarch64__) +# define __ARM_ARCH__ 8 + /* + * GCC does not define __ARM_ARCH__, instead it defines + * bunch of below macros. See all_architectures[] table in + * gcc/config/arm/arm.c. 
+ */ +# elif defined(__ARM_ARCH) +# define __ARM_ARCH__ __ARM_ARCH +# elif defined(__ARM_ARCH_8A__) +# define __ARM_ARCH__ 8 +# elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ + defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || \ + defined(__ARM_ARCH_7EM__) +# define __ARM_ARCH__ 7 +# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6M__) || \ + defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \ + defined(__ARM_ARCH_6T2__) +# define __ARM_ARCH__ 6 +# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \ + defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ + defined(__ARM_ARCH_5TEJ__) +# define __ARM_ARCH__ 5 +# elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) +# define __ARM_ARCH__ 4 +# else +# error "unsupported ARM architecture" +# endif +# endif +#endif + +#if !defined(__ARM_MAX_ARCH__) +# define __ARM_MAX_ARCH__ __ARM_ARCH__ +#endif + +#if __ARM_MAX_ARCH__ < __ARM_ARCH__ +# error "__ARM_MAX_ARCH__ can't be less than __ARM_ARCH__" +#elif __ARM_MAX_ARCH__ != __ARM_ARCH__ +# if __ARM_ARCH__ < 7 && __ARM_MAX_ARCH__ >= 7 && defined(__ARMEB__) +# error "can't build universal big-endian binary" +# endif +#endif + +#ifndef __ASSEMBLER__ +extern unsigned int ARMCAP_P; +extern unsigned int ARM_MIDR; +#endif + +#define ARMV7_NEON (1<<0) +#define ARMV7_TICK (1<<1) +#define ARMV8_AES (1<<2) +#define ARMV8_SHA1 (1<<3) +#define ARMV8_SHA256 (1<<4) +#define ARMV8_PMULL (1<<5) +#define ARMV8_SHA512 (1<<6) +#define ARMV8_CPUID (1<<7) +#define ARMV8_RNG (1<<8) +#define ARMV8_SM3 (1<<9) +#define ARMV8_SM4 (1<<10) +#define ARMV8_SHA3 (1<<11) +#define ARMV8_UNROLL8_EOR3 (1<<12) +#define ARMV8_SVE (1<<13) +#define ARMV8_SVE2 (1<<14) + +/* + * MIDR_EL1 system register + * + * 63___ _ ___32_31___ _ ___24_23_____20_19_____16_15__ _ __4_3_______0 + * | | | | | | | + * |RES0 | Implementer | Variant | Arch | PartNum |Revision| + * |____ _ _____|_____ _ _____|_________|_______ 
_|____ _ ___|________| + * + */ + +#define ARM_CPU_IMP_ARM 0x41 +#define HISI_CPU_IMP 0x48 + +#define ARM_CPU_PART_CORTEX_A72 0xD08 +#define ARM_CPU_PART_N1 0xD0C +#define ARM_CPU_PART_V1 0xD40 +#define ARM_CPU_PART_N2 0xD49 +#define HISI_CPU_PART_KP920 0xD01 + +#define MIDR_PARTNUM_SHIFT 4 +#define MIDR_PARTNUM_MASK (0xfffU << MIDR_PARTNUM_SHIFT) +#define MIDR_PARTNUM(midr) \ + (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT) + +#define MIDR_IMPLEMENTER_SHIFT 24 +#define MIDR_IMPLEMENTER_MASK (0xffU << MIDR_IMPLEMENTER_SHIFT) +#define MIDR_IMPLEMENTER(midr) \ + (((midr) & MIDR_IMPLEMENTER_MASK) >> MIDR_IMPLEMENTER_SHIFT) + +#define MIDR_ARCHITECTURE_SHIFT 16 +#define MIDR_ARCHITECTURE_MASK (0xfU << MIDR_ARCHITECTURE_SHIFT) +#define MIDR_ARCHITECTURE(midr) \ + (((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT) + +#define MIDR_CPU_MODEL_MASK \ + (MIDR_IMPLEMENTER_MASK | \ + MIDR_PARTNUM_MASK | \ + MIDR_ARCHITECTURE_MASK) + +#define MIDR_CPU_MODEL(imp, partnum) \ + (((imp) << MIDR_IMPLEMENTER_SHIFT) | \ + (0xfU << MIDR_ARCHITECTURE_SHIFT) | \ + ((partnum) << MIDR_PARTNUM_SHIFT)) + +#define MIDR_IS_CPU_MODEL(midr, imp, partnum) \ + (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum)) + +#if defined(__ASSEMBLER__) + /* + * Support macros for + * - Armv8.3-A Pointer Authentication and + * - Armv8.5-A Branch Target Identification + * features which require emitting a .note.gnu.property section with the + * appropriate architecture-dependent feature bits set. 
+ * Read more: "ELF for the Arm® 64-bit Architecture" + */ +# if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1 +# define GNU_PROPERTY_AARCH64_BTI (1 << 0) /* Has Branch Target Identification */ +# define AARCH64_VALID_CALL_TARGET hint #34 /* BTI 'c' */ +# else +# define GNU_PROPERTY_AARCH64_BTI 0 /* No Branch Target Identification */ +# define AARCH64_VALID_CALL_TARGET +# endif + +# if defined(__ARM_FEATURE_PAC_DEFAULT) && \ + (__ARM_FEATURE_PAC_DEFAULT & 1) == 1 /* Signed with A-key */ +# define GNU_PROPERTY_AARCH64_POINTER_AUTH (1 << 1) /* Has Pointer Authentication */ +# define AARCH64_SIGN_LINK_REGISTER hint #25 /* PACIASP */ +# define AARCH64_VALIDATE_LINK_REGISTER hint #29 /* AUTIASP */ +# elif defined(__ARM_FEATURE_PAC_DEFAULT) && \ + (__ARM_FEATURE_PAC_DEFAULT & 2) == 2 /* Signed with B-key */ +# define GNU_PROPERTY_AARCH64_POINTER_AUTH (1 << 1) /* Has Pointer Authentication */ +# define AARCH64_SIGN_LINK_REGISTER hint #27 /* PACIBSP */ +# define AARCH64_VALIDATE_LINK_REGISTER hint #31 /* AUTIBSP */ +# else +# define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 /* No Pointer Authentication */ +# if GNU_PROPERTY_AARCH64_BTI != 0 +# define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET +# else +# define AARCH64_SIGN_LINK_REGISTER +# endif +# define AARCH64_VALIDATE_LINK_REGISTER +# endif + +# if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0 + .pushsection .note.gnu.property, "a"; + .balign 8; + .long 4; + .long 0x10; + .long 0x5; + .asciz "GNU"; + .long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ + .long 4; + .long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI); + .long 0; + .popsection; +# endif + +#endif /* defined __ASSEMBLER__ */ + +#define IS_CPU_SUPPORT_UNROLL8_EOR3() \ + (ARMCAP_P & ARMV8_UNROLL8_EOR3) + +#ifdef __cplusplus +} +#endif + +#endif /* __ARM_ARCH_CE_H */ diff --git a/include/wd_alg.h b/include/wd_alg.h index f8b136e..861b7d9 100644 --- a/include/wd_alg.h +++ 
b/include/wd_alg.h @@ -19,6 +19,49 @@ extern "C" { #define ALG_NAME_SIZE 128 #define DEV_NAME_LEN 128
+/* + * Macros related to arm platform: + * ARM puts the feature bits for Crypto Extensions in AT_HWCAP2, whereas + * AArch64 used AT_HWCAP. + */ +#ifndef AT_HWCAP +# define AT_HWCAP 16 +#endif + +#ifndef AT_HWCAP2 +# define AT_HWCAP2 26 +#endif + +#if defined(__arm__) || defined(__arm) +# define HWCAP AT_HWCAP +# define HWCAP_NEON (1 << 12) + +# define HWCAP_CE AT_HWCAP2 +# define HWCAP_CE_AES (1 << 0) +# define HWCAP_CE_PMULL (1 << 1) +# define HWCAP_CE_SHA1 (1 << 2) +# define HWCAP_CE_SHA256 (1 << 3) +#elif defined(__aarch64__) +# define HWCAP AT_HWCAP +# define HWCAP_NEON (1 << 1) + +# define HWCAP_CE HWCAP +# define HWCAP_CE_AES (1 << 3) +# define HWCAP_CE_PMULL (1 << 4) +# define HWCAP_CE_SHA1 (1 << 5) +# define HWCAP_CE_SHA256 (1 << 6) +# define HWCAP_CPUID (1 << 11) +# define HWCAP_SHA3 (1 << 17) +# define HWCAP_CE_SM3 (1 << 18) +# define HWCAP_CE_SM4 (1 << 19) +# define HWCAP_CE_SHA512 (1 << 21) +# define HWCAP_SVE (1 << 22) +/* AT_HWCAP2 */ +# define HWCAP2 26 +# define HWCAP2_SVE2 (1 << 1) +# define HWCAP2_RNG (1 << 16) +#endif + enum alg_dev_type { UADK_ALG_SOFT = 0x0, UADK_ALG_CE_INSTR = 0x1, diff --git a/wd_alg.c b/wd_alg.c index 3b111c8..f34a407 100644 --- a/wd_alg.c +++ b/wd_alg.c @@ -9,6 +9,7 @@ #include <stdbool.h> #include <stdlib.h> #include <pthread.h> +#include <sys/auxv.h>
#include "wd.h" #include "wd_alg.h" @@ -90,6 +91,46 @@ static bool wd_check_accel_dev(const char *dev_name) return false; }
+/* + * wd_check_ce_support() - check whether the CPU advertises the Crypto + * Extension instructions that an instruction-based driver relies on. + * @dev_name: driver name, "isa_ce_sm3" or "isa_ce_sm4". + * + * HWCAP_CE_SM3 and HWCAP_CE_SM4 are only defined for AArch64 in wd_alg.h, + * so each test is compiled only when its feature bit exists; on other + * targets no CE support is reported. + * + * Return: true if the matching feature bit is set, false otherwise + * (including a NULL or unrecognised @dev_name). + */ +static bool wd_check_ce_support(const char *dev_name) +{ + unsigned long hwcaps = 0; + + if (!dev_name) + return false; + +#if defined(__arm__) || defined(__arm) + hwcaps = getauxval(AT_HWCAP2); +#elif defined(__aarch64__) + hwcaps = getauxval(AT_HWCAP); +#endif + +#ifdef HWCAP_CE_SM3 + if (!strcmp("isa_ce_sm3", dev_name) && (hwcaps & HWCAP_CE_SM3)) + return true; +#endif + +#ifdef HWCAP_CE_SM4 + if (!strcmp("isa_ce_sm4", dev_name) && (hwcaps & HWCAP_CE_SM4)) + return true; +#endif + + /* hwcaps is otherwise unused when neither feature bit is defined */ + (void)hwcaps; + return false; +} + static bool wd_alg_check_available(int calc_type, const char *dev_name) { bool ret = false; @@ -99,6 +118,7 @@ static bool wd_alg_check_available(int calc_type, const char *dev_name) break; /* Should find the CPU if not support CE */ case UADK_ALG_CE_INSTR: + ret = wd_check_ce_support(dev_name); break; /* Should find the CPU if not support SVE */ case UADK_ALG_SVE_INSTR: @@ -280,8 +300,13 @@ struct wd_alg_driver *wd_request_drv(const char *alg_name, bool hw_mask) struct wd_alg_driver *drv = NULL; int tmp_priority = -1;
- if (!pnext || !alg_name) { - WD_ERR("invalid: request alg param is error!\n"); + if (!pnext) { + WD_ERR("invalid: requset drv pnext is NULL!\n"); + return NULL; + } + + if (!alg_name) { + WD_ERR("invalid: alg_name is NULL!\n"); return NULL; }
@@ -289,7 +314,8 @@ struct wd_alg_driver *wd_request_drv(const char *alg_name, bool hw_mask) pthread_mutex_lock(&mutex); while (pnext) { /* hw_mask true mean not to used hardware dev */ - if (hw_mask && pnext->drv->calc_type == UADK_ALG_HW) { + if ((hw_mask && pnext->drv->calc_type == UADK_ALG_HW) || + (!hw_mask && pnext->drv->calc_type != UADK_ALG_HW)) { pnext = pnext->next; continue; } diff --git a/wd_digest.c b/wd_digest.c index c59184d..491502a 100644 --- a/wd_digest.c +++ b/wd_digest.c @@ -222,7 +222,7 @@ static void wd_digest_clear_status(void) }
static int wd_digest_init_nolock(struct wd_ctx_config *config, - struct wd_sched *sched) + struct wd_sched *sched) { int ret;
diff --git a/wd_sched.c b/wd_sched.c index 419280e..b43834d 100644 --- a/wd_sched.c +++ b/wd_sched.c @@ -453,7 +453,7 @@ static struct wd_sched sched_table[SCHED_POLICY_BUTT] = { .poll_policy = session_sched_poll_policy, }, { .name = "None scheduler", - .sched_policy = SCHED_POLICY_SINGLE, + .sched_policy = SCHED_POLICY_NONE, .sched_init = sched_none_init, .pick_next_ctx = sched_none_pick_next_ctx, .poll_policy = sched_none_poll_policy, diff --git a/wd_util.c b/wd_util.c index 6134239..39909ca 100644 --- a/wd_util.c +++ b/wd_util.c @@ -91,6 +91,11 @@ struct acc_alg_item { char *algtype; };
+struct wd_ce_ctx { + char *drv_name; + void *priv; +}; + static struct acc_alg_item alg_options[] = { {"zlib", "zlib"}, {"gzip", "gzip"}, @@ -229,7 +234,6 @@ int wd_init_ctx_config(struct wd_ctx_config_internal *in, ret = -WD_EINVAL; goto err_out; } - clone_ctx_to_internal(cfg->ctxs + i, ctxs + i); ret = pthread_spin_init(&ctxs[i].lock, PTHREAD_PROCESS_SHARED); if (ret) { @@ -2612,14 +2616,66 @@ out_freelist: return ret; }
+/* + * wd_alg_ce_ctx_init() - create the single ctx used by CE/soft drivers. + * @attrs: init attributes; attrs->ctx_config must point to a valid config. + * + * Allocates a one-entry ctx array whose handle points to a zeroed + * struct wd_ce_ctx. ctx_num is set only after both allocations succeed. + * The memory is released by wd_alg_ce_ctx_uninit(). + * + * Return: WD_SUCCESS, or -WD_ENOMEM if an allocation fails (ctxs is then + * left NULL). + */ +static int wd_alg_ce_ctx_init(struct wd_init_attrs *attrs) +{ + struct wd_ctx_config *ctx_config = attrs->ctx_config; + struct wd_ce_ctx *ce_ctx; + + ctx_config->ctxs = calloc(1, sizeof(struct wd_ctx)); + if (!ctx_config->ctxs) { + WD_ERR("failed to alloc ctxs!\n"); + return -WD_ENOMEM; + } + + ce_ctx = calloc(1, sizeof(*ce_ctx)); + if (!ce_ctx) { + WD_ERR("failed to alloc ce ctx!\n"); + free(ctx_config->ctxs); + ctx_config->ctxs = NULL; + return -WD_ENOMEM; + } + + ctx_config->ctxs[0].ctx = (handle_t)ce_ctx; + ctx_config->ctx_num = 1; + + return WD_SUCCESS; +} + +/* Free the ctx objects and the ctx array allocated by wd_alg_ce_ctx_init() */ +static void wd_alg_ce_ctx_uninit(struct wd_ctx_config *ctx_config) +{ + __u32 i; + + for (i = 0; i < ctx_config->ctx_num; i++) { + if (ctx_config->ctxs[i].ctx) { + free((struct wd_ce_ctx *)ctx_config->ctxs[i].ctx); + ctx_config->ctxs[i].ctx = 0; + } + } + + free(ctx_config->ctxs); +} + static void wd_alg_ctx_uninit(struct wd_ctx_config *ctx_config) { __u32 i;
- for (i = 0; i < ctx_config->ctx_num; i++) + for (i = 0; i < ctx_config->ctx_num; i++) { if (ctx_config->ctxs[i].ctx) { wd_release_ctx(ctx_config->ctxs[i].ctx); ctx_config->ctxs[i].ctx = 0; + } }
free(ctx_config->ctxs); @@ -2633,9 +2667,9 @@ int wd_alg_attrs_init(struct wd_init_attrs *attrs) struct wd_ctx_config *ctx_config = NULL; struct wd_sched *alg_sched = NULL; char alg_type[CRYPTO_MAX_ALG_NAME]; - char *alg = attrs->alg; int driver_type = UADK_ALG_HW; - int ret; + char *alg = attrs->alg; + int ret = 0;
if (!attrs->ctx_params) return -WD_EINVAL; @@ -2646,22 +2680,39 @@ int wd_alg_attrs_init(struct wd_init_attrs *attrs) switch (driver_type) { case UADK_ALG_SOFT: case UADK_ALG_CE_INSTR: - /* No need to alloc resource */ - if (sched_type != SCHED_POLICY_NONE) + /* No need to alloc resource */ + if (sched_type != SCHED_POLICY_NONE) { + WD_ERR("invalid sched_type\n"); return -WD_EINVAL; + } + + ctx_config = calloc(1, sizeof(*ctx_config)); + if (!ctx_config) { + WD_ERR("fail to alloc ctx config\n"); + return -WD_ENOMEM; + } + attrs->ctx_config = ctx_config;
alg_sched = wd_sched_rr_alloc(SCHED_POLICY_NONE, 1, 1, alg_poll_func); if (!alg_sched) { WD_ERR("fail to alloc scheduler\n"); - return -WD_EINVAL; + /* keep reporting -WD_EINVAL now that this path jumps to cleanup */ + ret = -WD_EINVAL; + goto out_ctx_config; } + attrs->sched = alg_sched;
- ret = wd_sched_rr_instance(alg_sched, NULL); + ret = wd_alg_ce_ctx_init(attrs); if (ret) { - WD_ERR("fail to instance scheduler\n"); + WD_ERR("fail to init ce ctx\n"); goto out_freesched; } + + ret = alg_init_func(ctx_config, alg_sched); + if (ret) + goto out_pre_init; + break; case UADK_ALG_SVE_INSTR: /* Todo lock cpu core */ @@ -2720,7 +2769,10 @@ int wd_alg_attrs_init(struct wd_init_attrs *attrs) return 0;
out_pre_init: - wd_alg_ctx_uninit(ctx_config); + if (driver_type == UADK_ALG_CE_INSTR || driver_type == UADK_ALG_SOFT) + wd_alg_ce_ctx_uninit(ctx_config); + else + wd_alg_ctx_uninit(ctx_config); out_freesched: wd_sched_rr_release(alg_sched); out_ctx_config: @@ -2733,10 +2785,20 @@ void wd_alg_attrs_uninit(struct wd_init_attrs *attrs) { struct wd_ctx_config *ctx_config = attrs->ctx_config; struct wd_sched *alg_sched = attrs->sched; + int driver_type = UADK_ALG_HW; + + /* with no driver bound, use the default (hardware) teardown path */ + if (attrs->driver) + driver_type = attrs->driver->calc_type;
if (ctx_config) { - wd_alg_ctx_uninit(ctx_config); + /* CE/soft ctxs were calloc()ed, so they are freed instead of released */ + if (driver_type == UADK_ALG_CE_INSTR || driver_type == UADK_ALG_SOFT) + wd_alg_ce_ctx_uninit(ctx_config); + else + wd_alg_ctx_uninit(ctx_config); free(ctx_config); } + wd_sched_rr_release(alg_sched); }