From 54ab27e579595fde8f99e909453172676d5d676c Mon Sep 17 00:00:00 2001 From: Michael Zohner Date: Thu, 28 Jan 2016 14:16:59 +0100 Subject: [PATCH] Added pipelined AES routines for fast randomness generation --- src/util/crypto/crypto.h | 1 + src/util/crypto/intrin_sequential_enc8.c | 283 +++++++++++++++++++++++ src/util/crypto/intrin_sequential_enc8.h | 33 +++ 3 files changed, 317 insertions(+) create mode 100644 src/util/crypto/intrin_sequential_enc8.c create mode 100644 src/util/crypto/intrin_sequential_enc8.h diff --git a/src/util/crypto/crypto.h b/src/util/crypto/crypto.h index 9a57237..25ace6a 100644 --- a/src/util/crypto/crypto.h +++ b/src/util/crypto/crypto.h @@ -21,6 +21,7 @@ #include "../codewords.h" #include "../socket.h" #include "TedKrovetzAesNiWrapperC.h" +#include "intrin_sequential_enc8.h" #define AES_BYTES 16 diff --git a/src/util/crypto/intrin_sequential_enc8.c b/src/util/crypto/intrin_sequential_enc8.c new file mode 100644 index 0000000..b7ee8cd --- /dev/null +++ b/src/util/crypto/intrin_sequential_enc8.c @@ -0,0 +1,283 @@ +/* + * Copied and modified from Shay Gueron's intrin_sequential_ks4_enc8.cpp + * +/********************************************************************/ +/* Copyright(c) 2014, Intel Corp. */ +/* Developers and authors: Shay Gueron (1) (2) */ +/* (1) University of Haifa, Israel */ +/* (2) Intel, Israel */ +/* IPG, Architecture, Israel Development Center, Haifa, Israel */ +/********************************************************************/ + +#include "intrin_sequential_enc8.h" + +#ifdef AES256_HASH +#include +#include +#include + +#if !defined (ALIGN16) +#if defined (__GNUC__) +# define ALIGN16 __attribute__ ( (aligned (16))) +# else +# define ALIGN16 __declspec (align (16)) +# endif +#endif + +#if defined(__INTEL_COMPILER) +# include +#elif defined(__GNUC__) +# include +# include +#endif + +typedef struct KEY_SCHEDULE +{ + ALIGN16 unsigned char KEY[16*15]; + unsigned int nr; +} ROUND_KEYS; + + + +#define KS_BLOCK(t, reg, reg2) {globAux=_mm_slli_epi64(reg, 32);\ + reg=_mm_xor_si128(globAux, reg);\ + globAux=_mm_shuffle_epi8(reg, con3);\ + reg=_mm_xor_si128(globAux, reg);\ + reg=_mm_xor_si128(reg2, reg);\ + } + +#define KS_round(i) { x2 =_mm_shuffle_epi8(keyA, mask); \ + keyA_aux=_mm_aesenclast_si128 (x2, con); \ + KS_BLOCK(0, keyA, keyA_aux);\ + x2 =_mm_shuffle_epi8(keyB, mask); \ + keyB_aux=_mm_aesenclast_si128 (x2, con); \ + KS_BLOCK(1, keyB, keyB_aux);\ + x2 =_mm_shuffle_epi8(keyC, mask); \ + keyC_aux=_mm_aesenclast_si128 (x2, con); \ + KS_BLOCK(2, keyC, keyC_aux);\ + x2 =_mm_shuffle_epi8(keyD, mask); \ + keyD_aux=_mm_aesenclast_si128 (x2, con); \ + KS_BLOCK(3, keyD, keyD_aux);\ + con=_mm_slli_epi32(con, 1);\ + _mm_storeu_si128((__m128i *)(keyptr[0].KEY+i*16), keyA);\ + _mm_storeu_si128((__m128i *)(keyptr[1].KEY+i*16), keyB); \ + _mm_storeu_si128((__m128i *)(keyptr[2].KEY+i*16), keyC); \ + _mm_storeu_si128((__m128i *)(keyptr[3].KEY+i*16), keyD); \ + } + +#define KS_round_last(i) { x2 =_mm_shuffle_epi8(keyA, mask); \ + keyA_aux=_mm_aesenclast_si128 (x2, con); \ + x2 =_mm_shuffle_epi8(keyB, mask); \ + keyB_aux=_mm_aesenclast_si128 (x2, con); \ + x2 =_mm_shuffle_epi8(keyC, mask); \ + keyC_aux=_mm_aesenclast_si128 (x2, con); \ + x2 =_mm_shuffle_epi8(keyD, mask); \ + keyD_aux=_mm_aesenclast_si128 (x2, con); \ + KS_BLOCK(0, keyA, keyA_aux);\ + KS_BLOCK(1, keyB, keyB_aux);\ + KS_BLOCK(2, keyC, keyC_aux);\ + KS_BLOCK(3, keyD, keyD_aux);\ + _mm_storeu_si128((__m128i *)(keyptr[0].KEY+i*16), keyA);\ + _mm_storeu_si128((__m128i *)(keyptr[1].KEY+i*16), keyB); \ + _mm_storeu_si128((__m128i *)(keyptr[2].KEY+i*16), keyC); \ + _mm_storeu_si128((__m128i *)(keyptr[3].KEY+i*16), keyD); \ + } + +#define READ_KEYS(i) {keyA = _mm_loadu_si128((__m128i const*)(keyptr[0].KEY+i*16));\ + keyB = _mm_loadu_si128((__m128i const*)(keyptr[1].KEY+i*16));\ + keyC = _mm_loadu_si128((__m128i const*)(keyptr[2].KEY+i*16));\ + keyD = _mm_loadu_si128((__m128i const*)(keyptr[3].KEY+i*16));\ + keyE = _mm_loadu_si128((__m128i const*)(keyptr[4].KEY+i*16));\ + keyF = _mm_loadu_si128((__m128i const*)(keyptr[5].KEY+i*16));\ + keyG = _mm_loadu_si128((__m128i const*)(keyptr[6].KEY+i*16));\ + keyH = _mm_loadu_si128((__m128i const*)(keyptr[7].KEY+i*16));\ + } + +#define ENC_round(i) {block1=_mm_aesenc_si128(block1, (*(__m128i const*)(keyptr[0].KEY+i*16))); \ + block2=_mm_aesenc_si128(block2, (*(__m128i const*)(keyptr[1].KEY+i*16))); \ + block3=_mm_aesenc_si128(block3, (*(__m128i const*)(keyptr[2].KEY+i*16))); \ + block4=_mm_aesenc_si128(block4, (*(__m128i const*)(keyptr[3].KEY+i*16))); \ + block5=_mm_aesenc_si128(block5, (*(__m128i const*)(keyptr[4].KEY+i*16))); \ + block6=_mm_aesenc_si128(block6, (*(__m128i const*)(keyptr[5].KEY+i*16))); \ + block7=_mm_aesenc_si128(block7, (*(__m128i const*)(keyptr[6].KEY+i*16))); \ + block8=_mm_aesenc_si128(block8, (*(__m128i const*)(keyptr[7].KEY+i*16))); \ +} + +#define ENC_round_last(i) {block1=_mm_aesenclast_si128(block1, (*(__m128i const*)(keyptr[0].KEY+i*16))); \ + block2=_mm_aesenclast_si128(block2, (*(__m128i const*)(keyptr[1].KEY+i*16))); \ + block3=_mm_aesenclast_si128(block3, (*(__m128i const*)(keyptr[2].KEY+i*16))); \ + block4=_mm_aesenclast_si128(block4, (*(__m128i const*)(keyptr[3].KEY+i*16))); \ + block5=_mm_aesenclast_si128(block5, (*(__m128i const*)(keyptr[4].KEY+i*16))); \ + block6=_mm_aesenclast_si128(block6, (*(__m128i const*)(keyptr[5].KEY+i*16))); \ + block7=_mm_aesenclast_si128(block7, (*(__m128i const*)(keyptr[6].KEY+i*16))); \ + block8=_mm_aesenclast_si128(block8, (*(__m128i const*)(keyptr[7].KEY+i*16))); \ +} + +//generates nkeys round keys from the bytes stored in key_bytes +void intrin_sequential_ksn(unsigned char* ks, unsigned char* key_bytes, int nkeys) { + ROUND_KEYS *keyptr=(ROUND_KEYS *)ks; + register __m128i keyA, keyB, keyC, keyD, con, mask, x2, keyA_aux, keyB_aux, keyC_aux, keyD_aux, globAux; + int i; + int _con1[4]={1,1,1,1}; + int _con2[4]={0x1b,0x1b,0x1b,0x1b}; + int _mask[4]={0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d}; + int _con3[4]={0x0ffffffff, 0x0ffffffff, 0x07060504, 0x07060504}; + __m128i con3=_mm_loadu_si128((__m128i const*)_con3); + + for (i=0;i