First test for pipelined AES

This commit is contained in:
Michael Zohner 2016-01-28 14:59:31 +01:00
parent 7add229725
commit ac4d534b02
6 changed files with 317 additions and 9 deletions

View File

@ -7,9 +7,9 @@ EXT=${SRC}/externals
# compiler settings
CC=g++
#COMPILER_OPTIONS=-O2
COMPILER_OPTIONS=-g3 -ggdb -O2 -Wall -Wextra #-fPIC -mavx -maes -mpclmul -DRDTSC -DTEST=AES128
COMPILER_OPTIONS=-g3 -O2 #-fPIC -mavx -maes -mpclmul -DRDTSC -DTEST=AES128
DEBUG_OPTIONS=-g3
DEBUG_OPTIONS=-g3 -ggdb #-Wall -Wextra
BATCH=
@ -67,7 +67,7 @@ all: miracl core bench demo
core: ${OBJECTS_CORE}
%.o:%.cpp %.h
${CC} $< ${COMPILER_OPTIONS} -c ${INCLUDE} ${LIBRARIES} ${CFLAGS} ${BATCH} -o $@
${CC} $< ${COMPILER_OPTIONS} ${DEBUG_OPTIONS} -c ${INCLUDE} ${LIBRARIES} ${CFLAGS} ${BATCH} -o $@
bench:
${CC} -o psi.exe ${SRC}/mains/bench_psi.cpp ${OBJECTS_DHPSI} ${OBJECTS_OTPSI} ${OBJECTS_NAIVE} ${OBJECTS_SERVERAIDED} ${OBJECTS_UTIL} ${OBJECTS_HASHING} ${OBJECTS_CRYPTO} ${OBJECTS_OT} ${OBJECTS_MIRACL} ${CFLAGS} ${DEBUG_OPTIONS} ${LIBRARIES} ${MIRACL_LIB} ${INCLUDE} ${COMPILER_OPTIONS}

View File

@ -0,0 +1,258 @@
/*
* Copied and modified from Shay Gueron's intrin_sequential_ks4_enc8.cpp
*/
/********************************************************************/
/* Copyright(c) 2014, Intel Corp. */
/* Developers and authors: Shay Gueron (1) (2) */
/* (1) University of Haifa, Israel */
/* (2) Intel, Israel */
/* IPG, Architecture, Israel Development Center, Haifa, Israel */
/********************************************************************/
#include "intrin_sequential_enc8.h"
#ifdef AES256_HASH
/*
 * KS_BLOCK(t, reg, reg2): finishes one key-expansion step for a single key.
 *
 *   reg  - in/out: the previous round key (four 32-bit words w0..w3); on exit
 *          holds the next round key.
 *   reg2 - value XORed into every word at the end (the callers pass the
 *          result of _mm_aesenclast_si128 on the rotated last word).
 *   t    - unused; the callers pass the key index 0..3.
 *
 * The first four statements turn (w0, w1, w2, w3) into the running XOR
 * (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3):
 *   - shifting each 64-bit lane left by 32 and XORing yields (w0, w0^w1, w2, w2^w3);
 *   - the byte shuffle with con3 zeroes the low 8 bytes and copies bytes 4..7
 *     (w0^w1) into both upper words, which is then XORed in.
 * This relies on con3 being {0xffffffff, 0xffffffff, 0x07060504, 0x07060504}
 * as set up in intrin_sequential_ks4.
 *
 * Uses the caller's local variables globAux and con3 as scratch/constant.
 */
#define KS_BLOCK(t, reg, reg2) {globAux=_mm_slli_epi64(reg, 32);\
reg=_mm_xor_si128(globAux, reg);\
globAux=_mm_shuffle_epi8(reg, con3);\
reg=_mm_xor_si128(globAux, reg);\
reg=_mm_xor_si128(reg2, reg);\
}
/*
 * KS_round(i): derives round key i for four keys at once and stores it.
 *
 * For each of keyA..keyD (which hold round key i-1 on entry):
 *   - shuffle with `mask` to broadcast the byte-rotated last word into all
 *     four words (mask is 0x0c0f0e0d per word in intrin_sequential_ks4);
 *   - _mm_aesenclast_si128 with `con` as the "round key" substitutes the
 *     bytes and XORs in the round constant held in `con`;
 *   - KS_BLOCK folds that value into the previous round key.
 * Afterwards `con` is shifted left by one bit (doubling the round constant
 * for the next round) and the four new round keys are written to
 * keyptr[0..3].KEY at byte offset (i)*16.
 *
 * Expects these locals in the expanding scope: keyptr, keyA..keyD,
 * keyA_aux..keyD_aux, x2, con, mask, plus globAux and con3 for KS_BLOCK.
 *
 * The argument is parenthesized so that expressions such as KS_round(r+1)
 * compute the intended offset. The macro expands to a braced block and is
 * used without a trailing semicolon.
 */
#define KS_round(i) { \
    x2 = _mm_shuffle_epi8(keyA, mask); \
    keyA_aux = _mm_aesenclast_si128(x2, con); \
    KS_BLOCK(0, keyA, keyA_aux); \
    x2 = _mm_shuffle_epi8(keyB, mask); \
    keyB_aux = _mm_aesenclast_si128(x2, con); \
    KS_BLOCK(1, keyB, keyB_aux); \
    x2 = _mm_shuffle_epi8(keyC, mask); \
    keyC_aux = _mm_aesenclast_si128(x2, con); \
    KS_BLOCK(2, keyC, keyC_aux); \
    x2 = _mm_shuffle_epi8(keyD, mask); \
    keyD_aux = _mm_aesenclast_si128(x2, con); \
    KS_BLOCK(3, keyD, keyD_aux); \
    con = _mm_slli_epi32(con, 1); \
    _mm_storeu_si128((__m128i *)(keyptr[0].KEY + (i) * 16), keyA); \
    _mm_storeu_si128((__m128i *)(keyptr[1].KEY + (i) * 16), keyB); \
    _mm_storeu_si128((__m128i *)(keyptr[2].KEY + (i) * 16), keyC); \
    _mm_storeu_si128((__m128i *)(keyptr[3].KEY + (i) * 16), keyD); \
}
/*
 * KS_round_last(i): same derivation as KS_round for the final round key,
 * except that `con` is NOT shifted afterwards (no further round follows).
 *
 * All four substituted/rotated words are computed first, then the four
 * KS_BLOCK folds are applied, then the new round keys are stored to
 * keyptr[0..3].KEY at byte offset (i)*16.
 *
 * Expects the same locals as KS_round: keyptr, keyA..keyD,
 * keyA_aux..keyD_aux, x2, con, mask, globAux, con3.
 *
 * The argument is parenthesized so that non-literal expressions compute the
 * intended offset. The macro expands to a braced block and is used without
 * a trailing semicolon.
 */
#define KS_round_last(i) { \
    x2 = _mm_shuffle_epi8(keyA, mask); \
    keyA_aux = _mm_aesenclast_si128(x2, con); \
    x2 = _mm_shuffle_epi8(keyB, mask); \
    keyB_aux = _mm_aesenclast_si128(x2, con); \
    x2 = _mm_shuffle_epi8(keyC, mask); \
    keyC_aux = _mm_aesenclast_si128(x2, con); \
    x2 = _mm_shuffle_epi8(keyD, mask); \
    keyD_aux = _mm_aesenclast_si128(x2, con); \
    KS_BLOCK(0, keyA, keyA_aux); \
    KS_BLOCK(1, keyB, keyB_aux); \
    KS_BLOCK(2, keyC, keyC_aux); \
    KS_BLOCK(3, keyD, keyD_aux); \
    _mm_storeu_si128((__m128i *)(keyptr[0].KEY + (i) * 16), keyA); \
    _mm_storeu_si128((__m128i *)(keyptr[1].KEY + (i) * 16), keyB); \
    _mm_storeu_si128((__m128i *)(keyptr[2].KEY + (i) * 16), keyC); \
    _mm_storeu_si128((__m128i *)(keyptr[3].KEY + (i) * 16), keyD); \
}
/*
 * READ_KEYS(i): loads round key i of eight consecutive key schedules
 * (keyptr[0..7]) into the caller's locals keyA..keyH using unaligned loads.
 *
 * The argument is parenthesized so that non-literal expressions compute the
 * intended byte offset (i)*16. The macro expands to a braced block and is
 * used without a trailing semicolon.
 */
#define READ_KEYS(i) { \
    keyA = _mm_loadu_si128((__m128i const*)(keyptr[0].KEY + (i) * 16)); \
    keyB = _mm_loadu_si128((__m128i const*)(keyptr[1].KEY + (i) * 16)); \
    keyC = _mm_loadu_si128((__m128i const*)(keyptr[2].KEY + (i) * 16)); \
    keyD = _mm_loadu_si128((__m128i const*)(keyptr[3].KEY + (i) * 16)); \
    keyE = _mm_loadu_si128((__m128i const*)(keyptr[4].KEY + (i) * 16)); \
    keyF = _mm_loadu_si128((__m128i const*)(keyptr[5].KEY + (i) * 16)); \
    keyG = _mm_loadu_si128((__m128i const*)(keyptr[6].KEY + (i) * 16)); \
    keyH = _mm_loadu_si128((__m128i const*)(keyptr[7].KEY + (i) * 16)); \
}
/*
 * ENC_round(i): applies one AES round (_mm_aesenc_si128) to the caller's
 * locals block1..block8, where block k uses round key i of keyptr[k-1].
 *
 * The round key is read by dereferencing a __m128i pointer rather than via
 * an explicit unaligned load. NOTE(review): this presumably requires each
 * ROUND_KEYS::KEY array to be 16-byte aligned in memory - confirm against
 * how the key-schedule array is allocated.
 *
 * The argument is parenthesized so that non-literal expressions compute the
 * intended byte offset (i)*16. The macro expands to a braced block and is
 * used without a trailing semicolon.
 */
#define ENC_round(i) { \
    block1 = _mm_aesenc_si128(block1, (*(__m128i const*)(keyptr[0].KEY + (i) * 16))); \
    block2 = _mm_aesenc_si128(block2, (*(__m128i const*)(keyptr[1].KEY + (i) * 16))); \
    block3 = _mm_aesenc_si128(block3, (*(__m128i const*)(keyptr[2].KEY + (i) * 16))); \
    block4 = _mm_aesenc_si128(block4, (*(__m128i const*)(keyptr[3].KEY + (i) * 16))); \
    block5 = _mm_aesenc_si128(block5, (*(__m128i const*)(keyptr[4].KEY + (i) * 16))); \
    block6 = _mm_aesenc_si128(block6, (*(__m128i const*)(keyptr[5].KEY + (i) * 16))); \
    block7 = _mm_aesenc_si128(block7, (*(__m128i const*)(keyptr[6].KEY + (i) * 16))); \
    block8 = _mm_aesenc_si128(block8, (*(__m128i const*)(keyptr[7].KEY + (i) * 16))); \
}
/*
 * ENC_round_last(i): applies the final AES round (_mm_aesenclast_si128) to
 * the caller's locals block1..block8, where block k uses round key i of
 * keyptr[k-1].
 *
 * As in ENC_round, the round key is read by dereferencing a __m128i pointer.
 * NOTE(review): presumably requires 16-byte aligned ROUND_KEYS::KEY storage -
 * confirm against the allocation site.
 *
 * The argument is parenthesized so that non-literal expressions compute the
 * intended byte offset (i)*16. The macro expands to a braced block and is
 * used without a trailing semicolon.
 */
#define ENC_round_last(i) { \
    block1 = _mm_aesenclast_si128(block1, (*(__m128i const*)(keyptr[0].KEY + (i) * 16))); \
    block2 = _mm_aesenclast_si128(block2, (*(__m128i const*)(keyptr[1].KEY + (i) * 16))); \
    block3 = _mm_aesenclast_si128(block3, (*(__m128i const*)(keyptr[2].KEY + (i) * 16))); \
    block4 = _mm_aesenclast_si128(block4, (*(__m128i const*)(keyptr[3].KEY + (i) * 16))); \
    block5 = _mm_aesenclast_si128(block5, (*(__m128i const*)(keyptr[4].KEY + (i) * 16))); \
    block6 = _mm_aesenclast_si128(block6, (*(__m128i const*)(keyptr[5].KEY + (i) * 16))); \
    block7 = _mm_aesenclast_si128(block7, (*(__m128i const*)(keyptr[6].KEY + (i) * 16))); \
    block8 = _mm_aesenclast_si128(block8, (*(__m128i const*)(keyptr[7].KEY + (i) * 16))); \
}
/*
 * Expands key schedules (11 round keys, nr = 10) for nkeys keys, four keys
 * per loop iteration.
 *
 *   ks        - output: array of key schedules; entry k receives the schedule
 *               for the k-th 16-byte key. Round key r is stored at
 *               KEY + r*16, with round key 0 being the raw key bytes.
 *   key_bytes - input: 16 bytes of key material per key, stored contiguously.
 *   nkeys     - number of keys to expand.
 *
 * Keys are always processed in groups of four: if nkeys is not a multiple of
 * four, the last iteration still reads 64 bytes of key_bytes and writes four
 * entries of ks, so both buffers must be sized for nkeys rounded up to the
 * next multiple of four. Nothing is done when nkeys <= 0.
 */
void intrin_sequential_ks4(ROUND_KEYS* ks, unsigned char* key_bytes, int nkeys) {
    /* Constant tables are unsigned so that 0xffffffff is not a narrowing
     * conversion inside the braced initializer (ill-formed since C++11). */
    static const unsigned int rcon_start[4]  = {1, 1, 1, 1};
    static const unsigned int rcon_wrap[4]   = {0x1b, 0x1b, 0x1b, 0x1b};
    static const unsigned int rotword_sel[4] = {0x0c0f0e0d, 0x0c0f0e0d, 0x0c0f0e0d, 0x0c0f0e0d};
    static const unsigned int prefix_sel[4]  = {0xffffffffu, 0xffffffffu, 0x07060504, 0x07060504};

    /* The names keyptr, keyA..keyD, keyX_aux, x2, con, mask, globAux and con3
     * are referenced by the KS_round / KS_round_last / KS_BLOCK macros and
     * must not be renamed. */
    ROUND_KEYS* keyptr = ks;
    __m128i keyA, keyB, keyC, keyD, con, x2, keyA_aux, keyB_aux, keyC_aux, keyD_aux, globAux;
    const __m128i mask = _mm_loadu_si128((__m128i const*)rotword_sel);
    const __m128i con3 = _mm_loadu_si128((__m128i const*)prefix_sel);
    int i;

    for (i = 0; i < nkeys; i += 4) {
        keyptr[0].nr = 10;
        keyptr[1].nr = 10;
        keyptr[2].nr = 10;
        keyptr[3].nr = 10;

        /* Round key 0 is the key itself. */
        keyA = _mm_loadu_si128((__m128i const*)(key_bytes));
        keyB = _mm_loadu_si128((__m128i const*)(key_bytes + 16));
        keyC = _mm_loadu_si128((__m128i const*)(key_bytes + 32));
        keyD = _mm_loadu_si128((__m128i const*)(key_bytes + 48));
        _mm_storeu_si128((__m128i *)keyptr[0].KEY, keyA);
        _mm_storeu_si128((__m128i *)keyptr[1].KEY, keyB);
        _mm_storeu_si128((__m128i *)keyptr[2].KEY, keyC);
        _mm_storeu_si128((__m128i *)keyptr[3].KEY, keyD);

        /* Rounds 1..8 use round constants 1, 2, 4, ..., 0x80; each KS_round
         * doubles `con` after using it. */
        con = _mm_loadu_si128((__m128i const*)rcon_start);
        KS_round(1)
        KS_round(2)
        KS_round(3)
        KS_round(4)
        KS_round(5)
        KS_round(6)
        KS_round(7)
        KS_round(8)

        /* The doubling no longer fits a byte after 0x80, so the constant is
         * reloaded: 0x1b for round 9, doubled to 0x36 for round 10. */
        con = _mm_loadu_si128((__m128i const*)rcon_wrap);
        KS_round(9)
        KS_round_last(10)

        keyptr += 4;
        key_bytes += 64;
    }
}
/*
 * Encrypts plaintext blocks with eight key schedules in parallel
 * (10-round schedule: round keys 0..10).
 *
 *   PT         - input plaintext, consumed sequentially in 128-byte groups.
 *   CT         - output ciphertext, written sequentially in 128-byte groups.
 *   n_aesiters - number of 128-byte groups processed per group of eight keys.
 *   nkeys      - number of key schedules in ks; processed eight at a time.
 *   ks         - array of expanded key schedules.
 *
 * Within each 128-byte group, the k-th 16-byte block (k = 0..7) is encrypted
 * under key schedule (i + k), where i is the index of the current key group.
 * PT and CT keep advancing across key groups, so in total
 * ceil(nkeys / 8) * n_aesiters * 128 bytes are read and written.
 *
 * If nkeys is not a multiple of eight, the last iteration still accesses
 * eight key schedules, so ks must be sized accordingly.
 */
void intrin_sequential_enc8(const unsigned char* PT, unsigned char* CT, int n_aesiters, int nkeys, ROUND_KEYS* ks){
    /* keyptr, keyA..keyH and block1..block8 are referenced by the READ_KEYS /
     * ENC_round / ENC_round_last macros and must not be renamed. */
    ROUND_KEYS* keyptr = ks;
    __m128i keyA, keyB, keyC, keyD, keyE, keyF, keyG, keyH;
    __m128i block1, block2, block3, block4, block5, block6, block7, block8;
    int i, j;

    for (i = 0; i < nkeys; i += 8) {
        for (j = 0; j < n_aesiters; j++) {
            block1 = _mm_loadu_si128((__m128i const*)(PT + 0 * 16));
            block2 = _mm_loadu_si128((__m128i const*)(PT + 1 * 16));
            block3 = _mm_loadu_si128((__m128i const*)(PT + 2 * 16));
            block4 = _mm_loadu_si128((__m128i const*)(PT + 3 * 16));
            block5 = _mm_loadu_si128((__m128i const*)(PT + 4 * 16));
            block6 = _mm_loadu_si128((__m128i const*)(PT + 5 * 16));
            block7 = _mm_loadu_si128((__m128i const*)(PT + 6 * 16));
            block8 = _mm_loadu_si128((__m128i const*)(PT + 7 * 16));

            /* Initial whitening: XOR each block with its round key 0. */
            READ_KEYS(0)
            block1 = _mm_xor_si128(keyA, block1);
            block2 = _mm_xor_si128(keyB, block2);
            block3 = _mm_xor_si128(keyC, block3);
            block4 = _mm_xor_si128(keyD, block4);
            block5 = _mm_xor_si128(keyE, block5);
            block6 = _mm_xor_si128(keyF, block6);
            block7 = _mm_xor_si128(keyG, block7);
            block8 = _mm_xor_si128(keyH, block8);

            ENC_round(1)
            ENC_round(2)
            ENC_round(3)
            ENC_round(4)
            ENC_round(5)
            ENC_round(6)
            ENC_round(7)
            ENC_round(8)
            ENC_round(9)
            ENC_round_last(10)

            _mm_storeu_si128((__m128i *)(CT + 0 * 16), block1);
            _mm_storeu_si128((__m128i *)(CT + 1 * 16), block2);
            _mm_storeu_si128((__m128i *)(CT + 2 * 16), block3);
            _mm_storeu_si128((__m128i *)(CT + 3 * 16), block4);
            _mm_storeu_si128((__m128i *)(CT + 4 * 16), block5);
            _mm_storeu_si128((__m128i *)(CT + 5 * 16), block6);
            _mm_storeu_si128((__m128i *)(CT + 6 * 16), block7);
            _mm_storeu_si128((__m128i *)(CT + 7 * 16), block8);

            PT += 128;
            CT += 128;
        }
        keyptr += 8;
    }
}
void intrin_sequential_gen_rnd8(unsigned char* ctr_buf, const unsigned long long ctr, unsigned char* CT,
int n_aesiters, int nkeys, ROUND_KEYS* ks){
ROUND_KEYS *keyptr=(ROUND_KEYS *)ks;
register __m128i keyA, keyB, keyC, keyD, keyE, keyF, keyG, keyH, con, mask, x2, keyA_aux, keyB_aux, keyC_aux, keyD_aux, globAux;
unsigned char *ctptr;
int i, j, ctoffset;
unsigned long long* tmpctr = (unsigned long long*) ctr_buf;
ctoffset = n_aesiters * 16;
register __m128i inblock, block1, block2, block3, block4, block5, block6, block7, block8;
for (i=0;i<nkeys;i+=8){
ctptr=CT + i*ctoffset;
(*tmpctr) = ctr;
for(j=0;j<n_aesiters; j++) {
(*tmpctr)++;
inblock = _mm_loadu_si128((__m128i const*)(ctr_buf));
READ_KEYS(0)
block1 = _mm_xor_si128(keyA, inblock);
block2 = _mm_xor_si128(keyB, inblock);
block3 = _mm_xor_si128(keyC, inblock);
block4 = _mm_xor_si128(keyD, inblock);
block5 = _mm_xor_si128(keyE, inblock);
block6 = _mm_xor_si128(keyF, inblock);
block7 = _mm_xor_si128(keyG, inblock);
block8 = _mm_xor_si128(keyH, inblock);
ENC_round(1)
ENC_round(2)
ENC_round(3)
ENC_round(4)
ENC_round(5)
ENC_round(6)
ENC_round(7)
ENC_round(8)
ENC_round(9)
ENC_round_last(10)
_mm_storeu_si128((__m128i *)(ctptr+0*ctoffset), block1);
_mm_storeu_si128((__m128i *)(ctptr+1*ctoffset), block2);
_mm_storeu_si128((__m128i *)(ctptr+2*ctoffset), block3);
_mm_storeu_si128((__m128i *)(ctptr+3*ctoffset), block4);
_mm_storeu_si128((__m128i *)(ctptr+4*ctoffset), block5);
_mm_storeu_si128((__m128i *)(ctptr+5*ctoffset), block6);
_mm_storeu_si128((__m128i *)(ctptr+6*ctoffset), block7);
_mm_storeu_si128((__m128i *)(ctptr+7*ctoffset), block8);
ctptr+=16;
}
keyptr+=8;
}
}
#endif

View File

@ -16,14 +16,39 @@
#ifdef AES256_HASH
#include <stdint.h>
#include <stdio.h>
#include <wmmintrin.h>
/*
 * ALIGN16: declaration prefix requesting 16-byte alignment. Defined only if
 * not already provided; uses the GCC attribute syntax when __GNUC__ is
 * defined and the __declspec(align) syntax otherwise.
 */
#if !defined (ALIGN16)
#if defined (__GNUC__)
# define ALIGN16 __attribute__ ( (aligned (16)))
# else
# define ALIGN16 __declspec (align (16))
# endif
#endif
#if defined(__INTEL_COMPILER)
# include <ia32intrin.h>
#elif defined(__GNUC__)
# include <emmintrin.h>
# include <smmintrin.h>
#endif
/*
 * Expanded key schedule for a single key, as filled in by
 * intrin_sequential_ks4 and consumed by the intrin_sequential_* routines.
 */
typedef struct KEY_SCHEDULE
{
/* Round keys stored back to back, 16 bytes each (round key r at offset r*16);
 * sized for up to 15 round keys and declared 16-byte aligned. */
ALIGN16 unsigned char KEY[16*15];
/* Presumably the number of cipher rounds; intrin_sequential_ks4 sets it to 10. */
unsigned int nr;
} ROUND_KEYS;
#ifdef __cplusplus
extern "C" {
#endif
void intrin_sequential_gen_rnd8(unsigned char* ctr_buf, const unsigned long long ctr, unsigned char* CT,
int n_aesiters, int nkeys, unsigned char* ks, unsigned char* TEMP_BUF);
void intrin_sequential_ksn(unsigned char* ks, unsigned char* key_bytes, int nkeys);
void intrin_sequential_enc8(const unsigned char* PT, unsigned char* CT, int aes_niters, int nkeys, unsigned char* ks, unsigned char* TEMP_BUF);
int n_aesiters, int nkeys, ROUND_KEYS* ks);
void intrin_sequential_ks4(ROUND_KEYS* ks, unsigned char* key_bytes, int nkeys);
void intrin_sequential_enc8(const unsigned char* PT, unsigned char* CT, int aes_niters, int nkeys, ROUND_KEYS* ks);
#ifdef __cplusplus
};

View File

@ -276,6 +276,7 @@ void OTExtension1ooNECCReceiver::HashValues(CBitVector& T, CBitVector& seedbuf,
AES_encryptC(&inblock, &outblock, &tk_aeskey);
_mm_storeu_si128((__m128i *)(bufptr), outblock);
#else
cout << "hashing" << endl;
m_cCrypto->hash_ctr(bufptr, AES_BYTES, Tptr, m_nCodeWordBytes, i);
#endif
@ -517,8 +518,17 @@ void OTExtension1ooNECCSender::BuildQMatrix(CBitVector& T, CBitVector& RcvBuf, u
{
uint8_t* rcvbufptr = RcvBuf.GetArr();
uint8_t* Tptr = T.GetArr();
uint32_t* counter = (uint32_t*) ctr_buf;
uint32_t tempctr = *counter;
uint64_t* counter = (uint64_t*) ctr_buf;
uint64_t tempctr = *counter;
#ifdef AES256_HASH
intrin_sequential_gen_rnd8(ctr_buf, tempctr, Tptr, (int) 2*numblocks, (int) m_nCodeWordBits, m_vKeySeeds);
for (uint32_t k = 0; k < m_nCodeWordBits; k++, rcvbufptr += (m_nCodeWordBytes * numblocks)) {
if(m_nU.GetBit(k)){
T.XORBytes(rcvbufptr, k*m_nCodeWordBytes * numblocks, m_nCodeWordBytes * numblocks);
}
}
#else
for (uint32_t k = 0; k < m_nCodeWordBits; k++, rcvbufptr += (m_nCodeWordBytes * numblocks))
{
*counter = tempctr;
@ -531,6 +541,7 @@ void OTExtension1ooNECCSender::BuildQMatrix(CBitVector& T, CBitVector& RcvBuf, u
T.XORBytes(rcvbufptr, k*m_nCodeWordBytes * numblocks, m_nCodeWordBytes * numblocks);
}
}
#endif
}
void OTExtension1ooNECCSender::MaskInputs(CBitVector& Q, CBitVector* seedbuf, CBitVector* snd_buf, uint32_t ctr, uint32_t processedOTs)

View File

@ -248,6 +248,7 @@ void OTExtensionReceiver::HashValues(CBitVector& T, CBitVector& seedbuf, uint32_
//MPC_HASH_UPDATE(&sha, Tptr, m_nSymSecParam>>3);
//MPC_HASH_FINAL(&sha, hash_buf);
//}
cout << "Hashing here" << endl;
memcpy(inbuf, &i, sizeof(uint32_t));
memcpy(inbuf+sizeof(uint32_t), Tptr, m_nSymSecParam>>3);
m_cCrypto->hash(hash_buf, aes_key_bytes, inbuf, hashinbytelen);
@ -573,6 +574,10 @@ void OTExtensionSender::BuildQMatrix(CBitVector& T, CBitVector& RcvBuf, uint32_t
uint32_t dummy;
uint32_t* counter = (uint32_t*) ctr_buf;
uint32_t tempctr = *counter;
#ifdef AES256_HASH
cerr << "Not supported atm. Exiting." << endl;
exit(0);
#else
for (uint32_t k = 0; k < m_nSymSecParam; k++, rcvbufptr += (OTEXT_BLOCK_SIZE_BYTES * numblocks))
{
*counter = tempctr;
@ -586,6 +591,7 @@ void OTExtensionSender::BuildQMatrix(CBitVector& T, CBitVector& RcvBuf, uint32_t
T.XORBytes(rcvbufptr, k*OTEXT_BLOCK_SIZE_BYTES * numblocks, OTEXT_BLOCK_SIZE_BYTES * numblocks);
}
}
#endif
}
void OTExtensionSender::MaskInputs(CBitVector& Q, CBitVector* seedbuf, CBitVector* snd_buf, uint32_t ctr, uint32_t processedOTs)

View File

@ -96,9 +96,13 @@ class OTExtensionSender {
m_cCrypto = crypt;
m_nSymSecParam = m_cCrypto->get_seclvl().symbits;
m_vValues = (CBitVector*) malloc(sizeof(CBitVector) * nSndVals);
#ifdef AES256_HASH
m_vKeySeeds = (ROUND_KEYS*) malloc(sizeof(ROUND_KEYS) * nbaseOTs);
intrin_sequential_ks4(m_vKeySeeds, keybytes, (int) nbaseOTs);
#else
m_vKeySeeds = (AES_KEY_CTX*) malloc(sizeof(AES_KEY_CTX) * nbaseOTs);
InitAESKey(m_vKeySeeds, keybytes, nbaseOTs);
#endif
m_lSendLock = new CLock;
@ -136,7 +140,11 @@ class OTExtensionSender {
CBitVector m_nU;
CBitVector* m_vValues;
MaskingFunction* m_fMaskFct;
#ifdef AES256_HASH
ROUND_KEYS* m_vKeySeeds;
#else
AES_KEY_CTX* m_vKeySeeds;
#endif
OTBlock* m_sBlockHead;
OTBlock* m_sBlockTail;
CLock* m_lSendLock;