Added TedKrovetz AES NI routines for faster hashing. Requires AES-NI

This commit is contained in:
Michael Zohner 2016-01-27 19:21:27 +01:00
parent 5a1fec1bcd
commit a35bae608e
3 changed files with 701 additions and 0 deletions

41
src/util/crypto/Config.h Normal file
View File

@ -0,0 +1,41 @@
/**
* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
*
* Copyright (c) 2012 - SCAPI (http://crypto.biu.ac.il/scapi)
* This file is part of the SCAPI project.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* We request that any publication and/or code referring to and/or based on SCAPI contain an appropriate citation to SCAPI, including a reference to
* http://crypto.biu.ac.il/SCAPI.
*
* SCAPI uses Crypto++, Miracl, NTL and Bouncy Castle. Please see these projects for any further licensing issues.
* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
*
*/
#include <emmintrin.h>
/**
* Shared configuration header: declares the 128-bit block type and the gate / block
* constants that other files in the project rely on.
*
* author: Cryptography and Computer Security Research Group Department of Computer Science Bar-Ilan University (Meital Levy)
*/
// A block is one 128-bit SSE integer vector (__m128i comes from <emmintrin.h>).
typedef __m128i block;
#define SIZE_OF_BLOCK 16//size of one block in bytes
#define XOR_GATE 6//gate code whose truth table is 0110
#define XOR_NOT_GATE 9//gate code whose truth table is 1001; can also use the FreeXor optimization
#define ONE_GATE 15//gate code whose truth table is 1111
#define ZERO_BLOCK _mm_setzero_si128()//expands to a call that yields an all-zero block

View File

@ -0,0 +1,578 @@
#include "TedKrovetzAesNiWrapperC.h"
#ifdef AES256_HASH
#ifdef _WIN32
#include "StdAfx.h"
#endif
void AES_128_Key_Expansion(const unsigned char *userkey, AES_KEY *aesKey)
{
block x0,x1,x2;
//block *kp = (block *)&aesKey;
aesKey->rd_key[0] = x0 = _mm_loadu_si128((block*)userkey);
x2 = _mm_setzero_si128();
EXPAND_ASSIST(x0, x1, x2, x0, 255, 1); aesKey->rd_key[1] = x0;
EXPAND_ASSIST(x0, x1, x2, x0, 255, 2); aesKey->rd_key[2] = x0;
EXPAND_ASSIST(x0, x1, x2, x0, 255, 4); aesKey->rd_key[3] = x0;
EXPAND_ASSIST(x0, x1, x2, x0, 255, 8); aesKey->rd_key[4] = x0;
EXPAND_ASSIST(x0, x1, x2, x0, 255, 16); aesKey->rd_key[5] = x0;
EXPAND_ASSIST(x0, x1, x2, x0, 255, 32); aesKey->rd_key[6] = x0;
EXPAND_ASSIST(x0, x1, x2, x0, 255, 64); aesKey->rd_key[7] = x0;
EXPAND_ASSIST(x0, x1, x2, x0, 255, 128); aesKey->rd_key[8] = x0;
EXPAND_ASSIST(x0, x1, x2, x0, 255, 27); aesKey->rd_key[9] = x0;
EXPAND_ASSIST(x0, x1, x2, x0, 255, 54); aesKey->rd_key[10] = x0;
}
/**
 * Derives the AES-192 key schedule and stores it in aesKey->rd_key[0..12].
 *
 * userkey: key material; this routine performs two 16-byte unaligned loads,
 *          i.e. it reads userkey[0..31].
 *          NOTE(review): a 192-bit key is only 24 bytes, so the caller's
 *          buffer must have 32 readable bytes - confirm at call sites.
 * aesKey:  receives the round keys; the rounds field is not written here.
 *
 * The local names x0, x1, x2, x3, tmp and kp must keep exactly these names
 * because EXPAND192_STEP refers to them directly.
 */
void AES_192_Key_Expansion(const unsigned char *userkey, AES_KEY *aesKey)
{
	/* kp must point at the round-key array. The previous code used
	 * (block *)&aesKey, i.e. the address of the pointer parameter itself,
	 * so every round key was written over the stack instead of the key. */
	__m128i x0,x1,x2,x3,tmp,*kp = aesKey->rd_key;
	kp[0] = x0 = _mm_loadu_si128((block*)userkey);
	tmp = x3 = _mm_loadu_si128((block*)(userkey+16));
	x2 = _mm_setzero_si128();
	/* Each step writes kp[idx], kp[idx+1] and kp[idx+2]. */
	EXPAND192_STEP(1,1);
	EXPAND192_STEP(4,4);
	EXPAND192_STEP(7,16);
	EXPAND192_STEP(10,64);
}
void AES_256_Key_Expansion(const unsigned char *userkey, AES_KEY *aesKey)
{
__m128i x0, x1, x2, x3;/* , *kp = (block *)&aesKey;*/
aesKey->rd_key[0] = x0 = _mm_loadu_si128((block*)userkey);
aesKey->rd_key[1] = x3 = _mm_loadu_si128((block*)(userkey + 16));
x2 = _mm_setzero_si128();
EXPAND_ASSIST(x0, x1, x2, x3, 255, 1); aesKey->rd_key[2] = x0;
EXPAND_ASSIST(x3, x1, x2, x0, 170, 1); aesKey->rd_key[3] = x3;
EXPAND_ASSIST(x0, x1, x2, x3, 255, 2); aesKey->rd_key[4] = x0;
EXPAND_ASSIST(x3, x1, x2, x0, 170, 2); aesKey->rd_key[5] = x3;
EXPAND_ASSIST(x0, x1, x2, x3, 255, 4); aesKey->rd_key[6] = x0;
EXPAND_ASSIST(x3, x1, x2, x0, 170, 4); aesKey->rd_key[7] = x3;
EXPAND_ASSIST(x0, x1, x2, x3, 255, 8); aesKey->rd_key[8] = x0;
EXPAND_ASSIST(x3, x1, x2, x0, 170, 8); aesKey->rd_key[9] = x3;
EXPAND_ASSIST(x0, x1, x2, x3, 255, 16); aesKey->rd_key[10] = x0;
EXPAND_ASSIST(x3, x1, x2, x0, 170, 16); aesKey->rd_key[11] = x3;
EXPAND_ASSIST(x0, x1, x2, x3, 255, 32); aesKey->rd_key[12] = x0;
EXPAND_ASSIST(x3, x1, x2, x0, 170, 32); aesKey->rd_key[13] = x3;
EXPAND_ASSIST(x0, x1, x2, x3, 255, 64); aesKey->rd_key[14] = x0;
}
/**
 * Builds the encryption key schedule for a 128-, 192- or 256-bit key.
 *
 * userKey: raw key bytes handed to the matching expansion routine.
 * bits:    key size in bits; for any value other than 128, 192 or 256 no
 *          expansion is performed.
 * aesKey:  receives the round keys; rounds is always set to 6 + bits / 32
 *          (10, 12 or 14 for the supported sizes).
 */
void AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *aesKey)
{
	switch (bits) {
	case 128:
		AES_128_Key_Expansion(userKey, aesKey);
		break;
	case 192:
		AES_192_Key_Expansion(userKey, aesKey);
		break;
	case 256:
		AES_256_Key_Expansion(userKey, aesKey);
		break;
	default:
		/* Unsupported size: the round keys are left untouched. */
		break;
	}

	aesKey->rounds = 6 + bits / 32;
}
/**
 * Encrypts a single block with an expanded key.
 *
 * in:     source block; read with an aligned load (_mm_load_si128).
 * out:    destination block; written with an aligned store (_mm_store_si128).
 * aesKey: expanded key; ROUNDS(aesKey) selects how many rounds are run.
 */
void AES_encryptC(block *in, block *out, AES_KEY *aesKey)
{
	const __m128i *roundKeys = ((__m128i *)(aesKey->rd_key));
	int lastRound = ROUNDS(aesKey);
	int r = 1;
	__m128i state;

	/* Initial whitening with round key 0. */
	state = _mm_load_si128((__m128i*)in);
	state = _mm_xor_si128(state, roundKeys[0]);

	/* Middle rounds 1 .. lastRound-1. */
	while (r < lastRound) {
		state = _mm_aesenc_si128(state, roundKeys[r]);
		r++;
	}

	/* Final round uses the key at the index the loop stopped on. */
	state = _mm_aesenclast_si128(state, roundKeys[r]);
	_mm_store_si128((__m128i*)out, state);
}
/**
 * Encrypts one block in place.
 *
 * blk:    block that is read, transformed and overwritten with its ciphertext.
 * aesKey: expanded key; ROUNDS(aesKey) selects how many rounds are run.
 */
void AES_ecb_encrypt(block *blk, AES_KEY *aesKey) {
	const block *roundKeys = ((block *)(aesKey->rd_key));
	unsigned lastRound = ROUNDS(aesKey);
	unsigned r = 1;

	/* Whitening with round key 0. */
	*blk = _mm_xor_si128(*blk, roundKeys[0]);

	/* Middle rounds. */
	while (r < lastRound) {
		*blk = _mm_aesenc_si128(*blk, roundKeys[r]);
		++r;
	}

	/* Final round with the key at the index the loop stopped on. */
	*blk = _mm_aesenclast_si128(*blk, roundKeys[r]);
}
/**
 * Encrypts nblks blocks in place, processing round by round: every block
 * receives round r before any block receives round r+1.
 *
 * blks:   array of nblks blocks, overwritten with their ciphertexts.
 * nblks:  number of blocks in the array.
 * aesKey: expanded key; ROUNDS(aesKey) selects how many rounds are run.
 */
void AES_ecb_encrypt_blks(block *blks, unsigned nblks, AES_KEY *aesKey) {
	const block *roundKeys = ((block *)(aesKey->rd_key));
	unsigned lastRound = ROUNDS(aesKey);
	unsigned r = 1;
	unsigned b;

	/* Whitening with round key 0. */
	for (b = 0; b < nblks; ++b) {
		blks[b] = _mm_xor_si128(blks[b], roundKeys[0]);
	}

	/* Middle rounds, outer loop over rounds, inner loop over blocks. */
	while (r < lastRound) {
		for (b = 0; b < nblks; ++b) {
			blks[b] = _mm_aesenc_si128(blks[b], roundKeys[r]);
		}
		++r;
	}

	/* Final round with the key at the index the round loop stopped on. */
	for (b = 0; b < nblks; ++b) {
		blks[b] = _mm_aesenclast_si128(blks[b], roundKeys[r]);
	}
}
/**
 * Encrypts exactly four blocks in place under one key, interleaving the four
 * blocks within every round.
 *
 * blks:   array of four blocks, overwritten with their ciphertexts.
 * aesKey: expanded key; ROUNDS(aesKey) selects how many rounds are run.
 */
void AES_ecb_encrypt_blks_4(block *blks, AES_KEY *aesKey) {
	const block *roundKeys = ((block *)(aesKey->rd_key));
	unsigned lastRound = ROUNDS(aesKey);
	unsigned r = 1;

	/* Whitening with round key 0. */
	blks[0] = _mm_xor_si128(blks[0], roundKeys[0]);
	blks[1] = _mm_xor_si128(blks[1], roundKeys[0]);
	blks[2] = _mm_xor_si128(blks[2], roundKeys[0]);
	blks[3] = _mm_xor_si128(blks[3], roundKeys[0]);

	/* Middle rounds, kept unrolled across the four blocks. */
	while (r < lastRound) {
		blks[0] = _mm_aesenc_si128(blks[0], roundKeys[r]);
		blks[1] = _mm_aesenc_si128(blks[1], roundKeys[r]);
		blks[2] = _mm_aesenc_si128(blks[2], roundKeys[r]);
		blks[3] = _mm_aesenc_si128(blks[3], roundKeys[r]);
		++r;
	}

	/* Final round with the key at the index the loop stopped on. */
	blks[0] = _mm_aesenclast_si128(blks[0], roundKeys[r]);
	blks[1] = _mm_aesenclast_si128(blks[1], roundKeys[r]);
	blks[2] = _mm_aesenclast_si128(blks[2], roundKeys[r]);
	blks[3] = _mm_aesenclast_si128(blks[3], roundKeys[r]);
}
/**
 * Encrypts two blocks under one key, reading from in and writing to out.
 *
 * in:     two source blocks; each is read once, for the initial whitening.
 * out:    two destination blocks; also used to hold the intermediate state.
 * aesKey: expanded key; ROUNDS(aesKey) selects how many rounds are run.
 */
void AES_ecb_encrypt_blks_2_in_out(block *in, block *out, AES_KEY *aesKey) {
	const block *roundKeys = ((block *)(aesKey->rd_key));
	unsigned lastRound = ROUNDS(aesKey);
	unsigned r = 1;

	/* Whitening with round key 0. */
	out[0] = _mm_xor_si128(in[0], roundKeys[0]);
	out[1] = _mm_xor_si128(in[1], roundKeys[0]);

	/* Middle rounds. */
	while (r < lastRound) {
		out[0] = _mm_aesenc_si128(out[0], roundKeys[r]);
		out[1] = _mm_aesenc_si128(out[1], roundKeys[r]);
		++r;
	}

	/* Final round with the key at the index the loop stopped on. */
	out[0] = _mm_aesenclast_si128(out[0], roundKeys[r]);
	out[1] = _mm_aesenclast_si128(out[1], roundKeys[r]);
}
/**
 * Encrypts four blocks under one key, reading from in and writing to out.
 *
 * in:     four source blocks; each is read once, for the initial whitening.
 * out:    four destination blocks; also used to hold the intermediate state.
 * aesKey: expanded key; ROUNDS(aesKey) selects how many rounds are run.
 */
void AES_ecb_encrypt_blks_4_in_out(block *in, block *out, AES_KEY *aesKey) {
	const block *roundKeys = ((block *)(aesKey->rd_key));
	unsigned lastRound = ROUNDS(aesKey);
	unsigned r = 1;

	/* Whitening with round key 0. */
	out[0] = _mm_xor_si128(in[0], roundKeys[0]);
	out[1] = _mm_xor_si128(in[1], roundKeys[0]);
	out[2] = _mm_xor_si128(in[2], roundKeys[0]);
	out[3] = _mm_xor_si128(in[3], roundKeys[0]);

	/* Middle rounds, kept unrolled across the four blocks. */
	while (r < lastRound) {
		out[0] = _mm_aesenc_si128(out[0], roundKeys[r]);
		out[1] = _mm_aesenc_si128(out[1], roundKeys[r]);
		out[2] = _mm_aesenc_si128(out[2], roundKeys[r]);
		out[3] = _mm_aesenc_si128(out[3], roundKeys[r]);
		++r;
	}

	/* Final round with the key at the index the loop stopped on. */
	out[0] = _mm_aesenclast_si128(out[0], roundKeys[r]);
	out[1] = _mm_aesenclast_si128(out[1], roundKeys[r]);
	out[2] = _mm_aesenclast_si128(out[2], roundKeys[r]);
	out[3] = _mm_aesenclast_si128(out[3], roundKeys[r]);
}
/**
 * Encrypts four blocks, each under its own expanded key.
 *
 * in:     four source blocks; each is read once, for the initial whitening.
 * out:    four destination blocks; also used to hold the intermediate state.
 * aesKey: aesKey[0] is indexed as an array of four AES_KEY entries, one per
 *         block; the round count is taken from the first entry only.
 * sched:  caller-provided array of four block pointers; this function
 *         overwrites sched[0..3] with the addresses of the four round-key arrays.
 */
void AES_ecb_encrypt_blks_4_in_out_ind_keys(block *in, block *out, AES_KEY **aesKey, block** sched) {
	unsigned lastRound = ROUNDS(aesKey[0]);
	unsigned r = 1;

	/* Publish the four key schedules through the caller's pointer array. */
	sched[0] = ((block *)(aesKey[0][0].rd_key));
	sched[1] = ((block *)(aesKey[0][1].rd_key));
	sched[2] = ((block *)(aesKey[0][2].rd_key));
	sched[3] = ((block *)(aesKey[0][3].rd_key));

	/* Whitening: block i uses round key 0 of schedule i. */
	out[0] = _mm_xor_si128(in[0], sched[0][0]);
	out[1] = _mm_xor_si128(in[1], sched[1][0]);
	out[2] = _mm_xor_si128(in[2], sched[2][0]);
	out[3] = _mm_xor_si128(in[3], sched[3][0]);

	/* Middle rounds. */
	while (r < lastRound) {
		out[0] = _mm_aesenc_si128(out[0], sched[0][r]);
		out[1] = _mm_aesenc_si128(out[1], sched[1][r]);
		out[2] = _mm_aesenc_si128(out[2], sched[2][r]);
		out[3] = _mm_aesenc_si128(out[3], sched[3][r]);
		++r;
	}

	/* Final round with the key at the index the loop stopped on. */
	out[0] = _mm_aesenclast_si128(out[0], sched[0][r]);
	out[1] = _mm_aesenclast_si128(out[1], sched[1][r]);
	out[2] = _mm_aesenclast_si128(out[2], sched[2][r]);
	out[3] = _mm_aesenclast_si128(out[3], sched[3][r]);
}
/**
 * AES-128 encryption of four blocks under four different raw keys, with the
 * key schedule computed on the fly: each round key is derived immediately
 * before it is used and is never stored in an AES_KEY.
 *
 * in:      four source blocks; each is read once, for the initial whitening.
 * out:     four destination blocks; also used to hold the intermediate state.
 * userkey: 64 bytes of key material; block i uses the 16 bytes at
 *          userkey + 16*i (unaligned loads).
 */
void AES_ecb_encrypt_blks_4_in_out_par_ks(block *in, block *out, const unsigned char* userkey) {
	block rk0, rk1, rk2, rk3;              /* current round key of each lane */
	block carry0, carry1, carry2, carry3;  /* per-lane scratch carried between EXPAND_ASSIST steps */
	block assist;                          /* scratch shared by all lanes (aeskeygenassist result) */

/* One cipher round on all four lanes: derive the next round key of a lane
 * from its current one using round constant rcon, then apply cipher_step
 * (_mm_aesenc_si128 or _mm_aesenclast_si128) to that lane. */
#define PAR_KS128_ROUND(rcon, cipher_step) \
	EXPAND_ASSIST(rk0, assist, carry0, rk0, 255, rcon); \
	out[0] = cipher_step(out[0], rk0); \
	EXPAND_ASSIST(rk1, assist, carry1, rk1, 255, rcon); \
	out[1] = cipher_step(out[1], rk1); \
	EXPAND_ASSIST(rk2, assist, carry2, rk2, 255, rcon); \
	out[2] = cipher_step(out[2], rk2); \
	EXPAND_ASSIST(rk3, assist, carry3, rk3, 255, rcon); \
	out[3] = cipher_step(out[3], rk3)

	/* Round 0: load each lane's raw key and whiten its block. */
	rk0 = _mm_loadu_si128((block*)userkey);
	out[0] = _mm_xor_si128(in[0], rk0);
	rk1 = _mm_loadu_si128((block*)(userkey+16));
	out[1] = _mm_xor_si128(in[1], rk1);
	rk2 = _mm_loadu_si128((block*)(userkey+32));
	out[2] = _mm_xor_si128(in[2], rk2);
	rk3 = _mm_loadu_si128((block*)(userkey+48));
	out[3] = _mm_xor_si128(in[3], rk3);

	carry0 = _mm_setzero_si128();
	carry1 = _mm_setzero_si128();
	carry2 = _mm_setzero_si128();
	carry3 = _mm_setzero_si128();

	/* Rounds 1-9 with round constants 1,2,4,...,128,27. */
	PAR_KS128_ROUND(1, _mm_aesenc_si128);
	PAR_KS128_ROUND(2, _mm_aesenc_si128);
	PAR_KS128_ROUND(4, _mm_aesenc_si128);
	PAR_KS128_ROUND(8, _mm_aesenc_si128);
	PAR_KS128_ROUND(16, _mm_aesenc_si128);
	PAR_KS128_ROUND(32, _mm_aesenc_si128);
	PAR_KS128_ROUND(64, _mm_aesenc_si128);
	PAR_KS128_ROUND(128, _mm_aesenc_si128);
	PAR_KS128_ROUND(27, _mm_aesenc_si128);

	/* Round 10: final round with round constant 54. */
	PAR_KS128_ROUND(54, _mm_aesenclast_si128);

#undef PAR_KS128_ROUND
}
/**
 * AES-256 encryption of four blocks under four different raw keys, with the
 * key schedule computed on the fly: each round key is derived immediately
 * before it is used and is never stored in an AES_KEY.
 *
 * in:      four source blocks; each is read once, for the initial whitening.
 * out:     four destination blocks; also used to hold the intermediate state.
 * userkey: 128 bytes of key material; block i uses the 32 bytes at
 *          userkey + 32*i (first 16 bytes = round key 0, next 16 bytes =
 *          round key 1; unaligned loads).
 */
void AES256_ecb_encrypt_blks_4_in_out_par_ks(block *in, block *out, const unsigned char* userkey) {
	block ev0, ev1, ev2, ev3;              /* latest even-indexed round key per lane */
	block od0, od1, od2, od3;              /* latest odd-indexed round key per lane */
	block carry0, carry1, carry2, carry3;  /* per-lane scratch carried between EXPAND_ASSIST steps */
	block assist;                          /* scratch shared by all lanes (aeskeygenassist result) */

/* One cipher round on all four lanes. cur/prev are the variable prefixes
 * (ev or od) of the key being produced and of the key it is derived from;
 * shuf is the EXPAND_ASSIST shuffle constant, rcon the round constant, and
 * cipher_step is _mm_aesenc_si128 or _mm_aesenclast_si128. */
#define PAR_KS256_ROUND(cur, prev, shuf, rcon, cipher_step) \
	EXPAND_ASSIST(cur##0, assist, carry0, prev##0, shuf, rcon); \
	out[0] = cipher_step(out[0], cur##0); \
	EXPAND_ASSIST(cur##1, assist, carry1, prev##1, shuf, rcon); \
	out[1] = cipher_step(out[1], cur##1); \
	EXPAND_ASSIST(cur##2, assist, carry2, prev##2, shuf, rcon); \
	out[2] = cipher_step(out[2], cur##2); \
	EXPAND_ASSIST(cur##3, assist, carry3, prev##3, shuf, rcon); \
	out[3] = cipher_step(out[3], cur##3)

	/* Round 0: first key half of every lane whitens its block. */
	ev0 = _mm_loadu_si128((block*)userkey);
	out[0] = _mm_xor_si128(in[0], ev0);
	ev1 = _mm_loadu_si128((block*)(userkey+32));
	out[1] = _mm_xor_si128(in[1], ev1);
	ev2 = _mm_loadu_si128((block*)(userkey+64));
	out[2] = _mm_xor_si128(in[2], ev2);
	ev3 = _mm_loadu_si128((block*)(userkey+96));
	out[3] = _mm_xor_si128(in[3], ev3);

	carry0 = _mm_setzero_si128();
	carry1 = _mm_setzero_si128();
	carry2 = _mm_setzero_si128();
	carry3 = _mm_setzero_si128();

	/* Round 1: second key half of every lane is used directly. */
	od0 = _mm_loadu_si128((block*)(userkey+16));
	out[0] = _mm_aesenc_si128(out[0], od0);
	od1 = _mm_loadu_si128((block*)(userkey+48));
	out[1] = _mm_aesenc_si128(out[1], od1);
	od2 = _mm_loadu_si128((block*)(userkey+80));
	out[2] = _mm_aesenc_si128(out[2], od2);
	od3 = _mm_loadu_si128((block*)(userkey+112));
	out[3] = _mm_aesenc_si128(out[3], od3);

	/* Rounds 2-13: even rounds update ev* (shuffle 255), odd rounds update
	 * od* (shuffle 170); each even/odd pair shares one round constant. */
	PAR_KS256_ROUND(ev, od, 255, 1, _mm_aesenc_si128);   /* round 2 */
	PAR_KS256_ROUND(od, ev, 170, 1, _mm_aesenc_si128);   /* round 3 */
	PAR_KS256_ROUND(ev, od, 255, 2, _mm_aesenc_si128);   /* round 4 */
	PAR_KS256_ROUND(od, ev, 170, 2, _mm_aesenc_si128);   /* round 5 */
	PAR_KS256_ROUND(ev, od, 255, 4, _mm_aesenc_si128);   /* round 6 */
	PAR_KS256_ROUND(od, ev, 170, 4, _mm_aesenc_si128);   /* round 7 */
	PAR_KS256_ROUND(ev, od, 255, 8, _mm_aesenc_si128);   /* round 8 */
	PAR_KS256_ROUND(od, ev, 170, 8, _mm_aesenc_si128);   /* round 9 */
	PAR_KS256_ROUND(ev, od, 255, 16, _mm_aesenc_si128);  /* round 10 */
	PAR_KS256_ROUND(od, ev, 170, 16, _mm_aesenc_si128);  /* round 11 */
	PAR_KS256_ROUND(ev, od, 255, 32, _mm_aesenc_si128);  /* round 12 */
	PAR_KS256_ROUND(od, ev, 170, 32, _mm_aesenc_si128);  /* round 13 */

	/* Round 14: final round. */
	PAR_KS256_ROUND(ev, od, 255, 64, _mm_aesenclast_si128);

#undef PAR_KS256_ROUND
}
/**
 * Encrypts nblks blocks from in to out under one key. Blocks are handled in
 * groups of eight that move through the rounds together; whatever is left
 * over (nblks modulo 8) is encrypted one block at a time.
 *
 * in:     nblks source blocks; each is read once, for the initial whitening.
 * out:    nblks destination blocks; also used to hold the intermediate state.
 * nblks:  number of blocks to encrypt.
 * aesKey: expanded key; ROUNDS(aesKey) selects how many rounds are run.
 */
void AES_ecb_encrypt_chunk_in_out(block *in, block *out, unsigned nblks, AES_KEY *aesKey) {
	int fullGroups = nblks / 8;
	int groupedBlocks = fullGroups * 8;
	int leftover = nblks - groupedBlocks;
	int tailEnd = groupedBlocks + leftover;
	unsigned r, lastRound = ROUNDS(aesKey);
	const block *roundKeys = ((block *)(aesKey->rd_key));

	/* Groups of eight blocks. */
	for (int g = 0; g < fullGroups; g++) {
		block *src = in + g * 8;
		block *dst = out + g * 8;

		/* Whitening with round key 0. */
		dst[0] = _mm_xor_si128(src[0], roundKeys[0]);
		dst[1] = _mm_xor_si128(src[1], roundKeys[0]);
		dst[2] = _mm_xor_si128(src[2], roundKeys[0]);
		dst[3] = _mm_xor_si128(src[3], roundKeys[0]);
		dst[4] = _mm_xor_si128(src[4], roundKeys[0]);
		dst[5] = _mm_xor_si128(src[5], roundKeys[0]);
		dst[6] = _mm_xor_si128(src[6], roundKeys[0]);
		dst[7] = _mm_xor_si128(src[7], roundKeys[0]);

		/* Middle rounds. */
		r = 1;
		while (r < lastRound) {
			dst[0] = _mm_aesenc_si128(dst[0], roundKeys[r]);
			dst[1] = _mm_aesenc_si128(dst[1], roundKeys[r]);
			dst[2] = _mm_aesenc_si128(dst[2], roundKeys[r]);
			dst[3] = _mm_aesenc_si128(dst[3], roundKeys[r]);
			dst[4] = _mm_aesenc_si128(dst[4], roundKeys[r]);
			dst[5] = _mm_aesenc_si128(dst[5], roundKeys[r]);
			dst[6] = _mm_aesenc_si128(dst[6], roundKeys[r]);
			dst[7] = _mm_aesenc_si128(dst[7], roundKeys[r]);
			++r;
		}

		/* Final round with the key at the index the loop stopped on. */
		dst[0] = _mm_aesenclast_si128(dst[0], roundKeys[r]);
		dst[1] = _mm_aesenclast_si128(dst[1], roundKeys[r]);
		dst[2] = _mm_aesenclast_si128(dst[2], roundKeys[r]);
		dst[3] = _mm_aesenclast_si128(dst[3], roundKeys[r]);
		dst[4] = _mm_aesenclast_si128(dst[4], roundKeys[r]);
		dst[5] = _mm_aesenclast_si128(dst[5], roundKeys[r]);
		dst[6] = _mm_aesenclast_si128(dst[6], roundKeys[r]);
		dst[7] = _mm_aesenclast_si128(dst[7], roundKeys[r]);
	}

	/* Remaining blocks, one at a time. */
	for (int idx = groupedBlocks; idx < tailEnd; ++idx) {
		out[idx] = _mm_xor_si128(in[idx], roundKeys[0]);
		r = 1;
		while (r < lastRound) {
			out[idx] = _mm_aesenc_si128(out[idx], roundKeys[r]);
			++r;
		}
		out[idx] = _mm_aesenclast_si128(out[idx], roundKeys[r]);
	}
}
#endif

View File

@ -0,0 +1,82 @@
/**
* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
*
* Copyright(c) 2013 Ted Krovetz.
* This file was taken from the SCAPI project, and was again taken from the file ocb.c written by Ted Krovetz.
* Some changes and additions may have been made and only part of the file written by Ted Krovetz has been copied
* only for the use of this project.
*
* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
*
*/
// Copyright(c) 2013 Ted Krovetz.
#ifndef TED_FILE
#define TED_FILE
#include "../typedefs.h"
#ifdef AES256_HASH
#include <wmmintrin.h>
#include "Config.h"
#include <iostream>
#include <stdlib.h>
using namespace std;
// Expanded AES key.
//   rd_key: storage for up to 15 round keys (AES_256_Key_Expansion fills indices 0..14).
//   rounds: number of cipher rounds; AES_set_encrypt_key sets it to 6 + bits / 32.
typedef struct { block rd_key[15]; int rounds; } AES_KEY;
// Reads the round count out of a pointer to an AES_KEY.
#define ROUNDS(ctx) ((ctx)->rounds)
// One key-expansion step.
//   v1          in/out: round key that is updated in place (the result is written here)
//   v2          scratch: receives the _mm_aeskeygenassist_si128 result
//   v3          scratch carried from call to call; its lowest 32-bit lane is copied
//               through unchanged by both shuffles and is XORed into v1, so callers
//               initialise v3 with _mm_setzero_si128() before the first step
//   v4          previous key fed to _mm_aeskeygenassist_si128
//   shuff_const immediate for _mm_shuffle_epi32 selecting which assist word is broadcast
//   aes_const   immediate round constant for _mm_aeskeygenassist_si128
// All arguments except the two constants must be modifiable lvalues, and both
// constants must be compile-time immediates.
// The body is wrapped in do { } while (0) so the macro expands to exactly one
// statement and stays correct under an unbraced if/else/for; callers still
// terminate it with ';' as before.
#define EXPAND_ASSIST(v1,v2,v3,v4,shuff_const,aes_const) \
	do { \
		v2 = _mm_aeskeygenassist_si128(v4, aes_const); \
		v3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(v3), \
		                                     _mm_castsi128_ps(v1), 16)); \
		v1 = _mm_xor_si128(v1,v3); \
		v3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(v3), \
		                                     _mm_castsi128_ps(v1), 140)); \
		v1 = _mm_xor_si128(v1,v3); \
		v2 = _mm_shuffle_epi32(v2,shuff_const); \
		v1 = _mm_xor_si128(v1,v2); \
	} while (0)
// One AES-192 key-expansion step; writes the three round keys kp[idx],
// kp[idx+1] and kp[idx+2].
//   idx        index of the first round key written by this step
//   aes_const  immediate round constant for the first half of the step; the
//              second half uses aes_const*2
// The macro is not hygienic: it reads and writes the variables x0, x1, x2,
// x3, tmp and kp of the enclosing scope, which must exist under exactly
// those names (see AES_192_Key_Expansion).
// The body is wrapped in do { } while (0) so the macro expands to exactly one
// statement; callers still terminate it with ';' as before.
#define EXPAND192_STEP(idx,aes_const) \
	do { \
		EXPAND_ASSIST(x0,x1,x2,x3,85,aes_const); \
		x3 = _mm_xor_si128(x3,_mm_slli_si128 (x3, 4)); \
		x3 = _mm_xor_si128(x3,_mm_shuffle_epi32(x0, 255)); \
		kp[idx] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), \
		                                          _mm_castsi128_ps(x0), 68)); \
		kp[idx+1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), \
		                                            _mm_castsi128_ps(x3), 78)); \
		EXPAND_ASSIST(x0,x1,x2,x3,85,(aes_const*2)); \
		x3 = _mm_xor_si128(x3,_mm_slli_si128 (x3, 4)); \
		x3 = _mm_xor_si128(x3,_mm_shuffle_epi32(x0, 255)); \
		kp[idx+2] = x0; tmp = x3; \
	} while (0)
// Key expansion for a 128-bit key: reads 16 bytes from userkey and fills
// aesKey->rd_key[0..10]. Does not set aesKey->rounds.
void AES_128_Key_Expansion(const unsigned char *userkey, AES_KEY* aesKey);
// Key expansion for a 192-bit key. Performs two 16-byte loads from userkey
// (bytes 0..31). Does not set aesKey->rounds.
void AES_192_Key_Expansion(const unsigned char *userkey, AES_KEY* aesKey);
// Key expansion for a 256-bit key: reads 32 bytes from userkey and fills
// aesKey->rd_key[0..14]. Does not set aesKey->rounds.
void AES_256_Key_Expansion(const unsigned char *userkey, AES_KEY* aesKey);
// Runs the expansion matching bits (128, 192 or 256; any other value performs
// no expansion) and sets aesKey->rounds to 6 + bits / 32.
void AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *aesKey);
// Encrypts one block from in to out; uses aligned load/store on in and out.
void AES_encryptC(block *in, block *out, AES_KEY *aesKey);
// Encrypts the single block *blk in place.
void AES_ecb_encrypt(block *blk, AES_KEY *aesKey);
// Encrypts nblks blocks in place, all under the same key.
void AES_ecb_encrypt_blks(block *blks, unsigned nblks, AES_KEY *aesKey);
// Encrypts exactly four blocks in place, all under the same key.
void AES_ecb_encrypt_blks_4(block *blk, AES_KEY *aesKey);
// Encrypts four blocks from in to out, all under the same key.
void AES_ecb_encrypt_blks_4_in_out(block *in, block *out, AES_KEY *aesKey);
// Encrypts four blocks from in to out, block i under key aesKey[0][i];
// sched[0..3] are overwritten with pointers to the four round-key arrays.
void AES_ecb_encrypt_blks_4_in_out_ind_keys(block *in, block *out, AES_KEY **aesKey, block** sched);
// AES-128 on four blocks with four raw 16-byte keys laid out back to back in
// userkey (64 bytes); round keys are derived while encrypting.
void AES_ecb_encrypt_blks_4_in_out_par_ks(block *in, block *out, const unsigned char* userkey);
// AES-256 on four blocks with four raw 32-byte keys laid out back to back in
// userkey (128 bytes); round keys are derived while encrypting.
void AES256_ecb_encrypt_blks_4_in_out_par_ks(block *in, block *out, const unsigned char* userkey);
// Encrypts two blocks from in to out, both under the same key.
void AES_ecb_encrypt_blks_2_in_out(block *in, block *out, AES_KEY *aesKey);
// Encrypts nblks blocks from in to out under one key, eight blocks at a time
// with a one-by-one tail for the remainder.
void AES_ecb_encrypt_chunk_in_out(block *in, block *out, unsigned nblks, AES_KEY *aesKey);
#endif
#endif