AESENC
对输入依次进行ShiftRows,SubBytes,MixColumns,AddRoundKey操作。其中SubBytes是对字节的操作,因此可以和ShiftRows互换,与上面的图比较,可以发现AESENC
恰好是上图的一个普通轮加密。AESENCLAST
对输入依次进行ShiftRows,SubBytes,AddRoundKey操作,相当于上图的尾轮加密。PXOR
指令完成,因此一个完整的AES加密过程如下(pt是明文,k[x]是轮密钥,ct是密文):
pxor pt, k[0]
aesenc pt, k[1]
aesenc pt, k[2]
...
aesenc pt, k[n-1]
aesenclast pt, k[n]
movdqa ct, pt
AESENC
+1轮AESENCLAST
这一点很容易记住,但第0个轮密钥是直接PXOR
这一点很容易被忽视掉,需要多加注意。AESDEC,AESDECLAST和AESIMC
AESDEC
指令不是AESENC
指令的逆过程,AESDECLAST
同样不是AESENCLAST
的逆过程。一个完整的AES解密过程如下(pt是明文,k[x]是轮密钥,k'[x]是对k[x]执行AESIMC(即InvMixColumns)后得到的等价解密轮密钥,ct是密文):
pxor ct, k[n]
aesdec ct, k'[n-1]
aesdec ct, k'[n-2]
...
aesdec ct, k'[1]
aesdeclast ct, k[0]
movdqa pt, ct
AESIMC
指令,该指令即为进行单个的InvMixColumns操作。AESKEYGENASSIST和PCLMULQDQ
AESKEYGENASSIST
用在密钥扩展中,具体的用法可以参考[设计白皮书]19页。PCLMULQDQ
全称Carry-Less Multiplication Quadword,是对两个64位操作数做无进位乘法,即把它们看作GF(2)上的多项式相乘,得到128位的乘积。PCLMULQDQ
本身并不属于AESNI指令集,但除了用于加速CRC32外,PCLMULQDQ
还能计算GCM的GMAC,因此经常出现在SIMD加密算法中。Libsodium中的AES-256-GCM实现就是一个完美的示例。AESNI的进阶用法
分离AES的4种操作
Isolating ShiftRows
PSHUFB xmm0, 0x0b06010c07020d08030e09040f0a0500
Isolating InvShiftRows
PSHUFB xmm0, 0x0306090c0f0205080b0e0104070a0d00
Isolating MixColumns
AESDECLAST xmm0, 0x00000000000000000000000000000000
AESENC xmm0, 0x00000000000000000000000000000000
Isolating InvMixColumns
AESENCLAST xmm0, 0x00000000000000000000000000000000
AESDEC xmm0, 0x00000000000000000000000000000000
Isolating SubBytes
PSHUFB xmm0, 0x0306090c0f0205080b0e0104070a0d00
AESENCLAST xmm0, 0x00000000000000000000000000000000
Isolating InvSubBytes
PSHUFB xmm0, 0x0b06010c07020d08030e09040f0a0500
AESDECLAST xmm0, 0x00000000000000000000000000000000
PSHUFB
指令完成,而SubBytes则是先反向shuffle,再用0密钥进行尾轮加密,消掉尾轮的另外两种操作。MixColumns则结合加密和解密,利用尾轮的特性将MixColumns保留下来。这个神奇的拼接方式令人啧啧称奇。AESIMC
操作,但如果已知等价解密密钥,如何获得加密密钥?AESNI里没有直接的MixColumns操作,但根据上文,可以用AESDECLAST
和AESENC
组合产生。AESIMC
的Latency和Throughput均是AESENC
的两倍,因此斗胆猜测AESIMC
内部也是AESENCLAST
和AESDEC
的拼接。用AESNI加速其它算法
#include <emmintrin.h>#include <smmintrin.h>void Rijndael256_encrypt(unsigned char* in, unsigned char* out, unsigned char* Key_Schedule, unsigned long long length, int number_of_rounds) {
__m128i tmp1, tmp2, data1, data2;
__m128i RIJNDAEL256_MASK =
_mm_set_epi32(0x03020d0c, 0x0f0e0908, 0x0b0a0504, 0x07060100);
__m128i BLEND_MASK =
_mm_set_epi32(0x80000000, 0x80800000, 0x80800000, 0x80808000);
__m128i* KS = (__m128i*)Key_Schedule; int i, j; for (i = 0; i < length / 32; i++) { /* loop over the data blocks */
data1 = _mm_loadu_si128(&((__m128i*)in)[i * 2 + 0]); /* load data block */
data2 = _mm_loadu_si128(&((__m128i*)in)[i * 2 + 1]);
data1 = _mm_xor_si128(data1, KS[0]); /* round 0 (initial xor) */
data2 = _mm_xor_si128(data2, KS[1]); /* Do number_of_rounds-1 AES rounds */
for (j = 1; j < number_of_rounds; j++) { /*Blend to compensate for the shift rows shifts bytes between two
128 bit blocks*/
tmp1 = _mm_blendv_epi8(data1, data2, BLEND_MASK);
tmp2 = _mm_blendv_epi8(data2, data1, BLEND_MASK); /*Shuffle that compensates for the additional shift in rows 3 and 4
as opposed to rijndael128 (AES)*/
tmp1 = _mm_shuffle_epi8(tmp1, RIJNDAEL256_MASK);
tmp2 = _mm_shuffle_epi8(tmp2, RIJNDAEL256_MASK); /*This is the encryption step that includes sub bytes, shift rows,
mix columns, xor with round key*/
data1 = _mm_aesenc_si128(tmp1, KS[j * 2]);
data2 = _mm_aesenc_si128(tmp2, KS[j * 2 + 1]);
}
tmp1 = _mm_blendv_epi8(data1, data2, BLEND_MASK);
tmp2 = _mm_blendv_epi8(data2, data1, BLEND_MASK);
tmp1 = _mm_shuffle_epi8(tmp1, RIJNDAEL256_MASK);
tmp2 = _mm_shuffle_epi8(tmp2, RIJNDAEL256_MASK);
tmp1 = _mm_aesenclast_si128(tmp1, KS[j * 2 + 0]); /*last AES round */
tmp2 = _mm_aesenclast_si128(tmp2, KS[j * 2 + 1]);
_mm_storeu_si128(&((__m128i*)out)[i * 2 + 0], tmp1);
_mm_storeu_si128(&((__m128i*)out)[i * 2 + 1], tmp2);
}
}
PBLENDVB
和SSSE3的PSHUFB
调整偏移。8×4排列置换网络是4×4的两倍,因此每一轮需要两个AESENC
指令,结尾同样两个AESENCLAST
。这种错落有致又不失美感的代码正是计算机吸引人的地方。N1CTF 2021 easyRe
VPXOR
,VPADDQ
,VPSUBQ
很容易求出逆运算,VPSHUFD
是对xmm0里的4个32位值重新排列,同样用VPSHUFD
可以排列回去。遇到VAESENC
指令时,首先将整个VAESENC
+VAESENCLAST
块提取出来,对中间的轮密钥VAESIMC
求逆,再生成相反的解密树。注意前文提到过AES加密的第0个轮密钥是直接VPXOR
异或,碰到VAESENC
指令前不是VPXOR
时,可以看作是异或了一个全0密钥,那么解密树的最后一条指令VAESDECLAST
的轮密钥就是0。遇VAESDEC
解密块时处理方法类似,但要使用前文提到的VAESDECLAST
+VAESENC
合成出MixColumns操作,对轮密钥进行变换。
import sys
import capstone
import binascii
# Node.emit() and flip() both recurse along the expression graph; the limit is
# raised presumably because the graphs built from the binary are very deep.
sys.setrecursionlimit(0x100000)


class Node:
    """One 128-bit value in the expression graph.

    ``parent`` is the node that consumes this value (set by the consumer's
    constructor, so only the most recent consumer is remembered).
    ``emitted`` guards against writing the same C declaration twice.
    """

    def __init__(self):
        self.emitted = False
        self.parent = None

    def __str__(self):
        # Used as the C variable name of this value.
        return "v" + hex(id(self))

    def emit(self, f):
        """Write the C code computing this node to *f*, at most once."""
        if self.emitted:
            return
        self.emitted = True
        self.do_emit(f)


class Constant(Node):
    """A literal 128-bit integer ``c``."""

    def __init__(self, c):
        super().__init__()
        self.c = c

    def do_emit(self, f):
        if self.c == 0:
            f.write("__m128i {}=_mm_setzero_si128();\n".format(self))
        else:
            # _mm_set_epi64x takes the high 64 bits first, then the low ones.
            f.write("__m128i {}=_mm_set_epi64x({}ULL,{}ULL);\n".format(
                self, hex(self.c >> 64), hex(self.c & ((1 << 64) - 1))))


class Binary(Node):
    """Base for two-operand operations; subclasses set ``intrinsic``."""

    # Name of the C intrinsic called as intrinsic(a, b).
    intrinsic = None

    def __init__(self, a, b):
        super().__init__()
        self.a = a
        self.b = b
        a.parent = self
        b.parent = self

    def do_emit(self, f):
        self.a.emit(f)
        self.b.emit(f)
        f.write("__m128i {}={}({},{});\n".format(
            self, self.intrinsic, self.a, self.b))


class Add(Binary):
    """``a + b`` on 64-bit lanes."""

    intrinsic = "_mm_add_epi64"


class Sub(Binary):
    """``a - b`` on 64-bit lanes."""

    intrinsic = "_mm_sub_epi64"


class Xor(Binary):
    """``a ^ b``."""

    intrinsic = "_mm_xor_si128"


class Aes(Node):
    """One AESENC / AESENCLAST / AESDEC / AESDECLAST of ``base`` with ``key``."""

    def __init__(self, base, key, is_enc, is_last):
        super().__init__()
        self.base = base
        self.key = key
        self.is_enc, self.is_last = is_enc, is_last
        base.parent = self
        key.parent = self

    def do_emit(self, f):
        self.base.emit(f)
        self.key.emit(f)
        f.write("__m128i {}=_mm_aes{}{}_si128({},{});\n".format(
            self,
            "enc" if self.is_enc else "dec",
            "last" if self.is_last else "",
            self.base, self.key))


class Aesimc(Node):
    """InvMixColumns of ``a`` when ``is_imc`` is true, MixColumns otherwise."""

    def __init__(self, a, is_imc):
        super().__init__()
        self.a = a
        self.is_imc = is_imc
        a.parent = self

    def do_emit(self, f):
        self.a.emit(f)
        if self.is_imc:
            f.write("__m128i {}=_mm_aesimc_si128({});\n".format(self, self.a))
        else:
            # There is no MixColumns instruction: AESDECLAST followed by
            # AESENC, both with an all-zero key, leaves only MixColumns.
            # The emitted code expects a C variable named `zero` in scope.
            f.write("__m128i {}=_mm_aesenc_si128(_mm_aesdeclast_si128({},zero),zero);\n".format(
                self, self.a))


class Shuffle(Node):
    """PSHUFD of ``a`` with the 8-bit immediate ``x``."""

    def __init__(self, a, x):
        super().__init__()
        self.a = a
        self.x = x
        a.parent = self

    def do_emit(self, f):
        self.a.emit(f)
        f.write("__m128i {}=_mm_shuffle_epi32({},{});\n".format(
            self, self.a, hex(self.x)))


def flip(root):
    """Build the expression that recovers *root* from the final output.

    Follows ``parent`` links from *root* towards the end of the computation
    and inverts every operation on the way. The chain must end in a
    ``Constant`` (the known final value), which is returned as-is.

    Note that constructing the inverse nodes re-assigns ``parent`` on the
    operands they reuse (keys, constants, the other operand of a binary op).

    Raises:
        ValueError: if an operation on the path cannot be inverted (unknown
            node type, missing parent, a shuffle that is not a permutation,
            or an AES round chain that does not end in a last-round node).
    """
    parent = root.parent

    if isinstance(parent, Constant):
        return parent

    if isinstance(parent, Xor):
        other = parent.b if root is parent.a else parent.a
        return Xor(other, flip(parent))

    if isinstance(parent, Add):
        # a + b = p  =>  a = p - b,  b = p - a
        other = parent.b if root is parent.a else parent.a
        return Sub(flip(parent), other)

    if isinstance(parent, Sub):
        # a - b = p  =>  a = p + b,  b = a - p
        if root is parent.a:
            return Add(flip(parent), parent.b)
        return Sub(parent.a, flip(parent))

    if isinstance(parent, Shuffle):
        imm = parent.x
        # lanes[i] is the source lane copied into destination lane i.
        lanes = [(imm >> (2 * i)) & 3 for i in range(4)]
        if set(lanes) != {0, 1, 2, 3}:
            raise ValueError(
                "shuffle {:#x} is not a permutation and cannot be inverted".format(imm))
        inverse = 0
        for lane in (3, 2, 1, 0):
            inverse = (inverse << 2) | lanes.index(lane)
        return Shuffle(flip(parent), inverse)

    if isinstance(parent, Aesimc):
        return Aesimc(flip(parent), not parent.is_imc)

    if isinstance(parent, Aes):
        # Collect the whole run of rounds up to and including the last-round
        # instruction. Starting the walk at `parent` itself also covers a
        # lone AESENCLAST/AESDECLAST.
        rounds = [parent]
        last = parent
        while not last.is_last:
            last = last.parent
            if not isinstance(last, Aes):
                raise ValueError("AES round chain is not closed by a last round")
            rounds.append(last)
        rounds.reverse()

        # Undo the final AddRoundKey first, then run the opposite rounds with
        # the remaining keys in reverse order, each converted between its
        # encryption and equivalent-decryption form.
        state = Xor(flip(last), rounds[0].key)
        converted = {}
        for rnd in rounds[1:]:
            key_id = id(rnd.key)
            if key_id not in converted:
                converted[key_id] = Aesimc(rnd.key, rnd.is_enc)
            state = Aes(state, converted[key_id], not rnd.is_enc, False)
        # The forward chain had no explicit round-0 XOR, which is the same as
        # XORing a zero key, so the inverse ends with a zero-key last round.
        return Aes(state, Constant(0), not rounds[0].is_enc, True)

    raise ValueError(
        "cannot invert consumer of type {}".format(type(parent).__name__))
# ---------------------------------------------------------------------------
# Symbolic execution of the checker in `easyRe`, then inversion of the result.
#
# xmm[i] holds the expression currently in register xmm<i>; memory maps an
# rbp-relative offset to the expression stored there. xmm0 starts out as the
# unknown `target`.
# ---------------------------------------------------------------------------
xmmnames = ['xmm{}'.format(i) for i in range(16)]
xmm = [None for i in range(16)]
target = Node()
xmm[0] = target
memory = {}

# 16-byte constants, little-endian, placed at rbp-0x620, rbp-0x610, ...
_STACK_CONSTANTS = [
    '2f0fc4f2839a1d5401ead9842fc23d00',
    '24e1c94761c31694cdb7d3a38fb0c100',
    '2af5fcb6d4373ceac4590d4f86956d00',
    'cbc6b50249b0b519a2620a3cc73d9200',
    '60a876c1193162a02a1531a79d6a5900',
    'd083cfb2f3a048c4cf47af9bcaaefa00',
    'eb93d59f3756816e2671cd0d1c73bf00',
    'c32de58cdbcf9fdd7de74f364a594b00',
    '6055580a46572c4e6a591ddd77c0ce00',
    '13bf3e7536d86ce89d81348f6f10e000',
]
for _index, _hex in enumerate(_STACK_CONSTANTS):
    memory[0x620 - _index * 0x10] = Constant(
        int.from_bytes(binascii.a2b_hex(_hex), 'little'))

_RBP_OPERAND = 'xmmword ptr [rbp - '


def _operands(ins):
    """Return the comma-separated operands of *ins*, stripped."""
    return [part.strip() for part in ins.op_str.split(',')]


def _stack_offset(operand):
    """Parse ``xmmword ptr [rbp - 0xNNN]`` and return NNN as an int."""
    if not operand.startswith(_RBP_OPERAND):
        raise ValueError('unsupported memory operand: {}'.format(operand))
    return int(operand[len(_RBP_OPERAND):operand.index(']')], 16)


def _read(operand):
    """Return the expression held by a register or rbp-relative operand."""
    if operand in xmmnames:
        return xmm[xmmnames.index(operand)]
    return memory[_stack_offset(operand)]


_md = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
_md.detail = True
# File offsets 0xc4d..0x19d90 of the binary, disassembled at 0x100000C4D.
with open('easyRe', 'rb') as _binary:
    _code = _binary.read()[0xc4d:0x19d90]

for ins in _md.disasm(_code, 0x100000C4D):
    mnemonic = ins.mnemonic
    if mnemonic == 'vmovdqa':
        dst, src = _operands(ins)
        if dst in xmmnames and src not in xmmnames:
            # load from the stack
            xmm[xmmnames.index(dst)] = memory[_stack_offset(src)]
        elif src in xmmnames and dst not in xmmnames:
            # store to the stack
            memory[_stack_offset(dst)] = xmm[xmmnames.index(src)]
        else:
            xmm[xmmnames.index(dst)] = xmm[xmmnames.index(src)]
    elif mnemonic == 'vpxor':
        dst, lhs, rhs = _operands(ins)
        xmm[xmmnames.index(dst)] = Xor(xmm[xmmnames.index(lhs)], _read(rhs))
    elif mnemonic == 'vpaddq' or mnemonic == 'vpsubq':
        dst, lhs, rhs = _operands(ins)
        op = Add if mnemonic == 'vpaddq' else Sub
        xmm[xmmnames.index(dst)] = op(xmm[xmmnames.index(lhs)], _read(rhs))
    elif mnemonic == 'vpshufd':
        dst, src, imm = _operands(ins)
        xmm[xmmnames.index(dst)] = Shuffle(
            xmm[xmmnames.index(src)], int(imm, 16))
    elif mnemonic in ('vaesenc', 'vaesdec', 'vaesenclast', 'vaesdeclast'):
        dst, state, key = _operands(ins)
        xmm[xmmnames.index(dst)] = Aes(
            xmm[xmmnames.index(state)],
            xmm[xmmnames.index(key)],
            mnemonic.startswith('vaesenc'),
            mnemonic.endswith('last'))
    elif mnemonic == 'vaesimc':
        dst, src = _operands(ins)
        xmm[xmmnames.index(dst)] = Aesimc(xmm[xmmnames.index(src)], True)
    elif mnemonic == 'movabs' or mnemonic == 'mov':
        # general-purpose moves are ignored by this model
        pass
    else:
        print(ins)
        raise ValueError

# The value xmm0 must end up with; flip() inverts the whole computation
# starting from it.
xmm[0].parent = Constant(0x79eeb3fa8c39dbd77bc066c7647d0b72)
target = flip(target)

# Emit a C program that evaluates the inverted expression and prints the
# recovered 16 bytes. `with` makes sure a.c is flushed and closed.
with open('a.c', 'w') as f:
    f.write('''
#include <immintrin.h>
#include <stdio.h>
int main(){
__m128i zero=_mm_setzero_si128();
''')
    target.emit(f)
    f.write('''char pt[16];
_mm_storeu_si128((__m128i*)pt, {});
fwrite(pt,16,1,stdout);
return 0;
}}
'''.format(target))
-maes
选项打开AESNI,会生成SSE指令集的程序,如果用-march=native
再多打开一些指令集,还能自动编译出AVX2+VAES的程序,现在的编译器也是十分智能。n1ctf{Easy_AVX!}
(一点都不easy)- 结尾 - 精彩推荐 【技术分享】CVE-2019-10999 Dlink IP 摄像头缓冲区溢出 【技术分享】剖析脏牛1_mmap如何映射内存到文件 【技术分享】A-Journey-into-Synology-NAS-系列——群晖NAS介绍 戳“阅读原文”查看更多内容 原文始发于微信公众号(安全客):【技术分享】Intel AES-NI使用入门
免责声明:文章中涉及的程序(方法)可能带有攻击性,仅供安全研究与教学之用,读者将其信息做其他用途,由读者承担全部法律及连带责任,本站不承担任何法律及连带责任;如有问题可邮件联系(建议使用企业邮箱或有效邮箱,避免邮件被拦截,联系方式见首页),望知悉。
- 左青龙
- 微信扫一扫
-
- 右白虎
- 微信扫一扫
-
评论