GCM for SM4

The GCM assembly code for SM4 is adapted from the AES-GCM assembly implementation in the Go standard library. It mainly involves the following three functions:

//go:noescape
func precomputeTableAsm(productTable *[256]byte, src *[16]byte)

//go:noescape
func gcmSm4Data(productTable *[256]byte, data []byte, T *[16]byte)

//go:noescape
func gcmSm4Finish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)

  1. gcmSm4Finish is unchanged and is identical to gcmAesFinish;
  2. precomputeTableAsm differs from gcmAesInit in that it contains no encryption step: its input parameter is already the encrypted result, i.e. the GHASH key H (see the sketch after this list);
  3. gcmSm4Data differs from gcmAesData in that its T parameter acts as both input and output, whereas in the latter it is output only.
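
For illustration, here is a minimal sketch of how the table could be initialized. It assumes c is a cipher.Block produced by sm4.NewCipher; newGCM and its return value are hypothetical scaffolding for exposition, not the repository's actual constructor:

import "crypto/cipher"

// newGCM (hypothetical) derives the GHASH key H = SM4_k(0^128) with a plain
// block encryption, then lets the assembly routine expand it into the
// 256-byte product table. gcmAesInit performs this zero-block encryption
// itself; precomputeTableAsm expects the caller to have done it already.
func newGCM(c cipher.Block) *[256]byte {
	var productTable [256]byte
	var key [16]byte
	c.Encrypt(key[:], key[:]) // encrypt the all-zero block with SM4
	precomputeTableAsm(&productTable, &key)
	return &productTable
}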

ASM optimization combining encryption with GHASH

Main difficulties

  1. The data handled by the underlying CTR encryption is not block-aligned, let alone aligned to 4 or 8 blocks, so the tail encryption and its XOR handling are fiddly;
  2. AMD64 has to support both AVX2 and non-AVX2 paths, which makes the code large and complex;
  3. interleaving with GHASH to improve performance. The plan is to first port the following method to ASM:
// counterCrypt crypts in to out using g.cipher in counter mode.
func (g *gcm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) {
	mask := make([]byte, g.cipher.blocksSize)
	counters := make([]byte, g.cipher.blocksSize)

	// Full batches: materialize batchBlocks counter blocks, encrypt them in
	// one EncryptBlocks call, and XOR the keystream into the output.
	for len(in) >= g.cipher.blocksSize {
		for i := 0; i < g.cipher.batchBlocks; i++ {
			copy(counters[i*gcmBlockSize:(i+1)*gcmBlockSize], counter[:])
			gcmInc32(counter)
		}
		g.cipher.EncryptBlocks(mask, counters)
		xor.XorWords(out, in, mask)
		out = out[g.cipher.blocksSize:]
		in = in[g.cipher.blocksSize:]
	}

	// Tail: fewer than batchBlocks blocks remain and the last one may be
	// partial, so only the needed counter blocks are prepared and the XOR
	// falls back to the byte-wise variant.
	if len(in) > 0 {
		blocks := (len(in) + gcmBlockSize - 1) / gcmBlockSize
		for i := 0; i < blocks; i++ {
			copy(counters[i*gcmBlockSize:], counter[:])
			gcmInc32(counter)
		}
		g.cipher.EncryptBlocks(mask, counters)
		xor.XorBytes(out, in, mask[:blocks*gcmBlockSize])
	}
}
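
gcmInc32 is referenced above but not shown; per the GCM specification only the last 32 bits of the 16-byte counter block are incremented (big-endian, wrapping modulo 2^32). A minimal sketch:

import "encoding/binary"

// gcmInc32 increments the rightmost (big-endian) 32 bits of the counter
// block, wrapping on overflow; the leading 96 bits stay fixed.
func gcmInc32(counterBlock *[16]byte) {
	ctr := counterBlock[len(counterBlock)-4:]
	binary.BigEndian.PutUint32(ctr, binary.BigEndian.Uint32(ctr)+1)
}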

The fusion with GHASH is tackled last. A rough Go-level sketch of what that fused loop computes is given below.
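
The hypothetical cryptAndHash below illustrates the encryption direction only; its name, the g.productTable field, and the omitted tail handling are assumptions for exposition, while the real gcmSm4Enc interleaves these steps at the instruction level:

// cryptAndHash (hypothetical) fuses counterCrypt with GHASH: generate the
// keystream for one batch, XOR it into out, then immediately absorb the
// freshly produced ciphertext into the GHASH accumulator T.
func (g *gcm) cryptAndHash(out, in []byte, counter, T *[gcmBlockSize]byte) {
	mask := make([]byte, g.cipher.blocksSize)
	counters := make([]byte, g.cipher.blocksSize)
	for len(in) >= g.cipher.blocksSize {
		for i := 0; i < g.cipher.batchBlocks; i++ {
			copy(counters[i*gcmBlockSize:], counter[:])
			gcmInc32(counter)
		}
		g.cipher.EncryptBlocks(mask, counters)
		xor.XorWords(out, in, mask)
		gcmSm4Data(&g.productTable, out[:g.cipher.blocksSize], T) // hash the new ciphertext
		out = out[g.cipher.blocksSize:]
		in = in[g.cipher.blocksSize:]
	}
	// Tail handling mirrors counterCrypt above and is omitted here.
}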

January 14, 2022

The gcmSm4Init function is now implemented for both AMD64 and ARM64; to keep supporting golang 1.15.x, use of the VMOVQ instruction had to be dropped.

For the gcmSm4Enc function, a first non-AVX(2) version for the AMD64 architecture is complete; further testing and optimization are under way.

January 18, 2022

gcmSm4Enc and gcmSm4Dec are complete on AMD64 in both the non-AVX(2) and AVX(2) variants, though the code is somewhat bloated; the ARM64 version is complete as well. The remaining optimization direction is the matrix row/column transposition.

ARM64 matrix transposition

// Lanes are listed from most significant to least significant:
// s0 = s0.S3, s0.S2, s0.S1, s0.S0
// s1 = s1.S3, s1.S2, s1.S1, s1.S0
// s2 = s2.S3, s2.S2, s2.S1, s2.S0
// s3 = s3.S3, s3.S2, s3.S1, s3.S0
#define transpose_4x4(s0, s1, s2, s3)                                         \
        zip1 RTMP0.4s, s0.4s, s1.4s;    /* RTMP0 = s1.S1, s0.S1, s1.S0, s0.S0 */ \
        zip1 RTMP1.4s, s2.4s, s3.4s;    /* RTMP1 = s3.S1, s2.S1, s3.S0, s2.S0 */ \
        zip2 RTMP2.4s, s0.4s, s1.4s;    /* RTMP2 = s1.S3, s0.S3, s1.S2, s0.S2 */ \
        zip2 RTMP3.4s, s2.4s, s3.4s;    /* RTMP3 = s3.S3, s2.S3, s3.S2, s2.S2 */ \
        zip1 s0.2d, RTMP0.2d, RTMP1.2d; /* s0 = s3.S0, s2.S0, s1.S0, s0.S0 */ \
        zip2 s1.2d, RTMP0.2d, RTMP1.2d; /* s1 = s3.S1, s2.S1, s1.S1, s0.S1 */ \
        zip1 s2.2d, RTMP2.2d, RTMP3.2d; /* s2 = s3.S2, s2.S2, s1.S2, s0.S2 */ \
        zip2 s3.2d, RTMP2.2d, RTMP3.2d;   /* s3 = s3.S3, s2.S3, s1.S3, s0.S3 */
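
To double-check the lane bookkeeping, here is a plain Go model of the zip patterns (not part of the repository; each [4]uint32 stands in for one 128-bit vector, index 0 being the lowest lane S0):

// zip1x32/zip2x32 model zip1/zip2 on .4s: interleave the lower (resp.
// upper) 32-bit lanes of the two source vectors.
func zip1x32(a, b [4]uint32) [4]uint32 { return [4]uint32{a[0], b[0], a[1], b[1]} }
func zip2x32(a, b [4]uint32) [4]uint32 { return [4]uint32{a[2], b[2], a[3], b[3]} }

// zip1x64/zip2x64 model zip1/zip2 on .2d: interleave the lower (resp.
// upper) 64-bit halves, so 32-bit lanes move in pairs.
func zip1x64(a, b [4]uint32) [4]uint32 { return [4]uint32{a[0], a[1], b[0], b[1]} }
func zip2x64(a, b [4]uint32) [4]uint32 { return [4]uint32{a[2], a[3], b[2], b[3]} }

// transpose4x4 mirrors the transpose_4x4 macro: afterwards r0 holds the S0
// lane of every input vector, r1 the S1 lanes, and so on.
func transpose4x4(s0, s1, s2, s3 [4]uint32) (r0, r1, r2, r3 [4]uint32) {
	t0, t1 := zip1x32(s0, s1), zip1x32(s2, s3)
	t2, t3 := zip2x32(s0, s1), zip2x32(s2, s3)
	return zip1x64(t0, t1), zip2x64(t0, t1), zip1x64(t2, t3), zip2x64(t2, t3)
}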

#define rotate_clockwise_90(s0, s1, s2, s3)                                   \
        zip1 RTMP0.4s, s1.4s, s0.4s;    /* RTMP0 = s0.S1, s1.S1, s0.S0, s1.S0 */ \
        zip2 RTMP1.4s, s1.4s, s0.4s;    /* RTMP1 = s0.S3, s1.S3, s0.S2, s1.S2 */ \
        zip1 RTMP2.4s, s3.4s, s2.4s;    /* RTMP2 = s2.S1, s3.S1, s2.S0, s3.S0 */ \
        zip2 RTMP3.4s, s3.4s, s2.4s;    /* RTMP3 = s2.S3, s3.S3, s2.S2, s3.S2 */ \
        zip1 s0.2d, RTMP2.2d, RTMP0.2d; /* s0 = s0.S0, s1.S0, s2.S0, s3.S0 */ \
        zip2 s1.2d, RTMP2.2d, RTMP0.2d; /* s1 = s0.S1, s1.S1, s2.S1, s3.S1 */ \
        zip1 s2.2d, RTMP3.2d, RTMP1.2d; /* s2 = s0.S2, s1.S2, s2.S2, s3.S2 */ \
        zip2 s3.2d, RTMP3.2d, RTMP1.2d;   /* s3 = s0.S3, s1.S3, s2.S3, s3.S3 */
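
The same model applied to rotate_clockwise_90, reusing the zip helpers above; relative to transpose4x4 the zip operand order is swapped, which reverses the lane order within each output vector:

// rotateClockwise90 mirrors the rotate_clockwise_90 macro: r0 again gathers
// the S0 lanes, but reversed (s3.S0 in the lowest lane, s0.S0 in the
// highest), matching the high-to-low comments in the macro above.
func rotateClockwise90(s0, s1, s2, s3 [4]uint32) (r0, r1, r2, r3 [4]uint32) {
	t0, t1 := zip1x32(s1, s0), zip2x32(s1, s0)
	t2, t3 := zip1x32(s3, s2), zip2x32(s3, s2)
	return zip1x64(t2, t0), zip2x64(t2, t0), zip1x64(t3, t1), zip2x64(t3, t1)
}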

Reference: Intel white paper, "Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode".