# 前言
本篇文章简单介绍矩阵乘的加速方法,以学习算子加速需要注意的方面。想要学习更多内容可以参考《OpenBLAS gemm 从零入门》,《BLISlab: A Sandbox for Optimizing GEMM》,道阻且长_再探矩阵乘法优化,《How To Optimize GEMM》等项目或文章。
作为初学者,错误在所难免,还望不吝赐教。
# 1. 基准算法
矩阵乘运算的基准算法,未经过任何优化。矩阵以行主序进行排布, 针对 X86 平台。矩阵 C= A * B,A 矩阵为 (M,K), B 矩阵为 (K,N)。
#include <stdio.h> | |
#define A( i, j ) a[ (i)*lda + (j) ] | |
#define B( i, j ) b[ (i)*ldb + (j) ] | |
#define C( i, j ) c[ (i)*ldc + (j) ] | |
// gemm C = A * B + C | |
void MatrixMultiply(int m, int n, int k, float *a, int lda, float *b, int ldb, float *c, int ldc) | |
{ | |
for(int i = 0; i < m; i++){ | |
for (int p=0; p<k; p++ ){ | |
for (int j=0; j<n; j++ ){ | |
C(i, j) = C(i, j) + A(i, p) * B(p, j); | |
} | |
} | |
} | |
} | |
/* Driver: C = A * B for an all-ones 16x64 A and 64x16 B.
   NOTE(review): this snippet also needs #include <stdlib.h> for malloc/calloc/free. */
void GemmAccelerate(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    /* Bug fix: the kernel computes C += A*B, so C must start zeroed;
       malloc leaves it uninitialized — use calloc. */
    float* MatriC = (float*)calloc((size_t)M*N, sizeof(float));
    if (!MatriA || !MatriB || !MatriC) {
        free(MatriA); free(MatriB); free(MatriC);
        return;
    }
    /* Bug fix: the original loop header had four clauses
       (`for(int index=0; index<M*K; index<M*K; index++)`) — not valid C. */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    MatrixMultiply(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Bug fix: release the buffers (the original leaked all three). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}
# 循环展开
引入循环展开(Loop Unrolling) ,每次 A 矩阵一行和 B 四列进行计算,得到四个结果。
可以减少循环控制开销:减少了循环条件检查和更新的频率,因为现在每四个元素作为一个批次处理,理论上减少了 75% 的循环控制指令。
#include <stdio.h> | |
#define A(i,j) a[ (i)*lda + (j) ] | |
#define B(i,j) b[ (i)*ldb + (j) ] | |
#define C(i,j) c[ (i)*ldc + (j) ] | |
#define Y(i) y[ (i)*incx ] | |
/* Dot product of length k: *gamma += sum_p x[p] * y[p*incx].
   x is read contiguously; the stride incx applies to y (via the Y macro). */
void AddDot( int k, float *x, int incx, float *y, float *gamma )
{
    int idx = 0;
    while (idx < k) {
        *gamma += x[idx] * Y(idx);
        idx++;
    }
}
void MY_MMult2( int m, int n, int k, float *a, int lda, | |
float *b, int ldb, float *c, int ldc ) | |
{ | |
int i, j; | |
for ( j=0; j<n; j+=4 ){ /* Loop over the columns of C, unrolled by 4 */ | |
for ( i=0; i<m; i+=1 ){ /* Loop over the rows of C */ | |
AddDot( k, &A( i,0 ), lda, &B( 0,j ), &C( i,j ) ); | |
AddDot( k, &A( i,0 ), lda, &B( 0,j+1 ), &C( i,j+1 ) ); | |
AddDot( k, &A( i,0 ), lda, &B( 0,j+2 ), &C( i,j+2 ) ); | |
AddDot( k, &A( i,0 ), lda, &B( 0,j+3 ), &C( i,j+3 ) ); | |
} | |
} | |
} | |
/* Driver for MY_MMult2; same fixes as the baseline driver.
   NOTE(review): this snippet also needs #include <stdlib.h>. */
void GemmAccelerate(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    /* Bug fix: C accumulates (C += A*B), so it must start zeroed. */
    float* MatriC = (float*)calloc((size_t)M*N, sizeof(float));
    if (!MatriA || !MatriB || !MatriC) {
        free(MatriA); free(MatriB); free(MatriC);
        return;
    }
    /* Bug fix: original loop header had an extra `index<M*K;` clause (not valid C). */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    MY_MMult2(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Bug fix: free the buffers (original leaked all three). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}
# 3. 使用寄存器
中间计算结果使用寄存器存储,可以减少内存访问次数,可以显著提升性能,因为寄存器访问速度远快于内存访问。
#include <stdio.h> | |
#define A(i,j) a[ (i)*lda + (j) ] | |
#define B(i,j) b[ (i)*ldb + (j) ] | |
#define C(i,j) c[ (i)*ldc + (j) ] | |
/* Routine for computing C = A * B + C */ | |
void AddDot1x4( int k, float *a, int lda, float *b, int ldb, float *c, int ldc ) | |
{ | |
int p; | |
register float c_00_reg, c_01_reg, c_02_reg, c_03_reg, a_0p_reg; | |
c_00_reg = 0.0; | |
c_01_reg = 0.0; | |
c_02_reg = 0.0; | |
c_03_reg = 0.0; | |
for ( p=0; p<k; p++ ){ | |
a_0p_reg = A( 0, p ); | |
c_00_reg += a_0p_reg * B( p, 0 ); | |
c_01_reg += a_0p_reg * B( p, 1 ); | |
c_02_reg += a_0p_reg * B( p, 2 ); | |
c_03_reg += a_0p_reg * B( p, 3 ); | |
} | |
C( 0, 0 ) += c_00_reg; | |
C( 0, 1 ) += c_01_reg; | |
C( 0, 2 ) += c_02_reg; | |
C( 0, 3 ) += c_03_reg; | |
} | |
void MY_MMult_1x4_6( int m, int n, int k, float *a, int lda, float *b, int ldb, float *c, int ldc ) | |
{ | |
int i, j; | |
for ( j=0; j<n; j+=4 ){ /* Loop over the columns of C, unrolled by 4 */ | |
for ( i=0; i<m; i+=1 ){ /* Loop over the rows of C */ | |
AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc ); | |
} | |
} | |
} | |
/* Driver for MY_MMult_1x4_6; same fixes as the baseline driver.
   NOTE(review): this snippet also needs #include <stdlib.h>. */
void GemmAccelerate(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    /* Bug fix: C accumulates (C += A*B), so it must start zeroed. */
    float* MatriC = (float*)calloc((size_t)M*N, sizeof(float));
    if (!MatriA || !MatriB || !MatriC) {
        free(MatriA); free(MatriB); free(MatriC);
        return;
    }
    /* Bug fix: original loop header had an extra `index<M*K;` clause (not valid C). */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    MY_MMult_1x4_6(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Bug fix: free the buffers (original leaked all three). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}
# 4. 避免乘法
本次算法相对上一个,调整了循环展开的矩阵(从矩阵 B 变成了矩阵 A)。最内层循环中的 c_00_reg += a_0p_reg * B( p, 0 );
变成 c_00_reg += b_0p_reg * *ap0_pntr++;
,即用指针的累加替换了原来的 B( p, 0 )
,使得最内层循环减少了 4 个乘法。
调整循环展开矩阵的原因是 行主序,矩阵 A 在内存中是连续的。
避免乘法: 通常来讲乘法消耗的时间长于加法,效率差异取决于具体的硬件平台。减少乘法计算量可以提高算法效率。
#include <stdio.h> | |
#define A(i,j) a[ (i)*lda + (j) ] | |
#define B(i,j) b[ (i)*ldb + (j) ] | |
#define C(i,j) c[ (i)*ldc + (j) ] | |
/* Routine for computing C = A * B + C */ | |
void AddDot1x4( int k, float *a, int lda, float *b, int ldb, float *c, int ldc ) | |
{ | |
int p; | |
register float | |
c_00_reg, c_01_reg, c_02_reg, c_03_reg, b_0p_reg; | |
float | |
*ap0_pntr, *ap1_pntr, *ap2_pntr, *ap3_pntr; | |
ap0_pntr = &A( 0, 0 ); | |
ap1_pntr = &A( 1, 0 ); | |
ap2_pntr = &A( 2, 0 ); | |
ap3_pntr = &A( 3, 0 ); | |
c_00_reg = 0.0; | |
c_01_reg = 0.0; | |
c_02_reg = 0.0; | |
c_03_reg = 0.0; | |
for ( p=0; p<k; p++ ){ | |
b_0p_reg = B( p, 0 ); | |
c_00_reg += b_0p_reg * *ap0_pntr++; | |
c_01_reg += b_0p_reg * *ap1_pntr++; | |
c_02_reg += b_0p_reg * *ap2_pntr++; | |
c_03_reg += b_0p_reg * *ap3_pntr++; | |
} | |
C( 0, 0 ) += c_00_reg; | |
C( 1, 0 ) += c_01_reg; | |
C( 2, 0 ) += c_02_reg; | |
C( 3, 0 ) += c_03_reg; | |
} | |
/* C = A * B + C, one 4x1 block of C per kernel call.
   m must be a multiple of 4. */
void MY_MMult_1x4_7( int m, int n, int k, float *a, int lda, float *b, int ldb, float *c, int ldc )
{
    int i, j;
    for ( j=0; j<n; j+=1 ){    /* Loop over the columns of C, one at a time */
        for ( i=0; i<m; i+=4 ){    /* Loop over the rows of C, unrolled by 4 */
            AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
        }
    }
}
/* Driver for MY_MMult_1x4_7; same fixes as the baseline driver.
   NOTE(review): this snippet also needs #include <stdlib.h>. */
void GemmAccelerate(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    /* Bug fix: C accumulates (C += A*B), so it must start zeroed. */
    float* MatriC = (float*)calloc((size_t)M*N, sizeof(float));
    if (!MatriA || !MatriB || !MatriC) {
        free(MatriA); free(MatriB); free(MatriC);
        return;
    }
    /* Bug fix: original loop header had an extra `index<M*K;` clause (not valid C). */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    MY_MMult_1x4_7(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Bug fix: free the buffers (original leaked all three). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}
# 5. 进一步循环展开
将最内部的循环也展开,即原本的 for ( p=0; p<k; p++ )
变成 for ( p=0; p<k; p+=4 )
,内部计算的内容变成原来的四倍。当然也开辟了更多的寄存器。
#include <stdio.h> | |
#define A(i,j) a[ (i)*lda + (j) ] | |
#define B(i,j) b[ (i)*ldb + (j) ] | |
#define C(i,j) c[ (i)*ldc + (j) ] | |
/* Routine for computing C = A * B + C */ | |
/* Compute a 4x1 block of C: C(0:4, 0) += A(0:4, 0:k) * B(0:k, 0).
   The p loop is unrolled by 4, so k must be a multiple of 4 (there is no
   remainder handling). A's four rows are walked with running pointers,
   which is contiguous in row-major storage. */
void AddDot1x4( int k, float *a, int lda, float *b, int ldb, float *c, int ldc )
{
    int p;
    /* Four C accumulators plus the current B element, kept in registers so
       each C element is read/written only once, at the end. */
    register float c_00_reg, c_01_reg, c_02_reg, c_03_reg, b_0p_reg;
    float *ap0_pntr, *ap1_pntr, *ap2_pntr, *ap3_pntr;
    ap0_pntr = &A( 0, 0 );
    ap1_pntr = &A( 1, 0 );
    ap2_pntr = &A( 2, 0 );
    ap3_pntr = &A( 3, 0 );
    c_00_reg = 0.0;
    c_01_reg = 0.0;
    c_02_reg = 0.0;
    c_03_reg = 0.0;
    for ( p=0; p<k; p+=4 ){
        /* Unrolled iteration p */
        b_0p_reg = B( p, 0 );
        c_00_reg += b_0p_reg * *ap0_pntr++;
        c_01_reg += b_0p_reg * *ap1_pntr++;
        c_02_reg += b_0p_reg * *ap2_pntr++;
        c_03_reg += b_0p_reg * *ap3_pntr++;
        /* Unrolled iteration p+1 */
        b_0p_reg = B( p+1, 0 );
        c_00_reg += b_0p_reg * *ap0_pntr++;
        c_01_reg += b_0p_reg * *ap1_pntr++;
        c_02_reg += b_0p_reg * *ap2_pntr++;
        c_03_reg += b_0p_reg * *ap3_pntr++;
        /* Unrolled iteration p+2 */
        b_0p_reg = B( p+2, 0 );
        c_00_reg += b_0p_reg * *ap0_pntr++;
        c_01_reg += b_0p_reg * *ap1_pntr++;
        c_02_reg += b_0p_reg * *ap2_pntr++;
        c_03_reg += b_0p_reg * *ap3_pntr++;
        /* Unrolled iteration p+3 */
        b_0p_reg = B( p+3, 0);
        c_00_reg += b_0p_reg * *ap0_pntr++;
        c_01_reg += b_0p_reg * *ap1_pntr++;
        c_02_reg += b_0p_reg * *ap2_pntr++;
        c_03_reg += b_0p_reg * *ap3_pntr++;
    }
    /* Fold the register accumulators back into C. */
    C( 0, 0 ) += c_00_reg;
    C( 1, 0 ) += c_01_reg;
    C( 2, 0 ) += c_02_reg;
    C( 3, 0 ) += c_03_reg;
}
/* C = A * B + C, one 4x1 block of C per kernel call (kernel also unrolls
   its k loop by 4). m must be a multiple of 4. */
void MY_MMult_1x4_8( int m, int n, int k, float *a, int lda,float *b, int ldb,float *c, int ldc )
{
    int i, j;
    for ( j=0; j<n; j+=1 ){    /* Loop over the columns of C, one at a time */
        for ( i=0; i<m; i+=4 ){    /* Loop over the rows of C, unrolled by 4 */
            AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
        }
    }
}
/* Driver for MY_MMult_1x4_8; same fixes as the baseline driver.
   NOTE(review): this snippet also needs #include <stdlib.h>. */
void GemmAccelerate(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    /* Bug fix: C accumulates (C += A*B), so it must start zeroed. */
    float* MatriC = (float*)calloc((size_t)M*N, sizeof(float));
    if (!MatriA || !MatriB || !MatriC) {
        free(MatriA); free(MatriB); free(MatriC);
        return;
    }
    /* Bug fix: original loop header had an extra `index<M*K;` clause (not valid C). */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    MY_MMult_1x4_8(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Bug fix: free the buffers (original leaked all three). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}
# 6. 使用 SIMD 指令
SIMD 指令是单指令多数据,以 X86 平台的 SSE2 为例,其支持 128 位的寄存器,一次性可以计算 4 个 float 或 int 数据,能够提高算法的并行度。
代码中的 SSE 指令 _mm_setzero_ps
:将寄存器置零, _mm_loadu_ps
:读取指定位置的数据到寄存器, _mm_mul_ps
:乘运算, _mm_add_ps
:加运算, _mm_storeu_ps
:将寄存器的数据存储回内存, _mm_set_ps1
:将给定的一个浮点数复制 4 份到寄存器的 4 个通道。
以下代码在 M 和 N 两个方向循环展开(和 5 不同),一次性计算 16 个数据。
#include <stdio.h> | |
#include <emmintrin.h> // SSE2 | |
#define A(i,j) a[ (i)*lda + (j) ] | |
#define B(i,j) b[ (i)*ldb + (j) ] | |
#define C(i,j) c[ (i)*ldc + (j) ] | |
/* Routine for computing C = A * B + C */ | |
/* SSE2 kernel: compute a 4x4 block of C, C(0:4, 0:4) += A(0:4, 0:k) * B(0:k, 0:4).
   Each __m128 accumulator holds one row of the 4x4 result. */
void AddDot4x4_SSE2(int k, float *a, int lda, float *b, int ldb, float *c, int ldc){
    /* Point to the current elements in the four rows of A */
    float *a_0p_pntr = &A(0, 0);
    float *a_1p_pntr = &A(1, 0);
    float *a_2p_pntr = &A(2, 0);
    float *a_3p_pntr = &A(3, 0);
    __m128 c_p0_sum = _mm_setzero_ps();
    __m128 c_p1_sum = _mm_setzero_ps();
    __m128 c_p2_sum = _mm_setzero_ps();
    __m128 c_p3_sum = _mm_setzero_ps();
    register float a_0p_reg, a_1p_reg, a_2p_reg, a_3p_reg;
    for (int p = 0; p < k; ++p) {
        /* One row of the B panel: B(p, 0:4) (unaligned load). */
        __m128 b_reg = _mm_loadu_ps(&B(p, 0));
        a_0p_reg = *a_0p_pntr++;
        a_1p_reg = *a_1p_pntr++;
        a_2p_reg = *a_2p_pntr++;
        a_3p_reg = *a_3p_pntr++;
        /* Broadcast each A(row, p) scalar to all four lanes. */
        __m128 a_0p_vec = _mm_set_ps1(a_0p_reg);
        __m128 a_1p_vec = _mm_set_ps1(a_1p_reg);
        __m128 a_2p_vec = _mm_set_ps1(a_2p_reg);
        __m128 a_3p_vec = _mm_set_ps1(a_3p_reg);
        /* row r of the block: sum_r += A(r, p) * B(p, 0:4). */
        c_p0_sum = _mm_add_ps(c_p0_sum, _mm_mul_ps(b_reg, a_0p_vec));
        c_p1_sum = _mm_add_ps(c_p1_sum, _mm_mul_ps(b_reg, a_1p_vec));
        c_p2_sum = _mm_add_ps(c_p2_sum, _mm_mul_ps(b_reg, a_2p_vec));
        c_p3_sum = _mm_add_ps(c_p3_sum, _mm_mul_ps(b_reg, a_3p_vec));
    }
    /* Accumulate the four row sums back into C: one load/add/store per row. */
    float *c_pntr;
    __m128 c_reg;
    c_pntr = &C(0, 0);
    c_reg = _mm_loadu_ps(c_pntr);
    c_reg = _mm_add_ps(c_reg, c_p0_sum);
    _mm_storeu_ps(c_pntr, c_reg);
    c_pntr = &C(1, 0);
    c_reg = _mm_loadu_ps(c_pntr);
    c_reg = _mm_add_ps(c_reg, c_p1_sum);
    _mm_storeu_ps(c_pntr, c_reg);
    c_pntr = &C(2, 0);
    c_reg = _mm_loadu_ps(c_pntr);
    c_reg = _mm_add_ps(c_reg, c_p2_sum);
    _mm_storeu_ps(c_pntr, c_reg);
    c_pntr = &C(3, 0);
    c_reg = _mm_loadu_ps(c_pntr);
    c_reg = _mm_add_ps(c_reg, c_p3_sum);
    _mm_storeu_ps(c_pntr, c_reg);
}
void MY_MMult_4x4_10(int m, int n, int k, float *a, int lda, float *b, int ldb, float *c, int ldc){ | |
int i, j; | |
for (j = 0; j < n; j += 4) { /* Loop over the columns of C, unrolled by 4 */ | |
for (i = 0; i < m; i += 4) { /* Loop over the rows of C */ | |
AddDot4x4_SSE2(k, &A(i, 0), lda, &B(0, j), ldb, &C(i, j), ldc); | |
} | |
} | |
} | |
/* Driver for MY_MMult_4x4_10; same fixes as the scalar drivers.
   NOTE(review): this snippet also needs #include <stdlib.h>. */
void GemmAccelerateSimd(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    /* Bug fix: C accumulates (C += A*B), so it must start zeroed. */
    float* MatriC = (float*)calloc((size_t)M*N, sizeof(float));
    if (!MatriA || !MatriB || !MatriC) {
        free(MatriA); free(MatriB); free(MatriC);
        return;
    }
    /* Bug fix: original loop header had an extra `index<M*K;` clause (not valid C). */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    MY_MMult_4x4_10(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Bug fix: free the buffers (original leaked all three). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}
# 7. 矩阵分块
分块技术(blocking)是一种常用的优化策略,旨在改善缓存命中率和数据局部性,进而提升计算密集型操作(如矩阵乘法)的性能。通过将大矩阵分割成较小的子矩阵(即 “块”),可以更好地利用 CPU 缓存,并减少内存访问延迟。
以下代码中, mc
(Macro for m-block Count):定义了处理矩阵行方向上的块大小 , kc
(Macro for k-block Count):定义了处理矩阵 K 方向上的块大小。在整个代码中被用作循环边界或参数,用于控制矩阵分块的尺寸。它们将矩阵 A 分割成 256×128 的小块,将矩阵 B 分割成 128×N 的小块,然后计算两个小块的矩阵乘运算。至于如何得到最终结果,可以参考《线性代数》知识:分块矩阵乘。
#include <emmintrin.h> // SSE2 | |
#include <smmintrin.h> // SSE4.1, for _mm_dp_ps if needed | |
#include <stdio.h> | |
/* Block sizes */ | |
#define mc 256 | |
#define kc 128 | |
/* Create macros so that the matrices are stored in row-major order */ | |
#define A(i,j) a[ (i)*lda + (j) ] | |
#define B(i,j) b[ (i)*ldb + (j) ] | |
#define C(i,j) c[ (i)*ldc + (j) ] | |
#define min(i, j) ((i) < (j) ? (i): (j)) | |
/* Routine for computing C = A * B + C */ | |
/* SSE kernel: compute a 4x4 block of C, C(0:4, 0:4) += A(0:4, 0:k) * B(0:k, 0:4).
   Same scheme as AddDot4x4_SSE2: one vector accumulator per row of the block. */
void AddDot4x4_SSE( int k, float *a, int lda, float *b, int ldb, float *c, int ldc )
{
    /* Point to the current elements in the four rows of A */
    float *a_0p_pntr = &A(0, 0);
    float *a_1p_pntr = &A(1, 0);
    float *a_2p_pntr = &A(2, 0);
    float *a_3p_pntr = &A(3, 0);
    __m128 c_0p_sum = _mm_setzero_ps();
    __m128 c_1p_sum = _mm_setzero_ps();
    __m128 c_2p_sum = _mm_setzero_ps();
    __m128 c_3p_sum = _mm_setzero_ps();
    for (int p = 0; p < k; ++p) {
        /* One row of the B panel: B(p, 0:4). */
        __m128 b_reg = _mm_loadu_ps(&B(p, 0));
        float a_0p_reg = *a_0p_pntr++;
        float a_1p_reg = *a_1p_pntr++;
        float a_2p_reg = *a_2p_pntr++;
        float a_3p_reg = *a_3p_pntr++;
        /* Broadcast A(row, p) and accumulate row sums. */
        c_0p_sum = _mm_add_ps(c_0p_sum, _mm_mul_ps(b_reg, _mm_set1_ps(a_0p_reg)));
        c_1p_sum = _mm_add_ps(c_1p_sum, _mm_mul_ps(b_reg, _mm_set1_ps(a_1p_reg)));
        c_2p_sum = _mm_add_ps(c_2p_sum, _mm_mul_ps(b_reg, _mm_set1_ps(a_2p_reg)));
        c_3p_sum = _mm_add_ps(c_3p_sum, _mm_mul_ps(b_reg, _mm_set1_ps(a_3p_reg)));
    }
    /* Store results back into C matrix, one 4-wide row at a time. */
    for (int row = 0; row < 4; ++row) {
        float* c_pntr = &C(row, 0);
        __m128 c_reg = _mm_loadu_ps(c_pntr);
        switch (row) {
            case 0: c_reg = _mm_add_ps(c_reg, c_0p_sum); break;
            case 1: c_reg = _mm_add_ps(c_reg, c_1p_sum); break;
            case 2: c_reg = _mm_add_ps(c_reg, c_2p_sum); break;
            case 3: c_reg = _mm_add_ps(c_reg, c_3p_sum); break;
        }
        _mm_storeu_ps(c_pntr, c_reg);
    }
}
void InnerKernel_SSE( int m, int n, int k, float *a, int lda, float *b, int ldb, float *c, int ldc ){ | |
for ( int j=0; j<n; j+=4 ){ /* Loop over the columns of C, unrolled by 4 */ | |
for ( int i=0; i<m; i+=4 ){ /* Loop over the rows of C */ | |
AddDot4x4_SSE( k, &A( i,0 ), lda, &B(0, j), ldb, &C( i,j ), ldc ); | |
} | |
} | |
} | |
void MY_MMult_4x4_11_SSE( int m, int n, int k, float *a, int lda, float *b, int ldb, float *c, int ldc ) { | |
for (int p = 0; p < k; p += kc) { | |
int pb = min(k - p, kc); | |
for (int i = 0; i < m; i += mc) { | |
int ib = min(m - i, mc); | |
InnerKernel_SSE(ib, n, pb, &A(i, p), lda, &B(p, 0), ldb, &C(i, 0), ldc); | |
} | |
} | |
} | |
/* Driver for MY_MMult_4x4_11_SSE; same fixes as the other drivers.
   NOTE(review): this snippet also needs #include <stdlib.h>. */
void GemmAccelerateSimd(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    /* Bug fix: C accumulates (C += A*B), so it must start zeroed. */
    float* MatriC = (float*)calloc((size_t)M*N, sizeof(float));
    if (!MatriA || !MatriB || !MatriC) {
        free(MatriA); free(MatriB); free(MatriC);
        return;
    }
    /* Bug fix: original loop header had an extra `index<M*K;` clause (not valid C). */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    MY_MMult_4x4_11_SSE(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Bug fix: free the buffers (original leaked all three). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}
# 8. 数据 Pack
在上面的优化中可以发现,在矩阵乘法的计算中,无论是行主序还是列主序,始终有一个矩阵的内存是没办法连续访问的。为了改善这个情况,可以执行数据 Pack,使矩阵 A 和矩阵 B 的内存变成连续的。
下面代码,在矩阵运算之前将矩阵 B 按照访问顺序重新排布,存储在 packedB
中。
是否有速度提升,需要在目标平台上实测,因为算法提升了矩阵运算效率,但是增加了 pack 时间。
#include <emmintrin.h> // SSE2 | |
#include <smmintrin.h> // SSE4.1, for _mm_dp_ps if needed | |
#include <stdio.h> | |
/* Block sizes */ | |
#define mc 256 | |
#define kc 128 | |
/* Create macros so that the matrices are stored in row-major order */ | |
#define A(i,j) a[ (i)*lda + (j) ] | |
#define B(i,j) b[ (i)*ldb + (j) ] | |
#define C(i,j) c[ (i)*ldc + (j) ] | |
#define min(i, j) ((i) < (j) ? (i): (j)) | |
/* Routine for computing C = A * B + C */ | |
void PackMatrixB_SSE(int k, float *b, int ldb, float *b_to) { | |
for (int j = 0; j < k; ++j) { | |
float *b_ij_pntr = &B(j, 0); | |
*b_to++ = b_ij_pntr[0]; | |
*b_to++ = b_ij_pntr[1]; | |
*b_to++ = b_ij_pntr[2]; | |
*b_to++ = b_ij_pntr[3]; | |
} | |
} | |
/* SSE kernel on a PACKED B panel: C(0:4, 0:4) += A(0:4, 0:k) * B(0:k, 0:4).
   `b` points at a packed k x 4 panel (consecutive groups of 4 floats), so the
   ldb parameter is unused here — kept only for signature compatibility. */
void AddDot4x4_SSE(int k, float *a, int lda, float *b, int ldb, float *c, int ldc) {
    /* Point to the current elements in the four rows of A */
    float *a_0p_pntr = &A(0, 0);
    float *a_1p_pntr = &A(1, 0);
    float *a_2p_pntr = &A(2, 0);
    float *a_3p_pntr = &A(3, 0);
    __m128 c_0p_sum = _mm_setzero_ps();
    __m128 c_1p_sum = _mm_setzero_ps();
    __m128 c_2p_sum = _mm_setzero_ps();
    __m128 c_3p_sum = _mm_setzero_ps();
    for (int p = 0; p < k; ++p) {
        __m128 b_reg = _mm_loadu_ps(b); // Load 4 floats from the packed B panel
        b += 4;
        float a_0p_reg = *a_0p_pntr++;
        float a_1p_reg = *a_1p_pntr++;
        float a_2p_reg = *a_2p_pntr++;
        float a_3p_reg = *a_3p_pntr++;
        /* Broadcast A(row, p) and accumulate row sums of the 4x4 block. */
        c_0p_sum = _mm_add_ps(c_0p_sum, _mm_mul_ps(b_reg, _mm_set1_ps(a_0p_reg)));
        c_1p_sum = _mm_add_ps(c_1p_sum, _mm_mul_ps(b_reg, _mm_set1_ps(a_1p_reg)));
        c_2p_sum = _mm_add_ps(c_2p_sum, _mm_mul_ps(b_reg, _mm_set1_ps(a_2p_reg)));
        c_3p_sum = _mm_add_ps(c_3p_sum, _mm_mul_ps(b_reg, _mm_set1_ps(a_3p_reg)));
    }
    /* Store results back into C matrix, one 4-wide row at a time. */
    for (int row = 0; row < 4; ++row) {
        float* c_pntr = &C(row, 0);
        __m128 c_reg = _mm_loadu_ps(c_pntr);
        switch (row) {
            case 0: c_reg = _mm_add_ps(c_reg, c_0p_sum); break;
            case 1: c_reg = _mm_add_ps(c_reg, c_1p_sum); break;
            case 2: c_reg = _mm_add_ps(c_reg, c_2p_sum); break;
            case 3: c_reg = _mm_add_ps(c_reg, c_3p_sum); break;
        }
        _mm_storeu_ps(c_pntr, c_reg);
    }
}
void InnerKernel_SSE(int m, int n, int k, float *a, int lda, float *b, int ldb, float *c, int ldc) { | |
int i, j; | |
float packedB[k * n]; | |
for (j = 0; j < n; j += 4) { /* Loop over the columns of C, unrolled by 4 */ | |
PackMatrixB_SSE(k, &B(0, j), ldb, &packedB[j * k]); | |
for (i = 0; i < m; i += 4) { /* Loop over the rows of C */ | |
AddDot4x4_SSE(k, &A(i, 0), lda, &packedB[j * k], 4, &C(i, j), ldc); | |
} | |
} | |
} | |
void MY_MMult_4x4_12_SSE(int m, int n, int k, float *a, int lda, float *b, int ldb, float *c, int ldc) { | |
int i, p, pb, ib; | |
for (p = 0; p < k; p += kc) { | |
pb = min(k - p, kc); | |
for (i = 0; i < m; i += mc) { | |
ib = min(m - i, mc); | |
InnerKernel_SSE(ib, n, pb, &A(i, p), lda, &B(p, 0), ldb, &C(i, 0), ldc); | |
} | |
} | |
} | |
/* Driver for MY_MMult_4x4_12_SSE; same fixes as the other drivers.
   NOTE(review): this snippet also needs #include <stdlib.h>. */
void GemmAccelerateSimd(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    /* Bug fix: C accumulates (C += A*B), so it must start zeroed. */
    float* MatriC = (float*)calloc((size_t)M*N, sizeof(float));
    if (!MatriA || !MatriB || !MatriC) {
        free(MatriA); free(MatriB); free(MatriC);
        return;
    }
    /* Bug fix: original loop header had an extra `index<M*K;` clause (not valid C). */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    MY_MMult_4x4_12_SSE(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Bug fix: free the buffers (original leaked all three). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}
# 9. 进一步 pack
将 A 和 B 两个矩阵都进行 pack。
#include <emmintrin.h> // SSE2 | |
#include <smmintrin.h> // SSE4.1, for _mm_dp_ps if needed | |
#include <stdio.h> | |
/* Block sizes */ | |
#define mc 256 | |
#define kc 128 | |
/* Create macros so that the matrices are stored in row-major order */ | |
#define A(i,j) a[ (i)*lda + (j) ] | |
#define B(i,j) b[ (i)*ldb + (j) ] | |
#define C(i,j) c[ (i)*ldc + (j) ] | |
#define min(i, j) ((i) < (j) ? (i): (j)) | |
/* Routine for computing C = A * B + C */ | |
void PackMatrixB_SSE(int k, float *b, int ldb, float *b_to) { | |
for (int j = 0; j < k; ++j) { | |
float *b_ij_pntr = &B(j, 0); | |
*b_to++ = b_ij_pntr[0]; | |
*b_to++ = b_ij_pntr[1]; | |
*b_to++ = b_ij_pntr[2]; | |
*b_to++ = b_ij_pntr[3]; | |
} | |
} | |
/* Pack a 4 x k row panel of A (leading dimension lda) into contiguous storage,
   transposed: panel column p lands at a_to[4*p .. 4*p+3]. */
void PackMatrixA_SSE(int k, float *a, int lda, float *a_to) {
    for (int p = 0; p < k; ++p) {
        a_to[0] = A(0, p);
        a_to[1] = A(1, p);
        a_to[2] = A(2, p);
        a_to[3] = A(3, p);
        a_to += 4;
    }
}
/* SSE kernel on PACKED A and B panels: C(0:4, 0:4) += A-panel * B-panel.
   `a` holds k groups of 4 floats (A column-slices), `b` holds k groups of 4
   floats (B rows); lda and ldb are unused here — kept for signature
   compatibility. A's four values per step are loaded as one vector and each
   lane is broadcast with a shuffle instead of four scalar loads. */
void AddDot4x4_SSE(int k, float *a, int lda, float *b, int ldb, float *c, int ldc) {
    __m128 c_0p_sum = _mm_setzero_ps();
    __m128 c_1p_sum = _mm_setzero_ps();
    __m128 c_2p_sum = _mm_setzero_ps();
    __m128 c_3p_sum = _mm_setzero_ps();
    for (int p = 0; p < k; ++p) {
        __m128 b_reg = _mm_loadu_ps(b); // Load 4 floats from b
        b += 4;
        __m128 a_reg = _mm_loadu_ps(a); // Load 4 floats from a
        a += 4;
        /* _MM_SHUFFLE(i,i,i,i) broadcasts lane i of a_reg = A(i, p). */
        c_0p_sum = _mm_add_ps(c_0p_sum, _mm_mul_ps(b_reg, _mm_shuffle_ps(a_reg, a_reg, _MM_SHUFFLE(0, 0, 0, 0))));
        c_1p_sum = _mm_add_ps(c_1p_sum, _mm_mul_ps(b_reg, _mm_shuffle_ps(a_reg, a_reg, _MM_SHUFFLE(1, 1, 1, 1))));
        c_2p_sum = _mm_add_ps(c_2p_sum, _mm_mul_ps(b_reg, _mm_shuffle_ps(a_reg, a_reg, _MM_SHUFFLE(2, 2, 2, 2))));
        c_3p_sum = _mm_add_ps(c_3p_sum, _mm_mul_ps(b_reg, _mm_shuffle_ps(a_reg, a_reg, _MM_SHUFFLE(3, 3, 3, 3))));
    }
    /* Store results back into C matrix, one 4-wide row at a time. */
    for (int row = 0; row < 4; ++row) {
        float* c_pntr = &C(row, 0);
        __m128 c_reg = _mm_loadu_ps(c_pntr);
        switch (row) {
            case 0: c_reg = _mm_add_ps(c_reg, c_0p_sum); break;
            case 1: c_reg = _mm_add_ps(c_reg, c_1p_sum); break;
            case 2: c_reg = _mm_add_ps(c_reg, c_2p_sum); break;
            case 3: c_reg = _mm_add_ps(c_reg, c_3p_sum); break;
        }
        _mm_storeu_ps(c_pntr, c_reg);
    }
}
void InnerKernel_SSE(int m, int n, int k, float *a, int lda, float *b, int ldb, float *c, int ldc) { | |
int i, j; | |
float packedA[m * k]; | |
float packedB[k * n]; | |
for (j = 0; j < n; j += 4) { /* Loop over the columns of C, unrolled by 4 */ | |
PackMatrixB_SSE(k, &B(0, j), ldb, packedB + j * k); | |
for (i = 0; i < m; i += 4) { /* Loop over the rows of C */ | |
if (0 == (j % 4)) { | |
PackMatrixA_SSE(k, &A(i, 0), lda, packedA + i * k); | |
} | |
AddDot4x4_SSE(k, packedA + i * k, k, packedB + j * k, 4, &C(i, j), ldc); | |
} | |
} | |
} | |
void MY_MMult_4x4_13_SSE(int m, int n, int k, float *a, int lda, float *b, int ldb, float *c, int ldc) { | |
int i, p, pb, ib; | |
for (p = 0; p < k; p += kc) { | |
pb = min(k - p, kc); | |
for (i = 0; i < m; i += mc) { | |
ib = min(m - i, mc); | |
InnerKernel_SSE(ib, n, pb, &A(i, p), lda, &B(p, 0), ldb, &C(i, 0), ldc); | |
} | |
} | |
} | |
/* Driver for MY_MMult_4x4_13_SSE; same fixes as the other drivers.
   NOTE(review): this snippet also needs #include <stdlib.h>. */
void GemmAccelerateSimd(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    /* Bug fix: C accumulates (C += A*B), so it must start zeroed. */
    float* MatriC = (float*)calloc((size_t)M*N, sizeof(float));
    if (!MatriA || !MatriB || !MatriC) {
        free(MatriA); free(MatriB); free(MatriC);
        return;
    }
    /* Bug fix: original loop header had an extra `index<M*K;` clause (not valid C). */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    MY_MMult_4x4_13_SSE(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Bug fix: free the buffers (original leaked all three). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}
# 后记
本博客目前以及可预期的将来都不会支持评论功能。各位大侠如若有指教和问题,可以在我的 github 项目 或随便一个项目下提出 issue,或者知乎 私信,并指明哪一篇博客,我看到一定及时回复,感激不尽!