# 前言

本篇文章简单介绍矩阵乘的加速方法,以学习算子加速需要注意的方面。想要学习更多内容可以参考《OpenBLAS gemm 从零入门》、《BLISlab: A Sandbox for Optimizing GEMM》、《道阻且长——再探矩阵乘法优化》、《How To Optimize GEMM》等项目或文章。

作为初学者,错误在所难免,还望不吝赐教。

# 1. 基准算法

1.基准算法

矩阵乘运算的基准算法,未经过任何优化。矩阵以行主序进行排布, 针对 X86 平台。矩阵 C= A * B,A 矩阵为 (M,K), B 矩阵为 (K,N)。

#include <stdio.h>
#include <stdlib.h> /* malloc / free */
#define A( i, j ) a[ (i)*lda + (j) ]
#define B( i, j ) b[ (i)*ldb + (j) ]
#define C( i, j ) c[ (i)*ldc + (j) ]
/* gemm C = A * B + C
   Naive row-major triple loop in i-p-j order: the innermost loop sweeps
   one row of B and one row of C contiguously. A is (m,k) with leading
   dimension lda, B is (k,n) with ldb, C is (m,n) with ldc. */
void MatrixMultiply(int m, int n, int k, float *a, int lda, float *b, int ldb, float *c, int ldc)
{
    for (int row = 0; row < m; row++) {
        for (int depth = 0; depth < k; depth++) {
            for (int col = 0; col < n; col++) {
                C(row, col) += A(row, depth) * B(depth, col);
            }
        }
    }
}
/* Demo driver: builds all-ones A (16x64) and B (64x16), zeroes C, and
   runs the baseline kernel. */
void GemmAccelerate(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    float* MatriC = (float*)malloc(sizeof(float)* M*N);
    if (MatriA == NULL || MatriB == NULL || MatriC == NULL) {
        free(MatriA);
        free(MatriB);
        free(MatriC);
        return;
    }
    /* BUG FIX: the original loop header had four clauses
       (`index<M*K; index<M*K; index++`), which is not valid C. */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    /* BUG FIX: the kernel accumulates C += A*B, so C must start at zero;
       malloc leaves the buffer uninitialized. */
    for(int index=0; index<M*N; index++){
        MatriC[index] = 0;
    }
    MatrixMultiply(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Release the demo buffers (fixes the original leak). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}

# 循环展开

2.循环展开

引入循环展开(Loop Unrolling) ,每次 A 矩阵一行和 B 四列进行计算,得到四个结果。
可以减少循环控制开销:减少了循环条件检查和更新的频率,因为现在每四个元素作为一个批次处理,理论上减少了 75% 的循环控制指令。

#include <stdio.h>
#define A(i,j) a[ (i)*lda + (j) ]
#define B(i,j) b[ (i)*ldb + (j) ]
#define C(i,j) c[ (i)*ldc + (j) ]
#define Y(i) y[ (i)*incx ]
/* Accumulate into *gamma the dot product of x (stride 1) and y (stride incx). */
void AddDot( int k, float *x, int incx,  float *y, float *gamma )
{
  for ( int idx = 0; idx < k; idx++ ){
    *gamma += x[idx] * Y(idx);
  }
}
/* C = A*B + C, with the column loop of C unrolled by 4. Each AddDot call
   pairs row i of A (contiguous, stride 1 in row-major layout) with one
   column of B, whose elements are ldb apart.
   Assumes n is a multiple of 4 — TODO confirm callers guarantee this.
   BUG FIX: the original passed `lda` as the stride for B's columns; when
   lda != ldb (e.g. the demo's K=64 vs N=16) AddDot read the wrong
   elements of B. The correct column stride is `ldb`. */
void MY_MMult2( int m, int n, int k, float *a, int lda, 
                float *b, int ldb, float *c, int ldc )
{
  int i, j;
  for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
    for ( i=0; i<m; i+=1 ){        /* Loop over the rows of C */
      AddDot( k, &A( i,0 ), ldb, &B( 0,j ), &C( i,j ) );
      AddDot( k, &A( i,0 ), ldb, &B( 0,j+1 ), &C( i,j+1 ) );
      AddDot( k, &A( i,0 ), ldb, &B( 0,j+2 ), &C( i,j+2 ) );
      AddDot( k, &A( i,0 ), ldb, &B( 0,j+3 ), &C( i,j+3 ) );
    }
  }
}
/* Demo driver: builds all-ones A (16x64) and B (64x16), zeroes C, and
   runs the loop-unrolled kernel. */
void GemmAccelerate(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    float* MatriC = (float*)malloc(sizeof(float)* M*N);
    if (MatriA == NULL || MatriB == NULL || MatriC == NULL) {
        free(MatriA);
        free(MatriB);
        free(MatriC);
        return;
    }
    /* BUG FIX: the original loop header had four clauses
       (`index<M*K; index<M*K; index++`), which is not valid C. */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    /* BUG FIX: the kernel accumulates C += A*B, so C must start at zero;
       malloc leaves the buffer uninitialized. */
    for(int index=0; index<M*N; index++){
        MatriC[index] = 0;
    }
    MY_MMult2(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Release the demo buffers (fixes the original leak). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}

# 3. 使用寄存器

中间计算结果使用寄存器存储,可以减少内存访问次数,可以显著提升性能,因为寄存器访问速度远快于内存访问。

#include <stdio.h>
#define A(i,j) a[ (i)*lda + (j) ]
#define B(i,j) b[ (i)*ldb + (j) ]
#define C(i,j) c[ (i)*ldc + (j) ]
/* Routine for computing C = A * B + C */
/* Computes one 1x4 strip: C(0, 0..3) += A(0, :) * B(:, 0..3).
   The four running sums live in registers, so each C element is
   touched exactly once, at the end. */
void AddDot1x4( int k, float *a, int lda,  float *b, int ldb, float *c, int ldc )
{
  register float acc0 = 0.0f, acc1 = 0.0f, acc2 = 0.0f, acc3 = 0.0f;
  register float a_elem;

  for ( int p = 0; p < k; p++ ){
    a_elem = A( 0, p );    /* one A element drives four column products */
    acc0 += a_elem * B( p, 0 );
    acc1 += a_elem * B( p, 1 );
    acc2 += a_elem * B( p, 2 );
    acc3 += a_elem * B( p, 3 );
  }
  C( 0, 0 ) += acc0;
  C( 0, 1 ) += acc1;
  C( 0, 2 ) += acc2;
  C( 0, 3 ) += acc3;
}
/* C = A*B + C: each AddDot1x4 call fills a 1x4 strip of C, so the column
   loop advances in steps of 4 (assumes n is a multiple of 4). */
void MY_MMult_1x4_6( int m, int n, int k, float *a, int lda, float *b, int ldb, float *c, int ldc )
{
  for ( int col = 0; col < n; col += 4 ){   /* columns of C, unrolled by 4 */
    for ( int row = 0; row < m; row++ ){    /* rows of C */
      AddDot1x4( k, &A( row, 0 ), lda, &B( 0, col ), ldb, &C( row, col ), ldc );
    }
  }
}
/* Demo driver: builds all-ones A (16x64) and B (64x16), zeroes C, and
   runs the register-accumulator kernel. */
void GemmAccelerate(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    float* MatriC = (float*)malloc(sizeof(float)* M*N);
    if (MatriA == NULL || MatriB == NULL || MatriC == NULL) {
        free(MatriA);
        free(MatriB);
        free(MatriC);
        return;
    }
    /* BUG FIX: the original loop header had four clauses
       (`index<M*K; index<M*K; index++`), which is not valid C. */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    /* BUG FIX: the kernel accumulates C += A*B, so C must start at zero;
       malloc leaves the buffer uninitialized. */
    for(int index=0; index<M*N; index++){
        MatriC[index] = 0;
    }
    MY_MMult_1x4_6(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Release the demo buffers (fixes the original leak). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}

# 4. 避免乘法

4.避免乘法

本次算法相对上一个,调整了循环展开的矩阵(从矩阵 B 变成了矩阵 A)。最内层循环中的 c_00_reg += a_0p_reg * B( p, 0 ); 变成 c_00_reg += b_0p_reg * *ap0_pntr++; ,即用指针的累加替换了原来的 B( p, 0 ) ,使得最内层循环减少了 4 个乘法。

调整循环展开矩阵的原因是 行主序,矩阵 A 在内存中是连续的。

避免乘法: 通常来讲乘法消耗的时间长于加法,效率差异取决于具体的硬件平台。减少乘法计算量可以提高算法效率。

#include <stdio.h>
#define A(i,j) a[ (i)*lda + (j) ]
#define B(i,j) b[ (i)*ldb + (j) ]
#define C(i,j) c[ (i)*ldc + (j) ]
/* Routine for computing C = A * B + C */
/* Computes a 4x1 strip: C(0..3, 0) += A(0..3, :) * B(:, 0).
   The four rows of A are walked with incrementing pointers (each row is
   contiguous in row-major layout), so the inner loop avoids index
   multiplies on the A side. */
void AddDot1x4( int k, float *a, int lda,  float *b, int ldb, float *c, int ldc )
{
  register float acc0 = 0.0f, acc1 = 0.0f, acc2 = 0.0f, acc3 = 0.0f;
  register float b_elem;
  float *row0 = &A( 0, 0 );
  float *row1 = &A( 1, 0 );
  float *row2 = &A( 2, 0 );
  float *row3 = &A( 3, 0 );

  for ( int p = 0; p < k; p++ ){
    b_elem = B( p, 0 );    /* one B element drives four row products */
    acc0 += b_elem * *row0++;
    acc1 += b_elem * *row1++;
    acc2 += b_elem * *row2++;
    acc3 += b_elem * *row3++;
  }
  C( 0, 0 ) += acc0;
  C( 1, 0 ) += acc1;
  C( 2, 0 ) += acc2;
  C( 3, 0 ) += acc3;
}
/* C = A*B + C. Columns of C are visited one at a time here; it is the
   ROWS of C that are unrolled by 4 inside AddDot1x4 (assumes m is a
   multiple of 4 — TODO confirm callers guarantee this). */
void MY_MMult_1x4_7( int m, int n, int k, float *a, int lda,  float *b, int ldb, float *c, int ldc )
{
  int i, j;
  for ( j=0; j<n; j+=1 ){        /* Loop over the columns of C, one at a time */
    for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C, unrolled by 4 */
      AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
    }
  }
}
/* Demo driver: builds all-ones A (16x64) and B (64x16), zeroes C, and
   runs the pointer-increment kernel. */
void GemmAccelerate(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    float* MatriC = (float*)malloc(sizeof(float)* M*N);
    if (MatriA == NULL || MatriB == NULL || MatriC == NULL) {
        free(MatriA);
        free(MatriB);
        free(MatriC);
        return;
    }
    /* BUG FIX: the original loop header had four clauses
       (`index<M*K; index<M*K; index++`), which is not valid C. */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    /* BUG FIX: the kernel accumulates C += A*B, so C must start at zero;
       malloc leaves the buffer uninitialized. */
    for(int index=0; index<M*N; index++){
        MatriC[index] = 0;
    }
    MY_MMult_1x4_7(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Release the demo buffers (fixes the original leak). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}

# 5. 进一步循环展开

将最内部的循环也展开,即原本的 for ( p=0; p<k; p++ ) 变成 for ( p=0; p<k; p+=4 ) ,内部计算的内容变成原来的四倍。当然也开辟了更多的寄存器。

#include <stdio.h>
#define A(i,j) a[ (i)*lda + (j) ]
#define B(i,j) b[ (i)*ldb + (j) ]
#define C(i,j) c[ (i)*ldc + (j) ]
/* Routine for computing C = A * B + C */
/* Computes a 4x1 strip: C(0..3, 0) += A(0..3, :) * B(:, 0).
   Same as the previous version but with the k-loop manually unrolled by
   4 to cut loop-control overhead. Assumes k is a multiple of 4 — TODO
   confirm callers guarantee this (the demo uses K = 64). */
void AddDot1x4( int k, float *a, int lda,  float *b, int ldb, float *c, int ldc )
{
  int p;
  register float   c_00_reg,   c_01_reg,   c_02_reg,   c_03_reg,  b_0p_reg;
  float  *ap0_pntr, *ap1_pntr, *ap2_pntr, *ap3_pntr;     
  /* Pointers walk the four contiguous rows of A. */
  ap0_pntr = &A( 0, 0 );
  ap1_pntr = &A( 1, 0 );
  ap2_pntr = &A( 2, 0 );
  ap3_pntr = &A( 3, 0 );
  c_00_reg = 0.0; 
  c_01_reg = 0.0; 
  c_02_reg = 0.0; 
  c_03_reg = 0.0;
 
  for ( p=0; p<k; p+=4 ){
    /* Iteration p: one B element drives four A-row accumulations. */
    b_0p_reg = B( p, 0 );
    c_00_reg += b_0p_reg * *ap0_pntr++;
    c_01_reg += b_0p_reg * *ap1_pntr++;
    c_02_reg += b_0p_reg * *ap2_pntr++;
    c_03_reg += b_0p_reg * *ap3_pntr++;
    /* Iteration p+1. */
    b_0p_reg = B( p+1, 0 );
    c_00_reg += b_0p_reg * *ap0_pntr++;
    c_01_reg += b_0p_reg * *ap1_pntr++;
    c_02_reg += b_0p_reg * *ap2_pntr++;
    c_03_reg += b_0p_reg * *ap3_pntr++;
    /* Iteration p+2. */
    b_0p_reg = B( p+2, 0 );
    c_00_reg += b_0p_reg * *ap0_pntr++;
    c_01_reg += b_0p_reg * *ap1_pntr++;
    c_02_reg += b_0p_reg * *ap2_pntr++;
    c_03_reg += b_0p_reg * *ap3_pntr++;
    /* Iteration p+3. */
    b_0p_reg = B( p+3, 0);
    c_00_reg += b_0p_reg * *ap0_pntr++;
    c_01_reg += b_0p_reg * *ap1_pntr++;
    c_02_reg += b_0p_reg * *ap2_pntr++;
    c_03_reg += b_0p_reg * *ap3_pntr++;
  }
  /* Write the four accumulated sums back to C exactly once. */
  C( 0, 0 ) += c_00_reg; 
  C( 1, 0 ) += c_01_reg; 
  C( 2, 0 ) += c_02_reg; 
  C( 3, 0 ) += c_03_reg;
}
/* C = A*B + C. Columns of C are visited one at a time; the ROWS of C are
   unrolled by 4 inside AddDot1x4 (assumes m is a multiple of 4 — TODO
   confirm callers guarantee this). */
void MY_MMult_1x4_8( int m, int n, int k, float *a, int lda,float *b, int ldb,float *c, int ldc )
{
  int i, j;
  for ( j=0; j<n; j+=1 ){        /* Loop over the columns of C, one at a time */
    for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C, unrolled by 4 */
      AddDot1x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
    }
  }
}
/* Demo driver: builds all-ones A (16x64) and B (64x16), zeroes C, and
   runs the fully unrolled kernel. */
void GemmAccelerate(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    float* MatriC = (float*)malloc(sizeof(float)* M*N);
    if (MatriA == NULL || MatriB == NULL || MatriC == NULL) {
        free(MatriA);
        free(MatriB);
        free(MatriC);
        return;
    }
    /* BUG FIX: the original loop header had four clauses
       (`index<M*K; index<M*K; index++`), which is not valid C. */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    /* BUG FIX: the kernel accumulates C += A*B, so C must start at zero;
       malloc leaves the buffer uninitialized. */
    for(int index=0; index<M*N; index++){
        MatriC[index] = 0;
    }
    MY_MMult_1x4_8(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Release the demo buffers (fixes the original leak). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}

# 6. 使用 SIMD 指令

6. 使用 SIMD 指令

SIMD 指令是单指令多数据,以 X86 平台的 SSE2 为例,其支持 128 位的寄存器,一次可以同时计算 4 个 float 或 32 位 int 数据,能够提高算法的并行度。

代码中的 SSE 指令 _mm_setzero_ps :将寄存器置零, _mm_loadu_ps :读取指定位置的数据到寄存器, _mm_mul_ps :乘运算, _mm_add_ps :加运算, _mm_storeu_ps :将寄存器的数据存储回内存, _mm_set_ps1 :将给定的一个浮点数复制 4 份填入寄存器。

以下代码在 M 和 N 两个方向循环展开(和 5 不同),一次性计算 16 个数据。

#include <stdio.h>
#include <emmintrin.h> // SSE2
#define A(i,j) a[ (i)*lda + (j) ]
#define B(i,j) b[ (i)*ldb + (j) ]
#define C(i,j) c[ (i)*ldc + (j) ]
/* Routine for computing C = A * B + C */
/* 4x4 SSE2 micro-kernel: C(0..3, 0..3) += A(0..3, :) * B(:, 0..3).
   Per k-iteration it loads one 4-wide row of B, broadcasts one scalar
   from each of the four A rows, and accumulates the four row sums of C
   in XMM registers. Uses unaligned loads/stores (loadu/storeu), so no
   alignment requirement on a, b, or c. */
void AddDot4x4_SSE2(int k, float *a, int lda, float *b, int ldb, float *c, int ldc){
    /* Point to the current elements in the four rows of A */
    float *a_0p_pntr = &A(0, 0);
    float *a_1p_pntr = &A(1, 0);
    float *a_2p_pntr = &A(2, 0);
    float *a_3p_pntr = &A(3, 0);
    /* One vector accumulator per row of the 4x4 C tile. */
    __m128 c_p0_sum = _mm_setzero_ps();
    __m128 c_p1_sum = _mm_setzero_ps();
    __m128 c_p2_sum = _mm_setzero_ps();
    __m128 c_p3_sum = _mm_setzero_ps();
    register float a_0p_reg, a_1p_reg, a_2p_reg, a_3p_reg;
    for (int p = 0; p < k; ++p) {
        __m128 b_reg = _mm_loadu_ps(&B(p, 0));   /* row p of B, 4 floats */
        a_0p_reg = *a_0p_pntr++;
        a_1p_reg = *a_1p_pntr++;
        a_2p_reg = *a_2p_pntr++;
        a_3p_reg = *a_3p_pntr++;
        /* Broadcast each A scalar across a vector lane-wise. */
        __m128 a_0p_vec = _mm_set_ps1(a_0p_reg);
        __m128 a_1p_vec = _mm_set_ps1(a_1p_reg);
        __m128 a_2p_vec = _mm_set_ps1(a_2p_reg);
        __m128 a_3p_vec = _mm_set_ps1(a_3p_reg);
        c_p0_sum = _mm_add_ps(c_p0_sum, _mm_mul_ps(b_reg, a_0p_vec));
        c_p1_sum = _mm_add_ps(c_p1_sum, _mm_mul_ps(b_reg, a_1p_vec));
        c_p2_sum = _mm_add_ps(c_p2_sum, _mm_mul_ps(b_reg, a_2p_vec));
        c_p3_sum = _mm_add_ps(c_p3_sum, _mm_mul_ps(b_reg, a_3p_vec));
    }
    /* Add the accumulated sums into the four rows of the C tile. */
    float *c_pntr;
    __m128 c_reg;
    c_pntr = &C(0, 0);
    c_reg = _mm_loadu_ps(c_pntr);
    c_reg = _mm_add_ps(c_reg, c_p0_sum);
    _mm_storeu_ps(c_pntr, c_reg);
    c_pntr = &C(1, 0);
    c_reg = _mm_loadu_ps(c_pntr);
    c_reg = _mm_add_ps(c_reg, c_p1_sum);
    _mm_storeu_ps(c_pntr, c_reg);
    c_pntr = &C(2, 0);
    c_reg = _mm_loadu_ps(c_pntr);
    c_reg = _mm_add_ps(c_reg, c_p2_sum);
    _mm_storeu_ps(c_pntr, c_reg);
    c_pntr = &C(3, 0);
    c_reg = _mm_loadu_ps(c_pntr);
    c_reg = _mm_add_ps(c_reg, c_p3_sum);
    _mm_storeu_ps(c_pntr, c_reg);
}
/* Drive the 4x4 SSE2 micro-kernel over C in 4x4 tiles.
   Assumes m and n are multiples of 4. */
void MY_MMult_4x4_10(int m, int n, int k, float *a, int lda, float *b, int ldb, float *c, int ldc){
    for (int col = 0; col < n; col += 4) {      /* columns of C, unrolled by 4 */
        for (int row = 0; row < m; row += 4) {  /* rows of C, unrolled by 4 */
            AddDot4x4_SSE2(k, &A(row, 0), lda, &B(0, col), ldb, &C(row, col), ldc);
        }
    }
}
/* Demo driver: builds all-ones A (16x64) and B (64x16), zeroes C, and
   runs the SSE2 kernel. */
void GemmAccelerateSimd(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    float* MatriC = (float*)malloc(sizeof(float)* M*N);
    if (MatriA == NULL || MatriB == NULL || MatriC == NULL) {
        free(MatriA);
        free(MatriB);
        free(MatriC);
        return;
    }
    /* BUG FIX: the original loop header had four clauses
       (`index<M*K; index<M*K; index++`), which is not valid C. */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    /* BUG FIX: the kernel accumulates C += A*B, so C must start at zero;
       malloc leaves the buffer uninitialized. */
    for(int index=0; index<M*N; index++){
        MatriC[index] = 0;
    }
    MY_MMult_4x4_10(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Release the demo buffers (fixes the original leak). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}

# 7. 矩阵分块

7.矩阵分块

分块技术(blocking)是一种常用的优化策略,旨在改善缓存命中率和数据局部性,进而提升计算密集型操作(如矩阵乘法)的性能。通过将大矩阵分割成较小的子矩阵(即 “块”),可以更好地利用 CPU 缓存,并减少内存访问延迟。

以下代码中, mc 定义了 m 方向(矩阵 A 行方向)上的分块大小, kc 定义了 k 方向上的分块大小。它们在整个代码中被用作循环边界或参数,用于控制矩阵分块的尺寸:将矩阵 A 分割成 256×128 的小块,将矩阵 B 分割成 128×N 的小块,然后计算两个小块的矩阵乘运算。至于如何由分块结果得到最终结果,可以参考《线性代数》知识:分块矩阵乘法。

#include <emmintrin.h> // SSE2
#include <smmintrin.h> // SSE4.1, for _mm_dp_ps if needed
#include <stdio.h>
/* Block sizes */
#define mc 256 
#define kc 128
/* Create macros so that the matrices are stored in row-major order */
#define A(i,j) a[ (i)*lda + (j) ]
#define B(i,j) b[ (i)*ldb + (j) ]
#define C(i,j) c[ (i)*ldc + (j) ]
#define min(i, j) ((i) < (j) ? (i): (j))
/* Routine for computing C = A * B + C */
/* 4x4 SSE micro-kernel: C(0..3, 0..3) += A(0..3, :) * B(:, 0..3).
   Same scheme as AddDot4x4_SSE2: load one 4-wide row of B per step,
   broadcast one scalar from each A row, and keep the four C-row sums in
   XMM accumulators. Unaligned loads/stores throughout. */
void AddDot4x4_SSE( int k, float *a, int lda, float *b, int ldb, float *c, int ldc )
{
  /* Point to the current elements in the four rows of A */
  float *a_0p_pntr = &A(0, 0);
  float *a_1p_pntr = &A(1, 0);
  float *a_2p_pntr = &A(2, 0);
  float *a_3p_pntr = &A(3, 0);
  /* One vector accumulator per row of the 4x4 C tile. */
  __m128 c_0p_sum = _mm_setzero_ps();
  __m128 c_1p_sum = _mm_setzero_ps();
  __m128 c_2p_sum = _mm_setzero_ps();
  __m128 c_3p_sum = _mm_setzero_ps();
  for (int p = 0; p < k; ++p) {
    __m128 b_reg = _mm_loadu_ps(&B(p, 0));   /* row p of B, 4 floats */
    float a_0p_reg = *a_0p_pntr++;
    float a_1p_reg = *a_1p_pntr++;
    float a_2p_reg = *a_2p_pntr++;
    float a_3p_reg = *a_3p_pntr++;
    c_0p_sum = _mm_add_ps(c_0p_sum, _mm_mul_ps(b_reg, _mm_set1_ps(a_0p_reg)));
    c_1p_sum = _mm_add_ps(c_1p_sum, _mm_mul_ps(b_reg, _mm_set1_ps(a_1p_reg)));
    c_2p_sum = _mm_add_ps(c_2p_sum, _mm_mul_ps(b_reg, _mm_set1_ps(a_2p_reg)));
    c_3p_sum = _mm_add_ps(c_3p_sum, _mm_mul_ps(b_reg, _mm_set1_ps(a_3p_reg)));
  }
  // Store results back into C matrix
  for (int row = 0; row < 4; ++row) {
    float* c_pntr = &C(row, 0);
    __m128 c_reg = _mm_loadu_ps(c_pntr);
    /* Pick the accumulator that belongs to this C row. */
    switch (row) {
      case 0: c_reg = _mm_add_ps(c_reg, c_0p_sum); break;
      case 1: c_reg = _mm_add_ps(c_reg, c_1p_sum); break;
      case 2: c_reg = _mm_add_ps(c_reg, c_2p_sum); break;
      case 3: c_reg = _mm_add_ps(c_reg, c_3p_sum); break;
    }
    _mm_storeu_ps(c_pntr, c_reg);
  }
}
/* Sweep the current cache block of C in 4x4 tiles, one 4-column strip at
   a time. Assumes m and n are multiples of 4. */
void InnerKernel_SSE( int m, int n, int k, float *a, int lda,  float *b, int ldb, float *c, int ldc ){
  int col, row;
  for ( col = 0; col < n; col += 4 ){
    for ( row = 0; row < m; row += 4 ){
      AddDot4x4_SSE( k, &A( row, 0 ), lda, &B( 0, col ), ldb, &C( row, col ), ldc );
    }
  }
}
void MY_MMult_4x4_11_SSE( int m, int n, int k, float *a, int lda,  float *b, int ldb, float *c, int ldc ) {
  for (int p = 0; p < k; p += kc) {
    int pb = min(k - p, kc);
    for (int i = 0; i < m; i += mc) {
      int ib = min(m - i, mc);
      InnerKernel_SSE(ib, n, pb, &A(i, p), lda, &B(p, 0), ldb, &C(i, 0), ldc);
    }
  }
}
/* Demo driver: builds all-ones A (16x64) and B (64x16), zeroes C, and
   runs the blocked SSE kernel. */
void GemmAccelerateSimd(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    float* MatriC = (float*)malloc(sizeof(float)* M*N);
    if (MatriA == NULL || MatriB == NULL || MatriC == NULL) {
        free(MatriA);
        free(MatriB);
        free(MatriC);
        return;
    }
    /* BUG FIX: the original loop header had four clauses
       (`index<M*K; index<M*K; index++`), which is not valid C. */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    /* BUG FIX: the kernel accumulates C += A*B, so C must start at zero;
       malloc leaves the buffer uninitialized. */
    for(int index=0; index<M*N; index++){
        MatriC[index] = 0;
    }
    MY_MMult_4x4_11_SSE(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Release the demo buffers (fixes the original leak). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}

# 8. 数据 Pack

在上面的优化中可以发现,在矩阵乘法的计算中,无论是行主序还是列主序,始终有一个矩阵的内存是没办法连续访问的。为了改善这个情况,可以执行数据 Pack,使矩阵 A 和矩阵 B 的内存变成连续的。

下面代码,在矩阵运算之前将矩阵 B 按照访问顺序重新排布,存储在 packedB 中。

是否有速度提升,需要在目标平台上实测,因为算法提升了矩阵运算效率,但是增加了 pack 时间。

#include <emmintrin.h> // SSE2
#include <smmintrin.h> // SSE4.1, for _mm_dp_ps if needed
#include <stdio.h>
/* Block sizes */
#define mc 256 
#define kc 128
/* Create macros so that the matrices are stored in row-major order */
#define A(i,j) a[ (i)*lda + (j) ]
#define B(i,j) b[ (i)*ldb + (j) ]
#define C(i,j) c[ (i)*ldc + (j) ]
#define min(i, j) ((i) < (j) ? (i): (j))
/* Routine for computing C = A * B + C */
/* Copy a k x 4 panel of B (columns 0..3 of each of the k rows starting
   at b) into contiguous storage at b_to, row after row. */
void PackMatrixB_SSE(int k, float *b, int ldb, float *b_to) {
  for (int row = 0; row < k; ++row) {
    const float *src = &B(row, 0);
    for (int col = 0; col < 4; ++col) {
      *b_to++ = src[col];
    }
  }
}
/* 4x4 SSE micro-kernel operating on a PACKED B panel:
   C(0..3, 0..3) += A(0..3, :) * packedB.
   b points at a k x 4 panel packed contiguously by PackMatrixB_SSE, so
   the ldb parameter is not used for addressing B here (b just advances
   by 4 floats per step). */
void AddDot4x4_SSE(int k, float *a, int lda, float *b, int ldb, float *c, int ldc) {
  /* Point to the current elements in the four rows of A */
  float *a_0p_pntr = &A(0, 0);
  float *a_1p_pntr = &A(1, 0);
  float *a_2p_pntr = &A(2, 0);
  float *a_3p_pntr = &A(3, 0);
  /* One vector accumulator per row of the 4x4 C tile. */
  __m128 c_0p_sum = _mm_setzero_ps();
  __m128 c_1p_sum = _mm_setzero_ps();
  __m128 c_2p_sum = _mm_setzero_ps();
  __m128 c_3p_sum = _mm_setzero_ps();
  for (int p = 0; p < k; ++p) {
    __m128 b_reg = _mm_loadu_ps(b); // Load 4 floats from b
    b += 4;                          /* packed panel: next B row follows */
    float a_0p_reg = *a_0p_pntr++;
    float a_1p_reg = *a_1p_pntr++;
    float a_2p_reg = *a_2p_pntr++;
    float a_3p_reg = *a_3p_pntr++;
    c_0p_sum = _mm_add_ps(c_0p_sum, _mm_mul_ps(b_reg, _mm_set1_ps(a_0p_reg)));
    c_1p_sum = _mm_add_ps(c_1p_sum, _mm_mul_ps(b_reg, _mm_set1_ps(a_1p_reg)));
    c_2p_sum = _mm_add_ps(c_2p_sum, _mm_mul_ps(b_reg, _mm_set1_ps(a_2p_reg)));
    c_3p_sum = _mm_add_ps(c_3p_sum, _mm_mul_ps(b_reg, _mm_set1_ps(a_3p_reg)));
  }
  // Store results back into C matrix
  for (int row = 0; row < 4; ++row) {
    float* c_pntr = &C(row, 0);
    __m128 c_reg = _mm_loadu_ps(c_pntr);
    /* Pick the accumulator that belongs to this C row. */
    switch (row) {
      case 0: c_reg = _mm_add_ps(c_reg, c_0p_sum); break;
      case 1: c_reg = _mm_add_ps(c_reg, c_1p_sum); break;
      case 2: c_reg = _mm_add_ps(c_reg, c_2p_sum); break;
      case 3: c_reg = _mm_add_ps(c_reg, c_3p_sum); break;
    }
    _mm_storeu_ps(c_pntr, c_reg);
  }
}
/* Macro-kernel over one cache block: for each 4-column strip of C, pack
   the corresponding k x 4 panel of B into contiguous memory, then run
   the packed 4x4 micro-kernel down the rows.
   Assumes m and n are multiples of 4.
   NOTE(review): packedB is a VLA of k*n floats on the stack; with large
   n this could overflow the stack — confirm block sizes stay modest. */
void InnerKernel_SSE(int m, int n, int k, float *a, int lda, float *b, int ldb, float *c, int ldc) {
  int i, j;
  float packedB[k * n];
  for (j = 0; j < n; j += 4) { /* Loop over the columns of C, unrolled by 4 */
    PackMatrixB_SSE(k, &B(0, j), ldb, &packedB[j * k]);
    for (i = 0; i < m; i += 4) { /* Loop over the rows of C */
      AddDot4x4_SSE(k, &A(i, 0), lda, &packedB[j * k], 4, &C(i, j), ldc);
    }
  }
}
void MY_MMult_4x4_12_SSE(int m, int n, int k, float *a, int lda, float *b, int ldb, float *c, int ldc) {
  int i, p, pb, ib;
  for (p = 0; p < k; p += kc) {
    pb = min(k - p, kc);
    for (i = 0; i < m; i += mc) {
      ib = min(m - i, mc);
      InnerKernel_SSE(ib, n, pb, &A(i, p), lda, &B(p, 0), ldb, &C(i, 0), ldc);
    }
  }
}
/* Demo driver: builds all-ones A (16x64) and B (64x16), zeroes C, and
   runs the B-packed blocked kernel. */
void GemmAccelerateSimd(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    float* MatriC = (float*)malloc(sizeof(float)* M*N);
    if (MatriA == NULL || MatriB == NULL || MatriC == NULL) {
        free(MatriA);
        free(MatriB);
        free(MatriC);
        return;
    }
    /* BUG FIX: the original loop header had four clauses
       (`index<M*K; index<M*K; index++`), which is not valid C. */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    /* BUG FIX: the kernel accumulates C += A*B, so C must start at zero;
       malloc leaves the buffer uninitialized. */
    for(int index=0; index<M*N; index++){
        MatriC[index] = 0;
    }
    MY_MMult_4x4_12_SSE(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Release the demo buffers (fixes the original leak). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}

# 9. 进一步 pack

将 A 和 B 两个矩阵都进行 pack。

#include <emmintrin.h> // SSE2
#include <smmintrin.h> // SSE4.1, for _mm_dp_ps if needed
#include <stdio.h>
/* Block sizes */
#define mc 256
#define kc 128
/* Create macros so that the matrices are stored in row-major order */
#define A(i,j) a[ (i)*lda + (j) ]
#define B(i,j) b[ (i)*ldb + (j) ]
#define C(i,j) c[ (i)*ldc + (j) ]
#define min(i, j) ((i) < (j) ? (i): (j))
/* Routine for computing C = A * B + C */
/* Copy a k x 4 panel of B (columns 0..3 of each of the k rows starting
   at b) into contiguous storage at b_to, row after row. */
void PackMatrixB_SSE(int k, float *b, int ldb, float *b_to) {
  for (int row = 0; row < k; ++row) {
    const float *src = &B(row, 0);
    for (int col = 0; col < 4; ++col) {
      *b_to++ = src[col];
    }
  }
}
/* Transpose-pack a 4 x k panel of A: for each column i, store the four
   row elements A(0..3, i) contiguously at a_to. */
void PackMatrixA_SSE(int k, float *a, int lda, float *a_to) {
  for (int col = 0; col < k; ++col) {
    a_to[0] = A(0, col);
    a_to[1] = A(1, col);
    a_to[2] = A(2, col);
    a_to[3] = A(3, col);
    a_to += 4;
  }
}
/* 4x4 SSE micro-kernel on fully PACKED operands: both a (4 x k panel,
   transpose-packed by PackMatrixA_SSE) and b (k x 4 panel packed by
   PackMatrixB_SSE) are contiguous, so the lda/ldb parameters are not
   used for addressing them. Per step: load 4 B floats and 4 A floats,
   broadcast each A lane with a shuffle, and accumulate the four C-row
   sums in XMM registers. */
void AddDot4x4_SSE(int k, float *a, int lda, float *b, int ldb, float *c, int ldc) {
  __m128 c_0p_sum = _mm_setzero_ps();
  __m128 c_1p_sum = _mm_setzero_ps();
  __m128 c_2p_sum = _mm_setzero_ps();
  __m128 c_3p_sum = _mm_setzero_ps();
  for (int p = 0; p < k; ++p) {
    __m128 b_reg = _mm_loadu_ps(b); // Load 4 floats from b
    b += 4;
    __m128 a_reg = _mm_loadu_ps(a); // Load 4 floats from a
    a += 4;
    /* _MM_SHUFFLE(i,i,i,i) broadcasts lane i of a_reg, i.e. A(i, p). */
    c_0p_sum = _mm_add_ps(c_0p_sum, _mm_mul_ps(b_reg, _mm_shuffle_ps(a_reg, a_reg, _MM_SHUFFLE(0, 0, 0, 0))));
    c_1p_sum = _mm_add_ps(c_1p_sum, _mm_mul_ps(b_reg, _mm_shuffle_ps(a_reg, a_reg, _MM_SHUFFLE(1, 1, 1, 1))));
    c_2p_sum = _mm_add_ps(c_2p_sum, _mm_mul_ps(b_reg, _mm_shuffle_ps(a_reg, a_reg, _MM_SHUFFLE(2, 2, 2, 2))));
    c_3p_sum = _mm_add_ps(c_3p_sum, _mm_mul_ps(b_reg, _mm_shuffle_ps(a_reg, a_reg, _MM_SHUFFLE(3, 3, 3, 3))));
  }
  // Store results back into C matrix
  for (int row = 0; row < 4; ++row) {
    float* c_pntr = &C(row, 0);
    __m128 c_reg = _mm_loadu_ps(c_pntr);
    /* Pick the accumulator that belongs to this C row. */
    switch (row) {
      case 0: c_reg = _mm_add_ps(c_reg, c_0p_sum); break;
      case 1: c_reg = _mm_add_ps(c_reg, c_1p_sum); break;
      case 2: c_reg = _mm_add_ps(c_reg, c_2p_sum); break;
      case 3: c_reg = _mm_add_ps(c_reg, c_3p_sum); break;
    }
    _mm_storeu_ps(c_pntr, c_reg);
  }
}
/* Macro-kernel over one cache block: packs a k x 4 panel of B per column
   strip, packs each 4 x k panel of A exactly once (during the first
   column strip), then calls the fully-packed 4x4 micro-kernel.
   Assumes m and n are multiples of 4.
   NOTE(review): packedA/packedB are VLAs on the stack; for large blocks
   this can overflow the stack — confirm block sizes stay modest. */
void InnerKernel_SSE(int m, int n, int k, float *a, int lda, float *b, int ldb, float *c, int ldc) {
  int i, j;
  float packedA[m * k];
  float packedB[k * n];
  for (j = 0; j < n; j += 4) { /* Loop over the columns of C, unrolled by 4 */
    PackMatrixB_SSE(k, &B(0, j), ldb, packedB + j * k);
    for (i = 0; i < m; i += 4) { /* Loop over the rows of C */
      /* BUG FIX: the original guard was `0 == (j % 4)`, which is always
         true because j advances in steps of 4, so every A panel was
         re-packed for every column strip. Pack A only on the first strip
         (j == 0); later strips reuse the same packed data. */
      if (0 == j) {
        PackMatrixA_SSE(k, &A(i, 0), lda, packedA + i * k);
      }
      AddDot4x4_SSE(k, packedA + i * k, k, packedB + j * k, 4, &C(i, j), ldc);
    }
  }
}
void MY_MMult_4x4_13_SSE(int m, int n, int k, float *a, int lda, float *b, int ldb, float *c, int ldc) {
  int i, p, pb, ib;
  for (p = 0; p < k; p += kc) {
    pb = min(k - p, kc);
    for (i = 0; i < m; i += mc) {
      ib = min(m - i, mc);
      InnerKernel_SSE(ib, n, pb, &A(i, p), lda, &B(p, 0), ldb, &C(i, 0), ldc);
    }
  }
}
/* Demo driver: builds all-ones A (16x64) and B (64x16), zeroes C, and
   runs the fully-packed blocked kernel. */
void GemmAccelerateSimd(){
    int M = 16;
    int K = 64;
    int N = 16;
    float* MatriA = (float*)malloc(sizeof(float)* M*K);
    float* MatriB = (float*)malloc(sizeof(float)* K*N);
    float* MatriC = (float*)malloc(sizeof(float)* M*N);
    if (MatriA == NULL || MatriB == NULL || MatriC == NULL) {
        free(MatriA);
        free(MatriB);
        free(MatriC);
        return;
    }
    /* BUG FIX: the original loop header had four clauses
       (`index<M*K; index<M*K; index++`), which is not valid C. */
    for(int index=0; index<M*K; index++){
        MatriA[index] = 1;
    }
    for(int index=0; index<K*N; index++){
        MatriB[index] = 1;
    }
    /* BUG FIX: the kernel accumulates C += A*B, so C must start at zero;
       malloc leaves the buffer uninitialized. */
    for(int index=0; index<M*N; index++){
        MatriC[index] = 0;
    }
    MY_MMult_4x4_13_SSE(M,N,K,MatriA,K,MatriB,N,MatriC,N);
    /* Release the demo buffers (fixes the original leak). */
    free(MatriA);
    free(MatriB);
    free(MatriC);
}

# 后记

本博客目前以及可预期的将来都不会支持评论功能。各位大侠如若有指教和问题,可以在我的 github 项目 或随便一个项目下提出 issue,或者知乎 私信,并指明哪一篇博客,我看到一定及时回复,感激不尽!

Edited on

Give me a cup of [coffee]~( ̄▽ ̄)~*

XianMu WeChat Pay

WeChat Pay

XianMu Alipay

Alipay