// full_pw_conv_2d_nhwc_cbsm — Joejiong/buddy-mlir GitHub Wiki

// Affine maps used as loop bounds throughout this file:
//   #map0: identity — the loop bound is the given index unchanged.
//   #map1: ceiling division by 256 — number of 256-wide vector chunks
//          needed to cover a dimension of the given extent.
#map0 = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> (d0 ceildiv 256)>
module  {
  
  // External runtime helper that prints an unranked f32 memref
  // (supplied by the MLIR runner-utils library at execution time).
  func private @print_memref_f32(memref<*xf32>)

  // Allocates a dynamically-shaped 4-D f32 buffer of extent
  // (%s1, %s2, %s3, %s4), initializes every element to %f, and
  // returns it. The caller owns the buffer and must dealloc it.
  func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> memref<?x?x?x?xf32> {
    %mem = memref.alloc(%s1, %s2, %s3, %s4) : memref<?x?x?x?xf32>
    linalg.fill(%f, %mem) : f32, memref<?x?x?x?xf32>
    return %mem : memref<?x?x?x?xf32>
  }

  // Reference 2-D convolution: NHWC input x HWCF filter, stride 1,
  // dilation 1, accumulated in place into %output via the linalg named op.
  func @conv_2d_nhwc_hwcf(%input: memref<?x?x?x?xf32>, %filter: memref<?x?x?x?xf32>, %output: memref<?x?x?x?xf32>) {
    linalg.conv_2d_nhwc_hwcf
      {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
      ins(%input, %filter : memref<?x?x?x?xf32>, memref<?x?x?x?xf32>)
      outs(%output : memref<?x?x?x?xf32>)
    return
  }

  // Plain 2-D convolution on i32 tensors; the accumulated result tensor
  // (output + input (*) filter) is returned to the caller.
  func @conv_2d_tensor(%input: tensor<?x?xi32>, %filter: tensor<?x?xi32>, %output: tensor<?x?xi32>) -> tensor<?x?xi32> {
    %result = linalg.conv_2d
      ins(%input, %filter : tensor<?x?xi32>, tensor<?x?xi32>)
      outs(%output : tensor<?x?xi32>) -> tensor<?x?xi32>
    return %result : tensor<?x?xi32>
  }

  // Plain 2-D convolution on f32 memrefs, accumulating into %output in place.
  func @conv_2d_memref(%input: memref<?x?xf32>, %filter: memref<?x?xf32>, %output: memref<?x?xf32>) {
    linalg.conv_2d
      ins(%input, %filter : memref<?x?xf32>, memref<?x?xf32>)
      outs(%output : memref<?x?xf32>)
    return
  }

  // CBSM inner 2-D convolution kernel: for each filter tap (%arg4, %arg5),
  // broadcast the scalar filter value across 256 lanes and FMA it against
  // 256-wide slices of the input, accumulating into the output row.
  //   %arg0 : 2-D input image
  //   %arg1 : 2-D filter  (extents %0 x %1)
  //   %arg2 : 2-D output  (extents %2 x %3)
  // NOTE(review): all vector loads/stores are unmasked 256-wide; when the
  // output width %3 is not a multiple of 256 the last chunk reads and writes
  // past the row end — confirm that callers pad the innermost dimension.
  func @cbsm_conv_inner_func(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index

    %0 = memref.dim %arg1, %c0 : memref<?x?xf32> // filter height (H)
    %1 = memref.dim %arg1, %c1 : memref<?x?xf32> // filter width  (W)

    // These are output extents (taken from %arg2); the original FH/FW labels
    // were misleading — they are not filter dimensions.
    %2 = memref.dim %arg2, %c0 : memref<?x?xf32> // output height
    %3 = memref.dim %arg2, %c1 : memref<?x?xf32> // output width

    affine.for %arg3 = #map0(%c0) to #map0(%2) {       // output row
      affine.for %arg4 = #map0(%c0) to #map0(%0) {     // filter row h
        affine.for %arg5 = #map0(%c0) to #map0(%1) {   // filter col w
          affine.for %arg6 = #map0(%c0) to #map1(%3) { // 256-wide chunk index

            // Scalar filter tap filter[h, w], loaded as a 1-wide vector.
            %4 = affine.vector_load %arg1[%arg4, %arg5] : memref<?x?xf32>, vector<1xf32>

            // Broadcast the tap across all 256 lanes.
            %5 = vector.broadcast %4 : vector<1xf32> to vector<256xf32>

            // 256 contiguous input pixels at img[row + h, w + chunk*256].
            %6 = affine.vector_load %arg0[%arg3 + %arg4, %arg5 + %arg6 * 256] : memref<?x?xf32>, vector<256xf32>

            // Current partial sums out[row, chunk*256 .. chunk*256 + 255].
            %7 = affine.vector_load %arg2[%arg3, %arg6 * 256] : memref<?x?xf32>, vector<256xf32>

            // acc = input * tap + acc (per-lane fused multiply-add).
            %8 = vector.fma %6, %5, %7 : vector<256xf32>

            // Write the updated partial sums back in place.
            affine.vector_store %8, %arg2[%arg3, %arg6 * 256] : memref<?x?xf32>, vector<256xf32>
          }
        }
      }
    }
    return
  }

  // NOTE(review): unfinished pseudocode sketch of a point-wise CBSM conv
  // driver that would slice per-channel 2-D views and dispatch a 2-D conv on
  // each. It does NOT verify as MLIR:
  //   - both memref.subview ops are missing offsets/sizes/strides and types;
  //   - "%out_adds_kc_tmp =+ ..." is not a valid operation;
  //   - @conv_2d is not defined in this module (@conv_2d_memref is);
  //   - %output_inner is never defined and the function has no return;
  //   - the same symbol name is defined again later in the file, which MLIR
  //     rejects (duplicate symbol).
  // Kept as design notes only; see @pw_cbsm_conv2d for the working kernel.
  func @pw_cbsm_conv2d_outer_func(%input: memref<?x?x?x?xf32>, %filter: memref<?x?x?x?xf32>, %output: memref<?x?x?x?xf32>) {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index

    %KH = memref.dim %filter, %c0 : memref<?x?x?x?xf32> // filter height
    %KW = memref.dim %filter, %c1 : memref<?x?x?x?xf32> // filter width
    %KC = memref.dim %filter, %c2 : memref<?x?x?x?xf32> // filter channels

    // NOTE(review): for NHWC output, batch/height/width/features are dims
    // 0/1/2/3; the sketch reads dims 1/1/0/2 — confirm intended layout.
    %ON = memref.dim %output, %c1 : memref<?x?x?x?xf32> // ON
    %OH = memref.dim %output, %c1 : memref<?x?x?x?xf32> // OH
    %OW = memref.dim %output, %c0 : memref<?x?x?x?xf32> // OW

    %OF = memref.dim %output, %c2 : memref<?x?x?x?xf32> // OF

    affine.for %on = #map0(%c0) to #map0(%ON) {           // on : 0-on(batch)
        affine.for %of = #map0(%c0) to #map0(%OF) {          // of : 0-of
            // Intended: accumulate per-channel partial outputs <1,OH,OW,1>.
            affine.for %kc = #map0(%c0) to #map0(%KC) {          // kc : 0-kc (need to add)
                // Intended: init kc_out_tmp[OH,OW] and slice 2-D views.
                %input_inner = memref.subview %input
                %filter_inner = memref.subview %filter

                call @conv_2d(%input_inner, %filter_inner, %output_inner) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()
                %out_adds_kc_tmp =+ %output_inner
            }
            // Intended: output.insert_stride_slice of the accumulated slice.
        }
    }
  }

  // NOTE(review): verbatim second copy of the @pw_cbsm_conv2d_outer_func
  // sketch above — a duplicate symbol definition, which the MLIR verifier
  // rejects outright. Like the first copy, the body is pseudocode: the
  // memref.subview ops lack operands and types, "=+" is not an operation,
  // @conv_2d and %output_inner are undefined, and there is no return.
  // One of the two copies should be deleted; kept here only as notes.
  func @pw_cbsm_conv2d_outer_func(%input: memref<?x?x?x?xf32>, %filter: memref<?x?x?x?xf32>, %output: memref<?x?x?x?xf32>) {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index

    %KH = memref.dim %filter, %c0 : memref<?x?x?x?xf32> // filter height
    %KW = memref.dim %filter, %c1 : memref<?x?x?x?xf32> // filter width
    %KC = memref.dim %filter, %c2 : memref<?x?x?x?xf32> // filter channels

    // NOTE(review): dims 1/1/0/2 are read here; NHWC would be 0/1/2/3.
    %ON = memref.dim %output, %c1 : memref<?x?x?x?xf32> // ON
    %OH = memref.dim %output, %c1 : memref<?x?x?x?xf32> // OH
    %OW = memref.dim %output, %c0 : memref<?x?x?x?xf32> // OW

    %OF = memref.dim %output, %c2 : memref<?x?x?x?xf32> // OF

    affine.for %on = #map0(%c0) to #map0(%ON) {           // on : 0-on(batch)
        affine.for %of = #map0(%c0) to #map0(%OF) {          // of : 0-of
            // Intended: accumulate per-channel partial outputs <1,OH,OW,1>.
            affine.for %kc = #map0(%c0) to #map0(%KC) {          // kc : 0-kc (need to add)
                // Intended: init kc_out_tmp[OH,OW] and slice 2-D views.
                %input_inner = memref.subview %input
                %filter_inner = memref.subview %filter

                call @conv_2d(%input_inner, %filter_inner, %output_inner) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()
                %out_adds_kc_tmp =+ %output_inner
            }
            // Intended: output.insert_stride_slice of the accumulated slice.
        }
    }
  }

  // NOTE(review): third, tensor-based sketch reusing the same symbol name
  // (duplicate symbol — the verifier rejects it). The body is pseudocode and
  // does not parse:
  //   - "-> %output: tensor<...>" is not valid function-result syntax;
  //   - %output is used (tensor.dim) before it is defined, then redefined —
  //     an SSA violation;
  //   - tensor<%ONx%OHx%OWx%OFxf32> mixes SSA values into a static type;
  //   - "=+" is not an operation and @conv_2d's tensor result is dropped;
  //   - affine.for cannot thread tensor values without iter_args;
  //   - "return %output" is missing its type annotation.
  // Kept as design notes only; see @pw_cbsm_conv2d for the working path.
  func @pw_cbsm_conv2d_outer_func(%input: tensor<?x?x?x?xf32>, %filter: tensor<?x?x?x?xf32>) -> %output: tensor<?x?x?x?xf32> {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index

    %KH = tensor.dim %filter, %c0 : tensor<?x?x?x?xf32> // filter height
    %KW = tensor.dim %filter, %c1 : tensor<?x?x?x?xf32> // filter width
    %KC = tensor.dim %filter, %c2 : tensor<?x?x?x?xf32> // filter channels

    // NOTE(review): dims 1/1/0/2 are read here; NHWC would be 0/1/2/3.
    %ON = tensor.dim %output, %c1 : tensor<?x?x?x?xf32> // ON
    %OH = tensor.dim %output, %c1 : tensor<?x?x?x?xf32> // OH
    %OW = tensor.dim %output, %c0 : tensor<?x?x?x?xf32> // OW

    %OF = tensor.dim %output, %c2 : tensor<?x?x?x?xf32> // OF

    // Intended: zero-initialized output accumulator <ON,OH,OW,OF>.
    %output = arith.constant dense<0> : tensor<%ONx%OHx%OWx%OFxf32>

    affine.for %on = #map0(%c0) to #map0(%ON) {           // on : 0-on(batch)
        affine.for %of = #map0(%c0) to #map0(%OF) {          // of : 0-of
            // Intended: per-(on,of) accumulator over all input channels.
            %out_adds_kc_tmp = arith.constant dense<0> : tensor<%ONx%OHx%OWx%OFxf32>
            affine.for %kc = #map0(%c0) to #map0(%KC) {          // kc : 0-kc (need to add)
                // 1. init kc_out_tmp[OH,OW]
                %output_inner = arith.constant dense<0> : tensor<%OHx%OWxf32>
                // 2. slice input/filter for CBSM:
                //    input_inner  = input[on, :, :, kc]
                //    filter_inner = filter[0, 0, kc, of]
                %input_inner = tensor.extract_slice %input[%on,0,0,%kc][1,%OH,%OW,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<?x?xf32>
                %filter_inner = tensor.extract_slice %filter[0,0,%kc,%of][1,1,1,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<?x?xf32>
                // 3. call the 2-D convolution on the slices
                call @conv_2d(%input_inner, %filter_inner, %output_inner) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
                // 4. accumulate all kc contributions element-wise
                %out_adds_kc_tmp =+ %output_inner
            }
            // 5. insert the accumulated slice into the real output
            %output = tensor.insert_slice %out_adds_kc_tmp %into %output[%on,0,0,%of][1,%OH,%OW,1][1,1,1,1] : tensor<%OHx%OWxf32> into tensor<%ONx%OHx%OWxOFxf32>
        }
    }
    return %output
  }

  // Point-wise (1x1 kernel) 2-D convolution over an NHWC input with an HWCF
  // filter using the CBSM scheme: for every (batch, out-feature, in-channel)
  // triple, broadcast the scalar weight filter[0, 0, kc, of] across 256 lanes
  // and FMA it against 256-wide slices of the input, accumulating into
  // %output in place (callers pre-initialize %output, see @main).
  //
  //   %input  : NHWC image,  memref<?x?x?x?xf32>
  //   %filter : HWCF filter, memref<?x?x?x?xf32> (spatial extents 1x1)
  //   %output : NHWC result, memref<?x?x?x?xf32>, read-modify-written
  //
  // Fix: the output extents were read from the wrong dim indices (%ON from
  // dim 1, %OW from dim 0, %OF from dim 2). For NHWC they are dims 0, 1, 2
  // and 3; @main's 1x2x2x3 output confirms OF must be dim 3. The unused
  // %KH/%KW dims were dropped as dead code.
  //
  // NOTE(review): the vector loads/stores are unmasked 256-wide and run along
  // the memref's trailing (channel) dimension, while the chunk loop is over
  // the W dimension — confirm the intended memory layout and that buffers
  // are padded so a full 256-element vector stays in bounds.
  func @pw_cbsm_conv2d(%input: memref<?x?x?x?xf32>, %filter: memref<?x?x?x?xf32>, %output: memref<?x?x?x?xf32>) {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index

    // Filter input-channel count (HWCF dim 2); the 1x1 spatial extents are
    // implicit in the point-wise scheme and are not needed as loop bounds.
    %KC = memref.dim %filter, %c2 : memref<?x?x?x?xf32> // input channels

    // Output extents (NHWC).
    %ON = memref.dim %output, %c0 : memref<?x?x?x?xf32> // batch
    %OH = memref.dim %output, %c1 : memref<?x?x?x?xf32> // height
    %OW = memref.dim %output, %c2 : memref<?x?x?x?xf32> // width
    %OF = memref.dim %output, %c3 : memref<?x?x?x?xf32> // output features

    // out[n, :, :, of] += sum over kc of input[n, :, :, kc] * filter[0,0,kc,of]
    affine.for %on = #map0(%c0) to #map0(%ON) {             // batch
      affine.for %of = #map0(%c0) to #map0(%OF) {           // output feature
        affine.for %kc = #map0(%c0) to #map0(%KC) {         // input channel
          affine.for %oh = #map0(%c0) to #map0(%OH) {       // output row
            affine.for %ow_256 = #map0(%c0) to #map1(%OW) { // 256-wide chunk

              // Scalar point-wise weight filter[0, 0, kc, of] (one value per
              // (kc, of) pair), broadcast across all 256 lanes.
              %w = affine.vector_load %filter[0, 0, %kc, %of] : memref<?x?x?x?xf32>, vector<1xf32>
              %wv = vector.broadcast %w : vector<1xf32> to vector<256xf32>

              // 256 input elements of channel kc starting at column chunk*256.
              %in = affine.vector_load %input[%on, %oh, %ow_256 * 256, %kc] : memref<?x?x?x?xf32>, vector<256xf32>

              // Current partial sums for feature of at the same positions.
              %acc = affine.vector_load %output[%on, %oh, %ow_256 * 256, %of] : memref<?x?x?x?xf32>, vector<256xf32>

              // acc = in * weight + acc, stored back in place.
              %res = vector.fma %in, %wv, %acc : vector<256xf32>
              affine.vector_store %res, %output[%on, %oh, %ow_256 * 256, %of] : memref<?x?x?x?xf32>, vector<256xf32>
            }
          }
        }
      }
    }
    return
  }

  // Driver: builds identical 1x2x2x{1,3} NHWC test buffers twice, runs the
  // reference linalg convolution on one set and the point-wise CBSM
  // convolution on the other, prints both results so they can be compared,
  // then frees every allocation.
  func @main() {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index
    %c6 = arith.constant 6 : index
    %c8 = arith.constant 8 : index
    %f10 = arith.constant 10.00000e+00 : f32
    %val = arith.constant 2.00000e+00 : f32
    %zero = arith.constant 0.00000e+00 : f32

    // --- Reference path: linalg.conv_2d_nhwc_hwcf ------------------------
    // filter 1x1x1x3 (all 2.0), input 1x2x2x1 (all 2.0), output 1x2x2x3
    // (zero-initialized).
    %ref_filter = call @alloc_4d_filled_f32(%c1, %c1, %c1, %c3, %val) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
    %ref_in = call @alloc_4d_filled_f32(%c1, %c2, %c2, %c1, %val) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
    %ref_out = call @alloc_4d_filled_f32(%c1, %c2, %c2, %c3, %zero) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)

    // Mark one input pixel so the result is not uniform.
    memref.store %f10, %ref_in[%c0, %c0, %c1, %c0] : memref<?x?x?x?xf32>
    call @conv_2d_nhwc_hwcf(%ref_in, %ref_filter, %ref_out) : (memref<?x?x?x?xf32>, memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) -> ()
    %ref_out_u = memref.cast %ref_out : memref<?x?x?x?xf32> to memref<*xf32>
    call @print_memref_f32(%ref_out_u) : (memref<*xf32>) -> ()

    // --- CBSM path: @pw_cbsm_conv2d ---------------------------------------
    // Same shapes and contents as the reference path above.
    %pw_filter = call @alloc_4d_filled_f32(%c1, %c1, %c1, %c3, %val) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
    %pw_in = call @alloc_4d_filled_f32(%c1, %c2, %c2, %c1, %val) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
    %pw_out = call @alloc_4d_filled_f32(%c1, %c2, %c2, %c3, %zero) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)

    memref.store %f10, %pw_in[%c0, %c0, %c1, %c0] : memref<?x?x?x?xf32>
    call @pw_cbsm_conv2d(%pw_in, %pw_filter, %pw_out) : (memref<?x?x?x?xf32>, memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) -> ()
    %pw_out_u = memref.cast %pw_out : memref<?x?x?x?xf32> to memref<*xf32>
    call @print_memref_f32(%pw_out_u) : (memref<*xf32>) -> ()

    // Release every buffer allocated above.
    memref.dealloc %ref_filter : memref<?x?x?x?xf32>
    memref.dealloc %ref_in : memref<?x?x?x?xf32>
    memref.dealloc %ref_out : memref<?x?x?x?xf32>

    memref.dealloc %pw_filter : memref<?x?x?x?xf32>
    memref.dealloc %pw_in : memref<?x?x?x?xf32>
    memref.dealloc %pw_out : memref<?x?x?x?xf32>
    return
  }
}
// ⚠️ GitHub.com Fallback ⚠️ (wiki scraper footer, not part of the module)