cbsm_pw_combine - Joejiong/buddy-mlir GitHub Wiki

// Generated from Mobilenet.mlir file
// func @pointwise_conv_2d_nhwc_hwcf_with_return_origin(%input: tensor<1x4x5x2xf32>, %filter: tensor<1x1x2x7xf32>) -> tensor<1x4x5x7xf32> {
//     %0 = linalg.init_tensor [1, 4, 5, 7] : tensor<1x4x5x7xf32>
//     %1 = linalg.conv_2d_nhwc_hwcf {
//         dilations = dense<1> : tensor<2xi64>,
//         strides = dense<1> : tensor<2xi64>
//     } ins(%input, %filter : tensor<1x4x5x2xf32>, tensor<1x1x2x7xf32>) outs(%0 : tensor<1x4x5x7xf32>) -> tensor<1x4x5x7xf32>
//     return %1 : tensor<1x4x5x7xf32>
// }

// Generated from Mobilenet.mlir file
// Dynamic-shape reference for a pointwise (1x1) convolution: zero-fills an
// accumulator tensor shaped like %output and runs linalg.conv_2d_nhwc_hwcf.
//   %input  : NHWC input tensor
//   %filter : HWCF filter tensor (intended to be 1x1 spatial for the
//             pointwise case — TODO confirm; the op accepts any filter size)
//   %output : only its dimensions are read, to size the result
// Returns the convolution result as a new tensor.
func @pointwise_conv_2d_nhwc_hwcf_with_return_origin(%input: tensor<?x?x?x?xf32>, %filter: tensor<?x?x?x?xf32>, %output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index
    %zero = arith.constant 0.00000e+00 : f32

    // Output shape (N, H, W, F) taken from the %output argument.
    %ON = tensor.dim %output, %c0 : tensor<?x?x?x?xf32> // ON
    %OH = tensor.dim %output, %c1 : tensor<?x?x?x?xf32> // OH
    %OW = tensor.dim %output, %c2 : tensor<?x?x?x?xf32> // OW
    %OF = tensor.dim %output, %c3 : tensor<?x?x?x?xf32> // OF

    // Zero-filled accumulator: conv_2d_nhwc_hwcf accumulates into outs.
    %buf = linalg.init_tensor [%ON, %OH, %OW, %OF] : tensor<?x?x?x?xf32>
    %0 = linalg.fill(%zero, %buf) : f32, tensor<?x?x?x?xf32> -> tensor<?x?x?x?xf32>

    %1 = linalg.conv_2d_nhwc_hwcf {
        dilations = dense<1> : tensor<2xi64>,
        strides = dense<1> : tensor<2xi64>
    } ins(%input, %filter : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) outs(%0 : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
    return %1 : tensor<?x?x?x?xf32>
}

// func @pointwise_conv_2d_nhwc_hwcf_with_return(%arg0: tensor<1x4x5x2xf32>, %arg1: tensor<1x1x2x7xf32>) -> tensor<1x4x5x7xf32> {
//     %0 = linalg.init_tensor [1, 4, 5, 7] : tensor<1x4x5x7xf32>
//     %1 = tensor.collapse_shape %arg0 [[0, 1, 2], [3]] : tensor<1x4x5x2xf32> into tensor<20x2xf32>
//     %2 = tensor.collapse_shape %arg1 [[0, 1, 2], [3]] : tensor<1x1x2x7xf32> into tensor<2x7xf32>
//     %3 = tensor.collapse_shape %0 [[0, 1, 2], [3]] : tensor<1x4x5x7xf32> into tensor<20x7xf32>
//     %4 = linalg.matmul ins(%1, %2 : tensor<20x2xf32>, tensor<2x7xf32>) outs(%3 : tensor<20x7xf32>) -> tensor<20x7xf32>
//     %5 = tensor.expand_shape %4 [[0, 1, 2], [3]] : tensor<20x7xf32> into tensor<1x4x5x7xf32>
//     return %5 : tensor<1x4x5x7xf32>
// }

// Generated from the IREE-processed MobileNet MLIR file
// func @pointwise_conv_2d_nhwc_hwcf(%input: memref<?x?x?x?xf32>, %filter: memref<1x1x?x?xf32>, %output: memref<?x?x?x?xf32>) {
//     linalg.conv_2d_nhwc_hwcf 
//     {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} 
//     ins(%input, %filter : memref<?x?x?x?xf32>, memref<1x1x?x?xf32>) 
//     outs(%output : memref<?x?x?x?xf32>) 
//     return
// }

// // test for specific shape
// func @pointwise_conv_2d_nhwc_hwcf_spec(%input: memref<1x4x5x2xf32>, %filter: memref<1x1x2x7xf32>, %output: memref<1x4x5x7xf32>) {
//     linalg.conv_2d_nhwc_hwcf 
//     {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} 
//     ins(%input, %filter : memref<1x4x5x2xf32>, memref<1x1x2x7xf32>) 
//     outs(%output : memref<1x4x5x7xf32>) 
//     return
// }

// #map0 = affine_map<(d0) -> (d0)>
// #map1 = affine_map<(d0) -> (d0 ceildiv 256)>
// func @pw_cbsm_conv2d_outer_func_tensor(
//     %input:  tensor<?x?x?x?xf32>,
//     %filter: tensor<?x?x?x?xf32>,
//     %output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {

//     %c0 = arith.constant 0 : index
//     %c1 = arith.constant 1 : index
//     %c2 = arith.constant 2 : index
//     %c3 = arith.constant 3 : index
    
//     %KH = tensor.dim %filter, %c0 : tensor<?x?x?x?xf32> // FH
//     %KW = tensor.dim %filter, %c1 : tensor<?x?x?x?xf32> // FW
//     %KC = tensor.dim %filter, %c2 : tensor<?x?x?x?xf32> // FC
    
//     %ON = tensor.dim %output, %c0 : tensor<?x?x?x?xf32> // ON
//     %OH = tensor.dim %output, %c1 : tensor<?x?x?x?xf32> // OH
//     %OW = tensor.dim %output, %c2 : tensor<?x?x?x?xf32> // OW

//     %OF = tensor.dim %output, %c3 : tensor<?x?x?x?xf32> // OF
    
//     // -1. out for adds out_adds_kc_tmp <1,OH,OW,1>
//     %zero = arith.constant 0.00000e+00 : f32
//     %buf = linalg.init_tensor [%ON, %OH, %OW, %OF] : tensor<?x?x?x?xf32> 
//     %buf3 = linalg.init_tensor [%OH, %OW, %OF] : tensor<?x?x?xf32> 
//     %res_of_0 = linalg.fill(%zero, %buf3) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
//     %res_on_0 = linalg.fill(%zero, %buf) : f32, tensor<?x?x?x?xf32> -> tensor<?x?x?x?xf32>

//     %res_on = affine.for %on = #map0(%c0) to #map0(%ON)   // on : 0-on(batch)
//         iter_args(%sum_on_iter = %res_on_0) -> (tensor<?x?x?x?xf32>) {
//         // iter_args binds initial values to the loop's region arguments.
//         // omp.parallel {
//         %res_of = affine.for %of = #map0(%c0) to #map0(%OF) //  step (%c1)
//             iter_args(%sum_of_iter = %res_of_0) -> (tensor<?x?x?xf32>) {
            
//             // 0. out for adds out_adds_kc_tmp <1,OH,OW,1>
//             // %out_adds_kc_tmp = arith.constant dense<0> : tensor<%ONx%OHx%OWx%OFxf32>
//             %buf1 = linalg.init_tensor [%OH, %OW] : tensor<?x?xf32> 
//             %out_adds_kc_tmp_0 = linalg.fill(%zero, %buf1) : f32, tensor<?x?xf32> -> tensor<?x?xf32>    
            
//             // iter_args binds initial values to the loop's region arguments.
//             %out_adds_kc_tmp = affine.for %kc = #map0(%c0) to #map0(%KC) 
//                 iter_args(%sum_iter = %out_adds_kc_tmp_0) -> (tensor<?x?xf32>) {
                
//                 // 1. init kc_out_tmp[OH,OW]
//                 %buf2 = linalg.init_tensor [%OH, %OW] : tensor<?x?xf32> 
//                 %output_inner = linalg.fill(%zero, %buf2) : f32, tensor<?x?xf32> -> tensor<?x?xf32>    
                
//                 // 2. slice input for cbsm
//                 // input_inner = input[on,:,:,kc]
//                 // filter_inner = filter[0,0,kc,of]
//                 // %input_inner = tensor.extract_slice %input[%on,0,0,%kc][1,%OH,%OW,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<?x?xf32>
//                 // %filter_inner = tensor.extract_slice %filter[0,0,%kc,%of][%KH,%KW,1,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<?x?xf32>

//                 %input_inner0 = tensor.extract_slice %input[%on,0,0,%kc][1,%OH,%OW,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<1x?x?x1xf32>
//                 // %filter_inner0 = tensor.extract_slice %filter[0,0,%kc,%of][%KH,%KW,1,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<?x?x1x1xf32>
//                 %filter_inner0 = tensor.extract_slice %filter[0,0,%kc,%of][1,1,1,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<1x1x1x1xf32>

//                 %input_inner1 = tensor.collapse_shape %input_inner0 [[0, 1], [2, 3]] : tensor<1x?x?x1xf32> into tensor<?x?xf32>
//                 %filter_inner1 = tensor.collapse_shape %filter_inner0 [[0, 1, 2], [3]] : tensor<1x1x1x1xf32> into tensor<1x1xf32>

//                 // 3. call conv_2d
//                 // %output_inner1 = call @conv_2d_tensor(%input_inner1, %filter_inner1, %output_inner) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
//                 %output_inner1 = linalg.conv_2d
//                     ins  (%input_inner1, %filter_inner1: tensor<?x?xf32>, tensor<1x1xf32>)
//                     outs (%output_inner: tensor<?x?xf32>) -> tensor<?x?xf32>
//                 // %dynamic_2 = tensor.cast %output_inner : tensor<?x?xf32> to tensor<*xf32>
//                 // call @print_memref_f32(%dynamic_2): (tensor<*xf32>) -> ()

//                 %out_adds_kc_tmp_next = arith.addf %output_inner1, %sum_iter : tensor<?x?xf32> 
//                 // Yield current iteration sum to next iteration %sum_iter or to %sum
//                 // if final iteration.
//                 affine.yield %out_adds_kc_tmp_next : tensor<?x?xf32> 
//             } // end-kc
//             // a += a
//             // 5. insert added kc_out to one layer of real output using: output.insert_stride_slice 2x3x1
//             %out_adds_kc_tmp_expanded = tensor.expand_shape %out_adds_kc_tmp [[0],[1,2]] : tensor<?x?xf32> into tensor<?x?x1xf32>
//             %res_next = tensor.insert_slice %out_adds_kc_tmp_expanded into %sum_of_iter[0,0,%of][%OH,%OW,1][1,1,1] : tensor<?x?x1xf32> into tensor<?x?x?xf32>
//             // res insert a into res
//             // Yield current iteration sum to next iteration %sum_iter or to %sum
//             // if final iteration.
//             affine.yield %res_next : tensor<?x?x?xf32>
//         } // end-of
//         // 1 231
//         %res_of_expand = tensor.expand_shape %res_of [[0,1],[2],[3]] : tensor<?x?x?xf32> into tensor<1x?x?x?xf32>
//         %res_on_next = tensor.insert_slice %res_of_expand into %sum_on_iter[%on,0,0,0][1,%OH,%OW,%OF][1,1,1,1] : tensor<1x?x?x?xf32> into tensor<?x?x?x?xf32>
//         // omp.yield %res_on_next : tensor<?x?x?x?xf32>
//         // }
//         affine.yield %res_on_next : tensor<?x?x?x?xf32>
//     } // end-on
//     return %res_on : tensor<?x?x?x?xf32>
// } 

#map0 = affine_map<(d0) -> (d0)>
#map = affine_map<()[s0] -> (s0 ceildiv 256)>

// Vectorized 2-D convolution kernel specialized for a 1x1 filter (the inner
// CBSM step of the pointwise convolution). Computes, in place:
//     %arg2[i, j] += %arg0[i, j] * %arg1[0, 0]
// Columns are processed in 256-wide vector chunks, with a masked load/store
// for the tail when the width is not a multiple of 256.
//   %arg0 : input plane  (rows x cols)
//   %arg1 : 1x1 filter (a single scalar coefficient)
//   %arg2 : output plane, read-modify-written in place
func @conv_2d_cbsm_memref(%arg0: memref<?x?xf32>, %arg1: memref<1x1xf32>, %arg2: memref<?x?xf32>) {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c256 = arith.constant 256 : index
    %cst = arith.constant 0.000000e+00 : f32
    // Zero vector used as the pass-through value for masked loads.
    // NOTE(review): `splat` is the pre-`vector.splat` spelling used by the
    // MLIR version this wiki targets.
    %0 = splat %cst : vector<256xf32>
    // Filter dims are statically 1x1, so the %arg4/%arg5 loops run once each.
    %1 = memref.dim %arg1, %c0 : memref<1x1xf32>
    %2 = memref.dim %arg1, %c1 : memref<1x1xf32>
    // Output dims: rows (%3) and columns (%4).
    %3 = memref.dim %arg2, %c0 : memref<?x?xf32>
    %4 = memref.dim %arg2, %c1 : memref<?x?xf32>
    affine.for %arg3 = 0 to %3 {        // output row
      affine.for %arg4 = 0 to %1 {      // filter row (always 0 for 1x1)
        affine.for %arg5 = 0 to %2 {    // filter col (always 0 for 1x1)
          // Column tiles of 256 (#map = s0 ceildiv 256).
          affine.for %arg6 = 0 to #map()[%4] {
            // Broadcast the single filter coefficient across the vector.
            %5 = affine.vector_load %arg1[%arg4, %arg5] : memref<1x1xf32>, vector<1xf32>
            %6 = vector.broadcast %5 : vector<1xf32> to vector<256xf32>
            // Remaining columns in this tile: %4 - %arg6 * 256.
            %7 = arith.muli %arg6, %c256 : index
            %8 = arith.subi %4, %7 : index
            %9 = arith.cmpi sge, %8, %c256 : index
            scf.if %9 {
              // Full tile: unmasked vector FMA of input * coeff + output.
              %10 = affine.vector_load %arg0[%arg3 + %arg4, %arg5 + %arg6 * 256] : memref<?x?xf32>, vector<256xf32>
              %11 = affine.vector_load %arg2[%arg3, %arg6 * 256] : memref<?x?xf32>, vector<256xf32>
              %12 = vector.fma %10, %6, %11 : vector<256xf32>
              affine.vector_store %12, %arg2[%arg3, %arg6 * 256] : memref<?x?xf32>, vector<256xf32>
            } else {
              // Tail tile: mask off the lanes past the end of the row.
              %10 = vector.create_mask %8 : vector<256xi1>
              %11 = arith.addi %arg3, %arg4 : index
              %12 = arith.muli %arg6, %c256 : index
              %13 = arith.addi %arg5, %12 : index
              // Masked-off lanes read the %0 zero vector (pass-through).
              %14 = vector.maskedload %arg0[%11, %13], %10, %0 : memref<?x?xf32>, vector<256xi1>, vector<256xf32> into vector<256xf32>
              %15 = vector.maskedload %arg2[%arg3, %12], %10, %0 : memref<?x?xf32>, vector<256xi1>, vector<256xf32> into vector<256xf32>
              %16 = vector.fma %14, %6, %15 : vector<256xf32>
              vector.maskedstore %arg2[%arg3, %12], %10, %16 : memref<?x?xf32>, vector<256xi1>, vector<256xf32>
            }
          }
        }
      }
    }
    return
}

// Outer driver for the CBSM pointwise convolution on dynamic tensors.
// Loop structure: for each batch index (on) and output channel (of),
// accumulate over input channels (kc) by slicing one 2-D input plane and
// one scalar filter coefficient and dispatching to @conv_2d_cbsm_memref,
// then scatter the accumulated plane back into the 4-D result.
//   %input  : NHWC input tensor
//   %filter : HWCF filter tensor (1x1 spatial assumed by the slicing below)
//   %output : only its dimensions are read, to size the result
func @pw_cbsm_conv2d_outer_func_tensor(
    %input:  tensor<?x?x?x?xf32>,
    %filter: tensor<?x?x?x?xf32>,
    %output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {

    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index
    
    // Filter dims (HWCF layout): spatial KH/KW and input-channel count KC.
    %KH = tensor.dim %filter, %c0 : tensor<?x?x?x?xf32> // FH
    %KW = tensor.dim %filter, %c1 : tensor<?x?x?x?xf32> // FW
    %KC = tensor.dim %filter, %c2 : tensor<?x?x?x?xf32> // FC
    
    // Output dims (NHWC layout).
    %ON = tensor.dim %output, %c0 : tensor<?x?x?x?xf32> // ON
    %OH = tensor.dim %output, %c1 : tensor<?x?x?x?xf32> // OH
    %OW = tensor.dim %output, %c2 : tensor<?x?x?x?xf32> // OW

    %OF = tensor.dim %output, %c3 : tensor<?x?x?x?xf32> // OF
    
    // Zero-initialized accumulators: %res_on_0 is the full 4-D result,
    // %res_of_0 is the per-batch 3-D slab rebuilt on every `of` loop.
    %zero = arith.constant 0.00000e+00 : f32
    %buf = linalg.init_tensor [%ON, %OH, %OW, %OF] : tensor<?x?x?x?xf32> 
    %buf3 = linalg.init_tensor [%OH, %OW, %OF] : tensor<?x?x?xf32> 
    %res_of_0 = linalg.fill(%zero, %buf3) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
    %res_on_0 = linalg.fill(%zero, %buf) : f32, tensor<?x?x?x?xf32> -> tensor<?x?x?x?xf32>

    %res_on = affine.for %on = #map0(%c0) to #map0(%ON)   // on : 0-on(batch)
        iter_args(%sum_on_iter = %res_on_0) -> (tensor<?x?x?x?xf32>) {
        // iter_args binds initial values to the loop's region arguments.
        %res_of = affine.for %of = #map0(%c0) to #map0(%OF) //  step (%c1)
            iter_args(%sum_of_iter = %res_of_0) -> (tensor<?x?x?xf32>) {
            
            // 0. Zero the [OH, OW] accumulator for the kc reduction below.
            %buf1 = linalg.init_tensor [%OH, %OW] : tensor<?x?xf32> 
            %out_adds_kc_tmp_0 = linalg.fill(%zero, %buf1) : f32, tensor<?x?xf32> -> tensor<?x?xf32>    
            
            // Reduction over input channels; %sum_iter carries the running sum.
            %out_adds_kc_tmp = affine.for %kc = #map0(%c0) to #map0(%KC) 
                iter_args(%sum_iter = %out_adds_kc_tmp_0) -> (tensor<?x?xf32>) {
                
                // 1. Zero scratch plane that the inner kernel accumulates into.
                %buf2 = linalg.init_tensor [%OH, %OW] : tensor<?x?xf32> 
                %output_inner = linalg.fill(%zero, %buf2) : f32, tensor<?x?xf32> -> tensor<?x?xf32>    
                
                // 2. Slice input for CBSM:
                //    input_inner  = input[on, :, :, kc]  -> [OH, OW] plane
                //    filter_inner = filter[0, 0, kc, of] -> single coefficient
                // NOTE(review): the [1,1,1,1] slice sizes hard-code a 1x1
                // spatial filter; %KH/%KW are not used here — confirm callers
                // only pass pointwise filters.

                %input_inner0 = tensor.extract_slice %input[%on,0,0,%kc][1,%OH,%OW,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<1x?x?x1xf32>
                %filter_inner0 = tensor.extract_slice %filter[0,0,%kc,%of][1,1,1,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<1x1x1x1xf32>

                // Drop the unit dims to get 2-D operands for the kernel.
                %input_inner1 = tensor.collapse_shape %input_inner0 [[0, 1], [2, 3]] : tensor<1x?x?x1xf32> into tensor<?x?xf32>
                %filter_inner1 = tensor.collapse_shape %filter_inner0 [[0, 1, 2], [3]] : tensor<1x1x1x1xf32> into tensor<1x1xf32>

                // 3. Call the vectorized 1x1 conv kernel on memref views.
                // NOTE(review): @conv_2d_cbsm_memref writes through
                // %output_inner_memref, i.e. it mutates the result of
                // bufferization.to_memref — the bufferization dialect treats
                // such writes as undefined behavior; verify this survives the
                // pipeline used here.
                %input_inner1_memref = bufferization.to_memref %input_inner1 : memref<?x?xf32>
                %filter_inner1_memref = bufferization.to_memref %filter_inner1 : memref<1x1xf32>
                %output_inner_memref = bufferization.to_memref %output_inner : memref<?x?xf32>
                
                call @conv_2d_cbsm_memref(%input_inner1_memref, %filter_inner1_memref, %output_inner_memref) : (memref<?x?xf32>, memref<1x1xf32>, memref<?x?xf32>) -> ()
                
                %output_inner1 = bufferization.to_tensor %output_inner_memref : memref<?x?xf32>

                // 4. Accumulate this channel's contribution into the sum.
                %out_adds_kc_tmp_next = arith.addf %output_inner1, %sum_iter : tensor<?x?xf32> 
                // Yield current iteration sum to next iteration %sum_iter or
                // to %out_adds_kc_tmp if final iteration.
                affine.yield %out_adds_kc_tmp_next : tensor<?x?xf32> 
            } // end-kc
            // 5. Insert the reduced [OH, OW] plane as channel %of of the
            //    per-batch slab (expand to [OH, OW, 1] first).
            %out_adds_kc_tmp_expanded = tensor.expand_shape %out_adds_kc_tmp [[0],[1,2]] : tensor<?x?xf32> into tensor<?x?x1xf32>
            %res_next = tensor.insert_slice %out_adds_kc_tmp_expanded into %sum_of_iter[0,0,%of][%OH,%OW,1][1,1,1] : tensor<?x?x1xf32> into tensor<?x?x?xf32>
            // Yield current iteration sum to next iteration %sum_of_iter or
            // to %res_of if final iteration.
            affine.yield %res_next : tensor<?x?x?xf32>
        } // end-of
        // 6. Insert the finished per-batch slab at batch index %on
        //    (expand to [1, OH, OW, OF] first).
        %res_of_expand = tensor.expand_shape %res_of [[0,1],[2],[3]] : tensor<?x?x?xf32> into tensor<1x?x?x?xf32>
        %res_on_next = tensor.insert_slice %res_of_expand into %sum_on_iter[%on,0,0,0][1,%OH,%OW,%OF][1,1,1,1] : tensor<1x?x?x?xf32> into tensor<?x?x?x?xf32>
        affine.yield %res_on_next : tensor<?x?x?x?xf32>
    } // end-on
    return %res_on : tensor<?x?x?x?xf32>
  } 

⚠️ **GitHub.com Fallback** ⚠️