with_iter_args - Joejiong/buddy-mlir GitHub Wiki

/Workspace/buddy-mlir/llvm/build/bin/mlir-opt pw_third.mlir \
    -canonicalize \
    -cse \
    -linalg-bufferize \
    -std-bufferize \
    -tensor-constant-bufferize \
    -tensor-bufferize \
    -func-bufferize \
    -finalizing-bufferize \
    -buffer-deallocation \
    -convert-linalg-to-loops \
    -lower-affine \
    -convert-scf-to-std \
    -canonicalize \
    -convert-linalg-to-llvm \
    -convert-vector-to-llvm \
    -arith-expand \
    --convert-memref-to-llvm \
    -convert-math-to-llvm \
    -convert-std-to-llvm \
    -reconcile-unrealized-casts \
    -verify-diagnostics | /Workspace/buddy-mlir/llvm/build/bin/mlir-cpu-runner \
    -e main \
    -entry-point-result=void \
    -shared-libs=/Workspace/buddy-mlir/llvm/build/lib/libmlir_runner_utils.so
#map0 = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> (d0 ceildiv 256)>
module  {

// func private @print_memref_f32(memref<*xf32>)
  func private @print_memref_f32(tensor<*xf32>) -> ()

  // Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f
  // func @init_4d_filled_f32_tensor(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> tensor<?x?x?x?xf32> {
  //   %buf = linalg.init_tensor [%s1, %s2, %s3, %s4] : tensor<?x?x?x?xf32> 
  //   %res = linalg.fill(%f, %buf) : f32, tensor<?x?x?x?xf32> -> tensor<?x?x?x?xf32>
  //   return %res : tensor<?x?x?x?xf32>
  // }

  func @pw_cbsm_conv2d_outer_func_tensor(
    %input:  tensor<?x?x?x?xf32>,
    %filter: tensor<?x?x?x?xf32>,
    %output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {

    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index
    
    %KH = tensor.dim %filter, %c0 : tensor<?x?x?x?xf32> // FH
    %KW = tensor.dim %filter, %c1 : tensor<?x?x?x?xf32> // FW
    %KC = tensor.dim %filter, %c2 : tensor<?x?x?x?xf32> // FC
    
    %ON = tensor.dim %output, %c0 : tensor<?x?x?x?xf32> // ON
    %OH = tensor.dim %output, %c1 : tensor<?x?x?x?xf32> // OH
    %OW = tensor.dim %output, %c2 : tensor<?x?x?x?xf32> // OW

    %OF = tensor.dim %output, %c3 : tensor<?x?x?x?xf32> // OF
    
    // -1. zero-filled 4-D accumulator [ON, OH, OW, OF] carried through the loops below
    %zero = arith.constant 0.00000e+00 : f32
    %buf = linalg.init_tensor [%ON, %OH, %OW, %OF] : tensor<?x?x?x?xf32> 
    %res_on_0 = linalg.fill(%zero, %buf) : f32, tensor<?x?x?x?xf32> -> tensor<?x?x?x?xf32>

    %res_on = affine.for %on = #map0(%c0) to #map0(%ON)   // %on : 0 .. ON (batch)
        iter_args(%sum_on_iter = %res_on_0) -> (tensor<?x?x?x?xf32>) {

        // iter_args binds initial values to the loop's region arguments.
        %res_of = affine.for %of = #map0(%c0) to #map0(%OF) //  step (%c1)
            iter_args(%sum_of_iter = %sum_on_iter) -> (tensor<?x?x?x?xf32>) {
            
            // 0. zero-filled [OH, OW] accumulator for the sum over input channels (kc)
            %buf1 = linalg.init_tensor [%OH, %OW] : tensor<?x?xf32> 
            %out_adds_kc_tmp_0 = linalg.fill(%zero, %buf1) : f32, tensor<?x?xf32> -> tensor<?x?xf32>    
            
            // iter_args binds initial values to the loop's region arguments.
            %out_adds_kc_tmp = affine.for %kc = #map0(%c0) to #map0(%KC) 
                iter_args(%sum_iter = %out_adds_kc_tmp_0) -> (tensor<?x?xf32>) {
                
                // 1. init kc_out_tmp[OH,OW]
                %buf2 = linalg.init_tensor [%OH, %OW] : tensor<?x?xf32> 
                %output_inner = linalg.fill(%zero, %buf2) : f32, tensor<?x?xf32> -> tensor<?x?xf32>    
                
                // 2. slice input and filter for cbsm
                // input_inner = input[on,:,:,kc]
                // filter_inner = filter[0,0,kc,of]
                // %input_inner = tensor.extract_slice %input[%on,0,0,%kc][1,%OH,%OW,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<?x?xf32>
                // %filter_inner = tensor.extract_slice %filter[0,0,%kc,%of][%KH,%KW,1,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<?x?xf32>

                %input_inner0 = tensor.extract_slice %input[%on,0,0,%kc][1,%OH,%OW,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<1x?x?x1xf32>
                %filter_inner0 = tensor.extract_slice %filter[0,0,%kc,%of][%KH,%KW,1,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<?x?x1x1xf32>
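                // Added note: extract_slice arguments are [offsets][sizes][strides];
                // here a 1 x OH x OW x 1 window of the input at batch %on and input
                // channel %kc, and a KH x KW x 1 x 1 window of the filter at input
                // channel %kc and output channel %of.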

                %input_inner1 = tensor.collapse_shape %input_inner0 [[0, 1], [2, 3]] : tensor<1x?x?x1xf32> into tensor<?x?xf32>
                %filter_inner1 = tensor.collapse_shape %filter_inner0 [[0], [1, 2, 3]] : tensor<?x?x1x1xf32> into tensor<?x?xf32>
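                // Added note: the reassociation [[0, 1], [2, 3]] folds the unit batch
                // and channel dims away (e.g. tensor<1x2x2x1xf32> -> tensor<2x2xf32>),
                // and [[0], [1, 2, 3]] keeps KH while folding KW x 1 x 1 into one dim,
                // so both slices become 2-D operands for the plain 2-D convolution below.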

                // 3. call conv_2d; capture the result: on tensors the "outs"
                //    operand only seeds the accumulation and is not updated
                //    in place.
                %conv_out = call @conv_2d_tensor(%input_inner1, %filter_inner1, %output_inner) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>

                %out_adds_kc_tmp_next = arith.addf %conv_out, %sum_iter : tensor<?x?xf32>
                // Yield the running sum to the next %kc iteration (%sum_iter),
                // or as %out_adds_kc_tmp after the final iteration.
                affine.yield %out_adds_kc_tmp_next : tensor<?x?xf32> 
            } // end-kc
            // 5. insert the accumulated kc sum into the carried 4-D result at
            //    batch %on, output channel %of, using tensor.insert_slice.
            %out_adds_kc_tmp_expanded = tensor.expand_shape %out_adds_kc_tmp [[0, 1], [2, 3]] : tensor<?x?xf32> into tensor<1x?x?x1xf32>
            %res_next = tensor.insert_slice %out_adds_kc_tmp_expanded into %sum_of_iter[%on,0,0,%of][1,%OH,%OW,1][1,1,1,1] : tensor<1x?x?x1xf32> into tensor<?x?x?x?xf32>
            // Yield the updated accumulator to the next %of iteration
            // (%sum_of_iter), or as %res_of after the final iteration.
            affine.yield %res_next : tensor<?x?x?x?xf32>
        } // end-of
        affine.yield %res_of : tensor<?x?x?x?xf32>
    } // end-on
    return %res_on : tensor<?x?x?x?xf32>
  }
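
  // A minimal, hypothetical sketch (not part of the original file) showing how
  // affine.for carries a value with iter_args: the region argument %acc starts
  // at %init, each iteration yields an updated value, and the loop op itself
  // returns the final accumulator.
  func @iter_args_sum_demo(%n : index) -> f32 {
    %c0 = arith.constant 0 : index
    %init = arith.constant 0.000000e+00 : f32
    %one = arith.constant 1.000000e+00 : f32
    %sum = affine.for %i = #map0(%c0) to #map0(%n)
        iter_args(%acc = %init) -> (f32) {
      %next = arith.addf %acc, %one : f32
      affine.yield %next : f32
    }
    return %sum : f32
  }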

  func @conv_2d_tensor(%input:  tensor<?x?xf32>,
               %filter: tensor<?x?xf32>,
               %output: tensor<?x?xf32>) -> tensor<?x?xf32> {
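    // Added note: linalg.conv_2d accumulates
    //   output[h, w] += sum over (kh, kw) of input[h + kh, w + kw] * filter[kh, kw];
    // on tensors the "outs" operand only provides the initial values and the
    // updated tensor is returned as %0, so callers must use the returned value
    // rather than the original %output.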
    %0 = linalg.conv_2d
      ins  (%input, %filter: tensor<?x?xf32>, tensor<?x?xf32>)
      outs (%output: tensor<?x?xf32>) -> tensor<?x?xf32>
    return %0 : tensor<?x?xf32>
  }

  func @conv_2d_nhwc_hwcf(
    %input:  tensor<?x?x?x?xf32>,
    %filter: tensor<?x?x?x?xf32>,
    %output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
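    // Added note: for NHWC input and HWCF filter with unit stride/dilation,
    //   out[n, h, w, f] += sum over (kh, kw, c) of in[n, h + kh, w + kw, c] * flt[kh, kw, c, f];
    // with a 1x1 filter this reduces to the per-input-channel sum that the
    // pointwise decomposition above reproduces one (kc, of) pair at a time.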
    %res = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
                            strides = dense<1> : tensor<2xi64>}
      ins (%input, %filter: tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>)
      outs (%output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
    return %res : tensor<?x?x?x?xf32>
  }

  func @main() {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index
    %c6 = arith.constant 6 : index
    %c8 = arith.constant 8 : index
    %f10 = arith.constant 10.00000e+00 : f32
    %val = arith.constant 2.00000e+00 : f32
    %zero = arith.constant 0.00000e+00 : f32

    
    // normal_conv2d_test
    // filter: 1,1,1,3 
    // in    : 1,2,2,1
    // out   : 1,2,2,3
    // %filter2D_nhwc = call @init_4d_filled_f32_tensor(%c1, %c1, %c1, %c3, %val) :(index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
    // %in2D_nhwc = call @init_4d_filled_f32_tensor(%c1, %c2, %c2, %c1, %val) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
    // %out2D_nhwc = call @init_4d_filled_f32_tensor(%c1, %c2, %c2, %c3, %zero) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
    %const_in2D_tensor = arith.constant dense<2.0> : tensor<1x2x2x1xf32>
    %const_filter2D_tensor = arith.constant dense<2.0> : tensor<1x1x1x3xf32>
    %const_out2D_tensor = arith.constant dense<0.0> : tensor<1x2x2x3xf32>

    %dynamic_const_in2D_tensor = tensor.cast %const_in2D_tensor: tensor<1x2x2x1xf32> to tensor<?x?x?x?xf32>
    %dynamic_const_filter2D_tensor = tensor.cast %const_filter2D_tensor: tensor<1x1x1x3xf32> to tensor<?x?x?x?xf32>
    %dynamic_const_out2D_tensor = tensor.cast %const_out2D_tensor: tensor<1x2x2x3xf32> to tensor<?x?x?x?xf32>

    // memref.store %f10, %in2D_nhwc[%c0, %c0, %c1, %c0] : memref<?x?x?x?xf32>
    // %res_out2D_nhwc = call @conv_2d_nhwc_hwcf(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (tensor<?x?x?x?xf32>,
    //       tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
    // %out2D_nhwc_ = tensor.cast %res_out2D_nhwc : tensor<?x?x?x?xf32> to tensor<*xf32>
    // call @print_memref_f32(%out2D_nhwc_): (tensor<*xf32>) -> ()
    %res_out2D_nhwc = call @conv_2d_nhwc_hwcf(%dynamic_const_in2D_tensor, %dynamic_const_filter2D_tensor, %dynamic_const_out2D_tensor) : (
          tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
    %dynamic_const_out2D_tensor_ = tensor.cast %res_out2D_nhwc : tensor<?x?x?x?xf32> to tensor<*xf32>
    call @print_memref_f32(%dynamic_const_out2D_tensor_): (tensor<*xf32>) -> ()
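    // Expected result (assuming the pipeline above runs cleanly): with an
    // all-2.0 1x2x2x1 input and an all-2.0 1x1x1x3 filter, every element of
    // the 1x2x2x3 output is 2.0 * 2.0 = 4.0.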
 

    // pw_conv2d_test
    // filter: 1,1,1,3 
    // in    : 1,2,2,1
    // out   : 1,2,2,3
    // %filter2D_nhwc_pw = call @init_4d_filled_f32_tensor(%c1, %c1, %c1, %c3, %val) :(index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
    // %in2D_nhwc_pw = call @init_4d_filled_f32_tensor(%c1, %c2, %c2, %c1, %val) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
    // %out2D_nhwc_pw = call @init_4d_filled_f32_tensor(%c1, %c2, %c2, %c3, %zero) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
    
    %const_in2D_tensor_pw = arith.constant dense<2.0> : tensor<1x2x2x1xf32>
    %const_filter2D_tensor_pw = arith.constant dense<2.0> : tensor<1x1x1x3xf32>
    %const_out2D_tensor_pw = arith.constant dense<0.0> : tensor<1x2x2x3xf32>

    %dynamic_const_in2D_tensor_pw = tensor.cast %const_in2D_tensor_pw: tensor<1x2x2x1xf32> to tensor<?x?x?x?xf32>
    %dynamic_const_filter2D_tensor_pw = tensor.cast %const_filter2D_tensor_pw: tensor<1x1x1x3xf32> to tensor<?x?x?x?xf32>
    %dynamic_const_out2D_tensor_pw = tensor.cast %const_out2D_tensor_pw: tensor<1x2x2x3xf32> to tensor<?x?x?x?xf32>

    // memref.store %f10, %in2D_nhwc_pw[%c0, %c0, %c1, %c0] : memref<?x?x?x?xf32>
    // call @pw_cbsm_conv2d_outer_func_tensor(%in2D_nhwc_pw, %filter2D_nhwc_pw, %out2D_nhwc_pw) : (tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
    // %out2D_nhwc_pw_ = tensor.cast %out2D_nhwc_pw : tensor<?x?x?x?xf32> to tensor<*xf32>
    // call @print_memref_f32(%out2D_nhwc_pw_): (tensor<*xf32>) -> ()
    
    %res_out2D_nhwc_pw = call @pw_cbsm_conv2d_outer_func_tensor(%dynamic_const_in2D_tensor_pw, %dynamic_const_filter2D_tensor_pw, %dynamic_const_out2D_tensor_pw) : (
         tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
    %dynamic_const_out2D_tensor_pw_ = tensor.cast %res_out2D_nhwc_pw : tensor<?x?x?x?xf32> to tensor<*xf32>
    call @print_memref_f32(%dynamic_const_out2D_tensor_pw_): (tensor<*xf32>) -> ()
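    // The pointwise (channel-by-channel) decomposition should print the same
    // 1x2x2x3 result of all 4.0 as the direct conv_2d_nhwc_hwcf call above.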
 
    return
  }
}
#map0 = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> (d0 ceildiv 256)>
module  {

// func private @print_memref_f32(memref<*xf32>)
  func private @print_memref_f32(tensor<*xf32>) -> ()

  // Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f
  // func @init_4d_filled_f32_tensor(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> tensor<?x?x?x?xf32> {
  //   %buf = linalg.init_tensor [%s1, %s2, %s3, %s4] : tensor<?x?x?x?xf32> 
  //   %res = linalg.fill(%f, %buf) : f32, tensor<?x?x?x?xf32> -> tensor<?x?x?x?xf32>
  //   return %res : tensor<?x?x?x?xf32>
  // }

  func @pw_cbsm_conv2d_outer_func_tensor(
    %input:  tensor<?x?x?x?xf32>,
    %filter: tensor<?x?x?x?xf32>,
    %output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {

    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index
    
    %KH = tensor.dim %filter, %c0 : tensor<?x?x?x?xf32> // FH
    %KW = tensor.dim %filter, %c1 : tensor<?x?x?x?xf32> // FW
    %KC = tensor.dim %filter, %c2 : tensor<?x?x?x?xf32> // FC
    
    %ON = tensor.dim %output, %c0 : tensor<?x?x?x?xf32> // ON
    %OH = tensor.dim %output, %c1 : tensor<?x?x?x?xf32> // OH
    %OW = tensor.dim %output, %c2 : tensor<?x?x?x?xf32> // OW

    %OF = tensor.dim %output, %c3 : tensor<?x?x?x?xf32> // OF
    
    // -1. zero-filled accumulators: a [OH, OW, OF] buffer carried by the of-loop
    //     and an [ON, OH, OW, OF] buffer carried by the on-loop
    %zero = arith.constant 0.00000e+00 : f32
    %buf = linalg.init_tensor [%ON, %OH, %OW, %OF] : tensor<?x?x?x?xf32> 
    %buf3 = linalg.init_tensor [%OH, %OW, %OF] : tensor<?x?x?xf32> 
    %res_of_0 = linalg.fill(%zero, %buf3) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
    %res_on_0 = linalg.fill(%zero, %buf) : f32, tensor<?x?x?x?xf32> -> tensor<?x?x?x?xf32>

    %res_on = affine.for %on = #map0(%c0) to #map0(%ON)   // %on : 0 .. ON (batch)
        iter_args(%sum_on_iter = %res_on_0) -> (tensor<?x?x?x?xf32>) {
        // iter_args binds initial values to the loop's region arguments.
        %res_of = affine.for %of = #map0(%c0) to #map0(%OF) //  step (%c1)
            iter_args(%sum_of_iter = %res_of_0) -> (tensor<?x?x?xf32>) {
            
            // 0. zero-filled [OH, OW] accumulator for the sum over input channels (kc)
            %buf1 = linalg.init_tensor [%OH, %OW] : tensor<?x?xf32> 
            %out_adds_kc_tmp_0 = linalg.fill(%zero, %buf1) : f32, tensor<?x?xf32> -> tensor<?x?xf32>    
            
            // iter_args binds initial values to the loop's region arguments.
            %out_adds_kc_tmp = affine.for %kc = #map0(%c0) to #map0(%KC) 
                iter_args(%sum_iter = %out_adds_kc_tmp_0) -> (tensor<?x?xf32>) {
                
                // 1. init kc_out_tmp[OH,OW]
                %buf2 = linalg.init_tensor [%OH, %OW] : tensor<?x?xf32> 
                %output_inner = linalg.fill(%zero, %buf2) : f32, tensor<?x?xf32> -> tensor<?x?xf32>    
                
                // 2. slice input and filter for cbsm
                // input_inner = input[on,:,:,kc]
                // filter_inner = filter[0,0,kc,of]
                // %input_inner = tensor.extract_slice %input[%on,0,0,%kc][1,%OH,%OW,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<?x?xf32>
                // %filter_inner = tensor.extract_slice %filter[0,0,%kc,%of][%KH,%KW,1,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<?x?xf32>

                %input_inner0 = tensor.extract_slice %input[%on,0,0,%kc][1,%OH,%OW,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<1x?x?x1xf32>
                %filter_inner0 = tensor.extract_slice %filter[0,0,%kc,%of][%KH,%KW,1,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<?x?x1x1xf32>

                %input_inner1 = tensor.collapse_shape %input_inner0 [[0, 1], [2, 3]] : tensor<1x?x?x1xf32> into tensor<?x?xf32>
                %filter_inner1 = tensor.collapse_shape %filter_inner0 [[0], [1, 2, 3]] : tensor<?x?x1x1xf32> into tensor<?x?xf32>

                // 3. call conv_2d; capture the result: on tensors the "outs"
                //    operand only seeds the accumulation and is not updated
                //    in place.
                %conv_out = call @conv_2d_tensor(%input_inner1, %filter_inner1, %output_inner) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>

                %out_adds_kc_tmp_next = arith.addf %conv_out, %sum_iter : tensor<?x?xf32>
                // Yield the running sum to the next %kc iteration (%sum_iter),
                // or as %out_adds_kc_tmp after the final iteration.
                affine.yield %out_adds_kc_tmp_next : tensor<?x?xf32> 
            } // end-kc
            // 5. insert the accumulated kc sum into output channel %of of the
            //    carried [OH, OW, OF] result using tensor.insert_slice.
            %out_adds_kc_tmp_expanded = tensor.expand_shape %out_adds_kc_tmp [[0],[1,2]] : tensor<?x?xf32> into tensor<?x?x1xf32>
            %res_next = tensor.insert_slice %out_adds_kc_tmp_expanded into %sum_of_iter[0,0,%of][%OH,%OW,1][1,1,1] : tensor<?x?x1xf32> into tensor<?x?x?xf32>
            // Yield the updated accumulator to the next %of iteration
            // (%sum_of_iter), or as %res_of after the final iteration.
            affine.yield %res_next : tensor<?x?x?xf32>
        } // end-of
        // expand the [OH, OW, OF] batch result back to [1, OH, OW, OF] and
        // insert it at batch %on of the carried 4-D accumulator
        %res_of_expand = tensor.expand_shape %res_of [[0,1],[2],[3]] : tensor<?x?x?xf32> into tensor<1x?x?x?xf32>
        %res_on_next = tensor.insert_slice %res_of_expand into %sum_on_iter[%on,0,0,0][1,%OH,%OW,%OF][1,1,1,1] : tensor<1x?x?x?xf32> into tensor<?x?x?x?xf32>
        affine.yield %res_on_next : tensor<?x?x?x?xf32>
    } // end-on
    return %res_on : tensor<?x?x?x?xf32>
  } 
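
  // A hypothetical sketch (not part of the original file) of two nested
  // affine.for loops that both carry a value through iter_args, in the spirit
  // of the on/of loop nest above; here the inner loop is seeded with the outer
  // loop's carry, and each level yields its updated accumulator.
  func @nested_iter_args_demo(%n : index, %m : index) -> f32 {
    %c0 = arith.constant 0 : index
    %init = arith.constant 0.000000e+00 : f32
    %one = arith.constant 1.000000e+00 : f32
    %outer = affine.for %i = #map0(%c0) to #map0(%n)
        iter_args(%acc_outer = %init) -> (f32) {
      %inner = affine.for %j = #map0(%c0) to #map0(%m)
          iter_args(%acc_inner = %acc_outer) -> (f32) {
        %next = arith.addf %acc_inner, %one : f32
        affine.yield %next : f32
      }
      affine.yield %inner : f32
    }
    return %outer : f32
  }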

  func @conv_2d_tensor(%input:  tensor<?x?xf32>,
               %filter: tensor<?x?xf32>,
               %output: tensor<?x?xf32>) -> tensor<?x?xf32> {
    %0 = linalg.conv_2d
      ins  (%input, %filter: tensor<?x?xf32>, tensor<?x?xf32>)
      outs (%output: tensor<?x?xf32>) -> tensor<?x?xf32>
    return %0 : tensor<?x?xf32>
  }

  func @conv_2d_nhwc_hwcf(
    %input:  tensor<?x?x?x?xf32>,
    %filter: tensor<?x?x?x?xf32>,
    %output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
    %res = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
                            strides = dense<1> : tensor<2xi64>}
      ins (%input, %filter: tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>)
      outs (%output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
    return %res : tensor<?x?x?x?xf32>
  }

  func @main() {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index
    %c6 = arith.constant 6 : index
    %c8 = arith.constant 8 : index
    %f10 = arith.constant 10.00000e+00 : f32
    %val = arith.constant 2.00000e+00 : f32
    %zero = arith.constant 0.00000e+00 : f32

    
    // normal_conv2d_test
    // filter: 1,1,1,3 
    // in    : 1,2,2,1
    // out   : 1,2,2,3
    // %filter2D_nhwc = call @init_4d_filled_f32_tensor(%c1, %c1, %c1, %c3, %val) :(index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
    // %in2D_nhwc = call @init_4d_filled_f32_tensor(%c1, %c2, %c2, %c1, %val) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
    // %out2D_nhwc = call @init_4d_filled_f32_tensor(%c1, %c2, %c2, %c3, %zero) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
    %const_in2D_tensor = arith.constant dense<2.0> : tensor<1x2x2x1xf32>
    %const_filter2D_tensor = arith.constant dense<2.0> : tensor<1x1x1x3xf32>
    %const_out2D_tensor = arith.constant dense<0.0> : tensor<1x2x2x3xf32>

    %dynamic_const_in2D_tensor = tensor.cast %const_in2D_tensor: tensor<1x2x2x1xf32> to tensor<?x?x?x?xf32>
    %dynamic_const_filter2D_tensor = tensor.cast %const_filter2D_tensor: tensor<1x1x1x3xf32> to tensor<?x?x?x?xf32>
    %dynamic_const_out2D_tensor = tensor.cast %const_out2D_tensor: tensor<1x2x2x3xf32> to tensor<?x?x?x?xf32>

    // memref.store %f10, %in2D_nhwc[%c0, %c0, %c1, %c0] : memref<?x?x?x?xf32>
    // %res_out2D_nhwc = call @conv_2d_nhwc_hwcf(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (tensor<?x?x?x?xf32>,
    //       tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
    // %out2D_nhwc_ = tensor.cast %res_out2D_nhwc : tensor<?x?x?x?xf32> to tensor<*xf32>
    // call @print_memref_f32(%out2D_nhwc_): (tensor<*xf32>) -> ()
    %res_out2D_nhwc = call @conv_2d_nhwc_hwcf(%dynamic_const_in2D_tensor, %dynamic_const_filter2D_tensor, %dynamic_const_out2D_tensor) : (
          tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
    %dynamic_const_out2D_tensor_ = tensor.cast %res_out2D_nhwc : tensor<?x?x?x?xf32> to tensor<*xf32>
    call @print_memref_f32(%dynamic_const_out2D_tensor_): (tensor<*xf32>) -> ()
 

    // pw_conv2d_test
    // filter: 1,1,1,3 
    // in    : 1,2,2,1
    // out   : 1,2,2,3
    // %filter2D_nhwc_pw = call @init_4d_filled_f32_tensor(%c1, %c1, %c1, %c3, %val) :(index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
    // %in2D_nhwc_pw = call @init_4d_filled_f32_tensor(%c1, %c2, %c2, %c1, %val) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
    // %out2D_nhwc_pw = call @init_4d_filled_f32_tensor(%c1, %c2, %c2, %c3, %zero) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
    
    %const_in2D_tensor_pw = arith.constant dense<2.0> : tensor<1x2x2x1xf32>
    %const_filter2D_tensor_pw = arith.constant dense<2.0> : tensor<1x1x1x3xf32>
    %const_out2D_tensor_pw = arith.constant dense<0.0> : tensor<1x2x2x3xf32>

    %dynamic_const_in2D_tensor_pw = tensor.cast %const_in2D_tensor_pw: tensor<1x2x2x1xf32> to tensor<?x?x?x?xf32>
    %dynamic_const_filter2D_tensor_pw = tensor.cast %const_filter2D_tensor_pw: tensor<1x1x1x3xf32> to tensor<?x?x?x?xf32>
    %dynamic_const_out2D_tensor_pw = tensor.cast %const_out2D_tensor_pw: tensor<1x2x2x3xf32> to tensor<?x?x?x?xf32>

    // memref.store %f10, %in2D_nhwc_pw[%c0, %c0, %c1, %c0] : memref<?x?x?x?xf32>
    // call @pw_cbsm_conv2d_outer_func_tensor(%in2D_nhwc_pw, %filter2D_nhwc_pw, %out2D_nhwc_pw) : (tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
    // %out2D_nhwc_pw_ = tensor.cast %out2D_nhwc_pw : tensor<?x?x?x?xf32> to tensor<*xf32>
    // call @print_memref_f32(%out2D_nhwc_pw_): (tensor<*xf32>) -> ()
    
    %res_out2D_nhwc_pw = call @pw_cbsm_conv2d_outer_func_tensor(%dynamic_const_in2D_tensor_pw, %dynamic_const_filter2D_tensor_pw, %dynamic_const_out2D_tensor_pw) : (
         tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
    %dynamic_const_out2D_tensor_pw_ = tensor.cast %res_out2D_nhwc_pw : tensor<?x?x?x?xf32> to tensor<*xf32>
    call @print_memref_f32(%dynamic_const_out2D_tensor_pw_): (tensor<*xf32>) -> ()
 
    return
  }
}
/Workspace/buddy-mlir/llvm/build/bin/mlir-opt pw_third.mlir \
    -canonicalize \
    -cse \
    -arith-expand \
    -convert-elementwise-to-linalg \
    -convert-linalg-to-loops \
    -lower-affine \
    -convert-scf-to-std \
    -linalg-bufferize \
    -func-bufferize \
    -tensor-constant-bufferize \
    -tensor-bufferize \
    -std-bufferize \
    -finalizing-bufferize \
    -buffer-deallocation \
    -canonicalize \
    -convert-linalg-to-llvm \
    -convert-vector-to-llvm \
    --convert-memref-to-llvm \
    -convert-math-to-llvm \
    -convert-std-to-llvm \
    -reconcile-unrealized-casts \
    -verify-diagnostics | /Workspace/buddy-mlir/llvm/build/bin/mlir-cpu-runner \
    -e main \
    -entry-point-result=void \
    -shared-libs=/Workspace/buddy-mlir/llvm/build/lib/libmlir_runner_utils.so
/Workspace/buddy-mlir/llvm/build/bin/mlir-opt pw_third.mlir \
    -convert-elementwise-to-linalg \
    -linalg-bufferize \
    -convert-linalg-to-loops \
    -lower-affine \
    -convert-vector-to-scf \
    -convert-scf-to-std \
    -func-bufferize \
    -tensor-constant-bufferize \
    -tensor-bufferize \
    -std-bufferize \
    -finalizing-bufferize \
    -canonicalize \
    -convert-linalg-to-llvm \
    -convert-vector-to-llvm \
    --convert-memref-to-llvm \
    -convert-math-to-llvm \
    -convert-std-to-llvm \
    -reconcile-unrealized-casts | /Workspace/buddy-mlir/llvm/build/bin/mlir-cpu-runner \
    -e main \
    -entry-point-result=void \
    -shared-libs=/Workspace/buddy-mlir/llvm/build/lib/libmlir_runner_utils.so