six_scf_parallel
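
This page contains the MLIR source for the `six_scf_parallel` experiment: a naive scalar `conv_2d_nhwc_hwcf` baseline and a parallelized, vectorized pointwise (1x1) convolution (`pw_cbsm_conv2d_outer_func_tensor`), both timed from `@main`. The comment block below records the `mlir-opt` / `mlir-cpu-runner` pipeline used to lower and run the file.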

// /Workspace/buddy-mlir/llvm/build/bin/mlir-opt pw_sixth_scf.mlir \
// -parallel-loop-fusion \
// -convert-vector-to-scf \
// -convert-scf-to-std \
// -func-bufferize \
// -tensor-constant-bufferize \
// -tensor-bufferize \
// -std-bufferize \
// -finalizing-bufferize \
// -canonicalize \
// -convert-linalg-to-llvm \
// -convert-vector-to-llvm \
// -convert-memref-to-llvm \
// -convert-math-to-llvm \
// -convert-std-to-llvm \
// -reconcile-unrealized-casts | /Workspace/buddy-mlir/llvm/build/bin/mlir-cpu-runner \
// -e main \
// -entry-point-result=void \
// -shared-libs=/Workspace/buddy-mlir/llvm/build/lib/libmlir_runner_utils.so \
// -shared-libs=/Workspace/buddy-mlir/llvm/build/lib/libmlir_c_runner_utils.so \
// -shared-libs=/Workspace/buddy-mlir/llvm/build/lib/libmlir_async_runtime.so

#map0 = affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3)>
#map1 = affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2 + d2)>
module  {
  func private @rtclock() -> f64
  func private @print_memref_f32(memref<*xf32>) attributes {llvm.emit_c_interface}
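  // conv_2d_cbsm_tensor: 1x1 (pointwise) convolution on a 2-D plane.
  // Output rows are processed in parallel; the columns are strip-mined into
  // 256-element vector chunks and accumulated with vector.fma.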
  func @conv_2d_cbsm_tensor(%arg0: tensor<?x?xf32>, %arg1: tensor<1x1xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c256 = arith.constant 256 : index
    %cst = arith.constant dense<0.000000e+00> : vector<256xf32>
    %0 = bufferization.to_memref %arg0 : memref<?x?xf32>
    %1 = bufferization.to_memref %arg1 : memref<1x1xf32>
    %2 = bufferization.to_memref %arg2 : memref<?x?xf32>
    %3 = memref.dim %2, %c0 : memref<?x?xf32>
    %4 = memref.dim %2, %c1 : memref<?x?xf32>
    %5 = memref.alloc(%3, %4) : memref<?x?xf32>
    %6 = memref.dim %2, %c0 : memref<?x?xf32>
    %7 = memref.dim %2, %c1 : memref<?x?xf32>

    scf.parallel (%arg3, %arg4) = (%c0, %c0) to (%6, %7) step (%c1, %c1) {
      %9 = memref.load %2[%arg3, %arg4] : memref<?x?xf32>
      memref.store %9, %5[%arg3, %arg4] : memref<?x?xf32>
    }
    // First change: the scf.parallel copy above replaces the sequential copy loops shown below.
    // scf.for %arg3 = %c0 to %6 step %c1 {
    //   scf.for %arg4 = %c0 to %7 step %c1 {
    //     %9 = memref.load %2[%arg3, %arg4] : memref<?x?xf32>
    //     memref.store %9, %5[%arg3, %arg4] : memref<?x?xf32>
    //   }
    // }
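    // Parallel over output rows; the inner loops strip-mine the columns into
    // ceil(%4 / 256) chunks, using unmasked vector ops for full chunks and
    // masked loads/stores for the final partial chunk.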
    %c0_0 = arith.constant 0 : index
    %c1_1 = arith.constant 1 : index
    scf.parallel (%arg3) = (%c0_0) to (%3) step (%c1_1) {
      %c0_2 = arith.constant 0 : index
      %c1_3 = arith.constant 1 : index
      %c1_4 = arith.constant 1 : index
      scf.for %arg4 = %c0_2 to %c1_3 step %c1_4 {
        %c0_5 = arith.constant 0 : index
        %c1_6 = arith.constant 1 : index
        %c1_7 = arith.constant 1 : index
        scf.for %arg5 = %c0_5 to %c1_6 step %c1_7 {
          %c0_8 = arith.constant 0 : index
          %c256_9 = arith.constant 256 : index
          %c0_10 = arith.constant 0 : index
          %c1_11 = arith.constant 1 : index
          %9 = arith.cmpi sle, %4, %c0_10 : index
          %10 = arith.subi %c0_10, %4 : index
          %11 = arith.subi %4, %c1_11 : index
          %12 = select %9, %10, %11 : index
          %13 = arith.divsi %12, %c256_9 : index
          %14 = arith.subi %c0_10, %13 : index
          %15 = arith.addi %13, %c1_11 : index
          %16 = select %9, %14, %15 : index
          %c1_12 = arith.constant 1 : index
          scf.for %arg6 = %c0_8 to %16 step %c1_12 {
            %17 = vector.load %1[%arg4, %arg5] : memref<1x1xf32>, vector<1xf32>
            %18 = vector.broadcast %17 : vector<1xf32> to vector<256xf32>
            %19 = arith.muli %arg6, %c256 : index
            %20 = arith.subi %4, %19 : index
            %21 = arith.cmpi sge, %20, %c256 : index
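            // A full 256-element chunk remains: use unmasked vector load/fma/store;
            // otherwise fall through to the masked tail case below.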
            scf.if %21 {
              %22 = arith.addi %arg3, %arg4 : index
              %c256_13 = arith.constant 256 : index
              %23 = arith.muli %arg6, %c256_13 : index
              %24 = arith.addi %arg5, %23 : index
              %25 = vector.load %0[%22, %24] : memref<?x?xf32>, vector<256xf32>
              %c256_14 = arith.constant 256 : index
              %26 = arith.muli %arg6, %c256_14 : index
              %27 = vector.load %5[%arg3, %26] : memref<?x?xf32>, vector<256xf32>
              %28 = vector.fma %25, %18, %27 : vector<256xf32>
              %c256_15 = arith.constant 256 : index
              %29 = arith.muli %arg6, %c256_15 : index
              vector.store %28, %5[%arg3, %29] : memref<?x?xf32>, vector<256xf32>
            } else {
              %22 = vector.create_mask %20 : vector<256xi1>
              %23 = arith.addi %arg3, %arg4 : index
              %24 = arith.muli %arg6, %c256 : index
              %25 = arith.addi %arg5, %24 : index
              %26 = vector.maskedload %0[%23, %25], %22, %cst : memref<?x?xf32>, vector<256xi1>, vector<256xf32> into vector<256xf32>
              %27 = vector.maskedload %5[%arg3, %24], %22, %cst : memref<?x?xf32>, vector<256xi1>, vector<256xf32> into vector<256xf32>
              %28 = vector.fma %26, %18, %27 : vector<256xf32>
              vector.maskedstore %5[%arg3, %24], %22, %28 : memref<?x?xf32>, vector<256xi1>, vector<256xf32>
            }
          }
        }
      }
    }
    %8 = bufferization.to_tensor %5 : memref<?x?xf32>
    return %8 : tensor<?x?xf32>
  }
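  // pw_cbsm_conv2d_outer_func_tensor: pointwise convolution over an NHWC input.
  // For each image (%arg3) and output channel (%arg5), it accumulates the
  // contributions of every input channel (%arg7) by calling conv_2d_cbsm_tensor
  // on 2-D slices, then scatters the result back through memref.subview copies.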
  func @pw_cbsm_conv2d_outer_func_tensor(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index
    %cst = arith.constant 0.000000e+00 : f32
    %0 = bufferization.to_memref %arg0 : memref<?x?x?x?xf32>
    %1 = bufferization.to_memref %arg1 : memref<?x?x?x?xf32>
    %2 = tensor.dim %arg1, %c2 : tensor<?x?x?x?xf32>
    %3 = tensor.dim %arg2, %c0 : tensor<?x?x?x?xf32>
    %4 = tensor.dim %arg2, %c1 : tensor<?x?x?x?xf32>
    %5 = tensor.dim %arg2, %c2 : tensor<?x?x?x?xf32>
    %6 = tensor.dim %arg2, %c3 : tensor<?x?x?x?xf32>
    %7 = memref.alloc(%3, %4, %5, %6) : memref<?x?x?x?xf32>
    %8 = memref.alloc(%4, %5, %6) : memref<?x?x?xf32>
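    // Zero-initialize the per-image accumulation buffer (%8) and the full
    // output buffer (%7).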
    scf.for %arg3 = %c0 to %4 step %c1 {
      scf.for %arg4 = %c0 to %5 step %c1 {
        scf.for %arg5 = %c0 to %6 step %c1 {
          memref.store %cst, %8[%arg3, %arg4, %arg5] : memref<?x?x?xf32>
        }
      }
    }
    %9 = bufferization.to_tensor %8 : memref<?x?x?xf32>
    scf.for %arg3 = %c0 to %3 step %c1 {
      scf.for %arg4 = %c0 to %4 step %c1 {
        scf.for %arg5 = %c0 to %5 step %c1 {
          scf.for %arg6 = %c0 to %6 step %c1 {
            memref.store %cst, %7[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x?xf32>
          }
        }
      }
    }
    %10 = bufferization.to_tensor %7 : memref<?x?x?x?xf32>
    %c0_0 = arith.constant 0 : index
    %c1_1 = arith.constant 1 : index
    %11 = scf.for %arg3 = %c0_0 to %3 step %c1_1 iter_args(%arg4 = %10) -> (tensor<?x?x?x?xf32>) {
      %12 = bufferization.to_memref %arg4 : memref<?x?x?x?xf32>
      %c0_2 = arith.constant 0 : index
      %c1_3 = arith.constant 1 : index
      %13 = scf.for %arg5 = %c0_2 to %6 step %c1_3 iter_args(%arg6 = %9) -> (tensor<?x?x?xf32>) {
        %30 = bufferization.to_memref %arg6 : memref<?x?x?xf32>
        %31 = memref.alloc(%4, %5) : memref<?x?xf32>
        // Third change: the commented-out sequential zero-initialization loops below were replaced by the scf.parallel loop that follows.
        // scf.for %arg7 = %c0 to %4 step %c1 {
        //   scf.for %arg8 = %c0 to %5 step %c1 {
        //     memref.store %cst, %31[%arg7, %arg8] : memref<?x?xf32>
        //   }
        // }
        scf.parallel (%arg7, %arg8) = (%c0, %c0) to (%4, %5) step (%c1, %c1) {
          memref.store %cst, %31[%arg7, %arg8] : memref<?x?xf32>
        }

        %32 = bufferization.to_tensor %31 : memref<?x?xf32>
        %c0_4 = arith.constant 0 : index
        %c1_5 = arith.constant 1 : index
        %33 = scf.for %arg7 = %c0_4 to %2 step %c1_5 iter_args(%arg8 = %32) -> (tensor<?x?xf32>) {
          %47 = bufferization.to_memref %arg8 : memref<?x?xf32>
          %48 = memref.alloc(%4, %5) : memref<?x?xf32>
          scf.for %arg9 = %c0 to %4 step %c1 {
            scf.for %arg10 = %c0 to %5 step %c1 {
              memref.store %cst, %48[%arg9, %arg10] : memref<?x?xf32>
            }
          }
          %49 = bufferization.to_tensor %48 : memref<?x?xf32>
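          // Copy the (%arg3, %arg7) input plane and the single (%arg7, %arg5)
          // filter element into contiguous buffers before collapsing them to 2-D.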
          %50 = memref.alloc(%4, %5) : memref<1x?x?x1xf32>
          %51 = memref.subview %0[%arg3, 0, 0, %arg7] [1, %4, %5, 1] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<1x?x?x1xf32, #map0>
          scf.for %arg9 = %c0 to %c1 step %c1 {
            scf.for %arg10 = %c0 to %4 step %c1 {
              scf.for %arg11 = %c0 to %5 step %c1 {
                scf.for %arg12 = %c0 to %c1 step %c1 {
                  %66 = memref.load %51[%arg9, %arg10, %arg11, %arg12] : memref<1x?x?x1xf32, #map0>
                  memref.store %66, %50[%arg9, %arg10, %arg11, %arg12] : memref<1x?x?x1xf32>
                }
              }
            }
          }
          %52 = memref.alloc() : memref<1x1x1x1xf32>
          %53 = memref.subview %1[0, 0, %arg7, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<1x1x1x1xf32, #map0>
          scf.for %arg9 = %c0 to %c1 step %c1 {
            scf.for %arg10 = %c0 to %c1 step %c1 {
              scf.for %arg11 = %c0 to %c1 step %c1 {
                scf.for %arg12 = %c0 to %c1 step %c1 {
                  %66 = memref.load %53[%arg9, %arg10, %arg11, %arg12] : memref<1x1x1x1xf32, #map0>
                  memref.store %66, %52[%arg9, %arg10, %arg11, %arg12] : memref<1x1x1x1xf32>
                }
              }
            }
          }
          %54 = memref.collapse_shape %50 [[0, 1], [2, 3]] : memref<1x?x?x1xf32> into memref<?x?xf32>
          %55 = bufferization.to_tensor %54 : memref<?x?xf32>
          %56 = memref.collapse_shape %52 [[0, 1, 2], [3]] : memref<1x1x1x1xf32> into memref<1x1xf32>
          %57 = bufferization.to_tensor %56 : memref<1x1xf32>
          %58 = call @conv_2d_cbsm_tensor(%55, %57, %49) : (tensor<?x?xf32>, tensor<1x1xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
          %59 = bufferization.to_memref %58 : memref<?x?xf32>
          %60 = memref.dim %59, %c0 : memref<?x?xf32>
          %61 = memref.dim %59, %c1 : memref<?x?xf32>
          %62 = memref.alloc(%60, %61) : memref<?x?xf32>
          %63 = memref.dim %59, %c0 : memref<?x?xf32>
          %64 = memref.dim %59, %c1 : memref<?x?xf32>
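          // Add this input channel's partial result (%59) to the running sum (%47)
          // carried by the iter_args of the channel loop.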
          
          scf.for %arg9 = %c0 to %63 step %c1 {
            scf.for %arg10 = %c0 to %64 step %c1 {
              %66 = memref.load %59[%arg9, %arg10] : memref<?x?xf32>
              %67 = memref.load %47[%arg9, %arg10] : memref<?x?xf32>
              %68 = arith.addf %66, %67 : f32
              memref.store %68, %62[%arg9, %arg10] : memref<?x?xf32>
            }
          }
          %65 = bufferization.to_tensor %62 : memref<?x?xf32>
          scf.yield %65 : tensor<?x?xf32>
        }
        %34 = bufferization.to_memref %33 : memref<?x?xf32>
        %35 = memref.expand_shape %34 [[0], [1, 2]] : memref<?x?xf32> into memref<?x?x1xf32>
        %36 = memref.dim %30, %c0 : memref<?x?x?xf32>
        %37 = memref.dim %30, %c1 : memref<?x?x?xf32>
        %38 = memref.dim %30, %c2 : memref<?x?x?xf32>
        %39 = memref.alloc(%36, %37, %38) : memref<?x?x?xf32>
        %40 = memref.dim %30, %c0 : memref<?x?x?xf32>
        %41 = memref.dim %30, %c1 : memref<?x?x?xf32>
        %42 = memref.dim %30, %c2 : memref<?x?x?xf32>
        scf.for %arg7 = %c0 to %40 step %c1 {
          scf.for %arg8 = %c0 to %41 step %c1 {
            scf.for %arg9 = %c0 to %42 step %c1 {
              %47 = memref.load %30[%arg7, %arg8, %arg9] : memref<?x?x?xf32>
              memref.store %47, %39[%arg7, %arg8, %arg9] : memref<?x?x?xf32>
            }
          }
        }
        %43 = memref.subview %39[0, 0, %arg5] [%4, %5, 1] [1, 1, 1] : memref<?x?x?xf32> to memref<?x?x1xf32, #map1>
        %44 = memref.dim %35, %c0 : memref<?x?x1xf32>
        %45 = memref.dim %35, %c1 : memref<?x?x1xf32>
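        // Write the accumulated plane into output channel %arg5 of the per-image
        // buffer through the strided subview %43.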
        scf.for %arg7 = %c0 to %44 step %c1 {
          scf.for %arg8 = %c0 to %45 step %c1 {
            scf.for %arg9 = %c0 to %c1 step %c1 {
              %47 = memref.load %35[%arg7, %arg8, %arg9] : memref<?x?x1xf32>
              memref.store %47, %43[%arg7, %arg8, %arg9] : memref<?x?x1xf32, #map1>
            }
          }
        }
        %46 = bufferization.to_tensor %39 : memref<?x?x?xf32>
        scf.yield %46 : tensor<?x?x?xf32>
      }
      %14 = bufferization.to_memref %13 : memref<?x?x?xf32>
      %15 = memref.expand_shape %14 [[0, 1], [2], [3]] : memref<?x?x?xf32> into memref<1x?x?x?xf32>
      %16 = memref.dim %12, %c0 : memref<?x?x?x?xf32>
      %17 = memref.dim %12, %c1 : memref<?x?x?x?xf32>
      %18 = memref.dim %12, %c2 : memref<?x?x?x?xf32>
      %19 = memref.dim %12, %c3 : memref<?x?x?x?xf32>
      %20 = memref.alloc(%16, %17, %18, %19) : memref<?x?x?x?xf32>
      %21 = memref.dim %12, %c0 : memref<?x?x?x?xf32>
      %22 = memref.dim %12, %c1 : memref<?x?x?x?xf32>
      %23 = memref.dim %12, %c2 : memref<?x?x?x?xf32>
      %24 = memref.dim %12, %c3 : memref<?x?x?x?xf32>
      scf.for %arg5 = %c0 to %21 step %c1 {
        scf.for %arg6 = %c0 to %22 step %c1 {
          scf.for %arg7 = %c0 to %23 step %c1 {
            scf.for %arg8 = %c0 to %24 step %c1 {
              %30 = memref.load %12[%arg5, %arg6, %arg7, %arg8] : memref<?x?x?x?xf32>
              memref.store %30, %20[%arg5, %arg6, %arg7, %arg8] : memref<?x?x?x?xf32>
            }
          }
        }
      }
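      // Copy the per-image result (%15) into batch position %arg3 of the output
      // buffer through the subview %25.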
      %25 = memref.subview %20[%arg3, 0, 0, 0] [1, %4, %5, %6] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<1x?x?x?xf32, #map0>
      %26 = memref.dim %15, %c1 : memref<1x?x?x?xf32>
      %27 = memref.dim %15, %c2 : memref<1x?x?x?xf32>
      %28 = memref.dim %15, %c3 : memref<1x?x?x?xf32>
      scf.for %arg5 = %c0 to %c1 step %c1 {
        scf.for %arg6 = %c0 to %26 step %c1 {
          scf.for %arg7 = %c0 to %27 step %c1 {
            scf.for %arg8 = %c0 to %28 step %c1 {
              %30 = memref.load %15[%arg5, %arg6, %arg7, %arg8] : memref<1x?x?x?xf32>
              memref.store %30, %25[%arg5, %arg6, %arg7, %arg8] : memref<1x?x?x?xf32, #map0>
            }
          }
        }
      }
      %29 = bufferization.to_tensor %20 : memref<?x?x?x?xf32>
      scf.yield %29 : tensor<?x?x?x?xf32>
    }
    return %11 : tensor<?x?x?x?xf32>
  }
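  // conv_2d_nhwc_hwcf: naive scalar reference convolution (7-deep loop nest)
  // used as the timing baseline in @main.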
  func @conv_2d_nhwc_hwcf(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index
    %0 = bufferization.to_memref %arg0 : memref<?x?x?x?xf32>
    %1 = bufferization.to_memref %arg1 : memref<?x?x?x?xf32>
    %2 = bufferization.to_memref %arg2 : memref<?x?x?x?xf32>
    %3 = memref.dim %2, %c0 : memref<?x?x?x?xf32>
    %4 = memref.dim %2, %c1 : memref<?x?x?x?xf32>
    %5 = memref.dim %2, %c2 : memref<?x?x?x?xf32>
    %6 = memref.dim %2, %c3 : memref<?x?x?x?xf32>
    %7 = memref.alloc(%3, %4, %5, %6) : memref<?x?x?x?xf32>
    %8 = memref.dim %2, %c0 : memref<?x?x?x?xf32>
    %9 = memref.dim %2, %c1 : memref<?x?x?x?xf32>
    %10 = memref.dim %2, %c2 : memref<?x?x?x?xf32>
    %11 = memref.dim %2, %c3 : memref<?x?x?x?xf32>
    scf.for %arg3 = %c0 to %8 step %c1 {
      scf.for %arg4 = %c0 to %9 step %c1 {
        scf.for %arg5 = %c0 to %10 step %c1 {
          scf.for %arg6 = %c0 to %11 step %c1 {
            %18 = memref.load %2[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x?xf32>
            memref.store %18, %7[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x?xf32>
          }
        }
      }
    }
    %12 = memref.dim %0, %c0 : memref<?x?x?x?xf32>
    %13 = memref.dim %0, %c3 : memref<?x?x?x?xf32>
    %14 = memref.dim %1, %c0 : memref<?x?x?x?xf32>
    %15 = memref.dim %1, %c1 : memref<?x?x?x?xf32>
    %16 = memref.dim %1, %c3 : memref<?x?x?x?xf32>
    scf.for %arg3 = %c0 to %12 step %c1 {
      scf.for %arg4 = %c0 to %4 step %c1 {
        scf.for %arg5 = %c0 to %5 step %c1 {
          scf.for %arg6 = %c0 to %16 step %c1 {
            scf.for %arg7 = %c0 to %14 step %c1 {
              scf.for %arg8 = %c0 to %15 step %c1 {
                scf.for %arg9 = %c0 to %13 step %c1 {
                  %18 = arith.addi %arg4, %arg7 : index
                  %19 = arith.addi %arg5, %arg8 : index
                  %20 = memref.load %0[%arg3, %18, %19, %arg9] : memref<?x?x?x?xf32>
                  %21 = memref.load %1[%arg7, %arg8, %arg9, %arg6] : memref<?x?x?x?xf32>
                  %22 = memref.load %7[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x?xf32>
                  %23 = arith.mulf %20, %21 : f32
                  %24 = arith.addf %22, %23 : f32
                  memref.store %24, %7[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x?xf32>
                }
              }
            }
          }
        }
      }
    }
    %17 = bufferization.to_tensor %7 : memref<?x?x?x?xf32>
    return %17 : tensor<?x?x?x?xf32>
  }
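  // conv_2d_1x1: static-shape 1x1 convolution expressed as a 20x2 * 2x7 matmul
  // after collapsing the NHWC tensors; defined here but not called from @main.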
  func @conv_2d_1x1(%arg0: tensor<1x4x5x2xf32>, %arg1: tensor<1x1x2x7xf32>) -> tensor<1x4x5x7xf32> {
    %c20 = arith.constant 20 : index
    %c7 = arith.constant 7 : index
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %0 = bufferization.to_memref %arg0 : memref<1x4x5x2xf32>
    %1 = bufferization.to_memref %arg1 : memref<1x1x2x7xf32>
    %2 = memref.alloc() : memref<1x4x5x7xf32>
    %3 = memref.collapse_shape %0 [[0, 1, 2], [3]] : memref<1x4x5x2xf32> into memref<20x2xf32>
    %4 = memref.collapse_shape %1 [[0, 1, 2], [3]] : memref<1x1x2x7xf32> into memref<2x7xf32>
    %5 = memref.collapse_shape %2 [[0, 1, 2], [3]] : memref<1x4x5x7xf32> into memref<20x7xf32>
    %6 = memref.alloc() : memref<20x7xf32>
    scf.for %arg2 = %c0 to %c20 step %c1 {
      scf.for %arg3 = %c0 to %c7 step %c1 {
        %9 = memref.load %5[%arg2, %arg3] : memref<20x7xf32>
        memref.store %9, %6[%arg2, %arg3] : memref<20x7xf32>
      }
    }
    scf.for %arg2 = %c0 to %c20 step %c1 {
      scf.for %arg3 = %c0 to %c7 step %c1 {
        scf.for %arg4 = %c0 to %c2 step %c1 {
          %9 = memref.load %3[%arg2, %arg4] : memref<20x2xf32>
          %10 = memref.load %4[%arg4, %arg3] : memref<2x7xf32>
          %11 = memref.load %6[%arg2, %arg3] : memref<20x7xf32>
          %12 = arith.mulf %9, %10 : f32
          %13 = arith.addf %11, %12 : f32
          memref.store %13, %6[%arg2, %arg3] : memref<20x7xf32>
        }
      }
    }
    %7 = memref.expand_shape %6 [[0, 1, 2], [3]] : memref<20x7xf32> into memref<1x4x5x7xf32>
    %8 = bufferization.to_tensor %7 : memref<1x4x5x7xf32>
    return %8 : tensor<1x4x5x7xf32>
  }
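  // main: runs the scalar baseline and the parallel/vectorized pointwise version
  // on a 1x200x200x1 input with a 1x1x1x3 filter and prints both wall-clock
  // times measured with rtclock.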
  func @main() {
    %cst = arith.constant dense<1.000000e+00> : tensor<1x200x200x1xf32>
    %cst_0 = arith.constant dense<2.000000e+00> : tensor<1x1x1x3xf32>
    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x200x200x3xf32>
    %0 = tensor.cast %cst : tensor<1x200x200x1xf32> to tensor<?x?x?x?xf32>
    %1 = tensor.cast %cst_0 : tensor<1x1x1x3xf32> to tensor<?x?x?x?xf32>
    %2 = tensor.cast %cst_1 : tensor<1x200x200x3xf32> to tensor<?x?x?x?xf32>
    %3 = call @rtclock() : () -> f64
    %4 = call @conv_2d_nhwc_hwcf(%0, %1, %2) : (tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
    %5 = call @rtclock() : () -> f64
    %6 = arith.subf %5, %3 : f64
    vector.print %6 : f64
    %7 = tensor.cast %cst : tensor<1x200x200x1xf32> to tensor<?x?x?x?xf32>
    %8 = tensor.cast %cst_0 : tensor<1x1x1x3xf32> to tensor<?x?x?x?xf32>
    %9 = tensor.cast %cst_1 : tensor<1x200x200x3xf32> to tensor<?x?x?x?xf32>
    %10 = call @rtclock() : () -> f64
    %11 = call @pw_cbsm_conv2d_outer_func_tensor(%7, %8, %9) : (tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
    %12 = call @rtclock() : () -> f64
    %13 = arith.subf %12, %10 : f64
    vector.print %13 : f64
    return
  }
}

