// /Workspace/buddy-mlir/llvm/build/bin/mlir-opt pw_sixth_scf.mlir \
// -parallel-loop-fusion \
// -convert-vector-to-scf \
// -convert-scf-to-std \
// -func-bufferize \
// -tensor-constant-bufferize \
// -tensor-bufferize \
// -std-bufferize \
// -finalizing-bufferize \
// -canonicalize \
// -convert-linalg-to-llvm \
// -convert-vector-to-llvm \
// -convert-memref-to-llvm \
// -convert-math-to-llvm \
// -convert-std-to-llvm \
// -reconcile-unrealized-casts | /Workspace/buddy-mlir/llvm/build/bin/mlir-cpu-runner \
// -e main \
// -entry-point-result=void \
// -shared-libs=/Workspace/buddy-mlir/llvm/build/lib/libmlir_runner_utils.so \
// -shared-libs=/Workspace/buddy-mlir/llvm/build/lib/libmlir_c_runner_utils.so \
// -shared-libs=/Workspace/buddy-mlir/llvm/build/lib/libmlir_async_runtime.so
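// This module compares a reference scalar conv2d (NHWC input, HWCF filter)
// against a decomposed pointwise (1x1-kernel) variant that reduces each
// (input channel, output channel) pair to a 2D convolution with a scalar
// weight, vectorized in 256-element chunks. @main times both versions and
// prints the elapsed time of each.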
#map0 = affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3)>
#map1 = affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2 + d2)>
module {
func private @rtclock() -> f64
func private @print_memref_f32(memref<*xf32>) attributes {llvm.emit_c_interface}
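// 2D convolution with a 1x1 kernel over 2D slices: each output element is
// the input element scaled by the single filter value, accumulated on top
// of the initial output %arg2. The inner dimension is vectorized in
// 256-element chunks with a masked epilogue for the remainder.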
func @conv_2d_cbsm_tensor(%arg0: tensor<?x?xf32>, %arg1: tensor<1x1xf32>, %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c256 = arith.constant 256 : index
%cst = arith.constant dense<0.000000e+00> : vector<256xf32>
%0 = bufferization.to_memref %arg0 : memref<?x?xf32>
%1 = bufferization.to_memref %arg1 : memref<1x1xf32>
%2 = bufferization.to_memref %arg2 : memref<?x?xf32>
%3 = memref.dim %2, %c0 : memref<?x?xf32>
%4 = memref.dim %2, %c1 : memref<?x?xf32>
%5 = memref.alloc(%3, %4) : memref<?x?xf32>
%6 = memref.dim %2, %c0 : memref<?x?xf32>
%7 = memref.dim %2, %c1 : memref<?x?xf32>
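// Copy the initial output %2 into the working buffer %5, in parallel over
// both dimensions.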
scf.parallel (%arg3, %arg4) = (%c0, %c0) to (%6, %7) step (%c1, %c1) {
%9 = memref.load %2[%arg3, %arg4] : memref<?x?xf32>
memref.store %9, %5[%arg3, %arg4] : memref<?x?xf32>
}
// First change: the scf.for copy nest below was replaced by the
// scf.parallel loop above.
// scf.for %arg3 = %c0 to %6 step %c1 {
// scf.for %arg4 = %c0 to %7 step %c1 {
// %9 = memref.load %2[%arg3, %arg4] : memref<?x?xf32>
// memref.store %9, %5[%arg3, %arg4] : memref<?x?xf32>
// }
// }
%c0_0 = arith.constant 0 : index
%c1_1 = arith.constant 1 : index
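// Process output rows in parallel; the two inner scf.for loops below each
// run a single iteration, covering the 1x1 kernel extents.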
scf.parallel (%arg3) = (%c0_0) to (%3) step (%c1_1) {
%c0_2 = arith.constant 0 : index
%c1_3 = arith.constant 1 : index
%c1_4 = arith.constant 1 : index
scf.for %arg4 = %c0_2 to %c1_3 step %c1_4 {
%c0_5 = arith.constant 0 : index
%c1_6 = arith.constant 1 : index
%c1_7 = arith.constant 1 : index
scf.for %arg5 = %c0_5 to %c1_6 step %c1_7 {
%c0_8 = arith.constant 0 : index
%c256_9 = arith.constant 256 : index
%c0_10 = arith.constant 0 : index
%c1_11 = arith.constant 1 : index
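// Signed ceiling division: %16 = ceildiv(%4, 256), the number of 256-wide
// vector iterations needed to cover a row of width %4.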
%9 = arith.cmpi sle, %4, %c0_10 : index
%10 = arith.subi %c0_10, %4 : index
%11 = arith.subi %4, %c1_11 : index
%12 = select %9, %10, %11 : index
%13 = arith.divsi %12, %c256_9 : index
%14 = arith.subi %c0_10, %13 : index
%15 = arith.addi %13, %c1_11 : index
%16 = select %9, %14, %15 : index
%c1_12 = arith.constant 1 : index
scf.for %arg6 = %c0_8 to %16 step %c1_12 {
%17 = vector.load %1[%arg4, %arg5] : memref<1x1xf32>, vector<1xf32>
%18 = vector.broadcast %17 : vector<1xf32> to vector<256xf32>
%19 = arith.muli %arg6, %c256 : index
%20 = arith.subi %4, %19 : index
%21 = arith.cmpi sge, %20, %c256 : index
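// If a full 256-element chunk remains, use plain vector loads/stores;
// otherwise handle the tail with masked operations.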
scf.if %21 {
%22 = arith.addi %arg3, %arg4 : index
%c256_13 = arith.constant 256 : index
%23 = arith.muli %arg6, %c256_13 : index
%24 = arith.addi %arg5, %23 : index
%25 = vector.load %0[%22, %24] : memref<?x?xf32>, vector<256xf32>
%c256_14 = arith.constant 256 : index
%26 = arith.muli %arg6, %c256_14 : index
%27 = vector.load %5[%arg3, %26] : memref<?x?xf32>, vector<256xf32>
%28 = vector.fma %25, %18, %27 : vector<256xf32>
%c256_15 = arith.constant 256 : index
%29 = arith.muli %arg6, %c256_15 : index
vector.store %28, %5[%arg3, %29] : memref<?x?xf32>, vector<256xf32>
} else {
%22 = vector.create_mask %20 : vector<256xi1>
%23 = arith.addi %arg3, %arg4 : index
%24 = arith.muli %arg6, %c256 : index
%25 = arith.addi %arg5, %24 : index
%26 = vector.maskedload %0[%23, %25], %22, %cst : memref<?x?xf32>, vector<256xi1>, vector<256xf32> into vector<256xf32>
%27 = vector.maskedload %5[%arg3, %24], %22, %cst : memref<?x?xf32>, vector<256xi1>, vector<256xf32> into vector<256xf32>
%28 = vector.fma %26, %18, %27 : vector<256xf32>
vector.maskedstore %5[%arg3, %24], %22, %28 : memref<?x?xf32>, vector<256xi1>, vector<256xf32>
}
}
}
}
}
%8 = bufferization.to_tensor %5 : memref<?x?xf32>
return %8 : tensor<?x?xf32>
}
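// Driver for the decomposed pointwise convolution: for each batch image
// and each output channel, accumulate the 1x1 convolutions over all input
// channels via @conv_2d_cbsm_tensor, then scatter the per-channel results
// back into the 4D output tensor.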
func @pw_cbsm_conv2d_outer_func_tensor(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = bufferization.to_memref %arg0 : memref<?x?x?x?xf32>
%1 = bufferization.to_memref %arg1 : memref<?x?x?x?xf32>
%2 = tensor.dim %arg1, %c2 : tensor<?x?x?x?xf32>
%3 = tensor.dim %arg2, %c0 : tensor<?x?x?x?xf32>
%4 = tensor.dim %arg2, %c1 : tensor<?x?x?x?xf32>
%5 = tensor.dim %arg2, %c2 : tensor<?x?x?x?xf32>
%6 = tensor.dim %arg2, %c3 : tensor<?x?x?x?xf32>
%7 = memref.alloc(%3, %4, %5, %6) : memref<?x?x?x?xf32>
%8 = memref.alloc(%4, %5, %6) : memref<?x?x?xf32>
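// Zero-fill the per-batch 3D accumulator.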
scf.for %arg3 = %c0 to %4 step %c1 {
scf.for %arg4 = %c0 to %5 step %c1 {
scf.for %arg5 = %c0 to %6 step %c1 {
memref.store %cst, %8[%arg3, %arg4, %arg5] : memref<?x?x?xf32>
}
}
}
%9 = bufferization.to_tensor %8 : memref<?x?x?xf32>
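// Zero-fill the 4D output buffer.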
scf.for %arg3 = %c0 to %3 step %c1 {
scf.for %arg4 = %c0 to %4 step %c1 {
scf.for %arg5 = %c0 to %5 step %c1 {
scf.for %arg6 = %c0 to %6 step %c1 {
memref.store %cst, %7[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x?xf32>
}
}
}
}
%10 = bufferization.to_tensor %7 : memref<?x?x?x?xf32>
%c0_0 = arith.constant 0 : index
%c1_1 = arith.constant 1 : index
%11 = scf.for %arg3 = %c0_0 to %3 step %c1_1 iter_args(%arg4 = %10) -> (tensor<?x?x?x?xf32>) {
%12 = bufferization.to_memref %arg4 : memref<?x?x?x?xf32>
%c0_2 = arith.constant 0 : index
%c1_3 = arith.constant 1 : index
%13 = scf.for %arg5 = %c0_2 to %6 step %c1_3 iter_args(%arg6 = %9) -> (tensor<?x?x?xf32>) {
%30 = bufferization.to_memref %arg6 : memref<?x?x?xf32>
%31 = memref.alloc(%4, %5) : memref<?x?xf32>
// Third change: the scf.for initialization nest below was replaced by the
// scf.parallel loop that follows.
// scf.for %arg7 = %c0 to %4 step %c1 {
// scf.for %arg8 = %c0 to %5 step %c1 {
// memref.store %cst, %31[%arg7, %arg8] : memref<?x?xf32>
// }
// }
scf.parallel (%arg7, %arg8) = (%c0, %c0) to (%4, %5) step (%c1, %c1) {
memref.store %cst, %31[%arg7, %arg8] : memref<?x?xf32>
}
%32 = bufferization.to_tensor %31 : memref<?x?xf32>
%c0_4 = arith.constant 0 : index
%c1_5 = arith.constant 1 : index
%33 = scf.for %arg7 = %c0_4 to %2 step %c1_5 iter_args(%arg8 = %32) -> (tensor<?x?xf32>) {
%47 = bufferization.to_memref %arg8 : memref<?x?xf32>
%48 = memref.alloc(%4, %5) : memref<?x?xf32>
scf.for %arg9 = %c0 to %4 step %c1 {
scf.for %arg10 = %c0 to %5 step %c1 {
memref.store %cst, %48[%arg9, %arg10] : memref<?x?xf32>
}
}
%49 = bufferization.to_tensor %48 : memref<?x?xf32>
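// Materialize the strided input slice (batch %arg3, input channel %arg7)
// into a contiguous 1x?x?x1 buffer before collapsing it to 2D.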
%50 = memref.alloc(%4, %5) : memref<1x?x?x1xf32>
%51 = memref.subview %0[%arg3, 0, 0, %arg7] [1, %4, %5, 1] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<1x?x?x1xf32, #map0>
scf.for %arg9 = %c0 to %c1 step %c1 {
scf.for %arg10 = %c0 to %4 step %c1 {
scf.for %arg11 = %c0 to %5 step %c1 {
scf.for %arg12 = %c0 to %c1 step %c1 {
%66 = memref.load %51[%arg9, %arg10, %arg11, %arg12] : memref<1x?x?x1xf32, #map0>
memref.store %66, %50[%arg9, %arg10, %arg11, %arg12] : memref<1x?x?x1xf32>
}
}
}
}
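// Copy the single filter element at [0, 0, %arg7, %arg5] into a
// contiguous 1x1x1x1 buffer.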
%52 = memref.alloc() : memref<1x1x1x1xf32>
%53 = memref.subview %1[0, 0, %arg7, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<1x1x1x1xf32, #map0>
scf.for %arg9 = %c0 to %c1 step %c1 {
scf.for %arg10 = %c0 to %c1 step %c1 {
scf.for %arg11 = %c0 to %c1 step %c1 {
scf.for %arg12 = %c0 to %c1 step %c1 {
%66 = memref.load %53[%arg9, %arg10, %arg11, %arg12] : memref<1x1x1x1xf32, #map0>
memref.store %66, %52[%arg9, %arg10, %arg11, %arg12] : memref<1x1x1x1xf32>
}
}
}
}
%54 = memref.collapse_shape %50 [[0, 1], [2, 3]] : memref<1x?x?x1xf32> into memref<?x?xf32>
%55 = bufferization.to_tensor %54 : memref<?x?xf32>
%56 = memref.collapse_shape %52 [[0, 1, 2], [3]] : memref<1x1x1x1xf32> into memref<1x1xf32>
%57 = bufferization.to_tensor %56 : memref<1x1xf32>
%58 = call @conv_2d_cbsm_tensor(%55, %57, %49) : (tensor<?x?xf32>, tensor<1x1xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
%59 = bufferization.to_memref %58 : memref<?x?xf32>
%60 = memref.dim %59, %c0 : memref<?x?xf32>
%61 = memref.dim %59, %c1 : memref<?x?xf32>
%62 = memref.alloc(%60, %61) : memref<?x?xf32>
%63 = memref.dim %59, %c0 : memref<?x?xf32>
%64 = memref.dim %59, %c1 : memref<?x?xf32>
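// Add this input channel's contribution to the running 2D accumulator.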
scf.for %arg9 = %c0 to %63 step %c1 {
scf.for %arg10 = %c0 to %64 step %c1 {
%66 = memref.load %59[%arg9, %arg10] : memref<?x?xf32>
%67 = memref.load %47[%arg9, %arg10] : memref<?x?xf32>
%68 = arith.addf %66, %67 : f32
memref.store %68, %62[%arg9, %arg10] : memref<?x?xf32>
}
}
%65 = bufferization.to_tensor %62 : memref<?x?xf32>
scf.yield %65 : tensor<?x?xf32>
}
%34 = bufferization.to_memref %33 : memref<?x?xf32>
%35 = memref.expand_shape %34 [[0], [1, 2]] : memref<?x?xf32> into memref<?x?x1xf32>
%36 = memref.dim %30, %c0 : memref<?x?x?xf32>
%37 = memref.dim %30, %c1 : memref<?x?x?xf32>
%38 = memref.dim %30, %c2 : memref<?x?x?xf32>
%39 = memref.alloc(%36, %37, %38) : memref<?x?x?xf32>
%40 = memref.dim %30, %c0 : memref<?x?x?xf32>
%41 = memref.dim %30, %c1 : memref<?x?x?xf32>
%42 = memref.dim %30, %c2 : memref<?x?x?xf32>
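// Copy the running 3D result, then overwrite channel %arg5 with the
// accumulated 2D slice.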
scf.for %arg7 = %c0 to %40 step %c1 {
scf.for %arg8 = %c0 to %41 step %c1 {
scf.for %arg9 = %c0 to %42 step %c1 {
%47 = memref.load %30[%arg7, %arg8, %arg9] : memref<?x?x?xf32>
memref.store %47, %39[%arg7, %arg8, %arg9] : memref<?x?x?xf32>
}
}
}
%43 = memref.subview %39[0, 0, %arg5] [%4, %5, 1] [1, 1, 1] : memref<?x?x?xf32> to memref<?x?x1xf32, #map1>
%44 = memref.dim %35, %c0 : memref<?x?x1xf32>
%45 = memref.dim %35, %c1 : memref<?x?x1xf32>
scf.for %arg7 = %c0 to %44 step %c1 {
scf.for %arg8 = %c0 to %45 step %c1 {
scf.for %arg9 = %c0 to %c1 step %c1 {
%47 = memref.load %35[%arg7, %arg8, %arg9] : memref<?x?x1xf32>
memref.store %47, %43[%arg7, %arg8, %arg9] : memref<?x?x1xf32, #map1>
}
}
}
%46 = bufferization.to_tensor %39 : memref<?x?x?xf32>
scf.yield %46 : tensor<?x?x?xf32>
}
%14 = bufferization.to_memref %13 : memref<?x?x?xf32>
%15 = memref.expand_shape %14 [[0, 1], [2], [3]] : memref<?x?x?xf32> into memref<1x?x?x?xf32>
%16 = memref.dim %12, %c0 : memref<?x?x?x?xf32>
%17 = memref.dim %12, %c1 : memref<?x?x?x?xf32>
%18 = memref.dim %12, %c2 : memref<?x?x?x?xf32>
%19 = memref.dim %12, %c3 : memref<?x?x?x?xf32>
%20 = memref.alloc(%16, %17, %18, %19) : memref<?x?x?x?xf32>
%21 = memref.dim %12, %c0 : memref<?x?x?x?xf32>
%22 = memref.dim %12, %c1 : memref<?x?x?x?xf32>
%23 = memref.dim %12, %c2 : memref<?x?x?x?xf32>
%24 = memref.dim %12, %c3 : memref<?x?x?x?xf32>
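// Copy the running 4D result, then overwrite batch %arg3 with this
// image's per-channel results.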
scf.for %arg5 = %c0 to %21 step %c1 {
scf.for %arg6 = %c0 to %22 step %c1 {
scf.for %arg7 = %c0 to %23 step %c1 {
scf.for %arg8 = %c0 to %24 step %c1 {
%30 = memref.load %12[%arg5, %arg6, %arg7, %arg8] : memref<?x?x?x?xf32>
memref.store %30, %20[%arg5, %arg6, %arg7, %arg8] : memref<?x?x?x?xf32>
}
}
}
}
%25 = memref.subview %20[%arg3, 0, 0, 0] [1, %4, %5, %6] [1, 1, 1, 1] : memref<?x?x?x?xf32> to memref<1x?x?x?xf32, #map0>
%26 = memref.dim %15, %c1 : memref<1x?x?x?xf32>
%27 = memref.dim %15, %c2 : memref<1x?x?x?xf32>
%28 = memref.dim %15, %c3 : memref<1x?x?x?xf32>
scf.for %arg5 = %c0 to %c1 step %c1 {
scf.for %arg6 = %c0 to %26 step %c1 {
scf.for %arg7 = %c0 to %27 step %c1 {
scf.for %arg8 = %c0 to %28 step %c1 {
%30 = memref.load %15[%arg5, %arg6, %arg7, %arg8] : memref<1x?x?x?xf32>
memref.store %30, %25[%arg5, %arg6, %arg7, %arg8] : memref<1x?x?x?xf32, #map0>
}
}
}
}
%29 = bufferization.to_tensor %20 : memref<?x?x?x?xf32>
scf.yield %29 : tensor<?x?x?x?xf32>
}
return %11 : tensor<?x?x?x?xf32>
}
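// Reference implementation: a direct seven-loop scalar conv2d (NHWC input,
// HWCF filter) used as the baseline in @main.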
func @conv_2d_nhwc_hwcf(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%0 = bufferization.to_memref %arg0 : memref<?x?x?x?xf32>
%1 = bufferization.to_memref %arg1 : memref<?x?x?x?xf32>
%2 = bufferization.to_memref %arg2 : memref<?x?x?x?xf32>
%3 = memref.dim %2, %c0 : memref<?x?x?x?xf32>
%4 = memref.dim %2, %c1 : memref<?x?x?x?xf32>
%5 = memref.dim %2, %c2 : memref<?x?x?x?xf32>
%6 = memref.dim %2, %c3 : memref<?x?x?x?xf32>
%7 = memref.alloc(%3, %4, %5, %6) : memref<?x?x?x?xf32>
%8 = memref.dim %2, %c0 : memref<?x?x?x?xf32>
%9 = memref.dim %2, %c1 : memref<?x?x?x?xf32>
%10 = memref.dim %2, %c2 : memref<?x?x?x?xf32>
%11 = memref.dim %2, %c3 : memref<?x?x?x?xf32>
scf.for %arg3 = %c0 to %8 step %c1 {
scf.for %arg4 = %c0 to %9 step %c1 {
scf.for %arg5 = %c0 to %10 step %c1 {
scf.for %arg6 = %c0 to %11 step %c1 {
%18 = memref.load %2[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x?xf32>
memref.store %18, %7[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x?xf32>
}
}
}
}
%12 = memref.dim %0, %c0 : memref<?x?x?x?xf32>
%13 = memref.dim %0, %c3 : memref<?x?x?x?xf32>
%14 = memref.dim %1, %c0 : memref<?x?x?x?xf32>
%15 = memref.dim %1, %c1 : memref<?x?x?x?xf32>
%16 = memref.dim %1, %c3 : memref<?x?x?x?xf32>
scf.for %arg3 = %c0 to %12 step %c1 {
scf.for %arg4 = %c0 to %4 step %c1 {
scf.for %arg5 = %c0 to %5 step %c1 {
scf.for %arg6 = %c0 to %16 step %c1 {
scf.for %arg7 = %c0 to %14 step %c1 {
scf.for %arg8 = %c0 to %15 step %c1 {
scf.for %arg9 = %c0 to %13 step %c1 {
%18 = arith.addi %arg4, %arg7 : index
%19 = arith.addi %arg5, %arg8 : index
%20 = memref.load %0[%arg3, %18, %19, %arg9] : memref<?x?x?x?xf32>
%21 = memref.load %1[%arg7, %arg8, %arg9, %arg6] : memref<?x?x?x?xf32>
%22 = memref.load %7[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x?xf32>
%23 = arith.mulf %20, %21 : f32
%24 = arith.addf %22, %23 : f32
memref.store %24, %7[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x?xf32>
}
}
}
}
}
}
}
%17 = bufferization.to_tensor %7 : memref<?x?x?x?xf32>
return %17 : tensor<?x?x?x?xf32>
}
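// Static-shape 1x1 convolution expressed as a 20x2 * 2x7 matmul by
// collapsing the N/H/W dimensions; kept for reference and not called from
// @main.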
func @conv_2d_1x1(%arg0: tensor<1x4x5x2xf32>, %arg1: tensor<1x1x2x7xf32>) -> tensor<1x4x5x7xf32> {
%c20 = arith.constant 20 : index
%c7 = arith.constant 7 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%0 = bufferization.to_memref %arg0 : memref<1x4x5x2xf32>
%1 = bufferization.to_memref %arg1 : memref<1x1x2x7xf32>
%cst = arith.constant 0.000000e+00 : f32
%3 = memref.collapse_shape %0 [[0, 1, 2], [3]] : memref<1x4x5x2xf32> into memref<20x2xf32>
%4 = memref.collapse_shape %1 [[0, 1, 2], [3]] : memref<1x1x2x7xf32> into memref<2x7xf32>
%6 = memref.alloc() : memref<20x7xf32>
// Zero-initialize the matmul accumulator.
scf.for %arg2 = %c0 to %c20 step %c1 {
scf.for %arg3 = %c0 to %c7 step %c1 {
memref.store %cst, %6[%arg2, %arg3] : memref<20x7xf32>
}
}
scf.for %arg2 = %c0 to %c20 step %c1 {
scf.for %arg3 = %c0 to %c7 step %c1 {
scf.for %arg4 = %c0 to %c2 step %c1 {
%9 = memref.load %3[%arg2, %arg4] : memref<20x2xf32>
%10 = memref.load %4[%arg4, %arg3] : memref<2x7xf32>
%11 = memref.load %6[%arg2, %arg3] : memref<20x7xf32>
%12 = arith.mulf %9, %10 : f32
%13 = arith.addf %11, %12 : f32
memref.store %13, %6[%arg2, %arg3] : memref<20x7xf32>
}
}
}
%7 = memref.expand_shape %6 [[0, 1, 2], [3]] : memref<20x7xf32> into memref<1x4x5x7xf32>
%8 = bufferization.to_tensor %7 : memref<1x4x5x7xf32>
return %8 : tensor<1x4x5x7xf32>
}
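// Benchmark: run the reference conv2d and the decomposed pointwise version
// on a 1x200x200x1 input with a 1x1x1x3 filter, printing the elapsed time
// of each. The convolution results are computed but not printed.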
func @main() {
%cst = arith.constant dense<1.000000e+00> : tensor<1x200x200x1xf32>
%cst_0 = arith.constant dense<2.000000e+00> : tensor<1x1x1x3xf32>
%cst_1 = arith.constant dense<0.000000e+00> : tensor<1x200x200x3xf32>
%0 = tensor.cast %cst : tensor<1x200x200x1xf32> to tensor<?x?x?x?xf32>
%1 = tensor.cast %cst_0 : tensor<1x1x1x3xf32> to tensor<?x?x?x?xf32>
%2 = tensor.cast %cst_1 : tensor<1x200x200x3xf32> to tensor<?x?x?x?xf32>
%3 = call @rtclock() : () -> f64
%4 = call @conv_2d_nhwc_hwcf(%0, %1, %2) : (tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
%5 = call @rtclock() : () -> f64
%6 = arith.subf %5, %3 : f64
vector.print %6 : f64
%7 = tensor.cast %cst : tensor<1x200x200x1xf32> to tensor<?x?x?x?xf32>
%8 = tensor.cast %cst_0 : tensor<1x1x1x3xf32> to tensor<?x?x?x?xf32>
%9 = tensor.cast %cst_1 : tensor<1x200x200x3xf32> to tensor<?x?x?x?xf32>
%10 = call @rtclock() : () -> f64
%11 = call @pw_cbsm_conv2d_outer_func_tensor(%7, %8, %9) : (tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
%12 = call @rtclock() : () -> f64
%13 = arith.subf %12, %10 : f64
vector.print %13 : f64
return
}
}