root@xxxxx:/Workspace/pw_mlir_test
> /Workspace/buddy-mlir/llvm/build/bin/mlir-opt pw_first.mlir \
-canonicalize \
-cse \
-linalg-bufferize \
-std-bufferize \
-tensor-constant-bufferize \
-tensor-bufferize \
-func-bufferize \
-finalizing-bufferize \
-buffer-deallocation \
-convert-linalg-to-loops \
-lower-affine \
-convert-scf-to-std \
-canonicalize \
-convert-linalg-to-llvm \
-convert-vector-to-llvm \
--convert-memref-to-llvm \
-convert-std-to-llvm \
-reconcile-unrealized-casts \
-verify-diagnostics | /Workspace/buddy-mlir/llvm/build/bin/mlir-cpu-runner \
-e main \
-entry-point-result=void \
-shared-libs=/Workspace/buddy-mlir/llvm/build/lib/libmlir_runner_utils.so
Unranked Memref base@ = 0x56008250e430 rank = 4 offset = 0 sizes = [1, 2, 2, 3] strides = [12, 6, 3, 1] data =
[[[[4, 4, 4],
[4, 4, 4]],
[[4, 4, 4],
[4, 4, 4]]]]
Unranked Memref base@ = 0x56008250e430 rank = 4 offset = 0 sizes = [1, 2, 2, 3] strides = [12, 6, 3, 1] data =
[[[[4, 4, 4],
[4, 4, 4]],
[[4, 4, 4],
[4, 4, 4]]]]
// Identity affine map on a single dimension: (d0) -> (d0).
#map0 = affine_map<(d0) -> (d0)>
// Maps d0 to ceil(d0 / 256).
// NOTE(review): not referenced anywhere in the visible part of this file — confirm whether it is still needed.
#map1 = affine_map<(d0) -> (d0 ceildiv 256)>
module {
// Declaration (no body) of the print helper called from @main; it takes an
// unranked f32 tensor and returns nothing.
// NOTE(review): presumably resolved at run time from the runner-utils shared
// library passed to mlir-cpu-runner — confirm.
// Earlier memref-typed form of the same declaration, kept for reference:
// func private @print_memref_f32(memref<*xf32>)
func private @print_memref_f32(tensor<*xf32>) -> ()
// Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f
// func @init_4d_filled_f32_tensor(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> tensor<?x?x?x?xf32> {
// %buf = linalg.init_tensor [%s1, %s2, %s3, %s4] : tensor<?x?x?x?xf32>
// %res = linalg.fill(%f, %buf) : f32, tensor<?x?x?x?xf32> -> tensor<?x?x?x?xf32>
// return %res : tensor<?x?x?x?xf32>
// }
// NHWC/HWCF convolution assembled from per-channel 2-D convolutions.
//
// For every batch index `on` and output feature `of`, a zero-initialised
// [OH, OW] accumulator is threaded through a loop over the filter channels
// `kc`. Each iteration calls @conv_2d_tensor on input[on, :, :, kc] and
// filter[:, :, kc, of] with the accumulator as the `outs` operand, so the
// per-channel contributions add up (linalg.conv_2d accumulates into `outs`).
// The finished plane is then inserted at output[on, :, :, of].
//
// Arguments:
//   %input  : read as [N, H, W, C].
//   %filter : read as [KH, KW, KC, OF].
//   %output : read as [ON, OH, OW, OF]; provides the result shape. Every
//             [on, :, :, of] plane is overwritten, not accumulated into.
// Returns: the output tensor with all planes replaced.
//
// Tensors are SSA values, so every update must be carried explicitly through
// the loops via iter_args and yielded; the previous version dropped the
// results of the call / add / insert_slice and returned %output unchanged.
//
// NOTE(review): lowering loop-carried tensors through scf.for most likely
// requires `-scf-bufferize` before `-finalizing-bufferize` in the pass
// pipeline — confirm against the mlir-opt invocation used for this file.
func @pw_cbsm_conv2d_outer_func_tensor(
    %input: tensor<?x?x?x?xf32>,
    %filter: tensor<?x?x?x?xf32>,
    %output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 3 : index
  %zero = arith.constant 0.00000e+00 : f32

  // Spatial extent of the input. Slicing the full plane (instead of OH x OW)
  // is identical for a 1x1 filter and stays in bounds for larger filters.
  %IH = tensor.dim %input, %c1 : tensor<?x?x?x?xf32>
  %IW = tensor.dim %input, %c2 : tensor<?x?x?x?xf32>

  %KH = tensor.dim %filter, %c0 : tensor<?x?x?x?xf32> // filter height
  %KW = tensor.dim %filter, %c1 : tensor<?x?x?x?xf32> // filter width
  %KC = tensor.dim %filter, %c2 : tensor<?x?x?x?xf32> // filter channels

  %ON = tensor.dim %output, %c0 : tensor<?x?x?x?xf32> // batch
  %OH = tensor.dim %output, %c1 : tensor<?x?x?x?xf32> // output height
  %OW = tensor.dim %output, %c2 : tensor<?x?x?x?xf32> // output width
  %OF = tensor.dim %output, %c3 : tensor<?x?x?x?xf32> // output features

  // on : batch loop, carrying the output tensor.
  %result = scf.for %on = %c0 to %ON step %c1
      iter_args(%out_n = %output) -> (tensor<?x?x?x?xf32>) {
    // of : output-feature loop, carrying the output tensor.
    %out_after_n = scf.for %of = %c0 to %OF step %c1
        iter_args(%out_f = %out_n) -> (tensor<?x?x?x?xf32>) {
      // 1. Zero accumulator for the [OH, OW] plane of this (on, of) pair.
      %acc_buf = linalg.init_tensor [%OH, %OW] : tensor<?x?xf32>
      %acc_init = linalg.fill(%zero, %acc_buf) : f32, tensor<?x?xf32> -> tensor<?x?xf32>

      // kc : reduction over filter channels, carrying the accumulator.
      %acc = scf.for %kc = %c0 to %KC step %c1
          iter_args(%acc_iter = %acc_init) -> (tensor<?x?xf32>) {
        // 2. Slice the operands for this channel:
        //    input_inner  = input[on, :, :, kc]
        //    filter_inner = filter[:, :, kc, of]
        %input_inner0 = tensor.extract_slice %input[%on, 0, 0, %kc] [1, %IH, %IW, 1] [1, 1, 1, 1] : tensor<?x?x?x?xf32> to tensor<1x?x?x1xf32>
        %filter_inner0 = tensor.extract_slice %filter[0, 0, %kc, %of] [%KH, %KW, 1, 1] [1, 1, 1, 1] : tensor<?x?x?x?xf32> to tensor<?x?x1x1xf32>
        // Drop the unit dimensions to obtain rank-2 operands.
        %input_inner1 = tensor.collapse_shape %input_inner0 [[0, 1], [2, 3]] : tensor<1x?x?x1xf32> into tensor<?x?xf32>
        %filter_inner1 = tensor.collapse_shape %filter_inner0 [[0], [1, 2, 3]] : tensor<?x?x1x1xf32> into tensor<?x?xf32>

        // 3. 2-D convolution accumulated on top of the running sum. Passing
        //    the accumulator as `outs` replaces the separate element-wise add.
        %acc_next = call @conv_2d_tensor(%input_inner1, %filter_inner1, %acc_iter) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
        scf.yield %acc_next : tensor<?x?xf32>
      }

      // 4. Write the accumulated plane into output[on, :, :, of].
      %out_next = tensor.insert_slice %acc into %out_f[%on, 0, 0, %of] [1, %OH, %OW, 1] [1, 1, 1, 1] : tensor<?x?xf32> into tensor<?x?x?x?xf32>
      scf.yield %out_next : tensor<?x?x?x?xf32>
    }
    scf.yield %out_after_n : tensor<?x?x?x?xf32>
  }
  return %result : tensor<?x?x?x?xf32>
}
// Thin wrapper around linalg.conv_2d for dynamically shaped rank-2 f32 tensors.
//   %input  : image operand
//   %filter : kernel operand
//   %output : tensor passed as the `outs` operand of the convolution
// Returns the tensor produced by linalg.conv_2d.
func @conv_2d_tensor(
    %input: tensor<?x?xf32>,
    %filter: tensor<?x?xf32>,
    %output: tensor<?x?xf32>) -> tensor<?x?xf32> {
  %conv = linalg.conv_2d
      ins(%input, %filter : tensor<?x?xf32>, tensor<?x?xf32>)
      outs(%output : tensor<?x?xf32>) -> tensor<?x?xf32>
  return %conv : tensor<?x?xf32>
}
// Thin wrapper around linalg.conv_2d_nhwc_hwcf with unit strides and unit
// dilations, operating on dynamically shaped rank-4 f32 tensors.
//   %input  : image operand
//   %filter : kernel operand
//   %output : tensor passed as the `outs` operand of the convolution
// Returns the tensor produced by the convolution.
func @conv_2d_nhwc_hwcf(
    %input: tensor<?x?x?x?xf32>,
    %filter: tensor<?x?x?x?xf32>,
    %output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
  %conv = linalg.conv_2d_nhwc_hwcf
      {strides = dense<1> : tensor<2xi64>, dilations = dense<1> : tensor<2xi64>}
      ins(%input, %filter : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>)
      outs(%output : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
  return %conv : tensor<?x?x?x?xf32>
}
// Test driver: runs the same convolution twice, once through the reference
// linalg.conv_2d_nhwc_hwcf wrapper and once through
// @pw_cbsm_conv2d_outer_func_tensor, printing each result so the two can be
// compared.
//
// Both tests use:
//   in     : 1x2x2x1, filled with 2.0
//   filter : 1x1x1x3, filled with 2.0
//   out    : 1x2x2x3, filled with 0.0
func @main() {
  // ---- normal_conv2d_test (reference) ----
  %const_in2D_tensor = arith.constant dense<2.0> : tensor<1x2x2x1xf32>
  %const_filter2D_tensor = arith.constant dense<2.0> : tensor<1x1x1x3xf32>
  %const_out2D_tensor = arith.constant dense<0.0> : tensor<1x2x2x3xf32>
  // The callees take fully dynamic shapes, so erase the static dimensions.
  %dynamic_const_in2D_tensor = tensor.cast %const_in2D_tensor : tensor<1x2x2x1xf32> to tensor<?x?x?x?xf32>
  %dynamic_const_filter2D_tensor = tensor.cast %const_filter2D_tensor : tensor<1x1x1x3xf32> to tensor<?x?x?x?xf32>
  %dynamic_const_out2D_tensor = tensor.cast %const_out2D_tensor : tensor<1x2x2x3xf32> to tensor<?x?x?x?xf32>

  %res_out2D_nhwc = call @conv_2d_nhwc_hwcf(%dynamic_const_in2D_tensor, %dynamic_const_filter2D_tensor, %dynamic_const_out2D_tensor) : (
      tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
  %dynamic_const_out2D_tensor_ = tensor.cast %res_out2D_nhwc : tensor<?x?x?x?xf32> to tensor<*xf32>
  call @print_memref_f32(%dynamic_const_out2D_tensor_) : (tensor<*xf32>) -> ()

  // ---- pw_conv2d_test ----
  %const_in2D_tensor_pw = arith.constant dense<2.0> : tensor<1x2x2x1xf32>
  %const_filter2D_tensor_pw = arith.constant dense<2.0> : tensor<1x1x1x3xf32>
  %const_out2D_tensor_pw = arith.constant dense<0.0> : tensor<1x2x2x3xf32>
  // Cast this test's own (_pw) constants rather than the first test's.
  %dynamic_const_in2D_tensor_pw = tensor.cast %const_in2D_tensor_pw : tensor<1x2x2x1xf32> to tensor<?x?x?x?xf32>
  %dynamic_const_filter2D_tensor_pw = tensor.cast %const_filter2D_tensor_pw : tensor<1x1x1x3xf32> to tensor<?x?x?x?xf32>
  %dynamic_const_out2D_tensor_pw = tensor.cast %const_out2D_tensor_pw : tensor<1x2x2x3xf32> to tensor<?x?x?x?xf32>

  %res_out2D_nhwc_pw = call @pw_cbsm_conv2d_outer_func_tensor(%dynamic_const_in2D_tensor_pw, %dynamic_const_filter2D_tensor_pw, %dynamic_const_out2D_tensor_pw) : (
      tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
  // Print the pointwise result itself; printing %res_out2D_nhwc here would
  // just repeat the reference output and hide any mismatch.
  %dynamic_const_out2D_tensor_pw_ = tensor.cast %res_out2D_nhwc_pw : tensor<?x?x?x?xf32> to tensor<*xf32>
  call @print_memref_f32(%dynamic_const_out2D_tensor_pw_) : (tensor<*xf32>) -> ()
  return
}
}