// Run with:
// /Workspace/buddy-mlir/llvm/build/bin/mlir-opt test_pw_with_const.mlir -convert-linalg-to-loops -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm -reconcile-unrealized-casts | /Workspace/buddy-mlir/llvm/build/bin/mlir-cpu-runner -e main -entry-point-result=void -shared-libs=/Workspace/buddy-mlir/llvm/build/lib/libmlir_runner_utils.so
// External declaration (no body) of a function taking an unranked f32 memref.
// @main calls it on each convolution result; judging by its name it prints the
// buffer. Presumably resolved at run time from the runner-utils shared library
// passed via -shared-libs -- confirm.
func private @print_memref_f32(memref<*xf32>)
// Allocates a dynamically shaped 4-D f32 buffer with extents
// (%s1, %s2, %s3, %s4), sets every element to %f and hands the buffer back.
// The buffer is not freed here; callers in this file release it with
// memref.dealloc.
func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index,
                          %f : f32) -> memref<?x?x?x?xf32> {
  %filled = memref.alloc(%s1, %s2, %s3, %s4) : memref<?x?x?x?xf32>
  linalg.fill(%f, %filled) : f32, memref<?x?x?x?xf32>
  return %filled : memref<?x?x?x?xf32>
}
// Reference convolution: a thin wrapper around the named op
// linalg.conv_2d_nhwc_hwcf with unit strides and unit dilations.
// Operands, in order: input image, filter, output. The result lands in the
// output buffer (the `outs` operand); nothing is returned.
func @conv_2d_nhwc_hwcf(%input: memref<?x?x?x?xf32>,
                        %filter: memref<?x?x?x?xf32>,
                        %output: memref<?x?x?x?xf32>) {
  linalg.conv_2d_nhwc_hwcf
    {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
    ins(%input, %filter : memref<?x?x?x?xf32>, memref<?x?x?x?xf32>)
    outs(%output : memref<?x?x?x?xf32>)
  return
}
// Test driver: runs the reference convolution and the pointwise kernel on
// identically prepared buffers and prints both outputs.
//
// Each run uses its own SSA names (the pointwise run carries a `_pw` suffix).
// An SSA value may only be defined once per region, so re-using
// %filter2D_nhwc / %in2D_nhwc / %out2D_nhwc for the second run is rejected
// by MLIR as a redefinition.
func @main() {
  // Index constants used as buffer extents and store coordinates.
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %c6 = arith.constant 6 : index
  %c8 = arith.constant 8 : index
  // Element values: one distinguished input element, the common fill value,
  // and the initial output value.
  %f10 = arith.constant 10.00000e+00 : f32
  %val = arith.constant 2.00000e+00 : f32
  %zero = arith.constant 0.00000e+00 : f32

  // --- Reference run: linalg.conv_2d_nhwc_hwcf ---------------------------
  // filter: 3,3,3,1   in: 3,8,8,3   out: 3,6,6,1
  %filter2D_nhwc = call @alloc_4d_filled_f32(%c3, %c3, %c3, %c1, %val) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  %in2D_nhwc = call @alloc_4d_filled_f32(%c3, %c8, %c8, %c3, %val) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  %out2D_nhwc = call @alloc_4d_filled_f32(%c3, %c6, %c6, %c1, %zero) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  // Make one input element differ from the fill value.
  memref.store %f10, %in2D_nhwc[%c0, %c0, %c3, %c0] : memref<?x?x?x?xf32>
  call @conv_2d_nhwc_hwcf(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (memref<?x?x?x?xf32>, memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) -> ()
  %out2D_nhwc_ = memref.cast %out2D_nhwc : memref<?x?x?x?xf32> to memref<*xf32>
  call @print_memref_f32(%out2D_nhwc_) : (memref<*xf32>) -> ()
  memref.dealloc %filter2D_nhwc : memref<?x?x?x?xf32>
  memref.dealloc %in2D_nhwc : memref<?x?x?x?xf32>
  memref.dealloc %out2D_nhwc : memref<?x?x?x?xf32>

  // --- Pointwise run: @pw_cbsm_conv2d ------------------------------------
  // Same shapes and contents as the reference run, in fresh buffers.
  // NOTE(review): @pw_cbsm_conv2d only reads filter[0, 0, kc, of], so with a
  // 3x3 filter its output is not expected to match the reference -- confirm
  // the intended shapes for this comparison.
  %filter2D_nhwc_pw = call @alloc_4d_filled_f32(%c3, %c3, %c3, %c1, %val) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  %in2D_nhwc_pw = call @alloc_4d_filled_f32(%c3, %c8, %c8, %c3, %val) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  %out2D_nhwc_pw = call @alloc_4d_filled_f32(%c3, %c6, %c6, %c1, %zero) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  memref.store %f10, %in2D_nhwc_pw[%c0, %c0, %c3, %c0] : memref<?x?x?x?xf32>
  call @pw_cbsm_conv2d(%in2D_nhwc_pw, %filter2D_nhwc_pw, %out2D_nhwc_pw) : (memref<?x?x?x?xf32>, memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) -> ()
  %out2D_nhwc_pw_ = memref.cast %out2D_nhwc_pw : memref<?x?x?x?xf32> to memref<*xf32>
  call @print_memref_f32(%out2D_nhwc_pw_) : (memref<*xf32>) -> ()
  memref.dealloc %filter2D_nhwc_pw : memref<?x?x?x?xf32>
  memref.dealloc %in2D_nhwc_pw : memref<?x?x?x?xf32>
  memref.dealloc %out2D_nhwc_pw : memref<?x?x?x?xf32>
  return
}
// Identity map; used for the lower/upper bounds of the plain loops.
#map0 = affine_map<(d0) -> (d0)>
// Number of 256-element strips needed to cover d0 elements (rounds up);
// used as the upper bound of the strip loop over the output width.
#map1 = affine_map<(d0) -> (d0 ceildiv 256)>
module {
// Pointwise convolution kernel written with coefficient broadcasting and
// 256-wide strips: out[on, oh, ow.., of] += in[on, oh, ow.., kc] * filter[0, 0, kc, of].
//
// Operands: %input and %output are indexed as [n, h, w, c] and %filter as
// [h, w, c, f] (see the subscripts in the loop body). Only filter[0, 0, *, *]
// is read, i.e. the kernel treats the filter as 1x1.
//
// The loop extents are taken from the same dimensions that the loop
// variables index: N = dim 0, H = dim 1, W = dim 2, F = dim 3 of %output.
//
// NOTE(review): each affine.vector_load / affine.vector_store below touches
// 256 contiguous f32 elements starting at the given subscript. Nothing in
// this function guards against buffers shorter than that, or checks that W
// is the contiguous dimension -- confirm tail handling and layout before
// relying on the results.
func @pw_cbsm_conv2d(%input: memref<?x?x?x?xf32>, %filter: memref<?x?x?x?xf32>, %output: memref<?x?x?x?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 3 : index
  // Filter input-channel count (reduction extent).
  %KC = memref.dim %filter, %c2 : memref<?x?x?x?xf32> // FC
  // Output extents, matching the subscript order [on, oh, ow, of].
  %ON = memref.dim %output, %c0 : memref<?x?x?x?xf32> // ON
  %OH = memref.dim %output, %c1 : memref<?x?x?x?xf32> // OH
  %OW = memref.dim %output, %c2 : memref<?x?x?x?xf32> // OW
  %OF = memref.dim %output, %c3 : memref<?x?x?x?xf32> // OF
  affine.for %on = #map0(%c0) to #map0(%ON) {                // batch
    affine.for %of = #map0(%c0) to #map0(%OF) {              // output channel
      affine.for %kc = #map0(%c0) to #map0(%KC) {            // input channel (accumulated)
        affine.for %oh = #map0(%c0) to #map0(%OH) {          // output row
          affine.for %ow_256 = #map0(%c0) to #map1(%OW) {    // strip index: 0 .. ceil(OW / 256)
            // Load the single filter coefficient for this (kc, of) pair
            // (for a given kc there is exactly one value per of) ...
            %coef = affine.vector_load %filter[0, 0, %kc, %of] : memref<?x?x?x?xf32>, vector<1xf32>
            // ... and broadcast it across the whole strip.
            %coef_bcast = vector.broadcast %coef : vector<1xf32> to vector<256xf32>
            // Input strip starting at column ow_256 * 256.
            %in_strip = affine.vector_load %input[%on, %oh, %ow_256 * 256, %kc] : memref<?x?x?x?xf32>, vector<256xf32>
            // Current partial sums for the same output strip.
            %acc = affine.vector_load %output[%on, %oh, %ow_256 * 256, %of] : memref<?x?x?x?xf32>, vector<256xf32>
            // acc = in_strip * coef + acc
            %res = vector.fma %in_strip, %coef_bcast, %acc : vector<256xf32>
            // Write the updated strip back to the output.
            affine.vector_store %res, %output[%on, %oh, %ow_256 * 256, %of] : memref<?x?x?x?xf32>, vector<256xf32>
          }
        }
      }
    }
  }
  return
}
}
// Identity map (loop bounds) and strip-count map (ceil(d0 / 256)) used by
// @pw_cbsm_conv2d in the module below.
// NOTE(review): #map0 / #map1 are already defined earlier in this file with
// identical bodies. If the file is parsed as a single unit the parser may
// reject the duplicate alias names -- confirm whether these sections are
// meant to live in separate files.
#map0 = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> (d0 ceildiv 256)>
module {
// External declaration (no body) of a function taking an unranked f32 memref.
// @main calls it on each convolution result; judging by its name it prints the
// buffer. Presumably resolved at run time from a runner-utils shared library
// -- confirm.
func private @print_memref_f32(memref<*xf32>)
// Allocates a dynamically shaped 4-D f32 buffer with extents
// (%s1, %s2, %s3, %s4), sets every element to %f and hands the buffer back.
// The buffer is not freed here; @main releases it with memref.dealloc.
func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index,
                          %f : f32) -> memref<?x?x?x?xf32> {
  %filled = memref.alloc(%s1, %s2, %s3, %s4) : memref<?x?x?x?xf32>
  linalg.fill(%f, %filled) : f32, memref<?x?x?x?xf32>
  return %filled : memref<?x?x?x?xf32>
}
// Reference convolution: a thin wrapper around the named op
// linalg.conv_2d_nhwc_hwcf with unit strides and unit dilations.
// Operands, in order: input image, filter, output. The result lands in the
// output buffer (the `outs` operand); nothing is returned.
func @conv_2d_nhwc_hwcf(%input: memref<?x?x?x?xf32>,
                        %filter: memref<?x?x?x?xf32>,
                        %output: memref<?x?x?x?xf32>) {
  linalg.conv_2d_nhwc_hwcf
    {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
    ins(%input, %filter : memref<?x?x?x?xf32>, memref<?x?x?x?xf32>)
    outs(%output : memref<?x?x?x?xf32>)
  return
}
// Pointwise convolution kernel written with coefficient broadcasting and
// 256-wide strips: out[on, oh, ow.., of] += in[on, oh, ow.., kc] * filter[0, 0, kc, of].
//
// Operands: %input and %output are indexed as [n, h, w, c] and %filter as
// [h, w, c, f] (see the subscripts in the loop body). Only filter[0, 0, *, *]
// is read, i.e. the kernel treats the filter as 1x1.
//
// The loop extents are taken from the same dimensions that the loop
// variables index: N = dim 0, H = dim 1, W = dim 2, F = dim 3 of %output.
//
// NOTE(review): each affine.vector_load / affine.vector_store below touches
// 256 contiguous f32 elements starting at the given subscript. Nothing in
// this function guards against buffers shorter than that, or checks that W
// is the contiguous dimension -- confirm tail handling and layout before
// relying on the results.
func @pw_cbsm_conv2d(%input: memref<?x?x?x?xf32>, %filter: memref<?x?x?x?xf32>, %output: memref<?x?x?x?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 3 : index
  // Filter input-channel count (reduction extent).
  %KC = memref.dim %filter, %c2 : memref<?x?x?x?xf32> // FC
  // Output extents, matching the subscript order [on, oh, ow, of].
  %ON = memref.dim %output, %c0 : memref<?x?x?x?xf32> // ON
  %OH = memref.dim %output, %c1 : memref<?x?x?x?xf32> // OH
  %OW = memref.dim %output, %c2 : memref<?x?x?x?xf32> // OW
  %OF = memref.dim %output, %c3 : memref<?x?x?x?xf32> // OF
  affine.for %on = #map0(%c0) to #map0(%ON) {                // batch
    affine.for %of = #map0(%c0) to #map0(%OF) {              // output channel
      affine.for %kc = #map0(%c0) to #map0(%KC) {            // input channel (accumulated)
        affine.for %oh = #map0(%c0) to #map0(%OH) {          // output row
          affine.for %ow_256 = #map0(%c0) to #map1(%OW) {    // strip index: 0 .. ceil(OW / 256)
            // Load the single filter coefficient for this (kc, of) pair
            // (for a given kc there is exactly one value per of) ...
            %coef = affine.vector_load %filter[0, 0, %kc, %of] : memref<?x?x?x?xf32>, vector<1xf32>
            // ... and broadcast it across the whole strip.
            %coef_bcast = vector.broadcast %coef : vector<1xf32> to vector<256xf32>
            // Input strip starting at column ow_256 * 256.
            %in_strip = affine.vector_load %input[%on, %oh, %ow_256 * 256, %kc] : memref<?x?x?x?xf32>, vector<256xf32>
            // Current partial sums for the same output strip.
            %acc = affine.vector_load %output[%on, %oh, %ow_256 * 256, %of] : memref<?x?x?x?xf32>, vector<256xf32>
            // acc = in_strip * coef + acc
            %res = vector.fma %in_strip, %coef_bcast, %acc : vector<256xf32>
            // Write the updated strip back to the output.
            affine.vector_store %res, %output[%on, %oh, %ow_256 * 256, %of] : memref<?x?x?x?xf32>, vector<256xf32>
          }
        }
      }
    }
  }
  return
}
// Test driver: feeds identically prepared buffers to the reference
// convolution and to @pw_cbsm_conv2d, prints both outputs, then frees
// all six buffers.
func @main() {
  // Index constants (buffer extents and store coordinates).
  %i0 = arith.constant 0 : index
  %i1 = arith.constant 1 : index
  %i2 = arith.constant 2 : index
  %i3 = arith.constant 3 : index
  %i6 = arith.constant 6 : index
  %i8 = arith.constant 8 : index
  // Element values: distinguished input element, fill value, initial output.
  %spike = arith.constant 10.00000e+00 : f32
  %fill = arith.constant 2.00000e+00 : f32
  %init = arith.constant 0.00000e+00 : f32

  // Reference run (linalg named op).
  //   filter : 1,1,1,3
  //   in     : 1,2,2,1
  //   out    : 1,2,2,3
  %ref_filter = call @alloc_4d_filled_f32(%i1, %i1, %i1, %i3, %fill) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  %ref_in = call @alloc_4d_filled_f32(%i1, %i2, %i2, %i1, %fill) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  %ref_out = call @alloc_4d_filled_f32(%i1, %i2, %i2, %i3, %init) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  // Make one input element differ from the fill value.
  memref.store %spike, %ref_in[%i0, %i0, %i1, %i0] : memref<?x?x?x?xf32>
  call @conv_2d_nhwc_hwcf(%ref_in, %ref_filter, %ref_out) : (memref<?x?x?x?xf32>, memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) -> ()
  %ref_out_unranked = memref.cast %ref_out : memref<?x?x?x?xf32> to memref<*xf32>
  call @print_memref_f32(%ref_out_unranked) : (memref<*xf32>) -> ()

  // Pointwise run: same shapes and contents, fresh buffers.
  %pw_filter = call @alloc_4d_filled_f32(%i1, %i1, %i1, %i3, %fill) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  %pw_in = call @alloc_4d_filled_f32(%i1, %i2, %i2, %i1, %fill) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  %pw_out = call @alloc_4d_filled_f32(%i1, %i2, %i2, %i3, %init) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  memref.store %spike, %pw_in[%i0, %i0, %i1, %i0] : memref<?x?x?x?xf32>
  call @pw_cbsm_conv2d(%pw_in, %pw_filter, %pw_out) : (memref<?x?x?x?xf32>, memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) -> ()
  %pw_out_unranked = memref.cast %pw_out : memref<?x?x?x?xf32> to memref<*xf32>
  call @print_memref_f32(%pw_out_unranked) : (memref<*xf32>) -> ()

  // Release every buffer allocated above.
  memref.dealloc %ref_filter : memref<?x?x?x?xf32>
  memref.dealloc %ref_in : memref<?x?x?x?xf32>
  memref.dealloc %ref_out : memref<?x?x?x?xf32>
  memref.dealloc %pw_filter : memref<?x?x?x?xf32>
  memref.dealloc %pw_in : memref<?x?x?x?xf32>
  memref.dealloc %pw_out : memref<?x?x?x?xf32>
  return
}
}
// Identity map (loop bounds) and strip-count map (ceil(d0 / 256)) used by
// @pw_cbsm_conv2d in the module below.
// NOTE(review): #map0 / #map1 are already defined earlier in this file with
// identical bodies. If the file is parsed as a single unit the parser may
// reject the duplicate alias names -- confirm whether these sections are
// meant to live in separate files.
#map0 = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> (d0 ceildiv 256)>
module {
// External declaration (no body) of a function taking an unranked f32 memref.
// @main calls it on each convolution result; judging by its name it prints the
// buffer. Presumably resolved at run time from a runner-utils shared library
// -- confirm.
func private @print_memref_f32(memref<*xf32>)
// Allocates a dynamically shaped 4-D f32 buffer with extents
// (%s1, %s2, %s3, %s4), sets every element to %f and hands the buffer back.
// The buffer is not freed here; @main releases it with memref.dealloc.
func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index,
                          %f : f32) -> memref<?x?x?x?xf32> {
  %filled = memref.alloc(%s1, %s2, %s3, %s4) : memref<?x?x?x?xf32>
  linalg.fill(%f, %filled) : f32, memref<?x?x?x?xf32>
  return %filled : memref<?x?x?x?xf32>
}
// Reference convolution: a thin wrapper around the named op
// linalg.conv_2d_nhwc_hwcf with unit strides and unit dilations.
// Operands, in order: input image, filter, output. The result lands in the
// output buffer (the `outs` operand); nothing is returned.
func @conv_2d_nhwc_hwcf(%input: memref<?x?x?x?xf32>,
                        %filter: memref<?x?x?x?xf32>,
                        %output: memref<?x?x?x?xf32>) {
  linalg.conv_2d_nhwc_hwcf
    {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
    ins(%input, %filter : memref<?x?x?x?xf32>, memref<?x?x?x?xf32>)
    outs(%output : memref<?x?x?x?xf32>)
  return
}
// Pointwise convolution kernel written with coefficient broadcasting and
// 256-wide strips: out[on, oh, ow.., of] += in[on, oh, ow.., kc] * filter[0, 0, kc, of].
//
// Operands: %input and %output are indexed as [n, h, w, c] and %filter as
// [h, w, c, f] (see the subscripts in the loop body). Only filter[0, 0, *, *]
// is read, i.e. the kernel treats the filter as 1x1.
//
// The loop extents are taken from the same dimensions that the loop
// variables index: N = dim 0, H = dim 1, W = dim 2, F = dim 3 of %output.
//
// NOTE(review): each affine.vector_load / affine.vector_store below touches
// 256 contiguous f32 elements starting at the given subscript. Nothing in
// this function guards against buffers shorter than that, or checks that W
// is the contiguous dimension -- confirm tail handling and layout before
// relying on the results.
func @pw_cbsm_conv2d(%input: memref<?x?x?x?xf32>, %filter: memref<?x?x?x?xf32>, %output: memref<?x?x?x?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 3 : index
  // Filter input-channel count (reduction extent).
  %KC = memref.dim %filter, %c2 : memref<?x?x?x?xf32> // FC
  // Output extents, matching the subscript order [on, oh, ow, of].
  %ON = memref.dim %output, %c0 : memref<?x?x?x?xf32> // ON
  %OH = memref.dim %output, %c1 : memref<?x?x?x?xf32> // OH
  %OW = memref.dim %output, %c2 : memref<?x?x?x?xf32> // OW
  %OF = memref.dim %output, %c3 : memref<?x?x?x?xf32> // OF
  affine.for %on = #map0(%c0) to #map0(%ON) {                // batch
    affine.for %of = #map0(%c0) to #map0(%OF) {              // output channel
      affine.for %kc = #map0(%c0) to #map0(%KC) {            // input channel (accumulated)
        affine.for %oh = #map0(%c0) to #map0(%OH) {          // output row
          affine.for %ow_256 = #map0(%c0) to #map1(%OW) {    // strip index: 0 .. ceil(OW / 256)
            // Load the single filter coefficient for this (kc, of) pair
            // (for a given kc there is exactly one value per of) ...
            %coef = affine.vector_load %filter[0, 0, %kc, %of] : memref<?x?x?x?xf32>, vector<1xf32>
            // ... and broadcast it across the whole strip.
            %coef_bcast = vector.broadcast %coef : vector<1xf32> to vector<256xf32>
            // Design sketch kept from the original author (not implemented):
            //
            //   for (i : kc) {
            //     img_subview[:,:,:,i]
            //     %output[i] = cbsm(img_subview[:,:,:,i])
            //   }
            //   img[n,h,w,c] = subview as img[:,:,:,0]
            //   img[n,h,w,c] = subview as img[:,:,:,1]
            //   img[n,h,w,c] = subview as img[:,:,:,2]
            //
            // Input strip starting at column ow_256 * 256.
            %in_strip = affine.vector_load %input[%on, %oh, %ow_256 * 256, %kc] : memref<?x?x?x?xf32>, vector<256xf32>
            // img[n,h,w,c] = subview as img[:,:,:,0]
            // %66 = vector.extract_strided_slice %6 {offsets = [0, 0, 0, 0], sizes = [2, 4,,], strides = [1, 1,,]}: vector<4x8x16xf32> to vector<2x4x16xf32>
            // Current partial sums for the same output strip.
            %acc = affine.vector_load %output[%on, %oh, %ow_256 * 256, %of] : memref<?x?x?x?xf32>, vector<256xf32>
            // acc = in_strip * coef + acc
            %res = vector.fma %in_strip, %coef_bcast, %acc : vector<256xf32>
            // Write the updated strip back to the output.
            affine.vector_store %res, %output[%on, %oh, %ow_256 * 256, %of] : memref<?x?x?x?xf32>, vector<256xf32>
          }
        }
      }
    }
  }
  return
}
// Test driver: feeds identically prepared buffers to the reference
// convolution and to @pw_cbsm_conv2d, prints both outputs, then frees
// all six buffers.
func @main() {
  // Index constants (buffer extents and store coordinates).
  %i0 = arith.constant 0 : index
  %i1 = arith.constant 1 : index
  %i2 = arith.constant 2 : index
  %i3 = arith.constant 3 : index
  %i6 = arith.constant 6 : index
  %i8 = arith.constant 8 : index
  // Element values: distinguished input element, fill value, initial output.
  %spike = arith.constant 10.00000e+00 : f32
  %fill = arith.constant 2.00000e+00 : f32
  %init = arith.constant 0.00000e+00 : f32

  // Reference run (linalg named op).
  //   filter : 1,1,1,3
  //   in     : 1,2,2,1
  //   out    : 1,2,2,3
  %ref_filter = call @alloc_4d_filled_f32(%i1, %i1, %i1, %i3, %fill) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  %ref_in = call @alloc_4d_filled_f32(%i1, %i2, %i2, %i1, %fill) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  %ref_out = call @alloc_4d_filled_f32(%i1, %i2, %i2, %i3, %init) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  // Make one input element differ from the fill value.
  memref.store %spike, %ref_in[%i0, %i0, %i1, %i0] : memref<?x?x?x?xf32>
  call @conv_2d_nhwc_hwcf(%ref_in, %ref_filter, %ref_out) : (memref<?x?x?x?xf32>, memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) -> ()
  %ref_out_unranked = memref.cast %ref_out : memref<?x?x?x?xf32> to memref<*xf32>
  call @print_memref_f32(%ref_out_unranked) : (memref<*xf32>) -> ()

  // Pointwise run: same shapes and contents, fresh buffers.
  //   filter : 1,1,1,3
  //   in     : 1,2,2,1
  //   out    : 1,2,2,3
  %pw_filter = call @alloc_4d_filled_f32(%i1, %i1, %i1, %i3, %fill) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  %pw_in = call @alloc_4d_filled_f32(%i1, %i2, %i2, %i1, %fill) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  %pw_out = call @alloc_4d_filled_f32(%i1, %i2, %i2, %i3, %init) : (index, index, index, index, f32) -> (memref<?x?x?x?xf32>)
  memref.store %spike, %pw_in[%i0, %i0, %i1, %i0] : memref<?x?x?x?xf32>
  call @pw_cbsm_conv2d(%pw_in, %pw_filter, %pw_out) : (memref<?x?x?x?xf32>, memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) -> ()
  %pw_out_unranked = memref.cast %pw_out : memref<?x?x?x?xf32> to memref<*xf32>
  call @print_memref_f32(%pw_out_unranked) : (memref<*xf32>) -> ()

  // Release every buffer allocated above.
  memref.dealloc %ref_filter : memref<?x?x?x?xf32>
  memref.dealloc %ref_in : memref<?x?x?x?xf32>
  memref.dealloc %ref_out : memref<?x?x?x?xf32>
  memref.dealloc %pw_filter : memref<?x?x?x?xf32>
  memref.dealloc %pw_in : memref<?x?x?x?xf32>
  memref.dealloc %pw_out : memref<?x?x?x?xf32>
  return
}
}