// Original static-shape version, generated from the Mobilenet.mlir file:
// func @pointwise_conv_2d_nhwc_hwcf_with_return_origin(%input: tensor<1x4x5x2xf32>, %filter: tensor<1x1x2x7xf32>) -> tensor<1x4x5x7xf32> {
// %0 = linalg.init_tensor [1, 4, 5, 7] : tensor<1x4x5x7xf32>
// %1 = linalg.conv_2d_nhwc_hwcf {
// dilations = dense<1> : tensor<2xi64>,
// strides = dense<1> : tensor<2xi64>
// } ins(%input, %filter : tensor<1x4x5x2xf32>, tensor<1x1x2x7xf32>) outs(%0 : tensor<1x4x5x7xf32>) -> tensor<1x4x5x7xf32>
// return %1 : tensor<1x4x5x7xf32>
// }
// Dynamic-shape version of the function above (also derived from Mobilenet.mlir).
func @pointwise_conv_2d_nhwc_hwcf_with_return_origin(%input: tensor<?x?x?x?xf32>, %filter: tensor<?x?x?x?xf32>, %output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%zero = arith.constant 0.00000e+00 : f32
%KH = tensor.dim %filter, %c0 : tensor<?x?x?x?xf32> // FH
%KW = tensor.dim %filter, %c1 : tensor<?x?x?x?xf32> // FW
%KC = tensor.dim %filter, %c2 : tensor<?x?x?x?xf32> // FC
%ON = tensor.dim %output, %c0 : tensor<?x?x?x?xf32> // ON
%OH = tensor.dim %output, %c1 : tensor<?x?x?x?xf32> // OH
%OW = tensor.dim %output, %c2 : tensor<?x?x?x?xf32> // OW
%OF = tensor.dim %output, %c3 : tensor<?x?x?x?xf32> // OF
%buf = linalg.init_tensor [%ON, %OH, %OW, %OF] : tensor<?x?x?x?xf32>
%0 = linalg.fill(%zero, %buf) : f32, tensor<?x?x?x?xf32> -> tensor<?x?x?x?xf32>
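// Note: the two values below are unused in this function; they appear to be
// leftovers from the CBSM driver further down in this file.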
%buf3 = linalg.init_tensor [%OH, %OW, %OF] : tensor<?x?x?xf32>
%res_of_0 = linalg.fill(%zero, %buf3) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
%1 = linalg.conv_2d_nhwc_hwcf {
dilations = dense<1> : tensor<2xi64>,
strides = dense<1> : tensor<2xi64>
} ins(%input, %filter : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) outs(%0 : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
return %1 : tensor<?x?x?x?xf32>
}
// func @pointwise_conv_2d_nhwc_hwcf_with_return(%arg0: tensor<1x4x5x2xf32>, %arg1: tensor<1x1x2x7xf32>) -> tensor<1x4x5x7xf32> {
// %0 = linalg.init_tensor [1, 4, 5, 7] : tensor<1x4x5x7xf32>
// %1 = tensor.collapse_shape %arg0 [[0, 1, 2], [3]] : tensor<1x4x5x2xf32> into tensor<20x2xf32>
// %2 = tensor.collapse_shape %arg1 [[0, 1, 2], [3]] : tensor<1x1x2x7xf32> into tensor<2x7xf32>
// %3 = tensor.collapse_shape %0 [[0, 1, 2], [3]] : tensor<1x4x5x7xf32> into tensor<20x7xf32>
// %4 = linalg.matmul ins(%1, %2 : tensor<20x2xf32>, tensor<2x7xf32>) outs(%3 : tensor<20x7xf32>) -> tensor<20x7xf32>
// %5 = tensor.expand_shape %4 [[0, 1, 2], [3]] : tensor<20x7xf32> into tensor<1x4x5x7xf32>
// return %5 : tensor<1x4x5x7xf32>
// }
// Generated from the IREE-processed MobileNet MLIR file.
// func @pointwise_conv_2d_nhwc_hwcf(%input: memref<?x?x?x?xf32>, %filter: memref<1x1x?x?xf32>, %output: memref<?x?x?x?xf32>) {
// linalg.conv_2d_nhwc_hwcf
// {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
// ins(%input, %filter : memref<?x?x?x?xf32>, memref<1x1x?x?xf32>)
// outs(%output : memref<?x?x?x?xf32>)
// return
// }
// // Test with specific (static) shapes.
// func @pointwise_conv_2d_nhwc_hwcf_spec(%input: memref<1x4x5x2xf32>, %filter: memref<1x1x2x7xf32>, %output: memref<1x4x5x7xf32>) {
// linalg.conv_2d_nhwc_hwcf
// {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
// ins(%input, %filter : memref<1x4x5x2xf32>, memref<1x1x2x7xf32>)
// outs(%output : memref<1x4x5x7xf32>)
// return
// }
// #map0 = affine_map<(d0) -> (d0)>
// #map1 = affine_map<(d0) -> (d0 ceildiv 256)>
// func @pw_cbsm_conv2d_outer_func_tensor(
// %input: tensor<?x?x?x?xf32>,
// %filter: tensor<?x?x?x?xf32>,
// %output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
// %c0 = arith.constant 0 : index
// %c1 = arith.constant 1 : index
// %c2 = arith.constant 2 : index
// %c3 = arith.constant 3 : index
// %KH = tensor.dim %filter, %c0 : tensor<?x?x?x?xf32> // FH
// %KW = tensor.dim %filter, %c1 : tensor<?x?x?x?xf32> // FW
// %KC = tensor.dim %filter, %c2 : tensor<?x?x?x?xf32> // FC
// %ON = tensor.dim %output, %c0 : tensor<?x?x?x?xf32> // ON
// %OH = tensor.dim %output, %c1 : tensor<?x?x?x?xf32> // OH
// %OW = tensor.dim %output, %c2 : tensor<?x?x?x?xf32> // OW
// %OF = tensor.dim %output, %c3 : tensor<?x?x?x?xf32> // OF
// // -1. Allocate and zero-fill the accumulation buffers used by the loop nest below.
// %zero = arith.constant 0.00000e+00 : f32
// %buf = linalg.init_tensor [%ON, %OH, %OW, %OF] : tensor<?x?x?x?xf32>
// %buf3 = linalg.init_tensor [%OH, %OW, %OF] : tensor<?x?x?xf32>
// %res_of_0 = linalg.fill(%zero, %buf3) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
// %res_on_0 = linalg.fill(%zero, %buf) : f32, tensor<?x?x?x?xf32> -> tensor<?x?x?x?xf32>
// %res_on = affine.for %on = #map0(%c0) to #map0(%ON) // %on: batch index, 0..ON
// iter_args(%sum_on_iter = %res_on_0) -> (tensor<?x?x?x?xf32>) {
// // iter_args binds initial values to the loop's region arguments.
// // omp.parallel {
// %res_of = affine.for %of = #map0(%c0) to #map0(%OF) // step (%c1)
// iter_args(%sum_of_iter = %res_of_0) -> (tensor<?x?x?xf32>) {
// // 0. Zero-initialized per-channel accumulator out_adds_kc_tmp (shape [OH, OW]).
// // %out_adds_kc_tmp = arith.constant dense<0> : tensor<%ONx%OHx%OWx%OFxf32>
// %buf1 = linalg.init_tensor [%OH, %OW] : tensor<?x?xf32>
// %out_adds_kc_tmp_0 = linalg.fill(%zero, %buf1) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
// // iter_args binds initial values to the loop's region arguments.
// %out_adds_kc_tmp = affine.for %kc = #map0(%c0) to #map0(%KC)
// iter_args(%sum_iter = %out_adds_kc_tmp_0) -> (tensor<?x?xf32>) {
// // 1. init kc_out_tmp[OH,OW]
// %buf2 = linalg.init_tensor [%OH, %OW] : tensor<?x?xf32>
// %output_inner = linalg.fill(%zero, %buf2) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
// // 2. Slice the input and filter for the CBSM decomposition.
// // input_inner = input[on,:,:,kc]
// // filter_inner = filter[0,0,kc,of]
// // %input_inner = tensor.extract_slice %input[%on,0,0,%kc][1,%OH,%OW,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<?x?xf32>
// // %filter_inner = tensor.extract_slice %filter[0,0,%kc,%of][%KH,%KW,1,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<?x?xf32>
// %input_inner0 = tensor.extract_slice %input[%on,0,0,%kc][1,%OH,%OW,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<1x?x?x1xf32>
// // %filter_inner0 = tensor.extract_slice %filter[0,0,%kc,%of][%KH,%KW,1,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<?x?x1x1xf32>
// %filter_inner0 = tensor.extract_slice %filter[0,0,%kc,%of][1,1,1,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<1x1x1x1xf32>
// %input_inner1 = tensor.collapse_shape %input_inner0 [[0, 1], [2, 3]] : tensor<1x?x?x1xf32> into tensor<?x?xf32>
// %filter_inner1 = tensor.collapse_shape %filter_inner0 [[0, 1, 2], [3]] : tensor<1x1x1x1xf32> into tensor<1x1xf32>
// // 3. call conv_2d
// // %output_inner1 = call @conv_2d_tensor(%input_inner1, %filter_inner1, %output_inner) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
// %output_inner1 = linalg.conv_2d
// ins (%input_inner1, %filter_inner1: tensor<?x?xf32>, tensor<1x1xf32>)
// outs (%output_inner: tensor<?x?xf32>) -> tensor<?x?xf32>
// // %dynamic_2 = tensor.cast %output_inner : tensor<?x?xf32> to tensor<*xf32>
// // call @print_memref_f32(%dynamic_2): (tensor<*xf32>) -> ()
// %out_adds_kc_tmp_next = arith.addf %output_inner1, %sum_iter : tensor<?x?xf32>
// // Yield current iteration sum to next iteration %sum_iter or to %sum
// // if final iteration.
// affine.yield %out_adds_kc_tmp_next : tensor<?x?xf32>
// } // end-kc
// // 5. Insert the accumulated kc result into the %of channel of the output
// //    via tensor.insert_slice.
// %out_adds_kc_tmp_expanded = tensor.expand_shape %out_adds_kc_tmp [[0],[1,2]] : tensor<?x?xf32> into tensor<?x?x1xf32>
// %res_next = tensor.insert_slice %out_adds_kc_tmp_expanded into %sum_of_iter[0,0,%of][%OH,%OW,1][1,1,1] : tensor<?x?x1xf32> into tensor<?x?x?xf32>
// // res insert a into res
// // Yield current iteration sum to next iteration %sum_iter or to %sum
// // if final iteration.
// affine.yield %res_next : tensor<?x?x?xf32>
// } // end-of
// // Reattach the batch dimension before inserting into the 4-D output.
// %res_of_expand = tensor.expand_shape %res_of [[0,1],[2],[3]] : tensor<?x?x?xf32> into tensor<1x?x?x?xf32>
// %res_on_next = tensor.insert_slice %res_of_expand into %sum_on_iter[%on,0,0,0][1,%OH,%OW,%OF][1,1,1,1] : tensor<1x?x?x?xf32> into tensor<?x?x?x?xf32>
// // omp.yield %res_on_next : tensor<?x?x?x?xf32>
// // }
// affine.yield %res_on_next : tensor<?x?x?x?xf32>
// } // end-on
// return %res_on : tensor<?x?x?x?xf32>
// }
#map0 = affine_map<(d0) -> (d0)>
#map = affine_map<()[s0] -> (s0 ceildiv 256)>
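// conv_2d_cbsm_memref: vectorized 2-D convolution kernel specialized for a
// 1x1 filter. The innermost loop walks each output row in 256-element tiles,
// using full-width vector FMAs when a whole tile fits and masked loads/stores
// for the remaining tail.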
func @conv_2d_cbsm_memref(%arg0: memref<?x?xf32>, %arg1: memref<1x1xf32>, %arg2: memref<?x?xf32>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c256 = arith.constant 256 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = splat %cst : vector<256xf32>
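// %0 is the pass-through value supplied to the masked loads in the tail case.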
%1 = memref.dim %arg1, %c0 : memref<1x1xf32>
%2 = memref.dim %arg1, %c1 : memref<1x1xf32>
%3 = memref.dim %arg2, %c0 : memref<?x?xf32>
%4 = memref.dim %arg2, %c1 : memref<?x?xf32>
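// %1/%2: filter rows/cols (both 1 here); %3/%4: output rows/cols.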
affine.for %arg3 = 0 to %3 {
affine.for %arg4 = 0 to %1 {
affine.for %arg5 = 0 to %2 {
affine.for %arg6 = 0 to #map()[%4] {
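// Each %arg6 iteration handles one 256-element tile of the current output row.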
%5 = affine.vector_load %arg1[%arg4, %arg5] : memref<1x1xf32>, vector<1xf32>
%6 = vector.broadcast %5 : vector<1xf32> to vector<256xf32>
%7 = arith.muli %arg6, %c256 : index
%8 = arith.subi %4, %7 : index
%9 = arith.cmpi sge, %8, %c256 : index
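// Full-width path when at least 256 elements remain; masked tail otherwise.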
scf.if %9 {
%10 = affine.vector_load %arg0[%arg3 + %arg4, %arg5 + %arg6 * 256] : memref<?x?xf32>, vector<256xf32>
%11 = affine.vector_load %arg2[%arg3, %arg6 * 256] : memref<?x?xf32>, vector<256xf32>
%12 = vector.fma %10, %6, %11 : vector<256xf32>
affine.vector_store %12, %arg2[%arg3, %arg6 * 256] : memref<?x?xf32>, vector<256xf32>
} else {
%10 = vector.create_mask %8 : vector<256xi1>
%11 = arith.addi %arg3, %arg4 : index
%12 = arith.muli %arg6, %c256 : index
%13 = arith.addi %arg5, %12 : index
%14 = vector.maskedload %arg0[%11, %13], %10, %0 : memref<?x?xf32>, vector<256xi1>, vector<256xf32> into vector<256xf32>
%15 = vector.maskedload %arg2[%arg3, %12], %10, %0 : memref<?x?xf32>, vector<256xi1>, vector<256xf32> into vector<256xf32>
%16 = vector.fma %14, %6, %15 : vector<256xf32>
vector.maskedstore %arg2[%arg3, %12], %10, %16 : memref<?x?xf32>, vector<256xi1>, vector<256xf32>
}
}
}
}
}
return
}
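// pw_cbsm_conv2d_outer_func_tensor: decomposes the pointwise (1x1) NHWC/HWCF
// convolution into per-batch (%on), per-output-channel (%of) 2-D convolutions,
// one per input channel (%kc), and accumulates their results. The inner 2-D
// convolution is executed by the vectorized memref kernel above.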
func @pw_cbsm_conv2d_outer_func_tensor(
%input: tensor<?x?x?x?xf32>,
%filter: tensor<?x?x?x?xf32>,
%output: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c3 = arith.constant 3 : index
%KH = tensor.dim %filter, %c0 : tensor<?x?x?x?xf32> // FH
%KW = tensor.dim %filter, %c1 : tensor<?x?x?x?xf32> // FW
%KC = tensor.dim %filter, %c2 : tensor<?x?x?x?xf32> // FC
%ON = tensor.dim %output, %c0 : tensor<?x?x?x?xf32> // ON
%OH = tensor.dim %output, %c1 : tensor<?x?x?x?xf32> // OH
%OW = tensor.dim %output, %c2 : tensor<?x?x?x?xf32> // OW
%OF = tensor.dim %output, %c3 : tensor<?x?x?x?xf32> // OF
// -1. Allocate and zero-fill the accumulation buffers used by the loop nest below.
%zero = arith.constant 0.00000e+00 : f32
%buf = linalg.init_tensor [%ON, %OH, %OW, %OF] : tensor<?x?x?x?xf32>
%buf3 = linalg.init_tensor [%OH, %OW, %OF] : tensor<?x?x?xf32>
%res_of_0 = linalg.fill(%zero, %buf3) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
%res_on_0 = linalg.fill(%zero, %buf) : f32, tensor<?x?x?x?xf32> -> tensor<?x?x?x?xf32>
%res_on = affine.for %on = #map0(%c0) to #map0(%ON) // %on: batch index, 0..ON
iter_args(%sum_on_iter = %res_on_0) -> (tensor<?x?x?x?xf32>) {
// iter_args binds initial values to the loop's region arguments.
%res_of = affine.for %of = #map0(%c0) to #map0(%OF) // step (%c1)
iter_args(%sum_of_iter = %res_of_0) -> (tensor<?x?x?xf32>) {
// 0. Zero-initialized per-channel accumulator out_adds_kc_tmp (shape [OH, OW]).
// %out_adds_kc_tmp = arith.constant dense<0> : tensor<%ONx%OHx%OWx%OFxf32>
%buf1 = linalg.init_tensor [%OH, %OW] : tensor<?x?xf32>
%out_adds_kc_tmp_0 = linalg.fill(%zero, %buf1) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
// iter_args binds initial values to the loop's region arguments.
%out_adds_kc_tmp = affine.for %kc = #map0(%c0) to #map0(%KC)
iter_args(%sum_iter = %out_adds_kc_tmp_0) -> (tensor<?x?xf32>) {
// 1. init kc_out_tmp[OH,OW]
%buf2 = linalg.init_tensor [%OH, %OW] : tensor<?x?xf32>
%output_inner = linalg.fill(%zero, %buf2) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
// 2. Slice the input and filter for the CBSM decomposition.
// input_inner = input[on,:,:,kc]
// filter_inner = filter[0,0,kc,of]
// %input_inner = tensor.extract_slice %input[%on,0,0,%kc][1,%OH,%OW,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<?x?xf32>
// %filter_inner = tensor.extract_slice %filter[0,0,%kc,%of][%KH,%KW,1,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<?x?xf32>
%input_inner0 = tensor.extract_slice %input[%on,0,0,%kc][1,%OH,%OW,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<1x?x?x1xf32>
// %filter_inner0 = tensor.extract_slice %filter[0,0,%kc,%of][%KH,%KW,1,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<?x?x1x1xf32>
%filter_inner0 = tensor.extract_slice %filter[0,0,%kc,%of][1,1,1,1][1,1,1,1] : tensor<?x?x?x?xf32> to tensor<1x1x1x1xf32>
%input_inner1 = tensor.collapse_shape %input_inner0 [[0, 1], [2, 3]] : tensor<1x?x?x1xf32> into tensor<?x?xf32>
%filter_inner1 = tensor.collapse_shape %filter_inner0 [[0, 1, 2], [3]] : tensor<1x1x1x1xf32> into tensor<1x1xf32>
// 3. call conv_2d
// %output_inner1 = call @conv_2d_tensor(%input_inner1, %filter_inner1, %output_inner) : (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
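// Bufferize the tensor operands, run the memref kernel in place on
// %output_inner_memref, then switch the result back to the tensor domain.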
%input_inner1_memref = bufferization.to_memref %input_inner1 : memref<?x?xf32>
%filter_inner1_memref = bufferization.to_memref %filter_inner1 : memref<1x1xf32>
%output_inner_memref = bufferization.to_memref %output_inner : memref<?x?xf32>
call @conv_2d_cbsm_memref(%input_inner1_memref, %filter_inner1_memref, %output_inner_memref) : (memref<?x?xf32>, memref<1x1xf32>, memref<?x?xf32>) -> ()
%output_inner1 = bufferization.to_tensor %output_inner_memref : memref<?x?xf32>
// %dynamic_2 = tensor.cast %output_inner : tensor<?x?xf32> to tensor<*xf32>
// call @print_memref_f32(%dynamic_2): (tensor<*xf32>) -> ()
%out_adds_kc_tmp_next = arith.addf %output_inner1, %sum_iter : tensor<?x?xf32>
// Yield current iteration sum to next iteration %sum_iter or to %sum
// if final iteration.
affine.yield %out_adds_kc_tmp_next : tensor<?x?xf32>
} // end-kc
// 5. Insert the accumulated kc result into the %of channel of the output
//    via tensor.insert_slice.
%out_adds_kc_tmp_expanded = tensor.expand_shape %out_adds_kc_tmp [[0],[1,2]] : tensor<?x?xf32> into tensor<?x?x1xf32>
%res_next = tensor.insert_slice %out_adds_kc_tmp_expanded into %sum_of_iter[0,0,%of][%OH,%OW,1][1,1,1] : tensor<?x?x1xf32> into tensor<?x?x?xf32>
// res insert a into res
// Yield current iteration sum to next iteration %sum_iter or to %sum
// if final iteration.
affine.yield %res_next : tensor<?x?x?xf32>
} // end-of
// Reattach the batch dimension before inserting into the 4-D output.
%res_of_expand = tensor.expand_shape %res_of [[0,1],[2],[3]] : tensor<?x?x?xf32> into tensor<1x?x?x?xf32>
%res_on_next = tensor.insert_slice %res_of_expand into %sum_on_iter[%on,0,0,0][1,%OH,%OW,%OF][1,1,1,1] : tensor<1x?x?x?xf32> into tensor<?x?x?x?xf32>
affine.yield %res_on_next : tensor<?x?x?x?xf32>
} // end-on
return %res_on : tensor<?x?x?x?xf32>
}