// Identity map on one dimension. conv_2d uses it to feed SSA values
// (the constant 0 and the memref.dim results) as affine.for loop bounds.
#map0 = affine_map<(d0) -> (d0)>
// ceil(d0 / 256). conv_2d uses it as the upper bound of its innermost loop,
// whose induction variable is multiplied by 256 in the memory accesses.
#map1 = affine_map<(d0) -> (d0 ceildiv 256)>
module {
// 2-D sliding-window multiply-accumulate, vectorized 256 lanes wide along the
// output columns. For every output element (r, c) and kernel tap (kr, kc):
//
//   %arg2[r, c] += %arg0[r + kr, c + kc] * %arg1[kr, kc]
//
// Arguments:
//   %arg0  input image; read at [r + kr, c + kc].
//   %arg1  kernel (filter); one scalar tap is read per (kr, kc).
//   %arg2  output; accumulated in place. It is NOT cleared here, so the
//          caller decides the initial contents.
//
// Loop extents come only from the dims of %arg1 and %arg2.
// NOTE(review): presumably %arg0 has at least (out_rows + k_rows - 1) rows;
// row indices are not bounds-checked here - confirm against callers.
//
// Column accesses use vector.transfer_read / vector.transfer_write rather
// than affine.vector_load / affine.vector_store: the tile loop runs
// ceildiv(out_cols, 256) times, so when out_cols is not a multiple of 256 the
// last tile extends past the end of the row. Transfer ops pad out-of-range
// lanes on read and skip them on write, so the tail tile no longer touches
// memory outside the row.
func @conv_2d(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // Value substituted for lanes that fall beyond the last column on reads.
  // Those lanes are never written back, so the value does not affect results.
  %pad = arith.constant 0.0 : f32

  %kh = memref.dim %arg1, %c0 : memref<?x?xf32> // kernel rows
  %kw = memref.dim %arg1, %c1 : memref<?x?xf32> // kernel cols
  %oh = memref.dim %arg2, %c0 : memref<?x?xf32> // output rows
  %ow = memref.dim %arg2, %c1 : memref<?x?xf32> // output cols

  affine.for %orow = #map0(%c0) to #map0(%oh) {       // output row
    affine.for %krow = #map0(%c0) to #map0(%kh) {     // kernel row
      // Image row paired with this (output row, kernel row).
      %irow = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%orow, %krow)
      affine.for %kcol = #map0(%c0) to #map0(%kw) {   // kernel col
        // The kernel tap does not depend on the column tile, so load and
        // splat it once per (krow, kcol) instead of once per tile.
        %tap = affine.load %arg1[%krow, %kcol] : memref<?x?xf32>
        %tapv = vector.broadcast %tap : f32 to vector<256xf32>
        affine.for %tile = #map0(%c0) to #map1(%ow) { // 256-wide column tile
          // First output column of this tile, and the matching image column
          // shifted by the kernel column offset.
          %ocol = affine.apply affine_map<(d0) -> (d0 * 256)>(%tile)
          %icol = affine.apply affine_map<(d0, d1) -> (d0 + d1 * 256)>(%kcol, %tile)
          // 256 image pixels; lanes past the image row end read as %pad.
          %img = vector.transfer_read %arg0[%irow, %icol], %pad : memref<?x?xf32>, vector<256xf32>
          // Current partial sums; lanes past the output row end read as %pad.
          %acc = vector.transfer_read %arg2[%orow, %ocol], %pad : memref<?x?xf32>, vector<256xf32>
          // acc + img * tap
          %res = vector.fma %img, %tapv, %acc : vector<256xf32>
          // Write back; lanes past the output row end are not stored.
          vector.transfer_write %res, %arg2[%orow, %ocol] : vector<256xf32>, memref<?x?xf32>
        }
      }
    }
  }
  return
}
}