ncnn - YingkunZhou/transfer-learning GitHub Wiki
#if NCNN_BENCHMARK
b /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:238
b /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:784
b /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:1041
b /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:1234
b /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:791
b /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:1048
/media/loongson/phd19/home/zhou/graduate9/work/ncnn/build/src/layer_registry.h
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2088 */
Extractor Net::create_extractor() const {
return Extractor(this, d->blobs.size());
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2273 */
Extractor::Extractor(const Net* _net, size_t blob_count): d(new ExtractorPrivate(_net)) {
d->blob_mats.resize(blob_count);
d->opt = d->net->opt;
d->local_blob_vkallocator = 0;
d->local_staging_vkallocator = 0;
d->blob_mats_gpu.resize(blob_count);
d->blob_mats_gpu_image.resize(blob_count);
}
}
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2407 */
int Extractor::input(const char* blob_name, const Mat& in) {
int blob_index = d->net->find_blob_index_by_name(blob_name);
return input(blob_index, in);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2444 */
int Extractor::input(int blob_index, const Mat& in) {
d->blob_mats[blob_index] = in;
}
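Before diving into extract(), here is the caller-side view of the API this trace walks through — a minimal end-to-end sketch (model paths and blob names are made up; error handling trimmed):

#include "net.h"

int main()
{
    ncnn::Net net;
    net.opt.use_vulkan_compute = true; // take the Vulkan path traced below

    if (net.load_param("model.param")) return -1; // traced at net.cpp:1859
    if (net.load_model("model.bin")) return -1;   // traced at net.cpp:1900

    ncnn::Mat in(224, 224, 3);
    in.fill(0.5f);

    ncnn::Extractor ex = net.create_extractor();
    ex.input("data", in);      // fills d->blob_mats[blob_index]
    ncnn::Mat out;
    ex.extract("output", out); // pulls the whole graph, see below
    return 0;
}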
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2423 */
int Extractor::extract(const char* blob_name, Mat& feat, int type) {
int blob_index = d->net->find_blob_index_by_name(blob_name);
return extract(blob_index, feat, type);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2452 */
int Extractor::extract(int blob_index, Mat& feat, int type) {
set_kmp_blocktime(d->opt.openmp_blocktime);
set_flush_denormals(d->opt.flush_denormals);
d->blob_mats[blob_index].dims == 0
int layer_index = d->net->blobs()[blob_index].producer;
d->opt.blob_allocator = d->net->d->local_blob_allocator;
d->opt.workspace_allocator = d->net->d->local_workspace_allocator;
d->opt.use_vulkan_compute
d->local_blob_vkallocator = d->net->vulkan_device()->acquire_blob_allocator();
d->opt.blob_vkallocator = d->local_blob_vkallocator;
d->opt.workspace_vkallocator = d->opt.blob_vkallocator;
d->local_staging_vkallocator = d->net->vulkan_device()->acquire_staging_allocator();
d->opt.staging_vkallocator = d->local_staging_vkallocator;
ncnn::VkCompute cmd(d->net->vulkan_device());
!d->opt.use_image_storage
ret = extract(blob_index, feat_gpu, cmd); | VkMat feat_gpu;
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2713 */
int Extractor::extract(int blob_index, VkMat& feat, VkCompute& cmd) {
set_kmp_blocktime(d->opt.openmp_blocktime);
set_flush_denormals(d->opt.flush_denormals);
d->blob_mats_gpu[blob_index].dims == 0
d->blob_mats_gpu_image[blob_index].dims == 0
d->blob_mats[blob_index].dims == 0
ret = d->net->d->forward_layer(layer_index, d->blob_mats, d->blob_mats_gpu, cmd, d->opt);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:234 */
int NetPrivate::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector<VkMat>& blob_mats_gpu, VkCompute& cmd, const Option& opt) const {
const Layer* layer = layers[layer_index];
for (size_t i = 0; i < layer->bottoms.size(); i++) // load bottom blobs
int bottom_blob_index = layer->bottoms[i];
blob_mats_gpu[bottom_blob_index].dims == 0 && blob_mats[bottom_blob_index].dims == 0
int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, cmd, opt);
.... recursive call!
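The recursion marked here is ncnn's pull-model scheduler: requesting one blob forces every not-yet-computed producer beneath it to run first. A self-contained skeleton of the idea (types simplified and hypothetical, not ncnn's real signatures):

#include <cstdio>
#include <vector>

struct Blob  { int producer; };                      // stands in for ncnn::Blob
struct Layer { std::vector<int> bottoms; int top; };

struct Graph
{
    std::vector<Blob> blobs;
    std::vector<Layer> layers;
    std::vector<bool> computed; // stands in for blob_mats[i].dims != 0

    void forward_layer(int layer_index)
    {
        const Layer& layer = layers[layer_index];
        for (int b : layer.bottoms)
            if (!computed[b])
                forward_layer(blobs[b].producer); // the recursive call marked above
        printf("run layer %d\n", layer_index);    // do_forward_layer in the real code
        computed[layer.top] = true;
    }
};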
layer->support_vulkan
blob_mats_gpu[bottom_blob_index].dims == 0
cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt); // host to buffer
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:358 */
void VkCompute::record_upload(const Mat& src, VkMat& dst, const Option& opt) {
src.elemsize == src.elempack * 4u
!(vkdev->info.type() == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0)))
dst_staging.create_like(src_fp16, opt.staging_vkallocator); // VkMat dst_staging;
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:814 */
void VkMat::create_like(const Mat& m, VkAllocator* _allocator) {
int _dims = m.dims;
create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator); // _dims == 3
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:748 */
void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) {
!(dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == _elempack && allocator == _allocator)
release();
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.h:1543 */
NCNN_FORCEINLINE void VkMat::release() {
!(refcount && NCNN_XADD(refcount, -1) == 1) // refcount==0
}
cstep = alignSize(w * h * elemsize, 16) / elemsize;
total() > 0
size_t totalsize = alignSize(total() * elemsize, 4);
data = allocator->fastMalloc(totalsize);
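Both cstep and totalsize above go through alignSize, which rounds a byte count up to a multiple of n (n a power of two); reproduced from memory from src/allocator.h, so treat as approximate:

static inline size_t alignSize(size_t sz, int n)
{
    return (sz + n - 1) & -n; // e.g. alignSize(13, 16) == 16
}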
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:1695 */
VkBufferMemory* VkStagingAllocator::fastMalloc(size_t size) {
VkBufferMemory* ptr = new VkBufferMemory;
ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:434 */
VkBuffer VkAllocator::create_buffer(size_t size, VkBufferUsageFlags usage) {
VkResult ret = <<vkCreateBuffer(vkdev->vkdevice(), &bufferCreateInfo, 0, &buffer);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:2340 */
VkDevice VulkanDevice::vkdevice() const
}
<<vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements);
buffer_memory_type_index == (uint32_t)-1
buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:2690 */
uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const {
const VkPhysicalDeviceMemoryProperties& memory_properties = info.physical_device_memory_properties();
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:285 */
const VkPhysicalDeviceMemoryProperties& GpuInfo::physical_device_memory_properties() const {
return d->physical_device_memory_properties;
}
}
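find_device_memory_index is the standard Vulkan memory-type search; the minimal form of the pattern looks like this (a generic sketch — ncnn's real version additionally scores the preferred/preferred_not flags passed above):

#include <vulkan/vulkan.h>

uint32_t find_memory_index_minimal(const VkPhysicalDeviceMemoryProperties& props,
                                   uint32_t memory_type_bits, VkMemoryPropertyFlags required)
{
    for (uint32_t i = 0; i < props.memoryTypeCount; i++)
    {
        if (!(memory_type_bits & (1u << i)))
            continue; // type not allowed for this buffer
        if ((props.memoryTypes[i].propertyFlags & required) == required)
            return i; // first type carrying all required property flags
    }
    return (uint32_t)-1; // matches the sentinel checked above
}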
ptr->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:457 */
VkDeviceMemory VkAllocator::allocate_memory(size_t size, uint32_t memory_type_index) {
VkResult ret = <<vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory);
}
<<vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0);
<<vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr);
}
refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount));
}
}
d->upload_staging_buffers.push_back(dst_staging);
// memcpy src to device
<<memcpy(dst_staging.mapped_ptr(), src_fp16.data, src_fp16.total() * src_fp16.elemsize);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.h:1529 */
NCNN_FORCEINLINE void* VkMat::mapped_ptr() const
dst_staging.allocator->flush(dst_staging.data);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:390 */
int VkAllocator::flush(VkBufferMemory* ptr)
coherent --> return 0; // HOST_COHERENT memory needs no explicit flush
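The map/memcpy/flush sequence above is the standard Vulkan host-to-device staging idiom. In plain Vulkan, without ncnn's wrappers (note ncnn maps the staging memory once at allocation time and keeps it mapped):

#include <cstring>
#include <vulkan/vulkan.h>

void upload_to_staging(VkDevice device, VkDeviceMemory memory,
                       const void* host_data, VkDeviceSize size, bool coherent)
{
    void* mapped = 0;
    vkMapMemory(device, memory, 0, size, 0, &mapped);
    memcpy(mapped, host_data, (size_t)size);
    if (!coherent) // HOST_COHERENT memory skips this, as in VkAllocator::flush
    {
        VkMappedMemoryRange range = {};
        range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
        range.memory = memory;
        range.offset = 0;
        range.size = VK_WHOLE_SIZE;
        vkFlushMappedMemoryRanges(device, 1, &range);
    }
    vkUnmapMemory(device, memory);
}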
vkdev->convert_packing(dst_staging, dst, dst_elempack, *this, opt);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:3064 */
void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& _opt) const {
src.elembits() == 32
const ncnn::Packing_vulkan* uop = d->get_utility_operator(0, 0, cast_type_from_index, cast_type_to_index, packing_type_to_index);
const ncnn::Packing_vulkan*
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1899 */
VulkanDevicePrivate::get_utility_operator(int storage_type_from, int storage_type_to, int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const {
ncnn::Packing_vulkan* uop = new ncnn::Packing_vulkan;
uop->load_param(pd); // ncnn::ParamDict pd;
uop->create_pipeline(opt); // Option opt;
}
uop->forward(src, dst, cmd, opt);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/vulkan/packing_vulkan.cpp:257 */
int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const {
out_elemsize = out_elempack * 2u;
top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:748 */
void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
elempack == 1 && out_elempack == 1
cmd.record_pipeline(pipeline_packing, buffer_bindings, image_bindings, constants, top_blob);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:1506 */
void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher) {
Mat dispatcher_mat(dispatcher.w, dispatcher.h, dispatcher.d, dispatcher.c, (void*)0);
record_pipeline(pipeline, buffer_bindings, image_bindings, constants, dispatcher_mat);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:1520 */
void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher) {
const ShaderInfo& shader_info = pipeline->shader_info();
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/pipeline.cpp:170 */
return d->shader_info;
binding_type == 1
const VkMat& binding = buffer_bindings[buffer_index].empty() ? vkdev->get_dummy_buffer() : buffer_bindings[buffer_index];
barrier_readwrite(binding);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:2600 */
void VkCompute::barrier_readwrite(const VkMat& binding) {
binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT
VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1];
vkdev->info.support_VK_KHR_push_descriptor()
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:600 */
int GpuInfo::support_VK_KHR_push_descriptor() const
return d->support_VK_KHR_push_descriptor;
<<vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
}
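Written out in plain Vulkan, the compute-to-compute buffer barrier being recorded here looks roughly like this (a generic sketch; the access/stage flags are the usual ones, not copied from ncnn):

#include <vulkan/vulkan.h>

void barrier_readwrite_plain(VkCommandBuffer cb, VkBuffer buffer,
                             VkDeviceSize offset, VkDeviceSize size)
{
    VkBufferMemoryBarrier barrier = {};
    barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
    barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
    barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    barrier.buffer = buffer;
    barrier.offset = offset;
    barrier.size = size;
    vkCmdPipelineBarrier(cb,
                         VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // src_stage
                         VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // dst_stage
                         0, 0, 0, 1, &barrier, 0, 0);
}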
<<vkCmdBindPipeline(d->compute_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline());
binding_count > 0
descriptorInfos.resize(sizeof(VkDescriptorBufferInfo) * buffer_binding_count + sizeof(VkDescriptorImageInfo) * image_binding_count);
memcpy(p_descriptorInfos, &descriptorBufferInfo, sizeof(VkDescriptorBufferInfo)); // VkDescriptorBufferInfo descriptorBufferInfo;
vkdev->vkCmdPushDescriptorSetWithTemplateKHR(d->compute_command_buffer, pipeline->descriptor_update_template(), pipeline->pipeline_layout(), 0, descriptorInfos.data());
// steps into the NVIDIA driver: /usr/lib/aarch64-linux-gnu/tegra/libnvidia-eglcore.so.35.2.1
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/pipeline.cpp:165 */
VkDescriptorUpdateTemplateKHR Pipeline::descriptor_update_template() const
return d->descriptor_update_template;
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/pipeline.cpp:155 */
VkPipelineLayout Pipeline::pipeline_layout() const
return d->pipeline_layout;
<<vkCmdPushConstants(d->compute_command_buffer, pipeline->pipeline_layout(), VK_SHADER_STAGE_COMPUTE_BIT, 0, constant_count * sizeof(vk_constant_type), constants.data());
<<vkCmdDispatch(d->compute_command_buffer, group_count_x, group_count_y, group_count_z);
}
}
}
}
}
!layer->featmask
ret = do_forward_layer(layer, blob_mats_gpu, cmd, opt);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:879 */
int NetPrivate::do_forward_layer(const Layer* layer, std::vector<VkMat>& blob_mats_gpu, VkCompute& cmd, const Option& opt) const {
layer->one_blob_only
!(layer->support_inplace && *bottom_blob_ref.refcount != 1)
int ret = layer->forward(bottom_blob, top_blob, cmd, opt);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/vulkan/convolution_vulkan.cpp:1106 */
int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const {
(pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/vulkan/padding_vulkan.cpp:354 */
int Padding_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const {
top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:748 */
void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
cmd.record_pipeline(pipeline, bindings, constants, top_blob);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:1496 */
void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher) {
record_pipeline(pipeline, bindings, std::vector<VkImageMat>(), constants, dispatcher);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:1520 */
void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher)
}
}
top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:748 */
void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:1496 */
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:1520 */
void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher)
}
}
}
}
}
}
g_blob_pool_allocator.set_size_compare_ratio(0.f);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:272 */
void UnlockedPoolAllocator::set_size_compare_ratio(float scr)
d->size_compare_ratio = (unsigned int)(scr * 256);
g_workspace_pool_allocator.set_size_compare_ratio(0.f);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:95 */
void PoolAllocator::set_size_compare_ratio(float scr)
d->size_compare_ratio = (unsigned int)(scr * 256);
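size_compare_ratio controls block reuse in the pool: scr is stored as 8-bit fixed point (scr * 256), and a cached block of size bs may serve a request of size sz only if it is large enough but not "too" oversized. Reconstructed from the trace, so approximate:

#include <cstddef>

// reuse test behind PoolAllocator::fastMalloc (approximate):
//   bs >= sz                               the block is large enough
//   (bs * size_compare_ratio) >> 8 <= sz   and not too oversized
// with set_size_compare_ratio(0.f) the second test is always true, so any
// sufficiently large cached block is reused -- what benchmarks want.
bool block_fits(size_t bs, size_t sz, unsigned int size_compare_ratio)
{
    return bs >= sz && ((bs * size_compare_ratio) >> 8) <= sz;
}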
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:3285 */
VulkanDevice* get_gpu_device(int device_index) {
try_create_gpu_instance();
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1735 */
static void try_create_gpu_instance() {
!is_gpu_instance_ready()
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1727 */
static bool is_gpu_instance_ready() {
MutexLockGuard lock(g_instance_lock);
return (VkInstance)g_instance != 0;
}
create_gpu_instance();
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:900 */
int create_gpu_instance() {
MutexLockGuard lock(g_instance_lock);
(VkInstance)g_instance == 0
std::vector<const char*> enabledLayers;
ret = <<vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, NULL); | uint32_t instanceExtensionPropertyCount
std::vector<VkExtensionProperties> instanceExtensionProperties(instanceExtensionPropertyCount);
ret = <<vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, instanceExtensionProperties.data());
const VkExtensionProperties& exp = instanceExtensionProperties[j];
!support_VK_EXT_validation_features
support_VK_KHR_external_memory_capabilities --> enabledExtensions.push_back("VK_KHR_external_memory_capabilities");
support_VK_KHR_get_physical_device_properties2 --> enabledExtensions.push_back("VK_KHR_get_physical_device_properties2");
support_VK_KHR_get_surface_capabilities2 --> enabledExtensions.push_back("VK_KHR_get_surface_capabilities2");
!support_VK_KHR_portability_enumeration
support_VK_KHR_surface --> enabledExtensions.push_back("VK_KHR_surface");
typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t * pApiVersion);
PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion = (PFN_vkEnumerateInstanceVersion)<<vkGetInstanceProcAddr(0, "vkEnumerateInstanceVersion");
ret = vkEnumerateInstanceVersion(&instance_api_version);
ret = <<vkCreateInstance(&instanceCreateInfo, 0, &instance);
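Condensed, the instance setup above is the textbook Vulkan sequence: enumerate extensions, enable the supported ones, create the instance. A generic sketch (not ncnn's exact VkApplicationInfo):

#include <vector>
#include <vulkan/vulkan.h>

VkInstance create_instance_minimal(const std::vector<const char*>& extensions)
{
    VkApplicationInfo app = {};
    app.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
    app.pApplicationName = "ncnn";
    app.apiVersion = VK_MAKE_VERSION(1, 1, 0);

    VkInstanceCreateInfo ci = {};
    ci.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
    ci.pApplicationInfo = &app;
    ci.enabledExtensionCount = (uint32_t)extensions.size();
    ci.ppEnabledExtensionNames = extensions.data();

    VkInstance instance = VK_NULL_HANDLE;
    vkCreateInstance(&ci, 0, &instance);
    return instance;
}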
init_instance_extension();
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:674 */
static int init_instance_extension() {
// support_VK_KHR_external_memory_capabilities -->
vkGetPhysicalDeviceExternalBufferPropertiesKHR = (PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceExternalBufferPropertiesKHR");
// support_VK_KHR_get_physical_device_properties2 -->
vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFeatures2KHR");
vkGetPhysicalDeviceProperties2KHR = (PFN_vkGetPhysicalDeviceProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceProperties2KHR");
vkGetPhysicalDeviceFormatProperties2KHR = (PFN_vkGetPhysicalDeviceFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFormatProperties2KHR");
vkGetPhysicalDeviceImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceImageFormatProperties2KHR");
vkGetPhysicalDeviceQueueFamilyProperties2KHR = (PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceQueueFamilyProperties2KHR");
vkGetPhysicalDeviceMemoryProperties2KHR = (PFN_vkGetPhysicalDeviceMemoryProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceMemoryProperties2KHR");
vkGetPhysicalDeviceSparseImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSparseImageFormatProperties2KHR");
// support_VK_KHR_get_surface_capabilities2 -->
vkGetPhysicalDeviceSurfaceCapabilities2KHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilities2KHR");
vkGetPhysicalDeviceSurfaceFormats2KHR = (PFN_vkGetPhysicalDeviceSurfaceFormats2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormats2KHR");
// support_VK_KHR_surface -->
vkDestroySurfaceKHR = (PFN_vkDestroySurfaceKHR)vkGetInstanceProcAddr(g_instance, "vkDestroySurfaceKHR");
vkGetPhysicalDeviceSurfaceSupportKHR = (PFN_vkGetPhysicalDeviceSurfaceSupportKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceSupportKHR");
vkGetPhysicalDeviceSurfaceCapabilitiesKHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilitiesKHR");
vkGetPhysicalDeviceSurfaceFormatsKHR = (PFN_vkGetPhysicalDeviceSurfaceFormatsKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormatsKHR");
vkGetPhysicalDeviceSurfacePresentModesKHR = (PFN_vkGetPhysicalDeviceSurfacePresentModesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfacePresentModesKHR");
// VK_NV_cooperative_matrix
vkGetPhysicalDeviceCooperativeMatrixPropertiesNV = (PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceCooperativeMatrixPropertiesNV");
}
ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, 0);
std::vector<VkPhysicalDevice> physicalDevices(physicalDeviceCount);
ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, physicalDevices.data());
g_gpu_infos[gpu_info_index] = new GpuInfo;
GpuInfoPrivate& gpu_info = *(g_gpu_infos[gpu_info_index]->d);
<<vkGetPhysicalDeviceProperties(physicalDevice, &physicalDeviceProperties); | VkPhysicalDeviceProperties physicalDeviceProperties;
(physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) --> gpu_info.type = 1;
// find compute queue
vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, 0); | uint32_t queueFamilyPropertiesCount;
std::vector<VkQueueFamilyProperties> queueFamilyProperties(queueFamilyPropertiesCount);
<<vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, queueFamilyProperties.data());
gpu_info.compute_queue_family_index = find_device_compute_queue(queueFamilyProperties);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:752 */
static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
gpu_info.graphics_queue_family_index = find_device_graphics_queue(queueFamilyProperties);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:793 */
static uint32_t find_device_graphics_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
gpu_info.transfer_queue_family_index = find_device_transfer_queue(queueFamilyProperties);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:834 */
static uint32_t find_device_transfer_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
<<vkGetPhysicalDeviceProperties2KHR(physicalDevice, &queryProperties);
// cache memory properties
<<vkGetPhysicalDeviceMemoryProperties(physicalDevice, &gpu_info.physical_device_memory_properties);
// get device extension
ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, NULL);
ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, deviceExtensionProperties.data());
// support_VK_KHR_get_physical_device_properties2
// query int8 storage
VkPhysicalDevice8BitStorageFeaturesKHR query8BitStorageFeatures;
// query fp16/int16 storage
VkPhysicalDevice16BitStorageFeaturesKHR query16BitStorageFeatures;
// query fp16/int8 arithmetic
VkPhysicalDeviceFloat16Int8FeaturesKHR queryFloat16Int8Features;
// query ycbcr_conversion
VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR querySamplerYcbcrConversionFeatures;
// query cooperative_matrix
VkPhysicalDeviceCooperativeMatrixFeaturesNV queryCooperativeMatrixFeatures;
<<vkGetPhysicalDeviceFeatures2KHR(physicalDevice, &queryFeatures);
ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, 0);
ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, properties.data());
const VkCooperativeMatrixPropertiesNV& cmp = properties[j]; //j==5
--> gpu_info.support_cooperative_matrix_16_8_8 = true;
**NCNN_LOGE
// the default gpu device
g_default_gpu_index = find_default_vulkan_device_index();
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/glslang/glslang/MachineIndependent/ShaderLang.cpp:1740 */
bool InitializeProcess() {
return ShInitialize() != 0;
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/glslang/glslang/MachineIndependent/ShaderLang.cpp:1332 */
// ShInitialize() should be called exactly once per process, not per thread.
int ShInitialize() {
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/glslang/glslang/OSDependent/Unix/ossource.cpp:139 */
void InitGlobalLock()
}
}
}
}
g_default_vkdev[device_index] = new VulkanDevice(device_index);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:2028 */
VulkanDevice::VulkanDevice(int device_index)
: info(get_gpu_info(device_index)), d(new VulkanDevicePrivate(this)) {
try_create_gpu_instance();
std::vector<const char*> enabledExtensions;
info.support_VK_KHR_8bit_storage() --> enabledExtensions.push_back("VK_KHR_8bit_storage");
info.support_VK_KHR_16bit_storage() --> enabledExtensions.push_back("VK_KHR_16bit_storage");
info.support_VK_KHR_bind_memory2() --> enabledExtensions.push_back("VK_KHR_bind_memory2");
info.support_VK_KHR_buffer_device_address() --> enabledExtensions.push_back("VK_KHR_buffer_device_address");
info.support_VK_KHR_create_renderpass2() --> enabledExtensions.push_back("VK_KHR_create_renderpass2");
info.support_VK_KHR_dedicated_allocation() --> enabledExtensions.push_back("VK_KHR_dedicated_allocation");
...
>>> print *(enabledExtensions._M_impl._M_start)@enabledExtensions.size()
$62 = {[0] = 0xfffff7da57e0 "VK_KHR_8bit_storage", [1] = 0xfffff7da57f8 "VK_KHR_16bit_storage", [2] = 0xfffff7da5810 "VK_KHR_bind_memory2", [3] = 0xfffff7da5828 "VK_KHR_buffer_device_address", [4] = 0xfffff7da5848 "VK_KHR_create_renderpass2", [5] = 0xfffff7da5868 "VK_KHR_dedicated_allocation", [6] = 0xfffff7da5888 "VK_KHR_descriptor_update_template", [7] = 0xfffff7da58b0 "VK_KHR_external_memory", [8] = 0xfffff7da58c8 "VK_KHR_get_memory_requirements2", [9] = 0xfffff7da58e8 "VK_KHR_maintenance1", [10] = 0xfffff7da5900 "VK_KHR_maintenance2", [11] = 0xfffff7da5918 "VK_KHR_maintenance3", [12] = 0xfffff7da5930 "VK_KHR_multiview", [13] = 0xfffff7da5968 "VK_KHR_push_descriptor", [14] = 0xfffff7da5980 "VK_KHR_sampler_ycbcr_conversion", [15] = 0xfffff7da59a0 "VK_KHR_shader_float16_int8", [16] = 0xfffff7da59c0 "VK_KHR_shader_float_controls", [17] = 0xfffff7da59e0 "VK_KHR_storage_buffer_storage_class", [18] = 0xfffff7da5a08 "VK_KHR_swapchain", [19] = 0xfffff7da5a40 "VK_EXT_descriptor_indexing", [20] = 0xfffff7da5a60 "VK_EXT_memory_budget", [21] = 0xfffff7da5a90 "VK_EXT_queue_family_foreign", [22] = 0xfffff7da5ad0 "VK_NV_cooperative_matrix"}
// enable int8 storage
VkPhysicalDevice8BitStorageFeaturesKHR enabled8BitStorageFeatures;
// enable fp16/int16 storage
VkPhysicalDevice16BitStorageFeaturesKHR enabled16BitStorageFeatures;
// enable fp16/int8 arithmetic
VkPhysicalDeviceFloat16Int8FeaturesKHR enabledFloat16Int8Features;
// enable ycbcr conversion
VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR querySamplerYcbcrConversionFeatures;
// enable cooperative matrix
VkPhysicalDeviceCooperativeMatrixFeaturesNV queryCooperativeMatrixFeatures;
VkDeviceQueueCreateInfo deviceComputeQueueCreateInfo;
VkDeviceQueueCreateInfo deviceGraphicsQueueCreateInfo;
VkDeviceQueueCreateInfo deviceTransferQueueCreateInfo;
VkResult ret = <<vkCreateDevice(info.physical_device(), &deviceCreateInfo, 0, &d->device);
init_device_extension();
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:3198 */
<<vkGetDeviceQueue(d->device, info.compute_queue_family_index(), i, &d->compute_queues[i]);
<<vkGetDeviceQueue(d->device, info.graphics_queue_family_index(), i, &d->graphics_queues[i]);
<<vkGetDeviceQueue(d->device, info.transfer_queue_family_index(), i, &d->transfer_queues[i]);
// prepare immutable texelfetch sampler
VkSamplerCreateInfo samplerCreateInfo;
ret = vkCreateSampler(d->device, &samplerCreateInfo, 0, &d->texelfetch_sampler);
int cret = d->create_dummy_buffer_image();
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1858 */
int VulkanDevicePrivate::create_dummy_buffer_image() {
dummy_allocator = new VkDummyAllocator(vkdev);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1765 */
class VkDummyAllocator : public VkBlobAllocator {
VkBlobAllocator(_vkdev, 16 * 1024)
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:594 */
VkBlobAllocator::VkBlobAllocator(const VulkanDevice* _vkdev, size_t preferred_block_size) {
VkAllocator(_vkdev), d(new VkBlobAllocatorPrivate)
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:362 */
VkAllocator::VkAllocator(const VulkanDevice* _vkdev)
}
dummy_buffer.create(1, 4u, dummy_allocator);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:552 */
void VkMat::create(int _w, size_t _elemsize, VkAllocator* _allocator) {
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:675 */
VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size)
**NCNN_LOGE
}
dummy_image.create(1, 4u, dummy_allocator);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:855 */
void VkImageMat::create(int _w, size_t _elemsize, VkAllocator* _allocator) {
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:860 */
VkImageMemory* VkBlobAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int elempack) {
VkImageMemory* ptr = new VkImageMemory;
ptr->image = create_image(width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:503 */
VkImage VkAllocator::create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage) {
VkResult ret = <<vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image);
}
vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements); | VkMemoryRequirements memoryRequirements;
ptr->imageview = create_imageview(ptr->image, format);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:535 */
VkImageView VkAllocator::create_imageview(VkImage image, VkFormat format)
VkResult ret = <<vkCreateImageView(vkdev->vkdevice(), &imageViewCreateInfo, 0, &imageview);
**NCNN_LOGE
}
}
dummy_image_readonly.create(1, 4u, dummy_allocator);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:855 */
VkDummyCompute cmd(vkdev);
class VkDummyCompute : public VkCompute
cmd.record_dummy(dummy_buffer);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1780 */
void record_dummy(const VkMat& buffer) {
barrier_readwrite(buffer);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:2601 */
void VkCompute::barrier_readwrite(const VkMat& binding) {
(binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
vkdev->info.support_VK_KHR_push_descriptor() --> vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
}
}
cmd.record_dummy(dummy_image);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1785 */
void record_dummy(const VkImageMat& image) {
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:2643 */
void VkCompute::barrier_readwrite(const VkImageMat& binding) {
(binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->image_layout != VK_IMAGE_LAYOUT_GENERAL || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
vkdev->info.support_VK_KHR_push_descriptor() --> vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
}
}
cmd.record_dummy_readonly(dummy_image_readonly);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1790 */
void record_dummy_readonly(const VkImageMat& image) {
barrier_readonly(image);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:2691 */
void VkCompute::barrier_readonly(const VkImageMat& binding) {
vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
}
}
return cmd.submit_and_wait();
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:2292 */
int VkCompute::submit_and_wait() {
vkdev->info.support_VK_KHR_push_descriptor()
d->end_command_buffer();
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:338 */
int VkComputePrivate::end_command_buffer() {
VkResult ret = <<vkEndCommandBuffer(compute_command_buffer);
}
VkQueue compute_queue = vkdev->acquire_queue(vkdev->info.compute_queue_family_index());
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:2774 */
VkQueue VulkanDevice::acquire_queue(uint32_t queue_family_index) const
VkResult ret = <<vkQueueSubmit(compute_queue, 1, &submitInfo, d->compute_command_fence);
vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:2833 */
void VulkanDevice::reclaim_queue(uint32_t queue_family_index, VkQueue queue) const
VkResult ret = <<vkWaitForFences(vkdev->vkdevice(), 1, &d->compute_command_fence, VK_TRUE, (uint64_t)-1);
d->delayed_records.size()
d->delayed_records.clear();
}
}
}
d->pipeline_cache = new PipelineCache(this);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/pipelinecache.cpp:172 */
PipelineCache::PipelineCache(const VulkanDevice* _vkdev): vkdev(_vkdev), d(new PipelineCachePrivate)
memset(d->uop_packing, 0, sizeof(d->uop_packing));
}
}
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:1859 */
int Net::load_param(const char* protopath) {
int ret = load_param(fp);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:1845 */
int Net::load_param(FILE* fp) {
DataReaderFromStdio dr(fp);
return load_param(dr);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:1289 */
int Net::load_param(const DataReader& dr) {
SCAN_VALUE("%d", magic)
SCAN_VALUE("%d", layer_count)
SCAN_VALUE("%d", blob_count)
// TODO enable gpu when bf16 conversion implemented
if (opt.use_bf16_storage) --> opt.use_vulkan_compute = false;
... ...
SCAN_VALUE("%255s", layer_type)
SCAN_VALUE("%255s", layer_name)
SCAN_VALUE("%d", bottom_count)
SCAN_VALUE("%d", top_count)
Layer* layer = create_overwrite_builtin_layer(layer_type);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2204 */
Layer* Net::create_overwrite_builtin_layer(const char* type) {
int typeindex = layer_to_index(type);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer.cpp:208 */
int layer_to_index(const char* type)
return create_overwrite_builtin_layer(typeindex);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2229 */
layer = create_layer(layer_type);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer.cpp:219 */
Layer* create_layer(const char* type) {
int index = layer_to_index(type);
return create_layer(index);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer.cpp:229 */
Layer* create_layer(int index) {
layer_creator = layer_registry[index].creator;
Layer* layer = layer_creator(0);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/build/src/layer_declaration.h:340 */
DEFINE_LAYER_CREATOR(Input_final)
DEFINE_LAYER_CREATOR(Reshape_final)
}
}
}
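DEFINE_LAYER_CREATOR above just stamps out a factory function per layer; from memory of src/layer.h (so approximate), it expands to:

#define DEFINE_LAYER_CREATOR(name)                          \
    ::ncnn::Layer* name##_layer_creator(void* /*userdata*/) \
    {                                                       \
        return new name;                                    \
    }
// so DEFINE_LAYER_CREATOR(Input_final) yields Input_final_layer_creator(),
// which is what layer_registry[index].creator points at above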
SCAN_VALUE("%255s", blob_name)
int pdlr = pd.load_param(dr);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/paramdict.cpp:247 */
int ParamDict::load_param(const DataReader& dr) {
clear();
bool is_float = vstr_is_float(vstr);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/paramdict.cpp:154 */
static bool vstr_is_float(const char vstr[16])
}
// pull out top shape hints
Mat shape_hints = pd.get(30, Mat());
shape_hints.empty()
layer->featmask = pd.get(31, 0);
int lr = layer->load_param(pd);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/input.cpp:31 */
int Input::load_param(const ParamDict& pd)
}
}
}
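Input::load_param only records the declared shape; from memory of src/layer/input.cpp (approximate — newer ncnn also reads a depth dimension), it is essentially:

int Input::load_param(const ParamDict& pd)
{
    w = pd.get(0, 0); // param keys 0/1/2 map to w/h/c in the .param file
    h = pd.get(1, 0);
    c = pd.get(2, 0);
    return 0;
}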
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:1900 */
int Net::load_model(const char* modelpath) {
int ret = load_model(fp);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:1893 */
int Net::load_model(FILE* fp) {
return load_model(dr);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:1726 */
int Net::load_model(const DataReader& dr) {
ModelBinFromDataReader mb(dr);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/modelbin.cpp:74 */
ModelBinFromDataReader::ModelBinFromDataReader(const DataReader& _dr) : ModelBin(), d(new ModelBinFromDataReaderPrivate(_dr))
int lret = layer->load_model(mb);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/memorydata.cpp:36 */
int MemoryData::load_model(const ModelBin& mb) {
data = mb.load(w, h, c, 1);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/modelbin.cpp:46 */
Mat ModelBin::load(int w, int h, int c, int type) const {
Mat m = load(w * h * c, type);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/modelbin.cpp:94 */
Mat ModelBinFromDataReader::load(int w, int type) const
return m.reshape(w, h, c);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:144 */
Mat Mat::reshape(int _w, int _h, int _c, Allocator* _allocator) const {
m.create(_w, _h, _c, elemsize, elempack, _allocator);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:440 */
void Mat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, Allocator* _allocator) {
data = fastMalloc(totalsize + (int)sizeof(*refcount));
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.h:76 */
static NCNN_FORCEINLINE void* fastMalloc(size_t size)
}
}
}
}
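fastMalloc above is an aligned malloc; note at mat.cpp:440 that Mat over-allocates sizeof(*refcount) extra bytes and parks the reference count right behind the payload. A generic aligned-allocation sketch for POSIX (ncnn's real one also over-reads a few bytes and branches per OS):

#include <cstdlib>

static void* fast_malloc_sketch(size_t size, size_t align = 64)
{
    void* ptr = 0;
    if (posix_memalign(&ptr, align, size) != 0)
        ptr = 0; // allocation failed
    return ptr;  // aligned for SIMD loads, release with free()
}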
d->pipeline_cache = new PipelineCache(d->vkdev);
Option opt1 = get_masked_option(opt, layer->featmask);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:113 */
static Option get_masked_option(const Option& opt, int featmask)
int cret = layer->create_pipeline(opt1);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/build/src/layer_declaration.h:332 */
ncnn::Input_final::create_pipeline(ncnn::Option const&)
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/build/src/layer_declaration.h:391 */
ncnn::MemoryData_final::create_pipeline(ncnn::Option const&) {
int ret = MemoryData_vulkan::create_pipeline(opt)
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/vulkan/memorydata_vulkan.cpp:28 */
int MemoryData_vulkan::create_pipeline(const Option& opt) {
vkdev->shape_support_image_storage(out_shape_packed)
}
}
d->local_blob_allocator = new PoolAllocator;
d->local_blob_allocator->set_size_compare_ratio(0.f);
d->local_workspace_allocator = new PoolAllocator;
d->local_workspace_allocator->set_size_compare_ratio(0.f);
ret = d->upload_model();
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:134 */
int NetPrivate::upload_model() {
ncnn::VkTransfer cmd(vkdev);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:2985 */
VkTransfer::VkTransfer(const VulkanDevice* _vkdev): vkdev(_vkdev), d(new VkTransferPrivate(_vkdev))
weight_vkallocator = new VkWeightAllocator(vkdev);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:1144 */
VkWeightAllocator::VkWeightAllocator(const VulkanDevice* _vkdev, size_t preferred_block_size): VkAllocator(_vkdev), d(new VkWeightAllocatorPrivate)
weight_staging_vkallocator = new VkWeightStagingAllocator(vkdev);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:1802 */
VkWeightStagingAllocator::VkWeightStagingAllocator(const VulkanDevice* _vkdev): VkAllocator(_vkdev), d(new VkWeightStagingAllocatorPrivate)
layers[i]->upload_model(cmd, get_masked_option(opt_upload, layers[i]->featmask))
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:113 */
static Option get_masked_option(const Option& opt, int featmask)
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/vulkan/memorydata_vulkan.cpp:65 */
int MemoryData_vulkan::upload_model(VkTransfer& cmd, const Option& opt) {
const Mat& shape = data.shape();
convert_packing(data, data_packed, elempack, opt);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:1560 */
void convert_packing(const Mat& src, Mat& dst, int _elempack, const Option& opt) {
Layer* packing = create_layer(LayerType::Packing); {
Layer* layer = layer_creator(0);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/build/src/layer_declaration.h:1252 */
DEFINE_LAYER_CREATOR(Packing_final)
}
packing->load_param(pd);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/packing.cpp:27 */
int Packing::load_param(const ParamDict& pd)
packing->create_pipeline(opt);
packing->forward(src, dst, opt);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/arm/packing_arm.cpp:37 */
int Packing_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const {
!use_padding
top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
pack1to4
// why GDB cannot step here: the compiler outlines the OpenMP parallel region below into a separate function
#pragma omp parallel for num_threads(opt.num_threads)
}
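What pack1to4 does, in scalar form: interleave groups of four single-channel planes into one elempack=4 plane. A hypothetical simplified version (ncnn's real loop is NEON-vectorized, which is also why GDB stepping is awkward there):

#include <cstddef>

void pack1to4_scalar(const float* src, float* dst, size_t area, int outc)
{
    #pragma omp parallel for
    for (int q = 0; q < outc; q++) // one output pack-of-4 channel per task
    {
        const float* r = src + (size_t)q * 4 * area;
        float* o = dst + (size_t)q * 4 * area;
        for (size_t i = 0; i < area; i++)
        {
            o[i * 4 + 0] = r[0 * area + i];
            o[i * 4 + 1] = r[1 * area + i];
            o[i * 4 + 2] = r[2 * area + i];
            o[i * 4 + 3] = r[3 * area + i];
        }
    }
}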
packing->destroy_pipeline(opt);
}
cmd.record_upload(data_packed, data_gpu, opt, /*bool flatten*/ false);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:2995 */
void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten) {
src.elembits() == 32
(opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0))
cast_float32_to_float16(src, src_fp16, opt);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:1595 */
void cast_float32_to_float16(const Mat& src, Mat& dst, const Option& opt) {
Layer* cast = create_layer(LayerType::Cast); {
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/build/src/layer_declaration.h:1295 */
DEFINE_LAYER_CREATOR(Cast_final)
class Cast_final : virtual public Cast, virtual public Cast_vulkan, virtual public Cast_arm
}
cast->load_param(pd);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/cast.cpp:28 */
int Cast::load_param(const ParamDict& pd)
cast->create_pipeline(opt);
cast->forward(src, dst, opt);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/arm/cast_arm.cpp:40 */
int Cast_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const {
top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
(type_from == 1 && type_to == 2) -->
cast_fp32_to_fp16_neon(bottom_blob, top_blob, opt);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/arm/cast_fp16.h:21 */
static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt) {
#pragma omp parallel for num_threads(opt.num_threads)
}
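cast_fp32_to_fp16_neon converts four floats per iteration with vcvt_f16_f32; a minimal AArch64 sketch (assumes a toolchain with __fp16; not ncnn's exact loop, which also splits work across OpenMP threads):

#include <arm_neon.h>

void cast_fp32_to_fp16_sketch(const float* src, __fp16* dst, int n)
{
    int i = 0;
    for (; i + 4 <= n; i += 4)
        vst1_f16(dst + i, vcvt_f16_f32(vld1q_f32(src + i))); // 4 lanes at a time
    for (; i < n; i++)
        dst[i] = (__fp16)src[i]; // scalar tail
}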
cast->destroy_pipeline(opt);
}
}
record_upload(src_fp16, dst, opt, flatten);
void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten) {
Mat src_flattened = flatten ? src.reshape(src.w * src.h * src.c) : src;
dst.create_like(src_flattened, opt.blob_vkallocator);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:816 */
void VkMat::create_like(const Mat& m, VkAllocator* _allocator) {
data = allocator->fastMalloc(totalsize);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:1234 */
VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size) {
VkBufferMemory* block = new VkBufferMemory;
block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
vkdev->vkGetBufferMemoryRequirements2KHR(vkdev->vkdevice(), &bufferMemoryRequirementsInfo2, &memoryRequirements2);
}
}
// memcpy src_flattened to device
dst.allocator->mappable --> memcpy(dst.mapped_ptr(), src_flattened.data, src_flattened.total() * src_flattened.elemsize);
dst.allocator->flush(dst.data);
// barrier device host-write @ null to shader-read @ compute
vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0);
}
}
}
return cmd.submit_and_wait();
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:3372 */
int VkTransfer::submit_and_wait() {
!vkdev->info.unified_compute_transfer_queue()
VkQueue compute_queue = vkdev->acquire_queue(vkdev->info.compute_queue_family_index());
VkQueue transfer_queue = vkdev->acquire_queue(vkdev->info.transfer_queue_family_index());
VkResult ret = vkQueueSubmit(transfer_queue, 1, &submitInfo, d->upload_command_fence);
VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, d->compute_command_fence);
vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index(), transfer_queue);
vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
VkFence fences[2] = {d->upload_command_fence, d->compute_command_fence};
VkResult ret = vkWaitForFences(vkdev->vkdevice(), 2, fences, VK_TRUE, (uint64_t)-1);
}
}
}
}
}
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:325 */
ret = do_forward_layer(layer, blob_mats_gpu, cmd, opt);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:880 */
int NetPrivate::do_forward_layer(const Layer* layer, std::vector<VkMat>& blob_mats_gpu, VkCompute& cmd, const Option& opt) const {
opt.lightmode
!layer->support_inplace
int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/vulkan/memorydata_vulkan.cpp:90 */
int MemoryData_vulkan::forward(const std::vector<VkMat>& /*bottom_blobs*/, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const {
cmd.record_clone(data_gpu, top_blob, opt);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:979 */
void VkCompute::record_clone(const VkMat& src, VkMat& dst, const Option& opt) {
dst.create_like(src, opt.blob_vkallocator);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:829 */
void VkMat::create_like(const VkMat& m, VkAllocator* _allocator) {
create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator);
}
(src.data->access_flags & VK_ACCESS_TRANSFER_WRITE_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT) -->
vkdev->info.support_VK_KHR_push_descriptor() -->
vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
VkBufferCopy* regions = new VkBufferCopy[1];
// record device to staging
vkdev->info.support_VK_KHR_push_descriptor() -->
vkCmdCopyBuffer(d->compute_command_buffer, src.buffer(), dst.buffer(), 1, regions);
}
}
}
ret == 0 && d->blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0 -->
cmd.record_download(feat_gpu, d->blob_mats[blob_index], d->opt);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:494 */
void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt) {
vkdev->convert_packing(src, dst_staging, dst_elempack, *this, opt_staging);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:3065 */
void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& _opt) const {
const ncnn::Packing_vulkan* uop = d->get_utility_operator(0, 0, cast_type_from_index, cast_type_to_index, packing_type_to_index);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1899 */
const ncnn::Packing_vulkan* VulkanDevicePrivate::get_utility_operator(int storage_type_from, int storage_type_to, int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const {
const ncnn::Packing_vulkan* cached_uop = uop_packing[storage_type_from][storage_type_to][cast_type_from_index][cast_type_to_index][packing_type_to_index];
ncnn::Packing_vulkan* uop = new ncnn::Packing_vulkan;
uop->load_param(pd);
uop->create_pipeline(opt);
uop_packing[storage_type_from][storage_type_to][cast_type_from_index][cast_type_to_index][packing_type_to_index] = uop;
}
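get_utility_operator memoizes one Packing_vulkan per (storage_from, storage_to, cast_from, cast_to, packing) combination in a fixed multi-dimensional array, built lazily on first use. The pattern, with hypothetical types and illustrative dimensions:

struct Uop { /* stands in for ncnn::Packing_vulkan */ };
static Uop* uop_cache[2][2][3][3][3] = {}; // dimensions are illustrative only

Uop* get_utility_operator_sketch(int sf, int st, int cf, int ct, int pt)
{
    Uop*& slot = uop_cache[sf][st][cf][ct][pt];
    if (!slot)
        slot = new Uop; // real code: load_param(pd) + create_pipeline(opt) here
    return slot;
}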
uop->forward(src, dst, cmd, opt);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/vulkan/packing_vulkan.cpp:257 */
int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
}
}