ncnn - YingkunZhou/transfer-learning GitHub Wiki

#if NCNN_BENCHMARK

b /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:238
b /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:784
b /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:1041
b /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:1234
b /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:791
b /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:1048

/media/loongson/phd19/home/zhou/graduate9/work/ncnn/build/src/layer_registry.h

ncnn::Extractor ex = net.create_extractor();

/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2088 */
Extractor Net::create_extractor() const {
    return Extractor(this, d->blobs.size());
    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2273 */
    Extractor::Extractor(const Net* _net, size_t blob_count): d(new ExtractorPrivate(_net)) {
        d->blob_mats.resize(blob_count);
        d->opt = d->net->opt;
        d->local_blob_vkallocator = 0;
        d->local_staging_vkallocator = 0;
        d->blob_mats_gpu.resize(blob_count);
        d->blob_mats_gpu_image.resize(blob_count);
    }
}
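
For reference, the public API sequence this whole trace follows is just a handful of calls; a minimal sketch (file and blob names are placeholders, not taken from this project):

```cpp
#include "net.h"

int main() {
    ncnn::Net net;
    net.opt.use_vulkan_compute = true;   // route extract() into the VkCompute path traced below
    net.load_param("model.param");       // placeholder paths
    net.load_model("model.bin");

    ncnn::Mat in(224, 224, 3);           // w, h, c
    ncnn::Extractor ex = net.create_extractor(); // blob_mats resized to blob_count here
    ex.input("data", in);                // only records the Mat into d->blob_mats
    ncnn::Mat out;
    ex.extract("output", out);           // all computation happens lazily here
    return 0;
}
```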

ex.input(input_names[0], input_tensor);

/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2407 */
int Extractor::input(const char* blob_name, const Mat& in) {
    int blob_index = d->net->find_blob_index_by_name(blob_name);
    return input(blob_index, in);
    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2444 */
    int Extractor::input(int blob_index, const Mat& in) {
        d->blob_mats[blob_index] = in;
    }
}

ex.extract(output_names[0], output_tensor);

/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2423 */
int Extractor::extract(const char* blob_name, Mat& feat, int type) {
    int blob_index = d->net->find_blob_index_by_name(blob_name);
    return extract(blob_index, feat, type);
    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2452 */
    int Extractor::extract(int blob_index, Mat& feat, int type) {
        set_kmp_blocktime(d->opt.openmp_blocktime);
        set_flush_denormals(d->opt.flush_denormals);
        d->blob_mats[blob_index].dims
        int layer_index = d->net->blobs()[blob_index].producer;
        d->opt.blob_allocator = d->net->d->local_blob_allocator;
        d->opt.workspace_allocator = d->net->d->local_workspace_allocator;
        d->opt.use_vulkan_compute
        d->local_blob_vkallocator = d->net->vulkan_device()->acquire_blob_allocator();
        d->opt.blob_vkallocator = d->local_blob_vkallocator;
        d->opt.workspace_vkallocator = d->opt.blob_vkallocator;
        d->local_staging_vkallocator = d->net->vulkan_device()->acquire_staging_allocator();
        d->opt.staging_vkallocator = d->local_staging_vkallocator;
        ncnn::VkCompute cmd(d->net->vulkan_device());
        !d->opt.use_image_storage
        ret = extract(blob_index, feat_gpu, cmd); | VkMat feat_gpu;
        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2713 */
        int Extractor::extract(int blob_index, VkMat& feat, VkCompute& cmd) {
            set_kmp_blocktime(d->opt.openmp_blocktime);
            set_flush_denormals(d->opt.flush_denormals);
            d->blob_mats_gpu[blob_index].dims == 0
            d->blob_mats_gpu_image[blob_index].dims == 0
            d->blob_mats[blob_index].dims == 0
            ret = d->net->d->forward_layer(layer_index, d->blob_mats, d->blob_mats_gpu, cmd, d->opt);
            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:234 */
            int NetPrivate::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector<VkMat>& blob_mats_gpu, VkCompute& cmd, const Option& opt) const {
                const Layer* layer = layers[layer_index];
                for (size_t i = 0; i < layer->bottoms.size(); i++) // load bottom blobs
                int bottom_blob_index = layer->bottoms[i];
                blob_mats_gpu[bottom_blob_index].dims == 0 && blob_mats[bottom_blob_index].dims == 0
                int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, cmd, opt);
                .... recursive call!
                layer->support_vulkan
                blob_mats_gpu[bottom_blob_index].dims == 0
                cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt); // host to buffer
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:358 */
                void VkCompute::record_upload(const Mat& src, VkMat& dst, const Option& opt) {
                    src.elemsize == src.elempack * 4u
                    !(vkdev->info.type() == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0)))
                    dst_staging.create_like(src_fp16, opt.staging_vkallocator); //  VkMat dst_staging;
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:814 */
                    void VkMat::create_like(const Mat& m, VkAllocator* _allocator) {
                        int _dims = m.dims;
                        create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator); // _dims == 3
                        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:748 */
                        void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) {
                            !(dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == _elempack && allocator == _allocator)
                            release();
                            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.h:1543 */
                            NCNN_FORCEINLINE void VkMat::release() {
                                !(refcount && NCNN_XADD(refcount, -1) == 1) // refcount==0
                            }
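                            /* NCNN_XADD is an atomic fetch-and-add that returns the value held
                               before the addition, so the free path runs only in the caller that
                               drops the count from 1 to 0; refcount==0 here just means this VkMat
                               does not own data yet and release() is a no-op. */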
                            cstep = alignSize(w * h * elemsize, 16) / elemsize;
                            total() > 0
                            size_t totalsize = alignSize(total() * elemsize, 4);
                            data = allocator->fastMalloc(totalsize);
                            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:1695 */
                            VkBufferMemory* VkStagingAllocator::fastMalloc(size_t size) {
                                VkBufferMemory* ptr = new VkBufferMemory;
                                ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
                                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:434 */
                                VkBuffer VkAllocator::create_buffer(size_t size, VkBufferUsageFlags usage) {
                                    VkResult ret = <<vkCreateBuffer(vkdev->vkdevice(), &bufferCreateInfo, 0, &buffer);
                                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:2340 */
                                    VkDevice VulkanDevice::vkdevice() const
                                }
                                <<vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements);
                                buffer_memory_type_index == (uint32_t)-1
                                buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
                                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:2690 */
                                uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const {
                                    const VkPhysicalDeviceMemoryProperties& memory_properties = info.physical_device_memory_properties();
                                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:285 */
                                    const VkPhysicalDeviceMemoryProperties& GpuInfo::physical_device_memory_properties() const {
                                        return d->physical_device_memory_properties;
                                    }
                                }
                                ptr->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);
                                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:457 */
                                VkDeviceMemory VkAllocator::allocate_memory(size_t size, uint32_t memory_type_index) {
                                    VkResult ret = <<vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory);
                                }
                                <<vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0);
                                <<vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr);
                            }
                            refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount));
                        }
                    }
                    d->upload_staging_buffers.push_back(dst_staging);
                    // memcpy src to device
                    <<memcpy(dst_staging.mapped_ptr(), src_fp16.data, src_fp16.total() * src_fp16.elemsize);
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.h:1529 */
                    NCNN_FORCEINLINE void* VkMat::mapped_ptr() const
                    dst_staging.allocator->flush(dst_staging.data);
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:390 */
                    int VkAllocator::flush(VkBufferMemory* ptr)
                        coherent // host-coherent staging memory: flush is a no-op
                    vkdev->convert_packing(dst_staging, dst, dst_elempack, *this, opt);
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:3064 */
                    void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& _opt) const {
                        src.elembits() == 32
                        const ncnn::Packing_vulkan* uop = d->get_utility_operator(0, 0, cast_type_from_index, cast_type_to_index, packing_type_to_index);
                        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1899 */
                        const ncnn::Packing_vulkan* VulkanDevicePrivate::get_utility_operator(int storage_type_from, int storage_type_to, int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const {
                            ncnn::Packing_vulkan* uop = new ncnn::Packing_vulkan;
                            uop->load_param(pd); // ncnn::ParamDict pd;
                            uop->create_pipeline(opt); // Option opt;
                        }
                        uop->forward(src, dst, cmd, opt);
                        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/vulkan/packing_vulkan.cpp:257 */
                        int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const {
                            out_elemsize = out_elempack * 2u;
                            top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator);
                            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:748 */
                            void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
                            elempack == 1 && out_elempack == 1
                            cmd.record_pipeline(pipeline_packing, buffer_bindings, image_bindings, constants, top_blob);
                            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:1506 */
                            void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher) {
                                Mat dispatcher_mat(dispatcher.w, dispatcher.h, dispatcher.d, dispatcher.c, (void*)0);
                                record_pipeline(pipeline, buffer_bindings, image_bindings, constants, dispatcher_mat);
                                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:1520 */
                                void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher) {
                                    const ShaderInfo& shader_info = pipeline->shader_info();
                                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/pipeline.cpp:170 */
                                    const ShaderInfo& Pipeline::shader_info() const
                                        return d->shader_info;
                                    binding_type == 1
                                    const VkMat& binding = buffer_bindings[buffer_index].empty() ? vkdev->get_dummy_buffer() : buffer_bindings[buffer_index];
                                    barrier_readwrite(binding);
                                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:2600 */
                                    void VkCompute::barrier_readwrite(const VkMat& binding) {
                                        binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT
                                        VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1];
                                        vkdev->info.support_VK_KHR_push_descriptor()
                                        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:600 */
                                        int GpuInfo::support_VK_KHR_push_descriptor() const
                                            return d->support_VK_KHR_push_descriptor;
                                        <<vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
                                    }
                                    <<vkCmdBindPipeline(d->compute_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline());
                                    binding_count > 0
                                    descriptorInfos.resize(sizeof(VkDescriptorBufferInfo) * buffer_binding_count + sizeof(VkDescriptorImageInfo) * image_binding_count);
                                    memcpy(p_descriptorInfos, &descriptorBufferInfo, sizeof(VkDescriptorBufferInfo)); // VkDescriptorBufferInfo descriptorBufferInfo;
                                    vkdev->vkCmdPushDescriptorSetWithTemplateKHR(d->compute_command_buffer, pipeline->descriptor_update_template(), pipeline->pipeline_layout(), 0, descriptorInfos.data());
                                    // /usr/lib/aarch64-linux-gnu/tegra/libnvidia-eglcore.so.35.2.1
                                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/pipeline.cpp:165 */
                                    VkDescriptorUpdateTemplateKHR Pipeline::descriptor_update_template() const
                                        return d->descriptor_update_template;
                                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/pipeline.cpp:155 */
                                    VkPipelineLayout Pipeline::pipeline_layout() const
                                        return d->pipeline_layout;
                                    <<vkCmdPushConstants(d->compute_command_buffer, pipeline->pipeline_layout(), VK_SHADER_STAGE_COMPUTE_BIT, 0, constant_count * sizeof(vk_constant_type), constants.data());
                                    <<vkCmdDispatch(d->compute_command_buffer, group_count_x, group_count_y, group_count_z);
                                }
                            }
                        }
                    }
                }
                !layer->featmask
                ret = do_forward_layer(layer, blob_mats_gpu, cmd, opt);
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:879 */
                int NetPrivate::do_forward_layer(const Layer* layer, std::vector<VkMat>& blob_mats_gpu, VkCompute& cmd, const Option& opt) const {
                    layer->one_blob_only
                    !(layer->support_inplace && *bottom_blob_ref.refcount != 1)
                    int ret = layer->forward(bottom_blob, top_blob, cmd, opt);
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/vulkan/convolution_vulkan.cpp:1106 */
                    int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const {
                        (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
                        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/vulkan/padding_vulkan.cpp:354 */
                        int Padding_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const {
                            top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
                            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:748 */
                            void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
                            cmd.record_pipeline(pipeline, bindings, constants, top_blob);
                            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:1496 */
                            void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher) {
                                record_pipeline(pipeline, bindings, std::vector<VkImageMat>(), constants, dispatcher);
                                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:1520 */
                                void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher)
                            }
                        }
                        top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
                        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:748 */
                        void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
                        cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher);
                        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:1496 */
                        void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher)
                            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:1520 */
                            void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher)
                    }
                }
            }
        }
    }
}
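
The heart of the trace above is the pull-driven recursion in NetPrivate::forward_layer: extracting one blob forwards its producer layer, which first resolves every bottom blob that is still empty. A simplified CPU-only sketch of the pattern (hypothetical pared-down members; the real code also handles the VkMat upload/convert branches shown above):

```cpp
// Sketch only, not ncnn's real code: resolve dependencies depth-first,
// then run the layer once all of its inputs exist.
int forward_layer(int layer_index, std::vector<Mat>& blob_mats) {
    const Layer* layer = layers[layer_index];
    for (size_t i = 0; i < layer->bottoms.size(); i++) {
        int bottom_blob_index = layer->bottoms[i];
        if (blob_mats[bottom_blob_index].dims == 0) {
            // producer has not run yet -> the recursive call noted above
            int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats);
            if (ret != 0) return ret;
        }
    }
    return do_forward_layer(layer, blob_mats);
}
```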

global allocator

g_blob_pool_allocator.set_size_compare_ratio(0.f);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:272 */
void UnlockedPoolAllocator::set_size_compare_ratio(float scr)
    d->size_compare_ratio = (unsigned int)(scr * 256);
g_workspace_pool_allocator.set_size_compare_ratio(0.f);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:95 */
void PoolAllocator::set_size_compare_ratio(float scr)
    d->size_compare_ratio = (unsigned int)(scr * 256);
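
set_size_compare_ratio(0.f) stores scr * 256 == 0, which disables the "not too much bigger" half of the pool's reuse test: any free block at least as large as the request is handed out. A paraphrase of the comparison inside PoolAllocator::fastMalloc (assuming it matches upstream ncnn):

```cpp
// bs = size of a free block in the pool, size = requested size.
// With size_compare_ratio == 0 the second clause is always true, so the
// first block with bs >= size is reused, trading memory slack for zero
// reallocation during steady-state inference.
bool can_reuse(size_t bs, size_t size, unsigned int size_compare_ratio) {
    return bs >= size && ((bs * size_compare_ratio) >> 8) <= size;
}
```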

g_vkdev = ncnn::get_gpu_device(gpu_device);

/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:3285 */
VulkanDevice* get_gpu_device(int device_index) {
    try_create_gpu_instance();
    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1735 */
    static void try_create_gpu_instance() {
        !is_gpu_instance_ready()
        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1727 */
        static bool is_gpu_instance_ready() {
            MutexLockGuard lock(g_instance_lock);
            return (VkInstance)g_instance != 0;
        }
        create_gpu_instance();
        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:900 */
        int create_gpu_instance() {
            MutexLockGuard lock(g_instance_lock);
            (VkInstance)g_instance == 0
            std::vector<const char*> enabledLayers;
            ret = <<vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, NULL); | uint32_t instanceExtensionPropertyCount
            std::vector<VkExtensionProperties> instanceExtensionProperties(instanceExtensionPropertyCount);
            ret = <<vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, instanceExtensionProperties.data());
            const VkExtensionProperties& exp = instanceExtensionProperties[j];
            !support_VK_EXT_validation_features
            support_VK_KHR_external_memory_capabilities --> enabledExtensions.push_back("VK_KHR_external_memory_capabilities");
            support_VK_KHR_get_physical_device_properties2 --> enabledExtensions.push_back("VK_KHR_get_physical_device_properties2");
            support_VK_KHR_get_surface_capabilities2 --> enabledExtensions.push_back("VK_KHR_get_surface_capabilities2");
            !support_VK_KHR_portability_enumeration
            support_VK_KHR_surface --> enabledExtensions.push_back("VK_KHR_surface");
            typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t * pApiVersion);
            PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion = (PFN_vkEnumerateInstanceVersion)<<vkGetInstanceProcAddr(0, "vkEnumerateInstanceVersion");
            ret = vkEnumerateInstanceVersion(&instance_api_version);
            ret = <<vkCreateInstance(&instanceCreateInfo, 0, &instance);
            init_instance_extension();
            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:674 */
            static int init_instance_extension() {
                // support_VK_KHR_external_memory_capabilities -->
                vkGetPhysicalDeviceExternalBufferPropertiesKHR = (PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceExternalBufferPropertiesKHR");
                // support_VK_KHR_get_physical_device_properties2 -->
                vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFeatures2KHR");
                vkGetPhysicalDeviceProperties2KHR = (PFN_vkGetPhysicalDeviceProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceProperties2KHR");
                vkGetPhysicalDeviceFormatProperties2KHR = (PFN_vkGetPhysicalDeviceFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFormatProperties2KHR");
                vkGetPhysicalDeviceImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceImageFormatProperties2KHR");
                vkGetPhysicalDeviceQueueFamilyProperties2KHR = (PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceQueueFamilyProperties2KHR");
                vkGetPhysicalDeviceMemoryProperties2KHR = (PFN_vkGetPhysicalDeviceMemoryProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceMemoryProperties2KHR");
                vkGetPhysicalDeviceSparseImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSparseImageFormatProperties2KHR");
                // support_VK_KHR_get_surface_capabilities2 -->
                vkGetPhysicalDeviceSurfaceCapabilities2KHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilities2KHR");
                vkGetPhysicalDeviceSurfaceFormats2KHR = (PFN_vkGetPhysicalDeviceSurfaceFormats2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormats2KHR");
                // support_VK_KHR_surface -->
                vkDestroySurfaceKHR = (PFN_vkDestroySurfaceKHR)vkGetInstanceProcAddr(g_instance, "vkDestroySurfaceKHR");
                vkGetPhysicalDeviceSurfaceSupportKHR = (PFN_vkGetPhysicalDeviceSurfaceSupportKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceSupportKHR");
                vkGetPhysicalDeviceSurfaceCapabilitiesKHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilitiesKHR");
                vkGetPhysicalDeviceSurfaceFormatsKHR = (PFN_vkGetPhysicalDeviceSurfaceFormatsKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormatsKHR");
                vkGetPhysicalDeviceSurfacePresentModesKHR = (PFN_vkGetPhysicalDeviceSurfacePresentModesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfacePresentModesKHR");
                // VK_NV_cooperative_matrix
                vkGetPhysicalDeviceCooperativeMatrixPropertiesNV = (PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceCooperativeMatrixPropertiesNV");
            }
            ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, 0);
            std::vector<VkPhysicalDevice> physicalDevices(physicalDeviceCount);
            ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, physicalDevices.data());
            g_gpu_infos[gpu_info_index] = new GpuInfo;
            GpuInfoPrivate& gpu_info = *(g_gpu_infos[gpu_info_index]->d);
            <<vkGetPhysicalDeviceProperties(physicalDevice, &physicalDeviceProperties); | VkPhysicalDeviceProperties physicalDeviceProperties;
            (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) --> gpu_info.type = 1;
            // find compute queue
            vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, 0); | uint32_t queueFamilyPropertiesCount;
            std::vector<VkQueueFamilyProperties> queueFamilyProperties(queueFamilyPropertiesCount);
            <<vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, queueFamilyProperties.data());
            gpu_info.compute_queue_family_index = find_device_compute_queue(queueFamilyProperties);
            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:752 */
            static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
            gpu_info.graphics_queue_family_index = find_device_graphics_queue(queueFamilyProperties);
            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:793 */
            static uint32_t find_device_graphics_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
            gpu_info.transfer_queue_family_index = find_device_transfer_queue(queueFamilyProperties);
            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:834 */
            static uint32_t find_device_transfer_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
            <<vkGetPhysicalDeviceProperties2KHR(physicalDevice, &queryProperties);
            // cache memory properties
            <<vkGetPhysicalDeviceMemoryProperties(physicalDevice, &gpu_info.physical_device_memory_properties);
            // get device extension
            ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, NULL);
            ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, deviceExtensionProperties.data());
            // support_VK_KHR_get_physical_device_properties2
            // query int8 storage
            VkPhysicalDevice8BitStorageFeaturesKHR query8BitStorageFeatures;
            // query fp16/int16 storage
            VkPhysicalDevice16BitStorageFeaturesKHR query16BitStorageFeatures;
            // query fp16/int8 arithmetic
            VkPhysicalDeviceFloat16Int8FeaturesKHR queryFloat16Int8Features;
            // query ycbcr_conversion
            VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR querySamplerYcbcrConversionFeatures;
            // query cooperative_matrix
            VkPhysicalDeviceCooperativeMatrixFeaturesNV queryCooperativeMatrixFeatures;
            <<vkGetPhysicalDeviceFeatures2KHR(physicalDevice, &queryFeatures);
            ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, 0);
            ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, properties.data());
            const VkCooperativeMatrixPropertiesNV& cmp = properties[j]; //j==5
            --> gpu_info.support_cooperative_matrix_16_8_8 = true;
            **NCNN_LOGE
            // the default gpu device
            g_default_gpu_index = find_default_vulkan_device_index();
            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/glslang/glslang/MachineIndependent/ShaderLang.cpp:1740 */
            bool InitializeProcess() {
                return ShInitialize() != 0;
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/glslang/glslang/MachineIndependent/ShaderLang.cpp:1332 */
                // ShInitialize() should be called exactly once per process, not per thread.
                int ShInitialize() {
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/glslang/glslang/OSDependent/Unix/ossource.cpp:139 */
                    void InitGlobalLock()
                }
            }
        }
    }
    g_default_vkdev[device_index] = new VulkanDevice(device_index);
    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:2028 */
    VulkanDevice::VulkanDevice(int device_index)
    : info(get_gpu_info(device_index)), d(new VulkanDevicePrivate(this)) {
        try_create_gpu_instance();
        std::vector<const char*> enabledExtensions;
        info.support_VK_KHR_8bit_storage() --> enabledExtensions.push_back("VK_KHR_8bit_storage");
        info.support_VK_KHR_16bit_storage() --> enabledExtensions.push_back("VK_KHR_16bit_storage");
        info.support_VK_KHR_bind_memory2() --> enabledExtensions.push_back("VK_KHR_bind_memory2");
        info.support_VK_KHR_buffer_device_address() --> enabledExtensions.push_back("VK_KHR_buffer_device_address");
        info.support_VK_KHR_create_renderpass2() --> enabledExtensions.push_back("VK_KHR_create_renderpass2");
        info.support_VK_KHR_dedicated_allocation() --> enabledExtensions.push_back("VK_KHR_dedicated_allocation");
        ...
        >>> print *(enabledExtensions._M_impl._M_start)@enabledExtensions.size()
        $62 = {[0] = 0xfffff7da57e0 "VK_KHR_8bit_storage", [1] = 0xfffff7da57f8 "VK_KHR_16bit_storage", [2] = 0xfffff7da5810 "VK_KHR_bind_memory2", [3] = 0xfffff7da5828 "VK_KHR_buffer_device_address", [4] = 0xfffff7da5848 "VK_KHR_create_renderpass2", [5] = 0xfffff7da5868 "VK_KHR_dedicated_allocation", [6] = 0xfffff7da5888 "VK_KHR_descriptor_update_template", [7] = 0xfffff7da58b0 "VK_KHR_external_memory", [8] = 0xfffff7da58c8 "VK_KHR_get_memory_requirements2", [9] = 0xfffff7da58e8 "VK_KHR_maintenance1", [10] = 0xfffff7da5900 "VK_KHR_maintenance2", [11] = 0xfffff7da5918 "VK_KHR_maintenance3", [12] = 0xfffff7da5930 "VK_KHR_multiview", [13] = 0xfffff7da5968 "VK_KHR_push_descriptor", [14] = 0xfffff7da5980 "VK_KHR_sampler_ycbcr_conversion", [15] = 0xfffff7da59a0 "VK_KHR_shader_float16_int8", [16] = 0xfffff7da59c0 "VK_KHR_shader_float_controls", [17] = 0xfffff7da59e0 "VK_KHR_storage_buffer_storage_class", [18] = 0xfffff7da5a08 "VK_KHR_swapchain", [19] = 0xfffff7da5a40 "VK_EXT_descriptor_indexing", [20] = 0xfffff7da5a60 "VK_EXT_memory_budget", [21] = 0xfffff7da5a90 "VK_EXT_queue_family_foreign", [22] = 0xfffff7da5ad0 "VK_NV_cooperative_matrix"}
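        /* `print *(v._M_impl._M_start)@v.size()` is the libstdc++-specific GDB idiom for dumping a std::vector's elements when pretty printers are unavailable */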
        // enable int8 storage
        VkPhysicalDevice8BitStorageFeaturesKHR enabled8BitStorageFeatures;
        // enable fp16/int16 storage
        VkPhysicalDevice16BitStorageFeaturesKHR enabled16BitStorageFeatures;
        // enable fp16/int8 arithmetic
        VkPhysicalDeviceFloat16Int8FeaturesKHR enabledFloat16Int8Features;
        // enable ycbcr conversion
        VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR querySamplerYcbcrConversionFeatures;
        // enable cooperative matrix
        VkPhysicalDeviceCooperativeMatrixFeaturesNV queryCooperativeMatrixFeatures;
        VkDeviceQueueCreateInfo deviceComputeQueueCreateInfo;
        VkDeviceQueueCreateInfo deviceGraphicsQueueCreateInfo;
        VkDeviceQueueCreateInfo deviceTransferQueueCreateInfo;
        VkResult ret = <<vkCreateDevice(info.physical_device(), &deviceCreateInfo, 0, &d->device);
        init_device_extension();
        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:3198 */
        <<vkGetDeviceQueue(d->device, info.compute_queue_family_index(), i, &d->compute_queues[i]);
        <<vkGetDeviceQueue(d->device, info.graphics_queue_family_index(), i, &d->graphics_queues[i]);
        <<vkGetDeviceQueue(d->device, info.transfer_queue_family_index(), i, &d->transfer_queues[i]);
        // prepare immutable texelfetch sampler
        VkSamplerCreateInfo samplerCreateInfo;
        ret = vkCreateSampler(d->device, &samplerCreateInfo, 0, &d->texelfetch_sampler);
        int cret = d->create_dummy_buffer_image();
        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1858 */
        int VulkanDevicePrivate::create_dummy_buffer_image() {
            dummy_allocator = new VkDummyAllocator(vkdev);
            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1765 */
            class VkDummyAllocator : public VkBlobAllocator {
                VkBlobAllocator(_vkdev, 16 * 1024)
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:594 */
                VkBlobAllocator::VkBlobAllocator(const VulkanDevice* _vkdev, size_t preferred_block_size) {
                    VkAllocator(_vkdev), d(new VkBlobAllocatorPrivate)
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:362 */
                    VkAllocator::VkAllocator(const VulkanDevice* _vkdev)
                }
                dummy_buffer.create(1, 4u, dummy_allocator);
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:552 */
                void VkMat::create(int _w, size_t _elemsize, VkAllocator* _allocator) {
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:675 */
                    VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size)
                        **NCNN_LOGE
                }
                dummy_image.create(1, 4u, dummy_allocator);
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:855 */
                void VkImageMat::create(int _w, size_t _elemsize, VkAllocator* _allocator) {
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:860 */
                    VkImageMemory* VkBlobAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int elempack) {
                        VkImageMemory* ptr = new VkImageMemory;
                        ptr->image = create_image(width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);
                        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:503 */
                        VkImage VkAllocator::create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage) {
                            VkResult ret = <<vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image);
                        }
                        vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements); | VkMemoryRequirements memoryRequirements;
                        ptr->imageview = create_imageview(ptr->image, format)
                        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:535 */
                        VkImageView VkAllocator::create_imageview(VkImage image, VkFormat format)
                            VkResult ret = <<vkCreateImageView(vkdev->vkdevice(), &imageViewCreateInfo, 0, &imageview);
                        **NCNN_LOGE
                    }
                }
                dummy_image_readonly.create(1, 4u, dummy_allocator);
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:855 */
                VkDummyCompute cmd(vkdev);
                class VkDummyCompute : public VkCompute
                cmd.record_dummy(dummy_buffer);
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1780 */
                void record_dummy(const VkMat& buffer) {
                    barrier_readwrite(buffer);
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:2601 */
                    void VkCompute::barrier_readwrite(const VkMat& binding) {
                        (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
                        vkdev->info.support_VK_KHR_push_descriptor() --> vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
                    }
                }
                cmd.record_dummy(dummy_image);
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1785 */
                void record_dummy(const VkImageMat& image) {
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:2643 */
                    void VkCompute::barrier_readwrite(const VkImageMat& binding) {
                        (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->image_layout != VK_IMAGE_LAYOUT_GENERAL || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
                        vkdev->info.support_VK_KHR_push_descriptor() --> vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
                    }
                }
                cmd.record_dummy_readonly(dummy_image_readonly);
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1790 */
                void record_dummy_readonly(const VkImageMat& image) {
                    barrier_readonly(image);
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:2691 */
                    void VkCompute::barrier_readonly(const VkImageMat& binding) {
                        vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
                    }
                }
                return cmd.submit_and_wait();
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:2292 */
                int VkCompute::submit_and_wait() {
                    vkdev->info.support_VK_KHR_push_descriptor()
                    d->end_command_buffer();
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:338 */
                    int VkComputePrivate::end_command_buffer() {
                        VkResult ret = <<vkEndCommandBuffer(compute_command_buffer);
                    }
                    VkQueue compute_queue = vkdev->acquire_queue(vkdev->info.compute_queue_family_index());
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:2774 */
                    VkQueue VulkanDevice::acquire_queue(uint32_t queue_family_index) const
                    VkResult ret = <<vkQueueSubmit(compute_queue, 1, &submitInfo, d->compute_command_fence);
                    vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:2833 */
                    void VulkanDevice::reclaim_queue(uint32_t queue_family_index, VkQueue queue) const
                    VkResult ret = <<vkWaitForFences(vkdev->vkdevice(), 1, &d->compute_command_fence, VK_TRUE, (uint64_t)-1);
                    d->delayed_records.size()
                    d->delayed_records.clear();
                }
            }
        }
        d->pipeline_cache = new PipelineCache(this);
        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/pipelinecache.cpp:172 */
        PipelineCache::PipelineCache(const VulkanDevice* _vkdev): vkdev(_vkdev), d(new PipelineCachePrivate)
        memset(d->uop_packing, 0, sizeof(d->uop_packing));
    }
}
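
Once the device exists, attaching it to a Net is a one-liner through the public API; a minimal sketch (the g_vkdev name mirrors the global above):

```cpp
ncnn::VulkanDevice* g_vkdev = ncnn::get_gpu_device(0); // first enumerated GPU
ncnn::Net net;
net.opt.use_vulkan_compute = true;
net.set_vulkan_device(g_vkdev); // or net.set_vulkan_device(0) by index
```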

net.load_param(param_file);

/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:1859 */
int Net::load_param(const char* protopath) {
    int ret = load_param(fp);
    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:1845 */
    int Net::load_param(FILE* fp) {
        DataReaderFromStdio dr(fp);
        return load_param(dr);
        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:1289 */
        int Net::load_param(const DataReader& dr) {
            SCAN_VALUE("%d", magic)
            SCAN_VALUE("%d", layer_count)
            SCAN_VALUE("%d", blob_count)
            // TODO enable gpu when bf16 conversion implemented
            opt.use_bf16_storage --> opt.use_vulkan_compute = false;
            ... ...
            SCAN_VALUE("%255s", layer_type)
            SCAN_VALUE("%255s", layer_name)
            SCAN_VALUE("%d", bottom_count)
            SCAN_VALUE("%d", top_count)
            Layer* layer = create_overwrite_builtin_layer(layer_type);
            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2204 */
            Layer* Net::create_overwrite_builtin_layer(const char* type) {
                int typeindex = layer_to_index(type);
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer.cpp:208 */
                int layer_to_index(const char* type)
                return create_overwrite_builtin_layer(typeindex);
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:2229 */
                layer = create_layer(layer_type);
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer.cpp:219 */
                Layer* create_layer(const char* type) {
                    int index = layer_to_index(type);
                    return create_layer(index);
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer.cpp:229 */
                    Layer* create_layer(int index) {
                        layer_creator = layer_registry[index].creator;
                        Layer* layer = layer_creator(0);
                        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/build/src/layer_declaration.h:340 */
                        DEFINE_LAYER_CREATOR(Input_final)
                        DEFINE_LAYER_CREATOR(Reshape_final)
                    }
                }
            }
            SCAN_VALUE("%255s", blob_name)
            int pdlr = pd.load_param(dr);
            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/paramdict.cpp:247 */
            int ParamDict::load_param(const DataReader& dr) {
                clear();
                bool is_float = vstr_is_float(vstr);
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/paramdict.cpp:154 */
                static bool vstr_is_float(const char vstr[16])
            }
            // pull out top shape hints
            Mat shape_hints = pd.get(30, Mat());
            shape_hints.empty()
            layer->featmask = pd.get(31, 0);
            int lr = layer->load_param(pd);
            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/input.cpp:31 */
            int Input::load_param(const ParamDict& pd)
        }
    }
}
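
The SCAN_VALUE sequence maps one-to-one onto the text .param layout: the 7767517 magic, a layer_count/blob_count pair, then one line per layer with type, name, bottom/top counts, blob names, and id=value pairs (30 and 31 being the shape-hint and featmask keys read above). A hypothetical two-layer file:

```
7767517
2 2
Input        data  0 1 data  0=224 1=224 2=3
Convolution  conv0 1 1 data conv0 0=16 1=3 6=432
```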

net.load_model(model_file);

/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:1900 */
int Net::load_model(const char* modelpath) {
    int ret = load_model(fp);
    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:1893 */
    int Net::load_model(FILE* fp) {
        return load_model(dr);
        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:1726 */
        int Net::load_model(const DataReader& dr) {
            ModelBinFromDataReader mb(dr);
            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/modelbin.cpp:74 */
            ModelBinFromDataReader::ModelBinFromDataReader(const DataReader& _dr) : ModelBin(), d(new ModelBinFromDataReaderPrivate(_dr))
            int lret = layer->load_model(mb);
            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/memorydata.cpp:36 */
            int MemoryData::load_model(const ModelBin& mb) {
                data = mb.load(w, h, c, 1);
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/modelbin.cpp:46 */
                Mat ModelBin::load(int w, int h, int c, int type) const {
                    Mat m = load(w * h * c, type);
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/modelbin.cpp:94 */
                    Mat ModelBinFromDataReader::load(int w, int type) const
                    return m.reshape(w, h, c);
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:144 */
                    Mat Mat::reshape(int _w, int _h, int _c, Allocator* _allocator) const {
                        m.create(_w, _h, _c, elemsize, elempack, _allocator);
                        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:440 */
                        void Mat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, Allocator* _allocator) {
                            data = fastMalloc(totalsize + (int)sizeof(*refcount));
                            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.h:76 */
                            static NCNN_FORCEINLINE void* fastMalloc(size_t size)
                        }
                    }
                }
            }
            d->pipeline_cache = new PipelineCache(d->vkdev);
            Option opt1 = get_masked_option(opt, layer->featmask);
            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:113 */
            static Option get_masked_option(const Option& opt, int featmask)
            int cret = layer->create_pipeline(opt1);
            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/build/src/layer_declaration.h:332 */
            ncnn::Input_final::create_pipeline(ncnn::Option const&)
            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/build/src/layer_declaration.h:391 */
            ncnn::MemoryData_final::create_pipeline(ncnn::Option const&) {
                int ret = MemoryData_vulkan::create_pipeline(opt)
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/vulkan/memorydata_vulkan.cpp:28 */
                int MemoryData_vulkan::create_pipeline(const Option& opt) {
                    vkdev->shape_support_image_storage(out_shape_packed)
                }
            }
            d->local_blob_allocator = new PoolAllocator;
            d->local_blob_allocator->set_size_compare_ratio(0.f);
            d->local_workspace_allocator = new PoolAllocator;
            d->local_workspace_allocator->set_size_compare_ratio(0.f);
            ret = d->upload_model();
            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:134 */
            int NetPrivate::upload_model() {
                ncnn::VkTransfer cmd(vkdev);
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:2985 */
                VkTransfer::VkTransfer(const VulkanDevice* _vkdev): vkdev(_vkdev), d(new VkTransferPrivate(_vkdev))
                weight_vkallocator = new VkWeightAllocator(vkdev);
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:1144 */
                VkWeightAllocator::VkWeightAllocator(const VulkanDevice* _vkdev, size_t preferred_block_size): VkAllocator(_vkdev), d(new VkWeightAllocatorPrivate)
                weight_staging_vkallocator = new VkWeightStagingAllocator(vkdev);
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:1802 */
                VkWeightStagingAllocator::VkWeightStagingAllocator(const VulkanDevice* _vkdev): VkAllocator(_vkdev), d(new VkWeightStagingAllocatorPrivate)
                layers[i]->upload_model(cmd, get_masked_option(opt_upload, layers[i]->featmask))
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:113 */
                static Option get_masked_option(const Option& opt, int featmask)
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/vulkan/memorydata_vulkan.cpp:65 */
                int MemoryData_vulkan::upload_model(VkTransfer& cmd, const Option& opt) {
                    const Mat& shape = data.shape();
                    convert_packing(data, data_packed, elempack, opt);
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:1560 */
                    void convert_packing(const Mat& src, Mat& dst, int _elempack, const Option& opt) {
                        Layer* packing = create_layer(LayerType::Packing); {
                            Layer* layer = layer_creator(0);
                            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/build/src/layer_declaration.h:1252 */
                            DEFINE_LAYER_CREATOR(Packing_final)
                        }
                        packing->load_param(pd);
                        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/packing.cpp:27 */
                        int Packing::load_param(const ParamDict& pd)
                        packing->create_pipeline(opt);
                        packing->forward(src, dst, opt);
                        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/arm/packing_arm.cpp:37 */
                        int Packing_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const {
                            !use_padding
                            top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
                            pack1to4
                            // note: GDB cannot step into the OpenMP parallel region below
                            #pragma omp parallel for num_threads(opt.num_threads)
                        }
                        packing->destroy_pipeline(opt);
                    }
                    cmd.record_upload(data_packed, data_gpu, opt, /*bool flatten*/ false);
                    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:2995 */
                    void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten) {
                        src.elembits() == 32
                        (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0))
                        cast_float32_to_float16(src, src_fp16, opt);
                        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:1595 */
                        void cast_float32_to_float16(const Mat& src, Mat& dst, const Option& opt) {
                            Layer* cast = create_layer(LayerType::Cast); {
                                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/build/src/layer_declaration.h:1295 */
                                DEFINE_LAYER_CREATOR(Cast_final)
                                class Cast_final : virtual public Cast, virtual public Cast_vulkan, virtual public Cast_arm
                            }
                            cast->load_param(pd);
                            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/cast.cpp:28 */
                            int Cast::load_param(const ParamDict& pd)
                            cast->create_pipeline(opt);
                            cast->forward(src, dst, opt);
                            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/arm/cast_arm.cpp:40 */
                            int Cast_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const {
                                top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
                                (type_from == 1 && type_to == 2) -->
                                cast_fp32_to_fp16_neon(bottom_blob, top_blob, opt);
                                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/arm/cast_fp16.h:21 */
                                static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt) {
                                    #pragma omp parallel for num_threads(opt.num_threads)
                                }
                                cast->destroy_pipeline(opt);
                            }
                        }
                        record_upload(src_fp16, dst, opt, flatten);
                        void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten) {
                            Mat src_flattened = flatten ? src.reshape(src.w * src.h * src.c) : src;
                            dst.create_like(src_flattened, opt.blob_vkallocator);
                            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:816 */
                            void VkMat::create_like(const Mat& m, VkAllocator* _allocator) {
                                data = allocator->fastMalloc(totalsize);
                                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/allocator.cpp:1234 */
                                VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size) {
                                    VkBufferMemory* block = new VkBufferMemory;
                                    block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
                                    vkdev->vkGetBufferMemoryRequirements2KHR(vkdev->vkdevice(), &bufferMemoryRequirementsInfo2, &memoryRequirements2);
                                }

                            }
                            // memcpy src_flattened to device
                            dst.allocator->mappable --> memcpy(dst.mapped_ptr(), src_flattened.data, src_flattened.total() * src_flattened.elemsize);
                            dst.allocator->flush(dst.data);
                            // barrier device host-write @ null to shader-read @ compute
                            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0);
                        }
                    }
                }
                return cmd.submit_and_wait();
                /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:3372 */
                int VkTransfer::submit_and_wait() {
                    !vkdev->info.unified_compute_transfer_queue()
                    VkQueue compute_queue = vkdev->acquire_queue(vkdev->info.compute_queue_family_index());
                    VkQueue transfer_queue = vkdev->acquire_queue(vkdev->info.transfer_queue_family_index());
                    VkResult ret = vkQueueSubmit(transfer_queue, 1, &submitInfo, d->upload_command_fence);
                    VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, d->compute_command_fence);
                    vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index(), transfer_queue);
                    vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
                    VkFence fences[2] = {d->upload_command_fence, d->compute_command_fence};
                    VkResult ret = vkWaitForFences(vkdev->vkdevice(), 2, fences, VK_TRUE, (uint64_t)-1);
                }
            }
        }
    }
}
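
Which branch record_upload takes (the cast_float32_to_float16 path above fired because fp16 storage was enabled) is controlled by Option flags set before load_model; these are real ncnn options:

```cpp
ncnn::Net net;
net.opt.use_vulkan_compute = true;
net.opt.use_fp16_packed = true;      // fp16 when src.elempack % 4 == 0 (the test above)
net.opt.use_fp16_storage = true;     // fp16 storage buffers on capable devices
net.opt.use_fp16_arithmetic = false; // keep shader arithmetic in fp32
```
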
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:325 */
ret = do_forward_layer(layer, blob_mats_gpu, cmd, opt);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/net.cpp:880 */
int NetPrivate::do_forward_layer(const Layer* layer, std::vector<VkMat>& blob_mats_gpu, VkCompute& cmd, const Option& opt) const {
    opt.lightmode
    !layer->support_inplace
    int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt);
    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/vulkan/memorydata_vulkan.cpp:90 */
    int MemoryData_vulkan::forward(const std::vector<VkMat>& /*bottom_blobs*/, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const {
        cmd.record_clone(data_gpu, top_blob, opt);
        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:979 */
        void VkCompute::record_clone(const VkMat& src, VkMat& dst, const Option& opt) {
            dst.create_like(src, opt.blob_vkallocator);
            /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/mat.cpp:829 */
            void VkMat::create_like(const VkMat& m, VkAllocator* _allocator) {
                create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator);
            }
            (src.data->access_flags & VK_ACCESS_TRANSFER_WRITE_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT) -->
                vkdev->info.support_VK_KHR_push_descriptor() -->
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
            VkBufferCopy* regions = new VkBufferCopy[1];
            // record device to staging
            vkdev->info.support_VK_KHR_push_descriptor() -->
            vkCmdCopyBuffer(d->compute_command_buffer, src.buffer(), dst.buffer(), 1, regions);
        }
    }
}
ret == 0 && d->blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0 -->
cmd.record_download(feat_gpu, d->blob_mats[blob_index], d->opt);
/* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/command.cpp:494 */
void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt) {
    vkdev->convert_packing(src, dst_staging, dst_elempack, *this, opt_staging);
    /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:3065 */
    void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& _opt) const {
        const ncnn::Packing_vulkan* uop = d->get_utility_operator(0, 0, cast_type_from_index, cast_type_to_index, packing_type_to_index);
        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/gpu.cpp:1899 */
        const ncnn::Packing_vulkan* VulkanDevicePrivate::get_utility_operator(int storage_type_from, int storage_type_to, int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const {
            const ncnn::Packing_vulkan* cached_uop = uop_packing[storage_type_from][storage_type_to][cast_type_from_index][cast_type_to_index][packing_type_to_index];
            ncnn::Packing_vulkan* uop = new ncnn::Packing_vulkan;
            uop->load_param(pd);
            uop->create_pipeline(opt);
            uop_packing[storage_type_from][storage_type_to][cast_type_from_index][cast_type_to_index][packing_type_to_index] = uop;
        }
        uop->forward(src, dst, cmd, opt);
        /* /media/loongson/phd19/home/zhou/graduate9/work/ncnn/src/layer/vulkan/packing_vulkan.cpp:257 */
        int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
    }
}