HPU && GPU

HPU Profiler

export HABANA_PROFILE=1
export HABANA_PROFILE_WRITE_HLTV=1
export GRAPH_VISUALIZATION=1
hl-prof-config --use-template profile_api_with_nics --fuser on --trace-analyzer on --trace-analyzer-xlsx on
hl-prof-config --gaudi2
hl-prof-config -o ./new_trace_results_decode_32_640_t

HPU Profiling demo

import os

# Set lazy mode before importing habana_frameworks so the PT bridge picks it up.
os.environ["PT_HPU_LAZY_MODE"] = "1"

import torch
import habana_frameworks.torch.core as htcore
from habana_frameworks import torch as ht

activities = [torch.profiler.ProfilerActivity.CPU]

# CUDA:
# device = torch.device('cuda:0')
# activities.append(torch.profiler.ProfilerActivity.CUDA)

# HPU:
device = torch.device("hpu")
activities.append(torch.profiler.ProfilerActivity.HPU)

# Note: this schedule is defined but not passed to torch.profiler.profile() below,
# so profiling starts immediately instead of waiting/warming up.
schedule = torch.profiler.schedule(wait=0, warmup=20, active=5, repeat=1)

def bf16_gemm(a, b):
    return torch.matmul(a, b)

with torch.profiler.profile(
    activities=activities, on_trace_ready=torch.profiler.tensorboard_trace_handler("./profile_bf16", use_gzip=True)
) as profiler:
    # for i in range(100):
    #     input = torch.tensor([[i]*10]*10, dtype=torch.float32, device=device)
    #     result = torch.matmul(input, input)
    #     result.to('cpu')
    #     htcore.mark_step()
    #     profiler.step()
    with torch.no_grad():
        device = "hpu"
        # accepted = torch.tensor([[False, False], [False, False]], device=device)
        # limits = (accepted == 0).amax(1)
        # print(accepted)
        # print(f"limits: {limits}")
        a = torch.randn(1024, 1024, device=device, dtype=torch.bfloat16)
        b = torch.randn(1024, 1024, device=device, dtype=torch.bfloat16)
        out = bf16_gemm(a, b)
        print(out.max())
        
        htcore.mark_step()
        ht.hpu.synchronize()
        profiler.step()
        # profiler.stop()

"""
PT_HPU_PLACE_ON_CPU=none python test_check_acc.py
(profiler setup: see the HABANA_PROFILE / hl-prof-config commands under "HPU Profiler" above)
"""

# <ptdev> xichen@habana-server-12:~/workspace$ PT_HPU_PLACE_ON_CPU=none python max.py
# libibverbs: Warning: couldn't open config directory '/tmp/tmp.xMaZmNx553/build/etc/libibverbs.d'.
# ============================= HPU PT BRIDGE CONFIGURATION ON RANK = 0 =============
#  PT_HPU_LAZY_MODE = 1
#  PT_HPU_RECIPE_CACHE_CONFIG = ,false,1024
#  PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
#  PT_HPU_LAZY_ACC_PAR_MODE = 1
#  PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
#  PT_HPU_EAGER_PIPELINE_ENABLE = 1
#  PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE = 1
#  PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
# ---------------------------: System Configuration :---------------------------
# Num CPU Cores : 160
# CPU RAM       : 1007 GB
# ------------------------------------------------------------------------------
# Traceback (most recent call last):
#   File "/home/xichen/workspace/max.py", line 9, in <module>
#     limits = (accepted == 0).max(1).indices
# RuntimeError: aten::max.dim is not yet supported on HPU.
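
# Possible workaround for the aten::max.dim gap above (a sketch; HPU op coverage depends
# on the installed SynapseAI release, so verify that argmax has an HPU kernel there):
# compute the row indices with argmax on an integer cast instead of .max(dim).indices,
# or move just that reduction to the CPU. With ties, the selected index can differ
# between backends.
accepted = torch.tensor([[False, False], [False, False]], device="hpu")
mask = (accepted == 0)
limits = mask.int().argmax(dim=1)                        # per-row index without aten::max.dim
# limits = mask.int().cpu().max(dim=1).indices.to("hpu") # CPU fallback for this single op
print(limits)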

How to Accurately Time CUDA Kernels in Pytorch
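
A minimal sketch of the standard recipe (not the article's exact code): warm up first so lazy init and caching effects stay out of the measurement, record with torch.cuda.Event pairs so the timestamps are taken on the device, and synchronize before reading elapsed_time. Sizes and iteration counts below are arbitrary.

import torch

def time_cuda_op(fn, *args, warmup=10, iters=100):
    """Mean milliseconds per call, measured with CUDA events."""
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    for _ in range(warmup):        # warmup iterations are not counted in the timing
        fn(*args)
    torch.cuda.synchronize()       # make sure warmup work has finished before recording
    start.record()
    for _ in range(iters):
        fn(*args)
    end.record()
    torch.cuda.synchronize()       # elapsed_time is only valid after both events complete
    return start.elapsed_time(end) / iters

a = torch.randn(1024, 1024, device="cuda", dtype=torch.bfloat16)
b = torch.randn(1024, 1024, device="cuda", dtype=torch.bfloat16)
print(f"matmul: {time_cuda_op(torch.matmul, a, b):.3f} ms/iter")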

# Custom function to search files by partial name and content (case-insensitive)
find_files() {
    if [ $# -ne 2 ]; then
        echo "Usage: find_files <partial-filename> <content-pattern>"
        return 1
    fi
    find . -type f -iname "*$1*" -exec grep -i -l "$2" {} +
}
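# Example (hypothetical arguments): find_files prof "hl-prof-config"
# -> lists files with "prof" in the name whose content matches "hl-prof-config"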
ENABLE_EXPERIMENTAL_FLAGS=1 PRINT_FILE_AND_LINE=1 LOG_LEVEL_PASS_MANAGER=1 LOG_LEVEL_DATA_TYPES=0 \
  HABANA_LOGS=.habana_logs408-dtype GRAPH_VISUALIZATION=1 ENABLE_GVD=1 \
  python test_remove_dq_q.py