export HABANA_PROFILE=1
export HABANA_PROFILE_WRITE_HLTV=1
export GRAPH_VISUALIZATION=1
hl-prof-config --use-template profile_api_with_nics --fuser on --trace-analyzer on --trace-analyzer-xlsx on
# hl-prof-config --gaudi2
hl-prof-config --gaudi2
hl-prof-config -o ./new_trace_results_decode_32_640_t
import os
import torch

# Set lazy mode before importing the Habana bridge so it is picked up at init.
os.environ["PT_HPU_LAZY_MODE"] = "1"

import habana_frameworks.torch.core as htcore
from habana_frameworks import torch as ht
activities = [torch.profiler.ProfilerActivity.CPU]
# CUDA:
# device = torch.device('cuda:0')
# activities.append(torch.profiler.ProfilerActivity.CUDA)
# HPU:
device = torch.device("hpu")
activities.append(torch.profiler.ProfilerActivity.HPU)
# Only takes effect if passed as schedule=schedule to torch.profiler.profile() below.
schedule = torch.profiler.schedule(wait=0, warmup=20, active=5, repeat=1)
def bf16_gemm(a, b):
    return torch.matmul(a, b)
with torch.profiler.profile(
    activities=activities,
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./profile_bf16", use_gzip=True),
) as profiler:
    # for i in range(100):
    #     input = torch.tensor([[i]*10]*10, dtype=torch.float32, device=device)
    #     result = torch.matmul(input, input)
    #     result.to('cpu')
    #     htcore.mark_step()
    #     profiler.step()
    with torch.no_grad():
        # accepted = torch.tensor([[False, False], [False, False]], device=device)
        # limits = (accepted == 0).amax(1)
        # print(accepted)
        # print(f"limits: {limits}")
        a = torch.randn(1024, 1024, device=device, dtype=torch.bfloat16)
        b = torch.randn(1024, 1024, device=device, dtype=torch.bfloat16)
        out = bf16_gemm(a, b)
        print(out.max())
        htcore.mark_step()    # flush the accumulated lazy-mode graph to the device
        ht.hpu.synchronize()  # make sure the GEMM has finished before stepping
        profiler.step()
    # profiler.stop()
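# Optional follow-up (a sketch, not part of the original run): after the
# profile() context exits, the same profiler object can also print an op-level
# summary next to the TensorBoard trace. key_averages()/table() are standard
# torch.profiler APIs; the sort key and row limit below are just assumptions.
print(profiler.key_averages().table(sort_by="cpu_time_total", row_limit=20))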
"""
PT_HPU_PLACE_ON_CPU=none python test_check_acc.py
export HABANA_PROFILE=1
export HABANA_PROFILE_WRITE_HLTV=1
export GRAPH_VISUALIZATION=1
hl-prof-config --use-template profile_api_with_nics --fuser on --trace-analyzer on --trace-analyzer-xlsx on
# hl-prof-config --gaudi2
hl-prof-config --gaudi2
hl-prof-config -o ./new_trace_results_decode_32_640_t
"""
# <ptdev> xichen@habana-server-12:~/workspace$ PT_HPU_PLACE_ON_CPU=none python max.py
# libibverbs: Warning: couldn't open config directory '/tmp/tmp.xMaZmNx553/build/etc/libibverbs.d'.
# ============================= HPU PT BRIDGE CONFIGURATION ON RANK = 0 =============
# PT_HPU_LAZY_MODE = 1
# PT_HPU_RECIPE_CACHE_CONFIG = ,false,1024
# PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
# PT_HPU_LAZY_ACC_PAR_MODE = 1
# PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
# PT_HPU_EAGER_PIPELINE_ENABLE = 1
# PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE = 1
# PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
# ---------------------------: System Configuration :---------------------------
# Num CPU Cores : 160
# CPU RAM : 1007 GB
# ------------------------------------------------------------------------------
# Traceback (most recent call last):
# File "/home/xichen/workspace/max.py", line 9, in <module>
# limits = (accepted == 0).max(1).indices
# RuntimeError: aten::max.dim is not yet supported on HPU.
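# Workaround sketch for the aten::max.dim error above (assumes the torch /
# habana_frameworks imports from the script; whether argmax is lowered on HPU
# is an assumption, so the CPU fallback is shown as the safe path):
accepted = torch.tensor([[False, False], [False, False]], device=device)
mask = (accepted == 0)
limits = mask.int().cpu().max(1).indices      # compute the dim-wise max on CPU
# limits = torch.argmax(mask.int(), dim=1)    # alternative, if argmax is supported on HPU
print(f"limits: {limits}")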
# Reference note: "How to Accurately Time CUDA Kernels in PyTorch"
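# Sketch of the event-based pattern that the note above refers to (the guard,
# tensor shapes, and the HPU analogue suggestion are assumptions, not from the
# original notes):
if torch.cuda.is_available():
    x = torch.randn(1024, 1024, device="cuda")
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    torch.cuda.synchronize()
    start.record()
    y = torch.matmul(x, x)        # kernel(s) under measurement
    end.record()
    torch.cuda.synchronize()      # events must finish before elapsed_time()
    print(f"matmul: {start.elapsed_time(end):.3f} ms")
# On HPU, one option is to wrap the region with ht.hpu.synchronize() and
# time.perf_counter() instead of CUDA events.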
# Custom function to search files by partial name and content (case-insensitive),
# e.g. find_files profile mark_step
find_files() {
    if [ $# -ne 2 ]; then
        echo "Usage: find_files <partial-filename> <content-pattern>"
        return 1
    fi
    find . -type f -iname "*$1*" -exec grep -i -l "$2" {} +
}
ENABLE_EXPERIMENTAL_FLAGS=1 PRINT_FILE_AND_LINE=1 LOG_LEVEL_PASS_MANAGER=1 LOG_LEVEL_DATA_TYPES=0 HABANA_LOGS=.habana_logs408-dtype GRAPH_VISUALIZATION=1 ENABLE_GVD=1 python test_remove_dq_q.py