Code snippet - yiliu30/yi GitHub Wiki

Torch Env Var

Inductor

TORCHINDUCTOR_UNIQUE_KERNEL_NAMES=1 TORCHINDUCTOR_BENCHMARK_KERNEL=1  TORCHINDUCTOR_FORCE_DISABLE_CACHES=1  TORCH_COMPILE_DEBUG=1 
# For vis
INDUCTOR_ORIG_FX_SVG=1 INDUCTOR_POST_FUSION_SVG=1

CUDA_VISIBLE_DEVICES=1 
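To exercise these flags, a minimal compile target is enough (a sketch; assumes a CUDA device, and with `TORCH_COMPILE_DEBUG=1` the traces land under `./torch_compile_debug/`):

import torch

@torch.compile
def f(x):
    return torch.nn.functional.gelu(x) * x

f(torch.randn(1024, device="cuda"))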

TORCH_LOGS

TORCH_LOGS_OUT=.torch_logs TORCH_LOGS="all" 
TORCH_LOGS_OUT=.torch_logs_recompile TORCH_LOGS="recompiles,recompiles_verbose" 
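A tiny script to give the `recompiles` log something to report (a sketch; the rank change between the two calls is what should trigger the recompile):

import torch

@torch.compile
def f(x):
    return x * 2

f(torch.randn(4))     # first compile
f(torch.randn(4, 4))  # rank change -> recompile, reported under "recompiles"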

HPU

HABANA_LOGS=.habana_logs LOG_LEVEL_ALL_PT=1 TORCH_COMPILE_DEBUG=1
HABANA_LOGS=.habana_logs LOG_LEVEL_ALL_PT=1 TORCH_COMPILE_DEBUG=1 PT_ENABLE_INT64_SUPPORT=1 PT_HPU_LAZY_MODE=0 

Torch Compile Q&A

  • Missing-header build error during torch.compile (`fatal error: crypt.h: No such file or directory`, the compiler pointing at `44 | #include <crypt.h>`); fix inside a conda env:
conda install --channel=conda-forge libxcrypt
export CPATH=/opt/conda/include/

Torch Recipe

Skip weight initialization

from contextlib import contextmanager

import torch
from torch import nn

_init_weights = True

TORCH_INIT_FUNCTIONS = {
    "uniform_": nn.init.uniform_,
    "normal_": nn.init.normal_,
    "trunc_normal_": nn.init.trunc_normal_,
    "constant_": nn.init.constant_,
    "xavier_uniform_": nn.init.xavier_uniform_,
    "xavier_normal_": nn.init.xavier_normal_,
    "kaiming_uniform_": nn.init.kaiming_uniform_,
    "kaiming_normal_": nn.init.kaiming_normal_,
    "uniform": nn.init.uniform,
    "normal": nn.init.normal,
    "xavier_uniform": nn.init.xavier_uniform,
    "xavier_normal": nn.init.xavier_normal,
    "kaiming_uniform": nn.init.kaiming_uniform,
    "kaiming_normal": nn.init.kaiming_normal,
}


@contextmanager
def no_init_weights(_enable=True):
    """
    Context manager to globally disable weight initialization to speed up loading large models.

    TODO(Patrick): Delete safety argument `_enable=True` at next major version.
    """
    global _init_weights
    old_init_weights = _init_weights

    if _enable:
        _init_weights = False

        def _skip_init(*args, **kwargs):
            pass

        # Replace the init functions with a no-op (the originals are kept in TORCH_INIT_FUNCTIONS)
        for name in TORCH_INIT_FUNCTIONS:
            setattr(torch.nn.init, name, _skip_init)
    try:
        yield
    finally:
        _init_weights = old_init_weights
        if _enable:
            # Restore the original initialization functions
            for name, init_func in TORCH_INIT_FUNCTIONS.items():
                setattr(torch.nn.init, name, init_func)
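Hypothetical usage (`MyModel`, `config`, and `state_dict` are placeholders): skip the random init that a subsequent `load_state_dict` would overwrite anyway.

with no_init_weights():
    model = MyModel(config)
model.load_state_dict(state_dict)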

Two steps to remove the graph break caused by custom ops

from typing import Sequence

import torch
from torchvision.transforms.functional import pil_to_tensor, to_pil_image

# Use torch.library.custom_op to define a new custom operator.
# If your operator mutates any input Tensors, their names must be specified
# in the ``mutates_args`` argument.
@torch.library.custom_op("mylib::crop", mutates_args=())
def crop(pic: torch.Tensor, box: Sequence[int]) -> torch.Tensor:
    img = to_pil_image(pic.cpu())
    cropped_img = img.crop(box)
    return (pil_to_tensor(cropped_img) / 255.0).to(pic.device, pic.dtype)

# Use register_fake to add a ``FakeTensor`` kernel for the operator
@crop.register_fake
def _(pic, box):
    channels = pic.shape[0]
    x0, y0, x1, y1 = box
    return pic.new_empty(channels, y1 - y0, x1 - x0)
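With the fake kernel registered, the op no longer causes a graph break; a quick check (a sketch; `fullgraph=True` raises if any break remains):

@torch.compile(fullgraph=True)
def f(img):
    return crop(img, (10, 10, 50, 50))

f(torch.rand(3, 64, 64))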

Git

# pattern: git config --global alias.<shortcmd> '<full command>'
git config --global alias.dn 'diff --name-only'
# usage: git dn HEAD~1

Gerrit

git push origin HEAD:refs/heads/dev/y/q3
Enumerating objects: 36, done.
Counting objects: 100% (36/36), done.
Delta compression using up to 12 threads
Compressing objects: 100% (17/17), done.
Writing objects: 100% (21/21), 6.04 KiB | 6.04 MiB/s, done.
Total 21 (delta 13), reused 5 (delta 4), pack-reused 0
remote: Resolving deltas: 100% (13/13)
remote: Processing changes: refs: 1, done    
remote: warning: xxx: subject >50 characters; use shorter first paragraph
To ssh://xxx/proj-name
 * [new branch]          HEAD -> dev/yi/qdq3
  • push and create a PR if needed
git push origin HEAD:refs/for/master_next

Github Markdown

> [!NOTE]
> Useful information that users should know, even when skimming content.

> [!TIP]
> Helpful advice for doing things better or more easily.

> [!IMPORTANT]
> Key information users need to know to achieve their goal.

> [!WARNING]
> Urgent info that needs immediate user attention to avoid problems.

> [!CAUTION]
> Advises about risks or negative outcomes of certain actions.


:bowtie: 😄 :smile: 😆 :laughing:
😊 :blush: 😃 :smiley: ☺️ :relaxed:
😏 :smirk: 😍 :heart_eyes: 😘 :kissing_heart:
😚 :kissing_closed_eyes: 😳 :flushed: 😌 :relieved:
😆 :satisfied: 😁 :grin: 😉 :wink:
😜 :stuck_out_tongue_winking_eye: 😝 :stuck_out_tongue_closed_eyes: 😀 :grinning:
😗 :kissing: 😙 :kissing_smiling_eyes: 😛 :stuck_out_tongue:
😴 :sleeping: 😟 :worried: 😦 :frowning:
😧 :anguished: 😮 :open_mouth: 😬 :grimacing:
😕 :confused: 😯 :hushed: 😑 :expressionless:
😒 :unamused: 😅 :sweat_smile: 😓 :sweat:
😥 :disappointed_relieved: 😩 :weary: 😔 :pensive:
😞 :disappointed: 😖 :confounded: 😨 :fearful:
😰 :cold_sweat: 😣 :persevere: 😢 :cry:
😭 :sob: 😂 :joy: 😲 :astonished:
😱 :scream: :neckbeard: 😫 :tired_face:
😠 :angry: 😡 :rage: 😤 :triumph:
😪 :sleepy: 😋 :yum: 😷 :mask:
😎 :sunglasses: 😵 :dizzy_face: 👿 :imp:
😈 :smiling_imp: 😐 :neutral_face: 😶 :no_mouth:
😇 :innocent: 👽 :alien: 💛 :yellow_heart:
💙 :blue_heart: 💜 :purple_heart: ❤️ :heart:
💚 :green_heart: 💔 :broken_heart: 💓 :heartbeat:
💗 :heartpulse: 💕 :two_hearts: 💞 :revolving_hearts:
💘 :cupid: 💖 :sparkling_heart: :sparkles:
:star: 🌟 :star2: 💫 :dizzy:
💥 :boom: 💥 :collision: 💢 :anger:
:exclamation: :question: :grey_exclamation:
:grey_question: 💤 :zzz: 💨 :dash:
💦 :sweat_drops: 🎶 :notes: 🎵 :musical_note:
🔥 :fire: 💩 :hankey: 💩 :poop:
💩 :shit: 👍 :+1: 👍 :thumbsup:
👎 :-1: 👎 :thumbsdown: 👌 :ok_hand:
👊 :punch: 👊 :facepunch: :fist:
✌️ :v: 👋 :wave: :hand:
:raised_hand: 👐 :open_hands: ☝️ :point_up:
👇 :point_down: 👈 :point_left: 👉 :point_right:
🙌 :raised_hands: 🙏 :pray: 👆 :point_up_2:
👏 :clap: 💪 :muscle: 🤘 :metal:
🖕 :fu: 🚶 :walking: 🏃 :runner:
🏃 :running: 👫 :couple: 👪 :family:
👬 :two_men_holding_hands: 👭 :two_women_holding_hands: 💃 :dancer:
👯 :dancers: 🙆‍♀️ :ok_woman: 🙅 :no_good:
💁 :information_desk_person: 🙋 :raising_hand: 👰‍♀️ :bride_with_veil:
:person_with_pouting_face: :person_frowning: 🙇 :bow:
💏 :couplekiss: 💑 :couple_with_heart: 💆 :massage:
💇 :haircut: 💅 :nail_care: 👦 :boy:
👧 :girl: 👩 :woman: 👨 :man:
👶 :baby: 👵 :older_woman: 👴 :older_man:
:person_with_blond_hair: 👲 :man_with_gua_pi_mao: 👳‍♂️ :man_with_turban:
👷 :construction_worker: 👮 :cop: 👼 :angel:
👸 :princess: 😺 :smiley_cat: 😸 :smile_cat:
😻 :heart_eyes_cat: 😽 :kissing_cat: 😼 :smirk_cat:
🙀 :scream_cat: 😿 :crying_cat_face: 😹 :joy_cat:
😾 :pouting_cat: 👹 :japanese_ogre: 👺 :japanese_goblin:
🙈 :see_no_evil: 🙉 :hear_no_evil: 🙊 :speak_no_evil:
💂‍♂️ :guardsman: 💀 :skull: 🐾 :feet:
👄 :lips: 💋 :kiss: 💧 :droplet:
👂 :ear: 👀 :eyes: 👃 :nose:
👅 :tongue: 💌 :love_letter: 👤 :bust_in_silhouette:
👥 :busts_in_silhouette: 💬 :speech_balloon: 💭 :thought_balloon:
:feelsgood: :finnadie: :goberserk:
:godmode: :hurtrealbad: :rage1:
:rage2: :rage3: :rage4:
:suspect: :trollface:

HPU

run hpu test

python3 -m pytest  --mode lazy  -sv ./any_mode/test_hpu_fp8_ops_any_mode.py -k test_fp8_gemm_v2_quick
For more details, see `env_var_in_scope`.
  • For example, `mark_step`, which triggers execution of the accumulated graphs in Lazy mode.

Python

Tensor mem size

# tensor memory footprint in MiB
mem = lambda a: a.element_size() * a.nelement() / (1.0 * 1024**2)
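Sanity check: a 1024x1024 fp32 tensor is exactly 4 MiB.

import torch

t = torch.randn(1024, 1024)  # fp32, 4 bytes per element
print(f"{mem(t):.1f} MiB")   # 4.0 MiB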

breakpoint

# main.py
a = 1
breakpoint() # built-in function, no import needed
b = a + 1
# debug it
# python main.py
# ignore breakpoint
# PYTHONBREAKPOINT=0 python main.py
# https://stackoverflow.com/a/71541003/23445462

Mem

def see_memory_usage(message, force=True):
    # Modified from DeepSpeed
    import gc
    import logging

    import torch
    import torch.distributed as dist

    if not force:
        return
    if dist.is_initialized() and not dist.get_rank() == 0:
        return

    # python doesn't do real-time garbage collection so do it explicitly to get the correct RAM reports
    gc.collect()

    # Print message except when distributed but not rank 0
    logging.info(message)
    logging.info(
        f"AllocatedMem {round(torch.cuda.memory_allocated() / (1024 * 1024 * 1024),2 )} GB \
        MaxAllocatedMem {round(torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024),2)} GB \
        ReservedMem {round(torch.cuda.memory_reserved() / (1024 * 1024 * 1024),2)} GB \
        MaxReservedMem {round(torch.cuda.max_memory_reserved() / (1024 * 1024 * 1024))} GB "
    )

    # get the peak memory to report correct data, so reset the counter for the next call
    torch.cuda.reset_peak_memory_stats()
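Typical call sites bracket the step whose memory you want to attribute (`model`/`batch` are placeholders):

see_memory_usage("before forward")
out = model(batch)
see_memory_usage("after forward")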

LLM

@torch.no_grad()
def batch_gen_text(model, tokenizer, msg="", prompt=("What's AI?",), max_tokens=50, device="cpu"):
    model = model.to(device)
    # batch_encode_plus expects a list of strings when batching
    inputs = tokenizer.batch_encode_plus(list(prompt), return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    new_tokens = model.generate(**inputs, max_length=max_tokens)
    text = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
    for t in text:
        print(f"Generated text ({msg}): {t}")

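Hypothetical usage with a small Hugging Face checkpoint (the model name is only an example; the pad token must be set before batched padding):

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
batch_gen_text(model, tokenizer, msg="fp32", prompt=["What's AI?", "Hi there"])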

def get_example_inputs(tokenizer):
    iters = 4
    prompt = "What are we having for dinner?"
    example_inputs = tokenizer(prompt, return_tensors="pt")
    for i in range(iters):
        yield example_inputs


def check_package(package_name: str):
    try:
        __import__(package_name)
        return True
    except ImportError:
        print(f"Package {package_name} not found.")
        return False

WA for logger issue

import logging
logging.basicConfig(level=logging.INFO)
### Your code
...

https://stackoverflow.com/a/57234760/23445462

Great register

https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/registry/registry.py

class RegistryMixin:
    """
    Universal registry to support registration and loading of child classes and plugins
    of neuralmagic utilities.

    Classes that require a registry or plugins may add the `RegistryMixin` and use
    `register` and `load` as the main entrypoints for adding new implementations and
    loading requested values from its registry.

    If a class should only have its child classes in its registry, the class should
    set the static attribute `registry_requires_subclass` to True

    example
    ```python
    class Dataset(RegistryMixin):
        pass


    # register with default name
    @Dataset.register()
    class ImageNetDataset(Dataset):
        pass

    # load as "ImageNetDataset"
    imagenet = Dataset.load("ImageNetDataset")

    # register with custom name
    @Dataset.register(name="cifar-dataset")
    class Cifar(Dataset):
        pass

    Note: the name will be standardized for lookup in the registry.
    For example, if a class is registered as "cifar_dataset" or
    "cifar dataset", it will be stored as "cifar-dataset". The user
    will be able to load the class with any of the three name variants.

    # register with multiple aliases
    @Dataset.register(alias=["cifar-10-dataset", "cifar_100_dataset"])
    class Cifar(Dataset):
        pass

    # load as "cifar-dataset"
    cifar = Dataset.load_from_registry("cifar-dataset")

    # load from custom file that implements a dataset
    mnist = Dataset.load_from_registry("/path/to/mnnist_dataset.py:MnistDataset")
    ```
    """

Measure mem

def measure_mem():
    import os
    import psutil
    # Get the process ID of the current process
    pid = os.getpid()

    # Create a Process object for the current process
    process = psutil.Process(pid)

    # Get the memory usage in bytes
    memory_info = process.memory_info()

    print(f"RAM mem: {memory_info.rss / 1024 ** 3} GB")
    print(f"Virtual mem: {memory_info.vms / 1024 ** 3} GB")
    print(f"Shared mem: {memory_info.shared / 1024 ** 3} GB")

Let's time it

import timeit

import numpy as np
import pytest
import torch

import neural_compressor.torch.algorithms.weight_only.modules as inc_modules

fn1 = inc_modules.WeightOnlyLinear.pack_tensor_with_numpy_static
# another candidate to benchmark: pack_tensor_with_numpy_opt_np_numba
fn2 = torch.compile(inc_modules.WeightOnlyLinear.pack_tensor_with_torch_static)


@pytest.mark.parametrize("out_features", [128, 1024, 5120, 13824])
@pytest.mark.parametrize("in_features", [1024, 13824])
def test_pack(in_features, out_features):
    bits = 4

    raw_tensor = torch.randint(0, 15, (out_features, in_features), dtype=torch.int8)
    n_pack = 32 // bits  # number of 4-bit values packed into one int32
    compression_dtype: torch.dtype = torch.int32
    iters = 20
    raw_np = raw_tensor.numpy()
    time_ref = timeit.timeit(lambda: fn1(raw_tensor, n_pack, bits, compression_dtype), number=iters)
    time_res = timeit.timeit(lambda: fn2(raw_tensor, n_pack, bits, compression_dtype), number=iters)

    print(f"ref : {time_ref},  res: {time_res}, speed up: {time_ref / time_res}")

    # print(f"ref_dur:{ref_dur}, res_dur:{res_dur} res_np")

    # assert np.array_equal(ref.numpy(), res), f"ref:{ref}, res:{res}"
    # assert torch.allclose(ref, torch.from_numpy(res)), f"ref:{ref}, res:{res}"

Compare two objs

from typing import Optional, Tuple

import torch
def assert_same(
    a: Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]],
    b: Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]],
):
    assert len(a) == len(b), f"len: {len(a)} != {len(b)}"
    for i in range(len(a)):
        assert type(a[i]) == type(b[i]), f"type: {type(a[i])} != {type(b[i])}"
        if isinstance(a[i], torch.Tensor):
            torch.testing.assert_close(a[i], b[i])
        elif isinstance(a[i], tuple):
            assert_same(a[i], b[i])
        elif isinstance(a[i], dict):
            for k in a[i].keys():
                assert k in b[i], f"key: {k} not in {b[i]}"
                # wrap values in 1-tuples so the sequence-based recursion applies
                assert_same((a[i][k],), (b[i].get(k),))
        elif a[i] is None:
            assert b[i] is None
        else:
            raise ValueError(f"Unsupported type: {type(a[i])}")
    print("Same!")

Manual seed

seed = 0
import random
random.seed(seed)
import torch
from torch.utils.data import DataLoader
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
import numpy as np
np.random.seed(seed)

torch.use_deterministic_algorithms(True)
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(0)

DataLoader(
    train_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    worker_init_fn=seed_worker,
    generator=g,
)

conda

  • Update g++/gcc
# update g++/gcc
conda install -c conda-forge cxx-compiler
  • conda replacement
curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
bash Miniforge3-$(uname)-$(uname -m).sh

Shell

Query NV Capability

nvidia-smi --query-gpu=compute_cap --format=csv
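The same information from inside PyTorch:

import torch

major, minor = torch.cuda.get_device_capability(0)
print(f"compute capability: {major}.{minor}")  # e.g. 8.0 on A100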

Some useful alias

gitrmb() {
  # Loop through all provided branch names
  for branch in "$@"
  do
    # Try to delete the local branch
    if git branch -d "$branch" 2>/dev/null; then
      echo "Successfully deleted local branch: $branch"
    else
      # If the branch was not merged, force delete it
      if git branch -D "$branch" 2>/dev/null; then
        echo "Forcefully deleted local branch: $branch"
      else
        echo "Failed to delete local branch: $branch"
      fi
    fi

    # Try to delete the remote branch
    if git push origin --delete "$branch" 2>/dev/null; then
      echo "Successfully deleted remote branch: $branch"
    else
      echo "Failed to delete remote branch or branch does not exist remotely: $branch"
    fi
  done
}

alias condact='conda activate '
alias p='python'
alias exportpath='export PYTHONPATH=$PYTHONPATH:$PWD'

NUMA For benchmark

OMP_NUM_THREADS=24 numactl -l -C 0-11,24-35  python main.py
# specify the memory nodes from which memory should be allocated.
# OMP_NUM_THREADS=<num_threads> numactl -m <node_index> -C <start_core>-<end_core>  python main.py
OMP_NUM_THREADS=12 numactl -m 0 -C 0-11  python main.py
# allocate memory only on the local (current) node
# OMP_NUM_THREADS=<num_threads> numactl -l -C <start_core>-<end_core>  python main.py
OMP_NUM_THREADS=12 numactl -l -C 0-11  python main.py

taskset

taskset -c 0-11 python xxx
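To confirm the pinning from inside the process (Linux only):

import os

print(sorted(os.sched_getaffinity(0)))  # CPUs this process may run on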

Benchmark LLMs

import time

import torch

DEVICE = "cuda"  # the function below reads this global; set to "cpu" if needed


def generate_torch(model, input_ids, n_generate):
    context_time = 0
    generate_time = []

    with torch.inference_mode():
        for i in range(n_generate):
            if DEVICE != "cpu":
                torch.cuda.synchronize()
            start = time.time()

            if i == 0:
                # prefill context
                inputs = torch.as_tensor(input_ids, device=next(model.parameters()).device)
            else:
                # decode tokens
                inputs = torch.as_tensor(token, device=next(model.parameters()).device)

            out = model(inputs, use_cache=True)

            if DEVICE != "cpu":
                torch.cuda.synchronize()
            token = out[0][:, -1].max(1)[1].unsqueeze(1)

            if i == 0:
                context_time += time.time() - start
            else:
                generate_time.append(time.time() - start)

    return context_time, generate_time
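Turning the raw timings into throughput (a sketch; `model` and `input_ids` are assumed to be prepared elsewhere):

context_time, generate_time = generate_torch(model, input_ids, n_generate=32)
print(f"prefill: {context_time:.3f}s, "
      f"decode: {len(generate_time) / sum(generate_time):.1f} tokens/s")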

Win-related

REM cmd equivalent of: export PYTHONPATH=$PYTHONPATH:$PWD
set PYTHONPATH=%PYTHONPATH%;%CD%