LLM Recipes

Inspect Llama

Export LLAMA

# Copied from https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ#you-can-then-use-the-following-code

# ==------------------------------------------------------------------------------------------==
# Set the model name or path
# CUDA_VISIBLE_DEVICES=None OMP_NUM_THREADS=56 numactl -l -C 0-55  python  test_load.py
# ==------------------------------------------------------------------------------------------==

model_name_or_path = "meta-llama/Llama-3.2-1B"


from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
with torch.no_grad():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    max_new_tokens = 20 if device == "cpu" else 100
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

    prompt = "Tell me about AI"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    model.eval()
    model = model.to(torch.float16)
    
    from torch.export import export
    exported_mod = export(model, (input_ids,))
    torch.export.save(exported_mod, 'exported_llama32-1b-fp16.pt2')
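
A quick way to check the saved artifact is to load it back and run it once. This is a minimal sketch, assuming the exported_llama32-1b-fp16.pt2 file and the input_ids produced above; the exact output structure depends on how the model's output dataclass round-trips through torch.export.

    import torch
    from torch.export import load

    loaded = load("exported_llama32-1b-fp16.pt2")  # returns an ExportedProgram
    gm = loaded.module()                           # callable module rebuilt from the exported graph
    with torch.no_grad():
        out = gm(input_ids)                        # same example input that was used for export
    print(type(out))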

Visualize the llama

  • mlp (image)

  • post_layer_norm (image)
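
To see where these blocks sit in the module tree without the images, a short sketch that re-uses model_name_or_path from above and prints the first decoder layer plus the mlp / *layernorm submodule names (the traversal filter is an assumption, not from the page).

    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
    # The first decoder layer shows self_attn, mlp, input_layernorm and
    # post_attention_layernorm.
    print(model.model.layers[0])
    # Fully qualified names of the mlp and *layernorm submodules.
    for name, module in model.named_modules():
        leaf = name.split(".")[-1]
        if leaf == "mlp" or leaf.endswith("layernorm"):
            print(name, type(module).__name__)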

Gen test

    import torch

    @dump_elapsed_time()
    def raw_gen_text(model, tokenizer, prompt="What is AI? ", max_length=50, device="hpu", msg="", wrap_to_hpu=False):
        # Manual greedy decoding: append the argmax token `max_length` times.
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        input_ids = inputs["input_ids"]
        if wrap_to_hpu:
            from habana_frameworks.torch.hpu import wrap_in_hpu_graph
            model = wrap_in_hpu_graph(model)
        for i in range(max_length):
            outputs = model(input_ids)
            # argmax over the vocab at the last position -> next token id, shape [1, 1]
            new_tokens = outputs.logits.argmax(dim=-1)[-1][-1:].unsqueeze(dim=0)
            input_ids = torch.cat([input_ids, new_tokens], dim=-1)
            text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
            print(f"====== Generated text ({msg}): {text}")
    # raw_gen_text(model, tokenizer, msg="bf16")
    # raw_gen_text(model_fp8, tokenizer, msg="fp8")

Mem size

  • hidden_states: [batch_size, seq_len, hidden_dim]
    • llama-2-7b: hidden_dim = 4096; with batch_size = 128 and seq_len = 1024, that is 128 * 1024 * 4096 = 2^(7 + 10 + 12) = 2^29 elements, and at 2 bytes per element (fp16) 2^30 bytes = 1 GiB (see the sketch below).
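
The same arithmetic as a tiny helper; the function name and defaults are illustrative, not from the page.

    import torch

    def hidden_states_bytes(batch_size, seq_len, hidden_dim, dtype=torch.float16):
        # number of elements * bytes per element
        return batch_size * seq_len * hidden_dim * torch.finfo(dtype).bits // 8

    # llama-2-7b activation: batch_size=128, seq_len=1024, hidden_dim=4096, fp16
    print(hidden_states_bytes(128, 1024, 4096) / 2**30, "GiB")  # -> 1.0 GiB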

Prompt

Showing the LLM a few few-shot exemplars in which the reasoning process is explained in the exemplars, so the model follows the same reasoning pattern on the new question (sketched below).
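
A minimal sketch of assembling such a prompt; the exemplars and the question are made up for illustration.

    # Two few-shot exemplars whose answers spell out the reasoning, then the new question.
    exemplars = [
        ("Q: Roger has 5 balls and buys 2 cans of 3 balls each. How many balls does he have?",
         "A: He buys 2 * 3 = 6 balls. 5 + 6 = 11. The answer is 11."),
        ("Q: There are 3 cars in the lot and 2 more arrive. How many cars are in the lot?",
         "A: 3 + 2 = 5. The answer is 5."),
    ]
    question = "Q: A baker makes 4 trays of 6 muffins. How many muffins in total?"

    prompt = "\n\n".join(f"{q}\n{a}" for q, a in exemplars) + "\n\n" + question + "\nA:"
    print(prompt)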
