Llama-2-13b-chat-hf单卡、多卡推理-二.单卡推理

时间:2024-06-01 08:20:28
tee torch_infer.py <<-'EOF'
import os
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer,BitsAndBytesConfig
import torch
import time
import numpy as np
# Export the caching-allocator option before any torch.cuda call is made so
# the setting is already in place when the allocator reads its configuration.
# max_split_size_mb caps the size of blocks the allocator is allowed to split.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
# Collect unreachable Python objects first so tensors they still hold are
# freed, then release the cached-but-unused GPU blocks.
gc.collect()
torch.cuda.empty_cache()

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Relative path handed to from_pretrained() for both model and tokenizer.
model_name = "./Llama-2-13b-chat-hf"

import json
import torch
from torch.utils.data import Dataset, DataLoader

class TextGenerationDataset(Dataset):
    """Map-style dataset of (prompt, reference answer) pairs parsed from JSON.

    The JSON text is expected to decode to a sequence of objects, each with
    an ``input`` key and an ``expected_output`` key.
    """

    def __init__(self, json_data):
        """Decode the JSON string *json_data* and keep the result in ``self.data``."""
        self.data = json.loads(json_data)

    def __len__(self):
        """Return the number of decoded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(input, expected_output)`` tuple of the record at *idx*."""
        record = self.data[idx]
        return record['input'], record['expected_output']

# Inline JSON test data: a list of records, each carrying the prompt under
# "input" and a reference answer under "expected_output". It is parsed later
# by TextGenerationDataset.
json_data =r'''
[
    {"input": "Write a calculator program using Python", "expected_output": "TODO"}
]
'''

def get_gpu_mem_usage():
    """Snapshot the CUDA memory statistics of the module-level ``device``.

    Returns:
        A NumPy array ``[allocated, max_allocated, reserved, max_reserved]``
        where each byte count has been divided by 1024**2 (i.e. MiB).
    """
    bytes_per_mib = 1024 ** 2
    stat_readers = (
        torch.cuda.memory_allocated,
        torch.cuda.max_memory_allocated,
        torch.cuda.memory_reserved,
        torch.cuda.max_memory_reserved,
    )
    return np.array([read(device) / bytes_per_mib for read in stat_readers])

def load_model_fp16():
    """Load the causal LM at ``model_name`` in float16 and move it to ``device``.

    The weights are requested as float16 at load time through ``torch_dtype``
    rather than being loaded in the default dtype and down-cast with
    ``.half()`` afterwards, so no full-precision copy of the model has to be
    held in host memory first.

    Returns:
        The float16 model placed on the module-level ``device``.
    """
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16
    )
    return model.to(device)

def _sync_device():
    """Wait for queued CUDA work on ``device`` to finish; do nothing on CPU."""
    if device.type == "cuda":
        torch.cuda.synchronize(device)


def _timed_generate(model, inputs, **generate_kwargs):
    """Run one seeded ``model.generate`` call and return ``(outputs, seconds)``."""
    # Re-seed before every call so sampling is repeatable between runs.
    torch.manual_seed(42)
    # CUDA work is queued asynchronously; synchronize on both sides of the
    # timed region so the wall-clock interval covers the whole generation.
    _sync_device()
    start_time = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(**inputs, **generate_kwargs)
    _sync_device()
    return outputs, time.perf_counter() - start_time


def predict(model,tokenizer,test_dataloader):
    """Generate a response for the first batch and print latency statistics.

    Only the first batch of *test_dataloader* is used. Each of the three
    passes times a single-token generation (first-token latency) and a full
    generation with the model's default generation settings; every pass
    overwrites the previous measurements, so the printed numbers come from
    the last pass and the earlier ones effectively serve as warm-up.

    Args:
        model: Model exposing ``generate(**inputs, ...)``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        test_dataloader: Iterable yielding ``(input_text, expected_output)``.

    Raises:
        StopIteration: If *test_dataloader* yields no batch.
    """
    input_text, _ = next(iter(test_dataloader))
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    prompt_length = len(inputs["input_ids"][0])

    for _ in range(3):
        _, first_token_time = _timed_generate(model, inputs, max_new_tokens=1)
        outputs, total_time = _timed_generate(model, inputs)

    # The output sequence contains the prompt followed by the new tokens.
    generated_tokens = len(outputs[0]) - prompt_length
    tokens_per_second = generated_tokens / total_time

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("\n\n---------------------------------------- Response -------------------------------------")
    print(f"{response}")
    print("---------------------------------------------------------------------------------------")
    print(f"Time taken for first token: {first_token_time:.4f} seconds")
    print(f"Total time taken: {total_time:.4f} seconds")
    print(f"Number of tokens generated: {generated_tokens}")
    print(f"Tokens per second: {tokens_per_second:.2f}")

# Wrap the inline JSON prompts in a loader that yields one sample per batch,
# in their original order.
test_dataset = TextGenerationDataset(json_data)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=1)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = load_model_fp16()

# Snapshot memory statistics once the model is loaded and again after
# inference; their difference is what the prediction run added.
mem_usage_0 = get_gpu_mem_usage()
predict(model, tokenizer, test_dataloader)
mem_usage_1 = get_gpu_mem_usage()
diff = mem_usage_1 - mem_usage_0

# Columns follow get_gpu_mem_usage(): MA = allocated, MMA = max allocated,
# CA = reserved, MCA = max reserved.
print(f"BEFORE MA: {mem_usage_0[0]:.2f} MMA: {mem_usage_0[1]:.2f} CA: {mem_usage_0[2]:.2f} MCA: {mem_usage_0[3]:.2f}")
print(f"AFTER  MA: {mem_usage_1[0]:.2f} MMA: {mem_usage_1[1]:.2f} CA: {mem_usage_1[2]:.2f} MCA: {mem_usage_1[3]:.2f}")
print(f"DIFF   MA: {diff[0]:.2f} MMA: {diff[1]:.2f} CA: {diff[2]:.2f} MCA: {diff[3]:.2f}")
EOF
python3 torch_infer.py

输出

---------------------------------------- Response -------------------------------------
Write a calculator program using Python to calculate the total area of a rectangle.

Here is the code for the calculator program:

​```
# Define the function to calculate the area of a rectangle
def calculate_area(length, width):
    # Calculate the area of the rectangle
    area = length * width
    # Return the area
    return area

# Define the main program
def main():
    # Get the length and width of the rectangle from the user
    length = float(input("Enter the length of the rectangle: "))
    width = float(input("Enter the width of the rectangle: "))
    # Calculate and display the area of the rectangle
    area = calculate_area(length, width)
    print("The area of the rectangle is:", area)

# Start the main program
main()```

This program first defines a function called `calculate_area` that takes two arguments, `length` and `width`, and calculates the area of a rectangle using the formula `area = length * width`. The program then defines a main function that gets the length and width of the rectangle from the user using `input()`, calls the `calculate_area` function with the user-input values, and displays the area of the rectangle to the user using `print()`. Finally, the program starts the main function by calling it.

Here's an example of how the program would work:

1. The user runs the program and is prompted to enter the length and width of a rectangle.
2. The user enters the length and width (e.g., 5 and 3).
3. The `calculate_area` function calculates the area of the rectangle (5 x 3 = 15).
4. The main function displays the area of the rectangle to the user (e.g., "The area of the rectangle is: 15").

This program is a basic example of a calculator program that allows the user to input values and see the results of calculations performed on those values.
---------------------------------------------------------------------------------------
Time taken for first token: 0.0490 seconds
Total time taken: 21.2933 seconds
Number of tokens generated: 442
Tokens per second: 20.76
BEFORE MA: 24948.81 MMA: 24948.81 CA: 24950.00 MCA: 24950.00
AFTER  MA: 24980.81 MMA: 25682.97 CA: 25968.00 MCA: 25968.00
DIFF   MA: 32.00 MMA: 734.16 CA: 1018.00 MCA: 1018.00