Langchain LLM

Get Started

Install

pip install langchain_llm

Inference Usage

HuggingFace Inference

Completion Usage

from langchain_llm import HuggingFaceLLM

llm = HuggingFaceLLM(
    model_name="qwen-7b-chat",
    model_path="/data/checkpoints/Qwen-7B-Chat",
    load_model_kwargs={"device_map": "auto"},
)

# invoke method ("你是谁?" means "Who are you?")
prompt = "<|im_start|>user\n你是谁?<|im_end|>\n<|im_start|>assistant\n"
print(llm.invoke(prompt, stop=["<|im_end|>"]))

# Token Streaming
for chunk in llm.stream(prompt, stop=["<|im_end|>"]):
    print(chunk, end="", flush=True)

# openai usage
print(llm.call_as_openai(prompt, stop=["<|im_end|>"]))

# Streaming
for chunk in llm.call_as_openai(prompt, stop=["<|im_end|>"], stream=True):
    print(chunk.choices[0].text, end="", flush=True)
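
Because HuggingFaceLLM is a standard LangChain LLM, it also composes with other LangChain runnables. A minimal sketch, assuming a recent LangChain with the runnable (|) composition API; the prompt template and question are illustrative:

from langchain_core.prompts import PromptTemplate

qwen_prompt = PromptTemplate.from_template(
    "<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
)

# bind the stop token once, then compose prompt -> llm
chain = qwen_prompt | llm.bind(stop=["<|im_end|>"])
print(chain.invoke({"question": "Who are you?"}))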

Chat Completion Usage

from langchain_llm import ChatHuggingFace

chat_llm = ChatHuggingFace(llm=llm)

# invoke method
query = "你是谁?"  # "Who are you?"
print(chat_llm.invoke(query))

# Token Streaming
for chunk in chat_llm.stream(query):
    print(chunk.content, end="", flush=True)

# openai usage
messages = [
    {"role": "user", "content": query}
]
print(chat_llm.call_as_openai(messages))

# Streaming
for chunk in chat_llm.call_as_openai(messages, stream=True):
    print(chunk.choices[0].delta.content or "", end="", flush=True)
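
For multi-turn conversations, the chat wrapper should also accept a list of standard LangChain message objects. A minimal sketch, assuming ChatHuggingFace follows LangChain's chat model interface (the import path varies by LangChain version, and the messages are illustrative):

from langchain_core.messages import AIMessage, HumanMessage

history = [
    HumanMessage(content="Who are you?"),
    AIMessage(content="I am a Qwen-based assistant."),  # earlier assistant turn, illustrative
    HumanMessage(content="What can you do?"),
]
print(chat_llm.invoke(history))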

VLLM Inference

Completion Usage

from langchain_llm import VLLM

llm = VLLM(
    model_name="qwen", 
    model="/data/checkpoints/Qwen-7B-Chat", 
    trust_remote_code=True,
)

# invoke method
prompt = "<|im_start|>user\n你是谁?<|im_end|>\n<|im_start|>assistant\n"
print(llm.invoke(prompt, stop=["<|im_end|>"]))

# openai usage
print(llm.call_as_openai(prompt, stop=["<|im_end|>"]))
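
Token streaming is not shown above. As a LangChain LLM, the vLLM wrapper should expose the same stream interface as HuggingFaceLLM; a minimal sketch, assuming that interface carries over:

# token streaming (mirrors the HuggingFaceLLM example above)
for chunk in llm.stream(prompt, stop=["<|im_end|>"]):
    print(chunk, end="", flush=True)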

Chat Completion Usage

from langchain_llm import ChatVLLM

chat_llm = ChatVLLM(llm=llm)

# invoke method
query = "你是谁?"
print(chat_llm.invoke(query))

# openai usage
messages = [
    {"role": "user", "content": query}
]
print(chat_llm.call_as_openai(messages))
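
Streaming over the OpenAI-style interface should mirror the ChatHuggingFace example. A sketch, assuming ChatVLLM supports stream=True the same way:

# OpenAI-style streaming (assumed to work as in the ChatHuggingFace example)
for chunk in chat_llm.call_as_openai(messages, stream=True):
    print(chunk.choices[0].delta.content or "", end="", flush=True)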

Custom Chat Template

from langchain_llm import BaseTemplate, ChatHuggingFace

class CustomTemplate(BaseTemplate):
    
    @property
    def template(self) -> str:
        return (
            "{% for message in messages %}"
            "{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}"
            "{% endfor %}"
            "{% if add_generation_prompt %}"
            "{{ '<|im_start|>assistant\\n' }}"
            "{% endif %}"
        )

chat_llm = ChatHuggingFace(
    llm=llm, 
    prompt_adapter=CustomTemplate()
)
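
The template above uses ChatML-style markers, and its syntax suggests it is a plain Jinja2 string. A minimal sketch of what it renders for a single user message, assuming standard Jinja2 semantics:

from jinja2 import Template

rendered = Template(CustomTemplate().template).render(
    messages=[{"role": "user", "content": "Hello"}],
    add_generation_prompt=True,
)
print(rendered)
# <|im_start|>user
# Hello<|im_end|>
# <|im_start|>assistant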

Load Model Kwargs

  • model_name_or_path: model name or path.

  • use_fast_tokenizer: whether to use a fast tokenizer. Defaults to False.

  • device_map: device placement, e.g. "auto" or "cuda:0".

  • dtype: model dtype: "half", "bfloat16", or "float32".

  • load_in_8bit: load the model in 8-bit precision.

  • load_in_4bit: load the model in 4-bit precision.

  • rope_scaling: which scaling strategy to adopt for the RoPE embeddings. Literal["linear", "dynamic"].

  • flash_attn: enable FlashAttention-2.
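
For example, these options would be passed through load_model_kwargs when constructing the LLM. A minimal sketch; the paths and option values are illustrative:

llm = HuggingFaceLLM(
    model_name="qwen-7b-chat",
    model_path="/data/checkpoints/Qwen-7B-Chat",
    load_model_kwargs={
        "device_map": "auto",
        "dtype": "half",
        "rope_scaling": "dynamic",  # illustrative choice
    },
)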

Merge LoRA Model

from langchain_llm import apply_lora

apply_lora("base_model_path", "lora_path", "target_model_path")
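
After merging, the weights written to target_model_path can be loaded like any other checkpoint. A minimal sketch; the model name and paths are placeholders from the call above:

from langchain_llm import HuggingFaceLLM

merged_llm = HuggingFaceLLM(
    model_name="merged-model",        # illustrative name
    model_path="target_model_path",   # the merge output directory
    load_model_kwargs={"device_map": "auto"},
)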
