99 Hugging Face model quantization
In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFacePipeline
In [2]:
bits_and_bytes = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="bfloat16",
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

model_id = "meta-llama/Llama-3.2-3B"
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bits_and_bytes,
    torch_dtype="auto",
)
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
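Not part of the original run, but a quick sanity check (a sketch, under the assumption that the cell above loaded successfully) is to look at the quantized model's memory footprint and at where accelerate placed each module; with NF4 the weights should take roughly a quarter of their bf16 size.

# Sanity-check sketch, not from the original notebook.
print(f"Quantized footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
print(model.hf_device_map)  # per-module device placement chosen by accelerate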
In [3]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tok,
    max_new_tokens=128,
    temperature=0.1,
    do_sample=False,
)
llm = HuggingFacePipeline(pipeline=pipe)
Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
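The warning is expected: with `do_sample=False` the pipeline decodes greedily, so sampling parameters such as `temperature` and `top_p` are simply ignored. A minimal sketch of the two consistent configurations, assuming the same `model` and `tok` as above:

# Greedy decoding: deterministic, temperature is irrelevant and can be dropped.
greedy_pipe = pipeline("text-generation", model=model, tokenizer=tok,
                       max_new_tokens=128, do_sample=False)

# Sampling: temperature now actually shapes the token distribution.
sampling_pipe = pipeline("text-generation", model=model, tokenizer=tok,
                         max_new_tokens=128, do_sample=True, temperature=0.1)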
In [4]:
# The Indonesian prompt below says: "You are a smart helper assistant. Answer
# questions clearly and consistently, and avoid repetition. Who was the first
# person to set foot on the Moon? Answer:"
print(llm.invoke("""
Kamu adalah asisten pembantu yang pintar
jawab pertanyaan dengan jelas, konsisten dan hindari perulangan
Siapa yang pertama kali menginjakan kaki ke bulan ?
Jawab :
"""))
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Kamu adalah asisten pembantu yang pintar jawab pertanyaan dengan jelas, konsisten dan hindari perulangan Siapa yang pertama kali menginjakan kaki ke bulan ? Jawab : Bumi Siapa yang pertama kali menginjakan kaki ke bulan? Jawab : Bumi Siapa yang pertama kali menginjakan kaki ke bulan? Jawab : Bumi Siapa yang pertama kali menginjakan kaki ke bulan? Jawab : Bumi Siapa yang pertama kali menginjakan kaki ke bulan? Jawab : Bumi Siapa yang pertama kali menginjakan kaki ke bulan? Jawab : Bumi Siapa yang pertama kali menginjakan
Key lessons
- Ollama ≠ an ordinary Hugging Face loader
- Ollama = model + chat template + decoding + guardrails (see the sketch below)
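A minimal sketch of that idea with plain transformers, not run in this notebook and assuming the instruct variant `meta-llama/Llama-3.2-3B-Instruct` is available: swap in an instruction-tuned checkpoint, wrap the question in the model's chat template, and add a small decoding guardrail against the looping seen above.

# Sketch only: instruct model + chat template + decoding guardrail.
instruct_id = "meta-llama/Llama-3.2-3B-Instruct"   # assumed instruct variant
instruct_tok = AutoTokenizer.from_pretrained(instruct_id)
instruct_model = AutoModelForCausalLM.from_pretrained(
    instruct_id,
    device_map="auto",
    quantization_config=bits_and_bytes,   # reuse the 4-bit config from above
)

messages = [
    {"role": "system", "content": "You are a helpful assistant. Answer clearly and avoid repetition."},
    {"role": "user", "content": "Who was the first person to set foot on the Moon?"},
]
prompt = instruct_tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

chat_pipe = pipeline(
    "text-generation",
    model=instruct_model,
    tokenizer=instruct_tok,
    max_new_tokens=128,
    repetition_penalty=1.1,   # mild guardrail against repetition
)
print(chat_pipe(prompt, return_full_text=False)[0]["generated_text"])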
Now I want to try a 4B instruct model
In [5]:
model_name = "Qwen/Qwen3-4B-Instruct-2507"
qwen_tokenizer = AutoTokenizer.from_pretrained(model_name)
model_qwen = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
Some parameters are on the meta device because they were offloaded to the cpu.
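That message means the bf16 weights of Qwen3-4B did not fully fit in GPU memory, so accelerate offloaded part of the model to the CPU. A small sketch (not in the original run) to inspect the placement, plus one assumed way to avoid the offload by reusing the 4-bit config from earlier:

# Inspect per-module placement; values of "cpu" (or "disk") indicate offloaded layers.
print(model_qwen.hf_device_map)

# Assumed alternative, not run here: quantize Qwen the same way so it fits on the GPU.
# model_qwen = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bits_and_bytes,
#     device_map="auto",
# )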
In [7]:
perintah = "Siapa yang pertama kali mendarat dibulan ?"  # "Who was the first to land on the Moon?"
messages = [
    {"role": "user", "content": perintah}
]
text = qwen_tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = qwen_tokenizer([text], return_tensors="pt").to(model_qwen.device)

# conduct text completion
generated_ids = model_qwen.generate(
    **model_inputs,
    max_new_tokens=16384
)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
content = qwen_tokenizer.decode(output_ids, skip_special_tokens=True)  # decode with qwen_tokenizer, not the undefined `tokenizer`
print("content:", content)
---------------------------------------------------------------------------
AcceleratorError                          Traceback (most recent call last)
Cell In[7], line 10
      2 messages = [
      3     {"role": "user", "content": perintah}
      4 ]
      5 text = qwen_tokenizer.apply_chat_template(
      6     messages,
      7     tokenize=False,
      8     add_generation_prompt=True,
      9 )
---> 10 model_inputs = qwen_tokenizer([text], return_tensors="pt").to(model_qwen.device)
     12 # conduct text completion
     13 generated_ids = model_qwen.generate(
     14     **model_inputs,
     15     max_new_tokens=16384
     16 )

File ~\anaconda3\envs\llm\Lib\site-packages\transformers\tokenization_utils_base.py:811, in BatchEncoding.to(self, device, non_blocking)
    806 # This check catches things like APEX blindly calling "to" on all inputs to a module
    807 # Otherwise it passes the casts down and casts the LongTensor containing the token idxs
    808 # into a HalfTensor
    809 if isinstance(device, str) or is_torch_device(device) or isinstance(device, int):
    810     self.data = {
--> 811         k: v.to(device=device, non_blocking=non_blocking) if isinstance(v, torch.Tensor) else v
    812         for k, v in self.data.items()
    813     }
    814 else:
    815     logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.")

AcceleratorError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
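As the traceback itself notes, device-side asserts are reported asynchronously, so the failure may come from an earlier CUDA call rather than the `.to()` line shown. A debugging sketch under those assumptions, not a verified fix: free the first (Llama) model so the two models are not competing for GPU memory, and rerun with `CUDA_LAUNCH_BLOCKING=1` set in a fresh process (before CUDA is initialized) to get an accurate stack trace.

import gc
import torch

# Release the quantized Llama model and its pipeline before loading and running Qwen.
del model, pipe, llm
gc.collect()
torch.cuda.empty_cache()

# For debugging, start the notebook with the environment variable already set, e.g.:
#   CUDA_LAUNCH_BLOCKING=1 jupyter lab
# so kernel launches are synchronous and the real failing op is reported.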