mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-29 09:58:44 +00:00
partners/HuggingFacePipeline[stream]: Change to use pipeline instead of pipeline.model.generate in stream() (#26531)
## Description

I encountered an error while using the `gemma-2-2b-it` model with the `HuggingFacePipeline` class and have implemented a fix to resolve the issue.

### What is the Problem

```python
model_id = "google/gemma-2-2b-it"
gemma_2_model = AutoModelForCausalLM.from_pretrained(model_id)
gemma_2_tokenizer = AutoTokenizer.from_pretrained(model_id)

gen = pipeline(
    task='text-generation',
    model=gemma_2_model,
    tokenizer=gemma_2_tokenizer,
    max_new_tokens=1024,
    device=0 if torch.cuda.is_available() else -1,
    temperature=.5,
    top_p=0.7,
    repetition_penalty=1.1,
    do_sample=True,
)

llm = HuggingFacePipeline(pipeline=gen)

for chunk in llm.stream("Hello World. Hello World. Hello World. Hello World. Hello World. Hello World. Hello World. Hello World. Hello World. Hello World."):
    print(chunk, end="", flush=True)
```

This code produces the following error:

```
/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1258: UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.
  warnings.warn(
Exception in thread Thread-19 (generate):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 1874, in generate
    self._validate_generated_length(generation_config, input_ids_length, has_default_max_length)
  File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 1266, in _validate_generated_length
    raise ValueError(
ValueError: Input length of input_ids is 31, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.
```

In addition, a different error occurs when the prompt is shortened (fewer input tokens):

```python
for chunk in llm.stream("Hello World"):
    print(chunk, end="", flush=True)
```

```
/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1258: UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1885: UserWarning: You are calling .generate() with the `input_ids` being on a device type different than your model's device. `input_ids` is on cpu, whereas the model is on cuda. You may experience unexpected behaviors or slower generation. Please make sure that you have put `input_ids` to the correct device by calling for example input_ids = input_ids.to('cuda') before running `.generate()`.
  warnings.warn(
Exception in thread Thread-20 (generate):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 2024, in generate
    result = self._sample(
  File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 2982, in _sample
    outputs = self(**model_inputs, return_dict=True)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/gemma2/modeling_gemma2.py", line 994, in forward
    outputs = self.model(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/gemma2/modeling_gemma2.py", line 803, in forward
    inputs_embeds = self.embed_tokens(input_ids)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/sparse.py", line 164, in forward
    return F.embedding(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py", line 2267, in embedding
    return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)
```

On the other hand, `invoke()` produces normal output:

```python
llm.invoke("Hello World. Hello World. Hello World. Hello World. Hello World. Hello World. Hello World. Hello World. Hello World. Hello World.")
```

```
'Hello World. Hello World. Hello World. Hello World. Hello World. Hello World. Hello World. Hello World. Hello World. Hello World.\n\nThis is a simple program that prints the phrase "Hello World" to the console. \n\n**Here\'s how it works:**\n\n* **`print("Hello World")`**: This line of code uses the `print()` function, which is a built-in function in most programming languages (like Python). The `print()` function takes whatever you put inside its parentheses and displays it on the screen.\n* **`"Hello World"`**: The text within the double quotes (`"`) is called a string. It represents the message we want to print.\n\n\nLet me know if you\'d like to explore other programming concepts or see more examples! \n'
```

### Problem Analysis

- The kwargs supplied when the pipeline is created are applied by `invoke()`, but not by `stream()`.
- In `stream()`, `inputs = self.pipeline.tokenizer(prompt, return_tensors="pt")` leaves the input tensors on the CPU.
- This can crash when the model is on a GPU (a minimal reproduction follows below).
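The device mismatch described in the bullets above can be reproduced with plain `transformers`, outside LangChain. The following is only an illustrative sketch, not part of the fix; it assumes a CUDA device and reuses the `gemma-2-2b-it` model from the example above.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "google/gemma-2-2b-it"
model = AutoModelForCausalLM.from_pretrained(model_id).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# What the old stream() effectively did: tokenize without specifying a device,
# so the input tensors stay on the CPU while the model sits on CUDA.
inputs = tokenizer("Hello World", return_tensors="pt")
# model.generate(**inputs)  # raises "Expected all tensors to be on the same device"

# Moving the tensors to the model's device avoids the crash; calling the
# pipeline instead (as this PR does) lets transformers handle this internally.
inputs = {k: v.to(model.device) for k, v in inputs.items()}
output_ids = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```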
### Solution

Use `self.pipeline` instead of `self.pipeline.model.generate`.

- **Original code**

```python
stopping_criteria = StoppingCriteriaList([StopOnTokens()])
inputs = self.pipeline.tokenizer(prompt, return_tensors="pt")
streamer = TextIteratorStreamer(
    self.pipeline.tokenizer,
    timeout=60.0,
    skip_prompt=skip_prompt,
    skip_special_tokens=True,
)
generation_kwargs = dict(
    inputs,
    streamer=streamer,
    stopping_criteria=stopping_criteria,
    **pipeline_kwargs,
)
t1 = Thread(target=self.pipeline.model.generate, kwargs=generation_kwargs)
t1.start()
```

- **Updated code**

```python
stopping_criteria = StoppingCriteriaList([StopOnTokens()])
streamer = TextIteratorStreamer(
    self.pipeline.tokenizer,
    timeout=60.0,
    skip_prompt=skip_prompt,
    skip_special_tokens=True,
)
generation_kwargs = dict(
    text_inputs=prompt,
    streamer=streamer,
    stopping_criteria=stopping_criteria,
    **pipeline_kwargs,
)
t1 = Thread(target=self.pipeline, kwargs=generation_kwargs)
t1.start()
```

By calling the `pipeline` directly, the pipeline's `kwargs` are applied, and there is no need to manage the `device` of the tensors produced by the `tokenizer`.

> Following the change to call `pipeline`, the prompt is now passed into `generation_kwargs` directly as `text_inputs=prompt`.

## Issue

None

## Dependencies

None

## Twitter handle

None

---------

Co-authored-by: Vadym Barda <vadym@langchain.dev>
This commit is contained in:
parent 655ced84d7
commit 6227396e20
```diff
@@ -349,7 +349,6 @@ class HuggingFacePipeline(BaseLLM):
         stopping_criteria = StoppingCriteriaList([StopOnTokens()])
-        inputs = self.pipeline.tokenizer(prompt, return_tensors="pt")
         streamer = TextIteratorStreamer(
             self.pipeline.tokenizer,
             timeout=60.0,
@@ -357,12 +356,12 @@ class HuggingFacePipeline(BaseLLM):
             skip_special_tokens=True,
         )
         generation_kwargs = dict(
-            inputs,
+            text_inputs=prompt,
             streamer=streamer,
             stopping_criteria=stopping_criteria,
             **pipeline_kwargs,
         )
-        t1 = Thread(target=self.pipeline.model.generate, kwargs=generation_kwargs)
+        t1 = Thread(target=self.pipeline, kwargs=generation_kwargs)
         t1.start()

         for char in streamer:
```
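For reference, the pattern the updated lines rely on can be sketched standalone: a transformers text-generation pipeline is itself callable, handles tokenization and device placement internally, and forwards extra keyword arguments such as `streamer` to `generate()`, so it can serve directly as the thread target. This is an illustrative sketch, not code from the PR; the model name and generation parameters are assumptions.

```python
from threading import Thread

from transformers import TextIteratorStreamer, pipeline

# Illustrative pipeline; any text-generation pipeline works the same way.
pipe = pipeline(
    "text-generation",
    model="google/gemma-2-2b-it",
    device=0,
    max_new_tokens=64,
)

streamer = TextIteratorStreamer(
    pipe.tokenizer, skip_prompt=True, skip_special_tokens=True
)

# The pipeline call tokenizes the prompt and moves the tensors to the model's
# device itself, so no manual tokenization or .to(device) is needed here.
thread = Thread(
    target=pipe,
    kwargs=dict(text_inputs="Hello World", streamer=streamer),
)
thread.start()

for new_text in streamer:
    print(new_text, end="", flush=True)

thread.join()
```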