- **Description:** Tiny but important bugfix to use a more stable interface for specifying generation_config parameters for the DeepSparse LLM.
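For context, the more stable interface means the generation parameters are supplied as a single `generation_config` dict on the `DeepSparse` constructor and forwarded to the pipeline under that name, rather than being unpacked as keyword arguments. A minimal sketch of the intended usage, reusing the SparseZoo stub from the docstrings below (the parameter values are illustrative, not defaults):

```python
from langchain_community.llms import DeepSparse

# Generation parameters are grouped into one dict; the LLM forwards it to the
# DeepSparse pipeline via its `generation_config` argument.
llm = DeepSparse(
    model="zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base_quant-none",
    generation_config={"max_new_tokens": 64, "top_p": 0.9},  # illustrative values
)
print(llm("Tell me a joke."))
```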
# flake8: noqa
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Union
from langchain_core.pydantic_v1 import root_validator
from langchain_core.callbacks import (
    AsyncCallbackManagerForLLMRun,
    CallbackManagerForLLMRun,
)
from langchain_core.language_models.llms import LLM
from langchain_community.llms.utils import enforce_stop_tokens
from langchain_core.outputs import GenerationChunk


class DeepSparse(LLM):
    """Neural Magic DeepSparse LLM interface.
    To use, you should have the ``deepsparse`` or ``deepsparse-nightly``
    python package installed. See https://github.com/neuralmagic/deepsparse
    This interface lets you deploy optimized LLMs straight from the
    [SparseZoo](https://sparsezoo.neuralmagic.com/?useCase=text_generation)
    Example:
        .. code-block:: python
            from langchain_community.llms import DeepSparse
            llm = DeepSparse(model="zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base_quant-none")
    """  # noqa: E501

    pipeline: Any  #: :meta private:

    model: str
    """The path to a model file or directory or the name of a SparseZoo model stub."""

    model_config: Optional[Dict[str, Any]] = None
    """Keyword arguments passed to the pipeline construction.
    Common parameters are sequence_length, prompt_sequence_length"""

    generation_config: Union[None, str, Dict] = None
    """GenerationConfig dictionary consisting of parameters used to control
    sequences generated for each prompt. Common parameters are:
    max_length, max_new_tokens, num_return_sequences, output_scores,
    top_p, top_k, repetition_penalty."""

    streaming: bool = False
    """Whether to stream the results, token by token."""

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Get the identifying parameters."""
        return {
            "model": self.model,
            "model_config": self.model_config,
            "generation_config": self.generation_config,
            "streaming": self.streaming,
        }

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "deepsparse"

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that ``deepsparse`` package is installed."""
        try:
            from deepsparse import Pipeline
        except ImportError:
            raise ImportError(
                "Could not import `deepsparse` package. "
                "Please install it with `pip install deepsparse[llm]`"
            )

        model_config = values["model_config"] or {}

        values["pipeline"] = Pipeline.create(
            task="text_generation",
            model_path=values["model"],
            **model_config,
        )
        return values

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Generate text from a prompt.
        Args:
            prompt: The prompt to generate text from.
            stop: A list of strings to stop generation when encountered.
        Returns:
            The generated text.
        Example:
            .. code-block:: python
                from langchain_community.llms import DeepSparse
                llm = DeepSparse(model="zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base_quant-none")
                llm("Tell me a joke.")
        """
        if self.streaming:
            combined_output = ""
            for chunk in self._stream(
                prompt=prompt, stop=stop, run_manager=run_manager, **kwargs
            ):
                combined_output += chunk.text
            text = combined_output
        else:
            text = (
                self.pipeline(sequences=prompt, generation_config=self.generation_config)
                .generations[0]
                .text
            )

        if stop is not None:
            text = enforce_stop_tokens(text, stop)

        return text

    async def _acall(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Generate text from a prompt.
        Args:
            prompt: The prompt to generate text from.
            stop: A list of strings to stop generation when encountered.
        Returns:
            The generated text.
        Example:
            .. code-block:: python
                from langchain_community.llms import DeepSparse
                llm = DeepSparse(model="zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base_quant-none")
                llm("Tell me a joke.")
        """
        if self.streaming:
            combined_output = ""
            async for chunk in self._astream(
                prompt=prompt, stop=stop, run_manager=run_manager, **kwargs
            ):
                combined_output += chunk.text
            text = combined_output
        else:
            text = (
                self.pipeline(sequences=prompt, generation_config=self.generation_config)
                .generations[0]
                .text
            )

        if stop is not None:
            text = enforce_stop_tokens(text, stop)

        return text

    def _stream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        """Yields results objects as they are generated in real time.
        It also calls the callback manager's on_llm_new_token event with
        similar parameters to the OpenAI LLM class method of the same name.
        Args:
            prompt: The prompt to pass into the model.
            stop: Optional list of stop words to use when generating.
        Returns:
            A generator representing the stream of tokens being generated.
        Yields:
            A dictionary like object containing a string token.
        Example:
            .. code-block:: python
                from langchain_community.llms import DeepSparse
                llm = DeepSparse(
                    model="zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base_quant-none",
                    streaming=True
                )
                for chunk in llm.stream("Tell me a joke",
                        stop=["'","\n"]):
                    print(chunk, end='', flush=True)
        """
        inference = self.pipeline(
            sequences=prompt, streaming=True, generation_config=self.generation_config
        )
        for token in inference:
            chunk = GenerationChunk(text=token.generations[0].text)
            yield chunk

            if run_manager:
                run_manager.on_llm_new_token(token=chunk.text)

    async def _astream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> AsyncIterator[GenerationChunk]:
        """Yields results objects as they are generated in real time.
        It also calls the callback manager's on_llm_new_token event with
        similar parameters to the OpenAI LLM class method of the same name.
        Args:
            prompt: The prompt to pass into the model.
            stop: Optional list of stop words to use when generating.
        Returns:
            A generator representing the stream of tokens being generated.
        Yields:
            A dictionary like object containing a string token.
        Example:
            .. code-block:: python
                from langchain_community.llms import DeepSparse
                llm = DeepSparse(
                    model="zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base_quant-none",
                    streaming=True
                )
                for chunk in llm.stream("Tell me a joke",
                        stop=["'","\n"]):
                    print(chunk, end='', flush=True)
        """
        inference = self.pipeline(
            sequences=prompt, streaming=True, generation_config=self.generation_config
        )
        for token in inference:
            chunk = GenerationChunk(text=token.generations[0].text)
            yield chunk

            if run_manager:
                await run_manager.on_llm_new_token(token=chunk.text)
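A similar sketch for the streaming path, reusing the docstring's stub and stop tokens (the generation_config value is illustrative): both `_stream` and `_astream` forward the same `generation_config` dict to the pipeline call, so streaming and non-streaming generation honour the same parameters.

```python
from langchain_community.llms import DeepSparse

# Streaming sketch: tokens arrive chunk by chunk via llm.stream(); the same
# generation_config dict is forwarded on the underlying pipeline call.
llm = DeepSparse(
    model="zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base_quant-none",
    streaming=True,
    generation_config={"max_new_tokens": 32},  # illustrative value
)
for chunk in llm.stream("Tell me a joke", stop=["'", "\n"]):
    print(chunk, end="", flush=True)
```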