Takeoff integration (#9045)

## Description:
This PR adds the Titan Takeoff Server to the available LLMs in
LangChain.

Titan Takeoff is an inference server created by
[TitanML](https://www.titanml.co/) that lets you deploy large language
models locally on your own hardware with a single command. Most
generative model architectures are supported, including Falcon, Llama 2,
GPT-2, T5, and many more.

Read more about Titan Takeoff here:
-
[Blog](https://medium.com/@TitanML/introducing-titan-takeoff-6c30e55a8e1e)
- [Docs](https://docs.titanml.co/docs/titan-takeoff/getting-started)

#### Testing
Because Titan Takeoff runs locally (on port 8000 by default), the tests
need no network access; server responses are mocked with the `responses`
library.

- [x] Make Lint
- [x] Make Format
- [x] Make Test

#### Dependencies
No new dependencies are introduced. However, to use the integration,
users need to install the `titan-iris` package in their local
environment and start the Titan Takeoff inference server.
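
A minimal usage sketch, assuming the server has already been started on
the default port 8000 (the model name and CLI flags in the comments are
illustrative only; see the Takeoff docs for the exact commands):

```python
# Outside Python (illustrative; check the Takeoff docs for exact flags):
#   pip install titan-iris
#   iris takeoff --model tiiuae/falcon-7b-instruct --device cpu

from langchain.llms import TitanTakeoff

# Assumes the Takeoff server is listening on localhost:8000 (the default).
llm = TitanTakeoff(
    port=8000,
    generate_max_length=64,
    sampling_temperature=0.7,
)

print(llm("What is the capital of the United Kingdom?"))
```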

Thanks for your help and please let me know if you have any questions.

cc: @hwchase17 @baskaryan
Commit 8d351bfc20 (parent 3bdc273ab3) by Blake (Yung Cher Ho),
2023-08-10 18:56:06 +01:00, committed via GitHub.
4 changed files with 347 additions and 0 deletions

@@ -76,6 +76,7 @@ from langchain.llms.self_hosted_hugging_face import SelfHostedHuggingFaceLLM
from langchain.llms.stochasticai import StochasticAI
from langchain.llms.symblai_nebula import Nebula
from langchain.llms.textgen import TextGen
from langchain.llms.titan_takeoff import TitanTakeoff
from langchain.llms.tongyi import Tongyi
from langchain.llms.vertexai import VertexAI
from langchain.llms.vllm import VLLM
@@ -142,6 +143,7 @@ __all__ = [
"SelfHostedHuggingFaceLLM",
"SelfHostedPipeline",
"StochasticAI",
"TitanTakeoff",
"Tongyi",
"VertexAI",
"VLLM",
@@ -203,6 +205,7 @@ type_to_cls_dict: Dict[str, Type[BaseLLM]] = {
"self_hosted_hugging_face": SelfHostedHuggingFaceLLM,
"stochasticai": StochasticAI,
"tongyi": Tongyi,
"titan_takeoff": TitanTakeoff,
"vertexai": VertexAI,
"openllm": OpenLLM,
"openllm_client": OpenLLM,

@@ -0,0 +1,157 @@
from typing import Any, Iterator, List, Mapping, Optional

import requests
from requests.exceptions import ConnectionError

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens
from langchain.schema.output import GenerationChunk


class TitanTakeoff(LLM):
    """Wrapper around Titan Takeoff APIs."""

    port: int = 8000
    """Specifies the port to use for the Titan Takeoff API. Default = 8000."""

    generate_max_length: int = 128
    """Maximum generation length. Default = 128."""

    sampling_topk: int = 1
    """Sample predictions from the top K most probable candidates. Default = 1."""

    sampling_topp: float = 1.0
    """Sample from predictions whose cumulative probability exceeds this value.
    Default = 1.0.
    """

    sampling_temperature: float = 1.0
    """Sample with randomness. Bigger temperatures are associated with
    more randomness and 'creativity'. Default = 1.0.
    """

    repetition_penalty: float = 1.0
    """Penalise the generation of tokens that have been generated before.
    Set to > 1 to penalize. Default = 1 (no penalty).
    """

    no_repeat_ngram_size: int = 0
    """Prevent repetitions of ngrams of this size. Default = 0 (turned off)."""

    streaming: bool = False
    """Whether to stream the output. Default = False."""

    @property
    def _default_params(self) -> Mapping[str, Any]:
        """Get the default parameters for calling Titan Takeoff Server."""
        return {
            "generate_max_length": self.generate_max_length,
            "sampling_topk": self.sampling_topk,
            "sampling_topp": self.sampling_topp,
            "sampling_temperature": self.sampling_temperature,
            "repetition_penalty": self.repetition_penalty,
            "no_repeat_ngram_size": self.no_repeat_ngram_size,
        }

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "titan_takeoff"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Call out to Titan Takeoff generate endpoint.

        Args:
            prompt: The prompt to pass into the model.
            stop: Optional list of stop words to use when generating.

        Returns:
            The string generated by the model.

        Example:
            .. code-block:: python

                prompt = "What is the capital of the United Kingdom?"
                response = model(prompt)
        """
        try:
            if self.streaming:
                # Delegate to the streaming endpoint and concatenate the chunks.
                text_output = ""
                for chunk in self._stream(
                    prompt=prompt,
                    stop=stop,
                    run_manager=run_manager,
                ):
                    text_output += chunk.text
                return text_output

            url = f"http://localhost:{self.port}/generate"
            params = {"text": prompt, **self._default_params}

            response = requests.post(url, json=params)
            response.raise_for_status()
            response.encoding = "utf-8"

            response_json = response.json()
            if "message" not in response_json:
                raise ValueError(
                    "Unexpected response from Titan Takeoff server: "
                    f"missing 'message' field in {response_json}"
                )
            text = response_json["message"]

            if stop is not None:
                text = enforce_stop_tokens(text, stop)
            return text
        except ConnectionError:
            raise ConnectionError(
                "Could not connect to Titan Takeoff server. "
                "Please make sure that the server is running."
            )

    def _stream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        """Call out to Titan Takeoff stream endpoint.

        Args:
            prompt: The prompt to pass into the model.
            stop: Optional list of stop words to use when generating.

        Yields:
            GenerationChunk objects, one per streamed token.

        Example:
            .. code-block:: python

                prompt = "What is the capital of the United Kingdom?"
                response = model(prompt)
        """
        url = f"http://localhost:{self.port}/generate_stream"
        params = {"text": prompt, **self._default_params}

        response = requests.post(url, json=params, stream=True)
        response.encoding = "utf-8"
        for text in response.iter_content(chunk_size=1, decode_unicode=True):
            if text:
                chunk = GenerationChunk(text=text)
                yield chunk
                if run_manager:
                    run_manager.on_llm_new_token(token=chunk.text)

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {"port": self.port, **self._default_params}
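For streaming, `_stream` posts to `/generate_stream` and yields one
`GenerationChunk` per decoded chunk, reporting each token to the callback
manager. A hypothetical streaming usage sketch, assuming a server on the
default port and using LangChain's stock `StreamingStdOutCallbackHandler`:

```python
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import TitanTakeoff

# streaming=True makes _call delegate to _stream and concatenate the chunks,
# while the callback handler prints each token as it arrives.
llm = TitanTakeoff(
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
    sampling_temperature=0.7,
)

print(llm("Tell me about the history of the University of Cambridge."))
```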

@@ -0,0 +1,18 @@
"""Test Titan Takeoff wrapper."""
import responses
from langchain.llms.titan_takeoff import TitanTakeoff
@responses.activate
def test_titan_takeoff_call() -> None:
"""Test valid call to Titan Takeoff."""
url = "http://localhost:8000/generate"
responses.add(responses.POST, url, json={"message": "2 + 2 is 4"}, status=200)
# response = requests.post(url)
llm = TitanTakeoff()
output = llm("What is 2 + 2?")
assert isinstance(output, str)