Mirror of https://github.com/hwchase17/langchain.git (synced 2025-06-29 01:48:57 +00:00)
Commit 6ce276e099 (parent: 3fbb2f3e52)

Description:
* Add _generate and _agenerate to support Fireworks batching.
* Add stop-word test cases.
* Allow opting out of the retry mechanism (new use_retry flag).

Issue: Not applicable
Dependencies: None
Tag maintainer: @baskaryan
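For orientation, here is a minimal usage sketch of what the change enables. It is not part of the commit: the langchain.llms.fireworks import path is assumed to mirror the chat-model path used in the tests below, and running it would need the fireworks client installed plus a FIREWORKS_API_KEY in the environment (assumed variable name).

# Hedged sketch of the behavior added in this commit, not code from the diff.
from langchain.chat_models.fireworks import ChatFireworks
from langchain.llms.fireworks import Fireworks  # assumed module path

# New use_retry flag: opt out of the tenacity retry wrapper entirely.
chat = ChatFireworks(use_retry=False)
message = chat.invoke("How is the weather in New York today?", stop=[","])
print(message.content)

# Fireworks now implements _generate/_agenerate, so a multi-prompt call is
# split into sub-batches of `batch_size` prompts per request.
llm = Fireworks(batch_size=2)
result = llm.generate(["Tell me a joke.", "Name a planet.", "Say hi."])
print(len(result.generations))  # one generation list per input prompt
print(result.llm_output)        # {"model": "accounts/fireworks/models/llama-v2-7b-chat"}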
Changes to the ChatFireworks chat model:

@@ -89,6 +89,7 @@ class ChatFireworks(BaseChatModel):
     )
     fireworks_api_key: Optional[str] = None
     max_retries: int = 20
+    use_retry: bool = True

     @property
     def lc_secrets(self) -> Dict[str, str]:
@@ -134,7 +135,11 @@ class ChatFireworks(BaseChatModel):
             **self.model_kwargs,
         }
         response = completion_with_retry(
-            self, run_manager=run_manager, stop=stop, **params
+            self,
+            self.use_retry,
+            run_manager=run_manager,
+            stop=stop,
+            **params,
         )
         return self._create_chat_result(response)

@@ -152,7 +157,7 @@ class ChatFireworks(BaseChatModel):
             **self.model_kwargs,
         }
         response = await acompletion_with_retry(
-            self, run_manager=run_manager, stop=stop, **params
+            self, self.use_retry, run_manager=run_manager, stop=stop, **params
         )
         return self._create_chat_result(response)

@@ -195,7 +200,7 @@ class ChatFireworks(BaseChatModel):
             **self.model_kwargs,
         }
         for chunk in completion_with_retry(
-            self, run_manager=run_manager, stop=stop, **params
+            self, self.use_retry, run_manager=run_manager, stop=stop, **params
         ):
             choice = chunk.choices[0]
             chunk = _convert_delta_to_message_chunk(choice.delta, default_chunk_class)
@@ -224,7 +229,7 @@ class ChatFireworks(BaseChatModel):
             **self.model_kwargs,
         }
         async for chunk in await acompletion_with_retry_streaming(
-            self, run_manager=run_manager, stop=stop, **params
+            self, self.use_retry, run_manager=run_manager, stop=stop, **params
         ):
             choice = chunk.choices[0]
             chunk = _convert_delta_to_message_chunk(choice.delta, default_chunk_class)
@@ -238,8 +243,20 @@ class ChatFireworks(BaseChatModel):
                 await run_manager.on_llm_new_token(token=chunk.content, chunk=chunk)


+def conditional_decorator(
+    condition: bool, decorator: Callable[[Any], Any]
+) -> Callable[[Any], Any]:
+    def actual_decorator(func: Callable[[Any], Any]) -> Callable[[Any], Any]:
+        if condition:
+            return decorator(func)
+        return func
+
+    return actual_decorator
+
+
 def completion_with_retry(
     llm: ChatFireworks,
+    use_retry: bool,
     *,
     run_manager: Optional[CallbackManagerForLLMRun] = None,
     **kwargs: Any,
@@ -249,7 +266,7 @@ def completion_with_retry(

     retry_decorator = _create_retry_decorator(llm, run_manager=run_manager)

-    @retry_decorator
+    @conditional_decorator(use_retry, retry_decorator)
     def _completion_with_retry(**kwargs: Any) -> Any:
         return fireworks.client.ChatCompletion.create(
             **kwargs,
@@ -260,6 +277,7 @@ def completion_with_retry(

 async def acompletion_with_retry(
     llm: ChatFireworks,
+    use_retry: bool,
     *,
     run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
     **kwargs: Any,
@@ -269,7 +287,7 @@ async def acompletion_with_retry(

     retry_decorator = _create_retry_decorator(llm, run_manager=run_manager)

-    @retry_decorator
+    @conditional_decorator(use_retry, retry_decorator)
     async def _completion_with_retry(**kwargs: Any) -> Any:
         return await fireworks.client.ChatCompletion.acreate(
             **kwargs,
@@ -280,6 +298,7 @@ async def acompletion_with_retry(

 async def acompletion_with_retry_streaming(
     llm: ChatFireworks,
+    use_retry: bool,
     *,
     run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
     **kwargs: Any,
@@ -289,7 +308,7 @@ async def acompletion_with_retry_streaming(

     retry_decorator = _create_retry_decorator(llm, run_manager=run_manager)

-    @retry_decorator
+    @conditional_decorator(use_retry, retry_decorator)
     async def _completion_with_retry(**kwargs: Any) -> Any:
         return fireworks.client.ChatCompletion.acreate(
             **kwargs,
@@ -309,6 +328,8 @@ def _create_retry_decorator(

     errors = [
         fireworks.client.error.RateLimitError,
+        fireworks.client.error.InternalServerError,
+        fireworks.client.error.BadGatewayError,
         fireworks.client.error.ServiceUnavailableError,
     ]
     return create_base_retry_decorator(
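Both modules gain the same conditional_decorator helper, which is what makes the retry opt-out work: when use_retry is False, the tenacity decorator returned by _create_retry_decorator is simply never applied to the inner call. Below is a minimal, self-contained illustration of that pattern; the uppercase decorator is a stand-in for the real retry decorator and is not part of this commit.

from typing import Any, Callable


def conditional_decorator(
    condition: bool, decorator: Callable[[Any], Any]
) -> Callable[[Any], Any]:
    """Apply `decorator` only when `condition` is truthy; otherwise return the function unchanged."""

    def actual_decorator(func: Callable[[Any], Any]) -> Callable[[Any], Any]:
        if condition:
            return decorator(func)
        return func

    return actual_decorator


def uppercase(func: Callable[..., str]) -> Callable[..., str]:
    # Stand-in for the tenacity retry decorator used in the real code.
    def wrapper(*args: Any, **kwargs: Any) -> str:
        return func(*args, **kwargs).upper()

    return wrapper


@conditional_decorator(True, uppercase)   # decorator applied
def wrapped(name: str) -> str:
    return f"hello {name}"


@conditional_decorator(False, uppercase)  # decorator skipped: the use_retry=False path
def unwrapped(name: str) -> str:
    return f"hello {name}"


print(wrapped("fireworks"))    # HELLO FIREWORKS
print(unwrapped("fireworks"))  # hello fireworks

Because the decoration happens inside each completion helper at call time, the retry choice is threaded through as a plain argument rather than being baked into the class.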
Changes to the Fireworks LLM:

@@ -1,12 +1,14 @@
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
 from typing import Any, AsyncIterator, Callable, Dict, Iterator, List, Optional, Union

 from langchain.callbacks.manager import (
     AsyncCallbackManagerForLLMRun,
     CallbackManagerForLLMRun,
 )
-from langchain.llms.base import LLM, create_base_retry_decorator
+from langchain.llms.base import BaseLLM, create_base_retry_decorator
 from langchain.pydantic_v1 import Field, root_validator
-from langchain.schema.output import GenerationChunk
+from langchain.schema.output import Generation, GenerationChunk, LLMResult
 from langchain.utils.env import get_from_dict_or_env


@@ -23,7 +25,7 @@ def _stream_response_to_generation_chunk(
     )


-class Fireworks(LLM):
+class Fireworks(BaseLLM):
     """Fireworks models."""

     model: str = "accounts/fireworks/models/llama-v2-7b-chat"
@@ -36,6 +38,8 @@ class Fireworks(LLM):
     )
     fireworks_api_key: Optional[str] = None
     max_retries: int = 20
+    batch_size: int = 20
+    use_retry: bool = True

     @property
     def lc_secrets(self) -> Dict[str, str]:
@@ -66,43 +70,92 @@ class Fireworks(LLM):
         """Return type of llm."""
         return "fireworks"

-    def _call(
+    def _generate(
         self,
-        prompt: str,
+        prompts: List[str],
         stop: Optional[List[str]] = None,
         run_manager: Optional[CallbackManagerForLLMRun] = None,
         **kwargs: Any,
-    ) -> str:
-        """Run the LLM on the given prompt and input."""
-        params: dict = {
+    ) -> LLMResult:
+        """Call out to Fireworks endpoint with k unique prompts.
+        Args:
+            prompts: The prompts to pass into the model.
+            stop: Optional list of stop words to use when generating.
+        Returns:
+            The full LLM output.
+        """
+        params = {
             "model": self.model,
-            "prompt": prompt,
             **self.model_kwargs,
         }
-        response = completion_with_retry(
-            self, run_manager=run_manager, stop=stop, **params
-        )
+        sub_prompts = self.get_batch_prompts(prompts)
+        choices = []
+        for _prompts in sub_prompts:
+            response = completion_with_retry_batching(
+                self,
+                self.use_retry,
+                prompt=_prompts,
+                run_manager=run_manager,
+                stop=stop,
+                **params,
+            )
+            choices.extend(response)

-        return response.choices[0].text
+        return self.create_llm_result(choices, prompts)

-    async def _acall(
+    async def _agenerate(
         self,
-        prompt: str,
+        prompts: List[str],
         stop: Optional[List[str]] = None,
         run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
         **kwargs: Any,
-    ) -> str:
-        """Run the LLM on the given prompt and input."""
+    ) -> LLMResult:
+        """Call out to Fireworks endpoint async with k unique prompts."""
         params = {
             "model": self.model,
-            "prompt": prompt,
             **self.model_kwargs,
         }
-        response = await acompletion_with_retry(
-            self, run_manager=run_manager, stop=stop, **params
-        )
+        sub_prompts = self.get_batch_prompts(prompts)
+        choices = []
+        for _prompts in sub_prompts:
+            response = await acompletion_with_retry_batching(
+                self,
+                self.use_retry,
+                prompt=_prompts,
+                run_manager=run_manager,
+                stop=stop,
+                **params,
+            )
+            choices.extend(response)

-        return response.choices[0].text
+        return self.create_llm_result(choices, prompts)

+    def get_batch_prompts(
+        self,
+        prompts: List[str],
+    ) -> List[List[str]]:
+        """Get the sub prompts for llm call."""
+        sub_prompts = [
+            prompts[i : i + self.batch_size]
+            for i in range(0, len(prompts), self.batch_size)
+        ]
+        return sub_prompts
+
+    def create_llm_result(self, choices: Any, prompts: List[str]) -> LLMResult:
+        """Create the LLMResult from the choices and prompts."""
+        generations = []
+        for i, _ in enumerate(prompts):
+            sub_choices = choices[i : (i + 1)]
+            generations.append(
+                [
+                    Generation(
+                        text=choice.__dict__["choices"][0].text,
+                    )
+                    for choice in sub_choices
+                ]
+            )
+        llm_output = {"model": self.model}
+        return LLMResult(generations=generations, llm_output=llm_output)
+
     def _stream(
         self,
@@ -118,7 +171,7 @@ class Fireworks(LLM):
             **self.model_kwargs,
         }
         for stream_resp in completion_with_retry(
-            self, run_manager=run_manager, stop=stop, **params
+            self, self.use_retry, run_manager=run_manager, stop=stop, **params
         ):
             chunk = _stream_response_to_generation_chunk(stream_resp)
             yield chunk
@@ -139,7 +192,7 @@ class Fireworks(LLM):
             **self.model_kwargs,
         }
         async for stream_resp in await acompletion_with_retry_streaming(
-            self, run_manager=run_manager, stop=stop, **params
+            self, self.use_retry, run_manager=run_manager, stop=stop, **params
         ):
             chunk = _stream_response_to_generation_chunk(stream_resp)
             yield chunk
@@ -147,8 +200,20 @@ class Fireworks(LLM):
                 await run_manager.on_llm_new_token(chunk.text, chunk=chunk)


+def conditional_decorator(
+    condition: bool, decorator: Callable[[Any], Any]
+) -> Callable[[Any], Any]:
+    def actual_decorator(func: Callable[[Any], Any]) -> Callable[[Any], Any]:
+        if condition:
+            return decorator(func)
+        return func
+
+    return actual_decorator
+
+
 def completion_with_retry(
     llm: Fireworks,
+    use_retry: bool,
     *,
     run_manager: Optional[CallbackManagerForLLMRun] = None,
     **kwargs: Any,
@@ -158,7 +223,7 @@ def completion_with_retry(

     retry_decorator = _create_retry_decorator(llm, run_manager=run_manager)

-    @retry_decorator
+    @conditional_decorator(use_retry, retry_decorator)
     def _completion_with_retry(**kwargs: Any) -> Any:
         return fireworks.client.Completion.create(
             **kwargs,
@@ -169,6 +234,7 @@ def completion_with_retry(

 async def acompletion_with_retry(
     llm: Fireworks,
+    use_retry: bool,
     *,
     run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
     **kwargs: Any,
@@ -178,7 +244,7 @@ async def acompletion_with_retry(

     retry_decorator = _create_retry_decorator(llm, run_manager=run_manager)

-    @retry_decorator
+    @conditional_decorator(use_retry, retry_decorator)
     async def _completion_with_retry(**kwargs: Any) -> Any:
         return await fireworks.client.Completion.acreate(
             **kwargs,
@@ -187,8 +253,79 @@ async def acompletion_with_retry(
     return await _completion_with_retry(**kwargs)


+def completion_with_retry_batching(
+    llm: Fireworks,
+    use_retry: bool,
+    *,
+    run_manager: Optional[CallbackManagerForLLMRun] = None,
+    **kwargs: Any,
+) -> Any:
+    """Use tenacity to retry the completion call."""
+    import fireworks.client
+
+    prompt = kwargs["prompt"]
+    del kwargs["prompt"]
+
+    retry_decorator = _create_retry_decorator(llm, run_manager=run_manager)
+
+    @conditional_decorator(use_retry, retry_decorator)
+    def _completion_with_retry(prompt: str) -> Any:
+        return fireworks.client.Completion.create(**kwargs, prompt=prompt)
+
+    def batch_sync_run() -> List:
+        with ThreadPoolExecutor() as executor:
+            results = list(executor.map(_completion_with_retry, prompt))
+        return results
+
+    return batch_sync_run()
+
+
+async def acompletion_with_retry_batching(
+    llm: Fireworks,
+    use_retry: bool,
+    *,
+    run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
+    **kwargs: Any,
+) -> Any:
+    """Use tenacity to retry the completion call."""
+    import fireworks.client
+
+    prompt = kwargs["prompt"]
+    del kwargs["prompt"]
+
+    retry_decorator = _create_retry_decorator(llm, run_manager=run_manager)
+
+    @conditional_decorator(use_retry, retry_decorator)
+    async def _completion_with_retry(prompt: str) -> Any:
+        return await fireworks.client.Completion.acreate(**kwargs, prompt=prompt)
+
+    def run_coroutine_in_new_loop(
+        coroutine_func: Any, *args: Dict, **kwargs: Dict
+    ) -> Any:
+        new_loop = asyncio.new_event_loop()
+        try:
+            asyncio.set_event_loop(new_loop)
+            return new_loop.run_until_complete(coroutine_func(*args, **kwargs))
+        finally:
+            new_loop.close()
+
+    async def batch_sync_run() -> List:
+        with ThreadPoolExecutor() as executor:
+            results = list(
+                executor.map(
+                    run_coroutine_in_new_loop,
+                    [_completion_with_retry] * len(prompt),
+                    prompt,
+                )
+            )
+        return results
+
+    return await batch_sync_run()
+
+
 async def acompletion_with_retry_streaming(
     llm: Fireworks,
+    use_retry: bool,
     *,
     run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
     **kwargs: Any,
@@ -198,7 +335,7 @@ async def acompletion_with_retry_streaming(

     retry_decorator = _create_retry_decorator(llm, run_manager=run_manager)

-    @retry_decorator
+    @conditional_decorator(use_retry, retry_decorator)
     async def _completion_with_retry(**kwargs: Any) -> Any:
         return fireworks.client.Completion.acreate(
             **kwargs,
@@ -219,6 +356,8 @@ def _create_retry_decorator(

     errors = [
         fireworks.client.error.RateLimitError,
+        fireworks.client.error.InternalServerError,
+        fireworks.client.error.BadGatewayError,
         fireworks.client.error.ServiceUnavailableError,
     ]
     return create_base_retry_decorator(
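The batching path added above has two parts: get_batch_prompts slices the prompt list into chunks of batch_size, and completion_with_retry_batching fans each chunk out over a ThreadPoolExecutor (the async variant gives each worker thread its own event loop via run_coroutine_in_new_loop rather than awaiting the coroutines with asyncio.gather). create_llm_result then maps the flat response list back to one Generation per prompt. A minimal sketch of that shape follows; fake_completion is a placeholder for the fireworks.client.Completion.create call and is not part of the diff.

# Hedged sketch of the batching shape, with a placeholder in place of the API call.
from concurrent.futures import ThreadPoolExecutor
from typing import List

BATCH_SIZE = 2  # analogous to the new Fireworks.batch_size field


def get_batch_prompts(prompts: List[str]) -> List[List[str]]:
    # Same slicing as the new Fireworks.get_batch_prompts helper.
    return [prompts[i : i + BATCH_SIZE] for i in range(0, len(prompts), BATCH_SIZE)]


def fake_completion(prompt: str) -> str:
    # Placeholder for fireworks.client.Completion.create(...).
    return f"completion for: {prompt}"


def run_batch(prompts: List[str]) -> List[str]:
    results: List[str] = []
    for sub in get_batch_prompts(prompts):
        # Each sub-batch is fanned out across threads, mirroring batch_sync_run();
        # executor.map preserves the input order, so results line up with prompts.
        with ThreadPoolExecutor() as executor:
            results.extend(executor.map(fake_completion, sub))
    return results


print(run_batch(["a", "b", "c", "d", "e"]))  # five completions, in prompt order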
Changes to the ChatFireworks integration tests:

@@ -3,11 +3,7 @@
 import pytest

 from langchain.chat_models.fireworks import ChatFireworks
-from langchain.schema import (
-    ChatGeneration,
-    ChatResult,
-    LLMResult,
-)
+from langchain.schema import ChatGeneration, ChatResult, LLMResult
 from langchain.schema.messages import BaseMessage, HumanMessage, SystemMessage


@@ -72,6 +68,64 @@ def test_chat_fireworks_llm_output_contains_model_id() -> None:
     assert llm_result.llm_output["model"] == chat.model


+def test_fireworks_invoke() -> None:
+    """Tests chat completion with invoke"""
+    chat = ChatFireworks()
+    result = chat.invoke("How is the weather in New York today?", stop=[","])
+    assert isinstance(result.content, str)
+    assert result.content[-1] == ","
+
+
+@pytest.mark.asyncio
+async def test_fireworks_ainvoke() -> None:
+    """Tests chat completion with invoke"""
+    chat = ChatFireworks()
+    result = await chat.ainvoke("How is the weather in New York today?", stop=[","])
+    assert isinstance(result.content, str)
+    assert result.content[-1] == ","
+
+
+def test_fireworks_batch() -> None:
+    """Test batch tokens from ChatFireworks."""
+    chat = ChatFireworks()
+    result = chat.batch(
+        [
+            "What is the weather in Redwood City, CA today",
+            "What is the weather in Redwood City, CA today",
+            "What is the weather in Redwood City, CA today",
+            "What is the weather in Redwood City, CA today",
+            "What is the weather in Redwood City, CA today",
+            "What is the weather in Redwood City, CA today",
+        ],
+        config={"max_concurrency": 5},
+        stop=[","],
+    )
+    for token in result:
+        assert isinstance(token.content, str)
+        assert token.content[-1] == ","
+
+
+@pytest.mark.asyncio
+async def test_fireworks_abatch() -> None:
+    """Test batch tokens from ChatFireworks."""
+    chat = ChatFireworks()
+    result = await chat.abatch(
+        [
+            "What is the weather in Redwood City, CA today",
+            "What is the weather in Redwood City, CA today",
+            "What is the weather in Redwood City, CA today",
+            "What is the weather in Redwood City, CA today",
+            "What is the weather in Redwood City, CA today",
+            "What is the weather in Redwood City, CA today",
+        ],
+        config={"max_concurrency": 5},
+        stop=[","],
+    )
+    for token in result:
+        assert isinstance(token.content, str)
+        assert token.content[-1] == ","
+
+
 def test_fireworks_streaming() -> None:
     """Test streaming tokens from Fireworks."""
     llm = ChatFireworks()
@@ -80,6 +134,17 @@ def test_fireworks_streaming() -> None:
         assert isinstance(token.content, str)


+def test_fireworks_streaming_stop_words() -> None:
+    """Test streaming tokens with stop words."""
+    llm = ChatFireworks()
+
+    last_token = ""
+    for token in llm.stream("I'm Pickle Rick", stop=[","]):
+        last_token = token.content
+        assert isinstance(token.content, str)
+    assert last_token[-1] == ","
+
+
 @pytest.mark.asyncio
 async def test_chat_fireworks_agenerate() -> None:
     """Test ChatFireworks wrapper with generate."""
@@ -101,5 +166,10 @@ async def test_fireworks_astream() -> None:
     """Test streaming tokens from Fireworks."""
     llm = ChatFireworks()

-    async for token in llm.astream("Who's the best quarterback in the NFL?"):
+    last_token = ""
+    async for token in llm.astream(
+        "Who's the best quarterback in the NFL?", stop=[","]
+    ):
+        last_token = token.content
         assert isinstance(token.content, str)
+    assert last_token[-1] == ","
Changes to the Fireworks LLM integration tests:

@@ -16,7 +16,7 @@ from langchain.schema import LLMResult
 def test_fireworks_call() -> None:
     """Test valid call to fireworks."""
     llm = Fireworks()
-    output = llm("Who's the best quarterback in the NFL?")
+    output = llm("How is the weather in New York today?")
     assert isinstance(output, str)


@@ -41,6 +41,60 @@ def test_fireworks_model_param() -> None:
     assert llm.model == "foo"


+def test_fireworks_invoke() -> None:
+    """Tests completion with invoke"""
+    llm = Fireworks()
+    output = llm.invoke("How is the weather in New York today?", stop=[","])
+    assert isinstance(output, str)
+    assert output[-1] == ","
+
+
+@pytest.mark.asyncio
+async def test_fireworks_ainvoke() -> None:
+    """Tests completion with invoke"""
+    llm = Fireworks()
+    output = await llm.ainvoke("How is the weather in New York today?", stop=[","])
+    assert isinstance(output, str)
+    assert output[-1] == ","
+
+
+def test_fireworks_batch() -> None:
+    """Tests completion with invoke"""
+    llm = Fireworks()
+    output = llm.batch(
+        [
+            "How is the weather in New York today?",
+            "How is the weather in New York today?",
+            "How is the weather in New York today?",
+            "How is the weather in New York today?",
+            "How is the weather in New York today?",
+        ],
+        stop=[","],
+    )
+    for token in output:
+        assert isinstance(token, str)
+        assert token[-1] == ","
+
+
+@pytest.mark.asyncio
+async def test_fireworks_abatch() -> None:
+    """Tests completion with invoke"""
+    llm = Fireworks()
+    output = await llm.abatch(
+        [
+            "How is the weather in New York today?",
+            "How is the weather in New York today?",
+            "How is the weather in New York today?",
+            "How is the weather in New York today?",
+            "How is the weather in New York today?",
+        ],
+        stop=[","],
+    )
+    for token in output:
+        assert isinstance(token, str)
+        assert token[-1] == ","
+
+
 def test_fireworks_multiple_prompts() -> None:
     """Test completion with multiple prompts."""
     llm = Fireworks()
@@ -60,13 +114,31 @@ def test_fireworks_streaming() -> None:
         assert isinstance(token, str)


+def test_fireworks_streaming_stop_words() -> None:
+    """Test stream completion with stop words."""
+    llm = Fireworks()
+    generator = llm.stream("Who's the best quarterback in the NFL?", stop=[","])
+    assert isinstance(generator, Generator)
+
+    last_token = ""
+    for token in generator:
+        last_token = token
+        assert isinstance(token, str)
+    assert last_token[-1] == ","
+
+
 @pytest.mark.asyncio
 async def test_fireworks_streaming_async() -> None:
     """Test stream completion."""
     llm = Fireworks()

-    async for token in llm.astream("Who's the best quarterback in the NFL?"):
+    last_token = ""
+    async for token in llm.astream(
+        "Who's the best quarterback in the NFL?", stop=[","]
+    ):
+        last_token = token
         assert isinstance(token, str)
+    assert last_token[-1] == ","


 @pytest.mark.asyncio