From a63cee04ac4374136c9caaebcc899e89ec877863 Mon Sep 17 00:00:00 2001 From: William De Vena <60664495+williamdevena@users.noreply.github.com> Date: Sun, 3 Mar 2024 23:15:11 +0100 Subject: [PATCH] nvidia-trt[patch]: Invoke callback prior to yielding token (#18446) ## PR title nvidia-trt[patch]: Invoke callback prior to yielding ## PR message - Description: Invoke on_llm_new_token callback prior to yielding token in _stream method. - Issue: https://github.com/langchain-ai/langchain/issues/16913 - Dependencies: None --- libs/partners/nvidia-trt/langchain_nvidia_trt/llms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/partners/nvidia-trt/langchain_nvidia_trt/llms.py b/libs/partners/nvidia-trt/langchain_nvidia_trt/llms.py index 0ea1fca1df8..161da60ba57 100644 --- a/libs/partners/nvidia-trt/langchain_nvidia_trt/llms.py +++ b/libs/partners/nvidia-trt/langchain_nvidia_trt/llms.py @@ -176,9 +176,9 @@ class TritonTensorRTLLM(BaseLLM): result_queue = self._invoke_triton(self.model_name, inputs, outputs, stop_words) for token in result_queue: - yield GenerationChunk(text=token) if run_manager: run_manager.on_llm_new_token(token) + yield GenerationChunk(text=token) self.client.stop_stream()