From 9246ec6b3672bd57ee5b230d48ff9e24b1ab1756 Mon Sep 17 00:00:00 2001 From: Barun Amalkumar Halder Date: Mon, 25 Mar 2024 17:28:05 -0700 Subject: [PATCH] community[patch] : [Fiddler] ensure dataset is not added if model is present (#19293) **Description:** - minor PR to speed up onboarding by not trying to add a dataset if a model is already present. - replace the batch publish API with the streaming publish API when a single event is published. **Dependencies:** none **Twitter handle:** behalder Co-authored-by: Barun Halder --- .../callbacks/fiddler_callback.py | 79 ++++++++++--------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/libs/community/langchain_community/callbacks/fiddler_callback.py b/libs/community/langchain_community/callbacks/fiddler_callback.py index 3f5edefde23..c8dc9f0494e 100644 --- a/libs/community/langchain_community/callbacks/fiddler_callback.py +++ b/libs/community/langchain_community/callbacks/fiddler_callback.py @@ -131,43 +131,42 @@ class FiddlerCallbackHandler(BaseCallbackHandler): dataset_info.columns[i].data_type = self.fdl.DataType.CATEGORY dataset_info.columns[i].possible_values = [SUCCESS, FAILURE] - if self.model not in self.fiddler_client.get_dataset_names(self.project): - print( # noqa: T201 - f"adding dataset {self.model} to project {self.project}." - "This only has to be done once." - ) - try: - self.fiddler_client.upload_dataset( - project_id=self.project, - dataset_id=self.model, - dataset={"train": self._df}, - info=dataset_info, - ) - except Exception as e: - print( # noqa: T201 - f"Error adding dataset {self.model}: {e}." - "Fiddler integration will not work." 
- ) - raise e - - model_info = self.fdl.ModelInfo.from_dataset_info( - dataset_info=dataset_info, - dataset_id="train", - model_task=self.fdl.ModelTask.LLM, - features=[PROMPT, CONTEXT, RESPONSE], - target=FEEDBACK, - metadata_cols=[ - RUN_ID, - TOTAL_TOKENS, - PROMPT_TOKENS, - COMPLETION_TOKENS, - MODEL_NAME, - DURATION, - ], - custom_features=self.custom_features, - ) - if self.model not in self.fiddler_client.get_model_names(self.project): + if self.model not in self.fiddler_client.get_dataset_names(self.project): + print( # noqa: T201 + f"adding dataset {self.model} to project {self.project}." + "This only has to be done once." + ) + try: + self.fiddler_client.upload_dataset( + project_id=self.project, + dataset_id=self.model, + dataset={"train": self._df}, + info=dataset_info, + ) + except Exception as e: + print( # noqa: T201 + f"Error adding dataset {self.model}: {e}." + "Fiddler integration will not work." + ) + raise e + + model_info = self.fdl.ModelInfo.from_dataset_info( + dataset_info=dataset_info, + dataset_id="train", + model_task=self.fdl.ModelTask.LLM, + features=[PROMPT, CONTEXT, RESPONSE], + target=FEEDBACK, + metadata_cols=[ + RUN_ID, + TOTAL_TOKENS, + PROMPT_TOKENS, + COMPLETION_TOKENS, + MODEL_NAME, + DURATION, + ], + custom_features=self.custom_features, + ) print( # noqa: T201 f"adding model {self.model} to project {self.project}." "This only has to be done once." # noqa: T201 @@ -286,7 +285,13 @@ class FiddlerCallbackHandler(BaseCallbackHandler): df[key] = [value] * prompt_count if isinstance(value, int) else value try: - self.fiddler_client.publish_events_batch(self.project, self.model, df) + if df.shape[0] > 1: + self.fiddler_client.publish_events_batch(self.project, self.model, df) + else: + df_dict = df.to_dict(orient="records") + self.fiddler_client.publish_event( + self.project, self.model, event=df_dict[0] + ) except Exception as e: print( # noqa: T201 f"Error publishing events to fiddler: {e}. continuing..."