From 69d9eae5cd89fc57781bbb9fe68b656f35e02870 Mon Sep 17 00:00:00 2001
From: Holt Skinner <13262395+holtskinner@users.noreply.github.com>
Date: Tue, 24 Oct 2023 10:49:11 -0500
Subject: [PATCH] feat: Add Client Info to available Google Cloud Clients (#12168)

- This is used internally to gather aggregate usage metrics for the LangChain integrations
- Note: This cannot be added to some of the Vertex AI integrations at this time because the SDK doesn't allow overriding the [`ClientInfo`](https://googleapis.dev/python/google-api-core/latest/client_info.html#module-google.api_core.client_info)
- Added to:
  - BigQuery
  - Google Cloud Storage
  - Document AI
  - Vertex AI Model Garden
  - Document AI Warehouse
  - Vertex AI Search
  - Vertex AI Matching Engine (Cloud Storage Client)

@baskaryan, @eyurtsev, @hwchase17

---------

Co-authored-by: Eugene Yurtsev
---
 .../langchain/document_loaders/bigquery.py    |  7 ++++-
 .../document_loaders/gcs_directory.py         |  8 +++--
 .../langchain/document_loaders/gcs_file.py    |  5 +++-
 .../document_loaders/parsers/docai.py         |  6 +++-
 libs/langchain/langchain/llms/vertexai.py     |  8 +++--
 .../google_cloud_documentai_warehouse.py      | 19 ++++++------
 .../retrievers/google_vertex_ai_search.py     |  9 ++++--
 .../langchain/langchain/utilities/vertexai.py | 29 +++++++++++++++++++
 .../langchain/vectorstores/matching_engine.py |  7 ++++-
 9 files changed, 79 insertions(+), 19 deletions(-)

diff --git a/libs/langchain/langchain/document_loaders/bigquery.py b/libs/langchain/langchain/document_loaders/bigquery.py
index 4b9be0ca60d..37f5440f9fd 100644
--- a/libs/langchain/langchain/document_loaders/bigquery.py
+++ b/libs/langchain/langchain/document_loaders/bigquery.py
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, List, Optional
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
+from langchain.utilities.vertexai import get_client_info
 
 if TYPE_CHECKING:
     from google.auth.credentials import Credentials
@@ -57,7 +58,11 @@ class BigQueryLoader(BaseLoader):
                 "Please install it with `pip install google-cloud-bigquery`."
             ) from ex
 
-        bq_client = bigquery.Client(credentials=self.credentials, project=self.project)
+        bq_client = bigquery.Client(
+            credentials=self.credentials,
+            project=self.project,
+            client_info=get_client_info(module="bigquery"),
+        )
         if not bq_client.project:
             error_desc = (
                 "GCP project for Big Query is not set! Either provide a "
diff --git a/libs/langchain/langchain/document_loaders/gcs_directory.py b/libs/langchain/langchain/document_loaders/gcs_directory.py
index d51a0fbc8e4..f427dcf5a28 100644
--- a/libs/langchain/langchain/document_loaders/gcs_directory.py
+++ b/libs/langchain/langchain/document_loaders/gcs_directory.py
@@ -3,6 +3,7 @@ from typing import Callable, List, Optional
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.gcs_file import GCSFileLoader
+from langchain.utilities.vertexai import get_client_info
 
 
 class GCSDirectoryLoader(BaseLoader):
@@ -18,7 +19,7 @@ class GCSDirectoryLoader(BaseLoader):
         """Initialize with bucket and key name.
 
         Args:
-            project_name: The name of the project for the GCS bucket.
+            project_name: The ID of the project for the GCS bucket.
             bucket: The name of the GCS bucket.
             prefix: The prefix of the GCS bucket.
             loader_func: A loader function that instantiates a loader based on a
@@ -39,7 +40,10 @@ class GCSDirectoryLoader(BaseLoader):
                 "Could not import google-cloud-storage python package. "
                 "Please install it with `pip install google-cloud-storage`."
             )
-        client = storage.Client(project=self.project_name)
+        client = storage.Client(
+            project=self.project_name,
+            client_info=get_client_info(module="google-cloud-storage"),
+        )
         docs = []
         for blob in client.list_blobs(self.bucket, prefix=self.prefix):
             # we shall just skip directories since GCSFileLoader creates
diff --git a/libs/langchain/langchain/document_loaders/gcs_file.py b/libs/langchain/langchain/document_loaders/gcs_file.py
index 5fd6519bf16..468812217cd 100644
--- a/libs/langchain/langchain/document_loaders/gcs_file.py
+++ b/libs/langchain/langchain/document_loaders/gcs_file.py
@@ -5,6 +5,7 @@ from typing import Callable, List, Optional
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
+from langchain.utilities.vertexai import get_client_info
 
 
 class GCSFileLoader(BaseLoader):
@@ -57,7 +58,9 @@ class GCSFileLoader(BaseLoader):
             )
 
         # Initialise a client
-        storage_client = storage.Client(self.project_name)
+        storage_client = storage.Client(
+            self.project_name, client_info=get_client_info("google-cloud-storage")
+        )
         # Create a bucket object for our bucket
         bucket = storage_client.get_bucket(self.bucket)
         # Create a blob object from the filepath
diff --git a/libs/langchain/langchain/document_loaders/parsers/docai.py b/libs/langchain/langchain/document_loaders/parsers/docai.py
index 32f9d9122c3..91345d94010 100644
--- a/libs/langchain/langchain/document_loaders/parsers/docai.py
+++ b/libs/langchain/langchain/document_loaders/parsers/docai.py
@@ -12,6 +12,7 @@ from typing import TYPE_CHECKING, Iterator, List, Optional, Sequence
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseBlobParser
 from langchain.document_loaders.blob_loaders import Blob
+from langchain.utilities.vertexai import get_client_info
 from langchain.utils.iter import batch_iterate
 
 if TYPE_CHECKING:
@@ -89,7 +90,10 @@ class DocAIParser(BaseBlobParser):
             options = ClientOptions(
                 api_endpoint=f"{location}-documentai.googleapis.com"
             )
-            self._client = DocumentProcessorServiceClient(client_options=options)
+            self._client = DocumentProcessorServiceClient(
+                client_options=options,
+                client_info=get_client_info(module="document-ai"),
+            )
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
         """Parses a blob lazily.
diff --git a/libs/langchain/langchain/llms/vertexai.py b/libs/langchain/langchain/llms/vertexai.py
index a2dc147edff..e8a998cde63 100644
--- a/libs/langchain/langchain/llms/vertexai.py
+++ b/libs/langchain/langchain/llms/vertexai.py
@@ -25,6 +25,7 @@ from langchain.schema import (
 )
 from langchain.schema.output import GenerationChunk
 from langchain.utilities.vertexai import (
+    get_client_info,
     init_vertexai,
     raise_vertex_import_error,
 )
@@ -370,9 +371,12 @@ class VertexAIModelGarden(_VertexAIBase, BaseLLM):
         client_options = ClientOptions(
             api_endpoint=f"{values['location']}-aiplatform.googleapis.com"
         )
-        values["client"] = PredictionServiceClient(client_options=client_options)
+        client_info = get_client_info(module="vertex-ai-model-garden")
+        values["client"] = PredictionServiceClient(
+            client_options=client_options, client_info=client_info
+        )
         values["async_client"] = PredictionServiceAsyncClient(
-            client_options=client_options
+            client_options=client_options, client_info=client_info
         )
         return values
 
diff --git a/libs/langchain/langchain/retrievers/google_cloud_documentai_warehouse.py b/libs/langchain/langchain/retrievers/google_cloud_documentai_warehouse.py
index 760f8362daa..a99f35264f2 100644
--- a/libs/langchain/langchain/retrievers/google_cloud_documentai_warehouse.py
+++ b/libs/langchain/langchain/retrievers/google_cloud_documentai_warehouse.py
@@ -5,6 +5,7 @@ from langchain.callbacks.manager import CallbackManagerForRetrieverRun
 from langchain.docstore.document import Document
 from langchain.pydantic_v1 import root_validator
 from langchain.schema import BaseRetriever
+from langchain.utilities.vertexai import get_client_info
 from langchain.utils import get_from_dict_or_env
 
 if TYPE_CHECKING:
@@ -29,23 +30,21 @@ class GoogleDocumentAIWarehouseRetriever(BaseRetriever):
     """
 
     location: str = "us"
-    "GCP location where DocAI Warehouse is placed."
+    """Google Cloud location where Document AI Warehouse is placed."""
     project_number: str
-    "GCP project number, should contain digits only."
+    """Google Cloud project number, should contain digits only."""
     schema_id: Optional[str] = None
-    "DocAI Warehouse schema to queary against. If nothing is provided, all documents "
-    "in the project will be searched."
+    """Document AI Warehouse schema to query against.
+    If nothing is provided, all documents in the project will be searched."""
    qa_size_limit: int = 5
-    "The limit on the number of documents returned."
+    """The limit on the number of documents returned."""
     client: "DocumentServiceClient" = None  #: :meta private:
 
     @root_validator()
     def validate_environment(cls, values: Dict) -> Dict:
         """Validates the environment."""
         try:  # noqa: F401
-            from google.cloud.contentwarehouse_v1 import (
-                DocumentServiceClient,
-            )
+            from google.cloud.contentwarehouse_v1 import DocumentServiceClient
         except ImportError as exc:
             raise ImportError(
                 "google.cloud.contentwarehouse is not installed."
@@ -55,7 +54,9 @@ class GoogleDocumentAIWarehouseRetriever(BaseRetriever):
         values["project_number"] = get_from_dict_or_env(
             values, "project_number", "PROJECT_NUMBER"
         )
-        values["client"] = DocumentServiceClient()
+        values["client"] = DocumentServiceClient(
+            client_info=get_client_info(module="document-ai-warehouse")
+        )
         return values
 
     def _prepare_request_metadata(self, user_ldap: str) -> "RequestMetadata":
diff --git a/libs/langchain/langchain/retrievers/google_vertex_ai_search.py b/libs/langchain/langchain/retrievers/google_vertex_ai_search.py
index e3b1ebfc7bf..53144ffb5f9 100644
--- a/libs/langchain/langchain/retrievers/google_vertex_ai_search.py
+++ b/libs/langchain/langchain/retrievers/google_vertex_ai_search.py
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence
 from langchain.callbacks.manager import CallbackManagerForRetrieverRun
 from langchain.pydantic_v1 import BaseModel, Extra, Field, root_validator
 from langchain.schema import BaseRetriever, Document
+from langchain.utilities.vertexai import get_client_info
 from langchain.utils import get_from_dict_or_env
 
 if TYPE_CHECKING:
@@ -260,7 +261,9 @@ class GoogleVertexAISearchRetriever(BaseRetriever, _BaseGoogleVertexAISearchRetr
         # For more information, refer to:
         # https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store
         self._client = SearchServiceClient(
-            credentials=self.credentials, client_options=self.client_options
+            credentials=self.credentials,
+            client_options=self.client_options,
+            client_info=get_client_info(module="vertex-ai-search"),
         )
 
         self._serving_config = self._client.serving_config_path(
@@ -387,7 +390,9 @@ class GoogleVertexAIMultiTurnSearchRetriever(
         )
 
         self._client = ConversationalSearchServiceClient(
-            credentials=self.credentials, client_options=self.client_options
+            credentials=self.credentials,
+            client_options=self.client_options,
+            client_info=get_client_info(module="vertex-ai-search"),
         )
 
         self._serving_config = self._client.serving_config_path(
diff --git a/libs/langchain/langchain/utilities/vertexai.py b/libs/langchain/langchain/utilities/vertexai.py
index 0df556307a6..334ab57f2ed 100644
--- a/libs/langchain/langchain/utilities/vertexai.py
+++ b/libs/langchain/langchain/utilities/vertexai.py
@@ -1,7 +1,9 @@
 """Utilities to init Vertex AI."""
+from importlib import metadata
 from typing import TYPE_CHECKING, Optional
 
 if TYPE_CHECKING:
+    from google.api_core.gapic_v1.client_info import ClientInfo
     from google.auth.credentials import Credentials
 
 
@@ -46,3 +48,30 @@ def init_vertexai(
         location=location,
         credentials=credentials,
     )
+
+
+def get_client_info(module: Optional[str] = None) -> "ClientInfo":
+    r"""Returns a custom user agent header.
+
+    Args:
+        module (Optional[str]):
+            Optional. The module for a custom user agent header.
+    Returns:
+        google.api_core.gapic_v1.client_info.ClientInfo
+    """
+    try:
+        from google.api_core.gapic_v1.client_info import ClientInfo
+    except ImportError as exc:
+        raise ImportError(
+            "Could not import ClientInfo. Please, install it with "
+            "pip install google-api-core"
+        ) from exc
+
+    langchain_version = metadata.version("langchain")
+    client_library_version = (
+        f"{langchain_version}-{module}" if module else langchain_version
+    )
+    return ClientInfo(
+        client_library_version=client_library_version,
+        user_agent=f"langchain/{client_library_version}",
+    )
diff --git a/libs/langchain/langchain/vectorstores/matching_engine.py b/libs/langchain/langchain/vectorstores/matching_engine.py
index cd0a26d9d6c..e60987f0b17 100644
--- a/libs/langchain/langchain/vectorstores/matching_engine.py
+++ b/libs/langchain/langchain/vectorstores/matching_engine.py
@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Type
 from langchain.schema.document import Document
 from langchain.schema.embeddings import Embeddings
 from langchain.schema.vectorstore import VectorStore
+from langchain.utilities.vertexai import get_client_info
 
 if TYPE_CHECKING:
     from google.cloud import storage
@@ -419,7 +420,11 @@ class MatchingEngine(VectorStore):
 
         from google.cloud import storage
 
-        return storage.Client(credentials=credentials, project=project_id)
+        return storage.Client(
+            credentials=credentials,
+            project=project_id,
+            client_info=get_client_info(module="vertex-ai-matching-engine"),
+        )
 
     @classmethod
     def _init_aiplatform(
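
For context, the sketch below shows how the new `get_client_info` helper is meant to be consumed by the clients touched in this patch. It assumes `langchain` and `google-cloud-bigquery` are installed; the project ID is a hypothetical placeholder, and the exact user agent string depends on the installed `langchain` version.

```python
from google.cloud import bigquery

from langchain.utilities.vertexai import get_client_info

# Build the ClientInfo introduced by this patch; its user agent embeds the
# installed langchain version plus the integration name passed as `module`.
client_info = get_client_info(module="bigquery")
print(client_info.user_agent)  # e.g. "langchain/<langchain version>-bigquery"

# The loaders and retrievers in this patch forward it to the Google Cloud client:
client = bigquery.Client(
    project="my-project",  # hypothetical project ID
    client_info=client_info,
)
```

Embedding the LangChain version and the per-integration module name in the user agent is what allows the aggregate usage metrics mentioned above to be attributed to each integration.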