From 69d9eae5cd89fc57781bbb9fe68b656f35e02870 Mon Sep 17 00:00:00 2001
From: Holt Skinner <13262395+holtskinner@users.noreply.github.com>
Date: Tue, 24 Oct 2023 10:49:11 -0500
Subject: [PATCH] feat: Add Client Info to available Google Cloud Clients (#12168)

- This is used internally to gather aggregate usage metrics for the LangChain integrations
- Note: This cannot be added to some of the Vertex AI integrations at this time because the SDK doesn't allow overriding the [`ClientInfo`](https://googleapis.dev/python/google-api-core/latest/client_info.html#module-google.api_core.client_info)
- Added to:
  - BigQuery
  - Google Cloud Storage
  - Document AI
  - Vertex AI Model Garden
  - Document AI Warehouse
  - Vertex AI Search
  - Vertex AI Matching Engine (Cloud Storage Client)

@baskaryan, @eyurtsev, @hwchase17

---------

Co-authored-by: Eugene Yurtsev
---
 .../langchain/document_loaders/bigquery.py    |  7 ++++-
 .../document_loaders/gcs_directory.py         |  8 +++--
 .../langchain/document_loaders/gcs_file.py    |  5 +++-
 .../document_loaders/parsers/docai.py         |  6 +++-
 libs/langchain/langchain/llms/vertexai.py     |  8 +++--
 .../google_cloud_documentai_warehouse.py      | 19 ++++++------
 .../retrievers/google_vertex_ai_search.py     |  9 ++++--
 .../langchain/langchain/utilities/vertexai.py | 29 +++++++++++++++++++
 .../langchain/vectorstores/matching_engine.py |  7 ++++-
 9 files changed, 79 insertions(+), 19 deletions(-)

diff --git a/libs/langchain/langchain/document_loaders/bigquery.py b/libs/langchain/langchain/document_loaders/bigquery.py
index 4b9be0ca60d..37f5440f9fd 100644
--- a/libs/langchain/langchain/document_loaders/bigquery.py
+++ b/libs/langchain/langchain/document_loaders/bigquery.py
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, List, Optional
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
+from langchain.utilities.vertexai import get_client_info
 
 if TYPE_CHECKING:
     from google.auth.credentials import Credentials
@@ -57,7 +58,11 @@ class BigQueryLoader(BaseLoader):
                 "Please install it with `pip install google-cloud-bigquery`."
             ) from ex
 
-        bq_client = bigquery.Client(credentials=self.credentials, project=self.project)
+        bq_client = bigquery.Client(
+            credentials=self.credentials,
+            project=self.project,
+            client_info=get_client_info(module="bigquery"),
+        )
         if not bq_client.project:
             error_desc = (
                 "GCP project for Big Query is not set! Either provide a "
diff --git a/libs/langchain/langchain/document_loaders/gcs_directory.py b/libs/langchain/langchain/document_loaders/gcs_directory.py
index d51a0fbc8e4..f427dcf5a28 100644
--- a/libs/langchain/langchain/document_loaders/gcs_directory.py
+++ b/libs/langchain/langchain/document_loaders/gcs_directory.py
@@ -3,6 +3,7 @@ from typing import Callable, List, Optional
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.gcs_file import GCSFileLoader
+from langchain.utilities.vertexai import get_client_info
 
 
 class GCSDirectoryLoader(BaseLoader):
@@ -18,7 +19,7 @@ class GCSDirectoryLoader(BaseLoader):
         """Initialize with bucket and key name.
 
         Args:
-            project_name: The name of the project for the GCS bucket.
+            project_name: The ID of the project for the GCS bucket.
             bucket: The name of the GCS bucket.
             prefix: The prefix of the GCS bucket.
             loader_func: A loader function that instantiates a loader based on a
@@ -39,7 +40,10 @@ class GCSDirectoryLoader(BaseLoader):
                 "Could not import google-cloud-storage python package. "
                 "Please install it with `pip install google-cloud-storage`."
             )
-        client = storage.Client(project=self.project_name)
+        client = storage.Client(
+            project=self.project_name,
+            client_info=get_client_info(module="google-cloud-storage"),
+        )
         docs = []
         for blob in client.list_blobs(self.bucket, prefix=self.prefix):
             # we shall just skip directories since GCSFileLoader creates
diff --git a/libs/langchain/langchain/document_loaders/gcs_file.py b/libs/langchain/langchain/document_loaders/gcs_file.py
index 5fd6519bf16..468812217cd 100644
--- a/libs/langchain/langchain/document_loaders/gcs_file.py
+++ b/libs/langchain/langchain/document_loaders/gcs_file.py
@@ -5,6 +5,7 @@ from typing import Callable, List, Optional
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
+from langchain.utilities.vertexai import get_client_info
 
 
 class GCSFileLoader(BaseLoader):
@@ -57,7 +58,9 @@ class GCSFileLoader(BaseLoader):
             )
 
         # Initialise a client
-        storage_client = storage.Client(self.project_name)
+        storage_client = storage.Client(
+            self.project_name, client_info=get_client_info("google-cloud-storage")
+        )
         # Create a bucket object for our bucket
         bucket = storage_client.get_bucket(self.bucket)
         # Create a blob object from the filepath
diff --git a/libs/langchain/langchain/document_loaders/parsers/docai.py b/libs/langchain/langchain/document_loaders/parsers/docai.py
index 32f9d9122c3..91345d94010 100644
--- a/libs/langchain/langchain/document_loaders/parsers/docai.py
+++ b/libs/langchain/langchain/document_loaders/parsers/docai.py
@@ -12,6 +12,7 @@ from typing import TYPE_CHECKING, Iterator, List, Optional, Sequence
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseBlobParser
 from langchain.document_loaders.blob_loaders import Blob
+from langchain.utilities.vertexai import get_client_info
 from langchain.utils.iter import batch_iterate
 
 if TYPE_CHECKING:
@@ -89,7 +90,10 @@ class DocAIParser(BaseBlobParser):
             options = ClientOptions(
                 api_endpoint=f"{location}-documentai.googleapis.com"
             )
-            self._client = DocumentProcessorServiceClient(client_options=options)
+            self._client = DocumentProcessorServiceClient(
+                client_options=options,
+                client_info=get_client_info(module="document-ai"),
+            )
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
         """Parses a blob lazily.
diff --git a/libs/langchain/langchain/llms/vertexai.py b/libs/langchain/langchain/llms/vertexai.py
index a2dc147edff..e8a998cde63 100644
--- a/libs/langchain/langchain/llms/vertexai.py
+++ b/libs/langchain/langchain/llms/vertexai.py
@@ -25,6 +25,7 @@ from langchain.schema import (
 )
 from langchain.schema.output import GenerationChunk
 from langchain.utilities.vertexai import (
+    get_client_info,
     init_vertexai,
     raise_vertex_import_error,
 )
@@ -370,9 +371,12 @@ class VertexAIModelGarden(_VertexAIBase, BaseLLM):
         client_options = ClientOptions(
             api_endpoint=f"{values['location']}-aiplatform.googleapis.com"
         )
-        values["client"] = PredictionServiceClient(client_options=client_options)
+        client_info = get_client_info(module="vertex-ai-model-garden")
+        values["client"] = PredictionServiceClient(
+            client_options=client_options, client_info=client_info
+        )
         values["async_client"] = PredictionServiceAsyncClient(
-            client_options=client_options
+            client_options=client_options, client_info=client_info
         )
         return values
 
diff --git a/libs/langchain/langchain/retrievers/google_cloud_documentai_warehouse.py b/libs/langchain/langchain/retrievers/google_cloud_documentai_warehouse.py
index 760f8362daa..a99f35264f2 100644
--- a/libs/langchain/langchain/retrievers/google_cloud_documentai_warehouse.py
+++ b/libs/langchain/langchain/retrievers/google_cloud_documentai_warehouse.py
@@ -5,6 +5,7 @@ from langchain.callbacks.manager import CallbackManagerForRetrieverRun
 from langchain.docstore.document import Document
 from langchain.pydantic_v1 import root_validator
 from langchain.schema import BaseRetriever
+from langchain.utilities.vertexai import get_client_info
 from langchain.utils import get_from_dict_or_env
 
 if TYPE_CHECKING:
@@ -29,23 +30,21 @@ class GoogleDocumentAIWarehouseRetriever(BaseRetriever):
     """
 
     location: str = "us"
-    "GCP location where DocAI Warehouse is placed."
+    """Google Cloud location where Document AI Warehouse is placed."""
     project_number: str
-    "GCP project number, should contain digits only."
+    """Google Cloud project number, should contain digits only."""
     schema_id: Optional[str] = None
-    "DocAI Warehouse schema to queary against. If nothing is provided, all documents "
-    "in the project will be searched."
+    """Document AI Warehouse schema to query against.
+    If nothing is provided, all documents in the project will be searched."""
    qa_size_limit: int = 5
-    "The limit on the number of documents returned."
+    """The limit on the number of documents returned."""
     client: "DocumentServiceClient" = None  #: :meta private:
 
     @root_validator()
     def validate_environment(cls, values: Dict) -> Dict:
         """Validates the environment."""
         try:  # noqa: F401
-            from google.cloud.contentwarehouse_v1 import (
-                DocumentServiceClient,
-            )
+            from google.cloud.contentwarehouse_v1 import DocumentServiceClient
         except ImportError as exc:
             raise ImportError(
                 "google.cloud.contentwarehouse is not installed."
@@ -55,7 +54,9 @@ class GoogleDocumentAIWarehouseRetriever(BaseRetriever):
         values["project_number"] = get_from_dict_or_env(
             values, "project_number", "PROJECT_NUMBER"
         )
-        values["client"] = DocumentServiceClient()
+        values["client"] = DocumentServiceClient(
+            client_info=get_client_info(module="document-ai-warehouse")
+        )
         return values
 
     def _prepare_request_metadata(self, user_ldap: str) -> "RequestMetadata":
diff --git a/libs/langchain/langchain/retrievers/google_vertex_ai_search.py b/libs/langchain/langchain/retrievers/google_vertex_ai_search.py
index e3b1ebfc7bf..53144ffb5f9 100644
--- a/libs/langchain/langchain/retrievers/google_vertex_ai_search.py
+++ b/libs/langchain/langchain/retrievers/google_vertex_ai_search.py
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence
 from langchain.callbacks.manager import CallbackManagerForRetrieverRun
 from langchain.pydantic_v1 import BaseModel, Extra, Field, root_validator
 from langchain.schema import BaseRetriever, Document
+from langchain.utilities.vertexai import get_client_info
 from langchain.utils import get_from_dict_or_env
 
 if TYPE_CHECKING:
@@ -260,7 +261,9 @@ class GoogleVertexAISearchRetriever(BaseRetriever, _BaseGoogleVertexAISearchRetr
         # For more information, refer to:
         # https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store
         self._client = SearchServiceClient(
-            credentials=self.credentials, client_options=self.client_options
+            credentials=self.credentials,
+            client_options=self.client_options,
+            client_info=get_client_info(module="vertex-ai-search"),
         )
 
         self._serving_config = self._client.serving_config_path(
@@ -387,7 +390,9 @@ class GoogleVertexAIMultiTurnSearchRetriever(
         )
 
         self._client = ConversationalSearchServiceClient(
-            credentials=self.credentials, client_options=self.client_options
+            credentials=self.credentials,
+            client_options=self.client_options,
+            client_info=get_client_info(module="vertex-ai-search"),
         )
 
         self._serving_config = self._client.serving_config_path(
diff --git a/libs/langchain/langchain/utilities/vertexai.py b/libs/langchain/langchain/utilities/vertexai.py
index 0df556307a6..334ab57f2ed 100644
--- a/libs/langchain/langchain/utilities/vertexai.py
+++ b/libs/langchain/langchain/utilities/vertexai.py
@@ -1,7 +1,9 @@
 """Utilities to init Vertex AI."""
+from importlib import metadata
 from typing import TYPE_CHECKING, Optional
 
 if TYPE_CHECKING:
+    from google.api_core.gapic_v1.client_info import ClientInfo
     from google.auth.credentials import Credentials
 
 
@@ -46,3 +48,30 @@ def init_vertexai(
         location=location,
         credentials=credentials,
     )
+
+
+def get_client_info(module: Optional[str] = None) -> "ClientInfo":
+    r"""Returns a custom user agent header.
+
+    Args:
+        module (Optional[str]):
+            Optional. The module for a custom user agent header.
+    Returns:
+        google.api_core.gapic_v1.client_info.ClientInfo
+    """
+    try:
+        from google.api_core.gapic_v1.client_info import ClientInfo
+    except ImportError as exc:
+        raise ImportError(
+            "Could not import ClientInfo. Please, install it with "
+            "pip install google-api-core"
+        ) from exc
+
+    langchain_version = metadata.version("langchain")
+    client_library_version = (
+        f"{langchain_version}-{module}" if module else langchain_version
+    )
+    return ClientInfo(
+        client_library_version=client_library_version,
+        user_agent=f"langchain/{client_library_version}",
+    )
diff --git a/libs/langchain/langchain/vectorstores/matching_engine.py b/libs/langchain/langchain/vectorstores/matching_engine.py
index cd0a26d9d6c..e60987f0b17 100644
--- a/libs/langchain/langchain/vectorstores/matching_engine.py
+++ b/libs/langchain/langchain/vectorstores/matching_engine.py
@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Type
 from langchain.schema.document import Document
 from langchain.schema.embeddings import Embeddings
 from langchain.schema.vectorstore import VectorStore
+from langchain.utilities.vertexai import get_client_info
 
 if TYPE_CHECKING:
     from google.cloud import storage
@@ -419,7 +420,11 @@ class MatchingEngine(VectorStore):
 
         from google.cloud import storage
 
-        return storage.Client(credentials=credentials, project=project_id)
+        return storage.Client(
+            credentials=credentials,
+            project=project_id,
+            client_info=get_client_info(module="vertex-ai-matching-engine"),
+        )
 
     @classmethod
     def _init_aiplatform(
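
For context, the sketch below shows how the new `get_client_info` helper is meant to be consumed by the clients touched in this patch. It assumes `langchain` and `google-cloud-bigquery` are installed; the project ID is a hypothetical placeholder, and the exact user agent string depends on the installed `langchain` version.

```python
from google.cloud import bigquery

from langchain.utilities.vertexai import get_client_info

# Build the ClientInfo introduced by this patch; its user agent embeds the
# installed langchain version plus the integration name passed as `module`.
client_info = get_client_info(module="bigquery")
print(client_info.user_agent)  # e.g. "langchain/<langchain version>-bigquery"

# The loaders and retrievers in this patch forward it to the Google Cloud client:
client = bigquery.Client(
    project="my-project",  # hypothetical project ID
    client_info=client_info,
)
```

Embedding the LangChain version and the per-integration module name in the user agent is what allows the aggregate usage metrics mentioned above to be attributed to each integration.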