diff --git a/libs/community/langchain_community/vectorstores/matching_engine.py b/libs/community/langchain_community/vectorstores/matching_engine.py index fce60648255..ae77aa5ad22 100644 --- a/libs/community/langchain_community/vectorstores/matching_engine.py +++ b/libs/community/langchain_community/vectorstores/matching_engine.py @@ -22,7 +22,7 @@ if TYPE_CHECKING: from langchain_community.embeddings import TensorflowHubEmbeddings -logger = logging.getLogger() +logger = logging.getLogger(__name__) class MatchingEngine(VectorStore): @@ -49,6 +49,8 @@ class MatchingEngine(VectorStore): gcs_client: storage.Client, gcs_bucket_name: str, credentials: Optional[Credentials] = None, + *, + document_id_key: Optional[str] = None, ): """Google Vertex AI Vector Search (previously Matching Engine) implementation of the vector store. @@ -78,6 +80,9 @@ class MatchingEngine(VectorStore): gcs_client: The GCS client. gcs_bucket_name: The GCS bucket name. credentials (Optional): Created GCP credentials. + document_id_key (Optional): Key for storing document ID in document + metadata. If None, document ID will not be returned in document + metadata. """ super().__init__() self._validate_google_libraries_installation() @@ -89,6 +94,7 @@ class MatchingEngine(VectorStore): self.gcs_client = gcs_client self.credentials = credentials self.gcs_bucket_name = gcs_bucket_name + self.document_id_key = document_id_key @property def embeddings(self) -> Embeddings: @@ -229,6 +235,7 @@ class MatchingEngine(VectorStore): List[Tuple[Document, float]]: List of documents most similar to the query text and cosine distance in float for each. Lower score represents more similarity. + """ filter = filter or [] @@ -255,19 +262,27 @@ class MatchingEngine(VectorStore): if len(response) == 0: return [] - results = [] + docs: List[Tuple[Document, float]] = [] # I'm only getting the first one because queries receives an array # and the similarity_search method only receives one query. This # means that the match method will always return an array with only # one element. - for doc in response[0]: - page_content = self._download_from_gcs(f"documents/{doc.id}") - results.append((Document(page_content=page_content), doc.distance)) + for result in response[0]: + page_content = self._download_from_gcs(f"documents/{result.id}") + # TODO: return all metadata. + metadata = {} + if self.document_id_key is not None: + metadata[self.document_id_key] = result.id + document = Document( + page_content=page_content, + metadata=metadata, + ) + docs.append((document, result.distance)) logger.debug("Downloaded documents for query.") - return results + return docs def similarity_search( self, @@ -382,6 +397,7 @@ class MatchingEngine(VectorStore): endpoint_id: str, credentials_path: Optional[str] = None, embedding: Optional[Embeddings] = None, + **kwargs: Any, ) -> "MatchingEngine": """Takes the object creation out of the constructor. @@ -397,6 +413,7 @@ class MatchingEngine(VectorStore): the local file system. embedding: The :class:`Embeddings` that will be used for embedding the texts. + kwargs: Additional keyword arguments to pass to MatchingEngine.__init__(). Returns: A configured MatchingEngine with the texts added to the index. @@ -419,6 +436,7 @@ class MatchingEngine(VectorStore): gcs_client=gcs_client, credentials=credentials, gcs_bucket_name=gcs_bucket_name, + **kwargs, ) @classmethod