feat: add Google BigQueryVectorSearch in vectorstore (#14829)

BigQuery vector search lets you use GoogleSQL to do semantic search,
using vector indexes for fast but approximate results, or using brute
force for exact results.

This PR integrates LangChain vectorstore with BigQuery Vector Search.

<!-- Thank you for contributing to LangChain!

Replace this entire comment with:
  - **Description:** a description of the change, 
  - **Issue:** the issue # it fixes (if applicable),
  - **Dependencies:** any dependencies required for this change,
- **Tag maintainer:** for a quicker response, tag the relevant
maintainer (see below),
- **Twitter handle:** we announce bigger features on Twitter. If your PR
gets announced, and you'd like a mention, we'll gladly shout you out!

Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` to check this
locally.

See contribution guidelines for more information on how to write/run
tests, lint, etc:
https://python.langchain.com/docs/contributing/

If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in the `docs/extras`
directory.

If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
 -->

---------

Co-authored-by: Vlad Kolesnikov <vladkol@google.com>
This commit is contained in:
Ashley Xu
2024-01-02 15:57:14 -08:00
committed by GitHub
parent 02f59c2035
commit 0ce7858529
5 changed files with 1322 additions and 0 deletions

View File

@@ -0,0 +1,102 @@
"""Test BigQuery Vector Search.
In order to run this test, you need to install Google Cloud BigQuery SDK
pip install google-cloud-bigquery
Your end-user credentials would be used to make the calls (make sure you've run
`gcloud auth login` first).
"""
import os
import uuid
import pytest
from langchain_community.vectorstores.bigquery_vector_search import BigQueryVectorSearch
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
TEST_TABLE_NAME = "langchain_test_table"
@pytest.fixture(scope="class")
def store(request: pytest.FixtureRequest) -> BigQueryVectorSearch:
"""BigQueryVectorStore tests context.
In order to run this test, you define PROJECT environment variable
with GCP project id.
Example:
export PROJECT=...
"""
from google.cloud import bigquery
bigquery.Client(location="US").create_dataset(
TestBigQueryVectorStore.dataset_name, exists_ok=True
)
TestBigQueryVectorStore.store = BigQueryVectorSearch(
project_id=os.environ.get("PROJECT", None),
embedding=FakeEmbeddings(),
dataset_name=TestBigQueryVectorStore.dataset_name,
table_name=TEST_TABLE_NAME,
)
TestBigQueryVectorStore.store.add_texts(
TestBigQueryVectorStore.texts, TestBigQueryVectorStore.metadatas
)
def teardown() -> None:
bigquery.Client(location="US").delete_dataset(
TestBigQueryVectorStore.dataset_name,
delete_contents=True,
not_found_ok=True,
)
request.addfinalizer(teardown)
return TestBigQueryVectorStore.store
class TestBigQueryVectorStore:
    """Integration tests for ``BigQueryVectorSearch``.

    The class-scoped ``store`` fixture provisions a uniquely named BigQuery
    dataset, loads the sample corpus below, and drops the dataset when the
    class finishes.
    """

    # Randomized per test session so concurrent runs against the same GCP
    # project don't collide.
    dataset_name = uuid.uuid4().hex
    store: BigQueryVectorSearch
    # Sample corpus: two fruits, two treats, one planet.
    texts = ["apple", "ice cream", "Saturn", "candy", "banana"]
    metadatas = [
        {"kind": "fruit"},
        {"kind": "treat"},
        {"kind": "planet"},
        {"kind": "treat"},
        {"kind": "fruit"},
    ]

    @staticmethod
    def _kinds(docs) -> list:
        """Return the ``kind`` metadata value of each returned document."""
        return [doc.metadata["kind"] for doc in docs]

    def test_semantic_search(self, store: BigQueryVectorSearch) -> None:
        """Semantic similarity: food-like docs appear in the top 4, the planet does not."""
        docs = store.similarity_search("food", k=4)
        kinds = self._kinds(docs)
        assert "fruit" in kinds
        assert "treat" in kinds
        assert "planet" not in kinds

    def test_semantic_search_filter_fruits(self, store: BigQueryVectorSearch) -> None:
        """Semantic similarity restricted by a metadata filter."""
        docs = store.similarity_search("food", filter={"kind": "fruit"})
        kinds = self._kinds(docs)
        assert "fruit" in kinds
        assert "treat" not in kinds
        assert "planet" not in kinds

    def test_get_doc_by_filter(self, store: BigQueryVectorSearch) -> None:
        """Plain document retrieval by metadata filter (no similarity query)."""
        docs = store.get_documents(filter={"kind": "fruit"})
        kinds = self._kinds(docs)
        assert "fruit" in kinds
        assert "treat" not in kinds
        assert "planet" not in kinds