mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-14 19:42:45 +00:00
community[minor]: add additional support for BigQueryVectorSearch
(#15904)
BigQuery vector search lets you use GoogleSQL to do semantic search, using vector indexes for fast but approximate results, or using brute force for exact results. This PR: 1. Adds `metadata["__job_id"]` to every Document returned by a similarity search. 2. Adds `explore_job_stats` so users can explore job statistics, which improves debuggability. 3. Sets a minimum row count that the table must reach before a vector index is created.
This commit is contained in:
parent
8799b028a6
commit
ce7723c1e5
@ -324,6 +324,24 @@
|
|||||||
"docs = store.similarity_search_by_vector(query_vector, filter={\"len\": 6})\n",
|
"docs = store.similarity_search_by_vector(query_vector, filter={\"len\": 6})\n",
|
||||||
"print(docs)"
|
"print(docs)"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Explore job statistics with BigQuery Job Id"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"job_id = \"\" # @param {type:\"string\"}\n",
|
||||||
|
"# Debug and explore the job statistics with a BigQuery Job id.\n",
|
||||||
|
"store.explore_job_stats(job_id)"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
@ -28,6 +28,7 @@ DEFAULT_METADATA_COLUMN_NAME = "metadata" # document metadata
|
|||||||
DEFAULT_CONTENT_COLUMN_NAME = "content" # text content, do not rename
|
DEFAULT_CONTENT_COLUMN_NAME = "content" # text content, do not rename
|
||||||
DEFAULT_TOP_K = 4 # default number of documents returned from similarity search
|
DEFAULT_TOP_K = 4 # default number of documents returned from similarity search
|
||||||
|
|
||||||
|
_MIN_INDEX_ROWS = 5000 # minimal number of rows for creating an index
|
||||||
_INDEX_CHECK_PERIOD_SECONDS = 60 # Do not check for index more often than this.
|
_INDEX_CHECK_PERIOD_SECONDS = 60 # Do not check for index more often than this.
|
||||||
|
|
||||||
_vector_table_lock = Lock() # process-wide BigQueryVectorSearch table lock
|
_vector_table_lock = Lock() # process-wide BigQueryVectorSearch table lock
|
||||||
@ -192,6 +193,11 @@ class BigQueryVectorSearch(VectorStore):
|
|||||||
if self._have_index or self._creating_index:
|
if self._have_index or self._creating_index:
|
||||||
# Already have an index or in the process of creating one.
|
# Already have an index or in the process of creating one.
|
||||||
return
|
return
|
||||||
|
table = self.bq_client.get_table(self.vectors_table)
|
||||||
|
if (table.num_rows or 0) < _MIN_INDEX_ROWS:
|
||||||
|
# Not enough rows to create index.
|
||||||
|
self._logger.debug("Not enough rows to create a vector index.")
|
||||||
|
return
|
||||||
if (
|
if (
|
||||||
datetime.utcnow() - self._last_index_check
|
datetime.utcnow() - self._last_index_check
|
||||||
).total_seconds() < _INDEX_CHECK_PERIOD_SECONDS:
|
).total_seconds() < _INDEX_CHECK_PERIOD_SECONDS:
|
||||||
@ -228,6 +234,10 @@ class BigQueryVectorSearch(VectorStore):
|
|||||||
def _create_index(self):
|
def _create_index(self):
|
||||||
from google.api_core.exceptions import ClientError
|
from google.api_core.exceptions import ClientError
|
||||||
|
|
||||||
|
table = self.bq_client.get_table(self.vectors_table)
|
||||||
|
if (table.num_rows or 0) < _MIN_INDEX_ROWS:
|
||||||
|
# Not enough rows to create index.
|
||||||
|
return
|
||||||
if self.distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
|
if self.distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
|
||||||
distance_type = "EUCLIDEAN"
|
distance_type = "EUCLIDEAN"
|
||||||
elif self.distance_strategy == DistanceStrategy.COSINE:
|
elif self.distance_strategy == DistanceStrategy.COSINE:
|
||||||
@ -534,6 +544,7 @@ class BigQueryVectorSearch(VectorStore):
|
|||||||
else:
|
else:
|
||||||
metadata = {}
|
metadata = {}
|
||||||
metadata["__id"] = row[self.doc_id_field]
|
metadata["__id"] = row[self.doc_id_field]
|
||||||
|
metadata["__job_id"] = job.job_id
|
||||||
doc = Document(page_content=row[self.content_field], metadata=metadata)
|
doc = Document(page_content=row[self.content_field], metadata=metadata)
|
||||||
document_tuples.append(
|
document_tuples.append(
|
||||||
(doc, row[self.text_embedding_field], row["_vector_search_distance"])
|
(doc, row[self.text_embedding_field], row["_vector_search_distance"])
|
||||||
@ -833,3 +844,14 @@ class BigQueryVectorSearch(VectorStore):
|
|||||||
vs_obj = BigQueryVectorSearch(embedding=embedding, **kwargs)
|
vs_obj = BigQueryVectorSearch(embedding=embedding, **kwargs)
|
||||||
vs_obj.add_texts(texts, metadatas)
|
vs_obj.add_texts(texts, metadatas)
|
||||||
return vs_obj
|
return vs_obj
|
||||||
|
|
||||||
|
def explore_job_stats(self, job_id: str) -> Dict:
|
||||||
|
"""Return the statistics for a single job execution.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
job_id: The BigQuery Job id.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A dictionary of job statistics for a given job.
|
||||||
|
"""
|
||||||
|
return self.bq_client.get_job(job_id)._properties["statistics"]
|
||||||
|
Loading…
Reference in New Issue
Block a user