community[minor]: add additional support for BigQueryVectorSearch (#15904)

BigQuery vector search lets you use GoogleSQL to do semantic search, using vector indexes for fast but approximate results, or using brute force for exact results. This PR: 1. Adds `metadata["__job_id"]` to each Document returned by any similarity search. 2. Adds `explore_job_stats` so users can explore job statistics, improving debuggability. 3. Sets the minimum row limit required before creating a vector index.
2025-12-23 07:56:46 +00:00 · 2024-01-15 11:45:15 -07:00
parent 8799b028a6
commit ce7723c1e5
2 changed files with 40 additions and 0 deletions
--- a/docs/docs/integrations/vectorstores/bigquery_vector_search.ipynb
+++ b/docs/docs/integrations/vectorstores/bigquery_vector_search.ipynb
@@ -324,6 +324,24 @@
    "docs = store.similarity_search_by_vector(query_vector, filter={\"len\": 6})\n",
    "print(docs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Explore job statistics with BigQuery Job Id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "job_id = \"\"  # @param {type:\"string\"}\n",
    "# Debug and explore the job statistics with a BigQuery Job id.\n",
    "store.explore_job_stats(job_id)"
   ]
  }
 ],
 "metadata": {
--- a/libs/community/langchain_community/vectorstores/bigquery_vector_search.py
+++ b/libs/community/langchain_community/vectorstores/bigquery_vector_search.py
@@ -28,6 +28,7 @@ DEFAULT_METADATA_COLUMN_NAME = "metadata"  # document metadata
 DEFAULT_CONTENT_COLUMN_NAME = "content"  # text content, do not rename
 DEFAULT_TOP_K = 4  # default number of documents returned from similarity search
 _MIN_INDEX_ROWS = 5000  # minimal number of rows for creating an index
 _INDEX_CHECK_PERIOD_SECONDS = 60  # Do not check for index more often than this.
 _vector_table_lock = Lock()  # process-wide BigQueryVectorSearch table lock
@@ -192,6 +193,11 @@ class BigQueryVectorSearch(VectorStore):
        if self._have_index or self._creating_index:
            # Already have an index or in the process of creating one.
            return
        table = self.bq_client.get_table(self.vectors_table)
        if (table.num_rows or 0) < _MIN_INDEX_ROWS:
            # Not enough rows to create index.
            self._logger.debug("Not enough rows to create a vector index.")
            return
        if (
            datetime.utcnow() - self._last_index_check
        ).total_seconds() < _INDEX_CHECK_PERIOD_SECONDS:
@@ -228,6 +234,10 @@ class BigQueryVectorSearch(VectorStore):
    def _create_index(self):
        from google.api_core.exceptions import ClientError
        table = self.bq_client.get_table(self.vectors_table)
        if (table.num_rows or 0) < _MIN_INDEX_ROWS:
            # Not enough rows to create index.
            return
        if self.distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
            distance_type = "EUCLIDEAN"
        elif self.distance_strategy == DistanceStrategy.COSINE:
@@ -534,6 +544,7 @@ class BigQueryVectorSearch(VectorStore):
            else:
                metadata = {}
            metadata["__id"] = row[self.doc_id_field]
            metadata["__job_id"] = job.job_id
            doc = Document(page_content=row[self.content_field], metadata=metadata)
            document_tuples.append(
                (doc, row[self.text_embedding_field], row["_vector_search_distance"])
@@ -833,3 +844,14 @@ class BigQueryVectorSearch(VectorStore):
        vs_obj = BigQueryVectorSearch(embedding=embedding, **kwargs)
        vs_obj.add_texts(texts, metadatas)
        return vs_obj
    def explore_job_stats(self, job_id: str) -> Dict:
        """Look up a BigQuery job and return its statistics.

        Args:
            job_id: The BigQuery Job id.

        Returns:
            The ``statistics`` entry of the job resource fetched through
            ``bq_client.get_job``.
        """
        job = self.bq_client.get_job(job_id)
        # The statistics are read from the job's underlying resource mapping.
        job_resource = job._properties
        return job_resource["statistics"]