Minor improvements to rockset vectorstore (#8416)

This PR makes minor improvements to our Python notebook and adds support for `Rockset` workspaces in our vectorstore client.
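
For context, a minimal sketch of how the new `workspace` argument might be used. This is illustrative only and not taken from the diff: the import paths, embedding model, and credential placeholders are assumptions.

```python
# Minimal usage sketch (not part of this PR's diff). Host/API key values are placeholders.
import rockset

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.rockset import Rockset

client = rockset.RocksetClient(host="<ROCKSET_API_SERVER>", api_key="<ROCKSET_API_KEY>")

vectorstore = Rockset(
    client=client,
    embeddings=OpenAIEmbeddings(),
    collection_name="langchain_demo",
    text_key="description",
    embedding_key="description_embedding",
    workspace="commons",  # new argument; defaults to "commons", so existing callers are unchanged
)

vectorstore.add_texts(["Rockset is a real-time analytics database"])
docs = vectorstore.similarity_search("real-time analytics", k=4)
```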

@rlancemartin, @eyurtsev

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
Author: Anubhav Bindlish
Date: 2023-07-31 17:54:59 +01:00 (committed via GitHub)
Commit: 913a156cff (parent 893f3014af)
4 changed files with 146 additions and 126 deletions


@@ -23,7 +23,6 @@ class Rockset(VectorStore):
See: https://rockset.com/blog/introducing-vector-search-on-rockset/ for more details
Everything below assumes `commons` Rockset workspace.
TODO: Add support for workspace args.
Example:
.. code-block:: python
@@ -50,6 +49,7 @@ class Rockset(VectorStore):
         collection_name: str,
         text_key: str,
         embedding_key: str,
+        workspace: str = "commons",
     ):
         """Initialize with Rockset client.
         Args:
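
For readers skimming the hunk: pieced together with the test constructor call later in this diff, the updated signature presumably ends up looking roughly like the sketch below. The `client` annotation and the surrounding lines are guesses, since they fall outside the hunk context.

```python
# Rough reconstruction of the updated constructor signature, for orientation only.
from typing import Any

from langchain.embeddings.base import Embeddings


def __init__(
    self,
    client: Any,
    embeddings: Embeddings,
    collection_name: str,
    text_key: str,
    embedding_key: str,
    workspace: str = "commons",  # new: Rockset workspace that contains the collection
):
    """Initialize with Rockset client."""
```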
@@ -82,6 +82,7 @@ class Rockset(VectorStore):
         self._embeddings = embeddings
         self._text_key = text_key
         self._embedding_key = embedding_key
+        self._workspace = workspace
     @property
     def embeddings(self) -> Embeddings:
@@ -303,7 +304,7 @@ class Rockset(VectorStore):
         where_str = f"WHERE {where_str}\n" if where_str else ""
         return f"""\
 SELECT * EXCEPT({self._embedding_key}), {distance_str}
-FROM {self._collection_name}
+FROM {self._workspace}.{self._collection_name}
 {where_str}\
 ORDER BY dist {distance_func.order_by()}
 LIMIT {str(k)}
@@ -311,7 +312,7 @@ LIMIT {str(k)}
     def _write_documents_to_rockset(self, batch: List[dict]) -> List[str]:
         add_doc_res = self._client.Documents.add_documents(
-            collection=self._collection_name, data=batch
+            collection=self._collection_name, data=batch, workspace=self._workspace
         )
         return [doc_status._id for doc_status in add_doc_res.data]
@@ -328,4 +329,5 @@ LIMIT {str(k)}
         self._client.Documents.delete_documents(
             collection=self._collection_name,
             data=[DeleteDocumentsRequestData(id=i) for i in ids],
+            workspace=self._workspace,
         )
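
Net effect of the vectorstore changes above: query building, document writes, and deletes are all scoped to `self._workspace` rather than implicitly to `commons`. As a hand-written illustration of the new query shape (placeholder column, workspace, and collection names, abbreviated vector; not captured output):

```python
# Illustration of the SQL now produced by _build_query_sql for a cosine-similarity search.
example_sql = """\
SELECT * EXCEPT(description_embedding), COSINE_SIM(description_embedding, [0.1,0.2,0.3]) as dist
FROM commons.langchain_demo
ORDER BY dist DESC
LIMIT 4
"""
```

When a filter is supplied, a `WHERE ...` line is inserted between the `FROM` and `ORDER BY` clauses, as the integration tests further down check.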


@@ -34,12 +34,12 @@ def test_sql_query() -> None:
     client = rockset.RocksetClient(host, api_key)
-    col_1 = "Rockset is a real-time analytics database which enables queries on massive, semi-structured data without operational burden. Rockset is serverless and fully managed. It offloads the work of managing configuration, cluster provisioning, denormalization, and shard / index management. Rockset is also SOC 2 Type II compliant and offers encryption at rest and in flight, securing and protecting any sensitive data. Most teams can ingest data into Rockset and start executing queries in less than 15 minutes." # noqa: E501
+    col_1 = "Rockset is a real-time analytics database"
     col_2 = 2
     col_3 = "e903e069-b0b5-4b80-95e2-86471b41f55f"
     id = 7320132
-    """Run a simple SQL query query"""
+    """Run a simple SQL query"""
     loader = RocksetLoader(
         client,
         rockset.models.QueryRequestSql(


@@ -33,6 +33,7 @@ logger = logging.getLogger(__name__)
 #
 # See https://rockset.com/blog/introducing-vector-search-on-rockset/ for more details.
+workspace = "langchain_tests"
 collection_name = "langchain_demo"
 text_key = "description"
 embedding_key = "description_embedding"
@@ -71,10 +72,9 @@ class TestRockset:
             "Deleting all existing documents from the Rockset collection %s",
             collection_name,
         )
-        query_response = client.Queries.query(
-            sql={"query": "select _id from {}".format(collection_name)}
-        )
+        query = f"select _id from {workspace}.{collection_name}"
+        query_response = client.Queries.query(sql={"query": query})
         ids = [
             str(r["_id"])
             for r in getattr(
@@ -85,12 +85,13 @@ class TestRockset:
         client.Documents.delete_documents(
             collection=collection_name,
             data=[rockset.models.DeleteDocumentsRequestData(id=i) for i in ids],
+            workspace=workspace,
         )
         embeddings = ConsistentFakeEmbeddings()
         embeddings.embed_documents(fake_texts)
         cls.rockset_vectorstore = Rockset(
-            client, embeddings, collection_name, text_key, embedding_key
+            client, embeddings, collection_name, text_key, embedding_key, workspace
         )
     def test_rockset_insert_and_search(self) -> None:
@@ -127,9 +128,9 @@ class TestRockset:
         )
         vector_str = ",".join(map(str, vector))
         expected = f"""\
-SELECT * EXCEPT(description_embedding), \
-COSINE_SIM(description_embedding, [{vector_str}]) as dist
-FROM langchain_demo
+SELECT * EXCEPT({embedding_key}), \
+COSINE_SIM({embedding_key}, [{vector_str}]) as dist
+FROM {workspace}.{collection_name}
 ORDER BY dist DESC
 LIMIT 4
 """
@@ -145,9 +146,9 @@ LIMIT 4
         )
         vector_str = ",".join(map(str, vector))
         expected = f"""\
-SELECT * EXCEPT(description_embedding), \
-COSINE_SIM(description_embedding, [{vector_str}]) as dist
-FROM langchain_demo
+SELECT * EXCEPT({embedding_key}), \
+COSINE_SIM({embedding_key}, [{vector_str}]) as dist
+FROM {workspace}.{collection_name}
 WHERE age >= 10
 ORDER BY dist DESC
 LIMIT 4
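
The second expectation exercises the filtered path. Assuming the public search methods forward a `where_str` argument into `_build_query_sql` (which that helper's use of `where_str` suggests), a filtered search would look something like this sketch:

```python
# Sketch only: assumes similarity_search exposes where_str; reuses the vectorstore
# constructed in the earlier usage sketch.
docs = vectorstore.similarity_search(
    "real-time analytics",
    k=4,
    where_str="age >= 10",  # rendered as "WHERE age >= 10" in the generated SQL
)
```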