community[minor]: added jaguar vector store (#14838)

Description: A new vector store, Jaguar, is added, along with its class, test
scripts, and documentation.
Issue: None -- this is the first PR contributing to LangChain.
Dependencies: Requires the `jaguardb-http-client` HTTP client package
(`pip install -U jaguardb-http-client`).
Tag maintainer: @baskaryan, @eyurtsev, @hwchase17
Twitter handle: @workbot
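
For context, a minimal usage sketch of the new store, based on the class and tests in this PR. The endpoint URL, pod/store names, column schema, and the `OpenAIEmbeddings` model below are illustrative placeholders, not fixed values; any `Embeddings` implementation works.

```python
from langchain_community.embeddings import OpenAIEmbeddings  # placeholder embedding model
from langchain_community.vectorstores.jaguar import Jaguar

vectorstore = Jaguar(
    pod="vdb",                          # placeholder pod name
    store="mystore",                    # placeholder store name
    vector_index="v",
    vector_type="cosine_fraction_float",
    vector_dimension=1536,              # must match the embedding dimension
    url="http://127.0.0.1:8080/fwww/",  # fwww HTTP gateway in front of jaguardb
    embedding=OpenAIEmbeddings(),
)

# login() reads JAGUAR_API_KEY or $HOME/.jagrc when no key is passed.
vectorstore.login()

# Create the store with two metadata columns, add texts, and query.
vectorstore.create("author char(32), category char(16)", 1024)
vectorstore.add_texts(
    texts=["foo", "bar"],
    metadatas=[
        {"author": "Adam", "category": "Music"},
        {"author": "Eve", "category": "Music"},
    ],
)
docs = vectorstore.similarity_search("foo", k=1, metadatas=["author", "category"])

vectorstore.logout()
```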

---------

Co-authored-by: JY <jyjy@jaguardb>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Authored by JaguarDB on 2023-12-19 07:40:18 -08:00, committed by GitHub
commit 992b04e475 (parent a5be9f9475)
5 changed files with 1158 additions and 0 deletions


@@ -0,0 +1,441 @@
from __future__ import annotations

import json
import logging
from typing import TYPE_CHECKING, Any, List, Optional, Tuple

if TYPE_CHECKING:
    from jaguardb_http_client.JaguarHttpClient import JaguarHttpClient

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore

logger = logging.getLogger(__name__)


class Jaguar(VectorStore):
    """`Jaguar API` vector store.

    See http://www.jaguardb.com
    See http://github.com/fserv/jaguar-sdk

    Example:
        .. code-block:: python

            from langchain.vectorstores import Jaguar

            vectorstore = Jaguar(
                pod='vdb',
                store='mystore',
                vector_index='v',
                vector_type='cosine_fraction_float',
                vector_dimension=1536,
                url='http://192.168.8.88:8080/fwww/',
                embedding=openai_model,
            )
    """

    def __init__(
        self,
        pod: str,
        store: str,
        vector_index: str,
        vector_type: str,
        vector_dimension: int,
        url: str,
        embedding: Embeddings,
    ):
        self._pod = pod
        self._store = store
        self._vector_index = vector_index
        self._vector_type = vector_type
        self._vector_dimension = vector_dimension
        self._embedding = embedding

        # Import at runtime so a missing client package gives a clear error.
        try:
            from jaguardb_http_client.JaguarHttpClient import JaguarHttpClient
        except ImportError:
            raise ImportError(
                "Could not import the jaguardb-http-client python package. "
                "Please install it with `pip install -U jaguardb-http-client`."
            )

        self._jag = JaguarHttpClient(url)
        self._token = ""

    def login(
        self,
        jaguar_api_key: Optional[str] = "",
    ) -> bool:
        """
        Login to the jaguardb server with a jaguar_api_key, or let self._jag
        find a key (e.g. from the JAGUAR_API_KEY environment variable or
        $HOME/.jagrc).
        Args:
            jaguar_api_key (str): optional API key of the user for the
                jaguardb server
        Returns:
            True if successful; False if not successful
        """
        if jaguar_api_key == "":
            jaguar_api_key = self._jag.getApiKey()
        self._jaguar_api_key = jaguar_api_key
        self._token = self._jag.login(jaguar_api_key)
        if self._token == "":
            logger.error("E0001 error init(): invalid jaguar_api_key")
            return False
        return True

    def create(
        self,
        metadata_str: str,
        text_size: int,
    ) -> None:
        """
        Create the vector store on the backend database.
        Args:
            metadata_str (str): metadata columns and their types
            text_size (int): size of the text column holding the document text
        Returns:
            None
        """
        podstore = self._pod + "." + self._store

        # The source column and the v:text column are always created, in
        # addition to the vector column and the user-supplied metadata columns.
        q = "create store "
        q += podstore
        q += f" ({self._vector_index} vector({self._vector_dimension},"
        q += f" '{self._vector_type}'),"
        q += f" source char(256), v:text char({text_size}),"
        q += metadata_str + ")"
        self.run(q)

    def run(self, query: str, withFile: bool = False) -> dict:
        """
        Run any query statement in jaguardb.
        Args:
            query (str): query statement to jaguardb
            withFile (bool): whether the query also uploads a file
        Returns:
            An empty dict if the token is invalid or the response cannot be
            parsed; otherwise the parsed JSON result
        """
        if self._token == "":
            logger.error(f"E0005 error run({query})")
            return {}

        resp = self._jag.post(query, self._token, withFile)
        txt = resp.text
        try:
            js = json.loads(txt)
            return js
        except Exception:
            return {}

    @property
    def embeddings(self) -> Optional[Embeddings]:
        return self._embedding

    def add_texts(
        self,
        texts: List[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """
        Embed the texts and add them to the vector store.
        Args:
            texts: list of text strings to add to the jaguar vector store.
            metadatas: optional list of metadatas associated with the texts.
                [{"m1": "v11", "m2": "v12", "m3": "v13", "filecol": "path_file1.jpg" },
                 {"m1": "v21", "m2": "v22", "m3": "v23", "filecol": "path_file2.jpg" },
                 {"m1": "v31", "m2": "v32", "m3": "v33", "filecol": "path_file3.jpg" },
                 {"m1": "v41", "m2": "v42", "m3": "v43", "filecol": "path_file4.jpg" }]
            kwargs: vector_index=name_of_vector_index
                    file_column=name_of_file_column
        Returns:
            List of ids from adding the texts into the vectorstore
        """
        vcol = self._vector_index
        filecol = kwargs.get("file_column", "")
        podstorevcol = self._pod + "." + self._store + "." + vcol
        q = "textcol " + podstorevcol
        js = self.run(q)
        if not js:
            return []
        textcol = js["data"]

        embeddings = self._embedding.embed_documents(list(texts))
        ids = []
        if metadatas is None:
            # no metadata and no files to upload
            i = 0
            for vec in embeddings:
                str_vec = [str(x) for x in vec]
                values_comma = ",".join(str_vec)
                podstore = self._pod + "." + self._store
                q = "insert into " + podstore + " ("
                q += vcol + "," + textcol + ") values ('" + values_comma
                q += "','" + texts[i] + "')"
                js = self.run(q, False)
                ids.append(js["zid"])
                i += 1
        else:
            i = 0
            for vec in embeddings:
                str_vec = [str(x) for x in vec]
                nvec, vvec, filepath = self._parseMeta(metadatas[i], filecol)
                if filecol != "":
                    rc = self._jag.postFile(self._token, filepath, 1)
                    if not rc:
                        return []
                names_comma = ",".join(nvec)
                names_comma += "," + vcol
                # e.g. col1,col2,col3,vcol
                values_comma = "'" + "','".join(vvec) + "'"
                # e.g. 'val1','val2','val3'
                values_comma += ",'" + ",".join(str_vec) + "'"
                # e.g. 'v1,v2,v3'
                podstore = self._pod + "." + self._store
                q = "insert into " + podstore + " ("
                q += names_comma + "," + textcol + ") values (" + values_comma
                q += ",'" + texts[i] + "')"
                if filecol != "":
                    js = self.run(q, True)
                else:
                    js = self.run(q, False)
                ids.append(js["zid"])
                i += 1
        return ids

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 3,
        fetch_k: int = -1,
        where: Optional[str] = None,
        score_threshold: Optional[float] = -1.0,
        metadatas: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """
        Return Jaguar documents most similar to query, along with scores.
        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 3.
            fetch_k: Number of candidate records to fetch. Defaults to -1.
            where: the where clause in select similarity. For example a
                where can be "rating > 3.0 and (state = 'NV' or state = 'CA')"
            score_threshold: minimal score threshold for the result.
                If defined, results with a score less than this value will be
                filtered out.
            metadatas: list of metadata column names to return with each result.
            kwargs: vector_index=vcol, vector_type=cosine_fraction_float
        Returns:
            List of Tuples of (doc, similarity_score):
                [(doc, score), (doc, score), ...]
        """
        vcol = self._vector_index
        vtype = self._vector_type
        embeddings = self._embedding.embed_query(query)
        str_embeddings = [str(f) for f in embeddings]
        qv_comma = ",".join(str_embeddings)
        podstore = self._pod + "." + self._store
        q = (
            "select similarity("
            + vcol
            + ",'"
            + qv_comma
            + "','topk="
            + str(k)
            + ",fetch_k="
            + str(fetch_k)
            + ",type="
            + vtype
        )
        q += ",with_score=yes,with_text=yes,score_threshold=" + str(score_threshold)
        if metadatas is not None:
            meta = "&".join(metadatas)
            q += ",metadata=" + meta
        q += "') from " + podstore
        if where is not None:
            q += " where " + where

        jarr = self.run(q)
        if not jarr:
            return []

        docs_with_score = []
        for js in jarr:
            score = js["score"]
            text = js["text"]
            zid = js["zid"]
            # attach the metadata columns requested by the caller
            md = {"zid": zid}
            if metadatas is not None:
                for m in metadatas:
                    md[m] = js[m]
            doc = Document(page_content=text, metadata=md)
            docs_with_score.append((doc, score))
        return docs_with_score

    def similarity_search(
        self,
        query: str,
        k: int = 3,
        where: Optional[str] = None,
        metadatas: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """
        Return Jaguar documents most similar to the query.
        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 3.
            where: the where clause in select similarity. For example a
                where can be "rating > 3.0 and (state = 'NV' or state = 'CA')"
            metadatas: list of metadata column names to return with each result.
        Returns:
            List of Documents most similar to the query
        """
        docs_and_scores = self.similarity_search_with_score(
            query, k=k, where=where, metadatas=metadatas, **kwargs
        )
        return [doc for doc, _ in docs_and_scores]

    def is_anomalous(
        self,
        query: str,
        **kwargs: Any,
    ) -> bool:
        """
        Detect whether the given text is anomalous relative to the dataset.
        Args:
            query: Text to check for anomaly
        Returns:
            True or False
        """
        vcol = self._vector_index
        vtype = self._vector_type
        embeddings = self._embedding.embed_query(query)
        str_embeddings = [str(f) for f in embeddings]
        qv_comma = ",".join(str_embeddings)
        podstore = self._pod + "." + self._store
        q = "select anomalous(" + vcol + ", '" + qv_comma + "', 'type=" + vtype + "')"
        q += " from " + podstore

        js = self.run(q)
        if isinstance(js, list) and len(js) == 0:
            return False
        jd = json.loads(js[0])
        if jd["anomalous"] == "YES":
            return True
        return False

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        url: str,
        pod: str,
        store: str,
        vector_index: str,
        vector_type: str,
        vector_dimension: int,
        metadatas: Optional[List[dict]] = None,
        jaguar_api_key: Optional[str] = "",
        **kwargs: Any,
    ) -> Jaguar:
        jagstore = cls(
            pod, store, vector_index, vector_type, vector_dimension, url, embedding
        )
        jagstore.login(jaguar_api_key)
        jagstore.clear()
        jagstore.add_texts(texts, metadatas, **kwargs)
        return jagstore

    def clear(self) -> None:
        """
        Delete all records in jaguardb
        Args: No args
        Returns: None
        """
        podstore = self._pod + "." + self._store
        q = "truncate store " + podstore
        self.run(q)

    def delete(self, zids: List[str], **kwargs: Any) -> None:
        """
        Delete records in jaguardb by a list of zero-ids
        Args:
            zids (List[str]): a list of zids as strings
        Returns:
            None
        """
        podstore = self._pod + "." + self._store
        for zid in zids:
            q = "delete from " + podstore + " where zid='" + zid + "'"
            self.run(q)

    def count(self) -> int:
        """
        Count records of a store in jaguardb
        Args: no args
        Returns: (int) number of records in pod store
        """
        podstore = self._pod + "." + self._store
        q = "select count() from " + podstore
        js = self.run(q)
        if isinstance(js, list) and len(js) == 0:
            return 0
        jd = json.loads(js[0])
        return int(jd["data"])

    def drop(self) -> None:
        """
        Drop or remove a store in jaguardb
        Args: no args
        Returns: None
        """
        podstore = self._pod + "." + self._store
        q = "drop store " + podstore
        self.run(q)

    def logout(self) -> None:
        """
        Logout to clean up resources
        Args: no args
        Returns: None
        """
        self._jag.logout(self._token)

    def prt(self, msg: str) -> None:
        # Append debug messages to a local log file.
        with open("/tmp/debugjaguar.log", "a") as file:
            print(f"msg={msg}", file=file, flush=True)

    def _parseMeta(self, nvmap: dict, filecol: str) -> Tuple[List[str], List[str], str]:
        # Split a metadata dict into parallel lists of column names and values,
        # and extract the file path if a file column is specified.
        filepath = ""
        if filecol == "":
            nvec = list(nvmap.keys())
            vvec = list(nvmap.values())
        else:
            nvec = []
            vvec = []
            if filecol in nvmap:
                nvec.append(filecol)
                vvec.append(nvmap[filecol])
                filepath = nvmap[filecol]

            for k, v in nvmap.items():
                if k != filecol:
                    nvec.append(k)
                    vvec.append(v)

        return nvec, vvec, filepath


@@ -0,0 +1,138 @@
import json

from langchain_community.vectorstores.jaguar import Jaguar
from tests.integration_tests.vectorstores.fake_embeddings import (
    ConsistentFakeEmbeddings,
)

#############################################################################
##
## Requirement: the fwww http server must be running at 127.0.0.1:8080
##              (or any other endpoint), and the jaguardb server must be
##              running and accepting commands from the http server.
##
## ConsistentFakeEmbeddings is used to create text embeddings with a
## dimension of 10.
##
#############################################################################


class TestJaguar:
    vectorstore: Jaguar
    pod: str
    store: str

    @classmethod
    def setup_class(cls) -> None:
        url = "http://127.0.0.1:8080/fwww/"
        cls.pod = "vdb"
        cls.store = "langchain_test_store"
        vector_index = "v"
        vector_type = "cosine_fraction_float"
        vector_dimension = 10
        embeddings = ConsistentFakeEmbeddings()
        cls.vectorstore = Jaguar(
            cls.pod,
            cls.store,
            vector_index,
            vector_type,
            vector_dimension,
            url,
            embeddings,
        )

    @classmethod
    def teardown_class(cls) -> None:
        pass

    def test_login(self) -> None:
        """
        Requires the environment variable JAGUAR_API_KEY
        or $HOME/.jagrc storing the jaguar api key
        """
        self.vectorstore.login()

    def test_create(self) -> None:
        """
        Create a vector store with vector index 'v' of dimension 10,
        a 'v:text' column to hold the text, and metadata columns
        author and category
        """
        metadata_str = "author char(32), category char(16)"
        self.vectorstore.create(metadata_str, 1024)

        podstore = self.pod + "." + self.store
        js = self.vectorstore.run(f"desc {podstore}")
        jd = json.loads(js[0])
        assert podstore in jd["data"]

    def test_add_texts(self) -> None:
        """
        Add some texts
        """
        texts = ["foo", "bar", "baz"]
        metadatas = [
            {"author": "Adam", "category": "Music"},
            {"author": "Eve", "category": "Music"},
            {"author": "John", "category": "History"},
        ]
        ids = self.vectorstore.add_texts(texts=texts, metadatas=metadatas)
        assert len(ids) == len(texts)

    def test_search(self) -> None:
        """
        Test that `foo` is closest to `foo`
        Here k is 1
        """
        output = self.vectorstore.similarity_search(
            query="foo",
            k=1,
            metadatas=["author", "category"],
        )
        assert output[0].page_content == "foo"
        assert output[0].metadata["author"] == "Adam"
        assert output[0].metadata["category"] == "Music"
        assert len(output) == 1

    def test_search_filter(self) -> None:
        """
        Test filter(where)
        """
        where = "author='Eve'"
        output = self.vectorstore.similarity_search(
            query="foo",
            k=3,
            fetch_k=9,
            where=where,
            metadatas=["author", "category"],
        )
        assert output[0].page_content == "bar"
        assert output[0].metadata["author"] == "Eve"
        assert output[0].metadata["category"] == "Music"
        assert len(output) == 1

    def test_search_anomalous(self) -> None:
        """
        Test detection of anomalousness
        """
        result = self.vectorstore.is_anomalous(
            query="dogs can jump high",
        )
        assert result is False

    def test_clear(self) -> None:
        """
        Test cleanup of data in the store
        """
        self.vectorstore.clear()
        assert self.vectorstore.count() == 0

    def test_drop(self) -> None:
        """
        Destroy the vector store
        """
        self.vectorstore.drop()

    def test_logout(self) -> None:
        """
        Logout and free resources
        """
        self.vectorstore.logout()