community[minor]: added jaguar vector store (#14838)

Description: A new vector store, Jaguar, is added, along with its class, test
scripts, and documentation.
Issue: None -- this is the first PR contributing to LangChain.
Dependencies: Requires the `jaguardb-http-client` HTTP client package
(`pip install -U jaguardb-http-client`).
Tag maintainer: @baskaryan, @eyurtsev, @hwchase17
Twitter handle: @workbot
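
For context, a minimal usage sketch of the new store, based on the class and tests in this PR. The endpoint URL, pod/store names, column schema, and the `OpenAIEmbeddings` model below are illustrative placeholders, not fixed values; any `Embeddings` implementation works.

```python
from langchain_community.embeddings import OpenAIEmbeddings  # placeholder embedding model
from langchain_community.vectorstores.jaguar import Jaguar

vectorstore = Jaguar(
    pod="vdb",                          # placeholder pod name
    store="mystore",                    # placeholder store name
    vector_index="v",
    vector_type="cosine_fraction_float",
    vector_dimension=1536,              # must match the embedding dimension
    url="http://127.0.0.1:8080/fwww/",  # fwww HTTP gateway in front of jaguardb
    embedding=OpenAIEmbeddings(),
)

# login() reads JAGUAR_API_KEY or $HOME/.jagrc when no key is passed.
vectorstore.login()

# Create the store with two metadata columns, add texts, and query.
vectorstore.create("author char(32), category char(16)", 1024)
vectorstore.add_texts(
    texts=["foo", "bar"],
    metadatas=[
        {"author": "Adam", "category": "Music"},
        {"author": "Eve", "category": "Music"},
    ],
)
docs = vectorstore.similarity_search("foo", k=1, metadatas=["author", "category"])

vectorstore.logout()
```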

---------

Co-authored-by: JY <jyjy@jaguardb>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Authored by JaguarDB on 2023-12-19 07:40:18 -08:00, committed by GitHub
commit 992b04e475 (parent a5be9f9475)
5 changed files with 1158 additions and 0 deletions


@@ -0,0 +1,441 @@
from __future__ import annotations

import json
import logging
from typing import TYPE_CHECKING, Any, List, Optional, Tuple

if TYPE_CHECKING:
    from jaguardb_http_client.JaguarHttpClient import JaguarHttpClient

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore

logger = logging.getLogger(__name__)


class Jaguar(VectorStore):
    """`Jaguar API` vector store.

    See http://www.jaguardb.com
    See http://github.com/fserv/jaguar-sdk

    Example:
        .. code-block:: python

            from langchain.vectorstores import Jaguar

            vectorstore = Jaguar(
                pod='vdb',
                store='mystore',
                vector_index='v',
                vector_type='cosine_fraction_float',
                vector_dimension=1536,
                url='http://192.168.8.88:8080/fwww/',
                embedding=openai_model,
            )
    """

    def __init__(
        self,
        pod: str,
        store: str,
        vector_index: str,
        vector_type: str,
        vector_dimension: int,
        url: str,
        embedding: Embeddings,
    ):
        self._pod = pod
        self._store = store
        self._vector_index = vector_index
        self._vector_type = vector_type
        self._vector_dimension = vector_dimension
        self._embedding = embedding

        # Import at runtime so a missing client package gives a clear error.
        try:
            from jaguardb_http_client.JaguarHttpClient import JaguarHttpClient
        except ImportError:
            raise ImportError(
                "Could not import the jaguardb-http-client python package. "
                "Please install it with `pip install -U jaguardb-http-client`."
            )

        self._jag = JaguarHttpClient(url)
        self._token = ""

    def login(
        self,
        jaguar_api_key: Optional[str] = "",
    ) -> bool:
        """
        Login to the jaguardb server with a jaguar_api_key, or let self._jag
        find a key (e.g. from the JAGUAR_API_KEY environment variable or
        $HOME/.jagrc).
        Args:
            jaguar_api_key (str): optional API key of the user for the
                jaguardb server
        Returns:
            True if successful; False if not successful
        """
        if jaguar_api_key == "":
            jaguar_api_key = self._jag.getApiKey()
        self._jaguar_api_key = jaguar_api_key
        self._token = self._jag.login(jaguar_api_key)
        if self._token == "":
            logger.error("E0001 error init(): invalid jaguar_api_key")
            return False
        return True

    def create(
        self,
        metadata_str: str,
        text_size: int,
    ) -> None:
        """
        Create the vector store on the backend database.
        Args:
            metadata_str (str): metadata columns and their types
            text_size (int): size of the text column holding the document text
        Returns:
            None
        """
        podstore = self._pod + "." + self._store

        # The source column and the v:text column are always created, in
        # addition to the vector column and the user-supplied metadata columns.
        q = "create store "
        q += podstore
        q += f" ({self._vector_index} vector({self._vector_dimension},"
        q += f" '{self._vector_type}'),"
        q += f" source char(256), v:text char({text_size}),"
        q += metadata_str + ")"
        self.run(q)

    def run(self, query: str, withFile: bool = False) -> dict:
        """
        Run any query statement in jaguardb.
        Args:
            query (str): query statement to jaguardb
            withFile (bool): whether the query also uploads a file
        Returns:
            An empty dict if the token is invalid or the response cannot be
            parsed; otherwise the parsed JSON result
        """
        if self._token == "":
            logger.error(f"E0005 error run({query})")
            return {}

        resp = self._jag.post(query, self._token, withFile)
        txt = resp.text
        try:
            js = json.loads(txt)
            return js
        except Exception:
            return {}

    @property
    def embeddings(self) -> Optional[Embeddings]:
        return self._embedding

    def add_texts(
        self,
        texts: List[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """
        Embed the texts and add them to the vector store.
        Args:
            texts: list of text strings to add to the jaguar vector store.
            metadatas: optional list of metadatas associated with the texts.
                [{"m1": "v11", "m2": "v12", "m3": "v13", "filecol": "path_file1.jpg" },
                 {"m1": "v21", "m2": "v22", "m3": "v23", "filecol": "path_file2.jpg" },
                 {"m1": "v31", "m2": "v32", "m3": "v33", "filecol": "path_file3.jpg" },
                 {"m1": "v41", "m2": "v42", "m3": "v43", "filecol": "path_file4.jpg" }]
            kwargs: vector_index=name_of_vector_index
                    file_column=name_of_file_column
        Returns:
            List of ids from adding the texts into the vectorstore
        """
        vcol = self._vector_index
        filecol = kwargs.get("file_column", "")
        podstorevcol = self._pod + "." + self._store + "." + vcol
        q = "textcol " + podstorevcol
        js = self.run(q)
        if not js:
            return []
        textcol = js["data"]

        embeddings = self._embedding.embed_documents(list(texts))
        ids = []
        if metadatas is None:
            # no metadata and no files to upload
            i = 0
            for vec in embeddings:
                str_vec = [str(x) for x in vec]
                values_comma = ",".join(str_vec)
                podstore = self._pod + "." + self._store
                q = "insert into " + podstore + " ("
                q += vcol + "," + textcol + ") values ('" + values_comma
                q += "','" + texts[i] + "')"
                js = self.run(q, False)
                ids.append(js["zid"])
                i += 1
        else:
            i = 0
            for vec in embeddings:
                str_vec = [str(x) for x in vec]
                nvec, vvec, filepath = self._parseMeta(metadatas[i], filecol)
                if filecol != "":
                    rc = self._jag.postFile(self._token, filepath, 1)
                    if not rc:
                        return []
                names_comma = ",".join(nvec)
                names_comma += "," + vcol
                # e.g. col1,col2,col3,vcol
                values_comma = "'" + "','".join(vvec) + "'"
                # e.g. 'val1','val2','val3'
                values_comma += ",'" + ",".join(str_vec) + "'"
                # e.g. 'v1,v2,v3'
                podstore = self._pod + "." + self._store
                q = "insert into " + podstore + " ("
                q += names_comma + "," + textcol + ") values (" + values_comma
                q += ",'" + texts[i] + "')"
                if filecol != "":
                    js = self.run(q, True)
                else:
                    js = self.run(q, False)
                ids.append(js["zid"])
                i += 1
        return ids

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 3,
        fetch_k: int = -1,
        where: Optional[str] = None,
        score_threshold: Optional[float] = -1.0,
        metadatas: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """
        Return Jaguar documents most similar to query, along with scores.
        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 3.
            fetch_k: Number of candidate records to fetch. Defaults to -1.
            where: the where clause in select similarity. For example a
                where can be "rating > 3.0 and (state = 'NV' or state = 'CA')"
            score_threshold: minimal score threshold for the result.
                If defined, results with a score less than this value will be
                filtered out.
            metadatas: list of metadata column names to return with each result.
            kwargs: vector_index=vcol, vector_type=cosine_fraction_float
        Returns:
            List of Tuples of (doc, similarity_score):
                [(doc, score), (doc, score), ...]
        """
        vcol = self._vector_index
        vtype = self._vector_type
        embeddings = self._embedding.embed_query(query)
        str_embeddings = [str(f) for f in embeddings]
        qv_comma = ",".join(str_embeddings)
        podstore = self._pod + "." + self._store
        q = (
            "select similarity("
            + vcol
            + ",'"
            + qv_comma
            + "','topk="
            + str(k)
            + ",fetch_k="
            + str(fetch_k)
            + ",type="
            + vtype
        )
        q += ",with_score=yes,with_text=yes,score_threshold=" + str(score_threshold)
        if metadatas is not None:
            meta = "&".join(metadatas)
            q += ",metadata=" + meta
        q += "') from " + podstore
        if where is not None:
            q += " where " + where

        jarr = self.run(q)
        if not jarr:
            return []

        docs_with_score = []
        for js in jarr:
            score = js["score"]
            text = js["text"]
            zid = js["zid"]
            # attach the metadata columns requested by the caller
            md = {"zid": zid}
            if metadatas is not None:
                for m in metadatas:
                    md[m] = js[m]
            doc = Document(page_content=text, metadata=md)
            docs_with_score.append((doc, score))
        return docs_with_score

    def similarity_search(
        self,
        query: str,
        k: int = 3,
        where: Optional[str] = None,
        metadatas: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """
        Return Jaguar documents most similar to the query.
        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 3.
            where: the where clause in select similarity. For example a
                where can be "rating > 3.0 and (state = 'NV' or state = 'CA')"
            metadatas: list of metadata column names to return with each result.
        Returns:
            List of Documents most similar to the query
        """
        docs_and_scores = self.similarity_search_with_score(
            query, k=k, where=where, metadatas=metadatas, **kwargs
        )
        return [doc for doc, _ in docs_and_scores]

    def is_anomalous(
        self,
        query: str,
        **kwargs: Any,
    ) -> bool:
        """
        Detect whether the given text is anomalous relative to the dataset.
        Args:
            query: Text to check for anomaly
        Returns:
            True or False
        """
        vcol = self._vector_index
        vtype = self._vector_type
        embeddings = self._embedding.embed_query(query)
        str_embeddings = [str(f) for f in embeddings]
        qv_comma = ",".join(str_embeddings)
        podstore = self._pod + "." + self._store
        q = "select anomalous(" + vcol + ", '" + qv_comma + "', 'type=" + vtype + "')"
        q += " from " + podstore

        js = self.run(q)
        if isinstance(js, list) and len(js) == 0:
            return False
        jd = json.loads(js[0])
        if jd["anomalous"] == "YES":
            return True
        return False

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        url: str,
        pod: str,
        store: str,
        vector_index: str,
        vector_type: str,
        vector_dimension: int,
        metadatas: Optional[List[dict]] = None,
        jaguar_api_key: Optional[str] = "",
        **kwargs: Any,
    ) -> Jaguar:
        jagstore = cls(
            pod, store, vector_index, vector_type, vector_dimension, url, embedding
        )
        jagstore.login(jaguar_api_key)
        jagstore.clear()
        jagstore.add_texts(texts, metadatas, **kwargs)
        return jagstore

    def clear(self) -> None:
        """
        Delete all records in jaguardb
        Args: No args
        Returns: None
        """
        podstore = self._pod + "." + self._store
        q = "truncate store " + podstore
        self.run(q)

    def delete(self, zids: List[str], **kwargs: Any) -> None:
        """
        Delete records in jaguardb by a list of zero-ids
        Args:
            zids (List[str]): a list of zids as strings
        Returns:
            None
        """
        podstore = self._pod + "." + self._store
        for zid in zids:
            q = "delete from " + podstore + " where zid='" + zid + "'"
            self.run(q)

    def count(self) -> int:
        """
        Count records of a store in jaguardb
        Args: no args
        Returns: (int) number of records in pod store
        """
        podstore = self._pod + "." + self._store
        q = "select count() from " + podstore
        js = self.run(q)
        if isinstance(js, list) and len(js) == 0:
            return 0
        jd = json.loads(js[0])
        return int(jd["data"])

    def drop(self) -> None:
        """
        Drop or remove a store in jaguardb
        Args: no args
        Returns: None
        """
        podstore = self._pod + "." + self._store
        q = "drop store " + podstore
        self.run(q)

    def logout(self) -> None:
        """
        Logout to clean up resources
        Args: no args
        Returns: None
        """
        self._jag.logout(self._token)

    def prt(self, msg: str) -> None:
        # Append debug messages to a local log file.
        with open("/tmp/debugjaguar.log", "a") as file:
            print(f"msg={msg}", file=file, flush=True)

    def _parseMeta(self, nvmap: dict, filecol: str) -> Tuple[List[str], List[str], str]:
        # Split a metadata dict into parallel lists of column names and values,
        # and extract the file path if a file column is specified.
        filepath = ""
        if filecol == "":
            nvec = list(nvmap.keys())
            vvec = list(nvmap.values())
        else:
            nvec = []
            vvec = []
            if filecol in nvmap:
                nvec.append(filecol)
                vvec.append(nvmap[filecol])
                filepath = nvmap[filecol]

            for k, v in nvmap.items():
                if k != filecol:
                    nvec.append(k)
                    vvec.append(v)

        return nvec, vvec, filepath


@@ -0,0 +1,138 @@
import json

from langchain_community.vectorstores.jaguar import Jaguar
from tests.integration_tests.vectorstores.fake_embeddings import (
    ConsistentFakeEmbeddings,
)

#############################################################################
##
## Requirement: the fwww http server must be running at 127.0.0.1:8080
##              (or any other endpoint), and the jaguardb server must be
##              running and accepting commands from the http server.
##
## ConsistentFakeEmbeddings is used to create text embeddings with a
## dimension of 10.
##
#############################################################################


class TestJaguar:
    vectorstore: Jaguar
    pod: str
    store: str

    @classmethod
    def setup_class(cls) -> None:
        url = "http://127.0.0.1:8080/fwww/"
        cls.pod = "vdb"
        cls.store = "langchain_test_store"
        vector_index = "v"
        vector_type = "cosine_fraction_float"
        vector_dimension = 10
        embeddings = ConsistentFakeEmbeddings()
        cls.vectorstore = Jaguar(
            cls.pod,
            cls.store,
            vector_index,
            vector_type,
            vector_dimension,
            url,
            embeddings,
        )

    @classmethod
    def teardown_class(cls) -> None:
        pass

    def test_login(self) -> None:
        """
        Requires the environment variable JAGUAR_API_KEY
        or $HOME/.jagrc storing the jaguar api key
        """
        self.vectorstore.login()

    def test_create(self) -> None:
        """
        Create a vector store with vector index 'v' of dimension 10,
        a 'v:text' column to hold the text, and metadata columns
        author and category
        """
        metadata_str = "author char(32), category char(16)"
        self.vectorstore.create(metadata_str, 1024)

        podstore = self.pod + "." + self.store
        js = self.vectorstore.run(f"desc {podstore}")
        jd = json.loads(js[0])
        assert podstore in jd["data"]

    def test_add_texts(self) -> None:
        """
        Add some texts
        """
        texts = ["foo", "bar", "baz"]
        metadatas = [
            {"author": "Adam", "category": "Music"},
            {"author": "Eve", "category": "Music"},
            {"author": "John", "category": "History"},
        ]
        ids = self.vectorstore.add_texts(texts=texts, metadatas=metadatas)
        assert len(ids) == len(texts)

    def test_search(self) -> None:
        """
        Test that `foo` is closest to `foo`
        Here k is 1
        """
        output = self.vectorstore.similarity_search(
            query="foo",
            k=1,
            metadatas=["author", "category"],
        )
        assert output[0].page_content == "foo"
        assert output[0].metadata["author"] == "Adam"
        assert output[0].metadata["category"] == "Music"
        assert len(output) == 1

    def test_search_filter(self) -> None:
        """
        Test filter(where)
        """
        where = "author='Eve'"
        output = self.vectorstore.similarity_search(
            query="foo",
            k=3,
            fetch_k=9,
            where=where,
            metadatas=["author", "category"],
        )
        assert output[0].page_content == "bar"
        assert output[0].metadata["author"] == "Eve"
        assert output[0].metadata["category"] == "Music"
        assert len(output) == 1

    def test_search_anomalous(self) -> None:
        """
        Test detection of anomalousness
        """
        result = self.vectorstore.is_anomalous(
            query="dogs can jump high",
        )
        assert result is False

    def test_clear(self) -> None:
        """
        Test cleanup of data in the store
        """
        self.vectorstore.clear()
        assert self.vectorstore.count() == 0

    def test_drop(self) -> None:
        """
        Destroy the vector store
        """
        self.vectorstore.drop()

    def test_logout(self) -> None:
        """
        Logout and free resources
        """
        self.vectorstore.logout()