community[minor]: Oraclevs integration (#21123)


- Oracle AI Vector Search
Oracle AI Vector Search is designed for Artificial Intelligence (AI)
workloads and allows you to query data based on semantics rather than
keywords. One of its biggest benefits is that semantic search on
unstructured data can be combined with relational search on business
data in a single system. This is not only powerful but also
significantly more effective, because you don't need to add a
specialized vector database, eliminating the pain of data fragmentation
between multiple systems.


This PR adds the following components (a brief usage sketch follows the list):
- Oracle AI Vector Search: Vector Store
- Oracle AI Vector Search: Document Loader
- Oracle AI Vector Search: Document Splitter
- Oracle AI Vector Search: Summary
- Oracle AI Vector Search: Oracle Embeddings
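
A minimal sketch of how these pieces are meant to fit together. The connection
details, table and model names, and the exact OracleEmbeddings/OracleVS call
shapes below are illustrative assumptions, not taken from this diff; see the
component guides for the real parameters.

    import oracledb

    from langchain_community.document_loaders.oracleai import (
        OracleDocLoader,
        OracleTextSplitter,
    )
    from langchain_community.embeddings.oracleai import OracleEmbeddings
    from langchain_community.vectorstores.oraclevs import OracleVS
    from langchain_core.documents import Document

    # One connection serves both relational SQL and the vector-search components.
    connection = oracledb.connect(user="hr", password="hr", dsn="localhost:1521/pdb1")

    # Load rows from a table column as documents, then split them into chunks.
    loader = OracleDocLoader(
        conn=connection,
        params={"owner": "HR", "tablename": "LANGCHAIN_DEMO", "colname": "TEXT"},
    )
    splitter = OracleTextSplitter(conn=connection, params={"by": "words", "max": "300"})

    chunks = [
        Document(page_content=chunk, metadata=doc.metadata)
        for doc in loader.load()
        for chunk in splitter.split_text(doc.page_content)
    ]

    # Embed the chunks with an in-database model (model name is a placeholder)
    # and store them in an Oracle vector table, then query it semantically.
    embeddings = OracleEmbeddings(
        conn=connection, params={"provider": "database", "model": "demo_model"}
    )
    vector_store = OracleVS.from_documents(
        chunks, embeddings, client=connection, table_name="DEMO_VECTORS"
    )
    print(vector_store.similarity_search("vector search inside Oracle", k=3))

OracleSummary follows the same conn/params pattern; the test file in this diff
shows it being driven directly against the database provider.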


- We have added unit tests and maintain our own local test suite that
verifies the code is correct. We have added guides for each of the
components and one end-to-end guide that shows how the whole pipeline
runs.


- We have made sure that make format and make lint run clean.


---------

Co-authored-by: skmishraoracle <shailendra.mishra@oracle.com>
Co-authored-by: hroyofc <harichandan.roy@oracle.com>
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Author: Rohan Aggarwal
Date: 2024-05-03 20:15:35 -07:00
Committed by: GitHub
Parent: c9e9470c5a
Commit: 8021d2a2ab
25 changed files with 5325 additions and 4 deletions


@@ -0,0 +1,447 @@
# Authors:
# Sudhir Kumar (sudhirkk)
#
# -----------------------------------------------------------------------------
# test_oracleds.py
# -----------------------------------------------------------------------------
import sys

from langchain_community.document_loaders.oracleai import (
    OracleDocLoader,
    OracleTextSplitter,
)
from langchain_community.utilities.oracleai import OracleSummary
from langchain_community.vectorstores.oraclevs import (
    _table_exists,
    drop_table_purge,
)

uname = "hr"
passwd = "hr"
# uname = "LANGCHAINUSER"
# passwd = "langchainuser"
v_dsn = "100.70.107.245:1521/cdb1_pdb1.regress.rdbms.dev.us.oracle.com"
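
# Note: each test below imports oracledb lazily and returns early when the
# driver is not installed, so this file can be imported without an Oracle
# client environment.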


### Test loader #####
def test_loader_test() -> None:
    try:
        import oracledb
    except ImportError:
        return

    try:
        # oracle connection
        connection = oracledb.connect(user=uname, password=passwd, dsn=v_dsn)
        cursor = connection.cursor()

        if _table_exists(connection, "LANGCHAIN_DEMO"):
            drop_table_purge(connection, "LANGCHAIN_DEMO")

        cursor.execute("CREATE TABLE langchain_demo(id number, text varchar2(25))")

        rows = [
            (1, "First"),
            (2, "Second"),
            (3, "Third"),
            (4, "Fourth"),
            (5, "Fifth"),
            (6, "Sixth"),
            (7, "Seventh"),
        ]

        cursor.executemany(
            "insert into LANGCHAIN_DEMO(id, text) values (:1, :2)", rows
        )
        connection.commit()

        # local file, local directory, database column
        loader_params = {
            "owner": uname,
            "tablename": "LANGCHAIN_DEMO",
            "colname": "TEXT",
        }

        # instantiate
        loader = OracleDocLoader(conn=connection, params=loader_params)

        # load
        docs = loader.load()

        # verify
        if len(docs) == 0:
            sys.exit(1)

        if _table_exists(connection, "LANGCHAIN_DEMO"):
            drop_table_purge(connection, "LANGCHAIN_DEMO")
    except Exception:
        sys.exit(1)

    try:
        # expectation : ORA-00942
        loader_params = {
            "owner": uname,
            "tablename": "COUNTRIES1",
            "colname": "COUNTRY_NAME",
        }

        # instantiate
        loader = OracleDocLoader(conn=connection, params=loader_params)

        # load
        docs = loader.load()
        if len(docs) == 0:
            pass
    except Exception:
        pass

    try:
        # expectation : file "SUDHIR" doesn't exist.
        loader_params = {"file": "SUDHIR"}

        # instantiate
        loader = OracleDocLoader(conn=connection, params=loader_params)

        # load
        docs = loader.load()
        if len(docs) == 0:
            pass
    except Exception:
        pass

    try:
        # expectation : path "SUDHIR" doesn't exist.
        loader_params = {"dir": "SUDHIR"}

        # instantiate
        loader = OracleDocLoader(conn=connection, params=loader_params)

        # load
        docs = loader.load()
        if len(docs) == 0:
            pass
    except Exception:
        pass


### Test splitter ####
def test_splitter_test() -> None:
    try:
        import oracledb
    except ImportError:
        return

    try:
        # oracle connection
        connection = oracledb.connect(user=uname, password=passwd, dsn=v_dsn)
        doc = """Langchain is a wonderful framework to load, split, chunk
        and embed your data!!"""

        # by words , max = 1000
        splitter_params = {
            "by": "words",
            "max": "1000",
            "overlap": "200",
            "split": "custom",
            "custom_list": [","],
            "extended": "true",
            "normalize": "all",
        }

        # instantiate
        splitter = OracleTextSplitter(conn=connection, params=splitter_params)

        # generate chunks
        chunks = splitter.split_text(doc)

        # verify
        if len(chunks) == 0:
            sys.exit(1)

        # by chars , max = 4000
        splitter_params = {
            "by": "chars",
            "max": "4000",
            "overlap": "800",
            "split": "NEWLINE",
            "normalize": "all",
        }

        # instantiate
        splitter = OracleTextSplitter(conn=connection, params=splitter_params)

        # generate chunks
        chunks = splitter.split_text(doc)

        # verify
        if len(chunks) == 0:
            sys.exit(1)

        # by words , max = 10
        splitter_params = {
            "by": "words",
            "max": "10",
            "overlap": "2",
            "split": "SENTENCE",
        }

        # instantiate
        splitter = OracleTextSplitter(conn=connection, params=splitter_params)

        # generate chunks
        chunks = splitter.split_text(doc)

        # verify
        if len(chunks) == 0:
            sys.exit(1)

        # by chars , max = 50
        splitter_params = {
            "by": "chars",
            "max": "50",
            "overlap": "10",
            "split": "SPACE",
            "normalize": "all",
        }

        # instantiate
        splitter = OracleTextSplitter(conn=connection, params=splitter_params)

        # generate chunks
        chunks = splitter.split_text(doc)

        # verify
        if len(chunks) == 0:
            sys.exit(1)
    except Exception:
        sys.exit(1)

    try:
        # ORA-20003: invalid value xyz for BY parameter
        splitter_params = {"by": "xyz"}

        # instantiate
        splitter = OracleTextSplitter(conn=connection, params=splitter_params)

        # generate chunks
        chunks = splitter.split_text(doc)

        # verify
        if len(chunks) == 0:
            pass
    except Exception:
        pass

    try:
        # Expectation: ORA-30584: invalid text chunking MAXIMUM - '10'
        splitter_params = {
            "by": "chars",
            "max": "10",
            "overlap": "2",
            "split": "SPACE",
            "normalize": "all",
        }

        # instantiate
        splitter = OracleTextSplitter(conn=connection, params=splitter_params)

        # generate chunks
        chunks = splitter.split_text(doc)

        # verify
        if len(chunks) == 0:
            pass
    except Exception:
        pass

    try:
        # Expectation: ORA-30584: invalid text chunking MAXIMUM - '5'
        splitter_params = {
            "by": "words",
            "max": "5",
            "overlap": "2",
            "split": "SPACE",
            "normalize": "all",
        }

        # instantiate
        splitter = OracleTextSplitter(conn=connection, params=splitter_params)

        # generate chunks
        chunks = splitter.split_text(doc)

        # verify
        if len(chunks) == 0:
            pass
    except Exception:
        pass

    try:
        # Expectation: ORA-30586: invalid text chunking SPLIT BY - SENTENCE
        splitter_params = {
            "by": "words",
            "max": "50",
            "overlap": "2",
            "split": "SENTENCE",
            "normalize": "all",
        }

        # instantiate
        splitter = OracleTextSplitter(conn=connection, params=splitter_params)

        # generate chunks
        chunks = splitter.split_text(doc)

        # verify
        if len(chunks) == 0:
            pass
    except Exception:
        pass


#### Test summary ####
def test_summary_test() -> None:
    try:
        import oracledb
    except ImportError:
        return

    try:
        # oracle connection
        connection = oracledb.connect(user=uname, password=passwd, dsn=v_dsn)

        # provider : Database, glevel : Paragraph
        summary_params = {
            "provider": "database",
            "glevel": "paragraph",
            "numParagraphs": 2,
            "language": "english",
        }

        # summary
        summary = OracleSummary(conn=connection, params=summary_params)

        doc = """It was 7 minutes after midnight. The dog was lying on the grass in
        of the lawn in front of Mrs Shears house. Its eyes were closed. It
        was running on its side, the way dogs run when they think they are
        cat in a dream. But the dog was not running or asleep. The dog was dead.
        was a garden fork sticking out of the dog. The points of the fork must
        gone all the way through the dog and into the ground because the fork
        not fallen over. I decided that the dog was probably killed with the
        because I could not see any other wounds in the dog and I do not think
        would stick a garden fork into a dog after it had died for some other
        like cancer for example, or a road accident. But I could not be certain"""

        summaries = summary.get_summary(doc)

        # verify
        if len(summaries) == 0:
            sys.exit(1)

        # provider : Database, glevel : Sentence
        summary_params = {"provider": "database", "glevel": "Sentence"}

        # summary
        summary = OracleSummary(conn=connection, params=summary_params)
        summaries = summary.get_summary(doc)

        # verify
        if len(summaries) == 0:
            sys.exit(1)

        # provider : Database, glevel : P
        summary_params = {"provider": "database", "glevel": "P"}

        # summary
        summary = OracleSummary(conn=connection, params=summary_params)
        summaries = summary.get_summary(doc)

        # verify
        if len(summaries) == 0:
            sys.exit(1)

        # provider : Database, glevel : S
        summary_params = {
            "provider": "database",
            "glevel": "S",
            "numParagraphs": 16,
            "language": "english",
        }

        # summary
        summary = OracleSummary(conn=connection, params=summary_params)
        summaries = summary.get_summary(doc)

        # verify
        if len(summaries) == 0:
            sys.exit(1)

        # provider : Database, glevel : S, doc = ' '
        summary_params = {"provider": "database", "glevel": "S", "numParagraphs": 2}

        # summary
        summary = OracleSummary(conn=connection, params=summary_params)

        doc = " "
        summaries = summary.get_summary(doc)

        # verify
        if len(summaries) == 0:
            sys.exit(1)
    except Exception:
        sys.exit(1)

    try:
        # Expectation : DRG-11002: missing value for PROVIDER
        summary_params = {"provider": "database1", "glevel": "S"}

        # summary
        summary = OracleSummary(conn=connection, params=summary_params)
        summaries = summary.get_summary(doc)

        # verify
        if len(summaries) == 0:
            pass
    except Exception:
        pass

    try:
        # Expectation : DRG-11425: gist level SUDHIR is invalid,
        # DRG-11427: valid gist level values are S, P
        summary_params = {"provider": "database", "glevel": "SUDHIR"}

        # summary
        summary = OracleSummary(conn=connection, params=summary_params)
        summaries = summary.get_summary(doc)

        # verify
        if len(summaries) == 0:
            pass
    except Exception:
        pass

    try:
        # Expectation : DRG-11441: gist numParagraphs -2 is invalid
        summary_params = {"provider": "database", "glevel": "S", "numParagraphs": -2}

        # summary
        summary = OracleSummary(conn=connection, params=summary_params)
        summaries = summary.get_summary(doc)

        # verify
        if len(summaries) == 0:
            pass
    except Exception:
        pass