community[minor]: Oraclevs integration (#21123)


- Oracle AI Vector Search
Oracle AI Vector Search is designed for Artificial Intelligence (AI)
workloads and allows you to query data based on semantics rather than
keywords. One of its biggest benefits is that semantic search on
unstructured data can be combined with relational search on business
data in a single system. This is not only powerful but also
significantly more effective, because you don't need to add a
specialized vector database, eliminating the pain of data fragmentation
between multiple systems.


This PR adds the following components (a brief usage sketch follows the list):
- Oracle AI Vector Search: Vector Store
- Oracle AI Vector Search: Document Loader
- Oracle AI Vector Search: Document Splitter
- Oracle AI Vector Search: Summary
- Oracle AI Vector Search: Oracle Embeddings
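
A minimal sketch of how these pieces are meant to fit together. The connection
details, table and model names, and the exact OracleEmbeddings/OracleVS call
shapes below are illustrative assumptions, not taken from this diff; see the
component guides for the real parameters.

    import oracledb

    from langchain_community.document_loaders.oracleai import (
        OracleDocLoader,
        OracleTextSplitter,
    )
    from langchain_community.embeddings.oracleai import OracleEmbeddings
    from langchain_community.vectorstores.oraclevs import OracleVS
    from langchain_core.documents import Document

    # One connection serves both relational SQL and the vector-search components.
    connection = oracledb.connect(user="hr", password="hr", dsn="localhost:1521/pdb1")

    # Load rows from a table column as documents, then split them into chunks.
    loader = OracleDocLoader(
        conn=connection,
        params={"owner": "HR", "tablename": "LANGCHAIN_DEMO", "colname": "TEXT"},
    )
    splitter = OracleTextSplitter(conn=connection, params={"by": "words", "max": "300"})

    chunks = [
        Document(page_content=chunk, metadata=doc.metadata)
        for doc in loader.load()
        for chunk in splitter.split_text(doc.page_content)
    ]

    # Embed the chunks with an in-database model (model name is a placeholder)
    # and store them in an Oracle vector table, then query it semantically.
    embeddings = OracleEmbeddings(
        conn=connection, params={"provider": "database", "model": "demo_model"}
    )
    vector_store = OracleVS.from_documents(
        chunks, embeddings, client=connection, table_name="DEMO_VECTORS"
    )
    print(vector_store.similarity_search("vector search inside Oracle", k=3))

OracleSummary follows the same conn/params pattern; the test file in this diff
shows it being driven directly against the database provider.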


- We have added unit tests and maintain our own local test suite that
verifies the code is correct. We have added guides for each of the
components and one end-to-end guide that shows how the whole pipeline
runs.


- We have made sure that make format and make lint run clean.


---------

Co-authored-by: skmishraoracle <shailendra.mishra@oracle.com>
Co-authored-by: hroyofc <harichandan.roy@oracle.com>
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Author: Rohan Aggarwal
Date: 2024-05-03 20:15:35 -07:00
Committed by: GitHub
Parent: c9e9470c5a
Commit: 8021d2a2ab
25 changed files with 5325 additions and 4 deletions


@@ -0,0 +1,447 @@
# Authors:
# Sudhir Kumar (sudhirkk)
#
# -----------------------------------------------------------------------------
# test_oracleds.py
# -----------------------------------------------------------------------------
import sys

from langchain_community.document_loaders.oracleai import (
    OracleDocLoader,
    OracleTextSplitter,
)
from langchain_community.utilities.oracleai import OracleSummary
from langchain_community.vectorstores.oraclevs import (
    _table_exists,
    drop_table_purge,
)

uname = "hr"
passwd = "hr"
# uname = "LANGCHAINUSER"
# passwd = "langchainuser"
v_dsn = "100.70.107.245:1521/cdb1_pdb1.regress.rdbms.dev.us.oracle.com"
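
# Note: each test below imports oracledb lazily and returns early when the
# driver is not installed, so this file can be imported without an Oracle
# client environment.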


### Test loader #####
def test_loader_test() -> None:
    try:
        import oracledb
    except ImportError:
        return

    try:
        # oracle connection
        connection = oracledb.connect(user=uname, password=passwd, dsn=v_dsn)
        cursor = connection.cursor()

        if _table_exists(connection, "LANGCHAIN_DEMO"):
            drop_table_purge(connection, "LANGCHAIN_DEMO")

        cursor.execute("CREATE TABLE langchain_demo(id number, text varchar2(25))")

        rows = [
            (1, "First"),
            (2, "Second"),
            (3, "Third"),
            (4, "Fourth"),
            (5, "Fifth"),
            (6, "Sixth"),
            (7, "Seventh"),
        ]

        cursor.executemany(
            "insert into LANGCHAIN_DEMO(id, text) values (:1, :2)", rows
        )
        connection.commit()

        # local file, local directory, database column
        loader_params = {
            "owner": uname,
            "tablename": "LANGCHAIN_DEMO",
            "colname": "TEXT",
        }

        # instantiate
        loader = OracleDocLoader(conn=connection, params=loader_params)

        # load
        docs = loader.load()

        # verify
        if len(docs) == 0:
            sys.exit(1)

        if _table_exists(connection, "LANGCHAIN_DEMO"):
            drop_table_purge(connection, "LANGCHAIN_DEMO")
    except Exception:
        sys.exit(1)

    try:
        # expectation : ORA-00942
        loader_params = {
            "owner": uname,
            "tablename": "COUNTRIES1",
            "colname": "COUNTRY_NAME",
        }

        # instantiate
        loader = OracleDocLoader(conn=connection, params=loader_params)

        # load
        docs = loader.load()
        if len(docs) == 0:
            pass
    except Exception:
        pass

    try:
        # expectation : file "SUDHIR" doesn't exist.
        loader_params = {"file": "SUDHIR"}

        # instantiate
        loader = OracleDocLoader(conn=connection, params=loader_params)

        # load
        docs = loader.load()
        if len(docs) == 0:
            pass
    except Exception:
        pass

    try:
        # expectation : path "SUDHIR" doesn't exist.
        loader_params = {"dir": "SUDHIR"}

        # instantiate
        loader = OracleDocLoader(conn=connection, params=loader_params)

        # load
        docs = loader.load()
        if len(docs) == 0:
            pass
    except Exception:
        pass


### Test splitter ####
def test_splitter_test() -> None:
    try:
        import oracledb
    except ImportError:
        return

    try:
        # oracle connection
        connection = oracledb.connect(user=uname, password=passwd, dsn=v_dsn)
        doc = """Langchain is a wonderful framework to load, split, chunk
        and embed your data!!"""

        # by words , max = 1000
        splitter_params = {
            "by": "words",
            "max": "1000",
            "overlap": "200",
            "split": "custom",
            "custom_list": [","],
            "extended": "true",
            "normalize": "all",
        }

        # instantiate
        splitter = OracleTextSplitter(conn=connection, params=splitter_params)

        # generate chunks
        chunks = splitter.split_text(doc)

        # verify
        if len(chunks) == 0:
            sys.exit(1)

        # by chars , max = 4000
        splitter_params = {
            "by": "chars",
            "max": "4000",
            "overlap": "800",
            "split": "NEWLINE",
            "normalize": "all",
        }

        # instantiate
        splitter = OracleTextSplitter(conn=connection, params=splitter_params)

        # generate chunks
        chunks = splitter.split_text(doc)

        # verify
        if len(chunks) == 0:
            sys.exit(1)

        # by words , max = 10
        splitter_params = {
            "by": "words",
            "max": "10",
            "overlap": "2",
            "split": "SENTENCE",
        }

        # instantiate
        splitter = OracleTextSplitter(conn=connection, params=splitter_params)

        # generate chunks
        chunks = splitter.split_text(doc)

        # verify
        if len(chunks) == 0:
            sys.exit(1)

        # by chars , max = 50
        splitter_params = {
            "by": "chars",
            "max": "50",
            "overlap": "10",
            "split": "SPACE",
            "normalize": "all",
        }

        # instantiate
        splitter = OracleTextSplitter(conn=connection, params=splitter_params)

        # generate chunks
        chunks = splitter.split_text(doc)

        # verify
        if len(chunks) == 0:
            sys.exit(1)
    except Exception:
        sys.exit(1)

    try:
        # ORA-20003: invalid value xyz for BY parameter
        splitter_params = {"by": "xyz"}

        # instantiate
        splitter = OracleTextSplitter(conn=connection, params=splitter_params)

        # generate chunks
        chunks = splitter.split_text(doc)

        # verify
        if len(chunks) == 0:
            pass
    except Exception:
        pass

    try:
        # Expectation: ORA-30584: invalid text chunking MAXIMUM - '10'
        splitter_params = {
            "by": "chars",
            "max": "10",
            "overlap": "2",
            "split": "SPACE",
            "normalize": "all",
        }

        # instantiate
        splitter = OracleTextSplitter(conn=connection, params=splitter_params)

        # generate chunks
        chunks = splitter.split_text(doc)

        # verify
        if len(chunks) == 0:
            pass
    except Exception:
        pass

    try:
        # Expectation: ORA-30584: invalid text chunking MAXIMUM - '5'
        splitter_params = {
            "by": "words",
            "max": "5",
            "overlap": "2",
            "split": "SPACE",
            "normalize": "all",
        }

        # instantiate
        splitter = OracleTextSplitter(conn=connection, params=splitter_params)

        # generate chunks
        chunks = splitter.split_text(doc)

        # verify
        if len(chunks) == 0:
            pass
    except Exception:
        pass

    try:
        # Expectation: ORA-30586: invalid text chunking SPLIT BY - SENTENCE
        splitter_params = {
            "by": "words",
            "max": "50",
            "overlap": "2",
            "split": "SENTENCE",
            "normalize": "all",
        }

        # instantiate
        splitter = OracleTextSplitter(conn=connection, params=splitter_params)

        # generate chunks
        chunks = splitter.split_text(doc)

        # verify
        if len(chunks) == 0:
            pass
    except Exception:
        pass


#### Test summary ####
def test_summary_test() -> None:
    try:
        import oracledb
    except ImportError:
        return

    try:
        # oracle connection
        connection = oracledb.connect(user=uname, password=passwd, dsn=v_dsn)

        # provider : Database, glevel : Paragraph
        summary_params = {
            "provider": "database",
            "glevel": "paragraph",
            "numParagraphs": 2,
            "language": "english",
        }

        # summary
        summary = OracleSummary(conn=connection, params=summary_params)

        doc = """It was 7 minutes after midnight. The dog was lying on the grass in
        of the lawn in front of Mrs Shears house. Its eyes were closed. It
        was running on its side, the way dogs run when they think they are
        cat in a dream. But the dog was not running or asleep. The dog was dead.
        was a garden fork sticking out of the dog. The points of the fork must
        gone all the way through the dog and into the ground because the fork
        not fallen over. I decided that the dog was probably killed with the
        because I could not see any other wounds in the dog and I do not think
        would stick a garden fork into a dog after it had died for some other
        like cancer for example, or a road accident. But I could not be certain"""

        summaries = summary.get_summary(doc)

        # verify
        if len(summaries) == 0:
            sys.exit(1)

        # provider : Database, glevel : Sentence
        summary_params = {"provider": "database", "glevel": "Sentence"}

        # summary
        summary = OracleSummary(conn=connection, params=summary_params)
        summaries = summary.get_summary(doc)

        # verify
        if len(summaries) == 0:
            sys.exit(1)

        # provider : Database, glevel : P
        summary_params = {"provider": "database", "glevel": "P"}

        # summary
        summary = OracleSummary(conn=connection, params=summary_params)
        summaries = summary.get_summary(doc)

        # verify
        if len(summaries) == 0:
            sys.exit(1)

        # provider : Database, glevel : S
        summary_params = {
            "provider": "database",
            "glevel": "S",
            "numParagraphs": 16,
            "language": "english",
        }

        # summary
        summary = OracleSummary(conn=connection, params=summary_params)
        summaries = summary.get_summary(doc)

        # verify
        if len(summaries) == 0:
            sys.exit(1)

        # provider : Database, glevel : S, doc = ' '
        summary_params = {"provider": "database", "glevel": "S", "numParagraphs": 2}

        # summary
        summary = OracleSummary(conn=connection, params=summary_params)

        doc = " "
        summaries = summary.get_summary(doc)

        # verify
        if len(summaries) == 0:
            sys.exit(1)
    except Exception:
        sys.exit(1)

    try:
        # Expectation : DRG-11002: missing value for PROVIDER
        summary_params = {"provider": "database1", "glevel": "S"}

        # summary
        summary = OracleSummary(conn=connection, params=summary_params)
        summaries = summary.get_summary(doc)

        # verify
        if len(summaries) == 0:
            pass
    except Exception:
        pass

    try:
        # Expectation : DRG-11425: gist level SUDHIR is invalid,
        # DRG-11427: valid gist level values are S, P
        summary_params = {"provider": "database", "glevel": "SUDHIR"}

        # summary
        summary = OracleSummary(conn=connection, params=summary_params)
        summaries = summary.get_summary(doc)

        # verify
        if len(summaries) == 0:
            pass
    except Exception:
        pass

    try:
        # Expectation : DRG-11441: gist numParagraphs -2 is invalid
        summary_params = {"provider": "database", "glevel": "S", "numParagraphs": -2}

        # summary
        summary = OracleSummary(conn=connection, params=summary_params)
        summaries = summary.get_summary(doc)

        # verify
        if len(summaries) == 0:
            pass
    except Exception:
        pass