From 1f751343e25e9679ac5861ec03fff02b86817af4 Mon Sep 17 00:00:00 2001
From: Harichandan Roy
Date: Mon, 3 Jun 2024 14:38:51 -0500
Subject: [PATCH] community[patch]: update embeddings/oracleai.py (#22240)

Thank you for contributing to LangChain!

- [ ] **PR title**: "package: description"
- Where "package" is whichever of langchain, community, core, experimental, etc. is being modified. Use "docs: ..." for purely docs changes, "templates: ..." for template changes, "infra: ..." for CI changes.
  - Example: "community: add foobar LLM"

"community/embeddings: update oracleai.py"

- [ ] **PR message**: ***Delete this entire checklist*** and replace with
  - **Description:** a description of the change
  - **Issue:** the issue # it fixes, if applicable
  - **Dependencies:** any dependencies required for this change
  - **Twitter handle:** if your PR gets announced, and you'd like a mention, we'll gladly shout you out!

Adding oracle VECTOR_ARRAY_T support.

- [ ] **Add tests and docs**: If you're adding a new integration, please include
  1. a test for the integration, preferably unit tests that do not rely on network access,
  2. an example notebook showing its use. It lives in `docs/docs/integrations` directory.

Tests are not impacted.

- [ ] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/

Done.

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in langchain.

If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
---
 cookbook/oracleai_demo.ipynb          |  2 -
 .../text_embedding/oracleai.ipynb     |  7 ---
 .../embeddings/oracleai.py            | 59 +++++++++++--------
 3 files changed, 36 insertions(+), 32 deletions(-)

diff --git a/cookbook/oracleai_demo.ipynb b/cookbook/oracleai_demo.ipynb
index ad0a6385cb7..8d67e122833 100644
--- a/cookbook/oracleai_demo.ipynb
+++ b/cookbook/oracleai_demo.ipynb
@@ -526,8 +526,6 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "***Note:*** Currently, OracleEmbeddings processes each embedding generation request individually, without batching, by calling REST endpoints separately for each request. This method could potentially lead to exceeding the maximum request per minute quota set by some providers. However, we are actively working to enhance this process by implementing request batching, which will allow multiple embedding requests to be combined into fewer API calls, thereby optimizing our use of provider resources and adhering to their request limits. This update is expected to be rolled out soon, eliminating the current limitation.\n",
-    "\n",
     "***Note:*** Users may need to configure a proxy to utilize third-party embedding generation providers, excluding the 'database' provider that utilizes an ONNX model."
    ]
   },
diff --git a/docs/docs/integrations/text_embedding/oracleai.ipynb b/docs/docs/integrations/text_embedding/oracleai.ipynb
index cfda80026ba..1cb2c2adca7 100644
--- a/docs/docs/integrations/text_embedding/oracleai.ipynb
+++ b/docs/docs/integrations/text_embedding/oracleai.ipynb
@@ -193,13 +193,6 @@
    "Oracle AI Vector Search provides multiple methods for generating embeddings, utilizing either locally hosted ONNX models or third-party APIs. For comprehensive instructions on configuring these alternatives, please refer to the [Oracle AI Vector Search Guide](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_vector_chain1.html#GUID-C6439E94-4E86-4ECD-954E-4B73D53579DE)."
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "***Note:*** Currently, OracleEmbeddings processes each embedding generation request individually, without batching, by calling REST endpoints separately for each request. This method could potentially lead to exceeding the maximum request per minute quota set by some providers. However, we are actively working to enhance this process by implementing request batching, which will allow multiple embedding requests to be combined into fewer API calls, thereby optimizing our use of provider resources and adhering to their request limits. This update is expected to be rolled out soon, eliminating the current limitation."
-   ]
-  },
  {
   "cell_type": "markdown",
   "metadata": {},
diff --git a/libs/community/langchain_community/embeddings/oracleai.py b/libs/community/langchain_community/embeddings/oracleai.py
index ca2dc7f5b73..24105cf0204 100644
--- a/libs/community/langchain_community/embeddings/oracleai.py
+++ b/libs/community/langchain_community/embeddings/oracleai.py
@@ -118,23 +118,29 @@ class OracleEmbeddings(BaseModel, Embeddings):
                     "begin utl_http.set_proxy(:proxy); end;", proxy=self.proxy
                 )
 
-            for text in texts:
-                cursor.execute(
-                    "select t.* "
-                    + "from dbms_vector_chain.utl_to_embeddings(:content, "
-                    + "json(:params)) t",
-                    content=text,
-                    params=json.dumps(self.params),
-                )
+            chunks = []
+            for i, text in enumerate(texts, start=1):
+                chunk = {"chunk_id": i, "chunk_data": text}
+                chunks.append(json.dumps(chunk))
 
-                for row in cursor:
-                    if row is None:
-                        embeddings.append([])
-                    else:
-                        rdata = json.loads(row[0])
-                        # dereference string as array
-                        vec = json.loads(rdata["embed_vector"])
-                        embeddings.append(vec)
+            vector_array_type = self.conn.gettype("SYS.VECTOR_ARRAY_T")
+            inputs = vector_array_type.newobject(chunks)
+            cursor.execute(
+                "select t.* "
+                + "from dbms_vector_chain.utl_to_embeddings(:content, "
+                + "json(:params)) t",
+                content=inputs,
+                params=json.dumps(self.params),
+            )
+
+            for row in cursor:
+                if row is None:
+                    embeddings.append([])
+                else:
+                    rdata = json.loads(row[0])
+                    # dereference string as array
+                    vec = json.loads(rdata["embed_vector"])
+                    embeddings.append(vec)
 
             cursor.close()
             return embeddings
@@ -159,20 +165,27 @@ class OracleEmbeddings(BaseModel, Embeddings):
 """
 # A sample unit test.
 
-''' get the Oracle connection '''
+import oracledb
+# get the Oracle connection
 conn = oracledb.connect(
-    user="",
-    password="",
-    dsn="")
+    user="",
+    password="",
+    dsn="/",
+)
 print("Oracle connection is established...")
 
-''' params '''
-embedder_params = {"provider":"database", "model":"demo_model"}
+# params
+embedder_params = {"provider": "database", "model": "demo_model"}
 proxy = ""
 
-''' instance '''
+# instance
 embedder = OracleEmbeddings(conn=conn, params=embedder_params, proxy=proxy)
 
+docs = ["hello world!", "hi everyone!", "greetings!"]
+embeds = embedder.embed_documents(docs)
+print(f"Total Embeddings: {len(embeds)}")
+print(f"Embedding generated by OracleEmbeddings: {embeds[0]}\n")
+
 embed = embedder.embed_query("Hello World!")
 print(f"Embedding generated by OracleEmbeddings: {embed}")
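
For anyone who wants to exercise the new code path outside of LangChain, the sketch below replays the same `SYS.VECTOR_ARRAY_T` batching pattern that the patched `embed_documents` uses, directly against python-oracledb. It is only an illustration: the empty connection credentials, the "demo_model" ONNX model name, and the `{"provider": "database"}` params are placeholders, and it assumes an Oracle Database where `DBMS_VECTOR_CHAIN` is available.

```python
import json

import oracledb

# Placeholder connection details -- substitute real values for your database.
conn = oracledb.connect(user="", password="", dsn="")

# Return CLOB/JSON columns as str instead of LOB locators, as embed_documents does.
oracledb.defaults.fetch_lobs = False

# Assumed params: an ONNX model named "demo_model" loaded in the database.
params = {"provider": "database", "model": "demo_model"}
texts = ["hello world!", "hi everyone!", "greetings!"]

# Wrap each input text as a JSON chunk, mirroring the patched embed_documents.
chunks = [
    json.dumps({"chunk_id": i, "chunk_data": text})
    for i, text in enumerate(texts, start=1)
]

with conn.cursor() as cursor:
    # Bind the whole batch at once through the SYS.VECTOR_ARRAY_T collection type.
    vector_array_type = conn.gettype("SYS.VECTOR_ARRAY_T")
    inputs = vector_array_type.newobject(chunks)
    cursor.execute(
        "select t.* "
        "from dbms_vector_chain.utl_to_embeddings(:content, json(:params)) t",
        content=inputs,
        params=json.dumps(params),
    )

    embeddings = []
    for row in cursor:
        # Each row is a JSON document whose embed_vector field is a stringified array.
        rdata = json.loads(row[0])
        embeddings.append(json.loads(rdata["embed_vector"]))

conn.close()
print(f"Generated {len(embeddings)} embeddings in a single round trip")
```

The point of the change is visible in the single `cursor.execute()` call: all documents travel in one round trip instead of one call per text, which is what addresses the per-minute request quotas described in the notes this patch removes from the notebooks.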