mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-12 15:59:56 +00:00
Update and simplify Oracle Database example cookbook (#31364)
This commit is contained in:
parent
c7e82ad95d
commit
b6f74bff40
@ -25,7 +25,7 @@
|
||||
" * [Oracle Blockchain](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_blockchain_table.html#GUID-B469E277-978E-4378-A8C1-26D3FF96C9A6)\n",
|
||||
" * [JSON](https://docs.oracle.com/en/database/oracle/oracle-database/23/adjsn/json-in-oracle-database.html)\n",
|
||||
"\n",
|
||||
"This guide demonstrates how Oracle AI Vector Search can be used with Langchain to serve an end-to-end RAG pipeline. This guide goes through examples of:\n",
|
||||
"This guide demonstrates how Oracle AI Vector Search can be used with LangChain to serve an end-to-end RAG pipeline. This guide goes through examples of:\n",
|
||||
"\n",
|
||||
" * Loading the documents from various sources using OracleDocLoader\n",
|
||||
" * Summarizing them within/outside the database using OracleSummary\n",
|
||||
@ -47,7 +47,19 @@
|
||||
"source": [
|
||||
"### Prerequisites\n",
|
||||
"\n",
|
||||
"Please install Oracle Python Client driver to use Langchain with Oracle AI Vector Search. "
|
||||
"Please install the Oracle Database [python-oracledb driver](https://pypi.org/project/oracledb/) to use LangChain with Oracle AI Vector Search:\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"$ python -m pip install --upgrade oracledb\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create Demo User\n",
|
||||
"First, connect as a privileged user to create a demo user with all the required privileges. Change the credentials for your environment. Also set the DEMO_PY_DIR path to a directory on the database host where your model file is located:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -56,65 +68,30 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# pip install oracledb"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create Demo User\n",
|
||||
"First, create a demo user with all the required privileges. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Connection successful!\n",
|
||||
"User setup done!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"\n",
|
||||
"import oracledb\n",
|
||||
"\n",
|
||||
"# Update with your username, password, hostname, and service_name\n",
|
||||
"username = \"\"\n",
|
||||
"# Please update with your SYSTEM (or privileged user) username, password, and database connection string\n",
|
||||
"username = \"SYSTEM\"\n",
|
||||
"password = \"\"\n",
|
||||
"dsn = \"\"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" conn = oracledb.connect(user=username, password=password, dsn=dsn)\n",
|
||||
"with oracledb.connect(user=username, password=password, dsn=dsn) as connection:\n",
|
||||
" print(\"Connection successful!\")\n",
|
||||
"\n",
|
||||
" cursor = conn.cursor()\n",
|
||||
" try:\n",
|
||||
" with connection.cursor() as cursor:\n",
|
||||
" cursor.execute(\n",
|
||||
" \"\"\"\n",
|
||||
" begin\n",
|
||||
" -- Drop user\n",
|
||||
" begin\n",
|
||||
" execute immediate 'drop user testuser cascade';\n",
|
||||
" exception\n",
|
||||
" when others then\n",
|
||||
" dbms_output.put_line('Error dropping user: ' || SQLERRM);\n",
|
||||
" end;\n",
|
||||
" \n",
|
||||
" execute immediate 'drop user if exists testuser cascade';\n",
|
||||
"\n",
|
||||
" -- Create user and grant privileges\n",
|
||||
" execute immediate 'create user testuser identified by testuser';\n",
|
||||
" execute immediate 'grant connect, unlimited tablespace, create credential, create procedure, create any index to testuser';\n",
|
||||
" execute immediate 'create or replace directory DEMO_PY_DIR as ''/scratch/hroy/view_storage/hroy_devstorage/demo/orachain''';\n",
|
||||
" execute immediate 'create or replace directory DEMO_PY_DIR as ''/home/yourname/demo/orachain''';\n",
|
||||
" execute immediate 'grant read, write on directory DEMO_PY_DIR to public';\n",
|
||||
" execute immediate 'grant create mining model to testuser';\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" -- Network access\n",
|
||||
" begin\n",
|
||||
" DBMS_NETWORK_ACL_ADMIN.APPEND_HOST_ACE(\n",
|
||||
@ -127,15 +104,7 @@
|
||||
" end;\n",
|
||||
" \"\"\"\n",
|
||||
" )\n",
|
||||
" print(\"User setup done!\")\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"User setup failed with error: {e}\")\n",
|
||||
" finally:\n",
|
||||
" cursor.close()\n",
|
||||
" conn.close()\n",
|
||||
"except Exception as e:\n",
|
||||
" print(f\"Connection failed with error: {e}\")\n",
|
||||
" sys.exit(1)"
|
||||
" print(\"User setup done!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -143,13 +112,13 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Process Documents using Oracle AI\n",
|
||||
"Consider the following scenario: users possess documents stored either in an Oracle Database or a file system and intend to utilize this data with Oracle AI Vector Search powered by Langchain.\n",
|
||||
"Consider the following scenario: users possess documents stored either in an Oracle Database or a file system and intend to utilize this data with Oracle AI Vector Search powered by LangChain.\n",
|
||||
"\n",
|
||||
"To prepare the documents for analysis, a comprehensive preprocessing workflow is necessary. Initially, the documents must be retrieved, summarized (if required), and chunked as needed. Subsequent steps involve generating embeddings for these chunks and integrating them into the Oracle AI Vector Store. Users can then conduct semantic searches on this data.\n",
|
||||
"\n",
|
||||
"The Oracle AI Vector Search Langchain library encompasses a suite of document processing tools that facilitate document loading, chunking, summary generation, and embedding creation.\n",
|
||||
"The Oracle AI Vector Search LangChain library encompasses a suite of document processing tools that facilitate document loading, chunking, summary generation, and embedding creation.\n",
|
||||
"\n",
|
||||
"In the sections that follow, we will detail the utilization of Oracle AI Langchain APIs to effectively implement each of these processes."
|
||||
"In the sections that follow, we will detail the utilization of Oracle AI LangChain APIs to effectively implement each of these processes."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -157,38 +126,24 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Connect to Demo User\n",
|
||||
"The following sample code will show how to connect to Oracle Database. By default, python-oracledb runs in a ‘Thin’ mode which connects directly to Oracle Database. This mode does not need Oracle Client libraries. However, some additional functionality is available when python-oracledb uses them. Python-oracledb is said to be in ‘Thick’ mode when Oracle Client libraries are used. Both modes have comprehensive functionality supporting the Python Database API v2.0 Specification. See the following [guide](https://python-oracledb.readthedocs.io/en/latest/user_guide/appendix_a.html#featuresummary) that talks about features supported in each mode. You might want to switch to thick-mode if you are unable to use thin-mode."
|
||||
"The following sample code shows how to connect to Oracle Database using the python-oracledb driver. By default, python-oracledb runs in a ‘Thin’ mode which connects directly to Oracle Database. This mode does not need Oracle Client libraries. However, some additional functionality is available when python-oracledb uses them. Python-oracledb is said to be in ‘Thick’ mode when Oracle Client libraries are used. Both modes have comprehensive functionality supporting the Python Database API v2.0 Specification. See the following [guide](https://python-oracledb.readthedocs.io/en/latest/user_guide/appendix_a.html#featuresummary) that talks about features supported in each mode. You can switch to Thick mode if you are unable to use Thin mode."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Connection successful!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"\n",
|
||||
"import oracledb\n",
|
||||
"\n",
|
||||
"# please update with your username, password, hostname and service_name\n",
|
||||
"username = \"\"\n",
|
||||
"# please update with your username, password, and database connection string\n",
|
||||
"username = \"testuser\"\n",
|
||||
"password = \"\"\n",
|
||||
"dsn = \"\"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" conn = oracledb.connect(user=username, password=password, dsn=dsn)\n",
|
||||
" print(\"Connection successful!\")\n",
|
||||
"except Exception as e:\n",
|
||||
" print(\"Connection failed!\")\n",
|
||||
" sys.exit(1)"
|
||||
"connection = oracledb.connect(user=username, password=password, dsn=dsn)\n",
|
||||
"print(\"Connection successful!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -201,22 +156,12 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 46,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Table created and populated.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"try:\n",
|
||||
" cursor = conn.cursor()\n",
|
||||
"\n",
|
||||
" drop_table_sql = \"\"\"drop table demo_tab\"\"\"\n",
|
||||
"with connection.cursor() as cursor:\n",
|
||||
" drop_table_sql = \"\"\"drop table if exists demo_tab\"\"\"\n",
|
||||
" cursor.execute(drop_table_sql)\n",
|
||||
"\n",
|
||||
" create_table_sql = \"\"\"create table demo_tab (id number, data clob)\"\"\"\n",
|
||||
@ -239,15 +184,9 @@
|
||||
" ]\n",
|
||||
" cursor.executemany(insert_row_sql, rows_to_insert)\n",
|
||||
"\n",
|
||||
" conn.commit()\n",
|
||||
"connection.commit()\n",
|
||||
"\n",
|
||||
" print(\"Table created and populated.\")\n",
|
||||
" cursor.close()\n",
|
||||
"except Exception as e:\n",
|
||||
" print(\"Table creation failed.\")\n",
|
||||
" cursor.close()\n",
|
||||
" conn.close()\n",
|
||||
" sys.exit(1)"
|
||||
"print(\"Table created and populated.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -261,30 +200,22 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Load ONNX Model\n",
|
||||
"### Load the ONNX Model\n",
|
||||
"\n",
|
||||
"Oracle accommodates a variety of embedding providers, enabling users to choose between proprietary database solutions and third-party services such as OCIGENAI and HuggingFace. This selection dictates the methodology for generating and managing embeddings.\n",
|
||||
"Oracle accommodates a variety of embedding providers, enabling you to choose between proprietary database solutions and third-party services such as Oracle Generative AI Service and HuggingFace. This selection dictates the methodology for generating and managing embeddings.\n",
|
||||
"\n",
|
||||
"***Important*** : Should users opt for the database option, they must upload an ONNX model into the Oracle Database. Conversely, if a third-party provider is selected for embedding generation, uploading an ONNX model to Oracle Database is not required.\n",
|
||||
"***Important*** : Should you opt for the database option, you must upload an ONNX model into the Oracle Database. Conversely, if a third-party provider is selected for embedding generation, uploading an ONNX model to Oracle Database is not required.\n",
|
||||
"\n",
|
||||
"A significant advantage of utilizing an ONNX model directly within Oracle is the enhanced security and performance it offers by eliminating the need to transmit data to external parties. Additionally, this method avoids the latency typically associated with network or REST API calls.\n",
|
||||
"A significant advantage of utilizing an ONNX model directly within Oracle Database is the enhanced security and performance it offers by eliminating the need to transmit data to external parties. Additionally, this method avoids the latency typically associated with network or REST API calls.\n",
|
||||
"\n",
|
||||
"Below is the example code to upload an ONNX model into Oracle Database:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 47,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"ONNX model loaded.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.embeddings.oracleai import OracleEmbeddings\n",
|
||||
"\n",
|
||||
@ -294,12 +225,8 @@
|
||||
"onnx_file = \"tinybert.onnx\"\n",
|
||||
"model_name = \"demo_model\"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" OracleEmbeddings.load_onnx_model(conn, onnx_dir, onnx_file, model_name)\n",
|
||||
" print(\"ONNX model loaded.\")\n",
|
||||
"except Exception as e:\n",
|
||||
" print(\"ONNX model loading failed!\")\n",
|
||||
" sys.exit(1)"
|
||||
"OracleEmbeddings.load_onnx_model(connection, onnx_dir, onnx_file, model_name)\n",
|
||||
"print(\"ONNX model loaded.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -321,8 +248,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"try:\n",
|
||||
" cursor = conn.cursor()\n",
|
||||
"with connection.cursor() as cursor:\n",
|
||||
" cursor.execute(\n",
|
||||
" \"\"\"\n",
|
||||
" declare\n",
|
||||
@ -349,12 +275,7 @@
|
||||
" params => json(jo.to_string));\n",
|
||||
" end;\n",
|
||||
" \"\"\"\n",
|
||||
" )\n",
|
||||
" cursor.close()\n",
|
||||
" print(\"Credentials created.\")\n",
|
||||
"except Exception as ex:\n",
|
||||
" cursor.close()\n",
|
||||
" raise"
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -362,33 +283,24 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Load Documents\n",
|
||||
"Users have the flexibility to load documents from either the Oracle Database, a file system, or both, by appropriately configuring the loader parameters. For comprehensive details on these parameters, please consult the [Oracle AI Vector Search Guide](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_vector_chain1.html#GUID-73397E89-92FB-48ED-94BB-1AD960C4EA1F).\n",
|
||||
"You have the flexibility to load documents from either the Oracle Database, a file system, or both, by appropriately configuring the loader parameters. For comprehensive details on these parameters, please consult the [Oracle AI Vector Search Guide](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_vector_chain1.html#GUID-73397E89-92FB-48ED-94BB-1AD960C4EA1F).\n",
|
||||
"\n",
|
||||
"A significant advantage of utilizing OracleDocLoader is its capability to process over 150 distinct file formats, eliminating the need for multiple loaders for different document types. For a complete list of the supported formats, please refer to the [Oracle Text Supported Document Formats](https://docs.oracle.com/en/database/oracle/oracle-database/23/ccref/oracle-text-supported-document-formats.html).\n",
|
||||
"\n",
|
||||
"Below is a sample code snippet that demonstrates how to use OracleDocLoader"
|
||||
"Below is a sample code snippet that demonstrates how to use OracleDocLoader:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Number of docs loaded: 3\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders.oracleai import OracleDocLoader\n",
|
||||
"from langchain_core.documents import Document\n",
|
||||
"\n",
|
||||
"# loading from Oracle Database table\n",
|
||||
"# make sure you have the table with this specification\n",
|
||||
"loader_params = {}\n",
|
||||
"loader_params = {\n",
|
||||
" \"owner\": \"testuser\",\n",
|
||||
" \"tablename\": \"demo_tab\",\n",
|
||||
@ -396,7 +308,7 @@
|
||||
"}\n",
|
||||
"\n",
|
||||
"\"\"\" load the docs \"\"\"\n",
|
||||
"loader = OracleDocLoader(conn=conn, params=loader_params)\n",
|
||||
"loader = OracleDocLoader(conn=connection, params=loader_params)\n",
|
||||
"docs = loader.load()\n",
|
||||
"\n",
|
||||
"\"\"\" verify \"\"\"\n",
|
||||
@ -409,23 +321,23 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Generate Summary\n",
|
||||
"Now that the user loaded the documents, they may want to generate a summary for each document. The Oracle AI Vector Search Langchain library offers a suite of APIs designed for document summarization. It supports multiple summarization providers such as Database, OCIGENAI, HuggingFace, among others, allowing users to select the provider that best meets their needs. To utilize these capabilities, users must configure the summary parameters as specified. For detailed information on these parameters, please consult the [Oracle AI Vector Search Guide book](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_vector_chain1.html#GUID-EC9DDB58-6A15-4B36-BA66-ECBA20D2CE57)."
|
||||
"Now that you have loaded the documents, you may want to generate a summary for each document. The Oracle AI Vector Search LangChain library offers a suite of APIs designed for document summarization. It supports multiple summarization providers such as Database, Oracle Generative AI Service, HuggingFace, among others, allowing you to select the provider that best meets your needs. To utilize these capabilities, you must configure the summary parameters as specified. For detailed information on these parameters, please consult the [Oracle AI Vector Search Guide book](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_vector_chain1.html#GUID-EC9DDB58-6A15-4B36-BA66-ECBA20D2CE57)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"***Note:*** The users may need to set proxy if they want to use some 3rd party summary generation providers other than Oracle's in-house and default provider: 'database'. If you don't have proxy, please remove the proxy parameter when you instantiate the OracleSummary."
|
||||
"***Note:*** You may need to set a proxy if you want to use third-party summary generation providers other than Oracle's in-house default provider, 'database'. If you don't have a proxy, please remove the proxy parameter when you instantiate OracleSummary."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# proxy to be used when we instantiate summary and embedder object\n",
|
||||
"# proxy to be used when we instantiate summary and embedder objects\n",
|
||||
"proxy = \"\""
|
||||
]
|
||||
},
|
||||
@ -433,22 +345,14 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The following sample code will show how to generate summary:"
|
||||
"The following sample code shows how to generate a summary:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Number of Summaries: 3\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.utilities.oracleai import OracleSummary\n",
|
||||
"from langchain_core.documents import Document\n",
|
||||
@ -463,7 +367,7 @@
|
||||
"\n",
|
||||
"# get the summary instance\n",
|
||||
"# Remove proxy if not required\n",
|
||||
"summ = OracleSummary(conn=conn, params=summary_params, proxy=proxy)\n",
|
||||
"summ = OracleSummary(conn=connection, params=summary_params, proxy=proxy)\n",
|
||||
"\n",
|
||||
"list_summary = []\n",
|
||||
"for doc in docs:\n",
|
||||
@ -487,17 +391,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 50,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Number of Chunks: 3\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders.oracleai import OracleTextSplitter\n",
|
||||
"from langchain_core.documents import Document\n",
|
||||
@ -506,7 +402,7 @@
|
||||
"splitter_params = {\"normalize\": \"all\"}\n",
|
||||
"\n",
|
||||
"\"\"\" get the splitter instance \"\"\"\n",
|
||||
"splitter = OracleTextSplitter(conn=conn, params=splitter_params)\n",
|
||||
"splitter = OracleTextSplitter(conn=connection, params=splitter_params)\n",
|
||||
"\n",
|
||||
"list_chunks = []\n",
|
||||
"for doc in docs:\n",
|
||||
@ -523,19 +419,19 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Generate Embeddings\n",
|
||||
"Now that the documents are chunked as per requirements, the users may want to generate embeddings for these chunks. Oracle AI Vector Search provides multiple methods for generating embeddings, utilizing either locally hosted ONNX models or third-party APIs. For comprehensive instructions on configuring these alternatives, please refer to the [Oracle AI Vector Search Guide](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_vector_chain1.html#GUID-C6439E94-4E86-4ECD-954E-4B73D53579DE)."
|
||||
"Now that the documents are chunked as per requirements, you may want to generate embeddings for these chunks. Oracle AI Vector Search provides multiple methods for generating embeddings, utilizing either locally hosted ONNX models or third-party APIs. For comprehensive instructions on configuring these alternatives, please refer to the [Oracle AI Vector Search Guide](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_vector_chain1.html#GUID-C6439E94-4E86-4ECD-954E-4B73D53579DE)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"***Note:*** Users may need to configure a proxy to utilize third-party embedding generation providers, excluding the 'database' provider that utilizes an ONNX model."
|
||||
"***Note:*** You may need to configure a proxy to utilize third-party embedding generation providers, excluding the 'database' provider that utilizes an ONNX model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -547,22 +443,14 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The following sample code will show how to generate embeddings:"
|
||||
"The following sample code shows how to generate embeddings:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 51,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Number of embeddings: 3\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.embeddings.oracleai import OracleEmbeddings\n",
|
||||
"from langchain_core.documents import Document\n",
|
||||
@ -572,7 +460,7 @@
|
||||
"\n",
|
||||
"# get the embedding instance\n",
|
||||
"# Remove proxy if not required\n",
|
||||
"embedder = OracleEmbeddings(conn=conn, params=embedder_params, proxy=proxy)\n",
|
||||
"embedder = OracleEmbeddings(conn=connection, params=embedder_params, proxy=proxy)\n",
|
||||
"\n",
|
||||
"embeddings = []\n",
|
||||
"for doc in docs:\n",
|
||||
@ -591,19 +479,19 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create Oracle AI Vector Store\n",
|
||||
"Now that you know how to use Oracle AI Langchain library APIs individually to process the documents, let us show how to integrate with Oracle AI Vector Store to facilitate the semantic searches."
|
||||
"Now that you know how to use the Oracle AI LangChain library APIs individually to process the documents, let us show how to integrate them with Oracle AI Vector Store to facilitate semantic searches."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"First, let's import all the dependencies."
|
||||
"First, let's import all the dependencies:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 52,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -626,100 +514,80 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next, let's combine all document processing stages together. Here is the sample code below:"
|
||||
"Next, let's combine all document processing stages together. Here is the sample code:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 53,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Connection successful!\n",
|
||||
"ONNX model loaded.\n",
|
||||
"Number of total chunks with metadata: 3\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\"\"\"\n",
|
||||
"In this sample example, we will use 'database' provider for both summary and embeddings.\n",
|
||||
"So, we don't need to do the followings:\n",
|
||||
"In this sample example, we will use 'database' provider for both summary and embeddings\n",
|
||||
"so, we don't need to do the following:\n",
|
||||
" - set proxy for 3rd party providers\n",
|
||||
" - create credential for 3rd party providers\n",
|
||||
"\n",
|
||||
"If you choose to use 3rd party provider, \n",
|
||||
"please follow the necessary steps for proxy and credential.\n",
|
||||
"If you choose to use 3rd party provider, please follow the necessary steps for proxy and credential.\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"# oracle connection\n",
|
||||
"# please update with your username, password, hostname, and service_name\n",
|
||||
"# please update with your username, password, and database connection string\n",
|
||||
"username = \"\"\n",
|
||||
"password = \"\"\n",
|
||||
"dsn = \"\"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" conn = oracledb.connect(user=username, password=password, dsn=dsn)\n",
|
||||
"with oracledb.connect(user=username, password=password, dsn=dsn) as connection:\n",
|
||||
" print(\"Connection successful!\")\n",
|
||||
"except Exception as e:\n",
|
||||
" print(\"Connection failed!\")\n",
|
||||
" sys.exit(1)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# load onnx model\n",
|
||||
"# please update with your related information\n",
|
||||
"onnx_dir = \"DEMO_PY_DIR\"\n",
|
||||
"onnx_file = \"tinybert.onnx\"\n",
|
||||
"model_name = \"demo_model\"\n",
|
||||
"try:\n",
|
||||
" OracleEmbeddings.load_onnx_model(conn, onnx_dir, onnx_file, model_name)\n",
|
||||
" # load onnx model\n",
|
||||
" # please update with your related information\n",
|
||||
" onnx_dir = \"DEMO_PY_DIR\"\n",
|
||||
" onnx_file = \"tinybert.onnx\"\n",
|
||||
" model_name = \"demo_model\"\n",
|
||||
" OracleEmbeddings.load_onnx_model(connection, onnx_dir, onnx_file, model_name)\n",
|
||||
" print(\"ONNX model loaded.\")\n",
|
||||
"except Exception as e:\n",
|
||||
" print(\"ONNX model loading failed!\")\n",
|
||||
" sys.exit(1)\n",
|
||||
"\n",
|
||||
" # params\n",
|
||||
" # please update necessary fields with related information\n",
|
||||
" loader_params = {\n",
|
||||
" \"owner\": \"testuser\",\n",
|
||||
" \"tablename\": \"demo_tab\",\n",
|
||||
" \"colname\": \"data\",\n",
|
||||
" }\n",
|
||||
" summary_params = {\n",
|
||||
" \"provider\": \"database\",\n",
|
||||
" \"glevel\": \"S\",\n",
|
||||
" \"numParagraphs\": 1,\n",
|
||||
" \"language\": \"english\",\n",
|
||||
" }\n",
|
||||
" splitter_params = {\"normalize\": \"all\"}\n",
|
||||
" embedder_params = {\"provider\": \"database\", \"model\": \"demo_model\"}\n",
|
||||
"\n",
|
||||
"# params\n",
|
||||
"# please update necessary fields with related information\n",
|
||||
"loader_params = {\n",
|
||||
" \"owner\": \"testuser\",\n",
|
||||
" \"tablename\": \"demo_tab\",\n",
|
||||
" \"colname\": \"data\",\n",
|
||||
"}\n",
|
||||
"summary_params = {\n",
|
||||
" \"provider\": \"database\",\n",
|
||||
" \"glevel\": \"S\",\n",
|
||||
" \"numParagraphs\": 1,\n",
|
||||
" \"language\": \"english\",\n",
|
||||
"}\n",
|
||||
"splitter_params = {\"normalize\": \"all\"}\n",
|
||||
"embedder_params = {\"provider\": \"database\", \"model\": \"demo_model\"}\n",
|
||||
" # instantiate loader, summary, splitter, and embedder\n",
|
||||
" loader = OracleDocLoader(conn=connection, params=loader_params)\n",
|
||||
" summary = OracleSummary(conn=connection, params=summary_params)\n",
|
||||
" splitter = OracleTextSplitter(conn=connection, params=splitter_params)\n",
|
||||
" embedder = OracleEmbeddings(conn=connection, params=embedder_params)\n",
|
||||
"\n",
|
||||
"# instantiate loader, summary, splitter, and embedder\n",
|
||||
"loader = OracleDocLoader(conn=conn, params=loader_params)\n",
|
||||
"summary = OracleSummary(conn=conn, params=summary_params)\n",
|
||||
"splitter = OracleTextSplitter(conn=conn, params=splitter_params)\n",
|
||||
"embedder = OracleEmbeddings(conn=conn, params=embedder_params)\n",
|
||||
" # process the documents\n",
|
||||
" chunks_with_mdata = []\n",
|
||||
" for id, doc in enumerate(docs, start=1):\n",
|
||||
" summ = summary.get_summary(doc.page_content)\n",
|
||||
" chunks = splitter.split_text(doc.page_content)\n",
|
||||
" for ic, chunk in enumerate(chunks, start=1):\n",
|
||||
" chunk_metadata = doc.metadata.copy()\n",
|
||||
" chunk_metadata[\"id\"] = (\n",
|
||||
" chunk_metadata[\"_oid\"] + \"$\" + str(id) + \"$\" + str(ic)\n",
|
||||
" )\n",
|
||||
" chunk_metadata[\"document_id\"] = str(id)\n",
|
||||
" chunk_metadata[\"document_summary\"] = str(summ[0])\n",
|
||||
" chunks_with_mdata.append(\n",
|
||||
" Document(page_content=str(chunk), metadata=chunk_metadata)\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"# process the documents\n",
|
||||
"chunks_with_mdata = []\n",
|
||||
"for id, doc in enumerate(docs, start=1):\n",
|
||||
" summ = summary.get_summary(doc.page_content)\n",
|
||||
" chunks = splitter.split_text(doc.page_content)\n",
|
||||
" for ic, chunk in enumerate(chunks, start=1):\n",
|
||||
" chunk_metadata = doc.metadata.copy()\n",
|
||||
" chunk_metadata[\"id\"] = chunk_metadata[\"_oid\"] + \"$\" + str(id) + \"$\" + str(ic)\n",
|
||||
" chunk_metadata[\"document_id\"] = str(id)\n",
|
||||
" chunk_metadata[\"document_summary\"] = str(summ[0])\n",
|
||||
" chunks_with_mdata.append(\n",
|
||||
" Document(page_content=str(chunk), metadata=chunk_metadata)\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"\"\"\" verify \"\"\"\n",
|
||||
"print(f\"Number of total chunks with metadata: {len(chunks_with_mdata)}\")"
|
||||
" \"\"\" verify \"\"\"\n",
|
||||
" print(f\"Number of total chunks with metadata: {len(chunks_with_mdata)}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -733,23 +601,15 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 55,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Vector Store Table: oravs\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# create Oracle AI Vector Store\n",
|
||||
"vectorstore = OracleVS.from_documents(\n",
|
||||
" chunks_with_mdata,\n",
|
||||
" embedder,\n",
|
||||
" client=conn,\n",
|
||||
" client=connection,\n",
|
||||
" table_name=\"oravs\",\n",
|
||||
" distance_strategy=DistanceStrategy.DOT_PRODUCT,\n",
|
||||
")\n",
|
||||
@ -778,12 +638,12 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 56,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"oraclevs.create_index(\n",
|
||||
" conn, vectorstore, params={\"idx_name\": \"hnsw_oravs\", \"idx_type\": \"HNSW\"}\n",
|
||||
" connection, vectorstore, params={\"idx_name\": \"hnsw_oravs\", \"idx_type\": \"HNSW\"}\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"Index created.\")"
|
||||
@ -793,7 +653,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This example demonstrates the creation of a default HNSW index on embeddings within the 'oravs' table. Users may adjust various parameters according to their specific needs. For detailed information on these parameters, please consult the [Oracle AI Vector Search Guide book](https://docs.oracle.com/en/database/oracle/oracle-database/23/vecse/manage-different-categories-vector-indexes.html).\n",
|
||||
"This example creates an HNSW index with default settings on the embeddings in the 'oravs' table. You may adjust the index parameters according to your specific needs. For detailed information on these parameters, please consult the [Oracle AI Vector Search User's Guide](https://docs.oracle.com/en/database/oracle/oracle-database/23/vecse/manage-different-categories-vector-indexes.html).\n",
|
||||
"\n",
|
||||
"Additionally, various types of vector indices can be created to meet diverse requirements. More details can be found in our [comprehensive guide](https://python.langchain.com/v0.1/docs/integrations/vectorstores/oracle/).\n"
|
||||
]
|
||||
@ -805,29 +665,16 @@
|
||||
"## Perform Semantic Search\n",
|
||||
"All set!\n",
|
||||
"\n",
|
||||
"We have successfully processed the documents and stored them in the vector store, followed by the creation of an index to enhance query performance. We are now prepared to proceed with semantic searches.\n",
|
||||
"You have successfully processed the documents, stored them in the vector store, and created an index to improve query performance. You are now ready to run semantic searches.\n",
|
||||
"\n",
|
||||
"Below is the sample code for this process:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 58,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[Document(page_content='The database stores LOBs differently from other data types. Creating a LOB column implicitly creates a LOB segment and a LOB index. The tablespace containing the LOB segment and LOB index, which are always stored together, may be different from the tablespace containing the table. Sometimes the database can store small amounts of LOB data in the table itself rather than in a separate LOB segment.', metadata={'_oid': '662f2f257677f3c2311a8ff999fd34e5', '_rowid': 'AAAR/xAAEAAAAAnAAC', 'id': '662f2f257677f3c2311a8ff999fd34e5$3$1', 'document_id': '3', 'document_summary': 'Sometimes the database can store small amounts of LOB data in the table itself rather than in a separate LOB segment.\\n\\n'})]\n",
|
||||
"[]\n",
|
||||
"[(Document(page_content='The database stores LOBs differently from other data types. Creating a LOB column implicitly creates a LOB segment and a LOB index. The tablespace containing the LOB segment and LOB index, which are always stored together, may be different from the tablespace containing the table. Sometimes the database can store small amounts of LOB data in the table itself rather than in a separate LOB segment.', metadata={'_oid': '662f2f257677f3c2311a8ff999fd34e5', '_rowid': 'AAAR/xAAEAAAAAnAAC', 'id': '662f2f257677f3c2311a8ff999fd34e5$3$1', 'document_id': '3', 'document_summary': 'Sometimes the database can store small amounts of LOB data in the table itself rather than in a separate LOB segment.\\n\\n'}), 0.055675752460956573)]\n",
|
||||
"[]\n",
|
||||
"[Document(page_content='If the answer to any preceding questions is yes, then the database stops the search and allocates space from the specified tablespace; otherwise, space is allocated from the database default shared temporary tablespace.', metadata={'_oid': '662f2f253acf96b33b430b88699490a2', '_rowid': 'AAAR/xAAEAAAAAnAAA', 'id': '662f2f253acf96b33b430b88699490a2$1$1', 'document_id': '1', 'document_summary': 'If the answer to any preceding questions is yes, then the database stops the search and allocates space from the specified tablespace; otherwise, space is allocated from the database default shared temporary tablespace.\\n\\n'})]\n",
|
||||
"[Document(page_content='If the answer to any preceding questions is yes, then the database stops the search and allocates space from the specified tablespace; otherwise, space is allocated from the database default shared temporary tablespace.', metadata={'_oid': '662f2f253acf96b33b430b88699490a2', '_rowid': 'AAAR/xAAEAAAAAnAAA', 'id': '662f2f253acf96b33b430b88699490a2$1$1', 'document_id': '1', 'document_summary': 'If the answer to any preceding questions is yes, then the database stops the search and allocates space from the specified tablespace; otherwise, space is allocated from the database default shared temporary tablespace.\\n\\n'})]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query = \"What is Oracle AI Vector Store?\"\n",
|
||||
"filter = {\"document_id\": [\"1\"]}\n",
|
||||
@ -872,7 +719,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
"version": "3.13.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
Loading…
Reference in New Issue
Block a user