Dhiru Pandey 2025-07-28 23:11:53 +00:00 committed by GitHub
commit c873c722b7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 5475 additions and 0 deletions

View File

@ -0,0 +1,382 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6eb5f05c-0488-4563-a81f-d8e6c2c6b7d6",
"metadata": {},
"source": [
"# Coherence\n",
"\n",
"This notebook covers how to get started with the `Coherence` vector store.\n",
"\n",
">[Coherence](https://www.oracle.com/java/coherence/) is an in-memory data grid that provides a distributed, fault-tolerant, and scalable platform for managing and accessing data. It is primarily used for high-performance, mission-critical enterprise applications that require low-latency access to large datasets. In addition to the commercially available product, Oracle also offers [Coherence CE (Community Edition)](https://github.com/oracle/coherence).\n"
]
},
{
"cell_type": "markdown",
"id": "59238b02-359e-4ac3-939d-e88735183f28",
"metadata": {},
"source": [
"## Setup\n",
"\n",
"To access `Coherence` vector stores you'll need to install the `langchain-coherence` integration package.\n",
"\n",
"```\n",
"pip install langchain_coherence\n",
"```\n"
]
},
{
"cell_type": "markdown",
"id": "35761315-a34e-4f03-9246-459c1c36331b",
"metadata": {},
"source": [
"## Initialization"
]
},
{
"cell_type": "markdown",
"id": "592b5db4-807c-4a04-ac64-f0077c44edfa",
"metadata": {},
"source": [
"### Usage\n",
"\n",
"Before using LangChain's CoherenceVectorStore you must ensure that a Coherence server ([Coherence CE](https://github.com/oracle/coherence) 25.03+ or [Oracle Coherence](https://www.oracle.com/java/coherence/) 14.1.2+) is running \n",
"\n",
"For local development, we recommend using the Coherence CE container image:\n",
"```\n",
"docker run -d -p 1408:1408 ghcr.io/oracle/coherence-ce:25.03.2\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "86544bf7-8459-40cd-813c-cdbbcde9084f",
"metadata": {},
"source": [
"### Basic Initialization"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1bdd7da5-c050-47b0-a08c-52d1bd7b6948",
"metadata": {},
"outputs": [
{
"ename": "RuntimeError",
"evalue": "Unexpected error, <AioRpcError of RPC that terminated with:\n\tstatus = StatusCode.UNAVAILABLE\n\tdetails = \"connections to all backends failing; last error: UNKNOWN: ipv4:127.0.0.1:1408: Failed to connect to remote host: connect: Connection refused (61)\"\n\tdebug_error_string = \"UNKNOWN:Error received from peer {grpc_status:14, grpc_message:\"connections to all backends failing; last error: UNKNOWN: ipv4:127.0.0.1:1408: Failed to connect to remote host: connect: Connection refused (61)\"}\"\n>, when attempting to handshake with proxy: connections to all backends failing; last error: UNKNOWN: ipv4:127.0.0.1:1408: Failed to connect to remote host: connect: Connection refused (61)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAioRpcError\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32m~/work/coherence/github/dhirupandey/langchain/libs/partners/coherence/.venv/lib/python3.9/site-packages/coherence/client.py:113\u001b[0m, in \u001b[0;36m_Handshake.handshake\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 113\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m stream\u001b[38;5;241m.\u001b[39mwrite(RequestFactoryV1\u001b[38;5;241m.\u001b[39minit_sub_channel())\n\u001b[1;32m 114\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m asyncio\u001b[38;5;241m.\u001b[39mwait_for(stream\u001b[38;5;241m.\u001b[39mread(), \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_session\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mrequest_timeout_seconds)\n",
"File \u001b[0;32m~/work/coherence/github/dhirupandey/langchain/libs/partners/coherence/.venv/lib/python3.9/site-packages/grpc/aio/_call.py:526\u001b[0m, in \u001b[0;36m_StreamRequestMixin.write\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 525\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_raise_for_different_style(_APIStyle\u001b[38;5;241m.\u001b[39mREADER_WRITER)\n\u001b[0;32m--> 526\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_write(request)\n",
"File \u001b[0;32m~/work/coherence/github/dhirupandey/langchain/libs/partners/coherence/.venv/lib/python3.9/site-packages/grpc/aio/_call.py:495\u001b[0m, in \u001b[0;36m_StreamRequestMixin._write\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 494\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdone():\n\u001b[0;32m--> 495\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_raise_for_status()\n\u001b[1;32m 497\u001b[0m serialized_request \u001b[38;5;241m=\u001b[39m _common\u001b[38;5;241m.\u001b[39mserialize(\n\u001b[1;32m 498\u001b[0m request, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_request_serializer\n\u001b[1;32m 499\u001b[0m )\n",
"File \u001b[0;32m~/work/coherence/github/dhirupandey/langchain/libs/partners/coherence/.venv/lib/python3.9/site-packages/grpc/aio/_call.py:272\u001b[0m, in \u001b[0;36mCall._raise_for_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 271\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m code \u001b[38;5;241m!=\u001b[39m grpc\u001b[38;5;241m.\u001b[39mStatusCode\u001b[38;5;241m.\u001b[39mOK:\n\u001b[0;32m--> 272\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m _create_rpc_error(\n\u001b[1;32m 273\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minitial_metadata(), \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_cython_call\u001b[38;5;241m.\u001b[39mstatus()\n\u001b[1;32m 274\u001b[0m )\n",
"\u001b[0;31mAioRpcError\u001b[0m: <AioRpcError of RPC that terminated with:\n\tstatus = StatusCode.UNAVAILABLE\n\tdetails = \"connections to all backends failing; last error: UNKNOWN: ipv4:127.0.0.1:1408: Failed to connect to remote host: connect: Connection refused (61)\"\n\tdebug_error_string = \"UNKNOWN:Error received from peer {grpc_status:14, grpc_message:\"connections to all backends failing; last error: UNKNOWN: ipv4:127.0.0.1:1408: Failed to connect to remote host: connect: Connection refused (61)\"}\"\n>",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[5], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mcoherence\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m NamedMap, Session\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mlangchain_coherence\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m CoherenceVectorStore\n\u001b[0;32m----> 8\u001b[0m session: Session \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m Session\u001b[38;5;241m.\u001b[39mcreate()\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 10\u001b[0m named_map: NamedMap[\u001b[38;5;28mstr\u001b[39m, Document] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m session\u001b[38;5;241m.\u001b[39mget_map(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmy-map\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"File \u001b[0;32m~/work/coherence/github/dhirupandey/langchain/libs/partners/coherence/.venv/lib/python3.9/site-packages/coherence/client.py:1932\u001b[0m, in \u001b[0;36mSession.create\u001b[0;34m(session_options)\u001b[0m\n\u001b[1;32m 1930\u001b[0m session: Session \u001b[38;5;241m=\u001b[39m Session(session_options)\n\u001b[1;32m 1931\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m session\u001b[38;5;241m.\u001b[39m_set_ready(\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m-> 1932\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m session\u001b[38;5;241m.\u001b[39m_handshake\u001b[38;5;241m.\u001b[39mhandshake()\n\u001b[1;32m 1933\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m session\u001b[38;5;241m.\u001b[39m_protocol_version \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 1934\u001b[0m COH_LOG\u001b[38;5;241m.\u001b[39minfo(\n\u001b[1;32m 1935\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSession(id=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msession\u001b[38;5;241m.\u001b[39msession_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, connected to [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msession\u001b[38;5;241m.\u001b[39m_session_options\u001b[38;5;241m.\u001b[39maddress\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1936\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m proxy-version=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msession\u001b[38;5;241m.\u001b[39m_proxy_version\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, protocol-version=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msession\u001b[38;5;241m.\u001b[39m_protocol_version\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1937\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m proxy-member-id=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msession\u001b[38;5;241m.\u001b[39m_proxy_member_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1938\u001b[0m )\n",
"File \u001b[0;32m~/work/coherence/github/dhirupandey/langchain/libs/partners/coherence/.venv/lib/python3.9/site-packages/coherence/client.py:129\u001b[0m, in \u001b[0;36m_Handshake.handshake\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m 128\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 129\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 130\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnexpected error, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, when attempting to handshake with proxy: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;241m.\u001b[39mdetails()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 131\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01me\u001b[39;00m\n\u001b[1;32m 132\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m asyncio\u001b[38;5;241m.\u001b[39mTimeoutError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHandshake with proxy timed out\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01me\u001b[39;00m\n",
"\u001b[0;31mRuntimeError\u001b[0m: Unexpected error, <AioRpcError of RPC that terminated with:\n\tstatus = StatusCode.UNAVAILABLE\n\tdetails = \"connections to all backends failing; last error: UNKNOWN: ipv4:127.0.0.1:1408: Failed to connect to remote host: connect: Connection refused (61)\"\n\tdebug_error_string = \"UNKNOWN:Error received from peer {grpc_status:14, grpc_message:\"connections to all backends failing; last error: UNKNOWN: ipv4:127.0.0.1:1408: Failed to connect to remote host: connect: Connection refused (61)\"}\"\n>, when attempting to handshake with proxy: connections to all backends failing; last error: UNKNOWN: ipv4:127.0.0.1:1408: Failed to connect to remote host: connect: Connection refused (61)"
]
}
],
"source": [
"from langchain_core.documents import Document\n",
"from langchain_core.embeddings import Embeddings\n",
"from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",
"\n",
"from coherence import NamedMap, Session\n",
"from langchain_coherence import CoherenceVectorStore\n",
"\n",
"session: Session = await Session.create()\n",
"try:\n",
" named_map: NamedMap[str, Document] = await session.get_map(\"my-map\")\n",
" embedding :Embeddings = HuggingFaceEmbeddings(\n",
" model_name=\"sentence-transformers/all-MiniLM-l6-v2\")\n",
" # this embedding generates vectors of dimension 384\n",
" cvs :CoherenceVectorStore = await CoherenceVectorStore.create(\n",
" named_map,embedding,384)\n",
" # other operations on the CoherenceVectorStore can be done\n",
"finally:\n",
" await session.close()"
]
},
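{
"cell_type": "markdown",
"id": "0f2a7c1e-3b4d-4e5f-8a6b-7c8d9e0f1a2b",
"metadata": {},
"source": [
"By default, `Session.create()` connects to the Coherence gRPC proxy at `localhost:1408`, which matches the port mapping in the container command above. See the [coherence-py-client documentation](https://oracle.github.io/coherence-py-client/) for configuring a different address."
]
},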
{
"cell_type": "markdown",
"id": "5ca1dbd0-4fec-4907-8918-0a438a1b2535",
"metadata": {},
"source": [
"## Manage vector store"
]
},
{
"cell_type": "raw",
"id": "e7171672-4453-4b6f-afe6-071980908a5f",
"metadata": {},
"source": [
"### Add Documents and retrieve them:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d52c0187-f5c4-4c69-9052-0b8998945680",
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.documents import Document\n",
"from langchain_core.embeddings import Embeddings\n",
"from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",
"\n",
"from coherence import NamedMap, Session\n",
"from langchain_core.vectorstores.coherence_store import CoherenceVectorStore\n",
"\n",
"session: Session = await Session.create()\n",
"try:\n",
" named_map: NamedMap[str, Document] = await session.get_map(\"my-map\")\n",
" embedding :Embeddings = HuggingFaceEmbeddings(\n",
" model_name=\"sentence-transformers/all-MiniLM-l6-v2\")\n",
" # this embedding generates vectors of dimension 384\n",
" cvs :CoherenceVectorStore = await CoherenceVectorStore.create(\n",
" named_map,embedding,384)\n",
" d1 :Document = Document(id=\"1\", page_content=\"apple\")\n",
" d2 :Document = Document(id=\"2\", page_content=\"orange\")\n",
" documents = [d1, d2]\n",
" await cvs.aadd_documents(documents)\n",
"\n",
" ids = [doc.id for doc in documents]\n",
" l = await cvs.aget_by_ids(ids)\n",
" assert len(l) == len(ids)\n",
" print(\"====\")\n",
" for e in l:\n",
" print(e)\n",
"finally:\n",
" await session.close()"
]
},
{
"cell_type": "raw",
"id": "f0f7215e-16a4-4fe7-8070-34c385aeeead",
"metadata": {},
"source": [
"### Delete Documents:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "11328dac-633e-4155-8f19-95ef2bfa3d06",
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.documents import Document\n",
"from langchain_core.embeddings import Embeddings\n",
"from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",
"\n",
"from coherence import NamedMap, Session\n",
"from langchain_core.vectorstores.coherence_store import CoherenceVectorStore\n",
"\n",
"session: Session = await Session.create()\n",
"try:\n",
" named_map: NamedMap[str, Document] = await session.get_map(\"my-map\")\n",
" embedding :Embeddings = HuggingFaceEmbeddings(\n",
" model_name=\"sentence-transformers/all-MiniLM-l6-v2\")\n",
" # this embedding generates vectors of dimension 384\n",
" cvs :CoherenceVectorStore = await CoherenceVectorStore.create(\n",
" named_map,embedding,384)\n",
" d1 :Document = Document(id=\"1\", page_content=\"apple\")\n",
" d2 :Document = Document(id=\"2\", page_content=\"orange\")\n",
" documents = [d1, d2]\n",
" await cvs.aadd_documents(documents)\n",
"\n",
" ids = [doc.id for doc in documents]\n",
" await cvs.adelete(ids)\n",
"finally:\n",
" await session.close()"
]
},
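{
"cell_type": "markdown",
"id": "1a2b3c4d-5e6f-4a7b-8c9d-0e1f2a3b4c5d",
"metadata": {},
"source": [
"### Creating a vector index:\n",
"\n",
"For larger caches, Coherence can build an HNSW index over the stored embeddings via `add_index`, which takes the dimension of the vectors produced by your embedding model. A minimal sketch, assuming the same 384-dimension model used above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2b3c4d5e-6f7a-4b8c-9d0e-1f2a3b4c5d6e",
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.documents import Document\n",
"from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",
"\n",
"from coherence import NamedMap, Session\n",
"from langchain_coherence import CoherenceVectorStore\n",
"\n",
"session: Session = await Session.create()\n",
"try:\n",
"    named_map: NamedMap[str, Document] = await session.get_map(\"my-map\")\n",
"    embedding = HuggingFaceEmbeddings(\n",
"        model_name=\"sentence-transformers/all-MiniLM-L6-v2\"\n",
"    )\n",
"    cvs = await CoherenceVectorStore.create(named_map, embedding)\n",
"    # build an HNSW index over the embedding vectors; the dimension must\n",
"    # match the embedding model (all-MiniLM-L6-v2 produces 384 dimensions)\n",
"    await cvs.add_index(384)\n",
"    # ... add documents and run searches; they can now use the index ...\n",
"    await cvs.remove_index()\n",
"finally:\n",
"    await session.close()"
]
},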
{
"cell_type": "code",
"execution_count": null,
"id": "8241b585-64ae-447a-b1a7-860d3c51f823",
"metadata": {},
"outputs": [],
"source": [
"## Query vector store"
]
},
{
"cell_type": "markdown",
"id": "a8c6ad91-ee54-486a-888f-cbfc89be75fd",
"metadata": {},
"source": [
"### Similarity Search:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ffe24b05-7bf1-4eaa-a030-6ac3a0446f29",
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.documents import Document\n",
"from langchain_core.embeddings import Embeddings\n",
"from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",
"\n",
"from coherence import NamedMap, Session\n",
"from langchain_core.vectorstores.coherence_store import CoherenceVectorStore\n",
"\n",
"def test_data():\n",
" d1 :Document = Document(id=\"1\", page_content=\"apple\")\n",
" d2 :Document = Document(id=\"2\", page_content=\"orange\")\n",
" d3 :Document = Document(id=\"3\", page_content=\"tiger\")\n",
" d4 :Document = Document(id=\"4\", page_content=\"cat\")\n",
" d5 :Document = Document(id=\"5\", page_content=\"dog\")\n",
" d6 :Document = Document(id=\"6\", page_content=\"fox\")\n",
" d7 :Document = Document(id=\"7\", page_content=\"pear\")\n",
" d8 :Document = Document(id=\"8\", page_content=\"banana\")\n",
" d9 :Document = Document(id=\"9\", page_content=\"plum\")\n",
" d10 :Document = Document(id=\"10\", page_content=\"lion\")\n",
"\n",
" documents = [d1, d2, d3, d4, d5, d6, d7, d8, d9, d10]\n",
" return documents\n",
"\n",
"async def test_asimilarity_search():\n",
" documents = test_data()\n",
" session: Session = await Session.create()\n",
" try:\n",
" named_map: NamedMap[str, Document] = await session.get_map(\"my-map\")\n",
" embedding :Embeddings = HuggingFaceEmbeddings(\n",
" model_name=\"sentence-transformers/all-MiniLM-l6-v2\")\n",
" # this embedding generates vectors of dimension 384\n",
" cvs :CoherenceVectorStore = await CoherenceVectorStore.create(\n",
" named_map,embedding,384)\n",
" await cvs.aadd_documents(documents)\n",
" ids = [doc.id for doc in documents]\n",
" l = await cvs.aget_by_ids(ids)\n",
" assert len(l) == 10\n",
"\n",
" result = await cvs.asimilarity_search(\"fruit\")\n",
" assert len(result) == 4\n",
" print(\"====\")\n",
" for e in result:\n",
" print(e)\n",
" finally:\n",
" await session.close()"
]
},
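{
"cell_type": "markdown",
"id": "3c4d5e6f-7a8b-4c9d-8e1f-2a3b4c5d6e7f",
"metadata": {},
"source": [
"The search methods also accept a few optional Coherence-specific keyword arguments, documented in the `CoherenceVectorStore` docstrings: `algorithm` (a `DistanceAlgorithm`, default `CosineDistance`), `filter` (a Coherence `Filter` limiting which entries are searched), and `brute_force` (compare against every entry, ignoring any index). A minimal sketch, reusing `test_data()` and the imports from the previous cell:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d5e6f7a-8b9c-4d0e-9f2a-3b4c5d6e7f8a",
"metadata": {},
"outputs": [],
"source": [
"from coherence.ai import CosineDistance\n",
"\n",
"documents = test_data()\n",
"session: Session = await Session.create()\n",
"try:\n",
"    named_map: NamedMap[str, Document] = await session.get_map(\"my-map\")\n",
"    embedding: Embeddings = HuggingFaceEmbeddings(\n",
"        model_name=\"sentence-transformers/all-MiniLM-L6-v2\"\n",
"    )\n",
"    cvs: CoherenceVectorStore = await CoherenceVectorStore.create(named_map, embedding)\n",
"    await cvs.aadd_documents(documents)\n",
"\n",
"    result = await cvs.asimilarity_search(\n",
"        \"fruit\",\n",
"        k=2,\n",
"        algorithm=CosineDistance(),  # the default distance algorithm\n",
"        brute_force=True,  # scan every entry instead of using an index\n",
"    )\n",
"    for doc in result:\n",
"        print(doc)\n",
"finally:\n",
"    await session.close()"
]
},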
{
"cell_type": "code",
"execution_count": null,
"id": "6862ae65-a3d6-46e4-9e85-f3f64f2add5c",
"metadata": {},
"outputs": [],
"source": [
"## Usage for retrieval-augmented generation"
]
},
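{
"cell_type": "markdown",
"id": "5e6f7a8b-9c0d-4e1f-aa3b-4c5d6e7f8a9b",
"metadata": {},
"source": [
"`CoherenceVectorStore` inherits `as_retriever()` from LangChain's `VectorStore` base class, so it can be plugged into retrieval-augmented generation chains. Because this store implements only the async search methods, use the retriever's async API. A minimal sketch, reusing `test_data()` and the imports from above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f7a8b9c-0d1e-4f2a-bb4c-5d6e7f8a9b0c",
"metadata": {},
"outputs": [],
"source": [
"documents = test_data()\n",
"session: Session = await Session.create()\n",
"try:\n",
"    named_map: NamedMap[str, Document] = await session.get_map(\"my-map\")\n",
"    embedding: Embeddings = HuggingFaceEmbeddings(\n",
"        model_name=\"sentence-transformers/all-MiniLM-L6-v2\"\n",
"    )\n",
"    cvs: CoherenceVectorStore = await CoherenceVectorStore.create(named_map, embedding)\n",
"    await cvs.aadd_documents(documents)\n",
"\n",
"    retriever = cvs.as_retriever(search_kwargs={\"k\": 2})\n",
"    # only the async retriever API is supported by this store\n",
"    result = await retriever.ainvoke(\"fruit\")\n",
"    for doc in result:\n",
"        print(doc)\n",
"finally:\n",
"    await session.close()"
]
},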
{
"cell_type": "markdown",
"id": "5411477e-5905-4a96-80d8-b8d2238c4bc4",
"metadata": {},
"source": [
"### Similarity Search by vector :"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbb07be8-b13e-4bd5-a9ea-94603ff2a6e4",
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.documents import Document\n",
"from langchain_core.embeddings import Embeddings\n",
"from langchain_huggingface.embeddings import HuggingFaceEmbeddings\n",
"\n",
"from coherence import NamedMap, Session\n",
"from langchain_core.vectorstores.coherence_store import CoherenceVectorStore\n",
"\n",
"def test_data():\n",
" d1 :Document = Document(id=\"1\", page_content=\"apple\")\n",
" d2 :Document = Document(id=\"2\", page_content=\"orange\")\n",
" d3 :Document = Document(id=\"3\", page_content=\"tiger\")\n",
" d4 :Document = Document(id=\"4\", page_content=\"cat\")\n",
" d5 :Document = Document(id=\"5\", page_content=\"dog\")\n",
" d6 :Document = Document(id=\"6\", page_content=\"fox\")\n",
" d7 :Document = Document(id=\"7\", page_content=\"pear\")\n",
" d8 :Document = Document(id=\"8\", page_content=\"banana\")\n",
" d9 :Document = Document(id=\"9\", page_content=\"plum\")\n",
" d10 :Document = Document(id=\"10\", page_content=\"lion\")\n",
"\n",
" documents = [d1, d2, d3, d4, d5, d6, d7, d8, d9, d10]\n",
" return documents\n",
"\n",
"async def test_asimilarity_search_by_vector():\n",
" documents = test_data()\n",
" session: Session = await Session.create()\n",
" try:\n",
" named_map: NamedMap[str, Document] = await session.get_map(\"my-map\")\n",
" embedding :Embeddings = HuggingFaceEmbeddings(\n",
" model_name=\"sentence-transformers/all-MiniLM-l6-v2\")\n",
" # this embedding generates vectors of dimension 384\n",
" cvs :CoherenceVectorStore = await CoherenceVectorStore.create(\n",
" named_map,embedding,384)\n",
" await cvs.aadd_documents(documents)\n",
" ids = [doc.id for doc in documents]\n",
" l = await cvs.aget_by_ids(ids)\n",
" assert len(l) == 10\n",
"\n",
" vector = cvs.embeddings.embed_query(\"fruit\")\n",
" result = await cvs.asimilarity_search_by_vector(vector)\n",
" assert len(result) == 4\n",
" print(\"====\")\n",
" for e in result:\n",
" print(e)\n",
" finally:\n",
" await session.close()"
]
},
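{
"cell_type": "markdown",
"id": "7a8b9c0d-1e2f-4a3b-cc5d-6e7f8a9b0c1d",
"metadata": {},
"source": [
"### Similarity Search with Score:\n",
"\n",
"`asimilarity_search_with_score` returns `(Document, distance)` tuples, where the distance comes from the configured `DistanceAlgorithm` (cosine distance by default, so smaller means more similar). A minimal sketch, reusing `test_data()` and the imports from above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b9c0d1e-2f3a-4b4c-dd6e-7f8a9b0c1d2e",
"metadata": {},
"outputs": [],
"source": [
"documents = test_data()\n",
"session: Session = await Session.create()\n",
"try:\n",
"    named_map: NamedMap[str, Document] = await session.get_map(\"my-map\")\n",
"    embedding: Embeddings = HuggingFaceEmbeddings(\n",
"        model_name=\"sentence-transformers/all-MiniLM-L6-v2\"\n",
"    )\n",
"    cvs: CoherenceVectorStore = await CoherenceVectorStore.create(named_map, embedding)\n",
"    await cvs.aadd_documents(documents)\n",
"\n",
"    results = await cvs.asimilarity_search_with_score(\"fruit\", k=2)\n",
"    for doc, score in results:\n",
"        print(doc, score)\n",
"finally:\n",
"    await session.close()"
]
},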
{
"cell_type": "code",
"execution_count": null,
"id": "803d5c9c-0df5-4a27-b179-b97c0ca0c27a",
"metadata": {},
"outputs": [],
"source": [
"## API reference"
]
}
],
"metadata": {
"jupytext": {
"cell_metadata_filter": "-all",
"main_language": "python",
"notebook_metadata_filter": "-all"
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.21"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

libs/partners/coherence/.gitignore vendored Normal file
View File

@ -0,0 +1,36 @@
# Python
__pycache__/
*.py[cod]
*.egg
*.egg-info/
dist/
build/
.eggs/
# Virtual environments
.venv/
.env/
# uv cache
.uv/
# Testing
htmlcov/
.cache/
.coverage
coverage.xml
# IDE
.idea/
.vscode/
# Logs
*.log
# OCA
.oca
# OS
.DS_Store
Thumbs.db

View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 LangChain, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,84 @@
# Makefile for LangChain-Coherence Integration
.PHONY: install update-dev update-core lock sync lint format fix check test clean build upload-pypi publish help
# Paths to common tools (adjust if using .uv or other env)
PYTHON := .venv/bin/python
PIP := .venv/bin/pip
RUFF := .venv/bin/ruff
PYTEST := .venv/bin/pytest
MYPY := .venv/bin/mypy
PACKAGE_NAME=langchain-coherence
DIST_DIR=dist
install:
@echo "🔧 Installing all dependencies..."
uv venv
uv pip install -e .[lint,typing,test,docs,publish]
update-dev:
@echo "🔄 Updating development dependencies..."
uv pip install -e .[lint,typing,test,docs] --upgrade
update-core:
@echo "🔄 Updating core dependencies..."
uv pip install --upgrade langchain-core coherence-client
lock:
@echo "🔐 Locking all dependencies to uv.lock..."
uv pip compile pyproject.toml --all-extras -o uv.lock
sync:
@echo "📦 Syncing dependencies from uv.lock..."
uv pip install -r uv.lock
lint: check
check:
@echo "🔍 Running linter and type checker..."
$(RUFF) check langchain_coherence tests
$(MYPY) --explicit-package-bases langchain_coherence
format:
@echo "🎨 Formatting code with Ruff..."
$(RUFF) format langchain_coherence tests
fix:
@echo "🔧 Fixing lint issues..."
$(MAKE) format
$(RUFF) check langchain_coherence tests --fix
test:
@echo "🧪 Running tests..."
$(PYTEST)
clean:
@echo "🧹 Cleaning build/test artifacts..."
rm -rf .pytest_cache .mypy_cache .ruff_cache __pycache__ *.egg-info dist build
build:
@echo "🧱 Building distribution using local virtualenv"
$(PYTHON) -m build --no-isolation
upload-pypi:
@echo "🚀 Uploading to PyPI"
$(PYTHON) -m twine upload dist/*
publish: build upload-pypi
help:
@echo "🛠 Available Make targets:"
@echo " install - Install all dependencies into .venv"
@echo " update-dev - Upgrade dev dependencies (ruff, pytest, etc.)"
@echo " update-core - Upgrade core runtime deps (langchain-core, coherence-client)"
@echo " lock - Generate uv.lock with pinned versions"
@echo " sync - Install from uv.lock (repeatable builds)"
@echo " lint - Run linter and mypy"
@echo " fix - Autoformat and fix issues"
@echo " test - Run all tests"
@echo " build - Building distribution using local virtualenv"
@echo " upload-pypi - Uploading to PyPI"
@echo " publish - calls build, upload-pypi"
@echo " clean - Remove temp and build files"

View File

@ -0,0 +1,103 @@
# LangChain Coherence Integration
This package integrates Oracle Coherence as a vector store in LangChain.
## Installation
```bash
pip install -U langchain-coherence
```
## Usage
Before using LangChain's `CoherenceVectorStore` you must ensure that a Coherence server ([Coherence CE](https://github.com/oracle/coherence) 25.03+ or [Oracle Coherence](https://www.oracle.com/java/coherence/) 14.1.2+) is running.
For local development, we recommend using the Coherence CE container image:
```bash
docker run -d -p 1408:1408 ghcr.io/oracle/coherence-ce:25.03.2
```
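By default, `Session.create()` connects to the Coherence gRPC proxy at `localhost:1408`, which matches the port mapping above; see the [coherence-py-client documentation](https://oracle.github.io/coherence-py-client/) for configuring a different address.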
### Adding and retrieving Documents
```python
import asyncio
from langchain_coherence import CoherenceVectorStore
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from coherence import NamedMap, Session
async def do_run():
    session: Session = await Session.create()
    try:
        named_map: NamedMap[str, Document] = await session.get_map("my-map")
        embedding: Embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        # this embedding model generates vectors of dimension 384
        cvs: CoherenceVectorStore = await CoherenceVectorStore.create(named_map, embedding)
        d1: Document = Document(id="1", page_content="apple")
        d2: Document = Document(id="2", page_content="orange")
        documents = [d1, d2]
        await cvs.aadd_documents(documents)

        ids = [doc.id for doc in documents]
        docs = await cvs.aget_by_ids(ids)
        assert len(docs) == len(ids)
        print("====")
        for doc in docs:
            print(doc)
    finally:
        await session.close()


asyncio.run(do_run())
```
### Similarity Search on Documents
```python
import asyncio

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from coherence import NamedMap, Session
from langchain_coherence import CoherenceVectorStore


def test_data():
    d1: Document = Document(id="1", page_content="apple")
    d2: Document = Document(id="2", page_content="orange")
    d3: Document = Document(id="3", page_content="tiger")
    d4: Document = Document(id="4", page_content="cat")
    d5: Document = Document(id="5", page_content="dog")
    d6: Document = Document(id="6", page_content="fox")
    d7: Document = Document(id="7", page_content="pear")
    d8: Document = Document(id="8", page_content="banana")
    d9: Document = Document(id="9", page_content="plum")
    d10: Document = Document(id="10", page_content="lion")

    documents = [d1, d2, d3, d4, d5, d6, d7, d8, d9, d10]
    return documents


async def test_asimilarity_search():
    documents = test_data()
    session: Session = await Session.create()
    try:
        named_map: NamedMap[str, Document] = await session.get_map("my-map")
        embedding: Embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        # this embedding model generates vectors of dimension 384
        cvs: CoherenceVectorStore = await CoherenceVectorStore.create(named_map, embedding)
        await cvs.aadd_documents(documents)
        ids = [doc.id for doc in documents]
        docs = await cvs.aget_by_ids(ids)
        assert len(docs) == 10

        result = await cvs.asimilarity_search("fruit")
        assert len(result) == 4  # default k is 4
        print("====")
        for doc in result:
            print(doc)
    finally:
        await session.close()


asyncio.run(test_asimilarity_search())
```
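### Building a store from texts
You can also create and populate a store in one step with `afrom_texts`, passing the backing Coherence cache through the required `cache` keyword argument. A minimal sketch (the cache name `my-texts-map` is arbitrary):
```python
import asyncio

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from coherence import Session
from langchain_coherence import CoherenceVectorStore


async def from_texts_example():
    session = await Session.create()
    try:
        cache = await session.get_cache("my-texts-map")
        embedding = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        store = await CoherenceVectorStore.afrom_texts(
            texts=["apple", "banana"],
            embedding=embedding,
            cache=cache,  # required: the NamedCache backing the store
            metadatas=[{"cat": "fruit"}, {"cat": "fruit"}],
            ids=["id1", "id2"],
        )
        docs = await store.aget_by_ids(["id1", "id2"])
        assert len(docs) == 2
    finally:
        await session.close()


asyncio.run(from_texts_example())
```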

View File

@ -0,0 +1,9 @@
"""Public interface for the LangChain Coherence integration."""
from __future__ import annotations
__version__ = "0.0.1"
from langchain_coherence.coherence_store import CoherenceVectorStore
__all__ = ["CoherenceVectorStore"]

View File

@ -0,0 +1,600 @@
"""Coherence vector store."""
from __future__ import annotations
import asyncio
import json
import uuid
from typing import (
TYPE_CHECKING,
Any,
Final,
Optional,
cast,
)
from typing_extensions import override
if TYPE_CHECKING:
from collections.abc import Iterator, Sequence
import jsonpickle # type: ignore[import-untyped]
from coherence import ( # type: ignore[import-untyped]
Extractors,
Filters,
NamedCache,
)
from coherence.ai import ( # type: ignore[import-untyped]
CosineDistance,
DistanceAlgorithm,
FloatVector,
HnswIndex,
QueryResult,
SimilaritySearch,
Vector,
Vectors,
)
from coherence.extractor import ( # type: ignore[import-untyped]
ValueExtractor,
)
from coherence.filter import ( # type: ignore[import-untyped]
Filter,
)
from coherence.serialization import ( # type: ignore[import-untyped]
JSONSerializer,
SerializerRegistry,
)
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore
class CoherenceVectorStore(VectorStore):
"""Coherence VectorStore implementation.
Uses Coherence NamedCache, for similarity search.
Setup:
Install ``langchain-core``.
.. code-block:: bash
pip install -U langchain-core
Add Documents and retrieve them:
.. code-block:: python
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from coherence import NamedMap, Session
from langchain_coherence import CoherenceVectorStore
session: Session = await Session.create()
try:
named_map: NamedMap[str, Document] = await session.get_map("my-map")
embedding: Embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2")
# this embedding model generates vectors of dimension 384
cvs: CoherenceVectorStore = await CoherenceVectorStore.create(
    named_map, embedding)
d1: Document = Document(id="1", page_content="apple")
d2: Document = Document(id="2", page_content="orange")
documents = [d1, d2]
await cvs.aadd_documents(documents)
ids = [doc.id for doc in documents]
l = await cvs.aget_by_ids(ids)
assert len(l) == len(ids)
print("====")
for e in l:
print(e)
finally:
await session.close()
Delete Documents:
.. code-block:: python
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from coherence import NamedMap, Session
from langchain_coherence import CoherenceVectorStore
session: Session = await Session.create()
try:
named_map: NamedMap[str, Document] = await session.get_map("my-map")
embedding: Embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2")
# this embedding model generates vectors of dimension 384
cvs: CoherenceVectorStore = await CoherenceVectorStore.create(
    named_map, embedding)
d1: Document = Document(id="1", page_content="apple")
d2: Document = Document(id="2", page_content="orange")
documents = [d1, d2]
await cvs.aadd_documents(documents)
ids = [doc.id for doc in documents]
await cvs.adelete(ids)
finally:
await session.close()
Similarity Search:
.. code-block:: python
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from coherence import NamedMap, Session
from langchain_coherence import CoherenceVectorStore
def test_data():
d1 :Document = Document(id="1", page_content="apple")
d2 :Document = Document(id="2", page_content="orange")
d3 :Document = Document(id="3", page_content="tiger")
d4 :Document = Document(id="4", page_content="cat")
d5 :Document = Document(id="5", page_content="dog")
d6 :Document = Document(id="6", page_content="fox")
d7 :Document = Document(id="7", page_content="pear")
d8 :Document = Document(id="8", page_content="banana")
d9 :Document = Document(id="9", page_content="plum")
d10 :Document = Document(id="10", page_content="lion")
documents = [d1, d2, d3, d4, d5, d6, d7, d8, d9, d10]
return documents
async def test_asimilarity_search():
documents = test_data()
session: Session = await Session.create()
try:
named_map: NamedMap[str, Document] = await session.get_map("my-map")
embedding: Embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2")
# this embedding model generates vectors of dimension 384
cvs: CoherenceVectorStore = await CoherenceVectorStore.create(
    named_map, embedding)
await cvs.aadd_documents(documents)
ids = [doc.id for doc in documents]
l = await cvs.aget_by_ids(ids)
assert len(l) == 10
result = await cvs.asimilarity_search("fruit")
assert len(result) == 4
print("====")
for e in result:
print(e)
finally:
await session.close()
Similarity Search by vector:
.. code-block:: python
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from coherence import NamedMap, Session
from langchain_coherence import CoherenceVectorStore
def test_data():
d1 :Document = Document(id="1", page_content="apple")
d2 :Document = Document(id="2", page_content="orange")
d3 :Document = Document(id="3", page_content="tiger")
d4 :Document = Document(id="4", page_content="cat")
d5 :Document = Document(id="5", page_content="dog")
d6 :Document = Document(id="6", page_content="fox")
d7 :Document = Document(id="7", page_content="pear")
d8 :Document = Document(id="8", page_content="banana")
d9 :Document = Document(id="9", page_content="plum")
d10 :Document = Document(id="10", page_content="lion")
documents = [d1, d2, d3, d4, d5, d6, d7, d8, d9, d10]
return documents
async def test_asimilarity_search_by_vector():
documents = test_data()
session: Session = await Session.create()
try:
named_map: NamedMap[str, Document] = await session.get_map("my-map")
embedding: Embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2")
# this embedding model generates vectors of dimension 384
cvs: CoherenceVectorStore = await CoherenceVectorStore.create(
    named_map, embedding)
await cvs.aadd_documents(documents)
ids = [doc.id for doc in documents]
l = await cvs.aget_by_ids(ids)
assert len(l) == 10
vector = cvs.embeddings.embed_query("fruit")
result = await cvs.asimilarity_search_by_vector(vector)
assert len(result) == 4
print("====")
for e in result:
print(e)
finally:
await session.close()
"""
VECTOR_FIELD: Final[str] = "__dict__.metadata.vector"
"""The name of the field containing the vector embeddings."""
VECTOR_EXTRACTOR: Final[ValueExtractor] = Extractors.extract(VECTOR_FIELD)
"""The ValueExtractor to extract the embeddings vector."""
def __init__(self, coherence_cache: NamedCache, embedding: Embeddings):
"""Initialize with Coherence cache and embedding function.
Args:
coherence_cache: Coherence NamedCache to use
embedding: embedding function to use.
"""
self.cache = coherence_cache
self.embedding = embedding
@staticmethod
async def create(
coherence_cache: NamedCache,
embedding: Embeddings,
) -> CoherenceVectorStore:
"""Create an instance of CoherenceVectorStore.
Args:
coherence_cache: Coherence NamedCache to use
embedding: embedding function to use.
"""
coh_store: CoherenceVectorStore = CoherenceVectorStore(
coherence_cache, embedding
)
return coh_store
async def add_index(self, dimensions: int) -> None:
"""Creates index on the Coherence cache on the VECTOR_FIELD.
Args:
dimensions: size of the vector created by the embedding function
"""
await self.cache.add_index(
HnswIndex(CoherenceVectorStore.VECTOR_EXTRACTOR, dimensions)
)
async def remove_index(self) -> None:
"""Removes index on the Coherence cache on the VECTOR_FIELD."""
await self.cache.remove_index(CoherenceVectorStore.VECTOR_EXTRACTOR)
@property
@override
def embeddings(self) -> Embeddings:
return self.embedding
@override
def add_documents(
self, documents: list[Document], ids: Optional[list[str]] = None, **kwargs: Any
) -> list[str]:
raise NotImplementedError
@override
async def aadd_documents(
self, documents: list[Document], ids: Optional[list[str]] = None, **kwargs: Any
) -> list[str]:
"""Async run more documents through the embeddings and add to the vectorstore.
Args:
documents: Documents to add to the vectorstore.
ids: Optional list of IDs of the documents.
kwargs: Additional keyword arguments.
Returns:
List of IDs of the added texts.
Raises:
ValueError: If the number of IDs does not match the number of documents.
"""
texts = [doc.page_content for doc in documents]
vectors = await self.embedding.aembed_documents(texts)
# Apply normalization and wrap in FloatVector
float_vectors = [FloatVector(Vectors.normalize(vector)) for vector in vectors]
if ids and len(ids) != len(texts):
msg = (
f"ids must be the same length as texts. "
f"Got {len(ids)} ids and {len(texts)} texts."
)
raise ValueError(msg)
id_iterator: Iterator[Optional[str]] = (
iter(ids) if ids else iter(doc.id for doc in documents)
)
ids_: list[str] = []
doc_map: dict[str, Document] = {}
for doc, vector in zip(documents, float_vectors):
doc_id = next(id_iterator)
doc_id_ = doc_id or str(uuid.uuid4())
ids_.append(doc_id_)
doc.metadata["vector"] = vector
doc_map[doc_id_] = doc
await self.cache.put_all(doc_map)
return ids_
@override
def get_by_ids(self, ids: Sequence[str], /) -> list[Document]:
raise NotImplementedError
@override
async def aget_by_ids(self, ids: Sequence[str], /) -> list[Document]:
"""Get documents by their ids.
Args:
ids: The ids of the documents to get.
Returns:
A list of Document objects.
"""
return [e.value async for e in await self.cache.get_all(set(ids))]
@override
async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
"""Async delete by Documeny ID or other criteria.
Args:
ids: List of ids to delete. If None, delete all. Default is None.
**kwargs: Other keyword arguments that subclasses might use.
"""
if ids is None:
await self.cache.clear()
else:
# Efficient parallel delete
await asyncio.gather(*(self.cache.remove(i) for i in ids))
def _parse_coherence_kwargs(
self, **kwargs: Any
) -> tuple[DistanceAlgorithm, Filter, bool]:
allowed_keys = {"algorithm", "filter", "brute_force"}
extra_keys = set(kwargs) - allowed_keys
if extra_keys:
# Silently drop unsupported keyword arguments
for key in extra_keys:
kwargs.pop(key)
algorithm: DistanceAlgorithm = kwargs.get("algorithm", CosineDistance())
filter_: Filter = kwargs.get("filter", Filters.always())
brute_force: bool = kwargs.get("brute_force", False)
return (algorithm, filter_, brute_force)
@override
async def asimilarity_search(
self, query: str, k: int = 4, **kwargs: Any
) -> list[Document]:
"""Async method return list of docs most similar to query.
Args:
query: Input text.
k: Number of Documents to return. Defaults to 4.
**kwargs: Optional arguments:
- algorithm: DistanceAlgorithm to use (default CosineDistance).
https://oracle.github.io/coherence-py-client/api_reference/ai.html#cosinedistance
- filter: filter to use to limit the set of entries to search.
(default Filters.always())
https://oracle.github.io/coherence-py-client/api_reference/filter.html
- brute_force: Force brute force search, ignoring any available indexes.
(default False)
https://oracle.github.io/coherence-py-client/api_reference/ai.html#similaritysearch
Returns:
List of Documents most similar to the query.
"""
algorithm, filter_, brute_force = self._parse_coherence_kwargs(**kwargs)
query_vector = self.embedding.embed_query(query)
float_query_vector = FloatVector(Vectors.normalize(query_vector))
search: SimilaritySearch = SimilaritySearch(
CoherenceVectorStore.VECTOR_EXTRACTOR,
float_query_vector,
k,
algorithm=algorithm,
filter=filter_,
brute_force=brute_force,
)
query_results = await self.cache.aggregate(search)
return [e.value for e in query_results]
@override
def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
) -> list[Document]:
raise NotImplementedError
@override
async def asimilarity_search_by_vector(
self, embedding: list[float], k: int = 4, **kwargs: Any
) -> list[Document]:
"""Async method return list of docs most similar to passed embedding vector.
Args:
embedding: Input vector.
k: Number of Documents to return. Defaults to 4.
**kwargs: Optional arguments:
- algorithm: DistanceAlgorithm to use (default CosineDistance).
https://oracle.github.io/coherence-py-client/api_reference/ai.html#cosinedistance
- filter: filter to use to limit the set of entries to search.
(default Filters.always())
https://oracle.github.io/coherence-py-client/api_reference/filter.html
- brute_force: Force brute force search, ignoring any available indexes.
(default False)
https://oracle.github.io/coherence-py-client/api_reference/ai.html#similaritysearch
Returns:
List of Documents most similar to the query.
"""
algorithm, filter_, brute_force = self._parse_coherence_kwargs(**kwargs)
float_query_vector = FloatVector(Vectors.normalize(embedding))
search: SimilaritySearch = SimilaritySearch(
CoherenceVectorStore.VECTOR_EXTRACTOR,
float_query_vector,
k,
algorithm=algorithm,
filter=filter_,
brute_force=brute_force,
)
query_results = await self.cache.aggregate(search, filter=Filters.always())
return [e.value for e in query_results]
@override
def similarity_search_by_vector(
self, embedding: list[float], k: int = 4, **kwargs: Any
) -> list[Document]:
raise NotImplementedError
@override
async def asimilarity_search_with_score(
self, query: str, k: int = 4, **kwargs: Any
) -> list[tuple[Document, float]]:
"""Async method return list of tuple(Document, score) most similar to query.
Args:
query: Input text.
k: Number of Documents to return. Defaults to 4.
**kwargs: Optional arguments:
- algorithm: DistanceAlgorithm to use (default CosineDistance).
https://oracle.github.io/coherence-py-client/api_reference/ai.html#cosinedistance
- filter: filter to use to limit the set of entries to search.
(default Filters.always())
https://oracle.github.io/coherence-py-client/api_reference/filter.html
- brute_force: Force brute force search, ignoring any available indexes.
(default False)
https://oracle.github.io/coherence-py-client/api_reference/ai.html#similaritysearch
Returns:
List of tuple(Document, score) most similar to the query.
"""
algorithm, filter_, brute_force = self._parse_coherence_kwargs(**kwargs)
query_vector = self.embedding.embed_query(query)
float_query_vector = FloatVector(Vectors.normalize(query_vector))
search: SimilaritySearch = SimilaritySearch(
CoherenceVectorStore.VECTOR_EXTRACTOR,
float_query_vector,
k,
algorithm=algorithm,
filter=filter_,
brute_force=brute_force,
)
query_results: list[QueryResult] = await self.cache.aggregate(
search, filter=Filters.always()
)
return [(e.value, e.distance) for e in query_results]
@override
def similarity_search_with_score(
self, query: str, k: int = 4, **kwargs: Any
) -> list[tuple[Document, float]]:
raise NotImplementedError
@classmethod
@override
def from_texts(
cls,
texts: list[str],
embedding: Embeddings,
metadatas: Optional[list[dict[Any, Any]]] = None,
**kwargs: Any,
) -> CoherenceVectorStore:
msg = "Use `afrom_texts()` instead; sync context is not supported."
raise NotImplementedError(msg)
@classmethod
@override
async def afrom_texts(
cls,
texts: list[str],
embedding: Embeddings,
metadatas: Optional[list[dict[str, Any]]] = None,
**kwargs: Any,
) -> CoherenceVectorStore:
"""Asynchronously initialize the CoherenceVectorStore from texts and embeddings.
Args:
texts: List of input text strings.
embedding: Embedding function to use.
metadatas: Optional list of metadata dicts corresponding to each text.
kwargs: Additional keyword arguments.
- cache: Required Coherence NamedCache[str, Document] instance.
- ids: Optional list of document IDs.
Returns:
CoherenceVectorStore: An initialized and populated vector store.
Raises:
ValueError: If `cache` is not provided.
"""
# Extract and validate required Coherence cache
cache = kwargs.get("cache")
if cache is None:
msg = "Missing required 'cache' parameter in afrom_texts"
raise ValueError(msg)
# Optionally use caller-supplied document IDs
ids: Optional[list[str]] = kwargs.get("ids")
if ids is not None and len(ids) != len(texts):
msg = "Length of 'ids' must match length of 'texts'"
raise ValueError(msg)
# Create store instance
store = await cls.create(cache, embedding)
# Construct Document objects
documents = []
for i, text in enumerate(texts):
metadata = metadatas[i] if metadatas and i < len(metadatas) else {}
doc_id = ids[i] if ids else str(uuid.uuid4())
documents.append(Document(page_content=text, metadata=metadata, id=doc_id))
# Add documents to vector store
await store.aadd_documents(documents)
return store
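# jsonpickle handler that lets LangChain Document objects round-trip through
# Coherence's JSON serializer: flatten() converts the FloatVector stored in
# metadata["vector"] into a plain JSON-friendly dict on write, and restore()
# rebuilds the Document and its vector on read.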
@jsonpickle.handlers.register(Document)
class _LangChainDocumentHandler(jsonpickle.handlers.BaseHandler): # type: ignore[misc]
def flatten(self, obj: object, data: dict[str, Any]) -> dict[str, Any]:
"""Flatten object to a dictionary for handler to use."""
ser = SerializerRegistry.serializer(JSONSerializer.SER_FORMAT)
json_ser = cast("JSONSerializer", ser)
o = cast("Document", obj)
vector = o.metadata["vector"]
if vector is not None and isinstance(vector, Vector):
s = json_ser.serialize(vector)
d = json.loads(s[1:])
o.metadata["vector"] = json_ser.flatten_to_dict(d)
data["__dict__"] = obj.__dict__
return data
def restore(self, obj: dict[str, Any]) -> Document:
"""Convert dictionary to an object for handler to use."""
ser = SerializerRegistry.serializer(JSONSerializer.SER_FORMAT)
json_ser = cast("JSONSerializer", ser)
d = Document(page_content="")
d.__dict__ = obj["__dict__"]
vector = d.metadata["vector"]
if vector is not None and isinstance(vector, dict):
d.metadata["vector"] = json_ser.restore_to_object(vector)
return d

View File

@ -0,0 +1,95 @@
[project]
name = "langchain-coherence"
version = "0.0.1"
description = "LangChain integration for Oracle Coherence as a vector store."
authors = [{ name = "Your Name", email = "you@example.com" }]
license = {text = "MIT"}
readme = "README.md"
dependencies = [
"langchain-core>=0.1.20",
"coherence-client>=2.0.4",
]
requires-python = ">=3.9"
[project.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/partners/coherence"
repository = "https://github.com/langchain-ai/langchain"
[project.optional-dependencies]
lint = [
"ruff<0.12.0,>=0.11.2",
]
typing = [
"mypy<1.16,>=1.15",
]
test = [
"pytest<9,>=8",
"pytest-asyncio<1.0.0,>=0.21.1",
"langchain_huggingface",
"sentence_transformers"
]
docs = [
"jupytext>=1.16",
"nbdoc>=0.0.29",
]
publish = [
"build",
"twine"
]
[tool.mypy]
strict = "True"
disallow_untyped_defs = "True"
[tool.ruff]
target-version = "py39"
[tool.ruff.lint]
select = [ "ALL",]
ignore = [
"C90", # McCabe complexity
"COM812", # Messes with the formatter
"FA100", # Can't activate since we exclude UP007 for now
"FIX002", # Line contains TODO
"ISC001", # Messes with the formatter
"PERF203", # Rarely useful
"PLR09", # Too many something (arg, statements, etc)
"RUF012", # Doesn't play well with Pydantic
"TC001", # Doesn't play well with Pydantic
"TC002", # Doesn't play well with Pydantic
"TC003", # Doesn't play well with Pydantic
"TD002", # Missing author in TODO
"TD003", # Missing issue link in TODO
"UP007", # Doesn't play well with Pydantic in Python 3.9
# TODO rules
"ANN401",
"BLE",
"ERA",
"PLR2004",
]
flake8-annotations.allow-star-arg-any = true
flake8-annotations.mypy-init-return = true
pydocstyle.convention = "google"
pydocstyle.ignore-var-parameters = true
[tool.pytest.ini_options]
asyncio_default_fixture_loop_scope = "function"
testpaths = ["tests"]
filterwarnings = [
"ignore::UserWarning:pkg_resources"
]
markers = [
"compile: marker used to test compilation-only tests"
]
[tool.ruff.lint.per-file-ignores]
"tests/**" = [
"D", # docstring rules
"ANN", # missing type annotations
"T201", # use of `print`
"S101", # use of `assert`
"E741", # ambiguous variable name like `l`
"RET504", # unnecessary assignment before return
"I001", # import sorting
"UP035" # import from collections.abc instead of typing
]

View File

@ -0,0 +1,153 @@
import pytest
import pytest_asyncio
import inspect
from typing import AsyncGenerator
from langchain_core.embeddings import Embeddings
from langchain_core.documents import Document
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from coherence import NamedCache, Session
from langchain_coherence import CoherenceVectorStore
@pytest_asyncio.fixture
async def store() -> AsyncGenerator[CoherenceVectorStore, None]:
session: Session = await Session.create()
named_cache: NamedCache[str, Document] = await session.get_cache("my-map")
embedding: Embeddings = HuggingFaceEmbeddings(
model_name="sentence-transformers/all-MiniLM-l6-v2"
)
cvs: CoherenceVectorStore = await CoherenceVectorStore.create(
named_cache, embedding
)
yield cvs
await cvs.cache.destroy()
await session.close()
def get_test_data():
d1: Document = Document(id="1", page_content="apple")
d2: Document = Document(id="2", page_content="orange")
d3: Document = Document(id="3", page_content="tiger")
d4: Document = Document(id="4", page_content="cat")
d5: Document = Document(id="5", page_content="dog")
d6: Document = Document(id="6", page_content="fox")
d7: Document = Document(id="7", page_content="pear")
d8: Document = Document(id="8", page_content="banana")
d9: Document = Document(id="9", page_content="plum")
d10: Document = Document(id="10", page_content="lion")
documents = [d1, d2, d3, d4, d5, d6, d7, d8, d9, d10]
return documents
@pytest.mark.asyncio
async def test_aget_by_id(store: CoherenceVectorStore):
print()
print(f"=======: {inspect.currentframe().f_code.co_name}")
documents = get_test_data()
await store.aadd_documents(documents)
ids = [doc.id for doc in documents]
l = await store.aget_by_ids(ids)
assert len(l) == 10
print("====")
for e in l:
print(e)
@pytest.mark.asyncio
async def test_adelete(store: CoherenceVectorStore):
print()
print(f"=======: {inspect.currentframe().f_code.co_name}")
documents = get_test_data()
await store.aadd_documents(documents)
ids = [doc.id for doc in documents]
l = await store.aget_by_ids(ids)
assert len(l) == 10
await store.adelete(["1", "2"])
l = await store.aget_by_ids(ids)
assert len(l) == 8
await store.adelete()
l = await store.aget_by_ids(ids)
assert len(l) == 0
@pytest.mark.asyncio
async def test_asimilarity_search(store: CoherenceVectorStore):
print()
print(f"=======: {inspect.currentframe().f_code.co_name}")
documents = get_test_data()
await store.aadd_documents(documents)
ids = [doc.id for doc in documents]
l = await store.aget_by_ids(ids)
assert len(l) == 10
# result = await coherence_store.asimilarity_search("animal")
result = await store.asimilarity_search("fruit")
assert len(result) == 4
print("====")
for e in result:
print(e)
@pytest.mark.asyncio
async def test_asimilarity_search_by_vector(store: CoherenceVectorStore):
print()
print(f"=======: {inspect.currentframe().f_code.co_name}")
documents = get_test_data()
await store.aadd_documents(documents)
ids = [doc.id for doc in documents]
l = await store.aget_by_ids(ids)
assert len(l) == 10
vector = store.embeddings.embed_query("animal")
result = await store.asimilarity_search_by_vector(vector)
assert len(result) == 4
print("====")
for e in result:
print(e)
@pytest.mark.asyncio
async def test_asimilarity_search_with_score(store: CoherenceVectorStore):
print()
print(f"=======: {inspect.currentframe().f_code.co_name}")
documents = get_test_data()
await store.aadd_documents(documents)
ids = [doc.id for doc in documents]
l = await store.aget_by_ids(ids)
assert len(l) == 10
# result = await coherence_store.asimilarity_search("animal")
result = await store.asimilarity_search_with_score("fruit")
assert len(result) == 4
print("====")
for e in result:
print(e)
@pytest.mark.asyncio
async def test_afrom_texts():
session = await Session.create()
try:
cache = await session.get_cache("test-map-async")
embedding = HuggingFaceEmbeddings(
model_name="sentence-transformers/all-MiniLM-l6-v2"
)
texts = ["apple", "banana"]
metadatas = [{"cat": "fruit"}, {"cat": "fruit"}]
ids = ["id1", "id2"]
store = await CoherenceVectorStore.afrom_texts(
texts=texts,
embedding=embedding,
cache=cache,
metadatas=metadatas,
ids=ids,
)
results = await store.aget_by_ids(ids)
assert len(results) == 2
finally:
await session.close()

View File

@ -0,0 +1,6 @@
import pytest
@pytest.mark.compile
def test_placeholder() -> None:
"""Used for compiling integration tests without running any real tests."""

File diff suppressed because it is too large