mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-30 10:23:30 +00:00
Add Couchbase document loader (#13979)
**Description:** Adds the document loader for [Couchbase](http://couchbase.com/), a distributed NoSQL database. **Dependencies:** Added the Couchbase SDK as an optional dependency. **Twitter handle:** nithishr --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
805e9bfc24
commit
eecfa3f9e5
203
docs/docs/integrations/document_loaders/couchbase.ipynb
Normal file
203
docs/docs/integrations/document_loaders/couchbase.ipynb
Normal file
File diff suppressed because one or more lines are too long
@ -62,6 +62,7 @@ from langchain.document_loaders.college_confidential import CollegeConfidentialL
|
||||
from langchain.document_loaders.concurrent import ConcurrentLoader
|
||||
from langchain.document_loaders.confluence import ConfluenceLoader
|
||||
from langchain.document_loaders.conllu import CoNLLULoader
|
||||
from langchain.document_loaders.couchbase import CouchbaseLoader
|
||||
from langchain.document_loaders.csv_loader import CSVLoader, UnstructuredCSVLoader
|
||||
from langchain.document_loaders.cube_semantic import CubeSemanticLoader
|
||||
from langchain.document_loaders.datadog_logs import DatadogLogsLoader
|
||||
@ -247,6 +248,7 @@ __all__ = [
|
||||
"CollegeConfidentialLoader",
|
||||
"ConcurrentLoader",
|
||||
"ConfluenceLoader",
|
||||
"CouchbaseLoader",
|
||||
"CubeSemanticLoader",
|
||||
"DataFrameLoader",
|
||||
"DatadogLogsLoader",
|
||||
|
100
libs/langchain/langchain/document_loaders/couchbase.py
Normal file
100
libs/langchain/langchain/document_loaders/couchbase.py
Normal file
@ -0,0 +1,100 @@
|
||||
import logging
|
||||
from typing import Iterator, List, Optional
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CouchbaseLoader(BaseLoader):
    """Load documents from `Couchbase`.

    Each document represents one row of the query result. The
    `page_content_fields` are written into the `page_content` of the document.
    The `metadata_fields` are written into the `metadata` of the document.
    By default, all columns are written into the `page_content` and none into
    the `metadata`.
    """

    def __init__(
        self,
        connection_string: str,
        db_username: str,
        db_password: str,
        query: str,
        *,
        page_content_fields: Optional[List[str]] = None,
        metadata_fields: Optional[List[str]] = None,
    ) -> None:
        """Initialize Couchbase document loader.

        Args:
            connection_string (str): The connection string to the Couchbase cluster.
            db_username (str): The username to connect to the Couchbase cluster.
            db_password (str): The password to connect to the Couchbase cluster.
            query (str): The SQL++ query to execute.
            page_content_fields (Optional[List[str]]): The columns to write into the
                `page_content` field of the document. By default, all columns are
                written.
            metadata_fields (Optional[List[str]]): The columns to write into the
                `metadata` field of the document. By default, no columns are written.

        Raises:
            ImportError: If the `couchbase` SDK cannot be imported.
            ValueError: If `connection_string`, `db_username` or `db_password`
                is empty.
        """
        # The SDK is an optional dependency, so it is imported lazily here.
        try:
            from couchbase.auth import PasswordAuthenticator
            from couchbase.cluster import Cluster
            from couchbase.options import ClusterOptions
        except ImportError as e:
            # Adjacent literals are concatenated verbatim; the trailing space
            # keeps the two sentences from running together.
            raise ImportError(
                "Could not import couchbase package. "
                "Please install couchbase SDK with `pip install couchbase`."
            ) from e

        if not connection_string:
            raise ValueError("connection_string must be provided.")

        if not db_username:
            raise ValueError("db_username must be provided.")

        if not db_password:
            raise ValueError("db_password must be provided.")

        auth = PasswordAuthenticator(
            db_username,
            db_password,
        )

        self.cluster: Cluster = Cluster(connection_string, ClusterOptions(auth))
        self.query = query
        self.page_content_fields = page_content_fields
        self.metadata_fields = metadata_fields

    def load(self) -> List[Document]:
        """Load Couchbase data into Document objects."""
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        """Load Couchbase data into Document objects lazily.

        Yields:
            One `Document` per row returned by the query. `page_content` holds
            the selected columns as newline-separated ``key: value`` lines, in
            the order the row provides them; `metadata` maps each configured
            metadata field to its value in the row.

        Raises:
            KeyError: If a configured metadata field is missing from a row
                (assuming rows behave like dicts — the code indexes them with
                ``row[field]``).
        """
        from datetime import timedelta

        # Ensure connection to Couchbase cluster
        self.cluster.wait_until_ready(timedelta(seconds=5))

        # Run SQL++ Query
        result = self.cluster.query(self.query)

        # The field selections do not depend on the row, so resolve them once.
        metadata_fields = self.metadata_fields or []
        page_content_fields = self.page_content_fields

        for row in result:
            metadata = {field: row[field] for field in metadata_fields}

            if page_content_fields:
                content_items = (
                    (k, v) for k, v in row.items() if k in page_content_fields
                )
            else:
                # No explicit selection: every column goes into the content.
                content_items = row.items()

            document = "\n".join(f"{k}: {v}" for k, v in content_items)

            yield Document(page_content=document, metadata=metadata)
|
38
libs/langchain/poetry.lock
generated
38
libs/langchain/poetry.lock
generated
@ -1662,6 +1662,40 @@ lint = ["black (>=22.6.0)", "mdformat (>0.7)", "mdformat-gfm (>=0.3.5)", "ruff (
|
||||
test = ["pytest"]
|
||||
typing = ["mypy (>=0.990)"]
|
||||
|
||||
[[package]]
|
||||
name = "couchbase"
|
||||
version = "4.1.9"
|
||||
description = "Python Client for Couchbase"
|
||||
optional = true
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "couchbase-4.1.9-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:f36f65d5ea66ebebe8f9055feb44c72b60b64b8c466ee177c7eaf6d97b71b41a"},
|
||||
{file = "couchbase-4.1.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b750cb641a44084137444e86ba2cf596e713dceaaa8dcd4a09c370ddd5e3bca2"},
|
||||
{file = "couchbase-4.1.9-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:371f4c5e00965d6579e98cd6e49eb8543e3aeabb64d9ac41dae5b85c831faed4"},
|
||||
{file = "couchbase-4.1.9-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cfe53bfa29d72d5fa921554408ff7fada301e4641b652f2551060ebd3d1cc096"},
|
||||
{file = "couchbase-4.1.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d675d0d862eae34ebedd09e4f40e26ac0092ea0dca93520616cd68d195a1fb3a"},
|
||||
{file = "couchbase-4.1.9-cp310-cp310-win_amd64.whl", hash = "sha256:c8adc08a70cbe5e1b1e0e45ebbb4ea5879b3f1aba64d09770d6e35a760201609"},
|
||||
{file = "couchbase-4.1.9-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:8f3e16fedb2dd79dba81df5eb1fb6e493ee720ef12be5a2699ac540955775647"},
|
||||
{file = "couchbase-4.1.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8bb93e17304499fb9b6972efe8a75ea156a097eed983b4802a478ad6cef500b3"},
|
||||
{file = "couchbase-4.1.9-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:85da68da9efd5ed35d031a5725744ee36653f940ad16c252d9927f481581366c"},
|
||||
{file = "couchbase-4.1.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e04f014a8990c89195689af4d332028a6769b45221d861778c079e9f67184e6e"},
|
||||
{file = "couchbase-4.1.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:50db238605620ca1a2f4ed36f7820a2d61323a8a425986fd3caf1d9be4eb7f46"},
|
||||
{file = "couchbase-4.1.9-cp311-cp311-win_amd64.whl", hash = "sha256:ba9312755c88d39d86cae7ba11c15a6255d8afe5c552bbc1e2f6b66c880bd08e"},
|
||||
{file = "couchbase-4.1.9-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:17bdf8db3721e4f7c54b7e50db16fa6c65733d45cfd6c3bf50cd80a7f1672ea8"},
|
||||
{file = "couchbase-4.1.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a2fb14336b78843691a9f007fbbd0c33959ea4ae4e323112614673601772fb84"},
|
||||
{file = "couchbase-4.1.9-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3af36a4b25f948a4dd1a349ba5ddfa87a228cbdfbb8228a5045e187849392857"},
|
||||
{file = "couchbase-4.1.9-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f1a6d03fb4fc76aedeede7a55f957936863256b654ce38f05a508925cbd1c713"},
|
||||
{file = "couchbase-4.1.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:454c46c9fb6e485f1aba53f64a4b794e2146db480ccd32eaa80b2bba0f53895e"},
|
||||
{file = "couchbase-4.1.9-cp38-cp38-win_amd64.whl", hash = "sha256:4c35c2ef600677121b95540c8e78bb43ce5d18cafd49036ea256643ed00ac042"},
|
||||
{file = "couchbase-4.1.9-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:40bebe809042efceae95fba8d2a1f0bfecd144c090cf638d8283e038ffea6f19"},
|
||||
{file = "couchbase-4.1.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f9e956b6580baf4365c4a1b4e22622dc0948447f5ce106d24ed59532302b164f"},
|
||||
{file = "couchbase-4.1.9-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:150916388ee2813d242de014fb3ad5e259103e5cd0f1ce600280cc1c11732980"},
|
||||
{file = "couchbase-4.1.9-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bf2d1fc8fe22f6e3e4b5e41c7fc367a3a4537dd272a26859f01796724d2ae977"},
|
||||
{file = "couchbase-4.1.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9d9ffbb6897a3e68193a8611032230e5d520ae07ae74923305acf8670eb5281b"},
|
||||
{file = "couchbase-4.1.9-cp39-cp39-win_amd64.whl", hash = "sha256:b11ff93f4b5da9437fdfb384943dfbf0dac054394d30d21b5e50852dc1d27d2a"},
|
||||
{file = "couchbase-4.1.9.tar.gz", hash = "sha256:ee476c5e5b420610e5f4ce778b8c6c7a513f9f4dd4b57fe25000e94ad6eefb9e"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "coverage"
|
||||
version = "7.3.2"
|
||||
@ -11475,7 +11509,7 @@ cli = ["typer"]
|
||||
cohere = ["cohere"]
|
||||
docarray = ["docarray"]
|
||||
embeddings = ["sentence-transformers"]
|
||||
extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "dashvector", "databricks-vectorsearch", "datasets", "dgml-utils", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "hologres-vector", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openai", "openapi-pydantic", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"]
|
||||
extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "couchbase", "dashvector", "databricks-vectorsearch", "datasets", "dgml-utils", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "hologres-vector", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openai", "openapi-pydantic", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"]
|
||||
javascript = ["esprima"]
|
||||
llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"]
|
||||
openai = ["openai", "tiktoken"]
|
||||
@ -11485,4 +11519,4 @@ text-helpers = ["chardet"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.8.1,<4.0"
|
||||
content-hash = "0cd9769243ade0dc1df941e902aa66c18a57333ae50309f004b4f60e6e27b5cf"
|
||||
content-hash = "f4791327aca4bf3db1b46731d987347b537e638a1be85b2a6a771e52f95d3f29"
|
||||
|
@ -147,6 +147,7 @@ hologres-vector = {version = "^0.0.6", optional = true}
|
||||
praw = {version = "^7.7.1", optional = true}
|
||||
msal = {version = "^1.25.0", optional = true}
|
||||
databricks-vectorsearch = {version = "^0.21", optional = true}
|
||||
couchbase = {version = "^4.1.9", optional = true}
|
||||
dgml-utils = {version = "^0.3.0", optional = true}
|
||||
datasets = {version = "^2.15.0", optional = true}
|
||||
|
||||
@ -391,6 +392,7 @@ extended_testing = [
|
||||
"hologres-vector",
|
||||
"praw",
|
||||
"databricks-vectorsearch",
|
||||
"couchbase",
|
||||
"dgml-utils",
|
||||
"cohere",
|
||||
]
|
||||
|
@ -0,0 +1,44 @@
|
||||
import unittest
|
||||
|
||||
from langchain.document_loaders.couchbase import CouchbaseLoader
|
||||
|
||||
try:
|
||||
import couchbase # noqa: F401
|
||||
|
||||
couchbase_installed = True
|
||||
except ImportError:
|
||||
couchbase_installed = False
|
||||
|
||||
|
||||
@unittest.skipIf(not couchbase_installed, "couchbase not installed")
class TestCouchbaseLoader(unittest.TestCase):
    """Integration test for `CouchbaseLoader`.

    NOTE(review): the connection values below are placeholders; presumably
    they must be replaced with real cluster credentials before this test can
    pass — confirm against the project's integration-test setup.
    """

    def setUp(self) -> None:
        """Prepare connection settings, the query and the field selections."""
        self.conn_string = "<enter-valid-couchbase-connection-string>"
        self.database_user = "<enter-valid-couchbase-user>"
        self.database_password = "<enter-valid-couchbase-password>"
        self.valid_query = "select h.* from `travel-sample`.inventory.hotel h limit 10"
        self.valid_page_content_fields = ["country", "name", "description"]
        self.valid_metadata_fields = ["id"]

    def test_couchbase_loader(self) -> None:
        """Test Couchbase loader."""
        loader_kwargs = {
            "connection_string": self.conn_string,
            "db_username": self.database_user,
            "db_password": self.database_password,
            "query": self.valid_query,
            "page_content_fields": self.valid_page_content_fields,
            "metadata_fields": self.valid_metadata_fields,
        }
        docs = CouchbaseLoader(**loader_kwargs).load()
        print(docs)

        # The query is expected to return at least one document.
        assert len(docs) > 0

        for doc in docs:
            print(doc)
            # Every document is expected to carry some page content ...
            assert doc.page_content != ""
            # ... and a non-empty 'id' in its metadata.
            assert "id" in doc.metadata and doc.metadata["id"] != ""
|
@ -0,0 +1,6 @@
|
||||
"""Test importing the Couchbase document loader."""
|
||||
|
||||
|
||||
def test_couchbase_import() -> None:
    """Check that `CouchbaseLoader` is importable from `langchain.document_loaders`."""
    # The import is the whole test: it raises if the name is not exported.
    from langchain.document_loaders import CouchbaseLoader  # noqa: F401
|
@ -41,6 +41,7 @@ EXPECTED_ALL = [
|
||||
"CollegeConfidentialLoader",
|
||||
"ConcurrentLoader",
|
||||
"ConfluenceLoader",
|
||||
"CouchbaseLoader",
|
||||
"CubeSemanticLoader",
|
||||
"DataFrameLoader",
|
||||
"DatadogLogsLoader",
|
||||
|
Loading…
Reference in New Issue
Block a user