Add Couchbase document loader (#13979)

**Description:** 
Adds the document loader for [Couchbase](http://couchbase.com/), a
distributed NoSQL database.
**Dependencies:** 
Added the Couchbase SDK as an optional dependency.
**Twitter handle:** nithishr

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Nithish Raghunandanan 2023-12-05 01:58:12 +05:30 committed by GitHub
parent 805e9bfc24
commit eecfa3f9e5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 394 additions and 2 deletions

File diff suppressed because one or more lines are too long

View File

@ -62,6 +62,7 @@ from langchain.document_loaders.college_confidential import CollegeConfidentialL
from langchain.document_loaders.concurrent import ConcurrentLoader
from langchain.document_loaders.confluence import ConfluenceLoader
from langchain.document_loaders.conllu import CoNLLULoader
from langchain.document_loaders.couchbase import CouchbaseLoader
from langchain.document_loaders.csv_loader import CSVLoader, UnstructuredCSVLoader
from langchain.document_loaders.cube_semantic import CubeSemanticLoader
from langchain.document_loaders.datadog_logs import DatadogLogsLoader
@ -247,6 +248,7 @@ __all__ = [
"CollegeConfidentialLoader",
"ConcurrentLoader",
"ConfluenceLoader",
"CouchbaseLoader",
"CubeSemanticLoader",
"DataFrameLoader",
"DatadogLogsLoader",

View File

@ -0,0 +1,100 @@
import logging
from typing import Iterator, List, Optional
from langchain_core.documents import Document
from langchain.document_loaders.base import BaseLoader
logger = logging.getLogger(__name__)
class CouchbaseLoader(BaseLoader):
    """Load documents from `Couchbase`.

    Each document represents one row of the result. The `page_content_fields` are
    written into the `page_content` of the document. The `metadata_fields` are
    written into the `metadata` of the document. By default, all columns are
    written into the `page_content` and none into the `metadata`.
    """

    def __init__(
        self,
        connection_string: str,
        db_username: str,
        db_password: str,
        query: str,
        *,
        page_content_fields: Optional[List[str]] = None,
        metadata_fields: Optional[List[str]] = None,
    ) -> None:
        """Initialize Couchbase document loader.

        Args:
            connection_string (str): The connection string to the Couchbase cluster.
            db_username (str): The username to connect to the Couchbase cluster.
            db_password (str): The password to connect to the Couchbase cluster.
            query (str): The SQL++ query to execute.
            page_content_fields (Optional[List[str]]): The columns to write into the
                `page_content` field of the document. By default, all columns are
                written.
            metadata_fields (Optional[List[str]]): The columns to write into the
                `metadata` field of the document. By default, no columns are written.

        Raises:
            ImportError: If the `couchbase` package is not installed.
            ValueError: If `connection_string`, `db_username` or `db_password`
                is empty.
        """
        # The SDK is an optional dependency, so import it lazily and fail with
        # an actionable message when it is missing.
        try:
            from couchbase.auth import PasswordAuthenticator
            from couchbase.cluster import Cluster
            from couchbase.options import ClusterOptions
        except ImportError as e:
            raise ImportError(
                "Could not import couchbase package. "
                "Please install couchbase SDK with `pip install couchbase`."
            ) from e

        if not connection_string:
            raise ValueError("connection_string must be provided.")

        if not db_username:
            raise ValueError("db_username must be provided.")

        if not db_password:
            raise ValueError("db_password must be provided.")

        auth = PasswordAuthenticator(
            db_username,
            db_password,
        )

        self.cluster: Cluster = Cluster(connection_string, ClusterOptions(auth))
        self.query = query
        self.page_content_fields = page_content_fields
        self.metadata_fields = metadata_fields

    def load(self) -> List[Document]:
        """Load Couchbase data into Document objects."""
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        """Load Couchbase data into Document objects lazily.

        Yields:
            One `Document` per result row. `page_content` is a newline-joined
            list of `key: value` pairs for the selected fields; `metadata`
            maps each configured metadata field to its value in the row.

        Raises:
            KeyError: If a configured metadata field is absent from a row.
        """
        from datetime import timedelta

        # Ensure connection to Couchbase cluster
        self.cluster.wait_until_ready(timedelta(seconds=5))

        # Run SQL++ Query
        result = self.cluster.query(self.query)

        # Field selections do not change per row, so resolve them once.
        # An empty/None content selection means "use every column of the row".
        metadata_fields = self.metadata_fields or []
        content_fields = self.page_content_fields or None

        for row in result:
            metadata = {field: row[field] for field in metadata_fields}

            page_content = "\n".join(
                f"{k}: {v}"
                for k, v in row.items()
                if content_fields is None or k in content_fields
            )

            yield Document(page_content=page_content, metadata=metadata)

View File

@ -1662,6 +1662,40 @@ lint = ["black (>=22.6.0)", "mdformat (>0.7)", "mdformat-gfm (>=0.3.5)", "ruff (
test = ["pytest"]
typing = ["mypy (>=0.990)"]
[[package]]
name = "couchbase"
version = "4.1.9"
description = "Python Client for Couchbase"
optional = true
python-versions = ">=3.7"
files = [
{file = "couchbase-4.1.9-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:f36f65d5ea66ebebe8f9055feb44c72b60b64b8c466ee177c7eaf6d97b71b41a"},
{file = "couchbase-4.1.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b750cb641a44084137444e86ba2cf596e713dceaaa8dcd4a09c370ddd5e3bca2"},
{file = "couchbase-4.1.9-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:371f4c5e00965d6579e98cd6e49eb8543e3aeabb64d9ac41dae5b85c831faed4"},
{file = "couchbase-4.1.9-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cfe53bfa29d72d5fa921554408ff7fada301e4641b652f2551060ebd3d1cc096"},
{file = "couchbase-4.1.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d675d0d862eae34ebedd09e4f40e26ac0092ea0dca93520616cd68d195a1fb3a"},
{file = "couchbase-4.1.9-cp310-cp310-win_amd64.whl", hash = "sha256:c8adc08a70cbe5e1b1e0e45ebbb4ea5879b3f1aba64d09770d6e35a760201609"},
{file = "couchbase-4.1.9-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:8f3e16fedb2dd79dba81df5eb1fb6e493ee720ef12be5a2699ac540955775647"},
{file = "couchbase-4.1.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8bb93e17304499fb9b6972efe8a75ea156a097eed983b4802a478ad6cef500b3"},
{file = "couchbase-4.1.9-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:85da68da9efd5ed35d031a5725744ee36653f940ad16c252d9927f481581366c"},
{file = "couchbase-4.1.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e04f014a8990c89195689af4d332028a6769b45221d861778c079e9f67184e6e"},
{file = "couchbase-4.1.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:50db238605620ca1a2f4ed36f7820a2d61323a8a425986fd3caf1d9be4eb7f46"},
{file = "couchbase-4.1.9-cp311-cp311-win_amd64.whl", hash = "sha256:ba9312755c88d39d86cae7ba11c15a6255d8afe5c552bbc1e2f6b66c880bd08e"},
{file = "couchbase-4.1.9-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:17bdf8db3721e4f7c54b7e50db16fa6c65733d45cfd6c3bf50cd80a7f1672ea8"},
{file = "couchbase-4.1.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a2fb14336b78843691a9f007fbbd0c33959ea4ae4e323112614673601772fb84"},
{file = "couchbase-4.1.9-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3af36a4b25f948a4dd1a349ba5ddfa87a228cbdfbb8228a5045e187849392857"},
{file = "couchbase-4.1.9-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f1a6d03fb4fc76aedeede7a55f957936863256b654ce38f05a508925cbd1c713"},
{file = "couchbase-4.1.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:454c46c9fb6e485f1aba53f64a4b794e2146db480ccd32eaa80b2bba0f53895e"},
{file = "couchbase-4.1.9-cp38-cp38-win_amd64.whl", hash = "sha256:4c35c2ef600677121b95540c8e78bb43ce5d18cafd49036ea256643ed00ac042"},
{file = "couchbase-4.1.9-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:40bebe809042efceae95fba8d2a1f0bfecd144c090cf638d8283e038ffea6f19"},
{file = "couchbase-4.1.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f9e956b6580baf4365c4a1b4e22622dc0948447f5ce106d24ed59532302b164f"},
{file = "couchbase-4.1.9-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:150916388ee2813d242de014fb3ad5e259103e5cd0f1ce600280cc1c11732980"},
{file = "couchbase-4.1.9-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bf2d1fc8fe22f6e3e4b5e41c7fc367a3a4537dd272a26859f01796724d2ae977"},
{file = "couchbase-4.1.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9d9ffbb6897a3e68193a8611032230e5d520ae07ae74923305acf8670eb5281b"},
{file = "couchbase-4.1.9-cp39-cp39-win_amd64.whl", hash = "sha256:b11ff93f4b5da9437fdfb384943dfbf0dac054394d30d21b5e50852dc1d27d2a"},
{file = "couchbase-4.1.9.tar.gz", hash = "sha256:ee476c5e5b420610e5f4ce778b8c6c7a513f9f4dd4b57fe25000e94ad6eefb9e"},
]
[[package]]
name = "coverage"
version = "7.3.2"
@ -11475,7 +11509,7 @@ cli = ["typer"]
cohere = ["cohere"]
docarray = ["docarray"]
embeddings = ["sentence-transformers"]
extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "dashvector", "databricks-vectorsearch", "datasets", "dgml-utils", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "hologres-vector", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openai", "openapi-pydantic", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"]
extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "couchbase", "dashvector", "databricks-vectorsearch", "datasets", "dgml-utils", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "hologres-vector", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openai", "openapi-pydantic", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"]
javascript = ["esprima"]
llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"]
openai = ["openai", "tiktoken"]
@ -11485,4 +11519,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "0cd9769243ade0dc1df941e902aa66c18a57333ae50309f004b4f60e6e27b5cf"
content-hash = "f4791327aca4bf3db1b46731d987347b537e638a1be85b2a6a771e52f95d3f29"

View File

@ -147,6 +147,7 @@ hologres-vector = {version = "^0.0.6", optional = true}
praw = {version = "^7.7.1", optional = true}
msal = {version = "^1.25.0", optional = true}
databricks-vectorsearch = {version = "^0.21", optional = true}
couchbase = {version = "^4.1.9", optional = true}
dgml-utils = {version = "^0.3.0", optional = true}
datasets = {version = "^2.15.0", optional = true}
@ -391,6 +392,7 @@ extended_testing = [
"hologres-vector",
"praw",
"databricks-vectorsearch",
"couchbase",
"dgml-utils",
"cohere",
]

View File

@ -0,0 +1,44 @@
import unittest
from langchain.document_loaders.couchbase import CouchbaseLoader
try:
import couchbase # noqa: F401
couchbase_installed = True
except ImportError:
couchbase_installed = False
@unittest.skipIf(not couchbase_installed, "couchbase not installed")
class TestCouchbaseLoader(unittest.TestCase):
    """Exercise `CouchbaseLoader` end to end; skipped without the couchbase SDK."""

    def setUp(self) -> None:
        """Prepare connection details, the query and the field selections."""
        # The credential values below are placeholders and must be replaced
        # with real ones before this test can pass.
        self.conn_string = "<enter-valid-couchbase-connection-string>"
        self.database_user = "<enter-valid-couchbase-user>"
        self.database_password = "<enter-valid-couchbase-password>"
        self.valid_query = "select h.* from `travel-sample`.inventory.hotel h limit 10"
        self.valid_page_content_fields = ["country", "name", "description"]
        self.valid_metadata_fields = ["id"]

    def test_couchbase_loader(self) -> None:
        """Test Couchbase loader."""
        loader_kwargs = {
            "connection_string": self.conn_string,
            "db_username": self.database_user,
            "db_password": self.database_password,
            "query": self.valid_query,
            "page_content_fields": self.valid_page_content_fields,
            "metadata_fields": self.valid_metadata_fields,
        }
        documents = CouchbaseLoader(**loader_kwargs).load()
        print(documents)

        # The query is assumed to return at least one document.
        assert len(documents) > 0

        for document in documents:
            print(document)
            # Every document is assumed to carry non-empty page content ...
            assert document.page_content != ""
            # ... and a non-empty 'id' entry in its metadata.
            assert "id" in document.metadata and document.metadata["id"] != ""

View File

@ -0,0 +1,6 @@
"""Test importing the Couchbase document loader."""
def test_couchbase_import() -> None:
    """Verify that `CouchbaseLoader` can be imported from the loaders package."""
    # The import is the whole check: an ImportError here fails the test.
    from langchain.document_loaders import CouchbaseLoader  # noqa: F401

View File

@ -41,6 +41,7 @@ EXPECTED_ALL = [
"CollegeConfidentialLoader",
"ConcurrentLoader",
"ConfluenceLoader",
"CouchbaseLoader",
"CubeSemanticLoader",
"DataFrameLoader",
"DatadogLogsLoader",