From 658f8bdee7928243074b4d033f96711aafe1f3a8 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Wed, 7 Jun 2023 21:32:23 -0700 Subject: [PATCH] Harrison/fauna loader (#5864) Co-authored-by: Shadid12 --- .../document_loaders/examples/fauna.ipynb | 84 +++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/fauna.py | 63 ++++++++++++++ .../document_loaders/test_fauna.py | 41 +++++++++ 4 files changed, 190 insertions(+) create mode 100644 docs/modules/indexes/document_loaders/examples/fauna.ipynb create mode 100644 langchain/document_loaders/fauna.py create mode 100644 tests/integration_tests/document_loaders/test_fauna.py diff --git a/docs/modules/indexes/document_loaders/examples/fauna.ipynb b/docs/modules/indexes/document_loaders/examples/fauna.ipynb new file mode 100644 index 00000000000..92b7feaacfc --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/fauna.ipynb @@ -0,0 +1,84 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fauna\n", + "\n", + ">[Fauna](https://fauna.com/) is a Document Database.\n", + "\n", + "Query `Fauna` documents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install fauna" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Query data example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders.fauna import FaunaLoader\n", + "\n", + "secret = \"\"\n", + "query = \"Item.all()\" # Fauna query. Assumes that the collection is called \"Item\"\n", + "field = \"text\" # The field that contains the page content. 
Assumes that the field is called \"text\"\n", + "\n", + "loader = FaunaLoader(query, field, secret)\n", + "docs = loader.lazy_load()\n", + "\n", + "for value in docs:\n", + " print(value)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Query with Pagination\n", + "You get an `after` value if there is more data. You can get values after the cursor by passing in the `after` string in the query. \n", + "\n", + "To learn more, follow [this link](https://fqlx-beta--fauna-docs.netlify.app/fqlx/beta/reference/schema_entities/set/static-paginate)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"\n", + "Item.paginate(\"hs+DzoPOg ... aY1hOohozrV7A\")\n", + "Item.all()\n", + "\"\"\"\n", + "loader = FaunaLoader(query, field, secret)" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 83b6330adc8..84a825062b9 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -34,6 +34,7 @@ from langchain.document_loaders.epub import UnstructuredEPubLoader from langchain.document_loaders.evernote import EverNoteLoader from langchain.document_loaders.excel import UnstructuredExcelLoader from langchain.document_loaders.facebook_chat import FacebookChatLoader +from langchain.document_loaders.fauna import FaunaLoader from langchain.document_loaders.figma import FigmaFileLoader from langchain.document_loaders.gcs_directory import GCSDirectoryLoader from langchain.document_loaders.gcs_file import GCSFileLoader @@ -155,6 +156,7 @@ __all__ = [ "DocugamiLoader", "Docx2txtLoader", "DuckDBLoader", + "FaunaLoader", "EverNoteLoader", "FacebookChatLoader", "FigmaFileLoader", diff --git a/langchain/document_loaders/fauna.py 
# ---------------------------------------------------------------------------
# New file: langchain/document_loaders/fauna.py
# ---------------------------------------------------------------------------
from typing import Iterator, List, Optional, Sequence

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class FaunaLoader(BaseLoader):
    """Load documents from Fauna by running an FQL query.

    Attributes:
        query (str): The FQL query string to execute.
        page_content_field (str): The field that contains the content of each page.
        secret (str): The secret key for authenticating to FaunaDB.
        metadata_fields (Optional[Sequence[str]]):
            Optional list of field names to include in metadata.
    """

    def __init__(
        self,
        query: str,
        page_content_field: str,
        secret: str,
        metadata_fields: Optional[Sequence[str]] = None,
    ):
        """Store the query, the content field name and the credentials.

        Args:
            query: The FQL query string to execute.
            page_content_field: Name of the result field used as page content.
            secret: The secret key passed to the Fauna client.
            metadata_fields: Optional names of result fields to copy into
                each document's metadata.
        """
        self.query = query
        self.page_content_field = page_content_field
        self.secret = secret
        self.metadata_fields = metadata_fields

    def load(self) -> List[Document]:
        """Run the query and return everything ``lazy_load`` yields as a list."""
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        """Run the query and yield one ``Document`` per non-``None`` result.

        Each document's ``page_content`` is the value of
        ``page_content_field`` in the result (``""`` when the field is
        absent). Its metadata holds the result's ``id`` and ``ts`` plus any
        of ``metadata_fields`` that the result contains.

        If the returned page has an ``after`` cursor, one extra document with
        ``page_content="Next Page Exists"`` and ``metadata={"after": cursor}``
        is yielded last.

        Raises:
            ImportError: If the ``fauna`` package is not installed.
        """
        try:
            from fauna import Page, fql
            from fauna.client import Client
            from fauna.encoding import QuerySuccess
        except ImportError as err:
            raise ImportError(
                "Could not import fauna python package. "
                "Please install it with `pip install fauna`."
            ) from err
        # Create Fauna Client
        client = Client(secret=self.secret)
        # Run FQL Query
        response: QuerySuccess = client.query(fql(self.query))
        page: Page = response.data
        extra_fields = self.metadata_fields or ()
        for result in page:
            if result is None:
                continue
            document_dict = dict(result.items())
            # Copy only the requested fields that this result actually has.
            metadata = {
                name: document_dict[name]
                for name in extra_fields
                if name in document_dict
            }
            # Set last so a requested field can never shadow "id" / "ts".
            metadata["id"] = result.id
            metadata["ts"] = result.ts
            yield Document(
                page_content=document_dict.get(self.page_content_field, ""),
                metadata=metadata,
            )
        if page.after is not None:
            # Marker document that hands the pagination cursor to the caller.
            yield Document(
                page_content="Next Page Exists",
                metadata={"after": page.after},
            )


# ---------------------------------------------------------------------------
# New file: tests/integration_tests/document_loaders/test_fauna.py
# ---------------------------------------------------------------------------
import unittest

from langchain.document_loaders.fauna import FaunaLoader

try:
    import fauna  # noqa: F401

    fauna_installed = True
except ImportError:
    fauna_installed = False


@unittest.skipIf(not fauna_installed, "fauna not installed")
class TestFaunaLoader(unittest.TestCase):
    """Integration test for ``FaunaLoader``; skipped when fauna is missing."""

    def setUp(self) -> None:
        """Define the secret, query and field names used by the test."""
        self.fauna_secret = ""
        self.valid_fql_query = "Item.all()"
        self.valid_page_content_field = "text"
        self.valid_metadata_fields = ["valid_metadata_fields"]

    def test_fauna_loader(self) -> None:
        """Test Fauna loader."""
        loader = FaunaLoader(
            query=self.valid_fql_query,
            page_content_field=self.valid_page_content_field,
            secret=self.fauna_secret,
            metadata_fields=self.valid_metadata_fields,
        )
        docs = loader.load()

        assert len(docs) > 0  # assuming the query returns at least one document
        for doc in docs:
            if "after" in doc.metadata:
                # Pagination marker emitted by the loader: it carries only
                # the cursor, so the per-document checks do not apply.
                continue
            assert (
                doc.page_content != ""
            )  # assuming that every document has page_content
            assert (
                "id" in doc.metadata and doc.metadata["id"] != ""
            )  # assuming that every document has 'id'
            assert (
                "ts" in doc.metadata and doc.metadata["ts"] != ""
            )  # assuming that every document has 'ts'