From 1fad39be1c71268af33257c9bedf683f0ea3761c Mon Sep 17 00:00:00 2001
From: Pengcheng Liu
Date: Mon, 29 Apr 2024 22:37:50 +0800
Subject: [PATCH] community[minor]: Add LarkSuite wiki document loader. (#21016)

**Description:** Add LarkSuite wiki document loader. Refer to the
[LarkSuite API documentation](https://open.feishu.cn/document/server-docs/docs/wiki-v2/space-node/list)
for details.
**Issue:** None
**Dependencies:** None
**Twitter handle:** None
---
 .../document_loaders/larksuite.py             | 47 +++++++++++++++++++
 .../document_loaders/test_larksuite.py        | 15 +++++-
 2 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/libs/community/langchain_community/document_loaders/larksuite.py b/libs/community/langchain_community/document_loaders/larksuite.py
index 1ab61a9845e..ae680ec2c63 100644
--- a/libs/community/langchain_community/document_loaders/larksuite.py
+++ b/libs/community/langchain_community/document_loaders/larksuite.py
@@ -46,3 +46,50 @@ class LarkSuiteDocLoader(BaseLoader):
             "title": metadata_json["data"]["document"]["title"],
         }
         yield Document(page_content=text, metadata=metadata)
+
+
+class LarkSuiteWikiLoader(LarkSuiteDocLoader):
+    """Load from `LarkSuite` (`FeiShu`) wiki.
+
+    The wiki node is first resolved to the token of the document it wraps
+    (via the wiki ``get_node`` API); the document itself is then loaded by
+    `LarkSuiteDocLoader`.
+    """
+
+    def __init__(self, domain: str, access_token: str, wiki_id: str):
+        """Initialize with domain, access_token (tenant / user), and wiki_id.
+
+        Args:
+            domain: The domain to load the LarkSuite.
+            access_token: The access_token to use.
+            wiki_id: The wiki_id to load.
+        """
+        self.domain = domain
+        self.access_token = access_token
+        self.wiki_id = wiki_id
+        # Filled in lazily by `lazy_load` once the wiki node is resolved.
+        self.document_id = ""
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Lazy load LarkSuite (FeiShu) wiki document.
+
+        Raises:
+            ValueError: If the wiki node cannot be resolved to a document.
+        """
+        # Convert the Feishu wiki id to a document id (once per loader).
+        if not self.document_id:
+            from urllib.parse import quote
+
+            wiki_url_prefix = f"{self.domain}/open-apis/wiki/v2/spaces/get_node"
+            wiki_node_info_json = self._get_larksuite_api_json_data(
+                f"{wiki_url_prefix}?token={quote(self.wiki_id, safe='')}"
+            )
+            try:
+                self.document_id = wiki_node_info_json["data"]["node"]["obj_token"]
+            except (KeyError, TypeError) as err:
+                raise ValueError(
+                    f"Could not resolve wiki_id {self.wiki_id!r} to a document: "
+                    f"unexpected get_node response {wiki_node_info_json!r}"
+                ) from err
+
+        yield from super().lazy_load()
diff --git a/libs/community/tests/integration_tests/document_loaders/test_larksuite.py b/libs/community/tests/integration_tests/document_loaders/test_larksuite.py
index 61d251dd929..a0c191a6408 100644
--- a/libs/community/tests/integration_tests/document_loaders/test_larksuite.py
+++ b/libs/community/tests/integration_tests/document_loaders/test_larksuite.py
@@ -1,4 +1,7 @@
-from langchain_community.document_loaders.larksuite import LarkSuiteDocLoader
+from langchain_community.document_loaders.larksuite import (
+    LarkSuiteDocLoader,
+    LarkSuiteWikiLoader,
+)
 
 DOMAIN = ""
 ACCESS_TOKEN = ""
@@ -12,3 +15,13 @@ def test_larksuite_doc_loader() -> None:
 
     assert len(docs) == 1
     assert docs[0].page_content is not None
+
+
+def test_larksuite_wiki_loader() -> None:
+    """Test LarkSuite (FeiShu) wiki loader."""
+    # DOCUMENT_ID is passed as wiki_id here, so it must hold a wiki node token.
+    loader = LarkSuiteWikiLoader(DOMAIN, ACCESS_TOKEN, DOCUMENT_ID)
+    docs = loader.load()
+
+    assert len(docs) == 1
+    assert docs[0].page_content is not None