community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)

Moved the following modules to new package langchain-community in a backwards compatible fashion:

```
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
```

Moved the following to core:

```
mv langchain/langchain/utils/json_schema.py core/langchain_core/utils
mv langchain/langchain/utils/html.py core/langchain_core/utils
mv langchain/langchain/utils/strings.py core/langchain_core/utils
cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py
rm langchain/langchain/utils/env.py
```

See .scripts/community_split/script_integrations.sh for all changes.
2025-09-04 04:28:58 +00:00 · 2023-12-11 13:53:30 -08:00
parent c0f4b95aa9
commit ed58eeb9c5
2446 changed files with 171805 additions and 137118 deletions
--- a/libs/community/langchain_community/document_loaders/evernote.py
+++ b/libs/community/langchain_community/document_loaders/evernote.py
@@ -0,0 +1,151 @@
+"""Load documents from Evernote.
+
+https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
+"""
+import hashlib
+import logging
+from base64 import b64decode
+from time import strptime
+from typing import Any, Dict, Iterator, List, Optional
+
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders.base import BaseLoader
+
+logger = logging.getLogger(__name__)
+
+
+class EverNoteLoader(BaseLoader):
+    """Load from `EverNote`.
+
+    Loads an EverNote notebook export file e.g. my_notebook.enex into Documents.
+    Instructions on producing this file can be found at
+    https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML
+
+    Currently only the plain text in the note is extracted and stored as the contents
+    of the Document, any non content metadata (e.g. 'author', 'created', 'updated' etc.
+    but not 'content-raw' or 'resource') tags on the note will be extracted and stored
+    as metadata on the Document.
+
+    Args:
+        file_path (str): The path to the notebook export with a .enex extension
+        load_single_document (bool): Whether or not to concatenate the content of all
+            notes into a single long Document.
+            If this is set to True (default) then the only metadata on the document
+            will be the 'source', which contains the path of the export file.
+    """  # noqa: E501
+
+    def __init__(self, file_path: str, load_single_document: bool = True):
+        """Initialize with file path.
+
+        Args:
+            file_path: Path to the ``.enex`` export file.
+            load_single_document: If True, ``load`` returns one Document holding
+                the concatenated text of every note; otherwise one per note.
+        """
+        self.file_path = file_path
+        self.load_single_document = load_single_document
+
+    def load(self) -> List[Document]:
+        """Load documents from EverNote export file.
+
+        Returns:
+            One Document per note that has a ``content`` value, or, when
+            ``load_single_document`` is True, a single Document whose text is
+            the notes' contents joined without a separator.
+        """
+        # These keys hold the note body or binary attachments, not metadata.
+        excluded_keys = ("content", "content-raw", "resource")
+        documents = [
+            Document(
+                page_content=note["content"],
+                metadata={
+                    **{
+                        key: value
+                        for key, value in note.items()
+                        if key not in excluded_keys
+                    },
+                    "source": self.file_path,
+                },
+            )
+            for note in self._parse_note_xml(self.file_path)
+            if note.get("content") is not None
+        ]
+
+        if not self.load_single_document:
+            return documents
+
+        return [
+            Document(
+                page_content="".join(doc.page_content for doc in documents),
+                metadata={"source": self.file_path},
+            )
+        ]
+
+    @staticmethod
+    def _parse_content(content: str) -> str:
+        """Convert the HTML body of a note into stripped plain text.
+
+        Args:
+            content: Raw text of the note's ``content`` element. May be None
+                or empty when the element has no text.
+
+        Returns:
+            The text produced by ``html2text``, stripped of surrounding
+            whitespace; an empty string when ``content`` is None or empty.
+
+        Raises:
+            ImportError: If ``html2text`` is not installed.
+        """
+        # Only the import is guarded, so an ImportError raised while converting
+        # is not misreported as a missing `html2text` installation.
+        try:
+            import html2text
+        except ImportError as e:
+            raise ImportError(
+                "Could not import `html2text`. Although it is not a required package "
+                "to use Langchain, using the EverNote loader requires `html2text`. "
+                "Please install `html2text` via `pip install html2text` and try again."
+            ) from e
+
+        # elem.text is None for an empty element; there is nothing to convert.
+        if not content:
+            return ""
+        return html2text.html2text(content).strip()
+
+    @staticmethod
+    def _parse_resource(resource: list) -> dict:
+        """Convert a ``resource`` element into a dict keyed by child tag.
+
+        Args:
+            resource: Iterable of child elements exposing ``tag`` and ``text``.
+
+        Returns:
+            Mapping of child tag to its text. The ``data`` child is
+            base64-decoded to bytes and an extra ``hash`` key holds the MD5 hex
+            digest of those bytes.
+        """
+        rsc_dict: Dict[str, Any] = {}
+        for elem in resource:
+            if elem.tag == "data":
+                # Sometimes elem.text is None
+                rsc_dict[elem.tag] = b64decode(elem.text) if elem.text else b""
+                rsc_dict["hash"] = hashlib.md5(rsc_dict[elem.tag]).hexdigest()
+            else:
+                rsc_dict[elem.tag] = elem.text
+
+        return rsc_dict
+
+    @staticmethod
+    def _parse_note(note: List, prefix: Optional[str] = None) -> dict:
+        """Convert a note element (or a nested container) into a flat dict.
+
+        Args:
+            note: Iterable of child elements exposing ``tag`` and ``text``.
+            prefix: When given, every returned key is rewritten as
+                ``"<prefix>.<key>"``. Used for the nested ``note-attributes``
+                element so its children do not collide with top-level tags.
+
+        Returns:
+            Mapping of tag name to value: ``content`` is plain text,
+            ``content-raw`` the untouched body, ``created``/``updated`` are
+            ``time.struct_time`` values (None when the element is empty),
+            ``resource`` is a list of parsed resources, and any other tag maps
+            to its text.
+        """
+        note_dict: Dict[str, Any] = {}
+        resources = []
+
+        for elem in note:
+            tag = elem.tag
+            if tag == "content":
+                note_dict[tag] = EverNoteLoader._parse_content(elem.text)
+                # A copy of original content
+                note_dict["content-raw"] = elem.text
+            elif tag == "resource":
+                resources.append(EverNoteLoader._parse_resource(elem))
+            elif tag in ("created", "updated"):
+                # An empty element has text None, which strptime rejects with a
+                # TypeError; keep None instead (same guard as resource data).
+                note_dict[tag] = (
+                    strptime(elem.text, "%Y%m%dT%H%M%SZ") if elem.text else None
+                )
+            elif tag == "note-attributes":
+                # Recursively enter the note-attributes tag
+                note_dict.update(EverNoteLoader._parse_note(elem, tag))
+            else:
+                note_dict[tag] = elem.text
+
+        if resources:
+            note_dict["resource"] = resources
+
+        if prefix is None:
+            return note_dict
+        return {f"{prefix}.{key}": value for key, value in note_dict.items()}
+
+    @staticmethod
+    def _parse_note_xml(xml_file: str) -> Iterator[Dict[str, Any]]:
+        """Parse Evernote xml.
+
+        Args:
+            xml_file: Path to the ``.enex`` export file.
+
+        Yields:
+            One dict per ``note`` element, as produced by ``_parse_note``.
+
+        Raises:
+            ImportError: If ``lxml`` is not installed. As this is a generator,
+                the error surfaces when iteration starts.
+        """
+        try:
+            from lxml import etree
+        except ImportError as e:
+            raise ImportError(
+                "Could not import `lxml`. Although it is not a required package to use "
+                "Langchain, using the EverNote loader requires `lxml`. Please install "
+                "`lxml` via `pip install lxml` and try again."
+            ) from e
+
+        # Without huge_tree set to True, parser may complain about huge text node
+        # Try to recover, because there may be "&nbsp;", which will cause
+        # "XMLSyntaxError: Entity 'nbsp' not defined"
+        context = etree.iterparse(
+            xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
+        )
+
+        for _, elem in context:
+            if elem.tag == "note":
+                yield EverNoteLoader._parse_note(elem)
+                # The parsed dict holds only plain Python values, so the
+                # element's subtree can be released to bound memory use on
+                # large exports.
+                elem.clear()