mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-04 04:28:58 +00:00
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)
Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories 
community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes
This commit is contained in:
151
libs/community/langchain_community/document_loaders/evernote.py
Normal file
151
libs/community/langchain_community/document_loaders/evernote.py
Normal file
@@ -0,0 +1,151 @@
|
||||
"""Load documents from Evernote.
|
||||
|
||||
https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
|
||||
"""
|
||||
import hashlib
|
||||
import logging
|
||||
from base64 import b64decode
|
||||
from time import strptime
|
||||
from typing import Any, Dict, Iterator, List, Optional
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders.base import BaseLoader
|
||||
|
||||
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EverNoteLoader(BaseLoader):
    """Load from `EverNote`.

    Loads an EverNote notebook export file e.g. my_notebook.enex into Documents.
    Instructions on producing this file can be found at
    https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML

    Currently only the plain text in the note is extracted and stored as the contents
    of the Document, any non content metadata (e.g. 'author', 'created', 'updated' etc.
    but not 'content-raw' or 'resource') tags on the note will be extracted and stored
    as metadata on the Document.

    Args:
        file_path (str): The path to the notebook export with a .enex extension
        load_single_document (bool): Whether or not to concatenate the content of all
            notes into a single long Document.
            If this is set to True (default) then the only metadata on the document
            will be the 'source' which contains the file name of the export.
    """  # noqa: E501

    def __init__(self, file_path: str, load_single_document: bool = True):
        """Initialize with file path."""
        self.file_path = file_path
        self.load_single_document = load_single_document

    def load(self) -> List[Document]:
        """Load documents from EverNote export file.

        Returns:
            One Document per note that has a ``content`` entry when
            ``load_single_document`` is False; otherwise a single-element list
            whose Document holds the contents of all notes joined without a
            separator and only the ``source`` metadata key.
        """
        # These keys carry the note body / binary attachments, not metadata.
        excluded_keys = ("content", "content-raw", "resource")

        documents = [
            Document(
                page_content=note["content"],
                metadata={
                    **{
                        key: value
                        for key, value in note.items()
                        if key not in excluded_keys
                    },
                    # Placed last so it wins over any note key named "source".
                    "source": self.file_path,
                },
            )
            for note in self._parse_note_xml(self.file_path)
            if note.get("content") is not None
        ]

        if not self.load_single_document:
            return documents

        return [
            Document(
                page_content="".join(document.page_content for document in documents),
                metadata={"source": self.file_path},
            )
        ]

    @staticmethod
    def _parse_content(content: Optional[str]) -> str:
        """Convert the HTML body of a note into stripped plain text.

        Args:
            content: Text of the note's ``content`` element. May be None or
                empty when the element has no text.

        Returns:
            The text produced by ``html2text`` with surrounding whitespace
            stripped, or an empty string when there is no content.

        Raises:
            ImportError: If the optional ``html2text`` package is missing.
        """
        try:
            import html2text
        except ImportError as e:
            raise ImportError(
                "Could not import `html2text`. Although it is not a required package "
                "to use Langchain, using the EverNote loader requires `html2text`. "
                "Please install `html2text` via `pip install html2text` and try again."
            ) from e

        # An empty <content/> element has text None; html2text cannot handle that.
        if not content:
            return ""
        return html2text.html2text(content).strip()

    @staticmethod
    def _parse_resource(resource: list) -> dict:
        """Parse the children of a ``resource`` element into a dict.

        The ``data`` child is base64-decoded to bytes and its MD5 hex digest is
        stored under ``hash``; every other child is stored as ``tag -> text``.

        Args:
            resource: Iterable of child elements exposing ``tag`` and ``text``.

        Returns:
            Mapping of child tag names to their parsed values.
        """
        rsc_dict: Dict[str, Any] = {}
        for elem in resource:
            if elem.tag == "data":
                # Sometimes elem.text is None
                rsc_dict[elem.tag] = b64decode(elem.text) if elem.text else b""
                rsc_dict["hash"] = hashlib.md5(rsc_dict[elem.tag]).hexdigest()
            else:
                rsc_dict[elem.tag] = elem.text

        return rsc_dict

    @staticmethod
    def _parse_note(note: List, prefix: Optional[str] = None) -> dict:
        """Parse the children of a ``note`` (or nested) element into a dict.

        Args:
            note: Iterable of child elements exposing ``tag`` and ``text``.
            prefix: When given, every key of the returned dict is rewritten to
                ``"<prefix>.<key>"``. Used for the nested ``note-attributes``
                element.

        Returns:
            Mapping with the plain-text ``content`` and the untouched
            ``content-raw``, ``created`` / ``updated`` as ``time.struct_time``
            (None when the element is empty), a ``resource`` list when any
            resources are present, flattened ``note-attributes.*`` entries and
            the text of every other child keyed by its tag.
        """
        note_dict: Dict[str, Any] = {}
        resources = []

        def add_prefix(element_tag: str) -> str:
            if prefix is None:
                return element_tag
            return f"{prefix}.{element_tag}"

        for elem in note:
            if elem.tag == "content":
                note_dict[elem.tag] = EverNoteLoader._parse_content(elem.text)
                # A copy of original content
                note_dict["content-raw"] = elem.text
            elif elem.tag == "resource":
                resources.append(EverNoteLoader._parse_resource(elem))
            elif elem.tag in ("created", "updated"):
                # An empty timestamp element has no text; strptime(None) would
                # raise TypeError, so record the missing value as None instead.
                note_dict[elem.tag] = (
                    strptime(elem.text, "%Y%m%dT%H%M%SZ") if elem.text else None
                )
            elif elem.tag == "note-attributes":
                additional_attributes = EverNoteLoader._parse_note(
                    elem, elem.tag
                )  # Recursively enter the note-attributes tag
                note_dict.update(additional_attributes)
            else:
                note_dict[elem.tag] = elem.text

        if resources:
            note_dict["resource"] = resources

        return {add_prefix(key): value for key, value in note_dict.items()}

    @staticmethod
    def _parse_note_xml(xml_file: str) -> Iterator[Dict[str, Any]]:
        """Parse Evernote xml.

        Args:
            xml_file: Path of the ``.enex`` export to stream.

        Yields:
            One dict per ``note`` element, as produced by ``_parse_note``.

        Raises:
            ImportError: If the optional ``lxml`` package is missing (the
                problem is also logged before re-raising).
        """
        # Without huge_tree set to True, parser may complain about huge text node
        # Try to recover, because there may be "&nbsp;", which will cause
        # "XMLSyntaxError: Entity 'nbsp' not defined"
        try:
            from lxml import etree
        except ImportError:
            logger.error(
                "Could not import `lxml`. Although it is not a required package to use "
                "Langchain, using the EverNote loader requires `lxml`. Please install "
                "`lxml` via `pip install lxml` and try again."
            )
            raise

        context = etree.iterparse(
            xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
        )

        for _, elem in context:
            if elem.tag == "note":
                yield EverNoteLoader._parse_note(elem)
                # The parsed dict only holds copied str/bytes values, so the
                # subtree can be released to keep memory flat on large exports.
                elem.clear()
|
Reference in New Issue
Block a user