mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-16 23:13:31 +00:00
FEATURE: Add OneNote document loader (#13841)
- **Description:** Added OneNote document loader - **Issue:** #12125 - **Dependencies:** msal Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
216
libs/langchain/langchain/document_loaders/onenote.py
Normal file
216
libs/langchain/langchain/document_loaders/onenote.py
Normal file
@@ -0,0 +1,216 @@
|
||||
"""Loads data from OneNote Notebooks"""
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterator, List, Optional
|
||||
|
||||
import requests
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.pydantic_v1 import BaseModel, BaseSettings, Field, FilePath, SecretStr
|
||||
|
||||
|
||||
class _OneNoteGraphSettings(BaseSettings):
    """Credentials for the Microsoft Graph application, read from the environment.

    Both fields are required. They are populated from the
    ``MS_GRAPH_CLIENT_ID`` and ``MS_GRAPH_CLIENT_SECRET`` environment
    variables (or from a ``.env`` file, see ``Config``).
    """

    client_id: str = Field(..., env="MS_GRAPH_CLIENT_ID")
    client_secret: SecretStr = Field(..., env="MS_GRAPH_CLIENT_SECRET")

    class Config:
        """Config for OneNoteGraphSettings."""

        env_prefix = ""
        # Was misspelled ``case_sentive``, so the option was never actually
        # declared; the value itself is unchanged.
        case_sensitive = False
        env_file = ".env"
|
||||
|
||||
|
||||
class OneNoteLoader(BaseLoader, BaseModel):
    """Load pages from OneNote notebooks.

    Pages are selected in one of two ways:

    * ``object_ids`` is set: exactly those page IDs are fetched.
    * otherwise: the pages endpoint is queried (optionally filtered by
      ``notebook_name``, ``section_name`` and ``page_title``) and every
      result page is followed through ``@odata.nextLink``.

    Authentication uses, in order of precedence: an explicit
    ``access_token``; a token read from ``token_path`` when
    ``auth_with_token`` is true; or an interactive MSAL authorization-code
    flow whose resulting token is written to ``token_path``.
    """

    settings: _OneNoteGraphSettings = Field(default_factory=_OneNoteGraphSettings)
    """Settings for the Microsoft Graph API client."""
    auth_with_token: bool = False
    """Whether to authenticate with a token or not. Defaults to False."""
    access_token: str = ""
    """Personal access token"""
    onenote_api_base_url: str = "https://graph.microsoft.com/v1.0/me/onenote"
    """URL of Microsoft Graph API for OneNote"""
    authority_url = "https://login.microsoftonline.com/consumers/"
    """A URL that identifies a token authority"""
    token_path: FilePath = Path.home() / ".credentials" / "onenote_graph_token.txt"
    """Path to the file where the access token is stored"""
    notebook_name: Optional[str] = None
    """Filter on notebook name"""
    section_name: Optional[str] = None
    """Filter on section name"""
    page_title: Optional[str] = None
    """Filter on page title"""
    object_ids: Optional[List[str]] = None
    """ The IDs of the objects to load data from."""

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily get pages from OneNote notebooks.

        Yields:
            One Document per page with attributes:
                - page_content: the page text, one line per text node
                - metadata
                    - title: text of the page's ``<title>`` tag, or ``""``
                      when the page has none

        Raises:
            ImportError: if ``beautifulsoup4`` is not installed.
            requests.HTTPError: if a Graph API request fails.
        """
        # Check the optional dependency first so that a missing package does
        # not surface only after an interactive authentication round-trip.
        try:
            from bs4 import BeautifulSoup
        except ImportError as e:
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install bs4`"
            ) from e

        self._auth()

        for page_id in self._iter_page_ids():
            page_content_html = self._get_page_content(page_id)
            soup = BeautifulSoup(page_content_html, "html.parser")
            title_tag = soup.title
            # Title and content are derived independently: a page without a
            # <title> still yields its text, with an empty title.
            page_title = title_tag.get_text(strip=True) if title_tag else ""
            page_content = soup.get_text(separator="\n", strip=True)
            yield Document(
                page_content=page_content, metadata={"title": page_title}
            )

    def load(self) -> List[Document]:
        """
        Get pages from OneNote notebooks.

        Returns:
            A list of Documents with attributes:
                - page_content
                - metadata
                    - title
        """
        return list(self.lazy_load())

    def _iter_page_ids(self) -> Iterator[str]:
        """Yield the IDs of the pages to load.

        Uses ``object_ids`` verbatim when it is set; otherwise walks the
        paginated pages listing built by ``_url``.
        """
        if self.object_ids is not None:
            yield from self.object_ids
            return

        request_url = self._url
        while request_url != "":
            response = requests.get(request_url, headers=self._headers, timeout=10)
            response.raise_for_status()
            pages = response.json()

            for page in pages["value"]:
                yield page["id"]

            # An absent next link marks the last result page.
            request_url = pages.get("@odata.nextLink", "")

    def _get_page_content(self, page_id: str) -> str:
        """Get the HTML content of a single page from the OneNote API."""
        request_url = self.onenote_api_base_url + f"/pages/{page_id}/content"
        response = requests.get(request_url, headers=self._headers, timeout=10)
        response.raise_for_status()
        return response.text

    @property
    def _headers(self) -> Dict[str, str]:
        """Return headers for requests to OneNote API"""
        return {
            "Authorization": f"Bearer {self.access_token}",
        }

    @property
    def _scopes(self) -> List[str]:
        """Return required scopes."""
        return ["Notes.Read"]

    def _auth(self) -> None:
        """Authenticate with Microsoft Graph API.

        Does nothing when ``access_token`` is already set. Otherwise the
        token is either read from ``token_path`` (``auth_with_token``) or
        obtained through an interactive authorization-code flow, in which
        case it is also written to ``token_path``.

        Raises:
            ImportError: if ``msal`` is needed but not installed.
            IndexError: if the pasted URL contains no ``code=`` parameter.
            Exception: if the token directory cannot be created.
        """
        if self.access_token != "":
            return

        if self.auth_with_token:
            # Strip surrounding whitespace: a trailing newline left by an
            # editor would otherwise end up inside the Authorization header.
            with self.token_path.open("r") as token_file:
                self.access_token = token_file.read().strip()
            return

        try:
            from msal import ConfidentialClientApplication
        except ImportError as e:
            raise ImportError(
                "MSAL package not found, please install it with `pip install msal`"
            ) from e

        client_instance = ConfidentialClientApplication(
            client_id=self.settings.client_id,
            client_credential=self.settings.client_secret.get_secret_value(),
            authority=self.authority_url,
        )

        authorization_request_url = client_instance.get_authorization_request_url(
            self._scopes
        )
        print("Visit the following url to give consent:")
        print(authorization_request_url)
        authorization_url = input("Paste the authenticated url here:\n")

        # Raises IndexError when the pasted URL carries no ``code=`` part.
        authorization_code = authorization_url.split("code=")[1].split("&")[0]
        access_token_json = client_instance.acquire_token_by_authorization_code(
            code=authorization_code, scopes=self._scopes
        )
        self.access_token = access_token_json["access_token"]

        try:
            # exist_ok avoids the check-then-create race of a separate
            # ``exists()`` test.
            self.token_path.parent.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            raise Exception(
                f"Could not create the folder {self.token_path.parent} "
                + "to store the access token."
            ) from e

        with self.token_path.open("w") as token_file:
            token_file.write(self.access_token)

    @property
    def _url(self) -> str:
        """Create URL for getting page ids from the OneNoteApi API."""
        from urllib.parse import quote

        def _literal(value: str) -> str:
            # OData string literal: double embedded single quotes, then
            # percent-encode everything so characters such as ``&`` or ``#``
            # cannot break out of the query string.
            return "'" + quote(value.replace("'", "''"), safe="") + "'"

        query_params_list = ["$select=id"]
        filter_list = []
        expand_list = []

        if self.notebook_name is not None:
            filter_list.append(
                "parentNotebook/displayName%20eq%20" + _literal(self.notebook_name)
            )
            expand_list.append("parentNotebook")
        if self.section_name is not None:
            filter_list.append(
                "parentSection/displayName%20eq%20" + _literal(self.section_name)
            )
            expand_list.append("parentSection")
        if self.page_title is not None:
            filter_list.append("title%20eq%20" + _literal(self.page_title))

        if expand_list:
            query_params_list.append("$expand=" + ",".join(expand_list))
        if filter_list:
            query_params_list.append("$filter=" + "%20and%20".join(filter_list))

        query_params = "&".join(query_params_list)
        return f"{self.onenote_api_base_url}/pages?{query_params}"
|
10
libs/langchain/poetry.lock
generated
10
libs/langchain/poetry.lock
generated
@@ -4848,13 +4848,13 @@ tests = ["pytest (>=4.6)"]
|
||||
|
||||
[[package]]
|
||||
name = "msal"
|
||||
version = "1.24.1"
|
||||
version = "1.25.0"
|
||||
description = "The Microsoft Authentication Library (MSAL) for Python library"
|
||||
optional = true
|
||||
python-versions = ">=2.7"
|
||||
files = [
|
||||
{file = "msal-1.24.1-py2.py3-none-any.whl", hash = "sha256:ce4320688f95c301ee74a4d0e9dbcfe029a63663a8cc61756f40d0d0d36574ad"},
|
||||
{file = "msal-1.24.1.tar.gz", hash = "sha256:aa0972884b3c6fdec53d9a0bd15c12e5bd7b71ac1b66d746f54d128709f3f8f8"},
|
||||
{file = "msal-1.25.0-py2.py3-none-any.whl", hash = "sha256:386df621becb506bc315a713ec3d4d5b5d6163116955c7dde23622f156b81af6"},
|
||||
{file = "msal-1.25.0.tar.gz", hash = "sha256:f44329fdb59f4f044c779164a34474b8a44ad9e4940afbc4c3a3a2bbe90324d9"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -11075,7 +11075,7 @@ cli = ["typer"]
|
||||
cohere = ["cohere"]
|
||||
docarray = ["docarray"]
|
||||
embeddings = ["sentence-transformers"]
|
||||
extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "dashvector", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openai", "openapi-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"]
|
||||
extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "dashvector", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "html2text", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openai", "openapi-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"]
|
||||
javascript = ["esprima"]
|
||||
llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"]
|
||||
openai = ["openai", "tiktoken"]
|
||||
@@ -11085,4 +11085,4 @@ text-helpers = ["chardet"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.8.1,<4.0"
|
||||
content-hash = "9e9f9edcf4543eca60bc207ed6d91991e172a5b853b0caf869ff1762ead37186"
|
||||
content-hash = "37e62f668e1acddc4e462fdac5f694af3916b6edbd1ccde0a54c9a57524d6c92"
|
||||
|
@@ -143,6 +143,7 @@ azure-ai-textanalytics = {version = "^5.3.0", optional = true}
|
||||
google-cloud-documentai = {version = "^2.20.1", optional = true}
|
||||
fireworks-ai = {version = "^0.6.0", optional = true, python = ">=3.9,<4.0"}
|
||||
javelin-sdk = {version = "^0.1.8", optional = true}
|
||||
msal = {version = "^1.25.0", optional = true}
|
||||
|
||||
|
||||
[tool.poetry.group.test.dependencies]
|
||||
@@ -341,6 +342,7 @@ extended_testing = [
|
||||
"atlassian-python-api",
|
||||
"mwparserfromhell",
|
||||
"mwxml",
|
||||
"msal",
|
||||
"pandas",
|
||||
"telethon",
|
||||
"psychicapi",
|
||||
|
165
libs/langchain/tests/unit_tests/document_loaders/test_onenote.py
Normal file
165
libs/langchain/tests/unit_tests/document_loaders/test_onenote.py
Normal file
@@ -0,0 +1,165 @@
|
||||
import os
|
||||
from typing import Any
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
from _pytest.monkeypatch import MonkeyPatch
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.onenote import OneNoteLoader
|
||||
|
||||
|
||||
def test_initialization(monkeypatch: MonkeyPatch) -> None:
    """Constructor arguments are stored and reflected in the auth header."""
    # setenv (rather than writing os.environ) is undone after the test, so
    # the credentials do not leak into other tests.
    monkeypatch.setenv("MS_GRAPH_CLIENT_ID", "CLIENT_ID")
    monkeypatch.setenv("MS_GRAPH_CLIENT_SECRET", "CLIENT_SECRET")

    loader = OneNoteLoader(
        notebook_name="test_notebook",
        section_name="test_section",
        page_title="test_title",
        access_token="access_token",
    )
    assert loader.notebook_name == "test_notebook"
    assert loader.section_name == "test_section"
    assert loader.page_title == "test_title"
    assert loader.access_token == "access_token"
    assert loader._headers == {
        "Authorization": "Bearer access_token",
    }
|
||||
|
||||
|
||||
@pytest.mark.requires("bs4")
def test_load(mocker: MockerFixture, monkeypatch: MonkeyPatch) -> None:
    """``load`` handles an empty listing and parses an explicitly given page."""
    # setenv is reverted at teardown, unlike a direct os.environ write.
    monkeypatch.setenv("MS_GRAPH_CLIENT_ID", "CLIENT_ID")
    monkeypatch.setenv("MS_GRAPH_CLIENT_SECRET", "CLIENT_SECRET")

    # Listing endpoint returns no pages and no next link.
    mocker.patch(
        "requests.get",
        return_value=mocker.MagicMock(json=lambda: {"value": []}, links=None),
    )
    loader = OneNoteLoader(
        notebook_name="test_notebook",
        section_name="test_section",
        page_title="test_title",
        access_token="access_token",
    )
    documents = loader.load()
    assert documents == []

    # Explicit page ID: content fetch is stubbed with a fixed HTML document.
    mocker.patch(
        "langchain.document_loaders.onenote.OneNoteLoader._get_page_content",
        return_value=(
            "<html><head><title>Test Title</title></head>"
            "<body><p>Test Content</p></body></html>"
        ),
    )
    loader = OneNoteLoader(object_ids=["test_id"], access_token="access_token")
    documents = loader.load()
    assert documents == [
        Document(
            page_content="Test Title\nTest Content", metadata={"title": "Test Title"}
        )
    ]
|
||||
|
||||
|
||||
class FakeConfidentialClientApplication(Mock):
    """Mock stand-in for the MSAL client that hands back a fixed consent URL."""

    def get_authorization_request_url(self, *_args: Any, **_kwargs: Any) -> str:
        """Ignore all arguments and return a constant placeholder URL."""
        fake_url = "fake_authorization_url"
        return fake_url
|
||||
|
||||
|
||||
@pytest.mark.requires("msal")
def test_msal_import(monkeypatch: MonkeyPatch, mocker: MockerFixture) -> None:
    """The interactive flow rejects a pasted URL that has no ``code=`` part."""
    # setenv is reverted at teardown, unlike a direct os.environ write.
    monkeypatch.setenv("MS_GRAPH_CLIENT_ID", "CLIENT_ID")
    monkeypatch.setenv("MS_GRAPH_CLIENT_SECRET", "CLIENT_SECRET")

    # Simulate the user pasting something that is not a redirect URL.
    monkeypatch.setattr("builtins.input", lambda _: "invalid_url")
    mocker.patch(
        "msal.ConfidentialClientApplication",
        return_value=FakeConfidentialClientApplication(),
    )
    loader = OneNoteLoader(
        notebook_name="test_notebook",
        section_name="test_section",
        page_title="test_title",
    )
    with pytest.raises(IndexError):
        loader._auth()
|
||||
|
||||
|
||||
def test_url(monkeypatch: MonkeyPatch) -> None:
    """``_url`` builds the expected query for every filter combination."""
    # setenv is reverted at teardown, unlike a direct os.environ write.
    monkeypatch.setenv("MS_GRAPH_CLIENT_ID", "CLIENT_ID")
    monkeypatch.setenv("MS_GRAPH_CLIENT_SECRET", "CLIENT_SECRET")

    base_url = "https://graph.microsoft.com/v1.0/me/onenote"

    # Notebook + section + title.
    loader = OneNoteLoader(
        notebook_name="test_notebook",
        section_name="test_section",
        page_title="test_title",
        access_token="access_token",
        onenote_api_base_url=base_url,
    )
    assert loader._url == (
        "https://graph.microsoft.com/v1.0/me/onenote/pages?$select=id"
        "&$expand=parentNotebook,parentSection"
        "&$filter=parentNotebook/displayName%20eq%20'test_notebook'"
        "%20and%20parentSection/displayName%20eq%20'test_section'"
        "%20and%20title%20eq%20'test_title'"
    )

    # Notebook + section.
    loader = OneNoteLoader(
        notebook_name="test_notebook",
        section_name="test_section",
        access_token="access_token",
        onenote_api_base_url=base_url,
    )
    assert loader._url == (
        "https://graph.microsoft.com/v1.0/me/onenote/pages?$select=id"
        "&$expand=parentNotebook,parentSection"
        "&$filter=parentNotebook/displayName%20eq%20'test_notebook'"
        "%20and%20parentSection/displayName%20eq%20'test_section'"
    )

    # Notebook only.
    loader = OneNoteLoader(
        notebook_name="test_notebook",
        access_token="access_token",
        onenote_api_base_url=base_url,
    )
    assert loader._url == (
        "https://graph.microsoft.com/v1.0/me/onenote/pages?$select=id"
        "&$expand=parentNotebook"
        "&$filter=parentNotebook/displayName%20eq%20'test_notebook'"
    )

    # Section only.
    loader = OneNoteLoader(
        section_name="test_section",
        access_token="access_token",
        onenote_api_base_url=base_url,
    )
    assert loader._url == (
        "https://graph.microsoft.com/v1.0/me/onenote/pages?$select=id"
        "&$expand=parentSection"
        "&$filter=parentSection/displayName%20eq%20'test_section'"
    )

    # Section + title.
    loader = OneNoteLoader(
        section_name="test_section",
        page_title="test_title",
        access_token="access_token",
        onenote_api_base_url=base_url,
    )
    assert loader._url == (
        "https://graph.microsoft.com/v1.0/me/onenote/pages?$select=id"
        "&$expand=parentSection"
        "&$filter=parentSection/displayName%20eq%20'test_section'"
        "%20and%20title%20eq%20'test_title'"
    )

    # Title only: no $expand is needed.
    loader = OneNoteLoader(
        page_title="test_title",
        access_token="access_token",
        onenote_api_base_url=base_url,
    )
    assert loader._url == (
        "https://graph.microsoft.com/v1.0/me/onenote/pages?$select=id"
        "&$filter=title%20eq%20'test_title'"
    )
|
Reference in New Issue
Block a user