Adds dataview fields and tags to metadata #9800 (#9801)

Description: Adds tags and dataview fields to the metadata of the
documents loaded by ObsidianLoader.
  - Issue: #9800, #4991
  - Dependencies: none
- Tag maintainer: My best guess is @hwchase17 looking through the git
logs
  - Twitter handle: I don't use twitter, sorry!
This commit is contained in:
Dane Summers 2023-09-03 18:56:48 -04:00 committed by GitHub
parent ce47124e8f
commit 7d1b0fbe79
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 275 additions and 21 deletions

View File

@ -1,15 +1,24 @@
import logging
import re import re
from pathlib import Path from pathlib import Path
from typing import List from typing import List
import yaml
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
logger = logging.getLogger(__name__)
class ObsidianLoader(BaseLoader): class ObsidianLoader(BaseLoader):
"""Load `Obsidian` files from directory.""" """Load `Obsidian` files from directory."""
FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL) FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)
# Matches "#tag" only when the "#" is directly preceded by a whitespace
# character (the class [^\S\/] excludes every non-whitespace character and "/").
# The captured name starts with ASCII letters or underscores and may continue
# with "-", "_", "/" or word characters. Because a preceding character is
# required, a "#tag" at the very first position of the text is not matched.
TAG_REGEX = re.compile(r"[^\S\/]#([a-zA-Z_]+[-_/\w]*)")
# Matches a line of the form "key:: value" (optional leading whitespace);
# group 1 is the key, group 2 is the rest of the line.
DATAVIEW_LINE_REGEX = re.compile(r"^\s*(\w+)::\s*(.*)$", re.MULTILINE)
# Matches an inline "[key:: value]" field. The value group is greedy, so it
# runs up to the last "]" on the same line.
DATAVIEW_INLINE_BRACKET_REGEX = re.compile(r"\[(\w+)::\s*(.*)\]", re.MULTILINE)
# Matches an inline "(key:: value)" field. The value group is greedy, so it
# runs up to the last ")" on the same line.
DATAVIEW_INLINE_PAREN_REGEX = re.compile(r"\((\w+)::\s*(.*)\)", re.MULTILINE)
def __init__( def __init__(
self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
@ -30,18 +39,64 @@ class ObsidianLoader(BaseLoader):
"""Parse front matter metadata from the content and return it as a dict.""" """Parse front matter metadata from the content and return it as a dict."""
if not self.collect_metadata: if not self.collect_metadata:
return {} return {}
match = self.FRONT_MATTER_REGEX.search(content) match = self.FRONT_MATTER_REGEX.search(content)
front_matter = {} if not match:
if match: return {}
lines = match.group(1).split("\n")
for line in lines: try:
if ":" in line: front_matter = yaml.safe_load(match.group(1))
key, value = line.split(":", 1)
front_matter[key.strip()] = value.strip() # If tags are a string, split them into a list
else: if "tags" in front_matter and isinstance(front_matter["tags"], str):
# Skip lines without a colon front_matter["tags"] = front_matter["tags"].split(", ")
continue
return front_matter return front_matter
except yaml.parser.ParserError:
logger.warning("Encountered non-yaml frontmatter")
return {}
def _to_langchain_compatible_metadata(self, metadata: dict) -> dict:
"""Convert a dictionary to a compatible with langchain."""
result = {}
for key, value in metadata.items():
if type(value) in {str, int, float}:
result[key] = value
else:
result[key] = str(value)
return result
def _parse_document_tags(self, content: str) -> set:
"""Return a set of all tags in within the document."""
if not self.collect_metadata:
return set()
match = self.TAG_REGEX.findall(content)
if not match:
return set()
return {tag for tag in match}
def _parse_dataview_fields(self, content: str) -> dict:
"""Parse obsidian dataview plugin fields from the content and return it
as a dict."""
if not self.collect_metadata:
return {}
return {
**{
match[0]: match[1]
for match in self.DATAVIEW_LINE_REGEX.findall(content)
},
**{
match[0]: match[1]
for match in self.DATAVIEW_INLINE_PAREN_REGEX.findall(content)
},
**{
match[0]: match[1]
for match in self.DATAVIEW_INLINE_BRACKET_REGEX.findall(content)
},
}
def _remove_front_matter(self, content: str) -> str: def _remove_front_matter(self, content: str) -> str:
"""Remove front matter metadata from the given content.""" """Remove front matter metadata from the given content."""
@ -51,22 +106,29 @@ class ObsidianLoader(BaseLoader):
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load documents.""" """Load documents."""
ps = list(Path(self.file_path).glob("**/*.md")) paths = list(Path(self.file_path).glob("**/*.md"))
docs = [] docs = []
for p in ps: for path in paths:
with open(p, encoding=self.encoding) as f: with open(path, encoding=self.encoding) as f:
text = f.read() text = f.read()
front_matter = self._parse_front_matter(text) front_matter = self._parse_front_matter(text)
tags = self._parse_document_tags(text)
dataview_fields = self._parse_dataview_fields(text)
text = self._remove_front_matter(text) text = self._remove_front_matter(text)
metadata = { metadata = {
"source": str(p.name), "source": str(path.name),
"path": str(p), "path": str(path),
"created": p.stat().st_ctime, "created": path.stat().st_ctime,
"last_modified": p.stat().st_mtime, "last_modified": path.stat().st_mtime,
"last_accessed": p.stat().st_atime, "last_accessed": path.stat().st_atime,
**front_matter, **self._to_langchain_compatible_metadata(front_matter),
**dataview_fields,
} }
if tags or front_matter.get("tags"):
metadata["tags"] = ",".join(tags | set(front_matter.get("tags", [])))
docs.append(Document(page_content=text, metadata=metadata)) docs.append(Document(page_content=text, metadata=metadata))
return docs return docs

View File

@ -0,0 +1,9 @@
---
anArray:
one
- two
- three
tags: 'onetag', 'twotag' ]
---
A document with frontmatter that isn't valid.

View File

@ -0,0 +1,5 @@
---
tags: journal/entry, obsidian
---
No other content than the frontmatter.

View File

@ -0,0 +1,14 @@
### Description
#recipes #dessert #cookies
A document with HR elements that might trip up a front matter parser:
---
### Ingredients
- 3/4 cup (170g) **unsalted butter**, slightly softened to room temperature.
- 1 and 1/2 cups (180g) **confectioners sugar**
---

View File

@ -0,0 +1 @@
A markdown document with no additional metadata.

View File

@ -0,0 +1,35 @@
---
aFloat: 13.12345
anInt: 15
aBool: true
aString: string value
anArray:
- one
- two
- three
aDict:
dictId1: '58417'
dictId2: 1500
tags: [ 'onetag', 'twotag' ]
---
# Tags
()#notatag
#12345
#read
something #tagWithCases
- #tag-with-dash
#tag_with_underscore #tag/with/nesting
# Dataview
Here is some data in a [dataview1:: a value] line.
Here is even more data in a (dataview2:: another value) line.
dataview3:: more data
notdataview4: this is not a field
notdataview5: this is not a field
# Text content
https://example.com/blog/#not-a-tag

View File

@ -0,0 +1,128 @@
from pathlib import Path
from langchain.document_loaders.obsidian import ObsidianLoader
# Directory holding the sample Obsidian notes used by the tests below.
OBSIDIAN_EXAMPLE_PATH = Path(__file__).parent.joinpath("sample_documents", "obsidian")

# Metadata keys the tests expect on every loaded document.
STANDARD_METADATA_FIELDS = {
    "source",
    "path",
    "created",
    "last_modified",
    "last_accessed",
}

# The sample notes are loaded once at import time and shared by the tests.
loader = ObsidianLoader(str(OBSIDIAN_EXAMPLE_PATH))
docs = loader.load()
def test_page_content_loaded() -> None:
    """Check that five documents are loaded and none has empty page_content."""
    contents = [doc.page_content for doc in docs]
    assert len(contents) == 5
    assert all(contents)
def test_disable_collect_metadata() -> None:
    """With collect_metadata=False only the standard metadata keys are set."""
    bare_docs = ObsidianLoader(
        str(OBSIDIAN_EXAMPLE_PATH), collect_metadata=False
    ).load()

    assert len(bare_docs) == 5
    for bare_doc in bare_docs:
        assert bare_doc.page_content
    for bare_doc in bare_docs:
        assert set(bare_doc.metadata) == STANDARD_METADATA_FIELDS
def test_metadata_without_frontmatter() -> None:
    """A note without front matter carries exactly the standard metadata keys."""
    plain_doc = next(
        filter(lambda d: d.metadata["source"] == "no_metadata.md", docs)
    )
    metadata_keys = set(plain_doc.metadata)
    assert metadata_keys == STANDARD_METADATA_FIELDS
def test_metadata_with_frontmatter() -> None:
    """Front matter tags end up in metadata as one comma-separated string."""
    fm_doc = next(filter(lambda d: d.metadata["source"] == "frontmatter.md", docs))

    expected_keys = STANDARD_METADATA_FIELDS | {"tags"}
    assert set(fm_doc.metadata) == expected_keys

    found_tags = set(fm_doc.metadata["tags"].split(","))
    assert found_tags == {"journal/entry", "obsidian"}
def test_metadata_with_bad_frontmatter() -> None:
    """A note whose front matter is not valid YAML gets only standard keys."""
    bad_doc = next(
        filter(lambda d: d.metadata["source"] == "bad_frontmatter.md", docs)
    )
    metadata_keys = set(bad_doc.metadata)
    assert metadata_keys == STANDARD_METADATA_FIELDS
def test_metadata_with_tags_and_frontmatter() -> None:
    """Front matter keys, tags and dataview fields all appear in metadata."""
    frontmatter_keys = {
        "aFloat",
        "anInt",
        "aBool",
        "aString",
        "anArray",
        "aDict",
        "tags",
    }
    dataview_keys = {"dataview1", "dataview2", "dataview3"}
    expected_keys = STANDARD_METADATA_FIELDS | frontmatter_keys | dataview_keys

    full_doc = next(
        filter(lambda d: d.metadata["source"] == "tags_and_frontmatter.md", docs)
    )
    assert set(full_doc.metadata) == expected_keys
def test_tags_in_page_content() -> None:
    """Tags found in the note body add a "tags" key to the metadata."""
    tagged_doc = next(
        filter(lambda d: d.metadata["source"] == "no_frontmatter.md", docs)
    )
    expected_keys = STANDARD_METADATA_FIELDS | {"tags"}
    assert set(tagged_doc.metadata) == expected_keys
def test_boolean_metadata() -> None:
    """The ``aBool`` front matter value is loaded as a truthy metadata value.

    Only truthiness is asserted here, not the exact type of the stored value.
    """
    metadata = next(
        filter(lambda d: d.metadata["source"] == "tags_and_frontmatter.md", docs)
    ).metadata
    a_bool = metadata["aBool"]
    assert a_bool
def test_float_metadata() -> None:
    """The ``aFloat`` front matter value is loaded as the number 13.12345."""
    metadata = next(
        filter(lambda d: d.metadata["source"] == "tags_and_frontmatter.md", docs)
    ).metadata
    a_float = metadata["aFloat"]
    assert a_float == 13.12345
def test_int_metadata() -> None:
    """The ``anInt`` front matter value is loaded as the number 15."""
    metadata = next(
        filter(lambda d: d.metadata["source"] == "tags_and_frontmatter.md", docs)
    ).metadata
    an_int = metadata["anInt"]
    assert an_int == 15
def test_string_metadata() -> None:
    """The ``aString`` front matter value is loaded unchanged."""
    metadata = next(
        filter(lambda d: d.metadata["source"] == "tags_and_frontmatter.md", docs)
    ).metadata
    a_string = metadata["aString"]
    assert a_string == "string value"
def test_array_metadata() -> None:
    """The ``anArray`` front matter list is stored as its string form."""
    metadata = next(
        filter(lambda d: d.metadata["source"] == "tags_and_frontmatter.md", docs)
    ).metadata
    an_array = metadata["anArray"]
    assert an_array == "['one', 'two', 'three']"
def test_dict_metadata() -> None:
    """The ``aDict`` front matter mapping is stored as its string form."""
    metadata = next(
        filter(lambda d: d.metadata["source"] == "tags_and_frontmatter.md", docs)
    ).metadata
    a_dict = metadata["aDict"]
    assert a_dict == "{'dictId1': '58417', 'dictId2': 1500}"