mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-04 12:18:24 +00:00
Update confluence.py to return spaces between elements (#5383)
# Update confluence.py to return spaces between elements like headers and links.

Please see https://stackoverflow.com/questions/48913975/how-to-return-nicely-formatted-text-in-beautifulsoup4-when-html-text-is-across-m

Given:

```html
<address> 183 Main St<br>East Copper<br>Massachusetts<br>U S A<br> MA 01516-113 </address>
```

The document loader currently returns:

```
'183 Main StEast CopperMassachusettsU S A MA 01516-113'
```

After this change, the document loader will return:

```
183 Main St East Copper Massachusetts U S A MA 01516-113
```

@eyurtsev would you prefer this to be an option that can be passed in?
This commit is contained in:
parent
b72401b47b
commit
b81f98b8a6
@ -347,15 +347,17 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
attachment_texts = self.process_attachment(page["id"])
|
attachment_texts = self.process_attachment(page["id"])
|
||||||
else:
|
else:
|
||||||
attachment_texts = []
|
attachment_texts = []
|
||||||
text = BeautifulSoup(
|
text = BeautifulSoup(page["body"]["storage"]["value"], "lxml").get_text(
|
||||||
page["body"]["storage"]["value"], "lxml"
|
" ", strip=True
|
||||||
).get_text() + "".join(attachment_texts)
|
) + "".join(attachment_texts)
|
||||||
if include_comments:
|
if include_comments:
|
||||||
comments = self.confluence.get_page_comments(
|
comments = self.confluence.get_page_comments(
|
||||||
page["id"], expand="body.view.value", depth="all"
|
page["id"], expand="body.view.value", depth="all"
|
||||||
)["results"]
|
)["results"]
|
||||||
comment_texts = [
|
comment_texts = [
|
||||||
BeautifulSoup(comment["body"]["view"]["value"], "lxml").get_text()
|
BeautifulSoup(comment["body"]["view"]["value"], "lxml").get_text(
|
||||||
|
" ", strip=True
|
||||||
|
)
|
||||||
for comment in comments
|
for comment in comments
|
||||||
]
|
]
|
||||||
text = text + "".join(comment_texts)
|
text = text + "".join(comment_texts)
|
||||||
|
Loading…
Reference in New Issue
Block a user