mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-22 23:00:00 +00:00
Strip sitemap entries (#2132)
Loading this sitemap (https://www.alzallies.com/sitemap.xml) didn't work for me. Stripping whitespace from each `loc` entry fixed it, and it seems like a good idea to do it in general. Integration tests pass.
This commit is contained in:
parent
27f80784d0
commit
4ab66c4f52
@@ -58,7 +58,7 @@ class SitemapLoader(WebBaseLoader):
|
|||||||
|
|
||||||
els = self.parse_sitemap(soup)
|
els = self.parse_sitemap(soup)
|
||||||
|
|
||||||
results = self.scrape_all([el["loc"] for el in els if "loc" in el])
|
results = self.scrape_all([el["loc"].strip() for el in els if "loc" in el])
|
||||||
|
|
||||||
return [
|
return [
|
||||||
Document(
|
Document(
|
||||||
|
Loading…
Reference in New Issue
Block a user