mirror of
				https://github.com/hwchase17/langchain.git
				synced 2025-10-25 04:30:13 +00:00 
			
		
		
		
	Addded missed docstrings. Fixed inconsistency in docstrings. **Note** CC @efriis There were PR errors on `langchain_experimental/prompt_injection_identifier/hugging_face_identifier.py` But, I didn't touch this file in this PR! Can it be some cache problems? I fixed this error.
		
			
				
	
	
		
			51 lines
		
	
	
		
			1.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			51 lines
		
	
	
		
			1.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """Load Documents from Docusarus Documentation"""
 | |
| from typing import Any, List, Optional
 | |
| 
 | |
| from langchain_community.document_loaders.sitemap import SitemapLoader
 | |
| 
 | |
| 
 | |
| class DocusaurusLoader(SitemapLoader):
 | |
|     """Load from Docusaurus Documentation.
 | |
| 
 | |
|     It leverages the SitemapLoader to loop through the generated pages of a
 | |
|     Docusaurus Documentation website and extracts the content by looking for specific
 | |
|     HTML tags. By default, the parser searches for the main content of the Docusaurus
 | |
|     page, which is normally the <article>. You can also define your own
 | |
|     custom HTML tags by providing them as a list, for example: ["div", ".main", "a"].
 | |
|     """
 | |
| 
 | |
|     def __init__(
 | |
|         self,
 | |
|         url: str,
 | |
|         custom_html_tags: Optional[List[str]] = None,
 | |
|         **kwargs: Any,
 | |
|     ):
 | |
|         """Initialize DocusaurusLoader
 | |
| 
 | |
|         Args:
 | |
|             url: The base URL of the Docusaurus website.
 | |
|             custom_html_tags: Optional custom html tags to extract content from pages.
 | |
|             kwargs: Additional args to extend the underlying SitemapLoader, for example:
 | |
|                 filter_urls, blocksize, meta_function, is_local, continue_on_failure
 | |
|         """
 | |
|         if not kwargs.get("is_local"):
 | |
|             url = f"{url}/sitemap.xml"
 | |
| 
 | |
|         self.custom_html_tags = custom_html_tags or ["main article"]
 | |
| 
 | |
|         super().__init__(
 | |
|             url,
 | |
|             parsing_function=kwargs.get("parsing_function") or self._parsing_function,
 | |
|             **kwargs,
 | |
|         )
 | |
| 
 | |
|     def _parsing_function(self, content: Any) -> str:
 | |
|         """Parses specific elements from a Docusaurus page."""
 | |
|         relevant_elements = content.select(",".join(self.custom_html_tags))
 | |
| 
 | |
|         for element in relevant_elements:
 | |
|             if element not in relevant_elements:
 | |
|                 element.decompose()
 | |
| 
 | |
|         return str(content.get_text())
 |