mirror of
				https://github.com/hwchase17/langchain.git
				synced 2025-10-25 21:03:11 +00:00 
			
		
		
		
	<!-- Thank you for contributing to LangChain! Please title your PR "<package>: <description>", where <package> is whichever of langchain, community, core, experimental, etc. is being modified. Replace this entire comment with: - **Description:** a description of the change, - **Issue:** the issue # it fixes if applicable, - **Dependencies:** any dependencies required for this change, - **Twitter handle:** we announce bigger features on Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out! Please make sure your PR is passing linting and testing before submitting. Run `make format`, `make lint` and `make test` from the root of the package you've modified to check this locally. See contribution guidelines for more information on how to write/run tests, lint, etc: https://python.langchain.com/docs/contributing/ If you're adding a new integration, please include: 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. If no one reviews your PR within a few days, please @-mention one of @baskaryan, @eyurtsev, @hwchase17. -->
		
			
				
	
	
		
			46 lines
		
	
	
		
			1.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			46 lines
		
	
	
		
			1.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from typing import List
 | |
| 
 | |
| from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
 | |
| 
 | |
| 
 | |
class UnstructuredMarkdownLoader(UnstructuredFileLoader):
    """Load `Markdown` files using `Unstructured`.

    You can run the loader in one of two modes: "single" and "elements".
    If you use "single" mode, the document will be returned as a single
    langchain Document object. If you use "elements" mode, the unstructured
    library will split the document into elements such as Title and NarrativeText.
    You can pass in additional unstructured kwargs after mode to apply
    different unstructured settings.

    Examples
    --------
    .. code-block:: python

        from langchain_community.document_loaders import UnstructuredMarkdownLoader

        loader = UnstructuredMarkdownLoader(
            "example.md", mode="elements", strategy="fast",
        )
        docs = loader.load()

    References
    ----------
    https://unstructured-io.github.io/unstructured/core/partition.html#partition-md
    """

    def _get_elements(self) -> List:
        """Partition the markdown file at ``self.file_path`` into elements.

        Returns:
            The list produced by ``unstructured.partition.md.partition_md``,
            called with ``self.unstructured_kwargs`` as extra keyword arguments.

        Raises:
            ValueError: If the installed ``unstructured`` version is older than
                0.4.16, or if its version string has no leading numeric
                release segment to compare against.
        """
        # Imported lazily so that `unstructured` is only required when the
        # loader is actually used.
        import re

        from unstructured.__version__ import __version__ as __unstructured_version__
        from unstructured.partition.md import partition_md

        # NOTE(MthwRobinson) - enables the loader to work when you're using pre-release
        # versions of unstructured like 0.4.17-dev1
        # Only the leading dotted-integer release segment is compared, so
        # suffixes such as "-dev1", ".dev1" or "rc1" no longer break int().
        release_match = re.match(r"\d+(?:\.\d+)*", __unstructured_version__)
        if release_match is None:
            raise ValueError(
                "Could not parse unstructured version "
                f"{__unstructured_version__!r}."
            )
        unstructured_version = tuple(
            int(part) for part in release_match.group().split(".")
        )

        if unstructured_version < (0, 4, 16):
            raise ValueError(
                f"You are on unstructured version {__unstructured_version__}. "
                "Partitioning markdown files is only supported in unstructured>=0.4.16."
            )

        return partition_md(filename=self.file_path, **self.unstructured_kwargs)
 |