diff --git a/libs/community/langchain_community/document_transformers/doctran_text_extract.py b/libs/community/langchain_community/document_transformers/doctran_text_extract.py
index eee109193ee..e942eafdde8 100644
--- a/libs/community/langchain_community/document_transformers/doctran_text_extract.py
+++ b/libs/community/langchain_community/document_transformers/doctran_text_extract.py
@@ -66,7 +66,48 @@ class DoctranPropertyExtractor(BaseDocumentTransformer):
     async def atransform_documents(
         self, documents: Sequence[Document], **kwargs: Any
     ) -> Sequence[Document]:
-        raise NotImplementedError
+        """Extract properties from each document's text using doctran.
+
+        For every document, the page content is parsed by doctran, the
+        configured properties are extracted, and the result is stored under
+        ``metadata["extracted_properties"]``. Documents are updated in place.
+
+        Args:
+            documents: Documents whose ``page_content`` is analysed.
+            **kwargs: Accepted for interface compatibility; not used here.
+
+        Returns:
+            The same ``documents`` sequence, with metadata updated.
+
+        Raises:
+            ImportError: If the ``doctran`` package is not installed.
+        """
+        try:
+            from doctran import Doctran, ExtractProperty
+        except ImportError as e:
+            raise ImportError(
+                "Install doctran to use this parser. (pip install doctran)"
+            ) from e
+
+        # Built outside the ``try`` so that only a failed import is reported
+        # as "doctran is not installed".
+        doctran = Doctran(
+            openai_api_key=self.openai_api_key, openai_model=self.openai_api_model
+        )
+        # ``prop`` rather than ``property`` to avoid shadowing the builtin.
+        properties = [ExtractProperty(**prop) for prop in self.properties]
+        # NOTE(review): the doctran pipeline below is called without ``await``
+        # and so appears to run synchronously inside this coroutine; confirm
+        # whether it should be offloaded to an executor.
+        for d in documents:
+            doctran_doc = (
+                doctran.parse(content=d.page_content)
+                .extract(properties=properties)
+                .execute()
+            )
+
+            d.metadata["extracted_properties"] = doctran_doc.extracted_properties
+        return documents
 
     def transform_documents(
         self, documents: Sequence[Document], **kwargs: Any