From 8799b028a6fa09dfefda92ea9a8bfc883c4e1aaf Mon Sep 17 00:00:00 2001 From: Mohammed Naqi <60170196+CsEnox@users.noreply.github.com> Date: Tue, 16 Jan 2024 00:09:25 +0530 Subject: [PATCH] community[minor]: Adding asynchronous function implementation for Doctran (#15941) ## Description In this update, I addressed the missing implementation of atransform_documents, which is the asynchronous counterpart of transform_documents in DoctranPropertyExtractor. ### Usage Example: ```py # Instantiate DoctranPropertyExtractor with specified properties property_extractor = DoctranPropertyExtractor(properties=properties) # Asynchronously extract properties from a list of documents extracted_document = await property_extractor.atransform_documents( documents, properties=properties ) # Display metadata of the first extracted document print(json.dumps(extracted_document[0].metadata, indent=2)) ``` ## Issue - Pull request #14525 broke the code shown above. Instead of removing an asynchronous implementation of a function, consider implementing a synchronous version alongside it. 
--- .../doctran_text_extract.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/libs/community/langchain_community/document_transformers/doctran_text_extract.py b/libs/community/langchain_community/document_transformers/doctran_text_extract.py index eee109193ee..e942eafdde8 100644 --- a/libs/community/langchain_community/document_transformers/doctran_text_extract.py +++ b/libs/community/langchain_community/document_transformers/doctran_text_extract.py @@ -66,7 +66,27 @@ class DoctranPropertyExtractor(BaseDocumentTransformer): async def atransform_documents( self, documents: Sequence[Document], **kwargs: Any ) -> Sequence[Document]: - raise NotImplementedError + """Extracts properties from text documents using doctran.""" + try: + from doctran import Doctran, ExtractProperty + + doctran = Doctran( + openai_api_key=self.openai_api_key, openai_model=self.openai_api_model + ) + except ImportError: + raise ImportError( + "Install doctran to use this parser. (pip install doctran)" + ) + properties = [ExtractProperty(**property) for property in self.properties] + for d in documents: + doctran_doc = ( + doctran.parse(content=d.page_content) + .extract(properties=properties) + .execute() + ) + + d.metadata["extracted_properties"] = doctran_doc.extracted_properties + return documents def transform_documents( self, documents: Sequence[Document], **kwargs: Any