community[minor]: Adding asynchronous function implementation for Doctran (#15941)

## Description 
This update adds the missing implementation of
`atransform_documents`, the asynchronous counterpart of
`transform_documents`, to `DoctranPropertyExtractor`.

### Usage Example:
```py
# Instantiate DoctranPropertyExtractor with specified properties
property_extractor = DoctranPropertyExtractor(properties=properties)

# Asynchronously extract properties from a list of documents
extracted_document = await property_extractor.atransform_documents(
    documents, properties=properties
)

# Display metadata of the first extracted document
print(json.dumps(extracted_document[0].metadata, indent=2))

```

## Issue
- Pull request #14525 broke the usage example above.
Instead of removing the asynchronous implementation of a function,
a synchronous version should be implemented alongside it.
This commit is contained in:
Mohammed Naqi 2024-01-16 00:09:25 +05:30 committed by GitHub
parent fb7e66b809
commit 8799b028a6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -66,7 +66,27 @@ class DoctranPropertyExtractor(BaseDocumentTransformer):
async def atransform_documents(
    self, documents: Sequence[Document], **kwargs: Any
) -> Sequence[Document]:
    """Asynchronously extract properties from text documents using doctran.

    For each document, the page content is parsed by doctran, the
    properties configured on this instance (``self.properties``) are
    extracted, and the result is stored under the
    ``"extracted_properties"`` key of the document's metadata.

    Args:
        documents: Documents to process. Their ``metadata`` is updated
            in place.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        The same ``documents`` sequence that was passed in.

    Raises:
        ImportError: If the optional ``doctran`` package is not installed.
    """
    # Keep the try body to the optional import only, so an unrelated failure
    # while building the client is not misreported as a missing package.
    try:
        from doctran import Doctran, ExtractProperty
    except ImportError as err:
        raise ImportError(
            "Install doctran to use this parser. (pip install doctran)"
        ) from err

    import asyncio

    doctran = Doctran(
        openai_api_key=self.openai_api_key, openai_model=self.openai_api_model
    )
    # ``prop`` rather than ``property`` to avoid shadowing the builtin.
    properties = [ExtractProperty(**prop) for prop in self.properties]

    def _extract(text: str) -> Any:
        """Run the doctran parse -> extract -> execute chain for one text."""
        return (
            doctran.parse(content=text)
            .extract(properties=properties)
            .execute()
        )

    # The doctran chain is invoked without ``await`` (i.e. synchronously), so
    # run it in the default executor instead of on the event-loop thread.
    # Documents are still processed one at a time, in order.
    loop = asyncio.get_running_loop()
    for d in documents:
        doctran_doc = await loop.run_in_executor(None, _extract, d.page_content)
        d.metadata["extracted_properties"] = doctran_doc.extracted_properties
    return documents
def transform_documents( def transform_documents(
self, documents: Sequence[Document], **kwargs: Any self, documents: Sequence[Document], **kwargs: Any