mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-13 21:47:12 +00:00
Merge pull request #12433
* feat: Add Google Cloud Translation document transformer * Merge branch 'langchain-ai:master' into google-translate * Add documentation for Google Translate Document Transformer * Fix line length error * Merge branch 'master' into google-translate * Merge branch 'google-translate' of https://github.com/holtskinner/lan… * Addressed code review comments * Merge branch 'master' into google-translate * Merge branch 'google-translate' of https://github.com/holtskinner/lan… * Removed extra variable * Merge branch 'google-translate' of https://github.com/holtskinner/lan… * Merge branch 'master' into google-translate * Merge branch 'google-translate' of https://github.com/holtskinner/lan… * Removed extra import
This commit is contained in:
@@ -28,6 +28,7 @@ from langchain.document_transformers.embeddings_redundant_filter import (
|
||||
EmbeddingsRedundantFilter,
|
||||
get_stateful_documents,
|
||||
)
|
||||
from langchain.document_transformers.google_translate import GoogleTranslateTransformer
|
||||
from langchain.document_transformers.html2text import Html2TextTransformer
|
||||
from langchain.document_transformers.long_context_reorder import LongContextReorder
|
||||
from langchain.document_transformers.nuclia_text_transform import NucliaTextTransformer
|
||||
@@ -40,6 +41,7 @@ __all__ = [
|
||||
"DoctranPropertyExtractor",
|
||||
"EmbeddingsClusteringFilter",
|
||||
"EmbeddingsRedundantFilter",
|
||||
"GoogleTranslateTransformer",
|
||||
"get_stateful_documents",
|
||||
"LongContextReorder",
|
||||
"NucliaTextTransformer",
|
||||
|
@@ -0,0 +1,106 @@
|
||||
from typing import Any, Optional, Sequence
|
||||
|
||||
from langchain.schema import BaseDocumentTransformer, Document
|
||||
from langchain.utilities.vertexai import get_client_info
|
||||
|
||||
|
||||
class GoogleTranslateTransformer(BaseDocumentTransformer):
    """Translate text documents using Google Cloud Translation.

    The Google client libraries are imported lazily, so this module can be
    imported without ``google-cloud-translate`` installed; an ``ImportError``
    with installation guidance is raised only when the class is actually used.
    """

    # Shared by both lazy-import sites so the guidance cannot drift apart.
    _IMPORT_ERROR_MESSAGE = (
        "Install Google Cloud Translate to use this transformer. "
        "(pip install google-cloud-translate)"
    )

    def __init__(
        self,
        project_id: str,
        *,
        location: str = "global",
        model_id: Optional[str] = None,
        glossary_id: Optional[str] = None,
        api_endpoint: Optional[str] = None,
    ) -> None:
        """Create the translation client and precompute resource paths.

        Arguments:
            project_id: Google Cloud Project ID.
            location: (Optional) Translate model location.
                Defaults to ``"global"``.
            model_id: (Optional) Translate model ID to use.
            glossary_id: (Optional) Translate glossary ID to use.
            api_endpoint: (Optional) Regional endpoint to use.

        Raises:
            ImportError: If the Google Cloud Translate client library
                cannot be imported.
        """
        try:
            from google.api_core.client_options import ClientOptions
            from google.cloud import translate
        except ImportError as exc:
            raise ImportError(self._IMPORT_ERROR_MESSAGE) from exc

        self.project_id = project_id
        self.location = location
        self.model_id = model_id
        self.glossary_id = glossary_id

        # Only build explicit client options when a custom endpoint was given;
        # otherwise pass None and let the client use its defaults.
        self._client = translate.TranslationServiceClient(
            client_info=get_client_info("translate"),
            client_options=(
                ClientOptions(api_endpoint=api_endpoint) if api_endpoint else None
            ),
        )
        self._parent_path = self._client.common_location_path(project_id, location)
        # For some reason, there's no `model_path()` method for the client.
        self._model_path = (
            f"{self._parent_path}/models/{model_id}" if model_id else None
        )
        self._glossary_path = (
            self._client.glossary_path(project_id, location, glossary_id)
            if glossary_id
            else None
        )

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Translate text documents using Google Translate.

        All documents are sent in a single request. Each returned document
        keeps the metadata of its input document, with ``model`` and
        ``detected_language_code`` added from the translation result (these
        two keys replace same-named keys in the original metadata).

        Arguments:
            documents: Documents whose ``page_content`` should be translated.
                An empty sequence returns an empty list without a request
                being sent.
            source_language_code: ISO 639 language code of the input document.
            target_language_code: ISO 639 language code of the output document.
                For supported languages, refer to:
                https://cloud.google.com/translate/docs/languages
            mime_type: (Optional) Media Type of input text.
                Options: `text/plain`, `text/html`.
                Defaults to `text/plain`.

        Returns:
            A list with one translated ``Document`` per input document, in the
            same order as ``documents``.

        Raises:
            ImportError: If the Google Cloud Translate client library
                cannot be imported.
        """
        # Nothing to translate: skip the network round-trip entirely.
        if not documents:
            return []

        try:
            from google.cloud import translate
        except ImportError as exc:
            raise ImportError(self._IMPORT_ERROR_MESSAGE) from exc

        response = self._client.translate_text(
            request=translate.TranslateTextRequest(
                contents=[doc.page_content for doc in documents],
                parent=self._parent_path,
                model=self._model_path,
                glossary_config=translate.TranslateTextGlossaryConfig(
                    glossary=self._glossary_path
                ),
                source_language_code=kwargs.get("source_language_code", None),
                target_language_code=kwargs.get("target_language_code"),
                mime_type=kwargs.get("mime_type", "text/plain"),
            )
        )

        # If using a glossary, the translations will be in `glossary_translations`.
        translations = response.glossary_translations or response.translations

        return [
            Document(
                page_content=translation.translated_text,
                metadata={
                    **doc.metadata,
                    "model": translation.model,
                    "detected_language_code": translation.detected_language_code,
                },
            )
            for doc, translation in zip(documents, translations)
        ]
|
Reference in New Issue
Block a user