diff --git a/docs/docs/integrations/text_embedding/laser.ipynb b/docs/docs/integrations/text_embedding/laser.ipynb new file mode 100644 index 00000000000..4f9e9169490 --- /dev/null +++ b/docs/docs/integrations/text_embedding/laser.ipynb @@ -0,0 +1,149 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "900fbd04-f6aa-4813-868f-1c54e3265385", + "metadata": {}, + "source": [ + "# LASER Language-Agnostic SEntence Representations Embeddings by Meta AI\n", + "\n", + ">[LASER](https://github.com/facebookresearch/LASER/) is a Python library developed by the Meta AI Research team and used for creating multilingual sentence embeddings for over 147 languages as of 2/25/2024 \n", + ">- List of supported languages at https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2a773d8d", + "metadata": {}, + "source": [ + "## Dependencies\n", + "\n", + "To use LaserEmbed with LangChain, install the `laser_encoders` Python package." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91ea14ce-831d-409a-a88f-30353acdabd1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%pip install laser_encoders" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "426f1156", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3f5dc9d7-65e3-4b5b-9086-3327d016cfe0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain_community.embeddings.laser import LaserEmbeddings" + ] + }, + { + "cell_type": "markdown", + "id": "8c77b0bb-2613-4167-a204-14d424b59105", + "metadata": {}, + "source": [ + "## Instantiating Laser\n", + " \n", + "### Parameters\n", + "- `lang: Optional[str]`\n", + " >If empty will default\n", + " to using a multilingual LASER encoder model (called \"laser2\").\n", + " You can find the list of supported languages and lang_codes [here](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200)\n", + " and [here](https://github.com/facebookresearch/LASER/blob/main/laser_encoders/language_list.py)\n", + "." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fb585dd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Example instantiation\n", + "embeddings = LaserEmbeddings(lang=\"eng_Latn\")" + ] + }, + { + "cell_type": "markdown", + "id": "119fbaad-9442-4fff-8214-c5f597bc8e77", + "metadata": {}, + "source": [ + "## Usage\n", + "\n", + "### Generating document embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62920051-cbd2-460d-ba24-0424c1ed395d", + "metadata": {}, + "outputs": [], + "source": [ + "document_embeddings = embeddings.embed_documents(\n", + " [\"This is a sentence\", \"This is some other sentence\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "7fd10d96-baee-468f-a532-b70b16b78d1f", + "metadata": {}, + "source": [ + "### Generating query embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f793bb6-609a-4a4a-a5c7-8e8597228915", + "metadata": {}, + "outputs": [], + "source": [ + "query_embeddings = embeddings.embed_query(\"This is a query\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/community/langchain_community/embeddings/__init__.py b/libs/community/langchain_community/embeddings/__init__.py index 6f7c1d96e23..d228995afff 100644 --- a/libs/community/langchain_community/embeddings/__init__.py +++ b/libs/community/langchain_community/embeddings/__init__.py @@ -55,6 +55,7 @@ from langchain_community.embeddings.infinity_local import InfinityEmbeddingsLoca from langchain_community.embeddings.javelin_ai_gateway import 
class LaserEmbeddings(BaseModel, Embeddings):
    """LASER Language-Agnostic SEntence Representations.

    LASER is a Python library developed by the Meta AI Research team
    and used for creating multilingual sentence embeddings for over 147 languages
    as of 2/25/2024
    See more documentation at:
    * https://github.com/facebookresearch/LASER/
    * https://github.com/facebookresearch/LASER/tree/main/laser_encoders
    * https://arxiv.org/abs/2205.12654

    To use this class, you must install the `laser_encoders` Python package.

    `pip install laser_encoders`

    Example:
        .. code-block:: python

            from langchain_community.embeddings.laser import LaserEmbeddings

            embeddings = LaserEmbeddings(lang="eng_Latn")
            vectors = embeddings.embed_documents(["Hello", "World"])
            query_vector = embeddings.embed_query("Hello")
    """

    lang: Optional[str] = None
    """The language or language code you'd like to use
    If empty, this implementation will default
    to using a multilingual earlier LASER encoder model (called laser2)
    Find the list of supported languages at
    https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200
    """

    # Populated by `validate_environment`; not a user-settable field.
    _encoder_pipeline: Any  # : :meta private:

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that laser_encoders is installed and build the encoder.

        Args:
            values: The field values collected by pydantic for this model.

        Returns:
            The same mapping, with ``_encoder_pipeline`` set to a
            ``LaserEncoderPipeline`` for the requested language, or for the
            multilingual model when no language is given.

        Raises:
            ImportError: If the `laser_encoders` package cannot be imported.
        """
        # Guard only the import, so that an error raised while constructing
        # the pipeline is not misreported as a missing package.
        try:
            from laser_encoders import LaserEncoderPipeline
        except ImportError as e:
            raise ImportError(
                "Could not import 'laser_encoders' Python package. "
                "Please install it with `pip install laser_encoders`."
            ) from e

        lang = values.get("lang")
        if lang:
            encoder_pipeline = LaserEncoderPipeline(lang=lang)
        else:
            # No language given: fall back to the multilingual encoder.
            encoder_pipeline = LaserEncoderPipeline(laser=LASER_MULTILINGUAL_MODEL)
        values["_encoder_pipeline"] = encoder_pipeline
        return values

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for documents using LASER.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text. An empty input yields an
            empty list.
        """
        if not texts:
            # Nothing to encode; skip the encoder call, which is not
            # guaranteed to accept an empty batch.
            return []

        embeddings: np.ndarray = self._encoder_pipeline.encode_sentences(texts)
        return embeddings.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Generate single query text embeddings using LASER.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        # The encoder works on batches, so wrap the query in a one-item list
        # and take the single resulting row.
        query_embeddings: np.ndarray = self._encoder_pipeline.encode_sentences([text])
        return query_embeddings.tolist()[0]
+ User warning is returned by LASER library implementation + so will ignore in testing.""" + query = "hello world" + embedding = LaserEmbeddings(lang=lang) + output = embedding.embed_query(query) + assert len(output) == 1024 diff --git a/libs/community/tests/unit_tests/embeddings/test_imports.py b/libs/community/tests/unit_tests/embeddings/test_imports.py index 5b574ba6767..27d4bc301b6 100644 --- a/libs/community/tests/unit_tests/embeddings/test_imports.py +++ b/libs/community/tests/unit_tests/embeddings/test_imports.py @@ -15,6 +15,7 @@ EXPECTED_ALL = [ "InfinityEmbeddingsLocal", "GradientEmbeddings", "JinaEmbeddings", + "LaserEmbeddings", "LlamaCppEmbeddings", "LLMRailsEmbeddings", "HuggingFaceHubEmbeddings",