mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-28 10:39:23 +00:00
Support add_embeddings for opensearch (#11050)
- **Description:** - Make running integration test for opensearch easy - Provide a way to use different text for embedding: refer to #11002 for more of the use case and design decision. - **Issue:** N/A - **Dependencies:** None other than the existing ones.
This commit is contained in:
parent
c586f6dc1b
commit
17fcbed92c
@ -347,33 +347,15 @@ class OpenSearchVectorSearch(VectorStore):
|
||||
def embeddings(self) -> Embeddings:
|
||||
return self.embedding_function
|
||||
|
||||
def add_texts(
|
||||
def __add(
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
embeddings: List[List[float]],
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
ids: Optional[List[str]] = None,
|
||||
bulk_size: int = 500,
|
||||
**kwargs: Any,
|
||||
) -> List[str]:
|
||||
"""Run more texts through the embeddings and add to the vectorstore.
|
||||
|
||||
Args:
|
||||
texts: Iterable of strings to add to the vectorstore.
|
||||
metadatas: Optional list of metadatas associated with the texts.
|
||||
ids: Optional list of ids to associate with the texts.
|
||||
bulk_size: Bulk API request count; Default: 500
|
||||
|
||||
Returns:
|
||||
List of ids from adding the texts into the vectorstore.
|
||||
|
||||
Optional Args:
|
||||
vector_field: Document field embeddings are stored in. Defaults to
|
||||
"vector_field".
|
||||
|
||||
text_field: Document field the text of the document is stored in. Defaults
|
||||
to "text".
|
||||
"""
|
||||
embeddings = self.embedding_function.embed_documents(list(texts))
|
||||
_validate_embeddings_and_bulk_size(len(embeddings), bulk_size)
|
||||
index_name = _get_kwargs_value(kwargs, "index_name", self.index_name)
|
||||
text_field = _get_kwargs_value(kwargs, "text_field", "text")
|
||||
@ -406,6 +388,79 @@ class OpenSearchVectorSearch(VectorStore):
|
||||
is_aoss=self.is_aoss,
|
||||
)
|
||||
|
||||
def add_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    ids: Optional[List[str]] = None,
    bulk_size: int = 500,
    **kwargs: Any,
) -> List[str]:
    """Run more texts through the embeddings and add to the vectorstore.

    Args:
        texts: Iterable of strings to add to the vectorstore.
        metadatas: Optional list of metadatas associated with the texts.
        ids: Optional list of ids to associate with the texts.
        bulk_size: Bulk API request count; Default: 500

    Returns:
        List of ids from adding the texts into the vectorstore.

    Optional Args:
        vector_field: Document field embeddings are stored in. Defaults to
        "vector_field".

        text_field: Document field the text of the document is stored in. Defaults
        to "text".
    """
    # Embed the texts here so that __add only deals with pre-built vectors.
    embeddings = self.embedding_function.embed_documents(list(texts))
    return self.__add(
        texts,
        embeddings,
        metadatas=metadatas,
        ids=ids,
        bulk_size=bulk_size,
        # Forward the caller's keyword arguments individually. Passing
        # `kwargs=kwargs` would nest the dict under the key "kwargs", so
        # options such as index_name / text_field / vector_field would be
        # silently ignored by __add's kwarg lookups.
        **kwargs,
    )
|
||||
|
||||
def add_embeddings(
    self,
    text_embeddings: Iterable[Tuple[str, List[float]]],
    metadatas: Optional[List[dict]] = None,
    ids: Optional[List[str]] = None,
    bulk_size: int = 500,
    **kwargs: Any,
) -> List[str]:
    """Add the given texts and embeddings to the vectorstore.

    Args:
        text_embeddings: Iterable pairs of string and embedding to
            add to the vectorstore.
        metadatas: Optional list of metadatas associated with the texts.
        ids: Optional list of ids to associate with the texts.
        bulk_size: Bulk API request count; Default: 500

    Returns:
        List of ids from adding the texts into the vectorstore.

    Optional Args:
        vector_field: Document field embeddings are stored in. Defaults to
        "vector_field".

        text_field: Document field the text of the document is stored in. Defaults
        to "text".
    """
    # Split the (text, embedding) pairs into two parallel sequences.
    texts, embeddings = zip(*text_embeddings)
    return self.__add(
        list(texts),
        list(embeddings),
        metadatas=metadatas,
        ids=ids,
        bulk_size=bulk_size,
        # Forward the caller's keyword arguments individually. Passing
        # `kwargs=kwargs` would nest the dict under the key "kwargs", so
        # options such as index_name / text_field / vector_field would be
        # silently ignored by __add's kwarg lookups.
        **kwargs,
    )
|
||||
|
||||
def similarity_search(
|
||||
self, query: str, k: int = 4, **kwargs: Any
|
||||
) -> List[Document]:
|
||||
|
@ -71,6 +71,10 @@ cd tests/integration_tests/vectorstores/docker-compose
|
||||
docker-compose -f elasticsearch.yml up
|
||||
```
|
||||
|
||||
For environments that require more involved preparation, look for `*.sh`. For instance,
|
||||
`opensearch.sh` builds a required docker image and then launch opensearch.
|
||||
|
||||
|
||||
### Prepare environment variables for local testing:
|
||||
|
||||
- copy `tests/.env.example` to `tests/.env`
|
||||
|
@ -0,0 +1,8 @@
|
||||
#!/bin/sh
# Build a security-disabled OpenSearch Dashboards image, then launch a local
# OpenSearch cluster (plus dashboards) for running integration tests.
#
# references:
# https://github.com/opensearch-project/documentation-website/blob/2.10/assets/examples/docker-compose.yml
# https://opensearch.org/docs/latest/security/configuration/disable/

# Abort if the expected directory is missing instead of building in the
# wrong location (shellcheck SC2164).
cd opensearch || exit 1
docker build --tag=opensearch-dashboards-no-security -f opensearch-dashboards-no-security.Dockerfile .
docker compose -f opensearch.yml up
|
@ -0,0 +1,3 @@
|
||||
# OpenSearch Dashboards image with the security plugin removed, so the UI can
# connect to a local OpenSearch cluster that runs with security disabled.
FROM opensearchproject/opensearch-dashboards:2.10.0
# Strip the security plugin from the stock dashboards distribution.
RUN /usr/share/opensearch-dashboards/bin/opensearch-dashboards-plugin remove securityDashboards
# Install the no-security dashboards config shipped alongside this Dockerfile.
COPY --chown=opensearch-dashboards:opensearch-dashboards opensearch_dashboards.yml /usr/share/opensearch-dashboards/config/
|
@ -0,0 +1,39 @@
|
||||
# Local OpenSearch cluster (single node, security disabled) plus an optional
# OpenSearch Dashboards UI, for running the vectorstore integration tests.
version: '3'
services:
  opensearch-node1: # This is also the hostname of the container within the Docker network (i.e. http://opensearch-node1/)
    image: opensearchproject/opensearch:2.10.0
    container_name: opensearch-node1
    environment:
      - node.name=opensearch-node1 # Name the node that will run in this container
      - plugins.security.disabled=true # security has been disabled, so no login or password is required.
      - discovery.type=single-node
      - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # Set min and max JVM heap sizes to at least 50% of system RAM
    volumes:
      - opensearch-data1:/usr/share/opensearch/data # Creates volume called opensearch-data1 and mounts it to the container
    ports:
      - 9200:9200 # REST API
      - 9600:9600 # Performance Analyzer
    networks:
      - opensearch-net # All of the containers will join the same Docker bridge network

  # opensearch-dashboards does not work if OpenSearch cluster is not secure.
  # To use dashboards, build the opensearch-dashboards-no-security image first
  # by running the sibling `opensearch.sh` script in this directory.
  opensearch-dashboards:
    image: opensearch-dashboards-no-security
    container_name: opensearch-dashboards
    ports:
      - 5601:5601 # Map host port 5601 to container port 5601
    expose:
      - "5601" # Expose port 5601 for web access to OpenSearch Dashboards
    environment:
      OPENSEARCH_HOSTS: '["http://opensearch-node1:9200"]' # Define the OpenSearch nodes that OpenSearch Dashboards will query
    networks:
      - opensearch-net

volumes:
  opensearch-data1:

networks:
  opensearch-net:
|
@ -0,0 +1,3 @@
|
||||
# Minimal OpenSearch Dashboards configuration for a local, security-disabled
# cluster; installed into the image by opensearch-dashboards-no-security.Dockerfile.
server.name: opensearch-dashboards
# Bind to all interfaces so the UI is reachable from the Docker host.
server.host: "0.0.0.0"
opensearch.hosts: http://localhost:9200
|
@ -8,7 +8,10 @@ from langchain.vectorstores.opensearch_vector_search import (
|
||||
SCRIPT_SCORING_SEARCH,
|
||||
OpenSearchVectorSearch,
|
||||
)
|
||||
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
||||
from tests.integration_tests.vectorstores.fake_embeddings import (
|
||||
ConsistentFakeEmbeddings,
|
||||
FakeEmbeddings,
|
||||
)
|
||||
|
||||
DEFAULT_OPENSEARCH_URL = "http://localhost:9200"
|
||||
texts = ["foo", "bar", "baz"]
|
||||
@ -87,6 +90,31 @@ def test_add_text() -> None:
|
||||
assert len(docids) == len(text_input)
|
||||
|
||||
|
||||
def test_add_embeddings() -> None:
    """
    Test add_embeddings, which accepts pre-built embeddings instead of
    using inference for the texts.
    This allows you to separate the embeddings text and the page_content
    for better proximity between user's question and embedded text.
    For example, your embedding text can be a question, whereas page_content
    is the answer.
    """
    fake_embeddings = ConsistentFakeEmbeddings()
    contents = ["foo1", "foo2", "foo3"]
    metadatas = [{"page": idx} for idx, _ in enumerate(contents)]

    # In a real use case, embedding_input could be a question for each text.
    embedding_input = ["foo2", "foo3", "foo1"]
    vectors = fake_embeddings.embed_documents(embedding_input)

    docsearch = OpenSearchVectorSearch.from_texts(
        ["filler"], fake_embeddings, opensearch_url=DEFAULT_OPENSEARCH_URL
    )
    docsearch.add_embeddings(list(zip(contents, vectors)), metadatas)

    # "foo1"'s vector matches the document whose page_content is "foo3",
    # because the embedding text and the stored text were deliberately offset.
    output = docsearch.similarity_search("foo1", k=1)
    assert output == [Document(page_content="foo3", metadata={"page": 2})]
|
||||
|
||||
|
||||
def test_opensearch_script_scoring() -> None:
|
||||
"""Test end to end indexing and search using Script Scoring Search."""
|
||||
pre_filter_val = {"bool": {"filter": {"term": {"text": "bar"}}}}
|
||||
|
Loading…
Reference in New Issue
Block a user