mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-03 21:54:04 +00:00
Add custom vector fields and text fields for OpenSearch (#2652)
**Description:** Add support for custom vector field and text field names when indexing into and querying OpenSearch. **Issues:** https://github.com/hwchase17/langchain/issues/2500 — Signed-off-by: Naveen Tatikonda <navtat@amazon.com>
This commit is contained in:
parent
023de9a70b
commit
4364d3316e
@ -55,7 +55,7 @@
|
|||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\")\n",
|
"docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||||
"docs = docsearch.similarity_search(query)"
|
"docs = docsearch.similarity_search(query)"
|
||||||
@ -94,7 +94,7 @@
|
|||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", engine=\"faiss\", space_type=\"innerproduct\", ef_construction=256, m=48)\n",
|
"docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\", engine=\"faiss\", space_type=\"innerproduct\", ef_construction=256, m=48)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||||
"docs = docsearch.similarity_search(query)"
|
"docs = docsearch.similarity_search(query)"
|
||||||
@ -133,7 +133,7 @@
|
|||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n",
|
"docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||||
"docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", k=1, search_type=\"script_scoring\")"
|
"docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", k=1, search_type=\"script_scoring\")"
|
||||||
@ -172,10 +172,10 @@
|
|||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n",
|
"docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n",
|
||||||
"filter = {\"bool\": {\"filter\": {\"term\": {\"text\": \"smuggling\"}}}}\n",
|
"filter = {\"bool\": {\"filter\": {\"term\": {\"text\": \"smuggling\"}}}}\n",
|
||||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||||
"docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", search_type=\"painless_scripting\", space_type=\"cosinesimil\", pre_filter=filter)"
|
"docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", search_type=\"painless_scripting\", space_type=\"cosineSimilarity\", pre_filter=filter)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -65,6 +65,8 @@ def _bulk_ingest_embeddings(
|
|||||||
embeddings: List[List[float]],
|
embeddings: List[List[float]],
|
||||||
texts: Iterable[str],
|
texts: Iterable[str],
|
||||||
metadatas: Optional[List[dict]] = None,
|
metadatas: Optional[List[dict]] = None,
|
||||||
|
vector_field: str = "vector_field",
|
||||||
|
text_field: str = "text",
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
"""Bulk Ingest Embeddings into given index."""
|
"""Bulk Ingest Embeddings into given index."""
|
||||||
bulk = _import_bulk()
|
bulk = _import_bulk()
|
||||||
@ -76,8 +78,8 @@ def _bulk_ingest_embeddings(
|
|||||||
request = {
|
request = {
|
||||||
"_op_type": "index",
|
"_op_type": "index",
|
||||||
"_index": index_name,
|
"_index": index_name,
|
||||||
"vector_field": embeddings[i],
|
vector_field: embeddings[i],
|
||||||
"text": text,
|
text_field: text,
|
||||||
"metadata": metadata,
|
"metadata": metadata,
|
||||||
"_id": _id,
|
"_id": _id,
|
||||||
}
|
}
|
||||||
@ -88,12 +90,15 @@ def _bulk_ingest_embeddings(
|
|||||||
return ids
|
return ids
|
||||||
|
|
||||||
|
|
||||||
def _default_scripting_text_mapping(dim: int) -> Dict:
|
def _default_scripting_text_mapping(
|
||||||
|
dim: int,
|
||||||
|
vector_field: str = "vector_field",
|
||||||
|
) -> Dict:
|
||||||
"""For Painless Scripting or Script Scoring,the default mapping to create index."""
|
"""For Painless Scripting or Script Scoring,the default mapping to create index."""
|
||||||
return {
|
return {
|
||||||
"mappings": {
|
"mappings": {
|
||||||
"properties": {
|
"properties": {
|
||||||
"vector_field": {"type": "knn_vector", "dimension": dim},
|
vector_field: {"type": "knn_vector", "dimension": dim},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -106,13 +111,14 @@ def _default_text_mapping(
|
|||||||
ef_search: int = 512,
|
ef_search: int = 512,
|
||||||
ef_construction: int = 512,
|
ef_construction: int = 512,
|
||||||
m: int = 16,
|
m: int = 16,
|
||||||
|
vector_field: str = "vector_field",
|
||||||
) -> Dict:
|
) -> Dict:
|
||||||
"""For Approximate k-NN Search, this is the default mapping to create index."""
|
"""For Approximate k-NN Search, this is the default mapping to create index."""
|
||||||
return {
|
return {
|
||||||
"settings": {"index": {"knn": True, "knn.algo_param.ef_search": ef_search}},
|
"settings": {"index": {"knn": True, "knn.algo_param.ef_search": ef_search}},
|
||||||
"mappings": {
|
"mappings": {
|
||||||
"properties": {
|
"properties": {
|
||||||
"vector_field": {
|
vector_field: {
|
||||||
"type": "knn_vector",
|
"type": "knn_vector",
|
||||||
"dimension": dim,
|
"dimension": dim,
|
||||||
"method": {
|
"method": {
|
||||||
@ -165,10 +171,18 @@ def _default_script_query(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def __get_painless_scripting_source(space_type: str, query_vector: List[float]) -> str:
|
def __get_painless_scripting_source(
|
||||||
|
space_type: str, query_vector: List[float], vector_field: str = "vector_field"
|
||||||
|
) -> str:
|
||||||
"""For Painless Scripting, it returns the script source based on space type."""
|
"""For Painless Scripting, it returns the script source based on space type."""
|
||||||
source_value = (
|
source_value = (
|
||||||
"(1.0 + " + space_type + "(" + str(query_vector) + ", doc['vector_field']))"
|
"(1.0 + "
|
||||||
|
+ space_type
|
||||||
|
+ "("
|
||||||
|
+ str(query_vector)
|
||||||
|
+ ", doc['"
|
||||||
|
+ vector_field
|
||||||
|
+ "']))"
|
||||||
)
|
)
|
||||||
if space_type == "cosineSimilarity":
|
if space_type == "cosineSimilarity":
|
||||||
return source_value
|
return source_value
|
||||||
@ -250,13 +264,26 @@ class OpenSearchVectorSearch(VectorStore):
|
|||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of ids from adding the texts into the vectorstore.
|
List of ids from adding the texts into the vectorstore.
|
||||||
|
|
||||||
|
Optional Args:
|
||||||
|
vector_field: Document field embeddings are stored in. Defaults to
|
||||||
|
"vector_field".
|
||||||
|
|
||||||
|
text_field: Document field the text of the document is stored in. Defaults
|
||||||
|
to "text".
|
||||||
"""
|
"""
|
||||||
embeddings = [
|
embeddings = self.embedding_function.embed_documents(list(texts))
|
||||||
self.embedding_function.embed_documents([text])[0] for text in texts
|
|
||||||
]
|
|
||||||
_validate_embeddings_and_bulk_size(len(embeddings), bulk_size)
|
_validate_embeddings_and_bulk_size(len(embeddings), bulk_size)
|
||||||
|
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
|
||||||
|
text_field = _get_kwargs_value(kwargs, "text_field", "text")
|
||||||
return _bulk_ingest_embeddings(
|
return _bulk_ingest_embeddings(
|
||||||
self.client, self.index_name, embeddings, texts, metadatas
|
self.client,
|
||||||
|
self.index_name,
|
||||||
|
embeddings,
|
||||||
|
texts,
|
||||||
|
metadatas,
|
||||||
|
vector_field,
|
||||||
|
text_field,
|
||||||
)
|
)
|
||||||
|
|
||||||
def similarity_search(
|
def similarity_search(
|
||||||
@ -277,14 +304,17 @@ class OpenSearchVectorSearch(VectorStore):
|
|||||||
Optional Args:
|
Optional Args:
|
||||||
vector_field: Document field embeddings are stored in. Defaults to
|
vector_field: Document field embeddings are stored in. Defaults to
|
||||||
"vector_field".
|
"vector_field".
|
||||||
|
|
||||||
text_field: Document field the text of the document is stored in. Defaults
|
text_field: Document field the text of the document is stored in. Defaults
|
||||||
to "text".
|
to "text".
|
||||||
|
|
||||||
metadata_field: Document field that metadata is stored in. Defaults to
|
metadata_field: Document field that metadata is stored in. Defaults to
|
||||||
"metadata".
|
"metadata".
|
||||||
Can be set to a special value "*" to include the entire document.
|
Can be set to a special value "*" to include the entire document.
|
||||||
|
|
||||||
Optional Args for Approximate Search:
|
Optional Args for Approximate Search:
|
||||||
search_type: "approximate_search"; default: "approximate_search"
|
search_type: "approximate_search"; default: "approximate_search"
|
||||||
|
|
||||||
size: number of results the query actually returns; default: 4
|
size: number of results the query actually returns; default: 4
|
||||||
|
|
||||||
Optional Args for Script Scoring Search:
|
Optional Args for Script Scoring Search:
|
||||||
@ -298,6 +328,7 @@ class OpenSearchVectorSearch(VectorStore):
|
|||||||
|
|
||||||
Optional Args for Painless Scripting Search:
|
Optional Args for Painless Scripting Search:
|
||||||
search_type: "painless_scripting"; default: "approximate_search"
|
search_type: "painless_scripting"; default: "approximate_search"
|
||||||
|
|
||||||
space_type: "l2Squared", "l1Norm", "cosineSimilarity"; default: "l2Squared"
|
space_type: "l2Squared", "l1Norm", "cosineSimilarity"; default: "l2Squared"
|
||||||
|
|
||||||
pre_filter: script_score query to pre-filter documents before identifying
|
pre_filter: script_score query to pre-filter documents before identifying
|
||||||
@ -307,23 +338,21 @@ class OpenSearchVectorSearch(VectorStore):
|
|||||||
search_type = _get_kwargs_value(kwargs, "search_type", "approximate_search")
|
search_type = _get_kwargs_value(kwargs, "search_type", "approximate_search")
|
||||||
text_field = _get_kwargs_value(kwargs, "text_field", "text")
|
text_field = _get_kwargs_value(kwargs, "text_field", "text")
|
||||||
metadata_field = _get_kwargs_value(kwargs, "metadata_field", "metadata")
|
metadata_field = _get_kwargs_value(kwargs, "metadata_field", "metadata")
|
||||||
|
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
|
||||||
if search_type == "approximate_search":
|
if search_type == "approximate_search":
|
||||||
size = _get_kwargs_value(kwargs, "size", 4)
|
size = _get_kwargs_value(kwargs, "size", 4)
|
||||||
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
|
|
||||||
search_query = _default_approximate_search_query(
|
search_query = _default_approximate_search_query(
|
||||||
embedding, size, k, vector_field
|
embedding, size, k, vector_field
|
||||||
)
|
)
|
||||||
elif search_type == SCRIPT_SCORING_SEARCH:
|
elif search_type == SCRIPT_SCORING_SEARCH:
|
||||||
space_type = _get_kwargs_value(kwargs, "space_type", "l2")
|
space_type = _get_kwargs_value(kwargs, "space_type", "l2")
|
||||||
pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY)
|
pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY)
|
||||||
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
|
|
||||||
search_query = _default_script_query(
|
search_query = _default_script_query(
|
||||||
embedding, space_type, pre_filter, vector_field
|
embedding, space_type, pre_filter, vector_field
|
||||||
)
|
)
|
||||||
elif search_type == PAINLESS_SCRIPTING_SEARCH:
|
elif search_type == PAINLESS_SCRIPTING_SEARCH:
|
||||||
space_type = _get_kwargs_value(kwargs, "space_type", "l2Squared")
|
space_type = _get_kwargs_value(kwargs, "space_type", "l2Squared")
|
||||||
pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY)
|
pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY)
|
||||||
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
|
|
||||||
search_query = _default_painless_scripting_query(
|
search_query = _default_painless_scripting_query(
|
||||||
embedding, space_type, pre_filter, vector_field
|
embedding, space_type, pre_filter, vector_field
|
||||||
)
|
)
|
||||||
@ -370,6 +399,13 @@ class OpenSearchVectorSearch(VectorStore):
|
|||||||
and lucene engines recommended for large datasets. Also supports brute force
|
and lucene engines recommended for large datasets. Also supports brute force
|
||||||
search through Script Scoring and Painless Scripting.
|
search through Script Scoring and Painless Scripting.
|
||||||
|
|
||||||
|
Optional Args:
|
||||||
|
vector_field: Document field embeddings are stored in. Defaults to
|
||||||
|
"vector_field".
|
||||||
|
|
||||||
|
text_field: Document field the text of the document is stored in. Defaults
|
||||||
|
to "text".
|
||||||
|
|
||||||
Optional Keyword Args for Approximate Search:
|
Optional Keyword Args for Approximate Search:
|
||||||
engine: "nmslib", "faiss", "hnsw"; default: "nmslib"
|
engine: "nmslib", "faiss", "hnsw"; default: "nmslib"
|
||||||
|
|
||||||
@ -402,6 +438,8 @@ class OpenSearchVectorSearch(VectorStore):
|
|||||||
kwargs, "index_name", "OPENSEARCH_INDEX_NAME", default=uuid.uuid4().hex
|
kwargs, "index_name", "OPENSEARCH_INDEX_NAME", default=uuid.uuid4().hex
|
||||||
)
|
)
|
||||||
is_appx_search = _get_kwargs_value(kwargs, "is_appx_search", True)
|
is_appx_search = _get_kwargs_value(kwargs, "is_appx_search", True)
|
||||||
|
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
|
||||||
|
text_field = _get_kwargs_value(kwargs, "text_field", "text")
|
||||||
if is_appx_search:
|
if is_appx_search:
|
||||||
engine = _get_kwargs_value(kwargs, "engine", "nmslib")
|
engine = _get_kwargs_value(kwargs, "engine", "nmslib")
|
||||||
space_type = _get_kwargs_value(kwargs, "space_type", "l2")
|
space_type = _get_kwargs_value(kwargs, "space_type", "l2")
|
||||||
@ -410,11 +448,13 @@ class OpenSearchVectorSearch(VectorStore):
|
|||||||
m = _get_kwargs_value(kwargs, "m", 16)
|
m = _get_kwargs_value(kwargs, "m", 16)
|
||||||
|
|
||||||
mapping = _default_text_mapping(
|
mapping = _default_text_mapping(
|
||||||
dim, engine, space_type, ef_search, ef_construction, m
|
dim, engine, space_type, ef_search, ef_construction, m, vector_field
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
mapping = _default_scripting_text_mapping(dim)
|
mapping = _default_scripting_text_mapping(dim)
|
||||||
|
|
||||||
client.indices.create(index=index_name, body=mapping)
|
client.indices.create(index=index_name, body=mapping)
|
||||||
_bulk_ingest_embeddings(client, index_name, embeddings, texts, metadatas)
|
_bulk_ingest_embeddings(
|
||||||
|
client, index_name, embeddings, texts, metadatas, vector_field, text_field
|
||||||
|
)
|
||||||
return cls(opensearch_url, index_name, embedding)
|
return cls(opensearch_url, index_name, embedding)
|
||||||
|
@ -23,6 +23,30 @@ def test_opensearch() -> None:
|
|||||||
assert output == [Document(page_content="foo")]
|
assert output == [Document(page_content="foo")]
|
||||||
|
|
||||||
|
|
||||||
|
def test_opensearch_with_custom_field_name() -> None:
|
||||||
|
"""Test indexing and search using custom vector field and text field name."""
|
||||||
|
docsearch = OpenSearchVectorSearch.from_texts(
|
||||||
|
texts,
|
||||||
|
FakeEmbeddings(),
|
||||||
|
opensearch_url=DEFAULT_OPENSEARCH_URL,
|
||||||
|
vector_field="my_vector",
|
||||||
|
text_field="custom_text",
|
||||||
|
)
|
||||||
|
output = docsearch.similarity_search(
|
||||||
|
"foo", k=1, vector_field="my_vector", text_field="custom_text"
|
||||||
|
)
|
||||||
|
assert output == [Document(page_content="foo")]
|
||||||
|
|
||||||
|
text_input = ["test", "add", "text", "method"]
|
||||||
|
OpenSearchVectorSearch.add_texts(
|
||||||
|
docsearch, text_input, vector_field="my_vector", text_field="custom_text"
|
||||||
|
)
|
||||||
|
output = docsearch.similarity_search(
|
||||||
|
"add", k=1, vector_field="my_vector", text_field="custom_text"
|
||||||
|
)
|
||||||
|
assert output == [Document(page_content="foo")]
|
||||||
|
|
||||||
|
|
||||||
def test_opensearch_with_metadatas() -> None:
|
def test_opensearch_with_metadatas() -> None:
|
||||||
"""Test end to end indexing and search with metadata."""
|
"""Test end to end indexing and search with metadata."""
|
||||||
metadatas = [{"page": i} for i in range(len(texts))]
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
Loading…
Reference in New Issue
Block a user