mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-28 10:39:23 +00:00
Add custom vector fields and text fields for OpenSearch (#2652)
**Description**
Add support for custom vector field and text field names when indexing into and querying OpenSearch.

**Issues**
https://github.com/hwchase17/langchain/issues/2500

Signed-off-by: Naveen Tatikonda <navtat@amazon.com>
This commit is contained in:
parent
023de9a70b
commit
4364d3316e
@ -55,7 +55,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\")\n",
|
||||
"docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\")\n",
|
||||
"\n",
|
||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||
"docs = docsearch.similarity_search(query)"
|
||||
@ -94,7 +94,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", engine=\"faiss\", space_type=\"innerproduct\", ef_construction=256, m=48)\n",
|
||||
"docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\", engine=\"faiss\", space_type=\"innerproduct\", ef_construction=256, m=48)\n",
|
||||
"\n",
|
||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||
"docs = docsearch.similarity_search(query)"
|
||||
@ -133,7 +133,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n",
|
||||
"docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n",
|
||||
"\n",
|
||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||
"docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", k=1, search_type=\"script_scoring\")"
|
||||
@ -172,10 +172,10 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n",
|
||||
"docsearch = OpenSearchVectorSearch.from_documents(docs, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n",
|
||||
"filter = {\"bool\": {\"filter\": {\"term\": {\"text\": \"smuggling\"}}}}\n",
|
||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||
"docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", search_type=\"painless_scripting\", space_type=\"cosinesimil\", pre_filter=filter)"
|
||||
"docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", search_type=\"painless_scripting\", space_type=\"cosineSimilarity\", pre_filter=filter)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -238,4 +238,4 @@
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
}
|
@ -65,6 +65,8 @@ def _bulk_ingest_embeddings(
|
||||
embeddings: List[List[float]],
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
vector_field: str = "vector_field",
|
||||
text_field: str = "text",
|
||||
) -> List[str]:
|
||||
"""Bulk Ingest Embeddings into given index."""
|
||||
bulk = _import_bulk()
|
||||
@ -76,8 +78,8 @@ def _bulk_ingest_embeddings(
|
||||
request = {
|
||||
"_op_type": "index",
|
||||
"_index": index_name,
|
||||
"vector_field": embeddings[i],
|
||||
"text": text,
|
||||
vector_field: embeddings[i],
|
||||
text_field: text,
|
||||
"metadata": metadata,
|
||||
"_id": _id,
|
||||
}
|
||||
@ -88,12 +90,15 @@ def _bulk_ingest_embeddings(
|
||||
return ids
|
||||
|
||||
|
||||
def _default_scripting_text_mapping(dim: int) -> Dict:
|
||||
def _default_scripting_text_mapping(
|
||||
dim: int,
|
||||
vector_field: str = "vector_field",
|
||||
) -> Dict:
|
||||
"""For Painless Scripting or Script Scoring,the default mapping to create index."""
|
||||
return {
|
||||
"mappings": {
|
||||
"properties": {
|
||||
"vector_field": {"type": "knn_vector", "dimension": dim},
|
||||
vector_field: {"type": "knn_vector", "dimension": dim},
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -106,13 +111,14 @@ def _default_text_mapping(
|
||||
ef_search: int = 512,
|
||||
ef_construction: int = 512,
|
||||
m: int = 16,
|
||||
vector_field: str = "vector_field",
|
||||
) -> Dict:
|
||||
"""For Approximate k-NN Search, this is the default mapping to create index."""
|
||||
return {
|
||||
"settings": {"index": {"knn": True, "knn.algo_param.ef_search": ef_search}},
|
||||
"mappings": {
|
||||
"properties": {
|
||||
"vector_field": {
|
||||
vector_field: {
|
||||
"type": "knn_vector",
|
||||
"dimension": dim,
|
||||
"method": {
|
||||
@ -165,10 +171,18 @@ def _default_script_query(
|
||||
}
|
||||
|
||||
|
||||
def __get_painless_scripting_source(space_type: str, query_vector: List[float]) -> str:
|
||||
def __get_painless_scripting_source(
|
||||
space_type: str, query_vector: List[float], vector_field: str = "vector_field"
|
||||
) -> str:
|
||||
"""For Painless Scripting, it returns the script source based on space type."""
|
||||
source_value = (
|
||||
"(1.0 + " + space_type + "(" + str(query_vector) + ", doc['vector_field']))"
|
||||
"(1.0 + "
|
||||
+ space_type
|
||||
+ "("
|
||||
+ str(query_vector)
|
||||
+ ", doc['"
|
||||
+ vector_field
|
||||
+ "']))"
|
||||
)
|
||||
if space_type == "cosineSimilarity":
|
||||
return source_value
|
||||
@ -250,13 +264,26 @@ class OpenSearchVectorSearch(VectorStore):
|
||||
|
||||
Returns:
|
||||
List of ids from adding the texts into the vectorstore.
|
||||
|
||||
Optional Args:
|
||||
vector_field: Document field embeddings are stored in. Defaults to
|
||||
"vector_field".
|
||||
|
||||
text_field: Document field the text of the document is stored in. Defaults
|
||||
to "text".
|
||||
"""
|
||||
embeddings = [
|
||||
self.embedding_function.embed_documents([text])[0] for text in texts
|
||||
]
|
||||
embeddings = self.embedding_function.embed_documents(list(texts))
|
||||
_validate_embeddings_and_bulk_size(len(embeddings), bulk_size)
|
||||
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
|
||||
text_field = _get_kwargs_value(kwargs, "text_field", "text")
|
||||
return _bulk_ingest_embeddings(
|
||||
self.client, self.index_name, embeddings, texts, metadatas
|
||||
self.client,
|
||||
self.index_name,
|
||||
embeddings,
|
||||
texts,
|
||||
metadatas,
|
||||
vector_field,
|
||||
text_field,
|
||||
)
|
||||
|
||||
def similarity_search(
|
||||
@ -277,14 +304,17 @@ class OpenSearchVectorSearch(VectorStore):
|
||||
Optional Args:
|
||||
vector_field: Document field embeddings are stored in. Defaults to
|
||||
"vector_field".
|
||||
|
||||
text_field: Document field the text of the document is stored in. Defaults
|
||||
to "text".
|
||||
|
||||
metadata_field: Document field that metadata is stored in. Defaults to
|
||||
"metadata".
|
||||
Can be set to a special value "*" to include the entire document.
|
||||
|
||||
Optional Args for Approximate Search:
|
||||
search_type: "approximate_search"; default: "approximate_search"
|
||||
|
||||
size: number of results the query actually returns; default: 4
|
||||
|
||||
Optional Args for Script Scoring Search:
|
||||
@ -298,6 +328,7 @@ class OpenSearchVectorSearch(VectorStore):
|
||||
|
||||
Optional Args for Painless Scripting Search:
|
||||
search_type: "painless_scripting"; default: "approximate_search"
|
||||
|
||||
space_type: "l2Squared", "l1Norm", "cosineSimilarity"; default: "l2Squared"
|
||||
|
||||
pre_filter: script_score query to pre-filter documents before identifying
|
||||
@ -307,23 +338,21 @@ class OpenSearchVectorSearch(VectorStore):
|
||||
search_type = _get_kwargs_value(kwargs, "search_type", "approximate_search")
|
||||
text_field = _get_kwargs_value(kwargs, "text_field", "text")
|
||||
metadata_field = _get_kwargs_value(kwargs, "metadata_field", "metadata")
|
||||
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
|
||||
if search_type == "approximate_search":
|
||||
size = _get_kwargs_value(kwargs, "size", 4)
|
||||
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
|
||||
search_query = _default_approximate_search_query(
|
||||
embedding, size, k, vector_field
|
||||
)
|
||||
elif search_type == SCRIPT_SCORING_SEARCH:
|
||||
space_type = _get_kwargs_value(kwargs, "space_type", "l2")
|
||||
pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY)
|
||||
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
|
||||
search_query = _default_script_query(
|
||||
embedding, space_type, pre_filter, vector_field
|
||||
)
|
||||
elif search_type == PAINLESS_SCRIPTING_SEARCH:
|
||||
space_type = _get_kwargs_value(kwargs, "space_type", "l2Squared")
|
||||
pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY)
|
||||
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
|
||||
search_query = _default_painless_scripting_query(
|
||||
embedding, space_type, pre_filter, vector_field
|
||||
)
|
||||
@ -370,6 +399,13 @@ class OpenSearchVectorSearch(VectorStore):
|
||||
and lucene engines recommended for large datasets. Also supports brute force
|
||||
search through Script Scoring and Painless Scripting.
|
||||
|
||||
Optional Args:
|
||||
vector_field: Document field embeddings are stored in. Defaults to
|
||||
"vector_field".
|
||||
|
||||
text_field: Document field the text of the document is stored in. Defaults
|
||||
to "text".
|
||||
|
||||
Optional Keyword Args for Approximate Search:
|
||||
engine: "nmslib", "faiss", "hnsw"; default: "nmslib"
|
||||
|
||||
@ -402,6 +438,8 @@ class OpenSearchVectorSearch(VectorStore):
|
||||
kwargs, "index_name", "OPENSEARCH_INDEX_NAME", default=uuid.uuid4().hex
|
||||
)
|
||||
is_appx_search = _get_kwargs_value(kwargs, "is_appx_search", True)
|
||||
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
|
||||
text_field = _get_kwargs_value(kwargs, "text_field", "text")
|
||||
if is_appx_search:
|
||||
engine = _get_kwargs_value(kwargs, "engine", "nmslib")
|
||||
space_type = _get_kwargs_value(kwargs, "space_type", "l2")
|
||||
@ -410,11 +448,13 @@ class OpenSearchVectorSearch(VectorStore):
|
||||
m = _get_kwargs_value(kwargs, "m", 16)
|
||||
|
||||
mapping = _default_text_mapping(
|
||||
dim, engine, space_type, ef_search, ef_construction, m
|
||||
dim, engine, space_type, ef_search, ef_construction, m, vector_field
|
||||
)
|
||||
else:
|
||||
mapping = _default_scripting_text_mapping(dim)
|
||||
|
||||
client.indices.create(index=index_name, body=mapping)
|
||||
_bulk_ingest_embeddings(client, index_name, embeddings, texts, metadatas)
|
||||
_bulk_ingest_embeddings(
|
||||
client, index_name, embeddings, texts, metadatas, vector_field, text_field
|
||||
)
|
||||
return cls(opensearch_url, index_name, embedding)
|
||||
|
@ -23,6 +23,30 @@ def test_opensearch() -> None:
|
||||
assert output == [Document(page_content="foo")]
|
||||
|
||||
|
||||
def test_opensearch_with_custom_field_name() -> None:
|
||||
"""Test indexing and search using custom vector field and text field name."""
|
||||
docsearch = OpenSearchVectorSearch.from_texts(
|
||||
texts,
|
||||
FakeEmbeddings(),
|
||||
opensearch_url=DEFAULT_OPENSEARCH_URL,
|
||||
vector_field="my_vector",
|
||||
text_field="custom_text",
|
||||
)
|
||||
output = docsearch.similarity_search(
|
||||
"foo", k=1, vector_field="my_vector", text_field="custom_text"
|
||||
)
|
||||
assert output == [Document(page_content="foo")]
|
||||
|
||||
text_input = ["test", "add", "text", "method"]
|
||||
OpenSearchVectorSearch.add_texts(
|
||||
docsearch, text_input, vector_field="my_vector", text_field="custom_text"
|
||||
)
|
||||
output = docsearch.similarity_search(
|
||||
"add", k=1, vector_field="my_vector", text_field="custom_text"
|
||||
)
|
||||
assert output == [Document(page_content="foo")]
|
||||
|
||||
|
||||
def test_opensearch_with_metadatas() -> None:
|
||||
"""Test end to end indexing and search with metadata."""
|
||||
metadatas = [{"page": i} for i in range(len(texts))]
|
||||
|
Loading…
Reference in New Issue
Block a user