mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-31 02:11:09 +00:00
Add MyScaleWithoutJSON, which allows users to wrap columns into a Document's metadata (#13164)
<!-- Thank you for contributing to LangChain! Replace this entire comment with: - **Description:** a description of the change, - **Issue:** the issue # it fixes (if applicable), - **Dependencies:** any dependencies required for this change, - **Tag maintainer:** for a quicker response, tag the relevant maintainer (see below), - **Twitter handle:** we announce bigger features on Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out! Please make sure your PR is passing linting and testing before submitting. Run `make format`, `make lint` and `make test` to check this locally. See contribution guidelines for more information on how to write/run tests, lint, etc: https://github.com/langchain-ai/langchain/blob/master/.github/CONTRIBUTING.md If you're adding a new integration, please include: 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/extras` directory. If no one reviews your PR within a few days, please @-mention one of @baskaryan, @eyurtsev, @hwchase17. --> Replace this entire comment with: - **Description:** Add MyScaleWithoutJSON which allows user to wrap columns into Document's Metadata - **Tag maintainer:** @baskaryan
This commit is contained in:
@@ -490,3 +490,125 @@ class MyScale(VectorStore):
|
||||
@property
def metadata_column(self) -> str:
    """Return the column name stored under the ``"metadata"`` key of the column map."""
    column_map = self.config.column_map
    return column_map["metadata"]
|
||||
|
||||
|
||||
class MyScaleWithoutJSON(MyScale):
    """MyScale vector store that works without a metadata column.

    Instead of reading document metadata from a single JSON column, this
    store selects a fixed list of plain table columns (``must_have_cols``)
    and wraps their values into each returned ``Document``'s metadata.
    This is handy when you are working with a SQL-native table.
    """

    def __init__(
        self,
        embedding: Embeddings,
        config: Optional[MyScaleSettings] = None,
        must_have_cols: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> None:
        """Build a MyScale vector store without a metadata column.

        Args:
            embedding (Embeddings): embedding model.
            config (MyScaleSettings): configuration for the MyScale client.
            must_have_cols (List[str], optional): column names to include in
                every query; their values become the Document metadata.
                Defaults to no extra columns.

        Other keyword arguments are passed to the parent constructor
        (see [clickhouse-connect](https://docs.myscale.com/)).
        """
        super().__init__(embedding, config, **kwargs)
        # Copy into a fresh list so instances never share a default list and
        # later mutation of the caller's list cannot change the queried columns.
        self.must_have_cols: List[str] = (
            [] if must_have_cols is None else list(must_have_cols)
        )

    def _build_qstr(
        self, q_emb: List[float], topk: int, where_str: Optional[str] = None
    ) -> str:
        """Build the SQL text for a vector similarity query.

        Args:
            q_emb (List[float]): query embedding.
            topk (int): maximum number of rows to return.
            where_str (Optional[str], optional): raw filter condition, emitted
                as a ``PREWHERE`` clause when non-empty. Defaults to None.

        Returns:
            str: query selecting the text column, ``dist`` and every column in
            ``must_have_cols``, ordered by ``dist``.
        """
        q_emb_str = ",".join(map(str, q_emb))
        where_clause = f"PREWHERE {where_str}" if where_str else ""

        # Join all selected columns in one pass so that an empty
        # ``must_have_cols`` does not leave a dangling comma before FROM.
        select_cols = ", ".join(
            [self.config.column_map["text"], "dist", *self.must_have_cols]
        )

        q_str = f"""
            SELECT {select_cols}
            FROM {self.config.database}.{self.config.table}
            {where_clause}
            ORDER BY distance({self.config.column_map['vector']}, [{q_emb_str}])
                AS dist {self.dist_order}
            LIMIT {topk}
            """
        return q_str

    def _row_to_document(self, row: Any) -> Document:
        """Convert one named result row into a Document.

        The text column becomes ``page_content``; every column listed in
        ``must_have_cols`` becomes a metadata entry.
        """
        return Document(
            page_content=row[self.config.column_map["text"]],
            metadata={col: row[col] for col in self.must_have_cols},
        )

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        where_str: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Perform a similarity search with MyScale by vector.

        Args:
            embedding (List[float]): query embedding.
            k (int, optional): Top K neighbors to retrieve. Defaults to 4.
            where_str (Optional[str], optional): where condition string.
                Defaults to None.

            NOTE: Please do not let end-users fill this and always be aware
                  of SQL injection. This store has no metadata column, so
                  conditions must reference table columns by name directly.

        Returns:
            List[Document]: documents most similar to the query vector.
            An empty list is returned (and the error is logged) if the
            query fails.
        """
        q_str = self._build_qstr(embedding, k, where_str)
        try:
            return [
                self._row_to_document(row)
                for row in self.client.query(q_str).named_results()
            ]
        except Exception as e:
            # Deliberate best-effort: log the failure and return no results.
            logger.error(f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m")
            return []

    def similarity_search_with_relevance_scores(
        self, query: str, k: int = 4, where_str: Optional[str] = None, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Perform a similarity search with MyScale.

        Args:
            query (str): query string.
            k (int, optional): Top K neighbors to retrieve. Defaults to 4.
            where_str (Optional[str], optional): where condition string.
                Defaults to None.

            NOTE: Please do not let end-users fill this and always be aware
                  of SQL injection. This store has no metadata column, so
                  conditions must reference table columns by name directly.

        Returns:
            List[Tuple[Document, float]]: documents most similar to the query
            text, each paired with the ``dist`` value computed by the query.
            An empty list is returned (and the error is logged) if the
            query fails.
        """
        q_str = self._build_qstr(self._embeddings.embed_query(query), k, where_str)
        try:
            return [
                (self._row_to_document(row), row["dist"])
                for row in self.client.query(q_str).named_results()
            ]
        except Exception as e:
            # Deliberate best-effort: log the failure and return no results.
            logger.error(f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m")
            return []

    @property
    def metadata_column(self) -> str:
        """Return an empty name, since this store has no metadata column."""
        return ""
|
||||
|
Reference in New Issue
Block a user