mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-01 11:02:37 +00:00
Add MyScaleWithoutJSON, which lets users wrap table columns into a Document's metadata (#13164)
<!-- Thank you for contributing to LangChain! Replace this entire comment with: - **Description:** a description of the change, - **Issue:** the issue # it fixes (if applicable), - **Dependencies:** any dependencies required for this change, - **Tag maintainer:** for a quicker response, tag the relevant maintainer (see below), - **Twitter handle:** we announce bigger features on Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out! Please make sure your PR is passing linting and testing before submitting. Run `make format`, `make lint` and `make test` to check this locally. See contribution guidelines for more information on how to write/run tests, lint, etc: https://github.com/langchain-ai/langchain/blob/master/.github/CONTRIBUTING.md If you're adding a new integration, please include: 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/extras` directory. If no one reviews your PR within a few days, please @-mention one of @baskaryan, @eyurtsev, @hwchase17. --> Replace this entire comment with: - **Description:** Add MyScaleWithoutJSON which allows user to wrap columns into Document's Metadata - **Tag maintainer:** @baskaryan
This commit is contained in:
@@ -490,3 +490,125 @@ class MyScale(VectorStore):
|
|||||||
@property
|
@property
|
||||||
def metadata_column(self) -> str:
|
def metadata_column(self) -> str:
|
||||||
return self.config.column_map["metadata"]
|
return self.config.column_map["metadata"]
|
||||||
|
|
||||||
|
|
||||||
|
class MyScaleWithoutJSON(MyScale):
    """MyScale vector store without a metadata column.

    Instead of reading metadata from a single JSON column, the columns named
    in ``must_have_cols`` are selected by every search query and placed into
    each returned Document's ``metadata`` dict. This is handy when working
    against a SQL-native table.
    """

    def __init__(
        self,
        embedding: Embeddings,
        config: Optional[MyScaleSettings] = None,
        must_have_cols: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> None:
        """Build a MyScale vector store without a metadata column.

        Args:
            embedding (Embeddings): embedding model
            config (MyScaleSettings): Configuration to MyScale Client
            must_have_cols (Optional[List[str]]): column names to be included
                in every query and copied into the Document metadata.
                Defaults to None, which means no extra columns.

        Other keyword arguments will pass into
        [clickhouse-connect](https://docs.myscale.com/)
        """
        super().__init__(embedding, config, **kwargs)
        # Use a fresh list per instance: a mutable default argument would be
        # shared by every instance constructed without `must_have_cols`.
        self.must_have_cols: List[str] = (
            list(must_have_cols) if must_have_cols is not None else []
        )

    def _build_qstr(
        self, q_emb: List[float], topk: int, where_str: Optional[str] = None
    ) -> str:
        """Build the SQL text of a vector search query.

        Args:
            q_emb (List[float]): query embedding
            topk (int): value used for the LIMIT clause
            where_str (Optional[str], optional): filter condition, emitted as
                a PREWHERE clause when non-empty. Defaults to None.

        Returns:
            str: the query string
        """
        q_emb_str = ",".join(map(str, q_emb))
        prewhere_clause = f"PREWHERE {where_str}" if where_str else ""
        # Join all selected columns in one go so that an empty
        # `must_have_cols` does not leave a dangling comma before FROM.
        select_cols = ", ".join(
            [self.config.column_map["text"], "dist", *self.must_have_cols]
        )

        q_str = f"""
            SELECT {select_cols}
            FROM {self.config.database}.{self.config.table}
            {prewhere_clause}
            ORDER BY distance({self.config.column_map['vector']}, [{q_emb_str}])
                AS dist {self.dist_order}
            LIMIT {topk}
            """
        return q_str

    def _row_to_document(self, row: Any) -> Document:
        """Convert one named result row into a Document."""
        return Document(
            page_content=row[self.config.column_map["text"]],
            metadata={col: row[col] for col in self.must_have_cols},
        )

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        where_str: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Perform a similarity search with MyScale by vectors

        Args:
            embedding (List[float]): query embedding
            k (int, optional): Top K neighbors to retrieve. Defaults to 4.
            where_str (Optional[str], optional): where condition string.
                                                 Defaults to None.

            NOTE: Please do not let end-user to fill this and always be aware
                  of SQL injection. This store has no metadata column, so
                  refer to table columns by their plain names.

        Returns:
            List[Document]: List of documents; an empty list is returned
                            (and the error is logged) if the query fails.
        """
        q_str = self._build_qstr(embedding, k, where_str)
        try:
            return [
                self._row_to_document(r)
                for r in self.client.query(q_str).named_results()
            ]
        except Exception as e:
            # Best-effort search: log the failure and return no results.
            logger.error(f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m")
            return []

    def similarity_search_with_relevance_scores(
        self, query: str, k: int = 4, where_str: Optional[str] = None, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Perform a similarity search with MyScale

        Args:
            query (str): query string
            k (int, optional): Top K neighbors to retrieve. Defaults to 4.
            where_str (Optional[str], optional): where condition string.
                                                 Defaults to None.

            NOTE: Please do not let end-user to fill this and always be aware
                  of SQL injection. This store has no metadata column, so
                  refer to table columns by their plain names.

        Returns:
            List[Tuple[Document, float]]: List of documents most similar to
                the query text, each paired with the `dist` value computed by
                the query. An empty list is returned (and the error is
                logged) if the query fails.
        """
        q_str = self._build_qstr(self._embeddings.embed_query(query), k, where_str)
        try:
            return [
                (self._row_to_document(r), r["dist"])
                for r in self.client.query(q_str).named_results()
            ]
        except Exception as e:
            # Best-effort search: log the failure and return no results.
            logger.error(f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m")
            return []

    @property
    def metadata_column(self) -> str:
        """Always the empty string: this store has no metadata column."""
        return ""
|
||||||
|
Reference in New Issue
Block a user