Mirror of https://github.com/hwchase17/langchain.git (synced 2025-04-29 04:16:02 +00:00)
ci: Add script to check for pickle usage in community (#22863)
Add a script that checks for pickle usage in the community package: every `pickle.load(` / `pickle.loads(` call must carry an explicit `# ignore[pickle]: explicit-opt-in` marker. The check is wired into the lint targets, and the existing call sites are annotated to note that they are only reachable when the caller opts in to dangerous deserialization.
commit 8f7cc73817 (parent 77209f315e)
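The convention this change enforces: a `pickle.load(` or `pickle.loads(` call in the community package must carry the `# ignore[pickle]: explicit-opt-in` marker and sit behind an explicit opt-in from the caller. A minimal sketch of a compliant call site, modeled on the comments added in the hunks below (the loader name and guard are hypothetical, not part of the change):

    import pickle

    def load_index(path: str, *, allow_dangerous_deserialization: bool = False):
        # Hypothetical loader: an unmarked pickle.load would be flagged by
        # scripts/check_pickle.sh, so the call is gated and marked.
        if not allow_dangerous_deserialization:
            raise ValueError(
                "Pass allow_dangerous_deserialization=True to load pickled data "
                "you trust; pickle can execute arbitrary code."
            )
        with open(path, "rb") as f:
            # This code path can only be triggered if the user
            # passed allow_dangerous_deserialization=True
            return pickle.load(f)  # ignore[pickle]: explicit-opt-in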
@@ -44,6 +44,7 @@ lint_tests: MYPY_CACHE=.mypy_cache_test
 lint lint_diff lint_package lint_tests:
 	./scripts/check_pydantic.sh .
 	./scripts/lint_imports.sh
+	./scripts/check_pickle.sh .
 	poetry run ruff .
 	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff
 	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I $(PYTHON_FILES)
@@ -242,7 +242,7 @@ def _load_pickled_fn_from_hex_string(
         raise ValueError(f"Please install cloudpickle>=2.0.0. Error: {e}")
 
     try:
-        return cloudpickle.loads(bytes.fromhex(data))
+        return cloudpickle.loads(bytes.fromhex(data))  # ignore[pickle]: explicit-opt-in
     except Exception as e:
         raise ValueError(
             f"Failed to load the pickled function from a hexadecimal string. Error: {e}"
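For context, the hex string consumed by `_load_pickled_fn_from_hex_string` is just a cloudpickle byte stream encoded with `bytes.hex()`. A small round-trip sketch, assuming cloudpickle>=2.0.0 is installed (the `dump_fn_to_hex_string` helper is illustrative only, not part of this change):

    import cloudpickle  # assumed available: cloudpickle>=2.0.0

    def dump_fn_to_hex_string(fn) -> str:
        # Serialize a callable and hex-encode it, mirroring what the loader decodes.
        return cloudpickle.dumps(fn).hex()

    data = dump_fn_to_hex_string(lambda x: x + 1)
    fn = cloudpickle.loads(bytes.fromhex(data))  # only for payloads you produced yourself
    assert fn(1) == 2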
@@ -36,7 +36,9 @@ def _send_pipeline_to_device(pipeline: Any, device: int) -> Any:
     """Send a pipeline to a device on the cluster."""
     if isinstance(pipeline, str):
         with open(pipeline, "rb") as f:
-            pipeline = pickle.load(f)
+            # This code path can only be triggered if the user
+            # passed allow_dangerous_deserialization=True
+            pipeline = pickle.load(f)  # ignore[pickle]: explicit-opt-in
 
     if importlib.util.find_spec("torch") is not None:
         import torch
@@ -152,6 +152,8 @@ class TFIDFRetriever(BaseRetriever):
 
         # Load docs and tfidf array as pickle.
         with open(path / f"{file_name}.pkl", "rb") as f:
-            docs, tfidf_array = pickle.load(f)
+            # This code path can only be triggered if the user
+            # passed allow_dangerous_deserialization=True
+            docs, tfidf_array = pickle.load(f)  # ignore[pickle]: explicit-opt-in
 
         return cls(vectorizer=vectorizer, docs=docs, tfidf_array=tfidf_array)
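From the caller's side, the opt-in referenced in those comments is a keyword flag on the loader. A hedged usage sketch, assuming the current `TFIDFRetriever.save_local`/`load_local` signatures in langchain_community (requires scikit-learn):

    from langchain_community.retrievers import TFIDFRetriever

    retriever = TFIDFRetriever.from_texts(["hello world", "pickle needs an explicit opt-in"])
    retriever.save_local("tfidf_index")

    # Loading unpickles the docs and the tf-idf array, so the caller must opt in;
    # the flag name is assumed to match the comment added in the hunk above.
    restored = TFIDFRetriever.load_local(
        "tfidf_index", allow_dangerous_deserialization=True
    )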
@@ -456,7 +456,14 @@ class Annoy(VectorStore):
         annoy = guard_import("annoy")
         # load docstore and index_to_docstore_id
         with open(path / "index.pkl", "rb") as file:
-            docstore, index_to_docstore_id, config_object = pickle.load(file)
+            # Code path can only be reached if allow_dangerous_deserialization is True
+            (
+                docstore,
+                index_to_docstore_id,
+                config_object,
+            ) = pickle.load(  # ignore[pickle]: explicit-opt-in
+                file
+            )
 
         f = int(config_object["ANNOY"]["f"])
         metric = config_object["ANNOY"]["metric"]
@@ -1093,7 +1093,13 @@ class FAISS(VectorStore):
 
         # load docstore and index_to_docstore_id
         with open(path / f"{index_name}.pkl", "rb") as f:
-            docstore, index_to_docstore_id = pickle.load(f)
+            (
+                docstore,
+                index_to_docstore_id,
+            ) = pickle.load(  # ignore[pickle]: explicit-opt-in
+                f
+            )
         return cls(embeddings, index, docstore, index_to_docstore_id, **kwargs)
 
     def serialize_to_bytes(self) -> bytes:
@@ -1123,7 +1129,13 @@ class FAISS(VectorStore):
                 "loading a file from an untrusted source (e.g., some random site on "
                 "the internet.)."
             )
-        index, docstore, index_to_docstore_id = pickle.loads(serialized)
+        (
+            index,
+            docstore,
+            index_to_docstore_id,
+        ) = pickle.loads(  # ignore[pickle]: explicit-opt-in
+            serialized
+        )
         return cls(embeddings, index, docstore, index_to_docstore_id, **kwargs)
 
     def _select_relevance_score_fn(self) -> Callable[[float], float]:
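The same opt-in applies when round-tripping a FAISS store through `serialize_to_bytes`. A sketch under the assumption that the faiss package is installed and that `FAISS.deserialize_from_bytes` accepts an `allow_dangerous_deserialization` keyword (treat the exact signature as an assumption):

    from langchain_community.embeddings import FakeEmbeddings
    from langchain_community.vectorstores import FAISS

    embeddings = FakeEmbeddings(size=8)
    store = FAISS.from_texts(["pickled payloads need an explicit opt-in"], embeddings)

    # serialize_to_bytes pickles the docstore and id mapping alongside the index,
    # so deserializing requires the explicit opt-in flag.
    blob = store.serialize_to_bytes()
    restored = FAISS.deserialize_from_bytes(
        serialized=blob,
        embeddings=embeddings,
        allow_dangerous_deserialization=True,
    )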
@@ -493,7 +493,13 @@ class ScaNN(VectorStore):
 
         # load docstore and index_to_docstore_id
         with open(path / "{index_name}.pkl".format(index_name=index_name), "rb") as f:
-            docstore, index_to_docstore_id = pickle.load(f)
+            (
+                docstore,
+                index_to_docstore_id,
+            ) = pickle.load(  # ignore[pickle]: explicit-opt-in
+                f
+            )
         return cls(embedding, index, docstore, index_to_docstore_id, **kwargs)
 
     def _select_relevance_score_fn(self) -> Callable[[float], float]:
@@ -188,7 +188,7 @@ class TileDB(VectorStore):
             pickled_metadata = doc.get("metadata")
             result_doc = Document(page_content=str(doc["text"][0]))
             if pickled_metadata is not None:
-                metadata = pickle.loads(
+                metadata = pickle.loads(  # ignore[pickle]: explicit-opt-in
                     np.array(pickled_metadata.tolist()).astype(np.uint8).tobytes()
                 )
                 result_doc.metadata = metadata
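The decode above reverses how TileDB stores document metadata: the pickled bytes live in a variable-length uint8 array. A self-contained round-trip sketch (the encoding side is inferred from the decode in the hunk and is an assumption):

    import pickle

    import numpy as np

    metadata = {"source": "example.txt", "page": 1}

    # Store: pickled metadata kept as a uint8 array (assumed encoding).
    stored = np.frombuffer(pickle.dumps(metadata), dtype=np.uint8)

    # Load: mirrors the hunk above; only safe for data this process wrote itself.
    decoded = pickle.loads(  # ignore[pickle]: explicit-opt-in
        np.array(stored.tolist()).astype(np.uint8).tobytes()
    )
    assert decoded == metadata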
libs/community/scripts/check_pickle.sh (new executable file, 27 lines)
@@ -0,0 +1,27 @@
+#!/bin/bash
+#
+# This checks for usage of pickle in the package.
+#
+# Usage: ./scripts/check_pickle.sh /path/to/repository
+#
+# Check if a path argument is provided
+if [ $# -ne 1 ]; then
+  echo "Usage: $0 /path/to/repository"
+  exit 1
+fi
+
+repository_path="$1"
+
+# Search for lines matching the pattern within the specified repository
+result=$(git -C "$repository_path" grep -E 'pickle.load\(|pickle.loads\(' | grep -v '# ignore\[pickle\]: explicit-opt-in')
+
+# Check if any matching lines were found
+if [ -n "$result" ]; then
+  echo "ERROR: The following lines need to be updated:"
+  echo "$result"
+  echo "Please avoid using pickle or cloudpickle."
+  echo "If you must, then add:"
+  echo "1. A security notice (scan the code for examples)"
+  echo "2. Code path should be opt-in."
+  exit 1
+fi
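For anyone who wants to prototype the same check outside the Makefile, a rough Python equivalent of check_pickle.sh (a sketch assuming git is on PATH; the shell script above remains the canonical lint step):

    import subprocess
    import sys

    def find_unmarked_pickle_calls(repo_path):
        # Same pattern as check_pickle.sh: grep for pickle.load(/pickle.loads(
        # and drop lines that carry the explicit opt-in marker.
        out = subprocess.run(
            ["git", "-C", repo_path, "grep", "-E", r"pickle.load\(|pickle.loads\("],
            capture_output=True,
            text=True,
        ).stdout
        return [
            line
            for line in out.splitlines()
            if "# ignore[pickle]: explicit-opt-in" not in line
        ]

    if __name__ == "__main__":
        offenders = find_unmarked_pickle_calls(sys.argv[1] if len(sys.argv) > 1 else ".")
        if offenders:
            print("ERROR: The following lines need to be updated:")
            print("\n".join(offenders))
            sys.exit(1)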