diff --git a/libs/community/langchain_community/document_loaders/quip.py b/libs/community/langchain_community/document_loaders/quip.py
index 0d9c4474fb0..540ef8f945c 100644
--- a/libs/community/langchain_community/document_loaders/quip.py
+++ b/libs/community/langchain_community/document_loaders/quip.py
@@ -1,10 +1,9 @@
 import logging
 import re
-import xml.etree.cElementTree
-import xml.sax.saxutils
+import xml.etree.cElementTree  # OK: user-must-opt-in
 from io import BytesIO
 from typing import List, Optional, Sequence
-from xml.etree.ElementTree import ElementTree
+from xml.etree.ElementTree import ElementTree  # OK: user-must-opt-in
 
 from langchain_core.documents import Document
 
@@ -22,14 +21,21 @@ class QuipLoader(BaseLoader):
     """
 
     def __init__(
-        self, api_url: str, access_token: str, request_timeout: Optional[int] = 60
+        self,
+        api_url: str,
+        access_token: str,
+        request_timeout: Optional[int] = 60,
+        *,
+        allow_dangerous_xml_parsing: bool = False,
     ):
         """
         Args:
             api_url: https://platform.quip.com
             access_token: token of access quip API. Please refer:
-            https://quip.com/dev/automation/documentation/current#section/Authentication/Get-Access-to-Quip's-APIs
+                https://quip.com/dev/automation/documentation/current#section/Authentication/Get-Access-to-Quip's-APIs
             request_timeout: timeout of request, default 60s.
+            allow_dangerous_xml_parsing: Opt in to parsing with the built-in XML
+                parser. Defaults to False, in which case a ValueError is raised.
         """
         try:
             from quip_api.quip import QuipClient
@@ -42,6 +48,17 @@ class QuipLoader(BaseLoader):
             access_token=access_token, base_url=api_url, request_timeout=request_timeout
         )
 
+        if not allow_dangerous_xml_parsing:
+            raise ValueError(
+                "The quip client uses the built-in XML parser which may cause "
+                "security issues when parsing XML data in some cases. "
+                "Please see "
+                "https://docs.python.org/3/library/xml.html#xml-vulnerabilities "
+                "For more information, set `allow_dangerous_xml_parsing` as True "
+                "if you are sure that your distribution of the standard library "
+                "is not vulnerable to XML vulnerabilities."
+            )
+
     def load(
         self,
         folder_ids: Optional[List[str]] = None,
diff --git a/libs/community/scripts/lint_imports.sh b/libs/community/scripts/lint_imports.sh
index 89af89514f5..97d9c96b031 100755
--- a/libs/community/scripts/lint_imports.sh
+++ b/libs/community/scripts/lint_imports.sh
@@ -8,6 +8,14 @@ errors=0
 # make sure not importing from langchain or langchain_experimental
 git --no-pager grep '^from langchain_experimental\.' . && errors=$((errors+1))
 
+# make sure no one is importing from the built-in xml library
+# instead defusedxml should be used to avoid getting CVEs.
+# Whether the standard library actually poses a risk to users
+# is very nuanced and depends on user's environment.
+# https://docs.python.org/3/library/xml.etree.elementtree.html
+git --no-pager grep '^from xml\.' . | grep -vE "# OK: user-must-opt-in" && errors=$((errors+1))
+git --no-pager grep '^import xml\.' . | grep -vE "# OK: user-must-opt-in" && errors=$((errors+1))
+
 # Decide on an exit status based on the errors
 if [ "$errors" -gt 0 ]; then
     exit 1