community[major]: lint for usage of xml library (#22132)

* Lint for usage of standard xml library
* Add forced opt-in for quip client
* The actual security issue is with the underlying QuipClient, not the
LangChain integration (since the client is doing the parsing), but
enforcement is added at the LangChain level.
This commit is contained in:
Eugene Yurtsev 2024-05-24 11:23:53 -04:00 committed by GitHub
parent 5b5ea2af30
commit d3db83abe3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 29 additions and 5 deletions

View File

@ -1,10 +1,9 @@
import logging
import re
import xml.etree.cElementTree
import xml.sax.saxutils
import xml.etree.cElementTree # OK: user-must-opt-in
from io import BytesIO
from typing import List, Optional, Sequence
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import ElementTree # OK: user-must-opt-in
from langchain_core.documents import Document
@ -22,14 +21,20 @@ class QuipLoader(BaseLoader):
"""
def __init__(
self, api_url: str, access_token: str, request_timeout: Optional[int] = 60
self,
api_url: str,
access_token: str,
request_timeout: Optional[int] = 60,
*,
allow_dangerous_xml_parsing: bool = False,
):
"""
Args:
api_url: https://platform.quip.com
access_token: token of access quip API. Please refer:
https://quip.com/dev/automation/documentation/current#section/Authentication/Get-Access-to-Quip's-APIs
https://quip.com/dev/automation/documentation/current#section/Authentication/Get-Access-to-Quip's-APIs
request_timeout: timeout of request, default 60s.
allow_dangerous_xml_parsing: Allow dangerous XML parsing, defaults to False
"""
try:
from quip_api.quip import QuipClient
@ -42,6 +47,17 @@ class QuipLoader(BaseLoader):
access_token=access_token, base_url=api_url, request_timeout=request_timeout
)
if not allow_dangerous_xml_parsing:
raise ValueError(
"The quip client uses the built-in XML parser which may cause"
"security issues when parsing XML data in some cases. "
"Please see "
"https://docs.python.org/3/library/xml.html#xml-vulnerabilities "
"For more information, set `allow_dangerous_xml_parsing` as True "
"if you are sure that your distribution of the standard library "
"is not vulnerable to XML vulnerabilities."
)
def load(
self,
folder_ids: Optional[List[str]] = None,

View File

@ -8,6 +8,14 @@ errors=0
# make sure not importing from langchain or langchain_experimental
git --no-pager grep '^from langchain_experimental\.' . && errors=$((errors+1))
# make sure no one is importing from the built-in xml library
# instead defusedxml should be used to avoid getting CVEs.
# Whether the standard library actually poses a risk to users
# is very nuanced and depends on the user's environment.
# https://docs.python.org/3/library/xml.etree.elementtree.html
git --no-pager grep '^from xml\.' . | grep -vE "# OK: user-must-opt-in" && errors=$((errors+1))
git --no-pager grep '^import xml\.' . | grep -vE "# OK: user-must-opt-in" && errors=$((errors+1))
# Decide on an exit status based on the errors
if [ "$errors" -gt 0 ]; then
exit 1