From 213927e1a7778f302c23783b4f54491ac39f34e8 Mon Sep 17 00:00:00 2001
From: lian <imwhatiam123@gmail.com>
Date: Mon, 24 Mar 2025 14:26:33 +0800
Subject: [PATCH] Fix polynomial regular expression used on uncontrolled data
 (#7656)

* Fix polynomial regular expression used on uncontrolled data

* Remove the office converter code
---
 seahub/api2/urls.py       |  10 ---
 seahub/api2/views.py      |  94 +--------------------------
 seahub/api2/views_misc.py |   5 +-
 seahub/urls.py            |  12 ----
 seahub/utils/__init__.py  |  74 ----------------------
 seahub/views/file.py      | 129 ++------------------------------------
 6 files changed, 9 insertions(+), 315 deletions(-)

diff --git a/seahub/api2/urls.py b/seahub/api2/urls.py
index 79a364bff1..51c118490d 100644
--- a/seahub/api2/urls.py
+++ b/seahub/api2/urls.py
@@ -102,16 +102,6 @@ urlpatterns = [
     re_path(r'^repos/(?P<repo_id>[-0-9-a-f]{36})/fileops/move/$', OpMoveView.as_view(), name="api2-fileops-move"),
 ]
 
-# serve office converter static files
-from seahub.utils import HAS_OFFICE_CONVERTER
-if HAS_OFFICE_CONVERTER:
-    urlpatterns += [
-        path('office-convert/status/', OfficeConvertQueryStatus.as_view()),
-    ]
-    urlpatterns += [
-        re_path(r'^office-convert/generate/repos/(?P<repo_id>[-0-9-a-f]{36})/$', OfficeGenerateView.as_view()),
-    ]
-
 from seahub.settings import CLIENT_SSO_VIA_LOCAL_BROWSER
 if CLIENT_SSO_VIA_LOCAL_BROWSER:
     urlpatterns += [
diff --git a/seahub/api2/views.py b/seahub/api2/views.py
index d4f7777caf..3ea439dccd 100644
--- a/seahub/api2/views.py
+++ b/seahub/api2/views.py
@@ -98,15 +98,13 @@ from seahub.views.file import get_file_view_path_and_perm, send_file_access_msg,
 if HAS_FILE_SEARCH or HAS_FILE_SEASEARCH:
     from seahub.search.utils import search_files, get_search_repos_map, SEARCH_FILEEXT, ai_search_files, \
         RELATED_REPOS_PREFIX, SEARCH_REPOS_LIMIT, RELATED_REPOS_CACHE_TIMEOUT, format_repos
-from seahub.utils import HAS_OFFICE_CONVERTER, transfer_repo
-if HAS_OFFICE_CONVERTER:
-    from seahub.utils import query_office_convert_status, prepare_converted_html
+from seahub.utils import transfer_repo
 import seahub.settings as settings
 from seahub.settings import THUMBNAIL_EXTENSION, THUMBNAIL_ROOT, \
     FILE_LOCK_EXPIRATION_DAYS, ENABLE_STORAGE_CLASSES, \
     STORAGE_CLASS_MAPPING_POLICY, \
     ENABLE_RESET_ENCRYPTED_REPO_PASSWORD, SHARE_LINK_EXPIRE_DAYS_MAX, \
-        SHARE_LINK_EXPIRE_DAYS_MIN, SHARE_LINK_EXPIRE_DAYS_DEFAULT
+    SHARE_LINK_EXPIRE_DAYS_MIN, SHARE_LINK_EXPIRE_DAYS_DEFAULT
 from seahub.subscription.utils import subscription_check
 from seahub.organizations.models import OrgAdminSettings, DISABLE_ORG_ENCRYPTED_LIBRARY
 from seahub.seadoc.utils import get_seadoc_file_uuid, gen_seadoc_image_parent_path, get_seadoc_asset_upload_link
@@ -5099,94 +5097,6 @@ class RepoHistoryChange(APIView):
                             content_type=json_content_type)
 
 
-# based on views/file.py::office_convert_query_status
-class OfficeConvertQueryStatus(APIView):
-    authentication_classes = (TokenAuthentication, )
-    permission_classes = (IsAuthenticated, )
-    throttle_classes = (UserRateThrottle, )
-
-    def get(self, request, format=None):
-        if not HAS_OFFICE_CONVERTER:
-            return api_error(status.HTTP_404_NOT_FOUND, 'Office converter not enabled.')
-
-        content_type = 'application/json; charset=utf-8'
-
-        ret = {'success': False}
-
-        file_id = request.GET.get('file_id', '')
-        if len(file_id) != 40:
-            ret['error'] = 'invalid param'
-        else:
-            try:
-                d = query_office_convert_status(file_id)
-                if d.error:
-                    ret['error'] = d.error
-                else:
-                    ret['success'] = True
-                    ret['status'] = d.status
-            except Exception as e:
-                logging.exception('failed to call query_office_convert_status')
-                ret['error'] = str(e)
-
-        return HttpResponse(json.dumps(ret), content_type=content_type)
-
-# based on views/file.py::view_file and views/file.py::handle_document
-class OfficeGenerateView(APIView):
-    authentication_classes = (TokenAuthentication, )
-    permission_classes = (IsAuthenticated, )
-    throttle_classes = (UserRateThrottle, )
-
-    def get(self, request, repo_id, format=None):
-        username = request.user.username
-        # check arguments
-        repo = get_repo(repo_id)
-        if not repo:
-            return api_error(status.HTTP_404_NOT_FOUND, 'Library not found.')
-
-
-        path = request.GET.get('p', '/').rstrip('/')
-        commit_id = request.GET.get('commit_id', None)
-
-        if commit_id:
-            try:
-                obj_id = seafserv_threaded_rpc.get_file_id_by_commit_and_path(
-                    repo.id, commit_id, path)
-            except:
-                return api_error(status.HTTP_404_NOT_FOUND, 'Revision not found.')
-        else:
-            try:
-                obj_id = seafile_api.get_file_id_by_path(repo_id, path)
-            except:
-                return api_error(status.HTTP_404_NOT_FOUND, 'File not found.')
-
-        if not obj_id:
-            return api_error(status.HTTP_404_NOT_FOUND, 'File not found.')
-
-        # Check whether user has permission to view file and get file raw path,
-        # render error page if permission deny.
-        raw_path, inner_path, user_perm = get_file_view_path_and_perm(request,
-                                                                      repo_id,
-                                                                      obj_id, path)
-
-        if not user_perm:
-            return api_error(status.HTTP_403_FORBIDDEN, 'You do not have permission to view this file.')
-
-        u_filename = os.path.basename(path)
-        filetype, fileext = get_file_type_and_ext(u_filename)
-        if filetype != DOCUMENT:
-            return api_error(status.HTTP_400_BAD_REQUEST, 'File is not a convertable document')
-
-        ret_dict = {}
-        if HAS_OFFICE_CONVERTER:
-            err = prepare_converted_html(raw_path, obj_id, fileext, ret_dict)
-            # populate return value dict
-            ret_dict['err'] = err
-            ret_dict['obj_id'] = obj_id
-        else:
-            ret_dict['filetype'] = 'Unknown'
-
-        return HttpResponse(json.dumps(ret_dict), status=200, content_type=json_content_type)
-
 class ThumbnailView(APIView):
     authentication_classes = (TokenAuthentication, SessionAuthentication)
     permission_classes = (IsAuthenticated,)
diff --git a/seahub/api2/views_misc.py b/seahub/api2/views_misc.py
index 7647691101..962ed256e7 100644
--- a/seahub/api2/views_misc.py
+++ b/seahub/api2/views_misc.py
@@ -2,7 +2,7 @@
 from seahub.api2.base import APIView
 from seahub.api2.utils import json_response
 from seahub import settings
-from seahub.utils import HAS_OFFICE_CONVERTER, HAS_FILE_SEARCH, is_pro_version, HAS_FILE_SEASEARCH
+from seahub.utils import HAS_FILE_SEARCH, is_pro_version, HAS_FILE_SEASEARCH
 
 from constance import config
 
@@ -32,9 +32,6 @@ class ServerInfoView(APIView):
         if is_pro_version():
             features.append('seafile-pro')
 
-        if HAS_OFFICE_CONVERTER:
-            features.append('office-preview')
-
         if HAS_FILE_SEARCH or HAS_FILE_SEASEARCH:
             features.append('file-search')
 
diff --git a/seahub/urls.py b/seahub/urls.py
index ae0913253c..60a88d41ef 100644
--- a/seahub/urls.py
+++ b/seahub/urls.py
@@ -972,18 +972,6 @@ if getattr(settings, 'ENABLE_LOGIN_SIMPLE_CHECK', False):
         re_path(r'^sso-auto-login/', login_simple_check),
     ]
 
-# serve office converter static files
-from seahub.utils import HAS_OFFICE_CONVERTER
-if HAS_OFFICE_CONVERTER:
-    from seahub.views.file import (
-        office_convert_query_status, office_convert_get_page
-    )
-    urlpatterns += [
-        re_path(r'^office-convert/static/(?P<repo_id>[-0-9a-f]{36})/(?P<commit_id>[0-9a-f]{40})/(?P<path>.+)/(?P<filename>[^/].+)$',
-            office_convert_get_page, name='office_convert_get_page'),
-        path('office-convert/status/', office_convert_query_status, name='office_convert_query_status'),
-    ]
-
 if getattr(settings, 'ENABLE_MULTI_ADFS', False):
     from seahub.adfs_auth.views import *
     urlpatterns += [
diff --git a/seahub/utils/__init__.py b/seahub/utils/__init__.py
index b0250d89c6..a3c2ee050c 100644
--- a/seahub/utils/__init__.py
+++ b/seahub/utils/__init__.py
@@ -120,11 +120,6 @@ def is_db_sqlite3():
 
 IS_DB_SQLITE3 = is_db_sqlite3()
 
-try:
-    from seahub.settings import OFFICE_CONVERTOR_ROOT
-except ImportError:
-    OFFICE_CONVERTOR_ROOT = ''
-
 from seahub.utils.file_types import *
 from seahub.utils.htmldiff import HtmlDiff # used in views/files.py
 
@@ -1154,78 +1149,9 @@ if EVENTS_CONFIG_FILE:
 
     FILE_AUDIT_ENABLED = check_file_audit_enabled()
 
-# office convert related
-def check_office_converter_enabled():
-    if OFFICE_CONVERTOR_ROOT:
-        return True
-    return False
-
-HAS_OFFICE_CONVERTER = check_office_converter_enabled()
 OFFICE_PREVIEW_MAX_SIZE = 2 * 1024 * 1024
 OFFICE_PREVIEW_MAX_PAGES = 50
 
-if HAS_OFFICE_CONVERTER:
-
-    import time
-    import requests
-    import jwt
-
-    def add_office_convert_task(file_id, doctype, raw_path):
-        payload = {'exp': int(time.time()) + 300, }
-        token = jwt.encode(payload, seahub.settings.SECRET_KEY, algorithm='HS256')
-        headers = {"Authorization": "Token %s" % token}
-        params = {'file_id': file_id, 'doctype': doctype, 'raw_path': raw_path}
-        url = urljoin(OFFICE_CONVERTOR_ROOT, '/add-task')
-        requests.get(url, params, headers=headers)
-        return {'exists': False}
-
-    def query_office_convert_status(file_id, doctype):
-        payload = {'exp': int(time.time()) + 300, }
-        token = jwt.encode(payload, seahub.settings.SECRET_KEY, algorithm='HS256')
-        headers = {"Authorization": "Token %s" % token}
-        params = {'file_id': file_id, 'doctype': doctype}
-        url = urljoin(OFFICE_CONVERTOR_ROOT, '/query-status')
-        d = requests.get(url, params, headers=headers)
-        d = d.json()
-        ret = {}
-        if 'error' in d:
-            ret['error'] = d['error']
-            ret['status'] = 'ERROR'
-        else:
-            ret['success'] = True
-            ret['status'] = d['status']
-        return ret
-
-    def get_office_converted_page(path, static_filename, file_id):
-        url = urljoin(OFFICE_CONVERTOR_ROOT, '/get-converted-page')
-        payload = {'exp': int(time.time()) + 300, }
-        token = jwt.encode(payload, seahub.settings.SECRET_KEY, algorithm='HS256')
-        headers = {"Authorization": "Token %s" % token}
-        params = {'static_filename': static_filename, 'file_id': file_id}
-        try:
-            ret = requests.get(url, params, headers=headers)
-        except urllib.error.HTTPError as e:
-            raise Exception(e)
-
-        content_type = ret.headers.get('content-type', None)
-        if content_type is None:
-            dummy, ext = os.path.splitext(os.path.basename(path))
-            content_type = mimetypes.types_map.get(ext, 'application/octet-stream')
-
-        resp = HttpResponse(ret, content_type=content_type)
-        if 'last-modified' in ret.headers:
-            resp['Last-Modified'] = ret.headers.get('last-modified')
-
-        return resp
-
-    def prepare_converted_html(raw_path, obj_id, doctype, ret_dict):
-        try:
-            add_office_convert_task(obj_id, doctype, raw_path)
-        except Exception as e:
-            logging.exception('failed to add_office_convert_task: %s' % e)
-            return _('Internal Server Error')
-        return None
-
 # search realted
 HAS_FILE_SEARCH = False
 HAS_FILE_SEASEARCH = False
diff --git a/seahub/views/file.py b/seahub/views/file.py
index 24225b81a5..7d18555fa0 100644
--- a/seahub/views/file.py
+++ b/seahub/views/file.py
@@ -55,7 +55,7 @@ from seahub.utils import render_error, is_org_context, \
     get_file_type_and_ext, gen_file_get_url, \
     render_permission_error, is_pro_version, is_textual_file, \
     EMPTY_SHA1, HtmlDiff, gen_inner_file_get_url, \
-    get_conf_text_ext, HAS_OFFICE_CONVERTER, PREVIEW_FILEEXT, \
+    get_conf_text_ext, PREVIEW_FILEEXT, \
     normalize_file_path, get_service_url, OFFICE_PREVIEW_MAX_SIZE, \
     normalize_cache_key, gen_file_get_url_by_sharelink, gen_file_get_url_new
 from seahub.utils.ip import get_remote_ip
@@ -78,12 +78,6 @@ from seahub.thumbnail.utils import extract_xmind_image, get_thumbnail_src, \
 from seahub.seadoc.utils import get_seadoc_file_uuid, gen_seadoc_access_token, is_seadoc_revision
 from seahub.seadoc.models import SeadocRevision
 
-if HAS_OFFICE_CONVERTER:
-    from seahub.utils import (
-        query_office_convert_status, get_office_converted_page,
-        prepare_converted_html,
-    )
-
 import seahub.settings as settings
 from seahub.settings import FILE_ENCODING_LIST, FILE_PREVIEW_MAX_SIZE, \
     FILE_ENCODING_TRY_LIST, MEDIA_URL, ENABLE_WATERMARK, \
@@ -327,12 +321,7 @@ def handle_textual_file(request, filetype, raw_path, ret_dict):
     ret_dict['file_encoding_list'] = file_encoding_list
 
 def handle_document(raw_path, obj_id, fileext, ret_dict):
-    if HAS_OFFICE_CONVERTER:
-        err = prepare_converted_html(raw_path, obj_id, fileext, ret_dict)
-        # populate return value dict
-        ret_dict['err'] = err
-    else:
-        ret_dict['filetype'] = 'Unknown'
+    ret_dict['filetype'] = 'Unknown'
 
 def handle_spreadsheet(raw_path, obj_id, fileext, ret_dict):
     handle_document(raw_path, obj_id, fileext, ret_dict)
@@ -410,34 +399,22 @@ def can_preview_file(file_name, file_size, repo):
             error_msg = _('The library is encrypted, can not open file online.')
             return False, error_msg
 
-        if not HAS_OFFICE_CONVERTER and \
-                not ENABLE_OFFICE_WEB_APP and \
+        if not ENABLE_OFFICE_WEB_APP and \
                 not ENABLE_ONLYOFFICE:
             error_msg = "File preview unsupported"
             return False, error_msg
 
         # priority of view office file is:
-        # OOS > OnlyOffice > Seafile integrated
+        # OOS > OnlyOffice
         if ENABLE_OFFICE_WEB_APP:
             if fileext not in OFFICE_WEB_APP_FILE_EXTENSION:
                 error_msg = "File preview unsupported"
                 return False, error_msg
 
-        elif ENABLE_ONLYOFFICE:
+        else:
             if fileext not in ONLYOFFICE_FILE_EXTENSION:
                 error_msg = "File preview unsupported"
                 return False, error_msg
-
-        else:
-            if not HAS_OFFICE_CONVERTER:
-                error_msg = "File preview unsupported"
-                return False, error_msg
-
-            # HAS_OFFICE_CONVERTER
-            if file_size > OFFICE_PREVIEW_MAX_SIZE:
-                error_msg = _('File size surpasses %s, can not be opened online.') % \
-                        filesizeformat(OFFICE_PREVIEW_MAX_SIZE)
-                return False, error_msg
     else:
         # NOT depends on Seafile settings
         if filetype not in list(PREVIEW_FILEEXT.keys()):
@@ -999,23 +976,15 @@ def view_lib_file(request, repo_id, path):
             else:
                 return_dict['err'] = _('Error when prepare OnlyOffice file preview page.')
 
-        if not HAS_OFFICE_CONVERTER:
-            return_dict['err'] = "File preview unsupported"
-            return render(request, template, return_dict)
-
         if file_size > OFFICE_PREVIEW_MAX_SIZE:
             error_msg = _('File size surpasses %s, can not be opened online.') % \
                     filesizeformat(OFFICE_PREVIEW_MAX_SIZE)
             return_dict['err'] = error_msg
             return render(request, template, return_dict)
 
-        error_msg = prepare_converted_html(raw_path, file_id, fileext, return_dict)
-        if error_msg:
-            return_dict['err'] = error_msg
-            return render(request, template, return_dict)
-
         send_file_access_msg(request, repo, path, 'web')
         return render(request, template, return_dict)
+
     elif getattr(settings, 'ENABLE_CAD', False) and path.endswith('.dwg'):
 
         from seahub.cad.utils import get_cad_dict
@@ -1983,92 +1952,6 @@ def text_diff(request, repo_id):
     })
 
 
-########## office related
-def _check_office_convert_perm(request, repo_id, path, ret):
-    token = request.GET.get('token', '')
-    if not token:
-        # Work around for the images embedded in excel files
-        referer = request.headers.get('referer', '')
-        if referer:
-            token = urllib.parse.parse_qs(
-                urllib.parse.urlparse(referer).query).get('token', [''])[0]
-    if token:
-        fileshare = FileShare.objects.get_valid_file_link_by_token(token)
-        if not fileshare or fileshare.repo_id != repo_id:
-            return False
-        if fileshare.is_file_share_link() and fileshare.path == path:
-            return True
-        if fileshare.is_dir_share_link():
-            ret['dir_share_path'] = fileshare.path
-            return True
-        return False
-    else:
-        return request.user.is_authenticated and \
-            check_folder_permission(request, repo_id, '/') is not None
-
-def _office_convert_get_file_id(request, repo_id=None, commit_id=None, path=None):
-    repo_id = repo_id or request.GET.get('repo_id', '')
-    commit_id = commit_id or request.GET.get('commit_id', '')
-    path = path or request.GET.get('path', '')
-    if not (repo_id and path and commit_id):
-        raise BadRequestException()
-    if '../' in path:
-        raise BadRequestException()
-
-    ret = {'dir_share_path': None}
-    if not _check_office_convert_perm(request, repo_id, path, ret):
-        raise BadRequestException()
-
-    if ret['dir_share_path']:
-        path = posixpath.join(ret['dir_share_path'], path.lstrip('/'))
-    return seafserv_threaded_rpc.get_file_id_by_commit_and_path(repo_id, commit_id, path)
-
-@json_response
-def office_convert_query_status(request):
-    if not request.headers.get('x-requested-with') == 'XMLHttpRequest':
-        raise Http404
-
-    doctype = request.GET.get('doctype', None)
-    file_id = _office_convert_get_file_id(request)
-
-    ret = {'success': False}
-    try:
-        ret = query_office_convert_status(file_id, doctype)
-    except Exception as e:
-        logging.exception('failed to call query_office_convert_status')
-        ret['error'] = str(e)
-
-    return ret
-
-_OFFICE_PAGE_PATTERN = re.compile(r'^file\.css|file\.outline|index.html|index_html_.*.png|[a-z0-9]+\.pdf$')
-def office_convert_get_page(request, repo_id, commit_id, path, filename):
-    """Valid static file path inclueds:
-    - index.html for spreadsheets and index_html_xxx.png for images embedded in spreadsheets
-    - 77e168722458356507a1f373714aa9b575491f09.pdf
-    """
-    if not HAS_OFFICE_CONVERTER:
-        raise Http404
-    
-
-    if not _OFFICE_PAGE_PATTERN.match(filename):
-        return HttpResponseForbidden()
-
-    path = '/' + path
-    file_id = _office_convert_get_file_id(request, repo_id, commit_id, path)
-
-    if filename.endswith('.pdf'):
-        filename = "{0}.pdf".format(file_id)
-
-    resp = get_office_converted_page(path, filename, file_id)
-
-    if filename.endswith('.page'):
-        content_type = 'text/html'
-    else:
-        content_type = mimetypes.guess_type(filename)[0] or 'text/html'
-    resp['Content-Type'] = content_type
-    return resp
-
-
 def view_media_file_via_share_link(request):
     image_path = request.GET.get('path', '')
     token = request.GET.get('token', '')