From 086c03a068e9cbf6eb30b1d15ca477706c2e586a Mon Sep 17 00:00:00 2001
From: llj <lingjun.li1@gmail.com>
Date: Sun, 24 Feb 2013 16:52:10 +0800
Subject: [PATCH] [file encoding] added encoding option & used chardet for
 Text/Markdown/Sf file

---
 base/context_processors.py                |  3 +-
 media/css/seahub.css                      | 10 ++++
 settings.py                               |  1 +
 templates/file_view.html                  |  2 +-
 templates/snippets/file_content_html.html | 21 +++++++-
 templates/snippets/file_content_js.html   | 29 +++++++++++-
 views.py                                  | 58 ++++++++++++++++-------
 7 files changed, 102 insertions(+), 22 deletions(-)
diff --git a/base/context_processors.py b/base/context_processors.py
index dc9ae46e35..6a4d589c36 100644
--- a/base/context_processors.py
+++ b/base/context_processors.py
@@ -7,7 +7,7 @@ These are referenced from the setting TEMPLATE_CONTEXT_PROCESSORS and used by
 RequestContext.
 """
 from settings import SEAFILE_VERSION, SITE_TITLE, SITE_NAME, SITE_BASE, \
-    ENABLE_SIGNUP, MAX_FILE_NAME, USE_PDFJS
+    ENABLE_SIGNUP, MAX_FILE_NAME, USE_PDFJS, FILE_ENCODING_LIST
 try:
     from settings import BUSINESS_MODE
 except ImportError:
@@ -38,5 +38,6 @@ def base(request):
         'enable_signup': ENABLE_SIGNUP,
         'max_file_name': MAX_FILE_NAME,
         'use_pdfjs': USE_PDFJS,
+        'file_encoding_list': FILE_ENCODING_LIST,
         }
 
diff --git a/media/css/seahub.css b/media/css/seahub.css
index 55c87cbd97..4fe41a0224 100644
--- a/media/css/seahub.css
+++ b/media/css/seahub.css
@@ -1507,6 +1507,16 @@ textarea:-moz-placeholder {/* for FF */
 #file-op button {
     padding:2px 8px;
 }
+#file-enc-cont {
+    width:950px;
+    margin:-20px auto 6px;
+    text-align:right;
+}
+#file-enc {
+    border:1px solid #ddd;
+    border-radius:2px;
+    background:#efefef;
+}
 #file-view-tip {
     height: 150px;
     padding:10px;
diff --git a/settings.py b/settings.py
index c926f1ff9c..be29187b2c 100644
--- a/settings.py
+++ b/settings.py
@@ -146,6 +146,7 @@ ACCOUNT_ACTIVATION_DAYS = 7
 # File preview
 FILE_PREVIEW_MAX_SIZE = 10 * 1024 * 1024
 USE_PDFJS = True
+FILE_ENCODING_LIST = ['auto', 'utf-8', 'gbk', 'ISO-8859-1', 'ISO-8859-5']
 
 # Avatar
 AVATAR_STORAGE_DIR = 'avatars'
diff --git a/templates/file_view.html b/templates/file_view.html
index 28be7cefb1..94232654b2 100644
--- a/templates/file_view.html
+++ b/templates/file_view.html
@@ -73,7 +73,7 @@
 
                 {% if not read_only %}
                 {% if filetype == 'Text' or filetype == 'Markdown' or filetype == 'Sf' %}
-                <button data="{{ SITE_ROOT }}repo/{{ repo.id }}/file/edit/?p={{ path }}" id="edit">{% trans "Edit"%}</button>
+                <button data="{{ SITE_ROOT }}repo/{{ repo.id }}/file/edit/?p={{ path }}&file_enc={{ file_enc|urlencode }}" id="edit">{% trans "Edit"%}</button>
                 {% endif %}
                 {% endif %}
             {% endif %}
diff --git a/templates/snippets/file_content_html.html b/templates/snippets/file_content_html.html
index d30df55d52..93c7476671 100644
--- a/templates/snippets/file_content_html.html
+++ b/templates/snippets/file_content_html.html
@@ -5,14 +5,31 @@ content of files that can be viewed online shows here.
 For details please refer to 'snippets/file_content_js.html'.
 {% endcomment %}
 <div id="file-view">
+    {% if filetype == 'Text' or filetype == 'Sf' or filetype == 'Markdown' %}
+    <div id="file-enc-cont">
+        <label for="file-enc">{% trans "Encoding:" %}</label>
+        <select id="file-enc">
+            {% for enc in file_encoding_list %}
+            <option value="{{ enc }}" {% if file_enc == enc %} selected="selected" {% endif %}>{% if enc == 'auto'%}{% trans "auto detect" %}{% else %}{{ enc }}{% endif %}</option>
+            {% endfor %}
+        </select>
+    </div>
+    {% endif %}
+
     {% if not err %}
-        {% if filetype == 'Text' or filetype == 'Sf' %}
+        {% if filetype == 'Text' or filetype == 'Sf' or filetype == 'Markdown' %}
             {% ifnotequal file_content None %}
                 {% if filetype == 'Text' %}
                 <textarea id="docu-view" class="vh">{{ file_content|escape }}</textarea>
-                {% else %}
+                {% endif %}
+
+                {% if filetype == 'Sf' %}
                 <div id="sf" class="article">{{ file_content|safe }}</div>
                 {% endif %}
+
+                {% if filetype == 'Markdown' %}
+                <div id="md-view" class="article"></div>
+                {% endif %}
             {% endifnotequal %}
         {% endif %}
 
diff --git a/templates/snippets/file_content_js.html b/templates/snippets/file_content_js.html
index 46f84e424b..25b6d8dd48 100644
--- a/templates/snippets/file_content_js.html
+++ b/templates/snippets/file_content_js.html
@@ -40,8 +40,7 @@
 {% if filetype == 'Markdown' %}
     {% ifnotequal file_content None %}
     var converter = new Showdown.converter();
-    $('#file-view').html('<div id="md-view" class="article">' + converter.makeHtml('{{ file_content|escapejs }}') + '</div>');
-    $('#md-view').children(':first').css('margin-top', '0');
+    $('#md-view').html(converter.makeHtml('{{ file_content|escapejs }}')).children(':first').css('margin-top', '0');
     {% endifnotequal %}
 {% endif %}
 
@@ -168,3 +167,29 @@ $('#file-view').html('<div id="file-view-tip"><p>{% trans "This type of file can
 {% endif %}
 
 {% endif %}{# 'if not err' ends here. #}
+
+{% if filetype == 'Text' or filetype == 'Sf' or filetype == 'Markdown' %}
+    $('#file-enc').change(function() {
+        // Reload the page with the selected encoding in the query string.
+        var file_enc = encodeURIComponent($(this).val());
+        var s = location.search;
+        // Drop the leading '?' and any empty pieces (empty or bare '?' search).
+        var params = $.grep(s.substr(1).split('&'), function(p) {
+            return p !== '';
+        });
+        var found = false;
+        for (var i = 0, len = params.length; i < len; i++) {
+            // Compare the parameter name exactly: a substring test would
+            // also match file names or other values containing 'file_enc'.
+            if (params[i].split('=')[0] == 'file_enc') {
+                params[i] = 'file_enc=' + file_enc;
+                found = true;
+                break;
+            }
+        }
+        if (!found) {
+            params.push('file_enc=' + file_enc);
+        }
+        location.search = '?' + params.join('&');
+    });
+{% endif %}
diff --git a/views.py b/views.py
index 7b2c20fd46..78efc68c29 100644
--- a/views.py
+++ b/views.py
@@ -9,6 +9,7 @@ import sys
 import urllib
 import urllib2
 import logging
+import chardet
 from urllib import quote
 from django.core.cache import cache
 from django.core.urlresolvers import reverse
@@ -78,7 +79,7 @@ try:
         DOCUMENT_CONVERTOR_ROOT += '/'
 except ImportError:
     DOCUMENT_CONVERTOR_ROOT = None
-from settings import FILE_PREVIEW_MAX_SIZE, INIT_PASSWD, USE_PDFJS,\
+from settings import FILE_PREVIEW_MAX_SIZE, INIT_PASSWD, USE_PDFJS, FILE_ENCODING_LIST, \
     SEND_EMAIL_ON_ADDING_SYSTEM_MEMBER, SEND_EMAIL_ON_RESETTING_USER_PASSWD
 
 try:
@@ -1247,6 +1248,7 @@ def repo_view_file(request, repo_id):
     filename = urllib2.quote(u_filename.encode('utf-8'))
     comment_open = request.GET.get('comment_open', '')
     page_from = request.GET.get('from', '')
+    file_enc = request.GET.get('file_enc', 'auto')
 
     commit_id = request.GET.get('commit_id', '')
     view_history = True if commit_id else False
@@ -1304,7 +1306,7 @@ def repo_view_file(request, repo_id):
     raw_path = gen_file_get_url(token, filename)
    
     # get file content
-    err, file_content, swf_exists, filetype = get_file_content(filetype, raw_path, obj_id, fileext)
+    err, file_content, swf_exists, filetype = get_file_content(filetype, raw_path, obj_id, fileext, file_enc)
 
     img_prev = None
     img_next = None
@@ -1346,6 +1348,7 @@ def repo_view_file(request, repo_id):
                 'raw_path': raw_path,
                 'err': err,
                 'file_content': file_content,
+                'file_enc': file_enc,
                 'swf_exists': swf_exists,
                 'DOCUMENT_CONVERTOR_ROOT': DOCUMENT_CONVERTOR_ROOT,
                 'page_from': page_from,
@@ -1423,6 +1426,7 @@ def repo_view_file(request, repo_id):
             'contacts': contacts,
             'err': err,
             'file_content': file_content,
+            'file_enc': file_enc,
             "applet_root": get_ccnetapplet_root(),
             'groups': groups,
             'comments': comments,
@@ -1492,10 +1496,13 @@ def file_comment(request):
                                 content_type=content_type)
     
    
-def repo_file_get(raw_path):
+def repo_file_get(raw_path, file_enc):
     err = ''
     file_content = ''
     encoding = ''
+    if file_enc in FILE_ENCODING_LIST and file_enc != 'auto':
+        encoding = file_enc
+
     try:
         file_response = urllib2.urlopen(raw_path)
         if long(file_response.headers['Content-Length']) > FILE_PREVIEW_MAX_SIZE:
@@ -1510,23 +1517,37 @@ def repo_file_get(raw_path):
         err = _(u'URLError: failed to open file online')
         return err, '', ''
     else:
-        try:
-            u_content = content.decode('utf-8')
-            encoding = 'utf-8'
-        except UnicodeDecodeError:
-            # XXX: file in windows is encoded in gbk
+        if encoding:
             try:
-                u_content = content.decode('gbk')
-                encoding = 'gbk'
+                u_content = content.decode(encoding)
             except UnicodeDecodeError:
-                err = _(u'Unknown file encoding')
+                err = _(u'The encoding you chose is not proper.')
                 return err, '', ''
+        else:
+            try:
+                u_content = content.decode('utf-8')
+                encoding = 'utf-8'
+            except UnicodeDecodeError:
+                try:
+                    u_content = content.decode('gbk')
+                    encoding = 'gbk'
+                except UnicodeDecodeError:
+                    # chardet may find nothing (None) or name a codec that
+                    # Python lacks (LookupError); both mean "unknown".
+                    encoding = chardet.detect(content)['encoding']
+                    try:
+                        u_content = content.decode(encoding) if encoding else None
+                    except (UnicodeDecodeError, LookupError):
+                        u_content = None
+                    if u_content is None:
+                        err = _(u'Unknown file encoding')
+                        return err, '', ''
 
         file_content = u_content
 
     return err, file_content, encoding
 
-def get_file_content(filetype, raw_path, obj_id, fileext):
+def get_file_content(filetype, raw_path, obj_id, fileext, file_enc):
     err = ''
     file_content = ''
     swf_exists = False
@@ -1537,7 +1558,7 @@ def get_file_content(filetype, raw_path, obj_id, fileext):
         file_content['img_w'], file_content['img_h'] = img.size
 
     if filetype == 'Text' or filetype == 'Markdown' or filetype == 'Sf':
-        err, file_content, encoding = repo_file_get(raw_path)
+        err, file_content, encoding = repo_file_get(raw_path, file_enc)
     elif filetype == 'Document':
         if DOCUMENT_CONVERTOR_ROOT:
             err, swf_exists = flash_prepare(raw_path, obj_id, fileext)
@@ -1664,7 +1685,8 @@ def file_edit(request, repo_id):
                 op = 'decrypt'
         if not op:
             raw_path = gen_file_get_url(token, filename)
-            err, file_content, encoding = repo_file_get(raw_path)
+            file_enc = request.GET.get('file_enc', 'auto')
+            err, file_content, encoding = repo_file_get(raw_path, file_enc)
     else:
         err = _(u'Edit online is not offered for this type of file.')
 
@@ -2581,7 +2603,8 @@ def view_shared_file(request, token):
     raw_path = gen_file_get_url(access_token, quote_filename)
 
     # get file content
-    err, file_content, swf_exists, filetype = get_file_content(filetype, raw_path, obj_id, fileext)
+    file_enc = request.GET.get('file_enc', 'auto')
+    err, file_content, swf_exists, filetype = get_file_content(filetype, raw_path, obj_id, fileext, file_enc)
     
     # Increase file shared link view_cnt, this operation should be atomic
     fileshare = FileShare.objects.get(token=token)
@@ -2601,6 +2624,7 @@ def view_shared_file(request, token):
             'username': username,
             'err': err,
             'file_content': file_content,
+            'file_enc': file_enc,
             'swf_exists': swf_exists,
             'DOCUMENT_CONVERTOR_ROOT': DOCUMENT_CONVERTOR_ROOT,
             }, context_instance=RequestContext(request))
@@ -2683,7 +2707,8 @@ def view_file_via_shared_dir(request, token):
     # Raw path
     raw_path = gen_file_get_url(access_token, quote_filename)
     # get file content
-    err, file_content, swf_exists, filetype = get_file_content(filetype, raw_path, obj_id, fileext)
+    file_enc = request.GET.get('file_enc', 'auto')
+    err, file_content, swf_exists, filetype = get_file_content(filetype, raw_path, obj_id, fileext, file_enc)
 
     zipped = gen_path_link(path, '')
         
@@ -2700,6 +2725,7 @@ def view_file_via_shared_dir(request, token):
             'username': username,
             'err': err,
             'file_content': file_content,
+            'file_enc': file_enc,
             'swf_exists': swf_exists,
             'DOCUMENT_CONVERTOR_ROOT': DOCUMENT_CONVERTOR_ROOT,
             'zipped': zipped,