Skip to content

Commit

Permalink
Replaced slow chardet with charset_normalizer, tuned failing csv files
Browse files Browse the repository at this point in the history
  • Loading branch information
Philipp Kraft committed Jan 16, 2024
1 parent 787cc84 commit 222036c
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 14 deletions.
6 changes: 3 additions & 3 deletions odmf/dataimport/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from configparser import RawConfigParser
from io import StringIO
import typing
import chardet
from charset_normalizer import detect

import ast

Expand Down Expand Up @@ -536,9 +536,9 @@ def from_file(cls, path, pattern='*.conf'):
config.read_file(f)
except UnicodeDecodeError:
rawdata = open(path, 'rb').read()
result = chardet.detect(rawdata)
result = detect(rawdata)

# if chardet can't detect encoding
# if charset_normalizer can't detect encoding
if result['encoding'] is None:
result['encoding'] = 'unknown'

Expand Down
1 change: 1 addition & 0 deletions odmf/static/templates/layout.html
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@

$(function () {
$('[data-toggle="tooltip"]').tooltip();
$('[data-toggle="popover"]').popover();
$('#login-logout').on('click', () => {
$.post('${conf.root_url}/api/logout').done(() => {
window.location.reload()
Expand Down
22 changes: 12 additions & 10 deletions odmf/webpage/filemanager/filehandlers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import io
import typing

import chardet
from charset_normalizer import detect
import pandas as pd


Expand All @@ -18,10 +18,13 @@
def load_text_file(path: Path) -> str:
with open(path.absolute, 'rb') as f:
data = f.read()
try:
return data.decode('utf-8')
except UnicodeDecodeError:
detection = chardet.detect(data)
for enc in 'utf-8', 'latin1', 'windows-1252', None:
try:
return data.decode(enc)
except UnicodeDecodeError:
continue
else:
detection = detect(data[:10000])
if not detection['encoding']:
raise ValueError(f'{path} is a binary file')
return data.decode(detection['encoding'])
Expand Down Expand Up @@ -106,7 +109,6 @@ def to_html(self, path) -> str:

class ConfFileHandler(TextFileHandler):
icon = 'file-import'
actions = fa.ConfImportAction(),
def render(self, source):
def div(content, *classes):
classes = ' '.join(classes)
Expand Down Expand Up @@ -171,14 +173,14 @@ class CsvFileHandler(BaseFileHandler):
icon = 'file-csv'
actions = fa.ConfImportAction(),
def to_html(self, path: Path) -> str:

text_io = load_text_stream(path)
try:
text_io = load_text_stream(path)
df = pd.read_csv(text_io, sep=None, engine='python')
return table_to_html(df)
except Exception as e:
text = load_text_file(path)

return '\n<pre>\n' + text + '\n</pre>\n'
return '\n<pre>\n' + text_io.getvalue() + '\n</pre>\n'


class ParquetFileHandler(BaseFileHandler):
Expand Down Expand Up @@ -248,7 +250,7 @@ class MultiHandler(BaseFileHandler):
PlotFileHandler(r'\.plot$'),
ExcelFileHandler(r'\.xls.?$'),
DocxFileHandler(r'\.docx$'),
CsvFileHandler(r'\.csv$'),
CsvFileHandler(r'\.(csv|dat)$'),
ParquetFileHandler(r'\.parquet$'),
PdfFileHandler(r'\.pdf$'),
ImageFileHandler(r'\.(jpg|jpeg|png|svg|gif)$'),
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
setuptools>=39.0
wheel>=0.30

chardet
charset_normalizer
PyYAML>=5.1
Pillow>=9.2.0
cherrypy>=18.0.0
Expand Down

0 comments on commit 222036c

Please sign in to comment.