Load encoding from libreoffice dictionaries

Based on #4
behdad · Nov 10, 2023 · 9d1e9bf
1 parent 5e14c60
commit 9d1e9bf
Showing 1 changed file with 21 additions and 8 deletions.
diff --git a/ngrams.py b/ngrams.py
@@ -52,17 +52,30 @@ def extract_ngrams(text, n, *, frequencies=None, cutoff=0.999, min_freq=MIN_FREQ
 
 
 def extract_ngrams_from_file(filename, *kargs, **kwargs):
+    encoding = 'utf-8'
+    frqfile = None
     try:
         txtfile = open(filename, "rb")
-        # Assume hunspell dictionary format; drop everything after "/"
-        txtfile = (s if s.find(b"/") == -1 else s[: s.find(b"/")] for s in txtfile)
-        frqfile = None
     except FileNotFoundError:
-        import bz2
-
-        # Assume harfbuzz-testing-wikipedia format
-        txtfile = bz2.open(filename + ".txt.bz2").read().splitlines()
-        frqfile = bz2.open(filename + ".frq.bz2").read().splitlines()
+        try:
+            import bz2
+
+            # Assume harfbuzz-testing-wikipedia format
+            txtfile = bz2.open(filename + ".txt.bz2").read().splitlines()
+            frqfile = bz2.open(filename + ".frq.bz2").read().splitlines()
+        except FileNotFoundError:
+            try:
+                # Assume hunspell dictionary format;
+                afffile = open(filename + ".aff", "rb")
+                for line in afffile:
+                    if line.startswith(b"SET"):
+                        encoding = line.replace(b"\t", b" ").split()[1].decode("ascii")
+                        break
+                txtfile = open(filename + ".dic", "rb")
+                txtfile = (s if s.find(b"/") == -1 else s[: s.find(b"/")] for s in txtfile)
+
+            except FileNotFoundError:
+                raise FileNotFoundError("File not found: %s" % filename)
 
     return extract_ngrams(txtfile, *kargs, frequencies=frqfile, **kwargs)