Skip to content

Commit

Permalink
Load encoding from libreoffice dictionaries
Browse files Browse the repository at this point in the history
Based on #4
  • Loading branch information
behdad committed Nov 10, 2023
1 parent 5e14c60 commit 9d1e9bf
Showing 1 changed file with 21 additions and 8 deletions.
29 changes: 21 additions & 8 deletions ngrams.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,17 +52,30 @@ def extract_ngrams(text, n, *, frequencies=None, cutoff=0.999, min_freq=MIN_FREQ


def extract_ngrams_from_file(filename, *kargs, **kwargs):
    """Load a word list named by *filename* and pass it to extract_ngrams.

    Candidate sources are tried in this order:

    1. ``filename`` itself, opened in binary mode and passed on as-is.
    2. ``filename + ".txt.bz2"`` together with ``filename + ".frq.bz2"``
       (harfbuzz-testing-wikipedia format); both are read fully and split
       into lines, the second one being passed as ``frequencies``.
    3. ``filename + ".aff"`` / ``filename + ".dic"`` (hunspell dictionary
       format); each ``.dic`` line is cut at its first ``/``.

    Extra positional and keyword arguments are forwarded unchanged to
    extract_ngrams, whose result is returned.

    Raises:
        FileNotFoundError: if none of the candidates can be opened.
    """
    # NOTE(review): the hunspell branch updates this from the .aff "SET"
    # line, but nothing in this function forwards it to extract_ngrams.
    encoding = "utf-8"
    frqfile = None
    try:
        # Left open on purpose: the handle is passed on unread.
        txtfile = open(filename, "rb")
    except FileNotFoundError:
        try:
            import bz2

            # Assume harfbuzz-testing-wikipedia format
            with bz2.open(filename + ".txt.bz2") as packed_text:
                txtfile = packed_text.read().splitlines()
            with bz2.open(filename + ".frq.bz2") as packed_freq:
                frqfile = packed_freq.read().splitlines()
        except FileNotFoundError:
            try:
                # Assume hunspell dictionary format; the affix file names
                # the dictionary encoding on a "SET <name>" line.
                with open(filename + ".aff", "rb") as afffile:
                    for line in afffile:
                        if line.startswith(b"SET"):
                            # split() without arguments already treats tabs
                            # and spaces alike.
                            encoding = line.split()[1].decode("ascii")
                            break
                # Left open on purpose: consumed lazily by the generator.
                dicfile = open(filename + ".dic", "rb")
                # Drop the affix flags, i.e. everything from the first "/".
                txtfile = (s.partition(b"/")[0] for s in dicfile)

            except FileNotFoundError:
                raise FileNotFoundError("File not found: %s" % filename)

    return extract_ngrams(txtfile, *kargs, frequencies=frqfile, **kwargs)

Expand Down

0 comments on commit 9d1e9bf

Please sign in to comment.