Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adds number_of_n_syllable_words_all function #21

Merged
merged 2 commits into from
Nov 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ The following descriptive statistics are supported (`descriptive_statistics.py`
* Number of syllables `syllable_count`
* Number of sentences `sentence_count`
* Number of n-syllable words `number_of_n_syllable_words`
* Number of words for each syllable count found `number_of_n_syllable_words_all`
* Average syllables per word `avg_syllable_per_word`
* Average word length `avg_word_length`
* Average sentence length `avg_sentence_length`
Expand Down
31 changes: 21 additions & 10 deletions linguaf/descriptive_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from linguaf import SUPPORTED_LANGS, __load_json, __check_bool_param, __check_documents_param, __check_lang_param, \
__check_text_param, __check_words_param


LOGGER = logging.getLogger(__name__)

try:
Expand All @@ -28,7 +27,6 @@
PUNCTUATION = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~—«»"""
STOPWORDS = dict()


for language in SUPPORTED_LANGS:
try:
# TODO: consider using nltk directly
Expand Down Expand Up @@ -171,25 +169,38 @@ def number_of_n_syllable_words(documents: list, lang: str = 'en', n: tuple = (1,
__check_documents_param(documents)
__check_lang_param(lang)

counts = number_of_n_syllable_words_all(documents, lang, remove_stopwords)
count = 0
for i in range(n[0], n[1]):
count += counts.get(i, 0)
return count


def number_of_n_syllable_words_all(documents: list, lang: str = 'en', remove_stopwords: bool = False) -> dict:
"""Count how many words have each syllable count.

Every word returned by get_words(documents, lang, remove_stopwords) is
hyphenated with pyphen; the number of hyphen-separated parts is taken as
that word's syllable count.

Keyword arguments:
documents -- the list of documents
lang -- language of the words ('zh' and 'hy' are rejected)
remove_stopwords -- forwarded unchanged to get_words (presumably drops
stopwords before counting; confirm against get_words)

Returns a collections.defaultdict(int) mapping a syllable count to the
number of words that have that count.

Raises ValueError if lang is 'zh' or 'hy'.
"""
__check_documents_param(documents)
__check_lang_param(lang)

# TODO: refactor duplicate code!
# Languages rejected up front; see the TODO notes below for the reason given for Chinese.
unsupported_langs = ['zh', 'hy']
if lang in unsupported_langs:
# NOTE(review): the f prefix on this literal is redundant (it has no placeholders).
raise ValueError(f"Syllable counting is currently not supported for the language " + lang + "!")
# TODO: chinese does have syllables! so this should be supported eventually
# however, chinese does not support hyphenation, so the implementation below does not work for it!
# however, chinese does not support hyphenation, so the implementation below does not work for it!

words = get_words(documents, lang, remove_stopwords)
# NOTE(review): `n` is not a parameter of this function. The check below looks
# like a removed-side diff line from the old number_of_n_syllable_words body;
# confirm against the merged file.
if n[0] < 1 or n[1] <= n[0]:
raise ValueError(f"The given n parameter isn't correct: {n}. n=tuple(x,y), x>0, y>x.")

# NOTE(review): `count` (singular) also appears to be a removed-side line;
# the accumulator that is actually returned at the end is `counts`.
count = 0
# Maps syllable count -> number of words; missing keys start at 0.
counts = collections.defaultdict(int)
dic = pyphen.Pyphen(lang=lang) # TODO: match language
for word in words:
# Syllable count = number of pieces after splitting the hyphenated word on '-'.
syl_cnt = len(dic.inserted(word).split('-'))
# NOTE(review): the next four lines (range over `n`, `count += 1`,
# `return count`) appear to be removed-side diff lines; confirm.
for i in range(n[0], n[1]):
if syl_cnt == i:
count += 1
return count
counts[syl_cnt] += 1
return counts


def get_words(documents: list, lang: str = 'en', remove_stopwords: bool = False) -> list:
Expand Down
Loading