From 4d534fd0396330e47b2acc442cc8983f65c0e4d0 Mon Sep 17 00:00:00 2001 From: Thomas Huber Date: Wed, 30 Oct 2024 10:48:58 +0100 Subject: [PATCH 1/2] adds number_of_n_syllable_words_all function, to count all frequencies of all numbers of syllables found in texts --- README.md | 1 + linguaf/descriptive_statistics.py | 34 ++++++++++++++++++++++--------- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 5f65916..1e32ab6 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ The following descriptive statistics are supported (`descriptive_statistics.py` * Number of syllables `syllable_count` * Number of sentences `sentence_count` * Number of n-syllable words `number_of_n_syllable_words` +* Number of n-syllable words for all found syllables `number_of_n_syllable_words_all` * Average syllables per word `avg_syllable_per_word` * Average word length `avg_word_length` * Average sentence length `avg_sentence_length` diff --git a/linguaf/descriptive_statistics.py b/linguaf/descriptive_statistics.py index a9e3613..39b9a6d 100644 --- a/linguaf/descriptive_statistics.py +++ b/linguaf/descriptive_statistics.py @@ -15,7 +15,6 @@ from linguaf import SUPPORTED_LANGS, __load_json, __check_bool_param, __check_documents_param, __check_lang_param, \ __check_text_param, __check_words_param - LOGGER = logging.getLogger(__name__) try: @@ -28,7 +27,6 @@ PUNCTUATION = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~—«»""" STOPWORDS = dict() - for language in SUPPORTED_LANGS: try: # TODO: consider using nltk directly @@ -171,25 +169,41 @@ def number_of_n_syllable_words(documents: list, lang: str = 'en', n: tuple = (1, __check_documents_param(documents) __check_lang_param(lang) + counts = number_of_n_syllable_words_all(documents, lang, remove_stopwords) + count = 0 + for i in range(n[0], n[1]): + count += counts.get(i, 0) + return count + + +def number_of_n_syllable_words_all(documents: list, lang: str = 'en', remove_stopwords: bool = False) -> dict: + """Count each found number of syllables in a list of words. + + Keyword arguments: + documents -- the list of documents + lang -- language of the words + """ + __check_documents_param(documents) + __check_lang_param(lang) + # TODO: refactor duplicate code! unsupported_langs = ['zh', 'hy'] if lang in unsupported_langs: raise ValueError(f"Syllable counting is currently not supported for the language " + lang + "!") # TODO: chinese does have syllables! so this should be supported eventually - # however, chinese does not support hyphenation, so the implementation below does not work for it! + # however, chinese does not support hyphenation, so the implementation below does not work for it! words = get_words(documents, lang, remove_stopwords) - if n[0] < 1 or n[1] <= n[0]: - raise ValueError(f"The given n parameter isn't correct: {n}. n=tuple(x,y), x>0, y>x.") - count = 0 + counts = {} dic = pyphen.Pyphen(lang=lang) # TODO: match language for word in words: syl_cnt = len(dic.inserted(word).split('-')) - for i in range(n[0], n[1]): - if syl_cnt == i: - count += 1 - return count + if syl_cnt in counts: + counts[syl_cnt] += 1 + else: + counts[syl_cnt] = 1 + return counts def get_words(documents: list, lang: str = 'en', remove_stopwords: bool = False) -> list: From cb553c2e038eeaabc8a877b3adda9f494a033c3e Mon Sep 17 00:00:00 2001 From: Thomas Huber Date: Wed, 30 Oct 2024 11:11:06 +0100 Subject: [PATCH 2/2] using defaultdict instead of regular dict --- linguaf/descriptive_statistics.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/linguaf/descriptive_statistics.py b/linguaf/descriptive_statistics.py index 39b9a6d..8f8f76c 100644 --- a/linguaf/descriptive_statistics.py +++ b/linguaf/descriptive_statistics.py @@ -195,14 +195,11 @@ def number_of_n_syllable_words_all(documents: list, lang: str = 'en', remove_sto words = get_words(documents, lang, remove_stopwords) - counts = {} + counts = collections.defaultdict(int) dic = pyphen.Pyphen(lang=lang) # TODO: match language for word in words: syl_cnt = len(dic.inserted(word).split('-')) - if syl_cnt in counts: - counts[syl_cnt] += 1 - else: - counts[syl_cnt] = 1 + counts[syl_cnt] += 1 return counts