From 5539dcf1c9fe360a2d6429d8f9af5299cbde2918 Mon Sep 17 00:00:00 2001
From: Lambert Rosique
Date: Sat, 8 Jan 2022 19:30:18 +0100
Subject: [PATCH 1/7] Fix issue #19

---
 wiktionnaireparser/parser.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/wiktionnaireparser/parser.py b/wiktionnaireparser/parser.py
index fbdffb5..244ddf1 100644
--- a/wiktionnaireparser/parser.py
+++ b/wiktionnaireparser/parser.py
@@ -196,22 +196,23 @@ def get_definitions(self, part_of_speech):
         part_of_speech = '#' + part_of_speech.replace(' ', '_')
         text = self._query.find(part_of_speech)[0]
         text = text.getparent()
-        while text.tag != 'ol':
+        while text is not None and text.tag != 'ol':
             # ligne de forme
             if text.tag == 'p' or text.tag == 'span':
                 self.ligne_de_forme(text)
             text = text.getnext()
-        for i, definition_bloc in enumerate(text.getchildren()):
-            raw = definition_bloc.text_content()
-            definition = raw.split('\n')[0]
-            # Catching examples
-            examples = get_examples(definition_bloc)
-            definitions[i] = {'definition': definition}
-            if examples:
-                definitions[i]['examples'] = examples
-            if definition_bloc.find('ol'):
-                subdefinitions = get_subdefinitions(definition_bloc.find('ol'))
-                definitions[i]['subdefinitions'] = subdefinitions
+        if text is not None:
+            for i, definition_bloc in enumerate(text.getchildren()):
+                raw = definition_bloc.text_content()
+                definition = raw.split('\n')[0]
+                # Catching examples
+                examples = get_examples(definition_bloc)
+                definitions[i] = {'definition': definition}
+                if examples:
+                    definitions[i]['examples'] = examples
+                if definition_bloc.find('ol'):
+                    subdefinitions = get_subdefinitions(definition_bloc.find('ol'))
+                    definitions[i]['subdefinitions'] = subdefinitions
         return definitions
 
     def get_etymology(self):
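A note on the guard used in this patch: lxml's getnext() returns None once the last sibling has been visited, so on a page with no <ol> of definitions the unguarded loop condition text.tag was evaluated on None. A minimal, self-contained sketch of the same pattern; the HTML fragment and the 'nom' id below are invented for illustration and are not the project's actual markup:

from lxml import html

# Illustrative fragment: a heading and a "ligne de forme", but no <ol> of definitions.
fragment = html.fromstring(
    "<div><h3 id='nom'>Nom commun</h3><p>ligne de forme</p></div>"
)

node = fragment.get_element_by_id('nom')
while node is not None and node.tag != 'ol':
    node = node.getnext()  # returns None after the last sibling

definitions = {}
if node is not None:  # only parse definitions when an <ol> was actually found
    for i, item in enumerate(node):
        definitions[i] = {'definition': item.text_content().split('\n')[0]}

print(definitions)  # {} -- no crash even though the page has no definition list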
From 148681c7c791d4666e0a3a9fcf34a3c1cce47492 Mon Sep 17 00:00:00 2001
From: Lambert Rosique
Date: Sat, 8 Jan 2022 20:05:07 +0100
Subject: [PATCH 2/7] Fix Issue #19

---
 wiktionnaireparser/utils.py | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/wiktionnaireparser/utils.py b/wiktionnaireparser/utils.py
index 2f01461..f412f5a 100644
--- a/wiktionnaireparser/utils.py
+++ b/wiktionnaireparser/utils.py
@@ -1,6 +1,7 @@
 import re
 import json
 from contextlib import suppress
+from lxml.html import HtmlComment
 
 
 def etymology_cleaner(etymology):
@@ -31,25 +32,26 @@ def extract_related_words(section):
     """Extract related words."""
     related = {}
     count = 0
-    while section.tag != 'h3' and section.tag != 'h4':
+    while section is not None and section.tag != 'h3' and section.tag != 'h4':
         words = []
         description = ''
-        if section.cssselect('.NavContent'):
-            with suppress(IndexError):
-                description = section.cssselect('.NavHead')[0].text_content()
-            for link in section.cssselect('.NavContent a'):
-                if 'Annexe:' in link.attrib.get('href'):
-                    continue
-                words.append(link.text_content())
+        if not type(section) is HtmlComment:
+            if section.cssselect('.NavContent'):
+                with suppress(IndexError):
+                    description = section.cssselect('.NavHead')[0].text_content()
+                for link in section.cssselect('.NavContent a'):
+                    if 'Annexe:' in link.attrib.get('href'):
+                        continue
+                    words.append(link.text_content())
 
-        else:
-            for link in section.cssselect('a'):
-                if 'Annexe:' in link.attrib.get('href'):
-                    continue
-                words.append(link.text_content())
-        related[count] = {}
-        related[count]['description'] = description
-        related[count]['words'] = words
+            else:
+                for link in section.cssselect('a'):
+                    if 'Annexe:' in link.attrib.get('href'):
+                        continue
+                    words.append(link.text_content())
+            related[count] = {}
+            related[count]['description'] = description
+            related[count]['words'] = words
         section = section.getnext()
         count += 1
     return related

From 249b02baa15beb3849fff73cd2897693156ed108 Mon Sep 17 00:00:00 2001
From: Lambert Rosique
Date: Sat, 8 Jan 2022 20:10:53 +0100
Subject: [PATCH 3/7] Fix Issue #22

---
 wiktionnaireparser/parser.py | 42 ++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/wiktionnaireparser/parser.py b/wiktionnaireparser/parser.py
index 244ddf1..2fdf7d2 100644
--- a/wiktionnaireparser/parser.py
+++ b/wiktionnaireparser/parser.py
@@ -274,26 +274,28 @@ def get_translations(self, translation_id):
 
         lines = section.getnext().cssselect('li')
         for line in lines:
-            language = line.find('span').text_content()
-            transl = []
-            links = line.find('a')
-            while links is not None:
-                '''
-                try:
-                    if links.attrib.get('class').endswith('-Latn'):
-                        links = links.getnext()
-                        continue
-                except AttributeError:
-                    pass
-                '''
-                if links.attrib.get('class') != 'trad-exposant' and links.attrib:
-                    if links.attrib.get('class') is None:
-                        transl.append(links.text_content())
-                    # Ignore translittérations
-                    elif not links.attrib.get('class').endswith('-Latn'):
-                        transl.append(links.text_content())
-                links = links.getnext()
-            result[language] = transl
+            language = line.find('span')
+            if language is not None:
+                language = language.text_content()
+                transl = []
+                links = line.find('a')
+                while links is not None:
+                    '''
+                    try:
+                        if links.attrib.get('class').endswith('-Latn'):
+                            links = links.getnext()
+                            continue
+                    except AttributeError:
+                        pass
+                    '''
+                    if links.attrib.get('class') != 'trad-exposant' and links.attrib:
+                        if links.attrib.get('class') is None:
+                            transl.append(links.text_content())
+                        # Ignore translittérations
+                        elif not links.attrib.get('class').endswith('-Latn'):
+                            transl.append(links.text_content())
+                    links = links.getnext()
+                result[language] = transl
 
         return result
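A note on the None check introduced for the translations: Element.find() returns None when a list item has no matching child, which is what happens for rows of the translations box that carry no language span. A small standalone illustration of the same pattern; the HTML below is invented for the example and is not actual Wiktionnaire markup:

from lxml import html

box = html.fromstring(
    "<ul>"
    "<li><span>Anglais</span> : <a>word</a></li>"
    "<li>(ligne sans nom de langue)</li>"
    "</ul>"
)

result = {}
for line in box.cssselect('li'):
    language = line.find('span')
    if language is None:  # the second <li> has no <span>: skip it instead of crashing
        continue
    result[language.text_content()] = [a.text_content() for a in line.findall('a')]

print(result)  # {'Anglais': ['word']}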
From 58b7cee3cd30767f4c40d2ad0193018dcfa5ce33 Mon Sep 17 00:00:00 2001
From: Lambert Rosique
Date: Sun, 9 Jan 2022 01:54:46 +0100
Subject: [PATCH 4/7] Fix issue #24

---
 wiktionnaireparser/parser.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/wiktionnaireparser/parser.py b/wiktionnaireparser/parser.py
index 2fdf7d2..51e6d91 100644
--- a/wiktionnaireparser/parser.py
+++ b/wiktionnaireparser/parser.py
@@ -99,14 +99,15 @@ def _find_lang_sections_id(self):
         self.sections_id = {}
         for section in lang.getnext().getchildren():  # 'li'
             section_id = section.find('a').attrib['href']
-            # Subsections?
-            if section.find('ul') is None:
-                self.sections_id[section_id] = []
-                continue
-            subsections = []
-            for subsection in section.find('ul'):
-                subsections.append(subsection.find('a').attrib['href'])
-            self.sections_id[section_id] = subsections
+            if not "*" in section_id:
+                # Subsections?
+                if section.find('ul') is None:
+                    self.sections_id[section_id] = []
+                    continue
+                subsections = []
+                for subsection in section.find('ul'):
+                    subsections.append(subsection.find('a').attrib['href'])
+                self.sections_id[section_id] = subsections
 
         return self.sections_id
 
@@ -133,7 +134,7 @@ def get_parts_of_speech(self):
         parts_of_speech = {}
         useless_sections = (
             r'Étymologie', r'Prononciation', r'Références', r'Voir_aussi',
-            r'Anagrammes', r'Liens_externes'
+            r'Anagrammes', r'Liens_externes', r'Erreurs*',
         )
         sections = filter_sections_id(self.sections_id.keys(), useless_sections)
         for section_name in sections:

From 8eb34e6f99d15395788754eeab3540cb41eb00c8 Mon Sep 17 00:00:00 2001
From: Lambert Rosique
Date: Sun, 9 Jan 2022 02:19:35 +0100
Subject: [PATCH 5/7] Fix 2nd part of issue #24 with "_"

---
 wiktionnaireparser/parser.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wiktionnaireparser/parser.py b/wiktionnaireparser/parser.py
index 51e6d91..2b2415e 100644
--- a/wiktionnaireparser/parser.py
+++ b/wiktionnaireparser/parser.py
@@ -99,7 +99,7 @@ def _find_lang_sections_id(self):
         self.sections_id = {}
         for section in lang.getnext().getchildren():  # 'li'
             section_id = section.find('a').attrib['href']
-            if not "*" in section_id:
+            if not "*" in section_id and not "_" in section_id:
                 # Subsections?
                 if section.find('ul') is None:
                     self.sections_id[section_id] = []
@@ -134,7 +134,7 @@ def get_parts_of_speech(self):
         parts_of_speech = {}
         useless_sections = (
             r'Étymologie', r'Prononciation', r'Références', r'Voir_aussi',
-            r'Anagrammes', r'Liens_externes', r'Erreurs*',
+            r'Anagrammes', r'Liens_externes', r'Erreurs*', r'=_Synonymes'
         )
         sections = filter_sections_id(self.sections_id.keys(), useless_sections)
         for section_name in sections:

From c7597a4820aace2f304b6c34840d0e609cc0c5a7 Mon Sep 17 00:00:00 2001
From: Surkal
Date: Sat, 19 Feb 2022 14:22:34 +0100
Subject: [PATCH 6/7] perf: make the default requirements file lighter

---
 requirements-dev.txt | 5 +++++
 requirements.txt     | 2 --
 2 files changed, 5 insertions(+), 2 deletions(-)
 create mode 100644 requirements-dev.txt

diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..756aab5
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,5 @@
+pytest==6.1.1
+wikitextparser==0.47.0
+coverage==5.3
+requests==2.24.0
+pyquery==1.4.1
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index cc0f971..3380ac2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,3 @@
-pytest==6.1.1
 wikitextparser==0.47.0
-coverage==5.3
 requests==2.24.0
 pyquery==1.4.1

From b90b257f6c849e0799e879bc93f98674be7800c9 Mon Sep 17 00:00:00 2001
From: Surkal
Date: Sun, 20 Feb 2022 13:22:29 +0100
Subject: [PATCH 7/7] style: use isinstance() rather than type()

---
 wiktionnaireparser/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/wiktionnaireparser/utils.py b/wiktionnaireparser/utils.py
index cdebe9c..9a63d07 100644
--- a/wiktionnaireparser/utils.py
+++ b/wiktionnaireparser/utils.py
@@ -3,6 +3,7 @@
 import re
 import json
 from contextlib import suppress
+
 from lxml.html import HtmlComment
 
 
@@ -38,7 +39,7 @@ def extract_related_words(section):
     while section is not None and section.tag not in ('h3', 'h4'):
         words = []
         description = ''
-        if not type(section) is HtmlComment:
+        if not isinstance(section, HtmlComment):
             if section.cssselect('.NavContent'):
                 with suppress(IndexError):
                     description = section.cssselect('.NavHead')[0].text_content()
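A closing note on the last change: isinstance() is the idiomatic type check and, unlike a type() identity comparison, it also accepts subclasses. A minimal sketch of why the comment guard matters when iterating an lxml tree; the HTML is invented for the example and is not the project's actual markup:

from lxml import html
from lxml.html import HtmlComment

tree = html.fromstring("<div><!-- un commentaire --><p>texte</p></div>")

for child in tree:
    # Comments come back as HtmlComment nodes when iterating over children;
    # skip them so only real elements are processed, mirroring extract_related_words.
    if isinstance(child, HtmlComment):
        continue
    print(child.tag)  # only 'p' is printed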