diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..756aab5 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,5 @@ +pytest==6.1.1 +wikitextparser==0.47.0 +coverage==5.3 +requests==2.24.0 +pyquery==1.4.1 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index cc0f971..3380ac2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,3 @@ -pytest==6.1.1 wikitextparser==0.47.0 -coverage==5.3 requests==2.24.0 pyquery==1.4.1 diff --git a/wiktionnaireparser/parser.py b/wiktionnaireparser/parser.py index 7aa5bff..ebc1b7f 100644 --- a/wiktionnaireparser/parser.py +++ b/wiktionnaireparser/parser.py @@ -98,14 +98,15 @@ def _find_lang_sections_id(self): self.sections_id = {} for section in lang.getnext().getchildren(): # 'li' section_id = section.find('a').attrib['href'] - # Subsections? - if section.find('ul') is None: - self.sections_id[section_id] = [] - continue - subsections = [] - for subsection in section.find('ul'): - subsections.append(subsection.find('a').attrib['href']) - self.sections_id[section_id] = subsections + if not "*" in section_id and not "_" in section_id: + # Subsections? + if section.find('ul') is None: + self.sections_id[section_id] = [] + continue + subsections = [] + for subsection in section.find('ul'): + subsections.append(subsection.find('a').attrib['href']) + self.sections_id[section_id] = subsections return self.sections_id @@ -132,7 +133,7 @@ def get_parts_of_speech(self): parts_of_speech = {} useless_sections = ( r'Étymologie', r'Prononciation', r'Références', r'Voir_aussi', - r'Anagrammes', r'Liens_externes' + r'Anagrammes', r'Liens_externes', r'Erreurs*', r'=_Synonymes' ) sections = filter_sections_id(self.sections_id.keys(), useless_sections) for section_name in sections: @@ -195,22 +196,23 @@ def get_definitions(self, part_of_speech): part_of_speech = '#' + part_of_speech.replace(' ', '_') text = self._query.find(part_of_speech)[0] text = text.getparent() - while text.tag != 'ol': + while text is not None and text.tag != 'ol': # ligne de forme if text.tag in ('p', 'span'): self.ligne_de_forme(text) text = text.getnext() - for i, definition_bloc in enumerate(text.getchildren()): - raw = definition_bloc.text_content() - definition = raw.split('\n')[0] - # Catching examples - examples = get_examples(definition_bloc) - definitions[i] = {'definition': definition} - if examples: - definitions[i]['examples'] = examples - if definition_bloc.find('ol') is not None: - subdefinitions = get_subdefinitions(definition_bloc.find('ol')) - definitions[i]['subdefinitions'] = subdefinitions + if text is not None: + for i, definition_bloc in enumerate(text.getchildren()): + raw = definition_bloc.text_content() + definition = raw.split('\n')[0] + # Catching examples + examples = get_examples(definition_bloc) + definitions[i] = {'definition': definition} + if examples: + definitions[i]['examples'] = examples + if definition_bloc.find('ol') is not None: + subdefinitions = get_subdefinitions(definition_bloc.find('ol')) + definitions[i]['subdefinitions'] = subdefinitions return definitions def get_etymology(self): @@ -274,11 +276,12 @@ def get_translations(self, translation_id): lines = section.getnext().cssselect('li') for line in lines: + language = line.find('span') translations = [] - language = line.find('span').text_content() - links = line.cssselect('bdi a') - - for link in links: + if language is not None: + language = language.text_content() + links = line.cssselect('bdi a') + for link in links: translations.append(link.text_content()) result[language] = translations return result diff --git a/wiktionnaireparser/utils.py b/wiktionnaireparser/utils.py index 7a2ed78..9a63d07 100644 --- a/wiktionnaireparser/utils.py +++ b/wiktionnaireparser/utils.py @@ -4,6 +4,8 @@ import json from contextlib import suppress +from lxml.html import HtmlComment + def etymology_cleaner(etymology): """ @@ -34,25 +36,26 @@ def extract_related_words(section): """Extract related words.""" related = {} count = 0 - while section.tag not in ('h3', 'h4'): + while section is not None and section.tag not in ('h3', 'h4'): words = [] description = '' - if section.cssselect('.NavContent'): - with suppress(IndexError): - description = section.cssselect('.NavHead')[0].text_content() - for link in section.cssselect('.NavContent a'): - if 'Annexe:' in link.attrib.get('href'): - continue - words.append(link.text_content()) + if not isinstance(section, HtmlComment): + if section.cssselect('.NavContent'): + with suppress(IndexError): + description = section.cssselect('.NavHead')[0].text_content() + for link in section.cssselect('.NavContent a'): + if 'Annexe:' in link.attrib.get('href'): + continue + words.append(link.text_content()) - else: - for link in section.cssselect('a'): - if 'Annexe:' in link.attrib.get('href'): - continue - words.append(link.text_content()) - related[count] = {} - related[count]['description'] = description - related[count]['words'] = words + else: + for link in section.cssselect('a'): + if 'Annexe:' in link.attrib.get('href'): + continue + words.append(link.text_content()) + related[count] = {} + related[count]['description'] = description + related[count]['words'] = words section = section.getnext() count += 1 return related