Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge irosique's contributions #29

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pytest==6.1.1
wikitextparser==0.47.0
coverage==5.3
requests==2.24.0
pyquery==1.4.1
2 changes: 0 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
pytest==6.1.1
wikitextparser==0.47.0
coverage==5.3
requests==2.24.0
pyquery==1.4.1
53 changes: 28 additions & 25 deletions wiktionnaireparser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,15 @@ def _find_lang_sections_id(self):
self.sections_id = {}
for section in lang.getnext().getchildren(): # 'li'
section_id = section.find('a').attrib['href']
# Subsections?
if section.find('ul') is None:
self.sections_id[section_id] = []
continue
subsections = []
for subsection in section.find('ul'):
subsections.append(subsection.find('a').attrib['href'])
self.sections_id[section_id] = subsections
if not "*" in section_id and not "_" in section_id:
# Subsections?
if section.find('ul') is None:
self.sections_id[section_id] = []
continue
subsections = []
for subsection in section.find('ul'):
subsections.append(subsection.find('a').attrib['href'])
self.sections_id[section_id] = subsections

return self.sections_id

Expand All @@ -132,7 +133,7 @@ def get_parts_of_speech(self):
parts_of_speech = {}
useless_sections = (
r'Étymologie', r'Prononciation', r'Références', r'Voir_aussi',
r'Anagrammes', r'Liens_externes'
r'Anagrammes', r'Liens_externes', r'Erreurs*', r'=_Synonymes'
)
sections = filter_sections_id(self.sections_id.keys(), useless_sections)
for section_name in sections:
Expand Down Expand Up @@ -195,22 +196,23 @@ def get_definitions(self, part_of_speech):
part_of_speech = '#' + part_of_speech.replace(' ', '_')
text = self._query.find(part_of_speech)[0]
text = text.getparent()
while text.tag != 'ol':
while text is not None and text.tag != 'ol':
# ligne de forme
if text.tag in ('p', 'span'):
self.ligne_de_forme(text)
text = text.getnext()
for i, definition_bloc in enumerate(text.getchildren()):
raw = definition_bloc.text_content()
definition = raw.split('\n')[0]
# Catching examples
examples = get_examples(definition_bloc)
definitions[i] = {'definition': definition}
if examples:
definitions[i]['examples'] = examples
if definition_bloc.find('ol') is not None:
subdefinitions = get_subdefinitions(definition_bloc.find('ol'))
definitions[i]['subdefinitions'] = subdefinitions
if text is not None:
for i, definition_bloc in enumerate(text.getchildren()):
raw = definition_bloc.text_content()
definition = raw.split('\n')[0]
# Catching examples
examples = get_examples(definition_bloc)
definitions[i] = {'definition': definition}
if examples:
definitions[i]['examples'] = examples
if definition_bloc.find('ol') is not None:
subdefinitions = get_subdefinitions(definition_bloc.find('ol'))
definitions[i]['subdefinitions'] = subdefinitions
return definitions

def get_etymology(self):
Expand Down Expand Up @@ -274,11 +276,12 @@ def get_translations(self, translation_id):
lines = section.getnext().cssselect('li')

for line in lines:
language = line.find('span')
translations = []
language = line.find('span').text_content()
links = line.cssselect('bdi a')

for link in links:
if language is not None:
language = language.text_content()
links = line.cssselect('bdi a')
for link in links:
translations.append(link.text_content())
result[language] = translations
return result
Expand Down
35 changes: 19 additions & 16 deletions wiktionnaireparser/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import json
from contextlib import suppress

from lxml.html import HtmlComment


def etymology_cleaner(etymology):
"""
Expand Down Expand Up @@ -34,25 +36,26 @@ def extract_related_words(section):
"""Extract related words."""
related = {}
count = 0
while section.tag not in ('h3', 'h4'):
while section is not None and section.tag not in ('h3', 'h4'):
words = []
description = ''
if section.cssselect('.NavContent'):
with suppress(IndexError):
description = section.cssselect('.NavHead')[0].text_content()
for link in section.cssselect('.NavContent a'):
if 'Annexe:' in link.attrib.get('href'):
continue
words.append(link.text_content())
if not isinstance(section, HtmlComment):
if section.cssselect('.NavContent'):
with suppress(IndexError):
description = section.cssselect('.NavHead')[0].text_content()
for link in section.cssselect('.NavContent a'):
if 'Annexe:' in link.attrib.get('href'):
continue
words.append(link.text_content())

else:
for link in section.cssselect('a'):
if 'Annexe:' in link.attrib.get('href'):
continue
words.append(link.text_content())
related[count] = {}
related[count]['description'] = description
related[count]['words'] = words
else:
for link in section.cssselect('a'):
if 'Annexe:' in link.attrib.get('href'):
continue
words.append(link.text_content())
related[count] = {}
related[count]['description'] = description
related[count]['words'] = words
section = section.getnext()
count += 1
return related
Expand Down