Skip to content

Commit

Permalink
fix for unicode errors
Browse files Browse the repository at this point in the history
  • Loading branch information
nooraangelva committed Sep 29, 2022
1 parent e922f50 commit ee96740
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 1 deletion.
3 changes: 2 additions & 1 deletion inspirehep/modules/workflows/tasks/arxiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,8 @@ def extract_authors_from_xml(xml_content):

# Get the names of all affiliated organizations, looked up by the organization ids listed on the author
for affiliation in author.xpath("./authorAffiliations/authorAffiliation/@organizationid").getall():
orgName = str(content.xpath(u'//organizations/Organization[@id="{}"]/orgName[@source="spiresICN" or @source="INSPIRE" and text()!="" ]/text()'.format(affiliation)).get())
orgName = content.xpath(u'string(//organizations/Organization[@id="{}"]/orgName[@source="spiresICN" or @source="INSPIRE" and text()!="" ]/text())'.format(affiliation)).get()

cleaned_org_name = re.sub(remove_new_line_regex, '', orgName)
if orgName and not re.match(undefined_or_none_value_regex, cleaned_org_name):
affiliations.append(cleaned_org_name)
Expand Down
Binary file added tests/unit/workflows/fixtures/2202.12988.tar.gz
Binary file not shown.
35 changes: 35 additions & 0 deletions tests/unit/workflows/test_workflows_tasks_arxiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -1421,3 +1421,38 @@ def test_arxiv_ignores_random_xml_files():

arxiv_author_list(obj, eng)
assert obj.data.get('authors', None) is None


def test_arxiv_handles_non_ascii_organization_names():
    """Check that author extraction yields authors for the 2202.12988 fixture.

    Builds a minimal record carrying a single ``hep-ex`` arXiv eprint,
    attaches the ``2202.12988.tar.gz`` fixture as the record's file, runs
    ``arxiv_author_list`` and asserts that an ``authors`` entry is present
    afterwards.

    NOTE(review): the fixture presumably contains organization names with
    non-ASCII characters (per the test name) -- confirm against the tarball.
    """
    tarball_name = '2202.12988.tar.gz'

    # Validate only the eprints part of the record against the 'hep' schema.
    hep_schema = load_schema('hep')
    tarball_path = pkg_resources.resource_filename(
        __name__,
        os.path.join('fixtures', tarball_name),
    )

    arxiv_eprints = [
        {
            'categories': ['hep-ex'],
            'value': '2202.12988',
        },
    ]
    record = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': arxiv_eprints,
    }
    validate(record['arxiv_eprints'], hep_schema['properties']['arxiv_eprints'])

    # The mocked file store maps the tarball key to an object exposing
    # ``.file.uri``, pointing at the fixture on disk.
    attached_files = MockFiles({
        tarball_name: AttrDict({
            'file': AttrDict({'uri': tarball_path}),
        }),
    })
    obj = MockObj(record, {}, files=attached_files)

    arxiv_author_list(obj, MockEng())

    extracted_authors = obj.data.get('authors', None)
    assert extracted_authors is not None

0 comments on commit ee96740

Please sign in to comment.