-
Notifications
You must be signed in to change notification settings - Fork 0
/
transform-authors
executable file
·66 lines (52 loc) · 2.23 KB
/
transform-authors
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python3
"""
Abbreviates a full list of authors to be '<first author> et al.' of the NDJSON
record from stdin and outputs modified records to stdout.
Note: This is a "best effort" approach and can potentially mangle the author name.
"""
import argparse
import json
import re
from sys import stderr, stdin, stdout
# Separators between authors: a comma or semicolon (ASCII or full-width) with
# optional surrounding whitespace, OR "and"/"&" with at least one whitespace
# character on each side (so names such as "Anderson" are never split).
_AUTHOR_SEPARATOR = re.compile(r'(?:\s*[,\uFF0C;\uFF1B]\s*|\s+(?:and|&)\s+)')


def parse_authors(record: dict, authors_field: str, default_value: str,
                  index: int, abbr_authors_field: str = None) -> dict:
    """
    Abbreviate the authors list of *record* to '<first author> et al'.

    This is a best-effort transformation: the first author is whatever text
    precedes the first separator, so unusual name formats can be mangled.

    Parameters:
        record: The record to modify in place.
        authors_field: Name of the field holding the full authors string.
        default_value: Value used when the authors string is empty, contains
            only whitespace/separators, or is None.
        index: Position of the record in the input; only used in the warning
            message.
        abbr_authors_field: If given, the abbreviated value is stored in this
            field and *authors_field* is left untouched; otherwise
            *authors_field* itself is overwritten.

    Returns:
        The same *record* object, after modification.

    Raises:
        KeyError: If *authors_field* is not present in *record*.
    """
    raw_authors = record[authors_field]
    # A JSON null is treated like an empty authors list instead of crashing.
    if raw_authors is None:
        raw_authors = ""

    # Collapse every run of whitespace to one space and drop the outer
    # whitespace, so "   " counts as empty and no stray spaces reach the output.
    normalized = re.sub(r'\s+', ' ', raw_authors).strip()

    # Take the first non-empty piece, so that a leading separator
    # (e.g. ", Smith") does not produce an empty first author.
    first_author = next(
        (part for part in _AUTHOR_SEPARATOR.split(normalized) if part),
        "",
    )

    if not first_author:
        new_authors = default_value
    elif first_author.strip('. ').endswith(" et al"):
        # Already abbreviated (with or without a trailing period): keep as is.
        new_authors = first_author
    else:
        new_authors = first_author + ' et al'

    if abbr_authors_field:
        if record.get(abbr_authors_field):
            print(
                f"WARNING: the {abbr_authors_field!r} field already exists",
                f"in record {index} and will be overwritten!",
                file=stderr
            )
        record[abbr_authors_field] = new_authors
    else:
        record[authors_field] = new_authors

    return record
# Script entry point: read NDJSON records from stdin, abbreviate the authors
# of each record, and write the modified records to stdout as NDJSON.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--authors-field", default="authors",
        help="The field containing list of authors.")
    parser.add_argument("--default-value", default="?",
        help="Default value to use if authors list is empty.")
    parser.add_argument("--abbr-authors-field",
        help="The field for the generated abbreviated authors. " +
             "If not provided, the original authors field will be modified.")

    args = parser.parse_args()

    # Iterate stdin line by line so the input is streamed rather than loaded
    # all at once. The index counts every input line (blank ones included),
    # so warnings still point at the right line of the input.
    for index, line in enumerate(stdin):
        # Blank lines carry no record; skip them instead of letting
        # json.loads fail with a JSONDecodeError.
        if not line.strip():
            continue

        record = json.loads(line)
        parse_authors(record, args.authors_field, args.default_value, index, args.abbr_authors_field)
        # Compact output: one record per line, no spaces after separators.
        json.dump(record, stdout, allow_nan=False, indent=None, separators=(',', ':'))
        print()