-
Notifications
You must be signed in to change notification settings - Fork 1
/
getData.py
44 lines (35 loc) · 1.57 KB
/
getData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from pathlib import Path
from bs4 import BeautifulSoup
import json
import re
class Paper:
    """A single paper loaded from a JSON export.

    The JSON document is expected to provide the keys ``articleTitle``,
    ``articleNumber``, ``authors``, ``abstract`` and ``xml``; a missing key
    raises ``KeyError`` during construction.

    Attributes are plain instance attributes: ``path``, ``json`` (the parsed
    document), ``title``, ``number``, ``authors``, ``abstract`` and ``text``
    (the cleaned abstract plus body paragraphs).
    """

    # Noise removed in the first pass: licence footers, citation markers,
    # inline LaTeX, "View Source" residue, figure/section labels, and every
    # character that is not an ASCII letter, digit, caret or space.
    _NOISE_PATTERN = re.compile(
        r"CCBY - IEEE.*|\[\d+\]|\$.*\$|View Source.*|\\begin.*|FIGURE \d+|Fig. \d+|[^A-Za-z0-9^ ]|SECTION [A-Z]+|\t\t|\n|Eq \d+| ",
        flags=re.MULTILINE,
    )
    # Runs of spaces, collapsed to a single space.
    _SPACES_PATTERN = re.compile(r" +", flags=re.MULTILINE)
    # Cross-references (equations, lemmas, sections). The two-number section
    # form must come before the one-number form, otherwise the shorter
    # alternative always wins and "section 3 4" leaves a stray "4" behind.
    _REFERENCE_PATTERN = re.compile(
        r"Eq \d+|Lemma \d+|section \d+ \d+|section \d+|From \d+|Eqs[^a-z^A-Z]+",
        flags=re.MULTILINE,
    )

    def __init__(self, path: "str | Path") -> None:
        """Read the JSON file at ``path`` and populate the paper's fields.

        Args:
            path: Location of the JSON file; anything ``open`` accepts.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If one of the expected keys is absent.
        """
        self.path = path
        # Use a context manager so the file handle is closed right away
        # instead of lingering until garbage collection.
        with open(path, encoding='utf-8') as handle:
            self.json = json.load(handle)
        self.title = self.json['articleTitle']
        self.number = self.json['articleNumber']
        self.authors = self.json['authors']
        self.abstract = self.json['abstract']
        self.text = self._get_text()

    def __str__(self) -> str:
        """Return the paper title."""
        return f'{self.title}'

    def __repr__(self) -> str:
        """Return the paper title (same as ``str``)."""
        return f'{self.title}'

    def _get_text(self) -> str:
        """Return the cleaned abstract followed by every ``<p>`` of the XML.

        The ``xml`` field is parsed with BeautifulSoup's ``lxml`` parser and
        the text of each paragraph is joined with newlines before cleaning.
        """
        paragraphs = BeautifulSoup(self.json['xml'], 'lxml').find_all('p')
        body = "\n".join(p.text for p in paragraphs)
        return self._clean_text(self.abstract + "\n" + body)

    def _clean_text(self, text: str) -> str:
        """Strip markup residue and cross-references from ``text``.

        Args:
            text: Plain text, i.e. the abstract plus the paragraph text
                extracted from the ``xml`` field.

        Returns:
            The text with noise replaced by spaces, cross-references
            removed, space runs collapsed to a single space, and leading
            and trailing whitespace stripped.
        """
        result = self._NOISE_PATTERN.sub(" ", text)
        result = self._SPACES_PATTERN.sub(" ", result)
        result = self._REFERENCE_PATTERN.sub("", result)
        # Deleting a reference leaves the spaces on either side of it
        # adjacent, so collapse once more before trimming.
        result = self._SPACES_PATTERN.sub(" ", result)
        return result.strip()
def getData(path_to_data: str) -> "list[Paper]":
    """Load every ``*.json`` file found under ``path_to_data`` as a ``Paper``.

    The directory is searched recursively. Files are visited in the order
    ``Path.rglob`` yields them, which is not guaranteed to be sorted.

    Args:
        path_to_data: Root directory to search.

    Returns:
        One ``Paper`` per JSON file found; an empty list when there is none.
    """
    return [Paper(json_path) for json_path in Path(path_to_data).rglob('*.json')]