manager.py
import json
import os
import re
import sys
from statistics import median

import numpy as np
import pandas
def ensure_dir(path):
    """Create the directory `path` if it does not already exist and return it."""
    os.makedirs(path, exist_ok=True)
    return path
def write(file, directory, st):
    """Write the string `st` to `file` inside `directory`, creating the directory if needed."""
    if not isinstance(st, str):
        print('Not a string')
        sys.exit(1)
    with open(os.path.join(ensure_dir(directory), file), 'w') as f:
        f.write(st)
MAIN_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_PATH = ensure_dir(os.path.join(MAIN_DIR, 'inputs'))
OUTPUT_PATH = ensure_dir(os.path.join(MAIN_DIR, 'outputs'))
WORKING_PATH = ensure_dir(os.path.join(MAIN_DIR, 'working'))
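# Hedged usage sketch (illustrative, not part of the original module):
# write() creates the target directory on demand via ensure_dir(), so a
# single call suffices. The filename and text here are assumptions made
# purely for illustration.
#
#     write('summary.txt', OUTPUT_PATH, 'some report text')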
# Each entry pairs a dataset name with a compiled regular expression for the
# terms Google Scholar matched; these are used to assign each publication its
# "Sets" attribute.
WEIGHTED_SETS = [('NKI', re.compile(r'(\Wnki\W*|nathan\s(s\. )?kline\sinstitute).*rockland|rockland\ssample')),
                 ('ADHD200', re.compile(r'adhd\W200')),
                 ('CORR', re.compile(r'\scorr\s|(consortium\sfor\sreproducibility\sand\sreliability)')),
                 ('ABIDE', re.compile(r'abide|(autism\sbrain\simaging\sdata\sexchange)'))]
UNWEIGHTED_SETS = [('FCP', re.compile(r'1,?000\sfunctional\sconnectomes?(\sproject)?|fcp|fcon[\s_]*1000'))]
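# Hedged sketch (not part of the original module): one way these pattern sets
# might be applied to tag a publication. The function name `match_sets` and
# the assumption that input text is lowercased first (the literals above are
# all lowercase and the patterns are compiled without re.IGNORECASE) are
# illustrative guesses, not the project's confirmed matching logic.
def match_sets(text):
    """Return the names of every Set whose pattern matches `text`."""
    text = text.lower()
    return [name for name, pattern in WEIGHTED_SETS + UNWEIGHTED_SETS
            if pattern.search(text)]
# e.g. match_sets('The ADHD-200 sample and the 1000 Functional Connectomes Project')
# would return ['ADHD200', 'FCP'].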
# Papers related to the main area of interest, identified manually
CONTR_PAPERS = [1, 5, 74, 92, 653]
def get_file(file, directory):
    """Open and return the named file from the given directory.
    :param file: Name of the file to open
    :param directory: Path of the directory containing the file
    :return: An open file object for reading
    """
    file_name = file.replace('_', ' ').split('.')[0]
    if not os.path.isfile(os.path.join(directory, file)):
        print(file_name, 'does not exist')
        sys.exit(1)
    try:
        return open(os.path.join(directory, file))
    except IOError:
        print('Unable to open', file_name, 'file')
        sys.exit(1)
def get_data():
    """Load the Google Scholar results CSV (named in DATA_NAME.txt) into a
    pandas DataFrame, drop duplicate entries, and cache it in the module-level
    DATA so repeated calls reuse the same object.
    :return: DATA -- a pandas DataFrame of Google Scholar results
    """
    global DATA_NAME, DATA
    if 'DATA' in globals():
        return DATA
    DATA_NAME = get_file('DATA_NAME.txt', WORKING_PATH).read().strip() + '.csv'
    DATA = pandas.read_csv(get_file(DATA_NAME, OUTPUT_PATH))
    # A non-negative 'Duplicate of' value points at the original entry; keep
    # only non-duplicates, then drop the bookkeeping column.
    DATA = DATA.loc[DATA['Duplicate of'] < 0]
    DATA.drop('Duplicate of', axis=1, inplace=True)
    return DATA
def get_paragraphs():
    """Retrieve text from the paragraphs.json file and convert it into a dictionary.
    :return: Dictionary with (key = integer index, value = paragraph text)
    """
    return {int(i): paragraph for i, paragraph in json.load(get_file('paragraphs.json', WORKING_PATH)).items()}
def get_bibs():
    """Load the parsed bibliography records; used in affil_to_json.py to
    determine article affiliations and locate them geographically.
    :return: Pandas DataFrame of article affiliations
    """
    return pandas.read_csv(get_file('parsed_bibs.csv', WORKING_PATH))
def update_data():
    """Export the current DATA DataFrame to a CSV file (named DATA_NAME) in
    the output path directory.
    """
    print('data updated:', id(DATA))
    DATA.to_csv(path_or_buf=os.path.join(OUTPUT_PATH, DATA_NAME),
                index=False)
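# Hedged usage sketch (illustrative, not part of the original module): the
# load-modify-save cycle these helpers support. Because get_data() returns the
# cached module-level DATA, in-place edits are picked up by update_data(). The
# column names 'Sets' and 'Title' are assumptions, 'Sets' being taken from the
# comment above WEIGHTED_SETS.
#
#     df = get_data()                            # load and cache the results table
#     df['Sets'] = df['Title'].map(match_sets)   # hypothetical recomputation
#     update_data()                              # write the cached DATA back to CSV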
def get_journal_attrs():
    """Retrieve journal names and relevant attributes (CiteScore, categories)
    from a json file located in the inputs directory.
    :return: A dictionary with (key = lowercased journal name, value = attributes)
    """
    return {key.lower(): value for key, value in json.load(get_file('journal_attrs.json', INPUT_PATH)).items()}
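# Hedged usage sketch (illustrative, not part of the original module): keys
# are lowercased above, so lookups should lowercase the journal name too. The
# journal name and the attribute key 'CiteScore' are assumptions made for
# illustration.
#
#     attrs = get_journal_attrs()
#     attrs['neuroimage'].get('CiteScore')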