# nlp_summaries.py
# Extract TextRank key phrases from the `summary` column of CSV files.
import pytextrank
import spacy
import pandas as pd
import glob
# Glob pattern selecting the input CSV files; passed to run_nlp_summaries()
# by the call at the bottom of this file.
path = "data//nlp/*.csv"
# Load the spaCy `en_core_web_sm` pipeline once at import time; it is shared
# by every call to run_nlp_summaries().
nlp = spacy.load("en_core_web_sm")
# Create a pytextrank TextRank object and register its component as the last
# pipeline step under the name "textrank".
# NOTE(review): presumably this is what provides the `doc._.phrases` extension
# read in run_nlp_summaries(), and this registration style looks like the
# pytextrank 2.x / spaCy 2.x API — confirm against the pinned versions.
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
def listToString(s):
    """Concatenate the string elements of *s* into a single string.

    Elements are joined in iteration order with no separator between them.
    Missing values (``None`` or a float NaN, which is how pandas represents
    an empty CSV cell) are skipped instead of raising ``TypeError``.

    Args:
        s: Iterable of strings, e.g. a list or a pandas Series of text.

    Returns:
        The concatenation of all non-missing elements; ``""`` when *s* is
        empty or contains only missing values.

    Raises:
        TypeError: If an element is neither a string nor a missing value.
    """
    # ``ele != ele`` is true only for NaN, so NaN is detected without an
    # extra import. ``str.join`` also avoids the quadratic ``+=`` loop.
    return "".join(
        ele
        for ele in s
        if not (ele is None or (isinstance(ele, float) and ele != ele))
    )
def getTarget(doc4):
    """Return the ``target_site`` value of the last row of *doc4* as a string.

    The original implementation looped over every row with ``iterrows()`` and
    kept only the final value; this reads that value directly.

    Args:
        doc4: DataFrame containing a ``target_site`` column.

    Returns:
        ``str()`` of the last row's ``target_site`` value, or ``""`` when the
        frame has no rows (previously an ``UnboundLocalError``).

    Raises:
        KeyError: If a non-empty *doc4* has no ``target_site`` column.
    """
    if doc4.empty:
        return ""
    return str(doc4['target_site'].iloc[-1])
def run_nlp_summaries(path):
    """Extract ranked key phrases from the ``summary`` text of each CSV matching *path*.

    Every matching file is read as windows-1252. Its ``summary`` column is
    concatenated into one document with ``listToString`` and run through the
    module-level ``nlp`` pipeline. Each entry of ``doc._.phrases`` contributes
    one output row, tagged with the file's target site from ``getTarget``.

    Args:
        path: Glob pattern of the CSV files to process. Each file is expected
            to have ``summary`` and ``target_site`` columns.

    Returns:
        A DataFrame with one row per phrase and four string-valued columns:
        ``eigencentrality`` (phrase rank, 4 decimals), ``counts`` (phrase
        count, width 5), ``key_words`` (phrase text) and ``target_site``.
        The frame is also printed to stdout. It has no rows when no file
        matches or no phrases are found.
    """
    eigencentrality = []
    counts = []
    key_words = []
    targets = []

    # Sort the matches so the row order is reproducible; glob returns files
    # in arbitrary order.
    for fname in sorted(glob.glob(path)):
        df_1 = pd.read_csv(fname, encoding='windows-1252')
        if df_1.empty:
            # A header-only file has no text to analyse and no target to read.
            continue

        # Parse fields. Empty summary cells are read as NaN, so drop them
        # before the text is concatenated.
        docs1 = listToString(df_1['summary'].dropna())
        target_site = getTarget(df_1[['target_site']])

        # NLP metrics: Summary
        doc_summary = nlp(docs1)
        for p in doc_summary._.phrases:
            eigencentrality.append("{:.4f}".format(p.rank))
            counts.append("{:5d}".format(p.count))
            key_words.append("{}".format(p.text))
            targets.append("{}".format(target_site))

    # Combine the individual NLP metrics into one dataframe. Building it from
    # named columns keeps the labels; the earlier
    # pd.concat(..., ignore_index=True, axis=1) relabelled the columns 0..3.
    nlp_summaries = pd.DataFrame({
        'eigencentrality': eigencentrality,
        'counts': counts,
        'key_words': key_words,
        'target_site': targets,
    })
    print(nlp_summaries)
    return nlp_summaries
# Run the pipeline over the default glob pattern. There is no `__main__`
# guard, so this executes on import as well as when the file is run directly.
run_nlp_summaries(path)