-
Notifications
You must be signed in to change notification settings - Fork 51
/
evaluate.py
executable file
·106 lines (93 loc) · 3.38 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from urllib.parse import urljoin
import argparse
import json
from tqdm import tqdm
from pathlib import Path
import newspaper
from nltk.translate import bleu_score
from helper import (
read_or_download_json,
get_html,
string_shingle_matching,
metrics_shingle,
)
def main(args):
ground_truth = read_or_download_json(args.ground_truth)
results = {}
metrics = []
for filename, expected_article in tqdm(ground_truth.items()):
if not filename.endswith(".html") and not filename.endswith(".html.gz"):
filename += ".html.gz"
if args.html_folder.startswith("http"):
html = get_html(urljoin(args.html_folder, filename))
else:
html = get_html(Path(args.html_folder) / filename)
article = newspaper.article(
url=expected_article["url"],
input_html=html,
fetch_images=False,
)
parsed_result = article.text
metric = string_shingle_matching(expected_article["articleBody"], parsed_result)
metrics.append(metric)
results[filename] = {
"url": expected_article["url"],
"truth": expected_article["articleBody"],
"parsed": parsed_result,
"bleu_score": bleu_score.sentence_bleu(
[expected_article["articleBody"]], parsed_result
),
"precision": float(
bleu_score.modified_precision(
[expected_article["articleBody"]], parsed_result, n=5
)
),
"acc": metric[3],
"prec": metric[4],
"recall": metric[5],
"f1": metric[6],
}
copora_score = bleu_score.corpus_bleu(
[[result["truth"]] for result in results.values()],
[result["parsed"] for result in results.values()],
)
sorted_results = sorted(
[(k, result["url"], result["bleu_score"]) for k, result in results.items()],
key=lambda x: x[2],
reverse=False,
)
metric_corpora = metrics_shingle(metrics)
print(f"Corpus BLEU score: {copora_score}")
print(f"Corpus Accuracy score: {metric_corpora['accuracy']}")
print(f"Corpus Precision score: {metric_corpora['precision']}")
print(f"Corpus Recall score: {metric_corpora['recall']}")
print(f"Corpus F1 score: {metric_corpora['f1']}")
print("Top 10 worst results:")
for filename, url, score in sorted_results[:10]:
print(f"{score} {filename} {url} ")
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
json.dump(results, f, indent=4, ensure_ascii=False)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--ground-truth",
type=str,
default="https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/master/ground-truth.json",
help="URL to the groundtruth json or a local path to the json file",
)
parser.add_argument(
"--html-folder",
type=str,
default="https://github.com/scrapinghub/article-extraction-benchmark/raw/master/html/",
help=(
"URL to the folder containing the html files or a local path to the folder"
),
)
parser.add_argument(
"--output",
type=str,
help="Write parsing results and scores into an json file",
)
args = parser.parse_args()
main(args)