-
Notifications
You must be signed in to change notification settings - Fork 2
/
convertArticles.pl
executable file
·127 lines (107 loc) · 3.34 KB
/
convertArticles.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/perl
use strict;
use Encode qw(decode encode);
use HTML::Tidy;
# Single HTML::Tidy instance, created once at load time and reused by
# tidySnippet() for every article.
my $tidy = HTML::Tidy->new();
# Clean up an HTML fragment with the shared $tidy instance.
#
# Parameters:
#   $html - the HTML fragment to clean.
#
# Returns the text found between <body> and </body> in the cleaned
# output when such a pair is present, otherwise the cleaned output as is.
sub tidySnippet {
    my ($html) = @_;
    my $cleaned = $tidy->clean($html);

    # Only the inside of <body> is wanted when the cleaned output
    # contains a bare <body>...</body> pair.
    return $cleaned =~ /<body>(.*)<\/body>/mis ? $1 : $cleaned;
}
# Extract the article body from a transcription page.
#
# Parameters:
#   $article - HTML of the page.
#   $title   - article title; used to locate the heading that repeats it.
#
# Returns the remaining HTML after it has been passed through tidySnippet().
sub getbody {
    my ($article, $title) = @_;

    # Drop everything up to and including the opening content <div>.
    $article = $1
        if $article =~ /<div id="content">(.*)/mis;

    # Drop everything up to and including the heading (<u> or <h1>,
    # optionally wrapped in a "noindent" paragraph) that repeats the title.
    # The leading greedy .* makes the last such heading win.
    $article = $5
        if $article =~ /.*(<p class="noindent">)?(<u>|<h1>)\Q$title\E\s*\.?(<\/u>|<\/h1>)\.?(<\/p>)?(.*)/mis;

    return tidySnippet($article);
}
# Parse a transcription page into its title and body.
#
# Parameters:
#   $article - HTML of the page, as decoded character data.
#
# Returns a hash reference with the keys:
#   title - text of the <title> element with the optional
#           "E.W. Dijkstra Archive:" prefix, surrounding curly quotes,
#           trailing full stop and "(EWD nnn)" suffix removed; absent when
#           no <title> element matches.
#   body  - article body as returned by getbody().
sub parse {
    my $article = shift;

    # Start from an empty hash; assigning {} here would store a stringified
    # hash reference as a bogus key.
    my %result;

    # Make links that climb two directories ("../../") absolute. The dots
    # are escaped so that only a literal "../../" prefix is rewritten.
    $article =~ s{href="\.\./\.\./}{href="https://www.cs.utexas.edu/~EWD/}gi;

    # The curly quotes are written as \x{201C} / \x{201D} escapes: this file
    # does not "use utf8", so literal quote characters in the pattern would
    # be read as raw bytes and never match the decoded article text.
    if ($article =~ /<title>(?:E.W.\s*Dijkstra Archive:\s*)?\x{201C}?(.*?)\.?\x{201D}?(?:\s*\(EWD\s*\d+\w*\))?\s*<\/title>/mi) {
        $result{'title'} = $1;

        # The generated page prints its own <h1>, so remove a duplicate one.
        $article =~ s/<h1>\Q$result{'title'}\E<\/h1>//mi;
    }

    $result{'body'} = getbody($article, $result{'title'});
    return \%result;
}
# Build the HTML link to the scanned manuscript of an article.
#
# Parameters:
#   (first argument) - a filename containing "EWD<number><suffix>.html",
#                      e.g. "target/EWD1202.html".
#
# Returns an <a> element pointing at the PDF scan, or the empty string
# when the filename does not contain an EWD number.
sub original {
    my $filename = shift;

    if ($filename =~ /EWD(\d+)(.*)\.html/) {
        my ($number, $suffix) = ($1, $2);

        # Scans are grouped in "ewdNNxx" directories, one per hundred
        # documents. Derive NN from the whole number so that EWD978 maps
        # to ewd09xx rather than ewd97xx; four-digit numbers are unchanged.
        # NOTE(review): assumes the archive zero-pads the directory for
        # numbers below 1000 (ewd00xx .. ewd09xx) - confirm against the site.
        my $hundreds = sprintf('%02d', int($number / 100));

        return "<a class='original' href='https://www.cs.utexas.edu/~EWD/ewd${hundreds}xx/EWD$number$suffix.PDF'><img src='assets/original.png' alt='Show original manuscript'></a>";
    }

    return '';
}
# Convert one transcription into a standalone HTML page.
#
# Parameters:
#   $filename_in  - path of the source transcription.
#   $filename_out - path of the page to write; also handed to original()
#                   to build the link to the scanned manuscript.
#
# Dies when iconv cannot be started or the output file cannot be opened
# or completely written.
sub convertArticle {
    my ($filename_in, $filename_out) = @_;

    # Read the source through "iconv -c" (UTF-8 to UTF-8). The list form of
    # the pipe open passes the filename as a single argument without going
    # through a shell, so spaces, quotes and other metacharacters in the
    # path cannot break or alter the command.
    open(my $in, '-|', 'iconv', '-c', '--from', 'UTF-8', '--to', 'UTF-8', $filename_in)
        or die "Could not run iconv on $filename_in: $!";
    binmode($in, ":utf8");

    # Slurp the whole document regardless of the caller's $/ setting.
    my $article = do { local $/; <$in> };

    # The exit status of iconv is deliberately not treated as fatal:
    # presumably it can be non-zero when -c had to drop bytes - TODO confirm.
    close($in);

    my $parsed = parse($article);
    my $original = original($filename_out);

    open(my $out, '>', $filename_out)
        or die "Could not open $filename_out for writing: $!";
    binmode($out, ":utf8");
    print {$out} <<EOF;
<!DOCTYPE html>
<html>
<head>
<title>$parsed->{'title'}</title>
<link href="https://fonts.googleapis.com/css?family=Lobster|Raleway" rel="stylesheet">
<link href="assets/common.css" rel="stylesheet">
<link href="assets/transcriptions.css" rel="stylesheet">
<link href="assets/tweet-selection.css" rel="stylesheet">
<script src="https://code.jquery.com/jquery-1.12.0.min.js" type="text/javascript" charset="utf-8"></script>
<script src="assets/tweet-selection.js"></script>
<meta name="generator" content="convertArticle.pl">
<meta name="twitter:card" content="summary" />
<meta name="twitter:site" content="\@raboofje" />
<meta name="twitter:description" content="From the Edsger Dijkstra EWD archive: $parsed->{'title'}" />
<meta name="twitter:title" content="$parsed->{'title'}" />
<meta name="twitter:image" content="http://raboof.github.io/ewd/assets/dijkstra.jpeg" />
</head>
<body>
<div class="metabar">
<div class="metabar-inner">
<a href="index.html">HOME</a>
</div>
</div>
<h1>$parsed->{'title'}</h1>
$original
<div class='body'>$parsed->{'body'}</div>
<script>
\$('.body').tweetSelection({
ellipsis: '...',
quoteLeft: '\\'',
quoteRight: '\\'',
via: 'raboofje'
});
</script>
</body>
</html>
EOF

    # Buffered write errors only surface when the handle is closed.
    close($out)
        or die "Could not finish writing $filename_out: $!";
}
# Work out the output filename for a source transcription path.
#
# Parameters:
#   $source - path of the source file; expected to contain a "/" and to
#             end in ".htm" or ".html".
#
# Returns the bare output filename (no directory), always ending in
# ".html". Dies when no filename can be derived from $source.
sub targetName {
    my ($source) = @_;

    # Any path mentioning EWD978 maps to one fixed name.
    return "EWD978.html" if $source =~ /EWD978/;

    die "Could not determine output filename for $source"
        unless $source =~ /.*\/(.*)\.html?/;

    # Take the part after the last "/" and upper-case the first "ewd" in it.
    (my $basename = $1) =~ s/ewd/EWD/;
    return "$basename.html";
}
# Switch the input record separator off for the rest of the run, so that a
# single read returns a whole file.
undef $/;

# Every transcription found directly inside an EWD* directory...
my @files = glob('sources/www.cs.utexas.edu/~EWD/transcriptions/EWD*/*.htm*');

# ...plus one file that sits a directory level deeper and is therefore not
# matched by the pattern above.
push @files, 'sources/www.cs.utexas.edu/~EWD/transcriptions/EWD12xx/EWD 1202/EWD1202.html';

# Write one converted page per source file into target/.
for my $source (@files) {
    my $destination = 'target/' . targetName($source);
    convertArticle($source, $destination);
}