-
Notifications
You must be signed in to change notification settings - Fork 0
/
Program.cs
70 lines (57 loc) · 1.91 KB
/
Program.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
using HtmlAgilityPack;
// Entry point. Expects a single command-line argument: the root folder to scan.
// Every .htm/.html file below that folder is converted to a sibling .txt file,
// and all distinct text chunks found are additionally written to
// <root>/allText.txt. Returns 0 on success, 1 on a usage error.
Console.WriteLine("Extract plain text from HTML.");

// Distinct text chunks collected across all processed files, in first-seen
// order. ExtractText appends to this list as a side effect.
var allChunks = new List<string>();

if (args.Length == 0)
{
    Console.WriteLine("ERROR: Please provide a folder path as argument.");
    return 1;
}

string rootPath = args[0];
if (!Directory.Exists(rootPath))
{
    // Report a bad path as a usage error instead of crashing with an
    // unhandled DirectoryNotFoundException from Directory.GetFiles.
    Console.WriteLine($"ERROR: Folder not found: {rootPath}");
    return 1;
}

TraverseDirectory(rootPath, ProcessFile);

// Path.Combine picks the platform's separator; a hard-coded "\\" would create
// a file literally named "<root>\allText.txt" on non-Windows systems.
File.WriteAllText(Path.Combine(rootPath, "allText.txt"), String.Join("\r\n", allChunks));
return 0;
// Recursively walks the directory tree rooted at `path` and invokes `action`
// with the full path of every file whose extension is .htm or .html
// (case-insensitive). Files in a directory are visited before its
// subdirectories.
void TraverseDirectory(string path, Action<string> action)
{
    foreach (string file in Directory.GetFiles(path))
    {
        // Compare the real extension rather than the tail of the path, so a
        // name that merely ends in "html" (e.g. "nothtml") is not mistaken
        // for an HTML file.
        string extension = Path.GetExtension(file);
        if (extension.Equals(".htm", StringComparison.OrdinalIgnoreCase) ||
            extension.Equals(".html", StringComparison.OrdinalIgnoreCase))
        {
            action(file);
        }
    }

    foreach (string directory in Directory.GetDirectories(path))
    {
        TraverseDirectory(directory, action);
    }
}
// Converts a single HTML file to plain text: reads the file, extracts its
// text via ExtractText, and writes the result next to the source as
// "<filePath>.txt". Progress is logged to the console before and after.
void ProcessFile(string filePath)
{
    var markup = File.ReadAllText(filePath);
    Console.WriteLine($"Processing file: {filePath} {markup.Length} characters");

    var plainText = ExtractText(markup);
    var outputPath = string.Concat(filePath, ".txt");
    File.WriteAllText(outputPath, plainText);

    Console.WriteLine($"Plain text written to: {outputPath} {plainText.Length} characters");
}
// Extracts the visible text of the given HTML markup.
//
// Collects the trimmed InnerText of every p, h1-h6, li, tr, td, th, label and
// a element, drops empty strings and duplicates (keeping first-seen order),
// and returns the chunks joined with "\r\n". Each chunk is also appended to
// the program-wide allChunks list if it is not already there.
//
// Returns an empty string when the markup contains none of those elements.
// Throws ArgumentNullException when html is null.
string ExtractText(string html)
{
    if (html is null)
    {
        throw new ArgumentNullException(nameof(html));
    }

    HtmlDocument doc = new();
    doc.LoadHtml(html);

    // HtmlAgilityPack's SelectNodes returns null (not an empty collection)
    // when nothing matches, so guard before iterating.
    var nodes = doc.DocumentNode.SelectNodes("//p|//h1|//h2|//h3|//h4|//h5|//h6|//li|//tr|//td|//th|//label|//a");
    if (nodes is null)
    {
        return string.Empty;
    }

    var chunks = new List<string>();
    // Tracks what is already in `chunks` so the per-file duplicate check does
    // not rescan the list for every node.
    var seen = new HashSet<string>();

    foreach (HtmlNode node in nodes)
    {
        string toInsert = node.InnerText.Trim();
        if (toInsert.Length == 0)
        {
            continue;
        }

        if (seen.Add(toInsert))
        {
            chunks.Add(toInsert);
        }

        if (!allChunks.Contains(toInsert))
        {
            allChunks.Add(toInsert);
        }
    }

    return String.Join("\r\n", chunks);
}