-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathBuild a Stack Exchange Scraper
104 lines (86 loc) · 3.42 KB
/
Build a Stack Exchange Scraper
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Scrapes a Stack Exchange question-listing page read from standard input and prints one
 * {@code id;title;time} line per question.
 *
 * <p>The page is split into one segment per {@code <div class="question-summary" ...>}
 * container. The id, title and "asked" time are all extracted from the same segment, so a
 * question that lacks one of the three fields can neither shift the fields of the following
 * questions nor cause an out-of-range list access. Such incomplete questions are skipped.
 */
public class Solution{
    /** Opening tag of a question container, up to (but excluding) its closing {@code >}. */
    private static final Pattern QUESTION_TAG_PATTERN =
            Pattern.compile("[<][\\W]*div[\\W]*class=\"question-summary\"[^>]*");

    /**
     * A standalone {@code id} attribute (not the tail of e.g. {@code data-id}); the value is
     * captured in group 1 when double-quoted and in group 2 when unquoted.
     */
    private static final Pattern ID_ATTRIBUTE_PATTERN =
            Pattern.compile("(?<![\\w-])id\\s*=\\s*(?:\"([^\"]*)\"|([^\\s\">]+))");

    /**
     * A {@code <div class="summary">}, one wrapper tag (such as {@code <h3>}) and the start
     * of the title link, running on through the link text up to the next {@code <}.
     */
    private static final Pattern SUMMARY_PATTERN = Pattern.compile(
            "[<][\\W]*div[\\W]*class[\\W]*=[\\W]*summary[\\W]*[>][\\W]*[<][\\W]*[\\w]*[\\W]*[>][\\W]*"
            + "[<][\\W]*+[a][\\W]*href=\"[\\W]*[^>][^<]+");

    /** The word "asked" followed by a {@code <span ...>}; group 1 is the span's text. */
    private static final Pattern ASKED_TIME_PATTERN =
            Pattern.compile("asked[\\W]*<[\\W]*span[^>]*>([^<]*)");

    /**
     * Appends the title of every question summary found in {@code string} to
     * {@code arrSummary}, in document order.
     *
     * <p>The title is taken as the text after the last {@code >} of the matched link opening,
     * so a title that itself contains a literal {@code >} is truncated to the part after it.
     *
     * @param string     HTML text to search
     * @param arrSummary list that receives the trimmed titles; existing entries are kept
     */
    public static void sumarry(String string, ArrayList<String> arrSummary) {
        Matcher summaryMatcher = SUMMARY_PATTERN.matcher(string);
        while (summaryMatcher.find()) {
            // The match ends with: ...href="..." class="...">Title text
            String[] pieces = summaryMatcher.group().trim().split(">");
            arrSummary.add(pieces[pieces.length - 1].trim());
        }
    }

    /**
     * Reads the whole page from standard input and prints {@code id;title;time} for every
     * question container in which all three fields could be found.
     *
     * @param args ignored
     * @throws IOException declared for interface compatibility; not thrown by this code
     */
    public static void main(String[] args) throws IOException {
        // System.in is deliberately not closed; the scanner only borrows it.
        Scanner in = new Scanner(System.in);
        StringBuilder page = new StringBuilder();
        while (in.hasNextLine()) {
            page.append(in.nextLine()).append('\n');
        }
        String html = page.toString();

        // Record where every question container's opening tag starts and ends.
        ArrayList<int[]> tagSpans = new ArrayList<int[]>();
        Matcher questionMatcher = QUESTION_TAG_PATTERN.matcher(html);
        while (questionMatcher.find()) {
            tagSpans.add(new int[] {questionMatcher.start(), questionMatcher.end()});
        }

        for (int i = 0; i < tagSpans.size(); i++) {
            int[] span = tagSpans.get(i);
            // A question's segment runs from its opening tag to the next question's tag.
            int segmentEnd = (i + 1 < tagSpans.size()) ? tagSpans.get(i + 1)[0] : html.length();
            String openingTag = html.substring(span[0], span[1]);
            String segment = html.substring(span[1], segmentEnd);

            String questionId = extractQuestionId(openingTag);
            String title = extractFirstTitle(segment);
            String askedTime = extractAskedTime(segment);
            if (questionId == null || title == null || askedTime == null) {
                continue; // incomplete question: skip it rather than misalign or crash
            }
            System.out.println(questionId + ";" + title + ";" + askedTime);
        }
    }

    /**
     * Returns the third dash-separated part of the tag's {@code id} attribute value
     * (e.g. {@code 80407} for {@code question-summary-80407}), or {@code null} when the tag
     * has no id attribute of that shape.
     */
    private static String extractQuestionId(String openingTag) {
        Matcher idMatcher = ID_ATTRIBUTE_PATTERN.matcher(openingTag);
        while (idMatcher.find()) {
            String value = idMatcher.group(1) != null ? idMatcher.group(1) : idMatcher.group(2);
            String[] parts = value.trim().split("-");
            if (parts.length >= 3) {
                return parts[2].trim();
            }
        }
        return null;
    }

    /** Returns the first summary title inside {@code segment}, or {@code null} if none. */
    private static String extractFirstTitle(String segment) {
        ArrayList<String> titles = new ArrayList<String>();
        sumarry(segment, titles);
        return titles.isEmpty() ? null : titles.get(0);
    }

    /** Returns the trimmed text of the first "asked" span in {@code segment}, or {@code null}. */
    private static String extractAskedTime(String segment) {
        Matcher askedMatcher = ASKED_TIME_PATTERN.matcher(segment);
        return askedMatcher.find() ? askedMatcher.group(1).trim() : null;
    }
}