-
Notifications
You must be signed in to change notification settings - Fork 1
/
rss_scrape.php
86 lines (68 loc) · 1.93 KB
/
rss_scrape.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
<?php
// rss_scrape.php
//
// Runs the automatic harvest of IA with the designated RSS feed.
//
// Flow:
//   1. Fetch the feed at $RSS_URL with cURL and parse it with SimpleXML.
//   2. For every <channel><item>, skip links that check_if_dl() reports as
//      already downloaded.
//   3. If $SEARCH_TERM is unset/empty, every remaining item is a candidate;
//      otherwise only items whose <description> matches $SEARCH_TERM are.
//   4. Candidates that are not on the blacklist are passed to download_data();
//      the link is logged to $GOOD_ITEMS on success, $BAD_RSS_ITEMS on failure.
//   5. Non-matching items are logged to $BAD_ITEMS.
//
// $RSS_URL, $SEARCH_TERM, $INSTALL_DIR, $GOOD_ITEMS, $BAD_ITEMS, $BAD_RSS_ITEMS,
// $DEBUG and $LE are not defined in this file; presumably they come from
// config.php. The helper functions (load_blacklist, check_if_dl, on_blacklist,
// download_data, write_single_log) are likewise defined elsewhere, presumably
// in get_files.php.
include_once('config.php');
include_once('get_files.php');

echo "\nRunning RSS IA Scraper. ".$LE;

if (!isset($RSS_URL)) {
    echo "RSS URL has not been set. ".$LE;
    exit;
}

// --- Fetch the feed -------------------------------------------------------
$rss_url_c = curl_init($RSS_URL);
if ($rss_url_c === false) {
    echo "Unable to initialise cURL for the RSS URL. ".$LE;
    exit(1);
}
curl_setopt($rss_url_c, CURLOPT_RETURNTRANSFER, true);
curl_setopt($rss_url_c, CURLOPT_HEADER, 0);
$rss_data = curl_exec($rss_url_c);
// Grab the error text before the handle is released.
$rss_error = curl_error($rss_url_c);
curl_close($rss_url_c);

// curl_exec() returns false on failure; handing that (or an empty body) to
// SimpleXMLElement would only surface as an uncaught exception.
if ($rss_data === false || $rss_data === '') {
    echo "Unable to fetch RSS feed: ".$rss_error.$LE;
    exit(1);
}

// --- Parse the feed -------------------------------------------------------
try {
    // LIBXML_NOCDATA merges CDATA sections into plain text nodes.
    $rss_xml = new SimpleXMLElement($rss_data, LIBXML_NOCDATA);
} catch (Exception $e) {
    echo "Unable to parse RSS feed: ".$e->getMessage().$LE;
    exit(1);
}

$blist = load_blacklist();

// An unset or empty search term means "download everything".
$download_all = !isset($SEARCH_TERM) || $SEARCH_TERM == "";

// Guard against feeds without <channel>/<item> so the loop is simply skipped.
$items = isset($rss_xml->channel->item) ? $rss_xml->channel->item : array();

// --- Iterate through the whole RSS file -----------------------------------
// foreach walks the sibling list once; a manual counter keeps the zero-based
// item number used in the progress output.
$i = -1;
foreach ($items as $item) {
    $i++;

    // SimpleXML yields element objects; the helpers and the log want the text.
    $url = (string) $item->link;
    echo $LE."Item number $i - Processing: ".$url;

    // If it has been downloaded jump to the next item.
    if (check_if_dl($url)) {
        echo " ...already downloaded. ".$LE;
        continue;
    }

    // $SEARCH_TERM is handed to preg_match() unchanged, so it is assumed to be
    // a complete PCRE pattern including delimiters - TODO confirm in config.php.
    // A preg_match() error (false) is treated like "no match", as before.
    $wanted = $download_all
        || preg_match($SEARCH_TERM, (string) $item->description);

    if (!$wanted) {
        write_single_log($url, $INSTALL_DIR.$BAD_ITEMS);
        if (!empty($DEBUG)) {
            // Concatenate outside the quotes: inside a double-quoted string the
            // "." between the two variables was printed literally.
            echo "... Item not recognized. Logged in ".$INSTALL_DIR.$BAD_ITEMS.$LE.$LE;
        }
        continue;
    }

    // Blacklisted links are skipped silently and not logged.
    if (on_blacklist($url, $blist)) {
        continue;
    }

    // Attempt and check all in one.
    if (!download_data($url)) {
        write_single_log($url, $INSTALL_DIR.$BAD_RSS_ITEMS);
        echo " ...error encounter. Item logged. ".$LE;
    } else {
        write_single_log($url, $INSTALL_DIR.$GOOD_ITEMS);
        echo "...done. ".$LE;
    }
}

echo $LE."RSS Scrape Complete";