-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathxmlMaker.py
105 lines (85 loc) · 3.28 KB
/
xmlMaker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/python
import xml.etree.cElementTree as ET
from bs4 import BeautifulSoup
import urllib
import csv
import re
# Convert the Craigslist listing URLs in a CSV file into one XML feed.
#
# NOTE(review): this block was rebuilt from a copy whose indentation had been
# lost. The nesting below assumes the <content> and <price> elements are
# emitted once per row, after their find_all loops — confirm against the
# original layout.

# Scheme + host of a Craigslist URL. Group 1 is the site base used to build
# the reply link (e.g. "http://sfbay.craigslist.org"); group 2 is the
# subdomain, used as the fallback city. The dots are escaped and the match
# stops at the host so that an "org" later in the path cannot leak into the
# base.
_LISTING_URL_RE = re.compile(r'(https?://([^/]+?)\.craigslist\.org)')


def _split_listing_url(url):
    """Return ``(base, region)`` for a Craigslist listing URL.

    Raises:
        ValueError: if *url* does not contain a ``<region>.craigslist.org``
            host.
    """
    match = _LISTING_URL_RE.search(url)
    if match is None:
        raise ValueError("not a craigslist.org listing URL: %r" % (url,))
    return match.group(1), match.group(2)


def _fetch_soup(url):
    """Download *url* and return it parsed with the "html.parser" backend."""
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails part-way.
        response.close()
    return BeautifulSoup(html, "html.parser")


def _last_text(tags, default=''):
    """Return the text of the last tag in *tags*, or *default* if none."""
    if not tags:
        return default
    return tags[-1].get_text()


def _column(row, index, default):
    """Return ``row[index]`` when that CSV column exists, else *default*."""
    if len(row) > index:
        return row[index]
    return default


def _page_title(soup):
    """Return the text of html > head > title, or '' if any level is absent."""
    node = soup
    for tag_name in ("html", "head", "title"):
        node = node.find(tag_name)
        if node is None:
            return ''
    return node.get_text()


def _find_contact_email(soup, base):
    """Follow each reply link on the listing page and return the email found.

    Starts from '' for every listing so an address can never carry over from
    a previously processed row. When several addresses are found the last one
    wins.
    """
    email = ''
    for link in soup.find_all("a", {"id": "replylink"}):
        href = link.get('href')
        if not href:
            continue  # anchor without a target: nothing to follow
        reply_soup = _fetch_soup(base + href)
        email = _last_text(
            reply_soup.find_all("a", {"class": "mailapp"}), email)
    return email


def _add_images(listing, soup):
    """Append one <image> element per picture found on the listing page."""
    single = soup.find_all("div", {"class": "slide first visible"})
    thumbs = soup.find_all("a", {"class": "thumb"})
    if len(thumbs) > 1:
        # Several thumbnails: each anchor's href is used as the image URL.
        for thumb in thumbs:
            ET.SubElement(listing, "image").text = thumb.get('href')
    elif single:
        for slide in single:
            picture = slide.find('img')
            if picture is not None:
                ET.SubElement(listing, "image").text = picture.get('src')


def _add_listing(listings, row):
    """Scrape the page named by ``row[0]`` and append a <listing> element.

    CSV columns: A = listing URL, B = category, C = city, D = region,
    E = value for the "new-custom-field" custom field, F = extra text
    appended to the content. Columns B-F are optional.
    """
    url = row[0]
    base, region = _split_listing_url(url)
    soup = _fetch_soup(url)
    email = _find_contact_email(soup, base)

    listing = ET.SubElement(listings, "listing")

    # title
    ET.SubElement(listing, "title", lang="en_US").text = _page_title(soup)

    # content: posting body, then the source URL, then optional column F
    parts = [_last_text(soup.find_all("section", {"id": "postingbody"})), url]
    if len(row) > 5:
        parts.append(row[5])
    ET.SubElement(listing, "content", lang="en_US").text = '\n\n'.join(parts)

    # category
    ET.SubElement(listing, "category", lang="en_US").text = _column(
        row, 1, "other")

    # email and name
    ET.SubElement(listing, "contactemail").text = email
    ET.SubElement(listing, "contactname").text = "Craigslist Ad"

    # price (empty when the page shows none)
    price = _last_text(soup.find_all("span", {"class": "price"})).strip('$')
    ET.SubElement(listing, "price").text = price
    ET.SubElement(listing, "currency").text = "USD"
    ET.SubElement(listing, "city_area").text = ""

    # city and state use columns C and D if present
    ET.SubElement(listing, "city").text = _column(row, 2, region)
    ET.SubElement(listing, "region").text = _column(row, 3, "")
    ET.SubElement(listing, "countryId").text = "US"
    ET.SubElement(listing, "country").text = ""

    # Custom fields
    if len(row) > 4:
        ET.SubElement(
            listing, "custom", name="new-custom-field").text = row[4]

    # Images
    _add_images(listing, soup)

    # time (empty when the page has no <time> tag)
    time_tag = soup.find("time")
    posted = time_tag.get('datetime') if time_tag is not None else ''
    ET.SubElement(listing, "datetime").text = posted


# NOTE(review): 'criagslist.xml' looks like a typo of 'craigslist.xml'; it is
# kept as the default so anything already reading that path is unaffected.
def main(csv_path='urls.csv', xml_path='criagslist.xml'):
    """Build the XML feed for every listing URL in *csv_path*.

    Args:
        csv_path: CSV file with one listing per row (URL in the first column).
        xml_path: destination of the UTF-8 XML document.

    Raises:
        ValueError: if a row's URL is not a craigslist.org URL.
    """
    listings = ET.Element("listings")
    with open(csv_path) as csv_file:
        for row in csv.reader(csv_file):
            # csv.reader yields [] for blank lines; skip those and rows
            # whose URL cell is empty instead of failing on row[0].
            if not row or not row[0].strip():
                continue
            _add_listing(listings, row)
    tree = ET.ElementTree(listings)
    tree.write(xml_path, encoding='utf-8', xml_declaration=True)


if __name__ == "__main__":
    main()