# -*- coding: utf-8 -*-
# Reconstructed from a whitespace-mangled git patch that added four new
# scraper scripts.  Diff metadata has been dropped; each banner-commented
# section below is one file from the patch and should be split back into
# separate files on disk before running.

# ======================================================================
# file: week2大作业提交/guozifeng/link_extraction.py
# Harvests the category index URLs from the ganji.com second-hand
# landing page; the harvested URLs are frozen into `link_list` below so
# the crawler does not re-scrape the landing page on every run.
# ======================================================================
from bs4 import BeautifulSoup
import requests

start_url = 'http://bj.ganji.com/wu/'


def get_link_urls(url):
    """Print the absolute URL of every category link on index page *url*.

    Each '.fenlei dt a' anchor's href is joined onto the site root and
    printed one per line.  Run once, by hand, to produce `link_list`.
    """
    # BUG FIX: the original requested the module-level `start_url`,
    # silently ignoring the `url` argument.
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    for link in soup.select('.fenlei dt a'):
        page_url = 'http://bj.ganji.com' + link.get('href')
        print(page_url)

# get_link_urls(start_url)   # run once to (re)generate link_list below

# Output of get_link_urls(start_url), frozen as data.  w2_main.py splits
# this string on whitespace to obtain the category roots to crawl.
link_list = '''
    http://bj.ganji.com/jiaju/
    http://bj.ganji.com/rirongbaihuo/
    http://bj.ganji.com/shoujihaoma/
    http://bj.ganji.com/bangong/
    http://bj.ganji.com/nongyongpin/
    http://bj.ganji.com/jiadian/
    http://bj.ganji.com/ershoubijibendiannao/
    http://bj.ganji.com/ruanjiantushu/
    http://bj.ganji.com/yingyouyunfu/
    http://bj.ganji.com/diannao/
    http://bj.ganji.com/xianzhilipin/
    http://bj.ganji.com/fushixiaobaxuemao/
    http://bj.ganji.com/meironghuazhuang/
    http://bj.ganji.com/shuma/
    http://bj.ganji.com/laonianyongpin/
    http://bj.ganji.com/xuniwupin/
    http://bj.ganji.com/qitawupin/
    http://bj.ganji.com/ershoufree/
    http://bj.ganji.com/wupinjiaohuan/
'''

# ======================================================================
# file: week2大作业提交/guozifeng/w2_main.py
# Entry point: fan the crawl out over a process pool — first collect the
# item URLs of every category, then fetch details for every stored URL.
# ======================================================================
from multiprocessing import Pool

from link_extraction import link_list                       # category roots
from web_page_parsing import (get_links_from, get_item_info,
                              url_list, item_info)          # crawlers + Mongo collections

# Resume-after-interruption support (crawl only URLs not yet parsed),
# kept from the original as a documented TODO rather than dead code:
#   crawled  = {item['url'] for item in item_info.find()}
#   pending  = {item['url'] for item in url_list.find()} - crawled


def get_all_links_from(link):
    """Scrape list pages 1-100 of one category root into `url_list`."""
    for num in range(1, 101):
        get_links_from(link, num)


def get_link_from_database(collection):
    """Return every stored item URL from the given Mongo collection.

    NOTE: parameter renamed from `url_list`, which shadowed the imported
    collection of the same name; callers pass positionally, so it's safe.
    """
    return [item['url'] for item in collection.find()]


if __name__ == '__main__':
    # Context manager guarantees worker processes are terminated; the
    # original never closed/joined the pool.
    with Pool() as pool:
        # Stage 1: collect item URLs for every category index.
        pool.map(get_all_links_from, link_list.split())
        # Stage 2: fetch and store the detail page of every collected URL.
        pool.map(get_item_info, get_link_from_database(url_list))

# ======================================================================
# file: week2大作业提交/guozifeng/web_page_parsing.py
# List-page and detail-page scrapers; results are stored in MongoDB.
# ======================================================================
from bs4 import BeautifulSoup
import requests
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
project_market = client['project_market']
url_list = project_market['url_list']    # collected item URLs
item_info = project_market['item_info']  # parsed item details


def get_links_from(link, pages, who_type=1):
    """Store every item URL found on page *pages* of category *link*.

    `who_type` selects the seller-type segment of the list URL
    (1 appears to mean private sellers on ganji.com — TODO confirm).
    A page past the last one has no '.pageBox' pager, which is used as
    the stop signal.
    """
    url = '{}a{}o{}/'.format(link, str(who_type), str(pages))
    # e.g. http://bj.ganji.com/jiaju/a1o1/
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if not soup.find('div', 'pageBox'):
        return  # past the last page (or an empty/blocked response)
    # BUG FIX: the loop variable used to be `link`, shadowing the parameter.
    for anchor in soup.select('.ft-tit'):
        item_link = anchor.get('href')
        url_list.insert_one({'url': item_link})
        print(item_link)


def get_item_info(url):
    """Parse one item detail page and store title/time/type/price/area.

    Pages without an 'h1.title-name' (deleted listings, redirects) are
    skipped silently, matching the original best-effort behaviour.
    """
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if not soup.find('h1', 'title-name'):
        return
    title = soup.select('.title-name')[0].text.split('-')[0]
    post_time = soup.select('.pr-5')[0].text.strip().split(' ')[0]
    type_info = soup.select('div.leftBox > div:nth-of-type(3) > div > ul > li:nth-of-type(1) > span > a')[0].text
    price = soup.select('.f22')[0].text
    area = list(map(lambda x: x.text, soup.select('div.leftBox > div:nth-of-type(3) > div > ul > li:nth-of-type(3) > a ')))
    data = {
        '商品标题': title,
        '发帖时间': post_time,
        '类型': type_info,
        '价格': price,
        '交易地点': area,
        'url': url
    }
    print(data)
    item_info.insert_one(data)

# Manual smoke test:
# get_item_info('http://bj.ganji.com/jiaju/1956529909x.htm')

# ======================================================================
# file: 第二周课程/2.2爬取工作流分析/2.2练习作业提交/guozifeng/2.2.py
# 58.com phone-number listing crawler: walks pages pn1, pn2, ... until a
# page with no 'a.t' result, storing each listing's url and title.
# ======================================================================
from bs4 import BeautifulSoup
import requests
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
_58shouji = client['_58shouji']
url_list = _58shouji['url_list']


def crawl_shoujihao():
    """Crawl listing pages sequentially until an empty page is reached."""
    pages = 0
    while True:
        pages += 1
        url = 'http://bj.58.com/shoujihao/pn%s/' % str(pages)
        time.sleep(1)  # be polite: at most one request per second
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        if not soup.find('a', 't'):
            break  # no listings on this page — we are past the end
        for info in soup.select('a.t'):
            data = {
                'url': info.get('href'),
                'title': info.select('strong')[0].get_text()
            }
            url_list.insert_one(data)
            print(data)


if __name__ == '__main__':
    # BUG FIX: the original ran this loop at module top level, so it
    # started crawling as a side effect of any import.
    crawl_shoujihao()