# pengtao
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import requests
import time
# Index page that is scraped for links to follow.
url = 'http://bj.58.com/pbdn/?PGTID=0d100000-0000-1121-f41b-137aeef068b7&ClickID=6?'
# Bound the wait: without a timeout, requests blocks indefinitely when the
# server stops responding.
wb_date = requests.get(url, timeout=10)
soup = BeautifulSoup(wb_date.text, 'lxml')
# Every <a> that is a direct child of a ``td.img`` cell on the index page.
info_tags = soup.select('td.img > a')

def parse_info(crawl_url):
    """Download one detail page and print a one-line summary of it.

    The page at ``crawl_url`` is fetched, parsed with the ``lxml`` parser,
    and six text fields are extracted with CSS selectors and printed to
    standard output.  The function then sleeps for one second before
    returning.

    Args:
        crawl_url: URL of the detail page to download.

    Returns:
        None.

    Raises:
        IndexError: if the page lacks an element that one of the selectors
            indexes into.
        requests.exceptions.RequestException: if the download fails or does
            not complete within the timeout.
    """
    # Without a timeout, requests waits forever on a stalled connection.
    content = requests.get(crawl_url, timeout=10).text
    page = BeautifulSoup(content, 'lxml')

    title = page.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.mainTitle > h1')[0].text
    when = page.select('#index_show > ul.mtit_con_left.fl > li.time')[0].text.rstrip()
    price = page.select('.su_con > span')[0].text.lstrip()
    # NOTE(review): the text of ``#totalcount`` is printed under the area
    # label below -- confirm this is the intended selector.
    address = page.select('#totalcount')[0].text.rstrip()

    # The second ``span.red`` decides the seller type: blank text is reported
    # as a merchant, anything else as an individual.
    seller_mark = page.select('span.red')[1].text.strip()
    seller_type = u'商家' if seller_mark == '' else u'个人'

    # The second ``.crb_i`` element is printed under the category label.
    category = page.select('.crb_i')[1].text

    print(u'商品标题: %s, 发帖时间: %s,价格: %s, 卖家类型: %s, 区域: %s, 类目: %s' %
          (title, when, price, seller_type, address, category))

    # Pause between consecutive page fetches (presumably to be polite to the
    # server).
    time.sleep(1)

# Fetch and print the detail page behind every collected link.
for info_tag in info_tags:
    info_url = info_tag.get('href')
    # ``Tag.get`` returns None for an anchor without an href attribute; skip
    # it instead of handing None to requests, which would raise.
    if not info_url:
        continue
    parse_info(info_url)