python爬虫抓取房产信息

一、基本思路
1、使用 Python 的 urllib2 抓取网页，并将页面内容下载到本地
2、使用 BeautifulSoup4 对网页信息进行分析，获得所需的字段信息
3、建立sqlite3轻型数据库用于保存网页信息
二、实施工艺
1、python使用urllib2抓取网页信息
# coding:utf-8import urllib2import timemax_tries=3class htmldownloader(object): def download(self, url): if url is none: return none useragent='mozilla/5.0 (windows nt 6.2; wow64) applewebkit/537.36 (khtml, like gecko) chrome/43.0.2357.134 safari/537.36' #print url opener = urllib2.request(url) opener.add_header('user-agent', useragent) opener.add_header('referer','downloader() self.nextpage=crawurl self.pool=threadpool() def load_sql_storage(self,dbname): self.sqldir = os.path.join('./anjuke','%s.db'%dbname) with sqlite3client(self.sqldir) as s3c: s3c.execute('create table if not exists houses (bianma text,faburiqi text,title text, xiaoqu text,xiaoqu_url, fangxing text,weizhi text,mianji numeric,danjia numeric,niandai text,chaoxiang text,zhongjie text,dianhua text,primary key(bianma))')
# NOTE(review): the listing above is damaged. The text breaks off inside the
# 'referer' header string of htmldownloader.download() and resumes in the
# middle of another class's constructor (self.nextpage=crawurl ...), so the
# rest of download() and the header/imports of the spider class are missing
# from this text.
# NOTE(review): identifier casing looks flattened (none, urllib2.request);
# presumably None and urllib2.Request were intended - verify against the
# original listing before running.
# load_sql_storage(dbname) stores './anjuke/<dbname>.db' in self.sqldir and
# creates the 'houses' table (primary key: bianma) if it does not exist yet.
# NOTE(review): both functions take ``self`` and read self.down, self.pool and
# self.sqldir; they are presumably methods of the spider class whose header is
# not visible in this part of the listing.
def down_titles(self, downurl, callback):
    """Download one listing page and hand every house link to ``callback``.

    Args:
        downurl: URL of the listing page to fetch.
        callback: callable applied to each detail-page URL through
            ``self.pool.map``.

    Returns:
        The ``href`` of the next-page link, or ``None`` when the download
        fails, the page has no next-page link, or any error occurs (the
        error is printed, not raised).
    """
    try:
        htmldoc = self.down.download(downurl)
        if htmldoc is None:
            print('error')
            return None
        soup = BeautifulSoup(htmldoc, 'html.parser', from_encoding='utf8')
        # NOTE(review): the CSS class names below are spelled as they appear
        # in the article text, whose casing looks flattened; confirm the
        # exact spelling against the live page markup.
        hreflist = soup.find_all('a', class_='houselisttitle')
        urls = [hr['href'] for hr in hreflist]
        self.pool.map(callback, urls)
        # Link to the following listing page; absent on the last page.
        nextpage = soup.find('a', class_='anxt')
        if nextpage is None:
            return None
        return nextpage['href']
    except Exception as error:
        # Best-effort crawl: report the problem and stop paging.
        print(error)
        return None


def down_article(self, downurl):
    """Download one house detail page, parse its fields and store them.

    The parsed values are inserted into the ``houses`` table of the
    database at ``self.sqldir``. Any failure is printed and swallowed so
    that one bad page does not abort the crawl.

    Args:
        downurl: URL of the detail page to fetch.

    Returns:
        Always ``None``.
    """
    try:
        htmldoc = self.down.download(downurl)
        if htmldoc is None:
            print('error')
            return None
        soup = BeautifulSoup(htmldoc, 'html.parser', from_encoding='utf8')

        # Header text holds "<label>：<house code>，<label>：<publish date>",
        # separated by a full-width comma and full-width colons.
        house_encode = soup.find('span', class_='house-encode').text
        code_part, date_part = house_encode.split(u'，')[:2]
        bianma = code_part.split(u'：')[1]
        faburiqi = date_part.split(u'：')[1]

        title = soup.find('h3', class_='long-title').text

        # First column: community (with link), location, build year.
        first_dds = soup.find('p', class_='first-col detail-col').find_all('dd')
        xiaoqu = first_dds[0].text
        xiaoqu_url = first_dds[0].a['href']
        weizhi = first_dds[1].text
        niandai = first_dds[2].text.split(u'年')[0]

        # Second column: layout, floor area, orientation.
        second_dds = soup.find('p', class_='second-col detail-col').find_all('dd')
        fangxing = second_dds[0].text
        mianji = second_dds[1].text.split(u'平')[0]
        chaoxiang = second_dds[2].text

        # Third column: the unit price is the second <dd>.
        third_dds = soup.find('p', class_='third-col detail-col').find_all('dd')
        danjia = third_dds[1].text.split(u'元')[0]

        # Broker name and phone number.
        zhongjie = soup.find('p', class_='brokercard-name').text
        dianhua = soup.find('p', class_='broker-mobile').text

        # sqlite3client is used as a context manager elsewhere in this
        # listing (load_sql_storage); do the same here so the handle is
        # released after the insert.
        with sqlite3client(self.sqldir) as dbsave:
            dbsave.insert_data('houses', [
                bianma, faburiqi, title, xiaoqu, xiaoqu_url, fangxing,
                weizhi, mianji, danjia, niandai, chaoxiang, zhongjie,
                dianhua,
            ])
        print(u'成功保存：%s' % title)
    except Exception as error:
        print(u'错误: %s' % error)
# NOTE(review): takes ``self`` and is called as pyspider.batchdowntitles()
# below, so it is presumably a method of the spider class whose header is not
# visible in this part of the listing.
def batchdowntitles(self, delay=5):
    """Crawl listing pages one after another until no next page remains.

    Starts from ``self.nextpage`` and repeatedly replaces it with the value
    returned by ``self.down_titles``; the loop ends once that value is
    ``None``.

    Args:
        delay: seconds to sleep after each page. Defaults to 5, the pause
            the crawler originally hard-coded.
    """
    cnt = 0
    while self.nextpage is not None:
        self.nextpage = self.down_titles(self.nextpage, self.down_article)
        sleep(delay)
        cnt += 1
        print(u'完成下载第' + str(cnt) + u'页')


if __name__ == '__main__':
    pyspider = spidermain()
    pyspider.batchdowntitles()
4、运行显示界面：
在eclipse的console下运行状态
5、用sqlite expert查看数据并导出
sqlite expert软件查看数据
全选数据可以导出成excel表
可以将数据导出excel表
也可以用sql语句查询
查询小区均价排行
用技术玩转房商

python爬虫抓取房产信息

VIP推荐