
Python Web Crawler Internship Report


Contents

I. Topic Selection Background
II. How Crawlers Work
III. Crawler History and Classification
IV. Comparison of Common Crawler Frameworks
V. Data Scraping in Practice (Douban movie data)
  1. Analyzing the page
  2. Scraping the data
  3. Cleaning and converting the data
  4. Saving and presenting the data
  5. Technical difficulties and key points
VI. Summary

I. Topic Selection Background

II. How Crawlers Work

III. Crawler History and Classification

IV. Comparison of Common Crawler Frameworks

Scrapy: a relatively mature Python crawler framework. It is a fast, high-level information-scraping framework written in Python that can efficiently crawl web pages and extract structured data. Scrapy is used across a wide range of scenarios: crawler development, data mining, data monitoring, and automated testing.
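To make the comparison concrete, here is a minimal sketch of a Scrapy spider. The spider name and field names are illustrative, and the CSS selectors assume the Douban Top 250 markup used later in this report:

    import scrapy

    class DoubanSpider(scrapy.Spider):
        name = "douban_top250"                          # illustrative spider name
        start_urls = ["https://movie.douban.com/top250"]

        def parse(self, response):
            # yield one structured item per movie entry
            for li in response.css("ol.grid_view li"):
                yield {
                    "title": li.css("span.title::text").get(),
                    "rank": li.css("div.pic em::text").get(),
                }
            # follow pagination so the whole list is crawled
            next_page = response.css("span.next a::attr(href)").get()
            if next_page:
                yield response.follow(next_page, self.parse)

Running it with `scrapy runspider spider.py -o movies.json` would write the structured items out as JSON.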

Crawley: another crawler framework written in Python; it aims to change the way people extract data from the internet.

Portia: a crawler framework that lets users with no programming background scrape web pages visually.

newspaper: a Python crawler framework for extracting news and articles and for content analysis.
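A short sketch of the kind of extraction newspaper performs (the URL below is a placeholder):

    from newspaper import Article

    article = Article('https://example.com/some-news-story')  # placeholder URL
    article.download()   # fetch the raw HTML
    article.parse()      # extract title, body text, authors, images, ...
    print(article.title)
    print(article.text)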

Python-goose: a framework that can extract (1) the main body of an article; (2) the article's main image; (3) any YouTube/Vimeo videos embedded in the article; (4) the meta description; and (5) the meta tags.
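A sketch of that extraction. I am assuming the maintained goose3 fork here, since the original Python-goose targets Python 2; the URL is a placeholder:

    from goose3 import Goose

    g = Goose()
    article = g.extract(url='https://example.com/some-news-story')  # placeholder URL
    print(article.title)
    print(article.cleaned_text)      # (1) main body content
    print(article.meta_description)  # (4) meta description
    if article.top_image:            # (2) main article image
        print(article.top_image.src)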

V. Data Scraping in Practice (scraping movie data from Douban)

1. Analyzing the page

    import urllib.request

    # Fetch the HTML source of each Top 250 page
    def __getHtml():
        data = []
        pageNum = 1
        pageSize = 0
        try:
            # stops after 6 pages (150 entries); use <= 225 to cover all 250
            while (pageSize <= 125):
                # headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                #            'Referer': None}  # note: if pages still cannot be fetched, set the target site's Host here
                # opener = urllib.request.build_opener()
                # opener.addheaders = [headers]
                url = "https://movie.douban.com/top250?start=" + str(pageSize) + "&filter=" + str(pageNum)
                # data['html%s' % i] = urllib.request.urlopen(url).read().decode("utf-8")
                data.append(urllib.request.urlopen(url).read().decode("utf-8"))
                pageSize += 25
                pageNum += 1
                print(pageSize, pageNum)
        except Exception as e:
            raise e
        return data
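If Douban rejects the plain urllib request (the commented-out headers above hint at this), an equivalent fetch loop can send a browser-like User-Agent with requests. This is a sketch, not part of the original report; get_html_with_headers is a name introduced here:

    import requests

    def get_html_with_headers():
        # same six-page loop as above, but with an explicit User-Agent
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
        pages = []
        for start in range(0, 150, 25):
            url = 'https://movie.douban.com/top250?start=%d&filter=' % start
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()  # fail loudly on 4xx/5xx
            pages.append(resp.text)
        return pages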

2. Scraping the data

    from bs4 import BeautifulSoup

    def __getData(html):
        title = []               # movie title
        #rating_num = []         # rating
        range_num = []           # rank
        #rating_people_num = []  # number of raters
        movie_author = []        # director
        data = {}
        # parse the HTML with bs4; selectors truncated in the source are
        # restored here from the standard Douban Top 250 markup
        soup = BeautifulSoup(html, "html.parser")
        for li in soup.find("ol", class_="grid_view").find_all("li"):
            title.append(li.find("span", class_="title").text)
            #rating_num.append(li.find("div", class_="star").find("span", class_="rating_num").text)
            range_num.append(li.find("div", class_="pic").find("em").text)
            #spans = li.find("div", class_="star").find_all("span")
            #for x in range(len(spans)):
            #    if x <= 2:
            #        pass
            #    else:
            #        rating_people_num.append(spans[x].string[-len(spans[x].string):-3])
            info = li.find("div", class_="bd").find("p").text  # the "导演: ... 主演: ..." line
            index = info.find("主")
            if index == -1:
                index = info.find("...")
            print(li.find("div", class_="bd").find("p").text)
            if index == -1:  # neither marker found: fall back to a fixed cut-off (reconstructed from the garbled source)
                index = 60
            # print("aaa")
            # print(info[4:index])
            movie_author.append(info[4:index])  # skip the leading "导演: " prefix
        data['title'] = title
        #data['rating_num'] = rating_num
        data['range_num'] = range_num
        #data['rating_people_num'] = rating_people_num
        data['movie_author'] = movie_author
        return data
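To see what these selectors pick out, here is a self-contained demonstration on a toy fragment modeled on the Douban markup (the fragment is simplified, not the real page):

    from bs4 import BeautifulSoup

    sample = """
    <ol class="grid_view">
      <li>
        <div class="pic"><em>1</em></div>
        <span class="title">肖申克的救赎</span>
        <div class="bd"><p>导演: 弗兰克·德拉邦特 主演: 蒂姆·罗宾斯</p></div>
      </li>
    </ol>
    """

    soup = BeautifulSoup(sample, "html.parser")
    li = soup.find("ol", class_="grid_view").find("li")
    print(li.find("span", class_="title").text)          # 肖申克的救赎
    print(li.find("div", class_="pic").find("em").text)  # 1 (the rank)
    info = li.find("div", class_="bd").find("p").text
    print(info[info.find("导演"):info.find("主")])       # the director slice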

3. Cleaning and converting the data

    import datetime

    nowtime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # timestamp written into the page

    # Write the scraped data out as a simple HTML table. The HTML string
    # literals were stripped from the source text; they are restored here
    # to match the table the code evidently produced.
    def __getMovies(datas):
        f = open('F://douban_movie.html', 'w', encoding='utf-8')
        f.write("<html>")
        f.write("<head><meta charset='UTF-8'><title>Insert title here</title></head>")
        f.write("<body>")
        f.write("<h1>爬取豆瓣电影</h1>")        # page heading: "Douban movies scraped"
        f.write("<p>作者:刘文斌</p>")           # author
        f.write("<p>时间:" + nowtime + "</p>")  # timestamp
        f.write("<hr>")
        f.write("<table border='1' align='center'>")
        f.write("<tr>")
        f.write("<th>电影</th>")       # movie
        #f.write("<th>评分</th>")      # rating
        f.write("<th>排名</th>")       # rank
        #f.write("<th>评价人数</th>")  # number of raters
        f.write("<th>导演</th>")       # director
        f.write("</tr>")
        for data in datas:
            for i in range(0, 25):
                f.write("<tr>")
                f.write("<td>%s</td>" % data['title'][i])
                # f.write("<td>%s</td>" % data['rating_num'][i])
                f.write("<td>%s</td>" % data['range_num'][i])
                # f.write("<td>%s</td>" % data['rating_people_num'][i])
                f.write("<td>%s</td>" % data['movie_author'][i])
                f.write("</tr>")
        f.write("</table>")
        f.write("</body>")
        f.write("</html>")
        f.close()

    if __name__ == '__main__':
        datas = []
        htmls = __getHtml()
        for i in range(len(htmls)):
            data = __getData(htmls[i])
            datas.append(data)
        __getMovies(datas)

4. Saving and presenting the data

The result (the generated HTML page) is shown in the figure that follows.
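The report saves the result as a hand-built HTML table. As an alternative sketch (not from the original report), the same data structure could be written to CSV with the standard library; save_csv is a name introduced here for illustration:

    import csv

    def save_csv(datas, path='douban_movie.csv'):
        # one row per movie, flattened across the per-page dicts
        with open(path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['title', 'rank', 'director'])
            for data in datas:
                for row in zip(data['title'], data['range_num'], data['movie_author']):
                    writer.writerow(row)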

5. Technical difficulties and key points

Data scraping in practice (scraping housing data from SouFun, fang.com)

    from bs4 import BeautifulSoup
    import requests

    rep = requests.get('http://newhouse.fang.com/top/')
    rep.encoding = "gb2312"  # set the encoding (the page is GBK-encoded)
    html = rep.text
    soup = BeautifulSoup(html, 'html.parser')

    # Write the results out as a simple HTML table. As above, the HTML string
    # literals were stripped from the source and are restored here.
    f = open('F://fang.html', 'w', encoding='utf-8')
    f.write("<html>")
    f.write("<head><meta charset='UTF-8'><title>Insert title here</title></head>")
    f.write("<body>")
    f.write("<h1>新房成交TOP3</h1>")  # heading: "Top 3 new-home transactions"
    f.write("<table border='1' align='center'>")
    f.write("<tr>")
    f.write("<th>房址</th>")    # address
    f.write("<th>成交量</th>")  # transaction volume
    f.write("<th>均价</th>")    # average price
    f.write("</tr>")

    # The class names below were truncated in the source; the ones used here
    # are best-effort placeholders for the page structure the code targets.
    for li in soup.find("ul", class_="house-list").find_all("li"):
        name = li.find("div", class_="name").find("p").text
        chengjiaoliang = li.find("span", class_="num").text
        try:
            # the source called a helper on the price text; a plain replace of the
            # mojibake '㎡' with '平方米' reproduces the intent
            junjia = li.find("div", class_="price").find("p", class_="gray-09").text.replace('㎡', '平方米')
        except Exception:
            junjia = li.find("div", class_="gray-09").text.replace('㎡', '平方米')
        f.write("<tr>")
        f.write("<td>%s</td>" % name)
        f.write("<td>%s</td>" % chengjiaoliang)
        f.write("<td>%s</td>" % junjia)
        f.write("</tr>")
        print(name)

    f.write("</table>")
    f.write("</body></html>")
    f.close()

VI. Summary

Instructor's comments:

Grade:    Advisor:
