Simple web scraping with Python
The requests module
Installation
pip install requests
Using a packet-capture tool
For example, in Chrome:
Press F12 -> Network -> click XHR -> refresh the page -> select a request from the list below
to see the request and response data it carries
1. Specify the URL
UA spoofing
headers
the request headers
User-Agent
the user agent, which normally identifies a browser
Example:
This User-Agent value was copied from the capture view in my Chrome browser's DevTools
headers = {
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
}
Request parameters
params
for GET requests
data
for POST requests
2. Send the request
requests.get(url, params=params, headers=headers)
requests.post(url, data=data, headers=headers)
Plain page loads are usually GET requests, while Ajax calls are often POST; see the sketch below
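A minimal sketch of both call styles; httpbin.org is used here only as a stand-in echo service and is not part of the cases below:

import requests

headers = {'User-Agent': 'Mozilla/5.0'}  # any UA string works for this demo

# GET: key/value pairs in params are encoded into the query string (?q=python)
get_resp = requests.get('https://httpbin.org/get', params={'q': 'python'}, headers=headers)

# POST: key/value pairs in data are sent as a form-encoded request body
post_resp = requests.post('https://httpbin.org/post', data={'kw': 'python'}, headers=headers)

print(get_resp.url)           # shows the final URL including the query string
print(post_resp.status_code)  # 200 if the request succeeded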
3. Get the response data
response = requests.get(url, params=params, headers=headers)
response.text
text data
response.json()
JSON (Ajax) data
Check the Content-Type header to see which kind of data you are dealing with
application/json
--> JSON data
application/x-www-form-urlencoded
--> text data
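A small sketch of choosing between .text and .json() based on the response's Content-Type header; the httpbin.org URL is just a placeholder endpoint that happens to return JSON:

import requests

response = requests.get('https://httpbin.org/json')  # placeholder endpoint
content_type = response.headers.get('Content-Type', '')

if 'application/json' in content_type:
    data = response.json()  # parsed into Python dicts/lists
else:
    data = response.text    # raw text such as HTML

print(content_type, type(data))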
4. Persist the data
json.dump()
stores a JSON object
f.write()
stores text data or a JSON string
Databases can take over the storage later; the sketch below shows both file-based options
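A minimal sketch of both file-based options (the file names and sample values are placeholders):

import json

page_text = '<html>...</html>'         # pretend this came from response.text
data_json = {'kw': 'dog', 'data': []}  # pretend this came from response.json()

# text data or an already-serialized JSON string: write it directly
with open('./page.html', 'w', encoding='utf-8') as f:
    f.write(page_text)

# a Python object: let json.dump serialize it; ensure_ascii=False keeps non-ASCII characters readable
with open('./data.json', 'w', encoding='utf-8') as f:
    json.dump(data_json, f, ensure_ascii=False)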
Examples
Case 1: scrape the Sogou homepage
import requests

if __name__ == "__main__":
    # 1. Specify the URL
    url = "https://www.sogou.com/"
    # 2. Send the request
    response = requests.get(url=url)
    # 3. Get the response data
    page_text = response.text
    # 4. Persist it to a local file
    with open("./sougou.html", "w", encoding="utf-8") as f:
        f.write(page_text)
Case 2: scrape Sogou keyword search results
# UA spoofing via the headers argument, query string built from params
import requests

if __name__ == "__main__":
    # 1. Base URL; the keyword is appended as a query parameter
    url = 'https://www.sogou.com/web'
    # 1.1 Build the parameter dict
    kw = input(':>>')
    params = {
        'query': kw
    }
    # 2. Build the request headers (UA spoofing)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
    }
    # 3. Send the request
    response = requests.get(url=url, params=params, headers=headers)
    # 4. Get the response text
    page_text = response.text
    # 5. Persist it to a local file
    fileName = kw + '.html'
    with open(fileName, "w", encoding="utf-8") as f:
        f.write(page_text)
    print(fileName, "saved successfully")
Case 3: scrape Baidu Translate (the sug suggestion endpoint)
import requests
import json

if __name__ == "__main__":
    post_url = "https://fanyi.baidu.com/sug"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
    }
    msg = input(">>:")
    # form data for the POST request
    data = {
        "kw": msg
    }
    response = requests.post(url=post_url, data=data, headers=headers)
    # the sug endpoint returns JSON, so parse it directly
    data_json = response.json()
    # persist the parsed object; the with-block also closes the file
    fileName = msg + ".json"
    with open(fileName, "w", encoding="utf-8") as fp:
        json.dump(data_json, fp=fp, ensure_ascii=False)
    print("written successfully")
Case 4: scrape the Douban movie chart
import requests
import json

if __name__ == "__main__":
    url = "https://movie.douban.com/j/chart/top_list"
    # query-string parameters copied from the XHR request seen in DevTools
    params = {
        'type': '11',
        'interval_id': '100:90',
        'action': '',
        'start': '0',
        'limit': '20',
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
    }
    response = requests.get(url=url, params=params, headers=headers)
    # the endpoint returns JSON
    movie_list = response.json()
    # persist the parsed data; the with-block also closes the file
    with open("./movie.json", "w", encoding="utf-8") as fp:
        json.dump(movie_list, fp=fp, ensure_ascii=False)
    print("scrape finished")
Case 5: scrape all KFC restaurants in Beijing
import requests
import os

if __name__ == "__main__":
    post_url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
    }
    # form data; "北京" (Beijing) is the search keyword, pageIndex is overwritten in the loop
    data = {
        "cname": "",
        "pid": "",
        "keyword": "北京",
        "pageIndex": "1",
        "pageSize": "10"
    }
    start = 1
    end = 8
    if not os.path.exists('./beijing'):
        os.mkdir('./beijing')
    for i in range(start, end + 1):
        data['pageIndex'] = str(i)
        # send the POST request for this page
        response = requests.post(url=post_url, data=data, headers=headers)
        response_json_str = response.text
        fileName = "./beijing/beijing%s.json" % i
        with open(fileName, "w", encoding="utf-8") as fp:
            fp.write(response_json_str)
        print("page %s scraped" % i)
Case 6: scrape detail-page data from the drug administration (药监局) site
# Scrape license information from the drug administration site
import requests
import json
import os
import time
import random

if __name__ == "__main__":
    # the index page loads its data via an Ajax POST request
    index_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
    }
    # changing "page" selects which page of results is returned
    index_data = {
        "on": "true",
        "page": "1",
        "pageSize": "15",
        "productName": "",
        "conditionType": "1",
        "applyname": "",
        "applysn": ""
    }
    max_page_num = 20
    for page_num in range(1, max_page_num):  # scrape each index page
        index_data['page'] = page_num
        index_json = requests.post(url=index_url, data=index_data, headers=headers).json()
        # parse this page's list of enterprises
        enterprise_list = index_json['list']
        # each detail page is also loaded via an Ajax POST, keyed by the enterprise ID
        desc_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
        desc_data = {
            "id": ""
        }
        # collect the detail JSON of every enterprise on this page
        pre_page_data = []
        for enterprise in enterprise_list:
            desc_data["id"] = enterprise["ID"]
            desc_json = requests.post(url=desc_url, data=desc_data, headers=headers).json()
            pre_page_data.append(desc_json)
        # persist one JSON file per page
        if not os.path.exists('./药监局数据'):
            os.mkdir('./药监局数据')
        fileName = "./药监局数据/page%d.json" % page_num
        with open(fileName, "w", encoding="utf-8") as fp:
            json.dump(pre_page_data, fp=fp, ensure_ascii=False)
        print("page %d finished" % page_num)
        # optional: sleep a random fraction of a second before the next page to mimic human browsing
        d = random.random()
        print("sleeping for", d, "seconds before the next page")
        time.sleep(d)