Python 3: scraping vendor names and URLs from the Butian (补天) public SRC
Getting the vendor addresses:
Note: the Cookie header must carry a cookie from an already logged-in session.
import json
import time

import requests


def spider():
    """Walk the paginated vendor list and append each vendor's
    submission URL (built from its company ID) to id.txt."""
    headers = {
        'Host': 'butian.360.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'http://butian.360.cn/Reward/pub/Message/send',
        'Cookie': 'PASTE_YOUR_LOGGED_IN_COOKIE_HERE',
        'Connection': 'keep-alive'
    }
    base = 'http://butian.360.cn/Loo/submit?cid='
    for i in range(1, 149):
        data = {'p': i, 'token': ''}
        time.sleep(3)  # pause between requests so we do not hammer the server
        res = requests.post('http://butian.360.cn/Reward/pub/Message/send',
                            data=data, headers=headers, timeout=(4, 20))
        allResult = json.loads(res.text)
        currentPage = str(allResult['data']['current'])
        currentNum = len(allResult['data']['list'])
        print('Fetching vendor data for page ' + currentPage)
        print('This page lists ' + str(currentNum) + ' vendors')
        for num in range(currentNum):
            item = allResult['data']['list'][num]
            print('Vendor name: ' + item['company_name'] +
                  '\t\tIndustry: ' + item['industry'] +
                  '\t\tVendor ID: ' + item['company_id'])
            with open('id.txt', 'a') as f:
                f.write(base + item['company_id'] + '\n')


if __name__ == '__main__':
    data = {'s': '1', 'p': '1', 'token': ''}
    res = requests.post('http://butian.360.cn/Reward/pub/Message/send', data=data)
    allResult = json.loads(res.text)
    allPages = str(allResult['data']['count'])
    print(allPages + ' pages in total')
    spider()
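For reference, the field accesses in the script imply a response body shaped roughly like the sketch below; the concrete values are invented for illustration:

# Assumed shape of the JSON returned by /Reward/pub/Message/send
# (field names taken from the accesses above; values are invented):
{
    "data": {
        "current": 1,            # page number just served
        "count": 148,            # total page count, printed in __main__
        "list": [
            {
                "company_id": "12345",         # hypothetical ID
                "company_name": "ExampleCorp",
                "industry": "Internet"
            }
        ]
    }
}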
The output looks like this (screenshot omitted). After the script finishes, an id.txt file is created; take a look inside:
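Each line is one submission URL. The cid values below are made up, but the format follows the write in spider():

http://butian.360.cn/Loo/submit?cid=12345
http://butian.360.cn/Loo/submit?cid=12346
http://butian.360.cn/Loo/submit?cid=12347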
Once that's done, we use a second Python script to fetch each vendor's name and URL.
import time

import requests
from bs4 import BeautifulSoup


def Url():
    """Visit every submission URL saved in id.txt, pull the vendor name
    and domain out of the page, and save them to url2.txt / target.txt."""
    headers = {
        'Host': 'butian.360.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'http://butian.360.cn/Reward/pub',
        'Cookie': 'PASTE_YOUR_LOGGED_IN_COOKIE_HERE',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0'
    }
    with open('id.txt', 'r') as f:
        for target in f.readlines():
            target = target.strip()
            getUrl = requests.get(target, headers=headers, timeout=(4, 20))
            info = BeautifulSoup(getUrl.text, 'html.parser')
            url = info.find(name='input', attrs={'name': 'host'})
            name = info.find(name='input', attrs={'name': 'company_name'})
            if url is None or name is None:
                # No form fields on the page: usually an expired cookie or a
                # page that failed to load. Skip instead of crashing.
                print('No vendor data found at ' + target + ', skipping')
                continue
            lastUrl = url.attrs['value']
            print('Vendor: ' + name.attrs['value'] + '\tURL: ' + lastUrl)
            with open('url2.txt', 'a') as liang:
                liang.write('Vendor: %s\tURL: %s\n' % (name.attrs['value'], lastUrl))
            with open('target.txt', 'a') as t:
                t.write(lastUrl + '\n')
            time.sleep(3)
    print('The target is right!')


Url()
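The find() calls above assume the submission page carries the vendor's name and domain in input fields named company_name and host. Here is a minimal, self-contained check of that parsing logic, using a hand-written page fragment with invented attribute values:

from bs4 import BeautifulSoup

# Fragment mimicking the assumed structure of the submission page
# (the attribute values are invented):
sample = '''
<form>
    <input name="company_name" value="ExampleCorp">
    <input name="host" value="www.example.com">
</form>
'''
info = BeautifulSoup(sample, 'html.parser')
print(info.find('input', attrs={'name': 'host'})['value'])          # www.example.com
print(info.find('input', attrs={'name': 'company_name'})['value'])  # ExampleCorp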
Execution result: after the script finishes there are two files, url2.txt and target.txt. url2.txt records each vendor name together with its URL; target.txt records only the URL addresses.
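With the write format used in Url(), the two files come out roughly like this (the vendor data here is invented):

url2.txt:
Vendor: ExampleCorp	URL: www.example.com

target.txt:
www.example.com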
The complete script, in a single .py file:
import json
import time

import requests
from bs4 import BeautifulSoup


def spider():
    """Walk the paginated vendor list and append each vendor's
    submission URL (built from its company ID) to id.txt."""
    headers = {
        'Host': 'butian.360.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'http://butian.360.cn/Reward/pub/Message/send',
        'Cookie': 'PASTE_YOUR_LOGGED_IN_COOKIE_HERE',
        'Connection': 'keep-alive'
    }
    base = 'http://butian.360.cn/Loo/submit?cid='
    for i in range(1, 149):
        data = {'p': i, 'token': ''}
        time.sleep(3)  # pause between requests so we do not hammer the server
        res = requests.post('http://butian.360.cn/Reward/pub/Message/send',
                            data=data, headers=headers, timeout=(4, 20))
        allResult = json.loads(res.text)
        currentPage = str(allResult['data']['current'])
        currentNum = len(allResult['data']['list'])
        print('Fetching vendor data for page ' + currentPage)
        print('This page lists ' + str(currentNum) + ' vendors')
        for num in range(currentNum):
            item = allResult['data']['list'][num]
            print('Vendor name: ' + item['company_name'] +
                  '\t\tIndustry: ' + item['industry'] +
                  '\t\tVendor ID: ' + item['company_id'])
            with open('id.txt', 'a') as f:
                f.write(base + item['company_id'] + '\n')


def Url():
    """Visit every submission URL saved in id.txt, pull the vendor name
    and domain out of the page, and save them to url2.txt / target.txt."""
    headers = {
        'Host': 'butian.360.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'http://butian.360.cn/Reward/pub',
        'Cookie': 'PASTE_YOUR_LOGGED_IN_COOKIE_HERE',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0'
    }
    with open('id.txt', 'r') as f:
        for target in f.readlines():
            target = target.strip()
            getUrl = requests.get(target, headers=headers, timeout=(4, 20))
            info = BeautifulSoup(getUrl.text, 'html.parser')
            url = info.find(name='input', attrs={'name': 'host'})
            name = info.find(name='input', attrs={'name': 'company_name'})
            if url is None or name is None:
                # No form fields on the page: usually an expired cookie or a
                # page that failed to load. Skip instead of crashing.
                print('No vendor data found at ' + target + ', skipping')
                continue
            lastUrl = url.attrs['value']
            print('Vendor: ' + name.attrs['value'] + '\tURL: ' + lastUrl)
            with open('url2.txt', 'a') as liang:
                liang.write('Vendor: %s\tURL: %s\n' % (name.attrs['value'], lastUrl))
            with open('target.txt', 'a') as t:
                t.write(lastUrl + '\n')
            time.sleep(3)
    print('The target is right!')


if __name__ == '__main__':
    data = {'s': '1', 'p': '1', 'token': ''}
    res = requests.post('http://butian.360.cn/Reward/pub/Message/send', data=data)
    allResult = json.loads(res.text)
    allPages = str(allResult['data']['count'])
    print(allPages + ' pages in total')
    spider()
    Url()
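One possible refinement, sketched below: derive the loop bound from the first response instead of hardcoding range(1, 149). This assumes data['count'] really is a page count, which is how the original print statement treats it:

import json

import requests


def spider(total_pages):
    # same body as spider() above, but the loop bound comes from the caller
    for i in range(1, total_pages + 1):
        ...  # fetch and save page i exactly as before


if __name__ == '__main__':
    data = {'s': '1', 'p': '1', 'token': ''}
    res = requests.post('http://butian.360.cn/Reward/pub/Message/send', data=data)
    total_pages = int(json.loads(res.text)['data']['count'])
    print('%d pages in total' % total_pages)
    spider(total_pages)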
roy
July 14, 2018, 2:45 am
I keep getting this error and can't figure out why:
lastUrl = url.attrs['value']
AttributeError: 'NoneType' object has no attribute 'attrs'
print("")
July 14, 2018, 10:29 am
You probably haven't set the URL.