python3 爬补天公益src 厂商名称和url

作者: print("") 分类: python 发布时间: 2018-06-02 22:16

获取厂商的地址:

注意: cookie 是已经登陆的cookie

import json
import requests
import time
from bs4 import BeautifulSoup

def spider():
    '''
    :return:
    '''
    headers = {
        'Host': 'butian.360.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'http://butian.360.cn/Reward/pub//Message/send',
        'Cookie': '这里是已经登陆的cookie!!!!!',
        'Connection': 'keep-alive'
    }
    for i in range(1,149):
        data={
            'p': i,
            'token': ''
        }
        time.sleep(3)
        res = requests.post('http://butian.360.cn/Reward/pub/Message/send', data=data,headers=headers,timeout=(4,20))
        allResult = {}
        allResult = json.loads(res.text)
        currentPage = str(allResult['data']['current'])
        currentNum = str(len(allResult['data']['list']))
        print('正在获取第' + currentPage + '页厂商数据')
        print('本页共有' + currentNum + '条厂商')
        for num in range(int(currentNum)):
            print('厂商名字:'+allResult['data']['list'][int(num)]['company_name']+'\t\t厂商类型:'+allResult\
                  ['data']['list'][int(num)]['industry']+'\t\t厂商ID:'+allResult['data']['list'][int(num)]['company_id'])
            base='http://butian.360.cn/Loo/submit?cid='
            with open('id.txt','a') as f:
                f.write(base+allResult['data']['list'][int(num)]['company_id']+'\n')

if __name__=='__main__':
    data = {
            's': '1',
            'p': '1',
            'token': ''
        }
    res = requests.post('http://butian.360.cn/Reward/pub/Message/send', data=data)
    allResult = {}
    allResult = json.loads(res.text)
    allPages = str(allResult['data']['count'])
    print('共' + allPages + '页')
    spider()
  

如下:

执行完之后会有一个id.txt 

查看一下

获取完之后, 我们在用另一个py 获取厂商的名字和厂商的URL

import json
import requests
import time
from bs4 import BeautifulSoup
def Url():
    '''
    遍历所有的ID
    取得对应的域名
    保存为target.txt
    :return:
    '''
    headers={
        'Host':'butian.360.cn',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Referer':'http://butian.360.cn/Reward/pub',
        'Cookie': '已经登陆的cookie!!!!',
        'Connection':'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control':'max-age=0'
    }
    with open('id.txt','r') as f:
        for target in f.readlines():
            target=target.strip()
            getUrl=requests.get(target,headers=headers,timeout=(4,20))
            result=getUrl.text
            info=BeautifulSoup(result)
            url=info.find(name='input',attrs={"name":"host"})
            name = info.find(name='input', attrs={"name": "company_name"})
            lastUrl=url.attrs['value']
            print('厂商:' + name.attrs['value'] + '\t网址:' + url.attrs['value'])

            url2="'厂商:' %s '\t网址:' %s "%(name.attrs['value'],url.attrs['value'])
            with open('url2.txt','a') as liang:
                liang.write(url2+'\n')
            with open('target.txt','a') as t:
                t.write(lastUrl+'\n')
            time.sleep(3)
    print('The target is right!')
Url()

执行结果

执行完之后会有两个文件一个是url2.txt 一个是target.txt  

url2.txt 是记录着是厂商和url 

target.txt 是只记录url地址

完整的一个py如下:

import json
import requests
import time
from bs4 import BeautifulSoup

def spider():
    '''
    :return:
    '''
    headers = {
        'Host': 'butian.360.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'http://butian.360.cn/Reward/pub//Message/send',
        'Cookie': '这里是已经登陆的cookie!!!!!',
        'Connection': 'keep-alive'
    }
    for i in range(1,149):
        data={
            'p': i,
            'token': ''
        }
        time.sleep(3)
        res = requests.post('http://butian.360.cn/Reward/pub/Message/send', data=data,headers=headers,timeout=(4,20))
        allResult = {}
        allResult = json.loads(res.text)
        currentPage = str(allResult['data']['current'])
        currentNum = str(len(allResult['data']['list']))
        print('正在获取第' + currentPage + '页厂商数据')
        print('本页共有' + currentNum + '条厂商')
        for num in range(int(currentNum)):
            print('厂商名字:'+allResult['data']['list'][int(num)]['company_name']+'\t\t厂商类型:'+allResult\
                  ['data']['list'][int(num)]['industry']+'\t\t厂商ID:'+allResult['data']['list'][int(num)]['company_id'])
            base='http://butian.360.cn/Loo/submit?cid='
            with open('id.txt','a') as f:
                f.write(base+allResult['data']['list'][int(num)]['company_id']+'\n')
def Url():
    '''
    遍历所有的ID
    取得对应的域名
    保存为target.txt
    :return:
    '''
    headers={
        'Host':'butian.360.cn',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Referer':'http://butian.360.cn/Reward/pub',
        'Cookie': '已经登陆的cookie!!!! ',
        'Connection':'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control':'max-age=0'
    }
    with open('id.txt','r') as f:
        for target in f.readlines():
            target=target.strip()
            getUrl=requests.get(target,headers=headers,timeout=(4,20))
            result=getUrl.text
            info=BeautifulSoup(result)
            url=info.find(name='input',attrs={"name":"host"})
            name = info.find(name='input', attrs={"name": "company_name"})
            lastUrl=url.attrs['value']
            print('厂商:' + name.attrs['value'] + '\t网址:' + url.attrs['value'])
			url2="'厂商:' %s '\t网址:' %s "%(name.attrs['value'],url.attrs['value'])
            with open('url2.txt','a') as liang:
                liang.write(url2+'\n')
            with open('target.txt','a') as t:
                t.write(lastUrl+'\n')
            time.sleep(3)
    print('The target is right!')
if __name__=='__main__':

    data = {
            's': '1',
            'p': '1',
            'token': ''
        }
    res = requests.post('http://butian.360.cn/Reward/pub/Message/send', data=data)
    allResult = {}
    allResult = json.loads(res.text)
    allPages = str(allResult['data']['count'])
    print('共' + allPages + '页')
    spider()
    Url()

如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!

3 条评论
  • video buzz du moment 2017

    2018年6月3日 下午3:13

    Certains blogs proposent l’échange d’articles.

  • roy

    2018年7月14日 上午2:45

    不知道为什么一直报错
    lastUrl=url.attrs[‘value’]
    AttributeError: ‘NoneType’ object has no attribute ‘attrs’

    1. print("")

      2018年7月14日 上午10:29

      你没有设置URL吧

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注