eth-top-address

eth top数据来源于这里
如何获得以上数据:
将以下代码复制到可以访问外网的服务器上运行即可。脚本每隔 0~2 秒抓取一个页面,请求太频繁会被封禁。如果运行一段时间后报错,很可能是被封了:根据已输出的最后一个序号推算出执行到了哪一页(400 页共 10000 条,即每页 25 条),然后从下一页开始重新运行即可。

服务器上需要安装 beautifulsoup4 和 brotli;代码还用到了 requests 以及 lxml 解析器,如果缺少,直接用 pip 安装即可。

# -*- coding:utf-8 -*-

import os, sys
import logging
import requests
#import socks, socket
from bs4 import BeautifulSoup 
from importlib import reload
import random
import brotli

# NOTE: the former `reload(sys)` call was removed. It is a Python 2 idiom
# (used there to re-enable sys.setdefaultencoding) and has no purpose on
# Python 3, where reloading `sys` is explicitly discouraged.

# Directory that receives the log file; '/' is mapped to the platform separator.
PY_GEN_PATH = "/home/test/data/".replace('/', os.sep)

# URL template of the paged account listing; '{}' is filled with the page number.
ETH_ADDR_URL = 'https://etherscan.io/accounts/{}'
logger = logging.getLogger('eth_addr')
# File name (inside PY_GEN_PATH) that the logger writes to.
LOG_FILE = 'addr.log'
# Message-only format: every log record is written as the bare message text.
LOG_FORMATTER = '%(message)s'

# Single HTTP session shared by every request issued through get_url().
s = requests.Session()


# Pool of browser User-Agent strings; one is chosen at random for each request.
_UA_LST = (
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
    'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
)


def get_url(url, refer, timeout=30):
    """Fetch ``url`` through the shared session and return the page text.

    Args:
        url: Absolute etherscan.io URL to download.
        refer: Value sent in the ``referer`` request header.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        The response body decoded as UTF-8 when the status code is 200;
        ``None`` for any other status code or when the request fails. Every
        failure is reported through ``logger.error``.
    """
    ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
    ACCEPT_ENCODING = "gzip, deflate, br"
    ACCEPT_LANGUAGE = "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7"
    AUTHORITY = "etherscan.io"

    path = url.replace('https://etherscan.io', '')

    cookie_my = '×××××'  # placeholder: replace with your own etherscan.io cookie
    # NOTE(review): AUTHORITY/METHOD/path/scheme look like HTTP/2
    # pseudo-headers copied from browser dev tools; they are sent here as
    # ordinary headers -- confirm whether the site actually needs them.
    header = {"AUTHORITY":AUTHORITY, "METHOD":"GET", "path":path,
              "scheme":"https", "Accept":ACCEPT, "Accept-Encoding":ACCEPT_ENCODING,
              "Accept-Language":ACCEPT_LANGUAGE, "cache-control":"no-cache",
              "cookie":cookie_my, "pragma": "no-cache", "referer":refer,
              "sec-fetch-dest":"document", "sec-fetch-mode":"navigate",
              "sec-fetch-site":"same-origin", "sec-fetch-user":"?1",
              "upgrade-insecure-requests":"1",
              "sec-ch-ua-mobile":"?0", "sec-ch-ua-platform":"Windows",
              "User-Agent":random.choice(_UA_LST)
    }
    try:
        r = s.get(url, headers=header, timeout=timeout)
        sc = r.status_code
        if sc != 200:
            logger.error('response code:{}'.format(str(sc)))
            # As before, the body of a brotli-encoded error page is logged
            # for diagnosis. The previous extra brotli.decompress() call is
            # gone: its result was discarded, so it could only raise.
            if r.headers.get('Content-Encoding') == 'br':
                logger.error(r.text)
            return None

        # The previous code decoded r.content as UTF-8 directly (without any
        # brotli step) for 'br' responses and used r.text otherwise; with the
        # encoding forced to UTF-8 both paths decode the same bytes, so one
        # suffices. Unlike the strict decode, r.text does not fail on
        # invalid UTF-8.
        r.encoding = 'utf-8'
        return r.text
    except Exception as e:
        # Best-effort contract: callers receive None instead of an exception.
        logger.error(e)
        return None


def config_logger():
    """Attach a file handler and a console handler to the module logger.

    The log file is ``LOG_FILE`` inside ``PY_GEN_PATH``; the directory is
    created when missing. Both handlers use ``LOG_FORMATTER`` and emit
    everything from DEBUG upwards. Calling the function again is a no-op
    once handlers are attached, so output lines are never duplicated.
    """
    logger.setLevel(logging.DEBUG)
    if logger.handlers:
        # Already configured: adding the handlers again would write every
        # record twice.
        return

    # The original passed the decimal literal 777 as the mode (which is
    # 0o1411, not rwxrwxrwx); use the octal value that was intended.
    # exist_ok avoids the race between checking for the directory and
    # creating it.
    # The former "doesn't exist" info message is dropped: it was logged
    # before any handler was attached to this logger.
    os.makedirs(PY_GEN_PATH, mode=0o777, exist_ok=True)

    fmter = logging.Formatter(LOG_FORMATTER)

    # File output.
    handler = logging.FileHandler(os.path.join(PY_GEN_PATH, LOG_FILE),
                                  encoding='utf-8')
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(fmter)
    logger.addHandler(handler)

    # Console display.
    console = logging.StreamHandler()
    console.setLevel(level=logging.DEBUG)
    console.setFormatter(fmter)
    logger.addHandler(console)


def main(start_page=1, end_page=400):
    """Crawl the account listing pages ``start_page`` .. ``end_page``.

    Args:
        start_page: First page number to fetch (inclusive). Pass a later
            page to resume a crawl that was interrupted.
        end_page: Last page number to fetch (inclusive).

    Each page is handed to ``parseEthAccount`` together with the URL of the
    preceding page as referer. Before every request the loop sleeps a random
    whole number of seconds between 0 and 2.
    """
    import random
    import time

    for p in range(start_page, end_page + 1):
        # Page 1 uses the bare listing URL as referer; later pages use the
        # page that precedes them.
        if p == 1:
            refer_url = ETH_ADDR_URL.format('')
        else:
            refer_url = ETH_ADDR_URL.format(str(p - 1))

        addr_url = ETH_ADDR_URL.format(str(p))
        time.sleep(random.randint(0, 2))
        parseEthAccount(addr_url, refer_url)

def parseEthAccount(url, refer):
    """Download one listing page and log a ``num|addr|balance`` line per row.

    Args:
        url: URL of the listing page to fetch.
        refer: Referer value forwarded to ``get_url``.

    For each ``<tr>`` in the page's first ``<tbody>`` the text of the first
    cell, of the row's first link and of the fourth cell is written through
    ``logger.info``. If the page cannot be fetched or contains no table, an
    error naming the URL is logged and the function returns without raising.
    """
    html = get_url(url, refer)
    if html is None:
        # get_url returns None on any failure; name the page so the crawl
        # can be restarted from it.
        logger.error('failed to fetch {}'.format(url))
        return

    soup = BeautifulSoup(html, "lxml")
    body = soup.find("body")
    tbody = body.find("tbody") if body is not None else None
    if tbody is None:
        logger.error('no account table found in {}'.format(url))
        return

    for tr in tbody.find_all("tr"):
        td_lst = tr.find_all("td")
        link = tr.find("a")
        # Skip rows that lack the cells/link read below instead of crashing
        # on them.
        if len(td_lst) < 4 or link is None:
            continue
        num = td_lst[0].get_text()
        addr = link.get_text()
        balance = td_lst[3].get_text()
        logger.info('{}|{}|{}'.format(str(num), str(addr), str(balance)))


if __name__ == '__main__':
    config_logger()
    main()


执行结果类似于下面: 

9976|0x73c6d288ee44fdf66c5378339f28ecfbc9e373da|588.8017988 Ether
9977|0x1651391bf51483d2ff69cc85a38235f5ff7de7b1|588.50460252 Ether
9978|0xb5967de6328290103614f74da0c7fa5fcd9bf08a|588.49188877 Ether
9979|0xbd30d89c6c25c3ae61d035a2285dcf508fd2a98a|588.3966845 Ether
9980|0xec55c5b78c41c30adefea9ebafb2fd2a27475d71|588.258797 Ether
9981|0xcdd39b6d1cc4d0a7243b389ed9356e23df6240eb|588.24154266 Ether
9982|0x0c3971a66f4a96da628bfd602e51fe9a6039b2ef|588.16853465 Ether
9983|0x7105e0598937489abcefe12f9ab8fb99fa0ea40c|588.04199565 Ether
9984|0x720395a85afaa1e79f84c6f5a19588cd8777ad37|588.003055 Ether
9985|0x3af0a4b766299d817b555127db522ca000464572|587.88594898 Ether
9986|0xa2dbef8b032dcc93a20996f116ba27eeda02f2c9|587.71140296 Ether
9987|0x702c370f0624551248cf2b819206813b7d234327|587.49372705 Ether
9988|0x170a8da7c68571b5eea7d2554e57a7fafd5c9902|587.44251806 Ether
9989|0xa0d41edc7cf866580516be71567f57688d9fb153|587.43152725 Ether
9990|0x190b323e35dd1c15e86a0e0085eea986742594a4|587.420906 Ether
9991|0x831f5d6e6741134cd646a03238730279e1a32f61|587.4 Ether
9992|0xf277dcb2b59494d0ccd0d45a44ebf79155fd5cad|587.37743676 Ether
9993|0x7074085e6ce64a756ccc7e48aa04433c0ec0c9cf|587.35522956 Ether
9994|0x8cd4fe470409c23d2bd42e98c7e918d35927a5ea|587.3476 Ether
9995|0xb388a0b6a075657124cfb5566a4fde9cae058e99|587.1243235 Ether
9996|0x66a6bd6d2c28a7413f22bf26f4030d3f46c3400a|586.93075962 Ether
9997|0x519d6dcdf1acbfd8774751f1043deeea8778ef4a|586.86925905 Ether
9998|0x8672b0ebc3ec7525e3a973be338298e28c273fc2|586.61060352 Ether
9999|0xe62304f80e0176405387b353a5e9fac932d1d99a|586.59708089 Ether
10000|0x9ff8df03c89038470db5ccfe1f17a29a1fe508db|586.56751049 Ether