如何获得以上数据:
将以下代码复制到可以访问外网的服务器上运行即可。程序每爬取一个页面前会随机等待0-2秒,请求太频繁的话会被封掉。如果执行一段时间后报错,有可能是被封掉了:根据已输出的最后一个序号算一下执行到了哪个页面,然后从下一个页面开始重新跑就可以了。
服务器上需要安装requests、beautifulsoup4、lxml(代码中使用了"lxml"解析器)和brotli;如果还缺少其他依赖,直接用pip安装即可。
# -*- coding:utf-8 -*-
import os, sys
import logging
import requests
#import socks, socket
from bs4 import BeautifulSoup
from importlib import reload
import random
import brotli
# NOTE(review): reloading ``sys`` looks like a Python 2 era leftover; nothing
# visible in this script depends on it -- confirm before removing.
reload(sys)
# Directory that receives the log file; '/' is swapped for the platform's
# path separator.
PY_GEN_PATH = "/home/test/data/".replace('/', os.sep)
# URL template for one page of the account list; the placeholder is filled
# with the page number (or an empty string for the list's base URL).
ETH_ADDR_URL = 'https://etherscan.io/accounts/{}'
# Logger shared by the whole script; its handlers are attached in
# config_logger().
logger = logging.getLogger('eth_addr')
# Name of the log file created inside PY_GEN_PATH.
LOG_FILE = 'addr.log'
# Log lines consist of the bare message only (no timestamp or level).
LOG_FORMATTER = '%(message)s'
# Single HTTP session reused for every request made by get_url().
s = requests.Session()
# Pool of browser User-Agent strings; get_url() picks one at random per request.
_UA_LST = (
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
    'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
)


def get_url(url, refer, timeout=30):
    """Fetch ``url`` with browser-like headers and return the page text.

    The request goes through the module-level session ``s`` with a randomly
    chosen User-Agent.

    Args:
        url: Absolute URL of the page to download.
        refer: Value sent in the ``referer`` header.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl forever.

    Returns:
        The response body decoded as UTF-8 when the status code is 200;
        ``None`` for any other status code or when the request raises.
        Failures are reported through ``logger`` and never propagate.
    """
    ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
    ACCEPT_ENCODING = "gzip, deflate, br"
    ACCEPT_LANGUAGE = "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7"
    AUTHORITY = "etherscan.io"
    path = url.replace('https://etherscan.io', '')
    cookie_my = '×××××'  # replace with your own cookie
    header = {"AUTHORITY":AUTHORITY, "METHOD":"GET", "path":path,
              "scheme":"https", "Accept":ACCEPT, "Accept-Encoding":ACCEPT_ENCODING,
              "Accept-Language":ACCEPT_LANGUAGE, "cache-control":"no-cache",
              "cookie":cookie_my, "pragma": "no-cache", "referer":refer,
              "sec-fetch-dest":"document", "sec-fetch-mode":"navigate",
              "sec-fetch-site":"same-origin", "sec-fetch-user":"?1",
              "upgrade-insecure-requests":"1",
              "sec-ch-ua-mobile":"?0", "sec-ch-ua-platform":"Windows",
              "User-Agent":random.choice(_UA_LST)
              }
    try:
        r = s.get(url, headers=header, timeout=timeout)
        sc = r.status_code
        if sc == 200:
            # NOTE(review): relies on requests/urllib3 having already undone
            # the Content-Encoding (including "br" when the brotli package is
            # installed), so r.text is the decoded page -- confirm against
            # the installed urllib3 version.
            r.encoding = 'utf-8'
            return r.text
        logger.error('response code:{}'.format(str(sc)))
        if r.headers.get('Content-Encoding') == 'br':
            # Keep the body of a rejected request in the log for diagnosis.
            # The former extra brotli.decompress() call here was discarded
            # and could only add a spurious second error line.
            logger.error(r.text)
        return None
    except Exception as e:
        # Boundary of the crawler: network errors and timeouts are logged
        # and reported to the caller as None.
        logger.error(e)
        return None
def config_logger():
    """Attach a file handler and a console handler to the module logger.

    Creates ``PY_GEN_PATH`` when it does not exist, then sends every record
    at DEBUG level or above both to ``PY_GEN_PATH/LOG_FILE`` and to the
    console, formatted with ``LOG_FORMATTER``.
    """
    logger.setLevel(logging.DEBUG)
    created = not os.path.exists(PY_GEN_PATH)
    if created:
        # The mode must be octal: a decimal 777 is 0o1411, not rwxrwxrwx.
        os.makedirs(PY_GEN_PATH, mode=0o777, exist_ok=True)
    fmter = logging.Formatter(LOG_FORMATTER)
    handler = logging.FileHandler(os.path.join(PY_GEN_PATH, LOG_FILE))
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(fmter)
    logger.addHandler(handler)
    # console display
    console = logging.StreamHandler()
    console.setLevel(level=logging.DEBUG)
    console.setFormatter(fmter)
    logger.addHandler(console)
    if created:
        # Emitted only now: before the handlers exist an INFO record has
        # nowhere to go and would be dropped.
        logger.info("doesn't exist ,and mk it!")
def main(start_page=1, end_page=400):
    """Crawl the account list pages ``start_page``..``end_page`` in order.

    Before each page the crawler sleeps a random whole number of seconds
    between 0 and 2. The referer sent with a page is the previous page's
    URL, or the list's base URL for page 1.

    Args:
        start_page: First page number to fetch. Pass a later page to resume
            an interrupted run.
        end_page: Last page number to fetch (inclusive).
    """
    import random
    import time

    for p in range(start_page, end_page + 1):
        if p == 1:
            refer_url = ETH_ADDR_URL.format('')
        else:
            refer_url = ETH_ADDR_URL.format(str(p - 1))
        addr_url = ETH_ADDR_URL.format(str(p))
        time.sleep(random.randint(0, 2))
        parseEthAccount(addr_url, refer_url)
def parseEthAccount(url, refer):
    """Download one account list page and log a line for each table row.

    Each row is logged as ``num|addr|balance``: the text of the row's first
    cell, of its first link, and of its fourth cell.

    Args:
        url: URL of the account list page.
        refer: Referer header value passed on to ``get_url``.

    Raises:
        RuntimeError: If the page could not be downloaded or contains no
            ``<tbody>``. The crawl stops here so the run can be resumed
            from this page.
    """
    html = get_url(url, refer)
    if html is None:
        raise RuntimeError('failed to fetch {}'.format(url))
    soup = BeautifulSoup(html, "lxml")
    body = soup.find("body")
    tbody = body.find("tbody") if body is not None else None
    if tbody is None:
        raise RuntimeError('no account table found in {}'.format(url))
    for tr in tbody.find_all("tr"):
        td_lst = tr.find_all("td")
        link = tr.find("a")
        if len(td_lst) < 4 or link is None:
            # A row without the expected cells or link cannot be reported.
            logger.error('skip malformed row in {}'.format(url))
            continue
        num = td_lst[0].get_text()
        addr = link.get_text()
        balance = td_lst[3].get_text()
        logger.info('{}|{}|{}'.format(num, addr, balance))
# Script entry point: set up logging first so the crawl output is captured.
if __name__ == '__main__':
    config_logger()
    main()
执行结果类似于下面:
9976|0x73c6d288ee44fdf66c5378339f28ecfbc9e373da|588.8017988 Ether
9977|0x1651391bf51483d2ff69cc85a38235f5ff7de7b1|588.50460252 Ether
9978|0xb5967de6328290103614f74da0c7fa5fcd9bf08a|588.49188877 Ether
9979|0xbd30d89c6c25c3ae61d035a2285dcf508fd2a98a|588.3966845 Ether
9980|0xec55c5b78c41c30adefea9ebafb2fd2a27475d71|588.258797 Ether
9981|0xcdd39b6d1cc4d0a7243b389ed9356e23df6240eb|588.24154266 Ether
9982|0x0c3971a66f4a96da628bfd602e51fe9a6039b2ef|588.16853465 Ether
9983|0x7105e0598937489abcefe12f9ab8fb99fa0ea40c|588.04199565 Ether
9984|0x720395a85afaa1e79f84c6f5a19588cd8777ad37|588.003055 Ether
9985|0x3af0a4b766299d817b555127db522ca000464572|587.88594898 Ether
9986|0xa2dbef8b032dcc93a20996f116ba27eeda02f2c9|587.71140296 Ether
9987|0x702c370f0624551248cf2b819206813b7d234327|587.49372705 Ether
9988|0x170a8da7c68571b5eea7d2554e57a7fafd5c9902|587.44251806 Ether
9989|0xa0d41edc7cf866580516be71567f57688d9fb153|587.43152725 Ether
9990|0x190b323e35dd1c15e86a0e0085eea986742594a4|587.420906 Ether
9991|0x831f5d6e6741134cd646a03238730279e1a32f61|587.4 Ether
9992|0xf277dcb2b59494d0ccd0d45a44ebf79155fd5cad|587.37743676 Ether
9993|0x7074085e6ce64a756ccc7e48aa04433c0ec0c9cf|587.35522956 Ether
9994|0x8cd4fe470409c23d2bd42e98c7e918d35927a5ea|587.3476 Ether
9995|0xb388a0b6a075657124cfb5566a4fde9cae058e99|587.1243235 Ether
9996|0x66a6bd6d2c28a7413f22bf26f4030d3f46c3400a|586.93075962 Ether
9997|0x519d6dcdf1acbfd8774751f1043deeea8778ef4a|586.86925905 Ether
9998|0x8672b0ebc3ec7525e3a973be338298e28c273fc2|586.61060352 Ether
9999|0xe62304f80e0176405387b353a5e9fac932d1d99a|586.59708089 Ether
10000|0x9ff8df03c89038470db5ccfe1f17a29a1fe508db|586.56751049 Ether