使用了这个接口:https://itunes.apple.com/rss/customerreviews/page=1/id=414478124/sortby=mostrecent/json?l=en&&cc=us
和这个UA:iTunes/11.0 (Windows; Microsoft Windows 7 Business Edition Service Pack 1 (Build 7601)) AppleWebKit/536.27.1
在接口地址 https://itunes.apple.com/rss/customerreviews/page=1/id=414478124/sortby=mostrecent/json?l=en&&cc=us 中,id 是该应用在 App Store 中的数字 ID(即商店页面 URL 里 id 后面的那串数字,而不是形如 com.xxx.yyy 的 bundle id),cc 是国家/地区代码,page 可以指定 1-10;每页最多返回 50 条评论,也就是说通过这个接口最多能获取到 500 条评论。
以下代码爬取了wechat第一页us区域的评论,并没有进一步分析评论的内容,仅仅将接口返回的信息写入了一个文件中。
代码如下:
# -*- coding:utf-8 -*-
'''
获取appstore评论
@author: user
'''
import os, sys
import logging
import requests,json
from bs4 import BeautifulSoup
from importlib import reload
import random
import brotli
# NOTE(review): reloading sys looks like a leftover of the Python 2
# "reload(sys); sys.setdefaultencoding(...)" idiom -- confirm it is still
# wanted; it is kept here so module-level behaviour is unchanged.
reload(sys)

# Directory that receives the log file and the dumped reviews, expressed with
# the OS-native path separator.
PY_GEN_PATH = "E:/data/priv/".replace('/', os.sep)
# Built with os.path.join so that no extra "/" is glued onto a directory that
# already ends with a separator.
COMMENT_FILE = os.path.join(PY_GEN_PATH, "comment.txt")
# Placeholders, in order: page number, app id, country code.
APPSTORE_COMMENT_CRAWL_URL = "https://itunes.apple.com/rss/customerreviews/page={}/id={}/sortby=mostrecent/json?l=en&&cc={}"
logger = logging.getLogger('get_appstore_comment')
LOG_FILE = 'get_appstore_comment.log'
# Log records carry the bare message only (no timestamp or level).
LOG_FORMATTER = '%(message)s'
# Shared HTTP session; get_url issues every request through it.
s = requests.Session()
def get_url(url, refer=None, timeout=30):
    """Fetch ``url`` through the shared session and return the body as text.

    Args:
        url: Address to request.
        refer: Accepted for backward compatibility; the request does not
            use it.
        timeout: Seconds to wait on the server before giving up, so that a
            stalled connection cannot block the crawler indefinitely.

    Returns:
        The response body decoded as UTF-8 when the server answers 200.
        ``None`` when the status code is anything else or when an exception
        is raised; both situations are reported through ``logger``.
    """
    ua_lst = ['iTunes/11.0 (Windows; Microsoft Windows 7 Business Edition Service Pack 1 (Build 7601)) AppleWebKit/536.27.1']
    header = {"scheme": "https", "User-Agent": random.choice(ua_lst)}
    try:
        r = s.get(url, headers=header, timeout=timeout)
        sc = r.status_code
        if sc != 200:
            logger.error('response code:{}'.format(str(sc)))
            return None
        r.encoding = 'utf-8'
        if r.headers.get('Content-Encoding') == 'br':
            try:
                return brotli.decompress(r.content).decode('utf-8')
            except brotli.error:
                # r.content is not a brotli stream, e.g. because the HTTP
                # library already decoded it; fall back to the decoded text
                # instead of discarding a successful response.
                pass
        return r.text
    except Exception as e:
        # Best-effort boundary: callers rely on None rather than an exception.
        logger.error(e)
        return None
def config_logger():
    """Attach a file handler and a console handler to the module logger.

    Creates ``PY_GEN_PATH`` when it does not exist, then sends every record
    at DEBUG level or above, formatted with ``LOG_FORMATTER``, both to
    ``LOG_FILE`` inside that directory and to the console.
    """
    logger.setLevel(logging.DEBUG)
    dir_missing = not os.path.exists(PY_GEN_PATH)
    if dir_missing:
        # The mode must be octal: decimal 777 equals 0o1411, which withholds
        # the owner's write permission on POSIX systems.
        os.makedirs(PY_GEN_PATH, 0o777, exist_ok=True)

    fmter = logging.Formatter(LOG_FORMATTER)

    handler = logging.FileHandler(os.path.join(PY_GEN_PATH, LOG_FILE))
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(fmter)
    logger.addHandler(handler)

    # Mirror the same records to the console.
    console = logging.StreamHandler()
    console.setLevel(level=logging.DEBUG)
    console.setFormatter(fmter)
    logger.addHandler(console)

    if dir_missing:
        # Reported only now: an INFO record emitted before any handler is
        # attached would be dropped.
        logger.info("文件夹不存在,已自行创建")
def get_appstore_comment(url):
    """Download one page of reviews and append the raw response to COMMENT_FILE.

    The body returned by ``get_url`` is written as-is; it is not parsed.

    Args:
        url: Fully formatted review-feed URL to fetch.

    Returns:
        None. When the download fails (``get_url`` returns ``None``) an error
        is logged and the file is left untouched.
    """
    logger.info("get_appstore_comment,url:{}".format(url))
    comment_json = get_url(url)
    if comment_json is None:
        # Writing None would raise TypeError after the file was already opened.
        logger.error("get_appstore_comment failed, nothing written, url:{}".format(url))
        return
    with open(COMMENT_FILE, 'a', encoding='utf8') as f:
        f.write(comment_json)
if __name__ == '__main__':
    # Script entry point: set up logging, then fetch a single page of reviews.
    config_logger()

    # Target: WeChat (app id below), US storefront, first page only.
    app_id = '414478124'
    app_country = 'us'
    page_num = 1

    target_url = APPSTORE_COMMENT_CRAWL_URL.format(page_num, app_id, app_country)
    get_appstore_comment(target_url)
至于如何爬取到全部评论(超出上述接口上限的部分),暂时还没有找到可用的接口,后续会继续寻找。