获取appstore中app的评论

 参考了知乎这哥们儿的评论:

使用了这个接口:https://itunes.apple.com/rss/customerreviews/page=1/id=414478124/sortby=mostrecent/json?l=en&&cc=us

和这个UA:iTunes/11.0 (Windows; Microsoft Windows 7 Business Edition Service Pack 1 (Build 7601)) AppleWebKit/536.27.1


https://itunes.apple.com/rss/customerreviews/page=1/id=414478124/sortby=mostrecent/json?l=en&&cc=us 这个接口地址中,id是App Store中app的数字ID(即商店链接里id后面的那串数字,而不是形如com.xxx.xxx的bundle id),cc是国家代码,page可以指定1-10。每页最多返回50条评论,也就是说通过这个接口最多能获取到10×50=500条评论。

以下代码爬取了wechat第一页us区域的评论,并没有进一步分析评论的内容,仅仅将接口返回的信息写入了一个文件中。

代码如下:



# -*- coding:utf-8 -*-
'''
获取appstore评论
@author: user
'''
import os, sys
import logging
import requests,json
from bs4 import BeautifulSoup
from importlib import reload
import random
import brotli

# NOTE(review): looks like a Python 2 leftover (reload(sys) was historically
# paired with sys.setdefaultencoding) -- confirm whether it is still needed.
reload(sys)  

# Output directory for the log file and the dumped reviews, written with the
# platform's path separator.
PY_GEN_PATH = "E:/data/priv/".replace('/', os.sep)
# File the raw API responses are appended to.  Built with os.path.join so that
# no extra "/" is glued onto the already separator-terminated PY_GEN_PATH.
COMMENT_FILE = os.path.join(PY_GEN_PATH, "comment.txt")
# Customer-reviews RSS endpoint; the placeholders are filled, in order, with
# the page number, the app id and the country code.
APPSTORE_COMMENT_CRAWL_URL = "https://itunes.apple.com/rss/customerreviews/page={}/id={}/sortby=mostrecent/json?l=en&&cc={}"
logger = logging.getLogger('get_appstore_comment')
LOG_FILE = 'get_appstore_comment.log'
# Log records are emitted as the bare message, with no timestamp or level.
LOG_FORMATTER = '%(message)s'
# Shared HTTP session used by get_url for its requests.
s = requests.Session()


def get_url(url, refer=None, timeout=30):
    """Fetch *url* with an iTunes User-Agent and return the body as text.

    Args:
        url: Address to request through the shared session ``s``.
        refer: Accepted for backward compatibility; currently unused.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the crawler indefinitely.

    Returns:
        The response body decoded as UTF-8, or ``None`` when the request
        fails or the status code is not 200 (the reason is logged).
    """
    ua_list = ['iTunes/11.0 (Windows; Microsoft Windows 7 Business Edition Service Pack 1 (Build 7601)) AppleWebKit/536.27.1']
    header = {"scheme": "https", "User-Agent": random.choice(ua_list)}

    # Keep the try body down to the one call that talks to the network.
    try:
        r = s.get(url, headers=header, timeout=timeout)
    except requests.RequestException as e:
        logger.error(e)
        return None

    sc = r.status_code
    if sc != 200:
        logger.error('response code:{}'.format(str(sc)))
        return None

    r.encoding = 'utf-8'
    if r.headers.get('Content-Encoding') == 'br':
        # The header stays 'br' even when the HTTP stack has already decoded
        # the body; decompressing a second time fails, so fall back to the
        # text that was decoded for us instead of discarding a good response.
        try:
            return brotli.decompress(r.content).decode('utf-8')
        except (brotli.error, UnicodeDecodeError):
            pass
    return r.text


def config_logger():
    """Attach a file handler and a console handler to the module logger.

    Creates PY_GEN_PATH first when it does not exist, then sends every record
    at DEBUG level and above both to LOG_FILE inside that directory and to
    the console, formatted with LOG_FORMATTER.
    """
    logger.setLevel(logging.DEBUG)

    created_dir = not os.path.exists(PY_GEN_PATH)
    if created_dir:
        # The mode has to be an octal literal: decimal 777 is 0o1411, which
        # is not the rwxrwxrwx permission set that was intended.
        os.makedirs(PY_GEN_PATH, 0o777, exist_ok=True)

    fmter = logging.Formatter(LOG_FORMATTER)

    handler = logging.FileHandler(os.path.join(PY_GEN_PATH, LOG_FILE))
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(fmter)
    logger.addHandler(handler)

    # Mirror the log on the console at DEBUG level as well.
    console = logging.StreamHandler()
    console.setLevel(level=logging.DEBUG)
    console.setFormatter(fmter)
    logger.addHandler(console)

    # Report the directory creation only now: before the handlers were
    # attached an INFO record had nowhere to go and was silently dropped.
    if created_dir:
        logger.info("文件夹不存在,已自行创建")


def get_appstore_comment(url):
    """Download one page of reviews and append the raw response to COMMENT_FILE.

    Args:
        url: Fully formatted customer-reviews URL to fetch.

    The response text is written unparsed. When the download fails nothing
    is written and an error is logged instead.
    """
    logger.info("get_appstore_comment,url:{}".format(url))
    comment_json = get_url(url)
    if comment_json is None:
        # get_url signals failure with None; passing that to f.write would
        # raise TypeError, so stop here and leave the file untouched.
        logger.error("get_appstore_comment failed, nothing written for url:{}".format(url))
        return
    with open(COMMENT_FILE, 'a', encoding='utf8') as f:
        f.write(comment_json)

if __name__ == '__main__':
    # Script entry point: set up logging, then fetch the first page of the
    # most recent WeChat reviews from the US store.
    config_logger()

    wechat_app_id = '414478124'
    country_code = 'us'
    first_page = 1
    target_url = APPSTORE_COMMENT_CRAWL_URL.format(
        first_page, wechat_app_id, country_code
    )
    get_appstore_comment(target_url)
    


至于如何能够爬取到所有的评论,暂时还没找到接口,继续努力找找