A while ago I scraped JD.com's fresh-food pages, and now I'd like to share the code.

First, a quick summary: the product data is scraped by driving Chrome with Selenium to render the dynamically generated pages, and the comments are pulled from the ajax (JSONP) comment API.

The details are explained below.

Required fields:
Product sub-category name (apples, oranges, etc.)
Product name (e.g. 烟台红富士苹果 5kg 一级铂金大果 单果230-320g 新鲜水果)
Total number of comments on the product
Product positive-review rate
Comment star rating
Comment length
Number of likes on a comment
Number of replies to a comment
Comment text
Reviewer level
Days between the comment being posted and being scraped (days)
For comments that carry a follow-up review: the follow-up text and the number of days between the follow-up and the initial review

These are the requirements for this task; they map onto the two CSV schemas sketched below.
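
A minimal sketch of the two output schemas, one CSV for product-level info and one for comment-level info (the field names are exactly the ones the full script further down writes as CSV headers; splitting the output into two files is just the design choice made here):

```python
# Output schema used by the script below: one CSV per level of detail.
fieldnames_merinfo = ['url', '商品小分类名称', '商品名称', '商品总评论数量', '商品好评率']
fieldnames_cominfo = ['url', '评论星级', '评论长度', '评论点赞数量', '评论回复数量', '评论文本内容',
                      '评论者等级', '评论发表距抓取的天数(days)', '追评文本内容', '追评与初评相距时间']
```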

Most of the information on this page is rendered dynamically, so Selenium is needed to get at it.
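
As a minimal sketch of that step (assuming Chrome plus a matching chromedriver; the product URL, the comments-tab XPath, and the scrolling loop are borrowed from the full script further down, so adjust them if JD has changed the page layout):

```python
# Minimal sketch: render one product page with Selenium and open its comments tab.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 20)
browser.get('https://item.jd.com/3756271.html')
# Wait for the tab bar to appear, scroll a bit so lazy content loads, then click the comments tab.
tab = wait.until(EC.presence_of_element_located(
    (By.XPATH, '//div[contains(@class,"tab-main")]/ul/li[5]')))
for _ in range(30):
    browser.execute_script("window.scrollBy(0,50)")
    time.sleep(0.1)
tab.click()
time.sleep(3)
html = browser.page_source  # now contains the dynamically rendered content
browser.quit()
```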


If you open DevTools, the comment requests do not show up under the usual XHR filter but under JS; the requests whose names start with productPageComments carry the comment data.

Request URL: https://sclub.jd.com/comment/...
Request Method: GET
Status Code: 200
Remote Address: 117.148.129.129:443
Referrer Policy: no-referrer-when-downgrade
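
The response is JSONP: the JSON body is wrapped in the callback named in the query string, i.e. fetchJSON_comment98vv7490({...});. Here is a minimal sketch of fetching one page of comments and unwrapping it (the headers mirror the ones in the full script; locating the parentheses is a slightly more defensive variant of the fixed string slice the script uses):

```python
import json
import requests

url = ('https://sclub.jd.com/comment/productPageComments.action?'
       'callback=fetchJSON_comment98vv7490&productId=3756271&score=0&sortType=6&page=0&pageSize=10')
headers = {
    'referer': 'https://item.jd.com/3756271.html',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}
text = requests.get(url, headers=headers).text
# Strip the JSONP wrapper "fetchJSON_comment98vv7490(" ... ");" to get plain JSON.
body = text[text.find('(') + 1:text.rfind(')')]
data = json.loads(body)
for comment in data.get('comments', []):
    print(comment.get('score'), comment.get('content'))
```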

Most of the parameters in this URL's query string are not actually required; make_url keeps only the ones that matter:

```python
from urllib.parse import urlencode

def make_url(baseurl, page=0, score=0, productId='3756271'):
    data1 = {
        'callback': 'fetchJSON_comment98vv7490',
        'productId': productId,
        'score': score,
        'sortType': '6',
        'page': page,
        'pageSize': '10',
        'isShadowSku': '0',
        'fold': '1',
    }
    url = baseurl + urlencode(data1)
    return url
```
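
A quick usage example: with the productPageComments endpoint as the base URL, the defaults reproduce the kind of URL captured above.

```python
baseurl = 'https://sclub.jd.com/comment/productPageComments.action?'
print(make_url(baseurl, page=0, score=0))
# https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv7490&productId=3756271&score=0&sortType=6&page=0&pageSize=10&isShadowSku=0&fold=1
```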

The rest of the details are reflected in the code.

Now for the code. Hold on tight; if you don't feel like copying it, you can download it from my GitHub.

```python
# https://www.jd.com/allSort.aspx
import requests
from pyquery import PyQuery as pq
from prettyprinter import cpprint
import json
from urllib.parse import urlencode
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import datetime
import sys


def get_ajax(url):
    """Request the comment API and strip the JSONP wrapper to get plain JSON."""
    headers = {
        'referer': 'https://item.jd.com/3756271.html',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    # The body looks like fetchJSON_comment98vv7490({...}); slice off the callback name and ");".
    return json.loads(response.text[26:-2])


def make_url(baseurl, page=0, score=0, productId='3756271'):
    """Build a comment-API URL from the query-string parameters that matter."""
    data1 = {
        'callback': 'fetchJSON_comment98vv7490',
        'productId': productId,
        'score': score,
        'sortType': '6',
        'page': page,
        'pageSize': '10',
        'isShadowSku': '0',
        'fold': '1',
    }
    url = baseurl + urlencode(data1)
    return url


def parse_json(rjson, url=None):
    """Yield one item per comment in the JSON returned by the comment API."""
    for comment in rjson.get('comments'):
        item = {}
        item['url'] = url
        item['评论星级'] = comment.get('score')
        item['评论长度'] = len(comment.get('content'))
        item['评论点赞数量'] = comment.get('usefulVoteCount')
        item['评论回复数量'] = comment.get('replyCount')
        item['评论文本内容'] = comment.get('content')
        item['评论者等级'] = comment.get('userLevelId')
        try:
            date1 = time.strptime(comment.get('creationTime'), "%Y-%m-%d %H:%M:%S")
            date2 = time.localtime(time.time())
            date1 = datetime.datetime(date1[0], date1[1], date1[2])
            date2 = datetime.datetime(date2[0], date2[1], date2[2])
            item['评论发表距抓取的天数(days)'] = str((date2 - date1).days)
        except Exception as error:
            print('error is >>>', error)
            item['评论发表距抓取的天数(days)'] = ''
        # JD fills empty follow-up reviews with a placeholder text; treat it as no follow-up.
        if comment.get('afterUserComment', {}).get('hAfterUserComment', {}).get('content', '') == '此用户未填写评价内容':
            item['追评文本内容'] = ''
        else:
            item['追评文本内容'] = comment.get('afterUserComment', {}).get('hAfterUserComment', {}).get('content', '')
        try:
            date1 = time.strptime(comment.get('afterUserComment', {}).get('created', ''), "%Y-%m-%d %H:%M:%S")
            date2 = time.localtime(time.time())
            date1 = datetime.datetime(date1[0], date1[1], date1[2])
            date2 = datetime.datetime(date2[0], date2[1], date2[2])
            item['追评与初评相距时间'] = str((date2 - date1).days)
        except Exception:
            item['追评与初评相距时间'] = ''
        if item['追评文本内容'] == '':
            item['追评与初评相距时间'] = ''
        yield item


def save_csv_merinfo(item):
    with open(FILENAME_MER, 'a', encoding=ENCODING, newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames_merinfo)
        writer.writerow(item)


def save_csv_cominfo(item):
    with open(FILENAME_COM, 'a', encoding=ENCODING, newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames_cominfo)
        writer.writerow(item)


def get_page(url):
    """Render a product page with Selenium, scroll down, then open the comments tab."""
    browser.get(url)
    submit = wait.until(EC.presence_of_element_located((By.XPATH, '//div[contains(@class,"tab-main")]/ul/li[5]')))
    time.sleep(2)
    for i in range(30):
        browser.execute_script("window.scrollBy(0,50)")
        time.sleep(0.1)
    submit.click()
    time.sleep(3)
    return browser.page_source


def parse_page(html, url):
    """Extract the product-level fields from the rendered page with pyquery."""
    page_item = {}
    doc = pq(html, parser='html')
    page_item['url'] = url
    page_item['商品小分类名称'] = doc('#crumb-wrap > div > div.crumb.fl.clearfix > div:nth-child(5) > a').text()
    page_item['商品名称'] = doc('div.itemInfo-wrap div.sku-name').text()
    page_item['商品总评论数量'] = doc('#detail > div.tab-main.large > ul > li.current > s').text().replace('(', '').replace(')', '')
    page_item['商品好评率'] = doc('#comment > div.mc > div.comment-info.J-comment-info > div.comment-percent > div').text()
    return page_item


def csv_create():
    """Reset both CSV files and write their header rows."""
    with open(FILENAME_MER, 'w', encoding=ENCODING, newline='') as f:
        writer = csv.writer(f)
        writer.writerow(fieldnames_merinfo)
    with open(FILENAME_COM, 'w', encoding=ENCODING, newline='') as f:
        writer = csv.writer(f)
        writer.writerow(fieldnames_cominfo)


def crawl_all_page_url():
    """Collect product page URLs from the category page https://www.jd.com/allSort.aspx."""
    global ALL_PAGE_URL
    browser = webdriver.Chrome()
    wait = WebDriverWait(browser, 20)
    browser.get('https://www.jd.com/allSort.aspx')
    wait.until(EC.presence_of_element_located(
        (By.XPATH, '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]')))
    CASE = []
    for i in range(10):  # fruit
        initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[2]/dd/a[{}]'.format(i + 1)
        CASE.append(initcase)
    for i in range(4):  # pork and lamb
        initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[3]/dd/a[{}]'.format(i + 1)
        CASE.append(initcase)
    for i in range(8):  # seafood
        initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[4]/dd/a[{}]'.format(i + 1)
        CASE.append(initcase)
    for i in range(4):  # poultry and eggs
        initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[5]/dd/a[{}]'.format(i + 1)
        CASE.append(initcase)
    for i in range(6):  # frozen food
        initcase = '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/div[9]/div[2]/div[3]/dl[6]/dd/a[{}]'.format(i + 1)
        CASE.append(initcase)
    # To cover other categories, just change the range() values and the dl[] index.
    for case in CASE:
        print('>>>>>>>>>')
        submit = wait.until(EC.element_to_be_clickable((By.XPATH, case)))
        submit.click()
        print(browser.current_url)
        handle = browser.current_window_handle
        handles = browser.window_handles
        for newhandle in handles:
            if newhandle != handle:
                browser.switch_to.window(newhandle)
        time.sleep(1.5)
        wait.until(EC.presence_of_element_located((By.XPATH, '//div[@id="plist"]/ul[contains(@class,"gl-warp")]')))
        doc = pq(browser.page_source, parser='html')
        for li in list(doc('div#plist ul.gl-warp li').items())[:10]:
            res = 'https:' + str(li('div div.p-commit-n strong a').attr('href')).replace('#comment', '')
            print(res)
            ALL_PAGE_URL.append(res)
        time.sleep(1.5)
        browser.close()
        browser.switch_to.window(handle)


def load_all_page_url():
    """Load product page URLs from the local cache.csv instead of crawling them again."""
    global ALL_PAGE_URL
    with open(FILENAME_CACHE, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        for item in reader:
            ALL_PAGE_URL.append(item[0])


if __name__ == '__main__':
    # Setup >>>>>>>>>>
    browser = webdriver.Chrome()  # Selenium-driven browser
    wait = WebDriverWait(browser, 20)
    MAXINDEX = 7  # max comment pages per score; set to about 35 to keep roughly 500 comments per product (35 gives slightly more than 500; the comment pages are not unlimited)
    # User configuration ********************************
    TIMESLEEP = 2  # sleep interval between requests
    FILENAME_MER = 'merinfo_test.csv'  # product info file
    FILENAME_COM = 'cominfo_test.csv'  # comment info file
    FILENAME_CACHE = 'cache.csv'
    ENCODING = 'UTF-8'  # encoding of the saved CSV files
    # **********************************************
    # CSV fields
    fieldnames_merinfo = ['url', '商品小分类名称', '商品名称', '商品总评论数量', '商品好评率']
    fieldnames_cominfo = ['url', '评论星级', '评论长度', '评论点赞数量', '评论回复数量', '评论文本内容',
                          '评论者等级', '评论发表距抓取的天数(days)', '追评文本内容', '追评与初评相距时间']
    # <<<<<<<<<<<<<<<<<
    start = time.time()
    # csv_create()  # reset the CSV files
    # De-duplication >>>
    URLSET = []  # URLs that have already been saved
    with open(FILENAME_MER, 'r', encoding=ENCODING) as f:
        reader = csv.reader(f)
        for res in reader:
            URLSET.append(res[0])
    print('URLSET is', URLSET)
    # Crawl product info
    ALL_PAGE_URL = []  # all product page URLs
    load_all_page_url()  # pick one of these two: load_all_page_url loads from the local cache.csv, which is faster and works offline
    # crawl_all_page_url()  # pick one of these two: crawl_all_page_url collects the URLs from the category page
    for page_url in ALL_PAGE_URL:
        if page_url not in URLSET:
            URLSET.append(page_url)  # de-duplicate on the fly
            try:
                html = get_page(page_url)  # render the page with Selenium
                item_mer = parse_page(html, url=page_url)  # parse it with pyquery
                cpprint(item_mer)
                # Crawl the comments through the ajax API
                Flag = 0  # counter
                ITEMS = []
                baseurl = 'https://sclub.jd.com/comment/productPageComments.action?'
                for score in [5, 3, 2, 1]:  # score: 0 = all, 5 = follow-up, 3 = positive, 2 = neutral, 1 = negative
                    if score == 5:
                        MAXINDEX_TEMP = MAXINDEX
                    else:
                        MAXINDEX_TEMP = int(MAXINDEX / 7)  # keep the ratio at about 7:1:1:1
                    for index in range(MAXINDEX_TEMP):
                        time.sleep(TIMESLEEP)
                        url = make_url(baseurl, page=index, score=score,
                                       productId=''.join(list(filter(str.isdigit, page_url))))  # build the URL
                        try:
                            json_ = get_ajax(url)  # ajax request
                            if len(json_.get('comments')) != 0:
                                for item in parse_json(json_, url=page_url):  # parse the JSON
                                    cpprint(item)
                                    ITEMS.append(item)
                                    Flag += 1
                            else:
                                break
                        except Exception as error:
                            print('AJAX request failed: {}>>>'.format(error))
                            print('url is {}'.format(url))
                            print(str(datetime.datetime.now()))
                            sys.exit(0)  # quit on ajax errors to keep the saved data consistent
                # Once a product page and its comments are both done, save the data
                save_csv_merinfo(item_mer)  # save product info
                for item in ITEMS:  # save comment info
                    try:
                        save_csv_cominfo(item)
                    except Exception as error:
                        print(error)
                print("Saved {} comments".format(Flag))
            except Exception as error:
                print('Page request failed: {}>>>'.format(error))
        print('Finished one product page >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
        # time.sleep(TIMESLEEP)
    end = time.time()
    print('Total time: {} seconds'.format(end - start))
```

There are some comments in the code; if anything is unclear, feel free to ask in the comments section. That's it for now!