豆瓣电影TOP250和书籍TOP250爬虫

34次阅读

共计 2877 个字符，预计需要花费 8 分钟才能阅读完成。

最近开始玩 Python，学习爬虫相关知识的时候心血来潮，爬取了豆瓣电影 TOP250 和书籍 TOP250，这里记录一下自己玩的过程。

电影 TOP250 爬虫

import requests
from bs4 import BeautifulSoup
import time


def getlist(list_url):
    """Scrape the Douban movie Top 250 list, following "next page" links.

    Starting at ``list_url``, each page is fetched and every movie entry is
    formatted as a multi-line text record and appended to the module-level
    ``movie`` list. The next page URL is built by appending the ``href`` of
    the ``.next a`` link to the module-level ``seed_url``. Pages are walked
    in a loop rather than by recursion, so the call depth does not grow with
    the number of pages.

    Args:
        list_url: URL of the first list page to fetch.

    Returns:
        The module-level ``movie`` list, including the records added here.

    Raises:
        requests.RequestException: if a page cannot be fetched, including
            when the request times out.
    """
    page_url = list_url
    while True:
        # Pause before every request, as the original did.
        time.sleep(2)
        # A timeout keeps a stalled connection from hanging the script.
        res = requests.get(page_url, timeout=10)
        soup = BeautifulSoup(res.text, 'html.parser')
        for item in soup.select('.grid_view li'):
            rank = item.select('em')[0].text
            score = item.select('.rating_num')[0].text
            title = item.select('.title')[0].text
            direct = item.select('.info .bd p')[0].text.strip()
            # Put the cast ("主演:") on its own line.
            actor = '\n 主演:'.join(direct.split('主演:'))
            # NOTE(review): this replaces every occurrence of the separator
            # with '年代:' -- confirm the separator matches the whitespace
            # that the page actually emits.
            director = '年代:'.join(actor.split(' '))
            quote = item.select('.inq')
            # Entries without a one-line quote get the literal text 'None'.
            comments = quote[0].text.strip() if quote else 'None'
            movie.append(
                '排名:' + rank + '\n'
                + '评分:' + score + '\n'
                + '片名:' + title + '\n'
                + director + '\n'
                + '评论:' + comments + '\n'
                + '\n')
        next_links = soup.select('.next a')
        if not next_links:
            print('结束')
            break
        page_url = seed_url + next_links[0]['href']
    return movie


def write(movies):
    """Write every string in ``movies`` to ``movie.txt`` (UTF-8), in order.

    The file is truncated first; no separators are added between items.
    """
    with open('movie.txt', 'w', encoding='utf8') as out_file:
        out_file.writelines(movies)


def main():
    """Scrape the list starting at ``seed_url`` and save it with ``write``."""
    records = getlist(seed_url)
    write(records)


if __name__ == '__main__':
    # Module-level state read by getlist(): the list that collects the
    # formatted records, and the base URL that pagination hrefs are
    # appended to.
    movie = []
    seed_url = 'https://movie.douban.com/top250'
    main()

书籍 TOP250 爬虫

import bs4
import requests
import re
from bs4 import BeautifulSoup
from operator import itemgetter


def getHtmlText(url):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, decoded with the encoding detected from the content
        (``apparent_encoding``), or an empty string if the request fails
        (connection error, timeout, or an HTTP error status).
    """
    try:
        # A timeout keeps a stalled connection from hanging the script.
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        # Only request failures are treated as "no page"; KeyboardInterrupt
        # and programming errors are no longer swallowed.
        return ""
    r.encoding = r.apparent_encoding
    return r.text


def parserText(text, book_list):
    """Parse Douban book Top 250 pages into dicts, following "next" links.

    Starting from the HTML in ``text``, every ``<table width="100%">`` is
    turned into one dict that is appended to ``book_list``. The link inside
    ``<span class="next">`` is then fetched with ``getHtmlText`` and parsed
    the same way, until a page has no such link or comes back empty.

    Keys written, when the matching element is present: "书名", "评分",
    "评价人数", "介绍", "作者信息".

    Args:
        text: HTML of the first page to parse.
        book_list: List that receives one dict per book; mutated in place.

    Returns:
        ``book_list``.
    """
    # Compiled once; used to pull the rating count out of the ".pl" text.
    people_regex = re.compile(r'[\d]{1,10}')

    # An empty page (getHtmlText returns "" on failure) ends the walk
    # instead of crashing on a missing "next" element.
    while text:
        soup = BeautifulSoup(text, 'html.parser')
        for table in soup('table', {'width': '100%'}):
            tds = table.find('tr')('td')
            info_cell = tds[1]
            content = {}

            for div in info_cell('div'):
                link = div.find('a')
                if link is not None:
                    content["书名"] = link.attrs['title']
                rating = div.select('.rating_nums')
                if rating:
                    content["评分"] = rating[0].text
                people = div.select('.pl')
                if people:
                    found = people_regex.search(people[0].text)
                    # Skip the field when the text contains no digits.
                    if found:
                        content["评价人数"] = found.group()

            for p in info_cell('p'):
                # A <p> without a class attribute matches neither case.
                classes = p.get('class') or [None]
                if classes[0] == 'quote':
                    content["介绍"] = p.find('span').string
                elif classes[0] == 'pl':
                    content["作者信息"] = p.string

            book_list.append(content)

        next_span = soup.find('span', {'class': 'next'})
        next_link = next_span.find('a') if next_span is not None else None
        if next_link is None:
            break
        # Later pages go into the same list the caller passed in, not into
        # a module-level global.
        text = getHtmlText(next_link.attrs['href'])

    return book_list


def sortedBookTop250(book_list):
    """Return the books ordered by score, highest first, with ranks added.

    Scores are compared as numbers, so "10.0" sorts above "9.6"; comparing
    the raw strings would order them the other way round. Books whose "评分"
    is missing or not numeric are placed last. Books with equal scores keep
    their input order.

    Args:
        book_list: Iterable of book dicts, normally carrying a "评分" entry.

    Returns:
        A new list of the same dict objects, each updated in place with a
        1-based "排名" entry.
    """
    def score_of(book):
        try:
            return float(book['评分'])
        except (KeyError, TypeError, ValueError):
            return float('-inf')

    ranked = sorted(book_list, key=score_of, reverse=True)
    for position, book in enumerate(ranked, start=1):
        book["排名"] = position
    return ranked


def writeToFile(book_list):
    """Dump the books to ``good_books.txt`` (UTF-8).

    Each book becomes one ``key:value`` line per entry, in dict order,
    followed by a blank line. The file is truncated first.
    """
    with open('good_books.txt', 'w', encoding='utf8') as out_file:
        for record in book_list:
            entry_lines = [f'{name}:{val}\n' for name, val in record.items()]
            out_file.writelines(entry_lines)
            out_file.write('\n')


def main():
    """Fetch the page at ``seed_url``, parse and rank the books, save them."""
    first_page = getHtmlText(seed_url)
    ranked = sortedBookTop250(parserText(first_page, books))
    writeToFile(ranked)


if __name__ == '__main__':
    # Module-level state read by main(): the list that collects the parsed
    # books, and the URL of the first page to fetch.
    books = []
    seed_url = "https://book.douban.com/top250"
    main()

点击查看我的 GitHub

点击查看我的个人 Blog

日拱一卒，不期速成

以上直接贴出了代码。这是很简单的两段代码，主要用到了 requests 库和 BeautifulSoup 库，需要的可以直接拿去，或者直接去我的 GitHub 上拿 movies.txt 和 good_books.txt。

正文完

发表至： java

2019-09-24

0

Lombok中关于Data的使用

关于java:ThreadLocal内存溢出代码演示和原因分析

关于-Immutable-Object-模式适用场景的探讨

关于java:2021年2月程序员工资统计又拖后腿了……

教程教你如何给你的头像添加一个好看的国旗

豆瓣电影TOP250和书籍TOP250爬虫

豆瓣电影 TOP250 和书籍 TOP250 爬虫

电影 TOP250 爬虫

书籍 TOP250 爬虫

总结

Just My Socks（注册教程内含优惠码）

豆瓣电影TOP250和书籍TOP250爬虫

豆瓣电影 TOP250 和书籍 TOP250 爬虫

电影 TOP250 爬虫

书籍 TOP250 爬虫

总结

Just My Socks（注册教程 内含优惠码）

Just My Socks（注册教程内含优惠码）