最近了解了下Python,感觉还是蛮强大的,边学习边爬了一个小网站。

ps:爬虫实时抓取,不记录数据,仅仅在请求的时候抓取,所以速度受制于宿主站的访问速度。

首页效果:

images

章节效果:

images

内容效果:

images

一:准备

开发工具:PyCharm

一个小网站:键盘小说网(抓取难度挺低的,貌似没有反爬虫)

js插件库:Bootstrap、jQuery、Vue.js

Python库:bottle(用于简单的 Web API),bs4(HTML 解析),urllib(发送请求),gzip(解压 gzip 压缩的网页)

目录结构:

images

二:编写实体对应抓取的数据(model.py)

# Base model shared by every scraped entity
class BaseModel:
    """Common fields of a scraped item: a display title and its URL/path."""

    def __init__(self, title, url):
        # title: display title; url: path of the item
        self.title, self.url = title, url


# Chapter-list entry
class Article(BaseModel):
    """One chapter in a book's chapter list."""

    def __init__(self, title, url, num):
        super().__init__(title, url)
        self.num = num  # marks which chapter this is


# Home-page listing entry
class Index(BaseModel):
    """A book as it is listed on the home page."""

    def __init__(self, title, url, author, new):
        super().__init__(title, url)
        # author: author name; new: latest chapter
        self.author, self.new = author, new


# Response shape for the home page
class IndexRes(object):
    """One group of home-page results: a title plus the entries under it.

    Args:
        title: Title of the returned group.
        lst: Collection of ``Index`` entries belonging to the group.
    """

    # The annotation is a string so it is not evaluated at definition time,
    # and it describes a list of Index rather than a single Index.
    def __init__(self, title, lst: "list[Index]"):
        self.title = title  # title of the returned group
        self.lst = lst  # collection of returned entries

三:编写主逻辑(main.py)

# -*- coding: utf-8 -*-
# 抓取某小说网站首页、内容等
import gzip
import json
import re
import urllib.parse
import urllib.request

from bottle import *
from bs4 import BeautifulSoup

from model import Article, Index, IndexRes

# Request headers passed to urllib.request.Request when fetching the home page.
# The User-Agent string identifies the client as Safari on an iPhone, and
# Accept-Encoding advertises gzip/deflate, which is why getindex() gunzips
# the response body.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) '
                  'Version/11.0 Mobile/15A372 Safari/604.1 ',
    'Upgrade-Insecure-Requests': '1',
    'Accept-Encoding': 'gzip,deflate'
}


# CORS hook: allow cross-origin access to the API
@hook('after_request')
def enable_cors():
    """Set a wildcard ``Access-Control-Allow-Origin`` header on the response."""
    cors_headers = {'Access-Control-Allow-Origin': '*'}
    for name, value in cors_headers.items():
        response.headers[name] = value


# API: home-page data
@route('/api/GetIndex')
def index():
    """Return the scraped home page as a JSON array of categories.

    Each element looks like ``{'classify': <group title>, 'books': [...]}``
    and every book is ``{'title', 'url', 'author', 'new'}``.  An empty array
    is returned when ``getindex()`` yields nothing (``None`` or an empty list).
    """
    groups = getindex() or []
    payload = [
        {
            'classify': group.title,
            'books': [
                {'title': item.title, 'url': item.url, 'author': item.author, 'new': item.new}
                for item in group.lst
            ],
        }
        for group in groups
    ]
    return json.dumps(payload)


# API: chapter list of one book
@route('/api/GetCp/<url>')
def book(url):
    """Return the chapter list of the book identified by ``url`` as JSON.

    Args:
        url: Book identifier taken from the route; passed straight to
            ``getlist()``.

    Returns:
        A JSON array of ``{'name': <chapter title>, 'url': <chapter path>}``
        objects; an empty array when ``getlist()`` returns ``None`` or no
        chapters.
    """
    chapters = getlist(url)
    # Guard on the scraped result itself: getlist() returns None on failure,
    # and iterating None would raise TypeError.
    if not chapters:
        return json.dumps([])
    return json.dumps([{'name': cp.title, 'url': cp.url} for cp in chapters])


# API: chapter content
@post('/api/GetContent')
def content():
    """Return the HTML of the chapter whose path is posted as form field ``url``.

    The request body is expected to be URL-encoded form data such as
    ``url=<book>%2F<chapter>``.

    Returns:
        The chapter HTML produced by ``getcontent()``; an empty body when
        scraping fails; an empty body with status 400 when no ``url`` field
        was posted.
    """
    raw_body = request.body.read().decode('utf-8')
    # parse_qs percent-decodes every escape (not only %2F) and only reads a
    # real `url` field, instead of stripping the text "url=" wherever it occurs.
    fields = urllib.parse.parse_qs(raw_body)
    path = fields.get('url', [''])[0]
    if not path:
        response.status = 400
        return ''
    # getcontent() returns None on failure; answer with an empty body then.
    return getcontent(path) or ''


def main(host='192.168.1.105', port=1234):
    """Start the bottle HTTP server.

    Args:
        host: Address to bind. The default is the address the accompanying
            HTML pages send their API requests to.
        port: TCP port to listen on.
    """
    run(host=host, port=port)


# Scrape the home page
def getindex():
    """Scrape the site's home page into a list of ``IndexRes`` groups.

    Category headings (class ``titleh_list``) are paired in order with the
    book lists (class ``list``); each ``<a>`` inside a list becomes one
    ``Index`` entry.

    Returns:
        A list with one ``IndexRes`` per category, or ``None`` when the page
        cannot be fetched or does not have the expected structure.
    """
    groups = []
    try:
        # Request the page with the custom headers
        req = urllib.request.Request('http://www.janpn.com', headers=headers)
        with urllib.request.urlopen(req) as resp:
            raw = resp.read()
        # The headers advertise gzip, but only gunzip when the payload really
        # is gzip (magic bytes 1f 8b) so an uncompressed reply still parses.
        if raw[:2] == b'\x1f\x8b':
            raw = gzip.decompress(raw)
        soup = BeautifulSoup(raw.decode('utf-8'), 'html.parser')
        titles = soup.findAll(class_='titleh_list')  # category headings
        lists = soup.findAll(class_='list')  # book lists
        for heading, listing in zip(titles, lists):
            books = []
            for atag in listing.findAll('a'):
                bookname = atag['title']  # book name
                bookhre = atag['href']  # hyperlink
                divtag = atag.findAll(class_="shop-info")  # info elements inside the <a>
                # Book id: the text between '//' and the first '.' of the href
                bookurl = re.search(r'(//)(.*?)(\.)', bookhre, re.S).group(2)
                author = divtag[1].string.strip()  # author name, whitespace stripped
                newest = divtag[2].span.string  # latest chapter
                books.append(Index(bookname, bookurl, author, newest))
            groups.append(IndexRes(heading.string, books))
    except Exception:
        # Best effort: any fetch or parse failure is reported to the caller
        # as None, without swallowing KeyboardInterrupt/SystemExit.
        return None
    return groups


# Scrape a book's chapter list
def getlist(url):
    """Scrape the chapter list of one book.

    Args:
        url: Book identifier; the page fetched is
            ``http://www.janpn.com/book/<url>.html``.

    Returns:
        A list of ``Article`` objects (their ``num`` is always ``None``), or
        ``None`` when the page cannot be fetched or does not have the
        expected structure.
    """
    chapters = []
    try:
        with urllib.request.urlopen('http://www.janpn.com/book/' + url + '.html') as resp:
            html = resp.read().decode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')
        chapter_ul = soup.findAll('ul')[1]  # the second <ul> holds the chapters
        for a in chapter_ul.find_all('a'):
            # Chapter path: the text between 'book/' and the first '.' of the href
            cpurl = re.search(r'(book/)(.*?)(\.)', a['href'], re.S).group(2)
            chapters.append(Article(a.string, cpurl, None))
    except Exception:
        # Best effort: any fetch or parse failure is reported as None,
        # without swallowing KeyboardInterrupt/SystemExit.
        return None
    return chapters


# Scrape a chapter's content
def getcontent(path):
    """Scrape the HTML of one chapter.

    Args:
        path: Chapter path; the page fetched is
            ``http://www.janpn.com/book/<path>.html``.

    Returns:
        The markup of the element with id ``htmlContent`` with tabs and
        newlines removed, or ``None`` when the page cannot be fetched or the
        element is missing.
    """
    try:
        url = "http://www.janpn.com/book/" + path + ".html"
        with urllib.request.urlopen(url) as resp:
            html = resp.read().decode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')
        node = soup.find(id='htmlContent')
        if node is None:
            # str(None) would hand the literal text "None" to the page
            return None
        # Drop tabs and newlines from the serialized element
        return str(node).replace("\t", "").replace('\n', '').strip()
    except Exception:
        # Best effort: any fetch or parse failure is reported as None,
        # without swallowing KeyboardInterrupt/SystemExit.
        return None


# Entry point: call main() only when this file is run as a script
if __name__ == '__main__':
    main()

四:编写web页面(使用静态页面传值)

首页

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>抓包测试</title>
    <link rel="stylesheet" href="../jslib/bootstrap-3.3.7/css/bootstrap.min.css">
    <script src="../jslib/jquery-3.3.1.min.js"></script>
    <script src="../jslib/bootstrap-3.3.7/js/bootstrap.min.js"></script>
    <script src="../jslib/vue.min.js"></script>
</head>
<body>
<!-- Home page: one Bootstrap tab per category, one panel per book -->
<div class="container col-sm-12" id="vue_app">
    <div class="row clearfix">
        <div class="col-md-12 column">
            <div class="tabbable" id="tabs">
                <!-- Tab headers, one per category -->
                <ul class="nav nav-tabs">
                    <li v-for="item in result">
                        <a v-bind:href="'#panel-'+item.classify" data-toggle="tab">{{item.classify}}</a>
                    </li>
                </ul>
                <!-- Tab bodies; clicking a book panel opens its chapter list -->
                <div class="tab-content" style="margin-top: 10px">
                    <div class="tab-pane" v-bind:id="'panel-'+item.classify" v-for="item in result">
                        <div class="panel panel-default" v-for="art in item.books" v-on:click="openCp(art.url)">
                            <div class="panel-heading">
                                <h3 class="panel-title" style="text-align: center">
                                    {{art.title}}
                                </h3>
                            </div>
                            <div class="panel-body" style="text-align: center">
                                最新章节:{{art.new}}
                            </div>
                            <div class="panel-footer" style="text-align: right">
                                {{art.author}}
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>
<script>
    // Vue instance backing the page; `result` holds the /api/GetIndex response.
    var M_Vue = new Vue({
        el: '#vue_app',
        data: {
            result: []
        },
        methods: {
            // Declared as a Vue method so the template's v-on:click handler
            // resolves on the instance instead of depending on a global.
            openCp: function (url) {
                window.open("menu.html?url=" + url);
            }
        }
    });
    $(function () {
        $.get('http://192.168.1.105:1234/api/GetIndex', function (res) {
            M_Vue.result = res;
            // Show the first tab once Vue has rendered the new data,
            // rather than guessing a delay.
            M_Vue.$nextTick(function () {
                $('#tabs a:first').tab('show');
            });
        }, 'json');
    });
</script>
</body>
</html>

章节目录页(menu.html)

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <link rel="stylesheet" href="../jslib/bootstrap-3.3.7/css/bootstrap.min.css">
    <script src="../jslib/jquery-3.3.1.min.js"></script>
    <script src="../jslib/bootstrap-3.3.7/js/bootstrap.min.js"></script>
    <script src="../jslib/vue.min.js"></script>
</head>
<body>
<!-- Chapter list: one row per chapter; clicking a row opens the content page -->
<div class="container col-sm-12" id="vue_app">
    <div class="row clearfix">
        <div class="col-md-12 column">
            <div class="list-group">
                <a href="#" class="list-group-item active">章节目录</a>
                <div class="list-group-item" v-for="item in result" v-on:click="show(item.url)">
                    {{item.name}}
                </div>
            </div>
        </div>
    </div>
</div>
<script>
    // Vue instance backing the page; `result` holds the /api/GetCp response.
    var M_Vue = new Vue({
        el: '#vue_app',
        data: {
            result: []
        },
        methods: {
            // Declared as a Vue method so the template's v-on:click handler
            // resolves on the instance instead of depending on a global.
            show: function (url) {
                window.open("content.html?url=" + url);
            }
        }
    });
    $(function () {
        // Query string of the current URL (everything after '?')
        var qs = window.location.search.substring(1);
        // The book id is whatever follows the first '='
        var url = 'http://192.168.1.105:1234/api/GetCp/' + qs.substring(qs.indexOf('=') + 1);
        $.get(url, function (res) {
            M_Vue.result = res;
        }, 'json');
    });
</script>
</body>
</html>

内容页

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <link rel="stylesheet" href="../jslib/bootstrap-3.3.7/css/bootstrap.min.css">
    <script src="../jslib/jquery-3.3.1.min.js"></script>
    <script src="../jslib/bootstrap-3.3.7/js/bootstrap.min.js"></script>
</head>
<body>
<!-- Content page: the chapter HTML returned by /api/GetContent is injected into .jumbotron -->
<div class="container col-sm-12" id="vue_app">
    <div class="row clearfix">
        <div class="col-md-12 column">
            <div class="jumbotron">

            </div>
        </div>
    </div>
</div>
<script>
    $(function () {
        var qs = window.location.search.substring(1);// query string of the current URL (everything after '?')
        var url = 'http://192.168.1.105:1234/api/GetContent';
        // POST the chapter path (the text after the first '=') as form field `url`
        $.post(url, {'url': qs.substring(qs.indexOf('=') + 1)}, function (res) {
            // NOTE(review): the response is inserted as raw HTML; this trusts the markup returned by the API
            $('.jumbotron').html(res);
        }, 'text')
    })
</script>
</body>
</html>

源码下载

WebCrawler

 

发表回复