随便玩玩Python爬虫
最近了解了下Python,感觉还是蛮强大的,边学习边爬了一个小网站。
ps:爬虫实时抓取,不记录数据,仅仅在请求的时候抓取,所以速度受制于宿主站的访问速度。
首页效果:
章节效果:
内容效果:
一:准备
开发工具:PyCharm
一个小网站:键盘小说网(抓取难度挺低的,貌似没有反爬虫)
JS库:Bootstrap、jQuery、Vue.js
Python库:bottle(用于简单的WebApi),bs4(解析HTML、提取数据),urllib(请求相关),gzip(解压gzip压缩的网页)
目录结构:
二:编写实体对应抓取的数据(model.py)
# 基模型 class BaseModel(object): def __init__(self, title, url): self.title = title # 标题 self.url = url # 路径 # 章节目录相关 class Article(BaseModel): def __init__(self, title, url, num): BaseModel.__init__(self, title, url) self.num = num # 标识第几章 # 首页展示 class Index(BaseModel): def __init__(self, title, url, author, new): BaseModel.__init__(self, title, url) self.author = author # 作者 self.new = new # 最新章节 # 首页返回格式 class IndexRes(object): def __init__(self, title, lst: Index): self.title = title # 返回值的标题 self.lst = lst # 返回的数据集合
三:编写主逻辑(main.py)
# -*- coding: utf-8 -*-
"""Scrape a novel site's home page, chapter lists and chapter text.

Nothing is stored: every API call downloads and parses the upstream page
while the request is being served.
"""
import gzip
import json
import logging
import re  # imported explicitly instead of relying on the wildcard import
import urllib.parse
import urllib.request
from bottle import *
from bs4 import BeautifulSoup
from model import Article, Index, IndexRes

logger = logging.getLogger(__name__)

BASE_URL = 'http://www.janpn.com'
# Without a timeout a stalled upstream request would block the handler forever.
FETCH_TIMEOUT = 10  # seconds

# Compiled once; raw strings avoid the invalid "\." escape warning.
_BOOK_ID_RE = re.compile(r'(//)(.*?)(\.)', re.S)
_CHAPTER_RE = re.compile(r'(book/)(.*?)(\.)', re.S)

# Request headers
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) '
                  'Version/11.0 Mobile/15A372 Safari/604.1 ',
    'Upgrade-Insecure-Requests': '1',
    # Only advertise the encoding that _fetch() is able to decode.
    'Accept-Encoding': 'gzip'
}


def _fetch(url, request_headers=None):
    """Download *url* and return the body decoded as UTF-8 text.

    The body is gunzipped only when the response declares
    ``Content-Encoding: gzip``. Network and decoding errors propagate to
    the caller.
    """
    req = urllib.request.Request(url, headers=request_headers or {})
    with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT) as resp:
        raw = resp.read()
        encoding = resp.headers.get('Content-Encoding', '')
    if encoding.lower() == 'gzip':
        raw = gzip.decompress(raw)
    return raw.decode('utf-8')


# Hook that allows cross-origin access
@hook('after_request')
def enable_cors():
    """Add the CORS header to every response."""
    response.headers['Access-Control-Allow-Origin'] = '*'


# Home page data API
@route('/api/GetIndex')
def index():
    """Return the home page as a JSON list of {'classify', 'books'} objects.

    An empty list is returned when scraping failed.
    """
    groups = getindex() or []
    rtn = [
        {
            'classify': group.title,
            'books': [
                {'title': item.title, 'url': item.url,
                 'author': item.author, 'new': item.new}
                for item in group.lst
            ],
        }
        for group in groups
    ]
    return json.dumps(rtn)


# Chapter list API
@route('/api/GetCp/<url>')
def book(url):
    """Return the chapters of book *url* as a JSON list of {'name', 'url'}.

    An empty list is returned when scraping failed.
    """
    # The original tested ``if book:`` (this function itself, always truthy)
    # and then crashed iterating None whenever getlist() had failed.
    cps = getlist(url) or []
    return json.dumps([{'name': cp.title, 'url': cp.url} for cp in cps])


# Chapter content API
@post('/api/GetContent')
def content():
    """Return the HTML of the chapter named by the posted ``url`` form field.

    A body without a ``url`` field is treated as the bare chapter path.
    Responds with HTTP 400 when no path was supplied and with an empty body
    when the chapter could not be fetched.
    """
    body = request.body.read().decode('utf-8').strip()
    fields = urllib.parse.parse_qs(body)
    if 'url' in fields:
        path = fields['url'][0]
    else:
        path = urllib.parse.unquote(body)
    if not path:
        abort(400, 'Missing chapter url')
    return getcontent(path) or ''


def main(host='192.168.1.105', port=1234):
    """Start the HTTP service on *host*:*port*."""
    run(host=host, port=port)


# Scrape the home page
def getindex():
    """Scrape the home page.

    Returns a list of IndexRes (one per category), or None when the page
    could not be downloaded or parsed.
    """
    try:
        soup = BeautifulSoup(_fetch(BASE_URL, headers), 'html.parser')
        titles = soup.find_all(class_='titleh_list')  # category headings
        lists = soup.find_all(class_='list')  # one book list per heading
        res = []
        for title_tag, list_tag in zip(titles, lists):
            books = []
            for atag in list_tag.find_all('a'):
                # elements with class "shop-info" inside the link
                info = atag.find_all(class_="shop-info")
                book_id = _BOOK_ID_RE.search(atag['href']).group(2)
                author = info[1].string.strip()  # author name, trimmed
                newest = info[2].span.string  # latest chapter
                books.append(Index(atag['title'], book_id, author, newest))
            res.append(IndexRes(title_tag.string, books))
    except Exception:
        # Best effort: callers treat None as "no data".
        logger.exception('Failed to scrape the home page')
        return None
    return res


# Scrape a chapter list
def getlist(url):
    """Scrape the chapter list of book *url*.

    Returns a list of Article, or None when the page could not be
    downloaded or parsed.
    """
    try:
        page = _fetch(BASE_URL + '/book/' + url + '.html')
        soup = BeautifulSoup(page, 'html.parser')
        chapter_ul = soup.find_all('ul')[1]  # the second <ul> holds the chapters
        res = [
            Article(a.string, _CHAPTER_RE.search(a['href']).group(2), None)
            for a in chapter_ul.find_all('a')
        ]
    except Exception:
        logger.exception('Failed to scrape the chapter list of %r', url)
        return None
    return res


# Scrape chapter content
def getcontent(path):
    """Scrape the chapter at *path*.

    Returns the markup of the element with id ``htmlContent`` with tabs and
    newlines removed, or None when the page could not be downloaded or does
    not contain that element.
    """
    try:
        page = _fetch(BASE_URL + '/book/' + path + '.html')
        node = BeautifulSoup(page, 'html.parser').find(id='htmlContent')
    except Exception:
        logger.exception('Failed to scrape the chapter %r', path)
        return None
    if node is None:
        # str(None) would otherwise be sent to the client as the text "None".
        return None
    return str(node).replace("\t", "").replace('\n', '').strip()


# Entry point
if __name__ == '__main__':
    main()
四:编写web页面(使用静态页面传值)
首页
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>抓包测试</title>
    <link rel="stylesheet" href="../jslib/bootstrap-3.3.7/css/bootstrap.min.css">
    <script src="../jslib/jquery-3.3.1.min.js"></script>
    <script src="../jslib/bootstrap-3.3.7/js/bootstrap.min.js"></script>
    <script src="../jslib/vue.min.js"></script>
</head>
<body>
<!-- Home page: one tab per category, one panel per book. -->
<div class="container col-sm-12" id="vue_app">
    <div class="row clearfix">
        <div class="col-md-12 column">
            <div class="tabbable" id="tabs">
                <ul class="nav nav-tabs">
                    <li v-for="item in result">
                        <a v-bind:href="'#panel-'+item.classify" data-toggle="tab">{{item.classify}}</a>
                    </li>
                </ul>
                <div class="tab-content" style="margin-top: 10px">
                    <div class="tab-pane" v-bind:id="'panel-'+item.classify" v-for="item in result">
                        <div class="panel panel-default" v-for="art in item.books" v-on:click="openCp(art.url)">
                            <div class="panel-heading">
                                <h3 class="panel-title" style="text-align: center">
                                    {{art.title}}
                                </h3>
                            </div>
                            <div class="panel-body" style="text-align: center">
                                最新章节:{{art.new}}
                            </div>
                            <div class="panel-footer" style="text-align: right">
                                {{art.author}}
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>
<script>
    // View model: `result` holds the category list returned by /api/GetIndex.
    var M_Vue = new Vue({
        el: '#vue_app',
        data: {
            result: []
        },
        methods: {
            // Click handlers used in the template belong on the instance;
            // a page-level function is not reliably visible to template expressions.
            // Opens the chapter list of the clicked book in a new window.
            openCp: function (url) {
                window.open("menu.html?url=" + url);
            }
        }
    });
    $(function () {
        $.get('http://192.168.1.105:1234/api/GetIndex', function (res) {
            M_Vue.result = res;
            // Activate the first tab once Vue has rendered the new data,
            // instead of guessing with a fixed delay.
            M_Vue.$nextTick(function () {
                $('#tabs a:first').tab('show');
            });
        }, 'json')
    })
</script>
</body>
</html>
章节目录页(menu.html)
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <link rel="stylesheet" href="../jslib/bootstrap-3.3.7/css/bootstrap.min.css">
    <script src="../jslib/jquery-3.3.1.min.js"></script>
    <script src="../jslib/bootstrap-3.3.7/js/bootstrap.min.js"></script>
    <script src="../jslib/vue.min.js"></script>
</head>
<body>
<!-- Chapter list page: one row per chapter of the book named in ?url=... -->
<div class="container col-sm-12" id="vue_app">
    <div class="row clearfix">
        <div class="col-md-12 column">
            <div class="list-group">
                <a href="#" class="list-group-item active">章节目录</a>
                <div class="list-group-item" v-for="item in result" v-on:click="show(item.url)">
                    {{item.name}}
                </div>
            </div>
        </div>
    </div>
</div>
<script>
    // View model: `result` holds the chapter list returned by /api/GetCp/<book>.
    var M_Vue = new Vue({
        el: '#vue_app',
        data: {
            result: []
        },
        methods: {
            // Click handlers used in the template belong on the instance;
            // a page-level function is not reliably visible to template expressions.
            // Opens the clicked chapter in a new window.
            show: function (url) {
                window.open("content.html?url=" + url);
            }
        }
    });
    $(function () {
        // Everything after the "?" of the current address
        var qs = window.location.search.substring(1);
        var url = 'http://192.168.1.105:1234/api/GetCp/' + qs.substring(qs.indexOf('=') + 1);
        $.get(url, function (res) {
            M_Vue.result = res;
        }, 'json')
    })
</script>
</body>
</html>
内容页
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <link rel="stylesheet" href="../jslib/bootstrap-3.3.7/css/bootstrap.min.css">
    <script src="../jslib/jquery-3.3.1.min.js"></script>
    <script src="../jslib/bootstrap-3.3.7/js/bootstrap.min.js"></script>
</head>
<body>
<!-- Content page: shows the chapter named in ?url=... inside the jumbotron. -->
<div class="container col-sm-12" id="vue_app">
    <div class="row clearfix">
        <div class="col-md-12 column">
            <div class="jumbotron"> </div>
        </div>
    </div>
</div>
<script>
    $(document).ready(function () {
        var endpoint = 'http://192.168.1.105:1234/api/GetContent';
        // Everything after the "?" of the current address
        var query = window.location.search.substring(1);
        // The chapter path is whatever follows the first "="
        var chapterPath = query.substring(query.indexOf('=') + 1);

        // Receives the chapter markup as plain text and injects it into the page.
        function render(markup) {
            $('.jumbotron').html(markup);
        }

        $.post(endpoint, {'url': chapterPath}, render, 'text');
    });
</script>
</body>
</html>
源码下载