A Scrapy framework tutorial
The Scrapy framework is genuinely powerful and well worth learning. Python itself values conciseness, so compared with Java you can build very powerful features with very little code. Python's syntax does have its annoyances, though, such as the lack of smart code completion, a common problem with dynamic languages. I've been learning it for less than a week; these are my notes. All practical content.
First, install the Scrapy framework. The IDE I use is PyCharm.
Create a Scrapy project named xxoo:
scrapy startproject xxoo
This gives you a project directory; look up what each part of it does. Then one more command creates a spider class from a template, a ready-made class where we only need to fill in the logic. Programmers are lazy by nature!!!
scrapy genspider [-t template] <name> <domain>
i.e.: scrapy genspider xxooSpider baidu.com
This creates a class named xxooSpider that crawls only baidu.com.
Debugging the project in PyCharm needs a little setup.
Create a start.py file in the project root. The -o itcast1.csv option writes the output to a CSV file; it can be omitted.
from scrapy import cmdline

cmdline.execute("scrapy crawl xxooSpider --nolog -o itcast1.csv".split())
That's it.
Installing packages from the Douban mirror:
pip install -i https://pypi.doubanio.com/simple/ scrapy-splash
Reading values from settings.py
from scrapy.conf import settings

cookie = settings['COOKIE']
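Note that scrapy.conf is the old-style API and is deprecated in newer Scrapy versions, where settings are reached through the crawler instead. A minimal sketch of the newer spider-side access (COOKIE being a hypothetical custom setting, as above):

import scrapy


class SettingsDemoSpider(scrapy.Spider):
    name = "settings_demo"

    def parse(self, response):
        # self.settings is the Settings object bound to the running crawler
        cookie = self.settings.get('COOKIE')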
Getting an image's URL
Experienced developers usually use this approach. Normally we can just take the src attribute directly, but sometimes its value lacks the site prefix, e.g. /img/1.png, and we would have to prepend http://www.baidu.com by hand. The method below solves this neatly.
from urllib import parse

url = "http://www.baidu.com/xx"
xx = "/pic/1/1.png"
urljoin = parse.urljoin(url, xx)
print(urljoin)
# prints: http://www.baidu.com/pic/1/1.png
Downloading images
Scrapy ships with a ready-made image download pipeline. We only need to register the pipeline middleware in settings and point it at the field to download. The field holding the image URLs must be a list, or it raises an error.
import os

ITEM_PIPELINES = {
    'xxoo.pipelines.XxooPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
# The item field that holds the image URLs; ImagesPipeline downloads them
# automatically.
IMAGES_URLS_FIELD = "image"
# Where to store the images: the img folder under the project root.
IMAGES_STORE = os.path.join(os.path.abspath(os.path.dirname(__file__)), "img")
With the setup above, Scrapy does everything itself, generating folders and file names, which is hard to control. To customize this, we subclass ImagesPipeline and override a few methods:
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
import re


class ImagesrenamePipeline(ImagesPipeline):

    # 1. As the source shows, this method just iterates over the image URL
    #    field (a list) and issues a request for each URL.
    def get_media_requests(self, item, info):
        # Loop over every image URL to download; if a single value rather
        # than a collection is passed in, yield it directly without the loop.
        for image_url in item['imgurl']:
            # meta carries data from the spider down to file_path below.
            yield Request(image_url, meta={'name': item['imgname']})

    # 2. Rename the file. Without overriding this method the image name is a
    #    hash, i.e. an unreadable string.
    def file_path(self, request, response=None, info=None):
        # Take the last URL segment as the image name.
        image_guid = request.url.split('/')[-1]
        # Receive the image name passed through meta above.
        name = request.meta['name']
        # Strip characters Windows forbids in file names; without this step
        # you get garbled names or failed downloads.
        name = re.sub(r'[?\\*|"<>:/]', '', name)
        # The key to per-folder storage: {0} is name, {1} is image_guid.
        filename = u'{0}/{1}'.format(name, image_guid)
        return filename

    # 3. Called after the requests complete; results holds the request URLs
    #    and the storage paths.
    def item_completed(self, results, item, info):
        return item  # pass the item on to any later pipelines
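For this pipeline to work, the item must carry the imgurl and imgname fields used above; a minimal sketch (the class name ImgItem is made up):

import scrapy


class ImgItem(scrapy.Item):
    imgurl = scrapy.Field()   # must hold a list of image URLs
    imgname = scrapy.Field()  # used by file_path above as the folder name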
Saving items to a JSON file
A hand-rolled version:
import codecs
import json


class jsonwrite(object):
    # Open the file on initialization.
    def __init__(self):
        self.file = codecs.open("xxoo.json", "w", encoding="utf-8")

    # Scrapy calls this method to write each item.
    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    # Close the file when the spider finishes (Scrapy calls close_spider on
    # pipelines, so that is the name this method must have).
    def close_spider(self, spider):
        self.file.close()
The one Scrapy provides:
from scrapy.exporters import JsonItemExporter


class JsonExporterPipleline(object):
    # Use the JsonItemExporter that Scrapy provides to export a JSON file.
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
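Neither pipeline runs until it is registered in settings.py; a sketch, assuming the class lives in the project's pipelines module (the xxoo.pipelines path is an assumption):

ITEM_PIPELINES = {
    # lower numbers run earlier
    'xxoo.pipelines.JsonExporterPipleline': 2,
}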
Saving to MySQL (two methods)
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


# Method 1: write to MySQL synchronously.
class MysqlPipeline(object):
    def __init__(self):
        self.conn = MySQLdb.connect('192.168.0.106', 'root', 'root', 'article_spider',
                                    charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item["title"], item["url"],
                                         item["create_date"], item["fav_nums"]))
        self.conn.commit()
        return item


# Method 2: use an asynchronous database connection pool.
class MysqlTwistedPipline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Use Twisted to make the MySQL insert asynchronous.
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle exceptions
        return item

    def handle_error(self, failure, item, spider):
        # Handle exceptions raised by the asynchronous insert.
        print(failure)

    def do_insert(self, cursor, item):
        # Run the actual insert; each item type builds its own SQL
        # (see the get_insert_sql sketch below).
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
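The async pipeline calls item.get_insert_sql(), which this post never shows; a plausible sketch of how such an item class might look:

import scrapy


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    create_date = scrapy.Field()
    fav_nums = scrapy.Field()

    def get_insert_sql(self):
        # Each item class supplies its own SQL and parameters, so one
        # pipeline can persist many different item types.
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        """
        params = (self["title"], self["url"], self["create_date"], self["fav_nums"])
        return insert_sql, params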
Optimizing the item class (important)
We could parse the page with XPath or CSS and then write the judgment logic by hand, if you don't mind the hassle.
Scrapy provides a complete workflow that keeps the code very lean: the item's business logic lives in the item class, and the spider file only declares how each item field is produced.
First, the item class:
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join


# A small trick to override the default output processor: TakeFirst()
# collapses the extracted list into a single string, but for some fields we
# want to keep the list, so we pass the value through unchanged.
def return_value(value):
    return value


# Everything the stock ItemLoader produces is a list, so we set a default
# output processor once (it can still be overridden per field) instead of
# repeating the same code on every field.
class ArticleItemLoader(ItemLoader):
    # Custom ItemLoader.
    default_output_processor = TakeFirst()


# The custom item class. input_processor holds the per-field business logic,
# such as format conversions; output_processor overrides the default rule.
# date_convert, get_nums and remove_comment_tags are custom processors
# (sketched after this block).
class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert),
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()
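date_convert, get_nums and remove_comment_tags are not defined in this post; plausible sketches, assuming they do what the field names suggest:

import re
import datetime


def date_convert(value):
    # Parse a date string such as "2017/01/01"; fall back to today on failure.
    try:
        return datetime.datetime.strptime(value.strip(), "%Y/%m/%d").date()
    except ValueError:
        return datetime.datetime.now().date()


def get_nums(value):
    # Pull the first integer out of text like "3 收藏"; default to 0.
    match = re.match(r".*?(\d+).*", value)
    return int(match.group(1)) if match else 0


def remove_comment_tags(value):
    # Drop the "评论" pseudo-tag so it does not end up in the tags field.
    return "" if "评论" in value else value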
The spider class:
from scrapy.loader import ItemLoader
from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader


# A spider method: load the item through the ItemLoader.
def parse_detail(self, response):
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))  # get_md5: see sketch below
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    article_item = item_loader.load_item()
    yield article_item
Extracting all URLs from a page
We could of course get them with XPath, but there is a leaner way: the LinkExtractor class makes this very simple.
from scrapy.linkextractors import LinkExtractor

# Build an instance with a matching rule, then hand a response to
# extract_links to get every URL on the page that matches.
link = LinkExtractor(allow=r'http://lab.scrapyd.cn')
# link = LinkExtractor(allow=r'http://lab.scrapyd.cn/archives/\d+.html')
links = link.extract_links(response)
if links:
    for link_one in links:
        print(link_one)
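LinkExtractor is also the building block of CrawlSpider rules, which follow matching links automatically; a minimal sketch (the spider name and callback are made up):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class LabSpider(CrawlSpider):
    name = "lab"
    start_urls = ['http://lab.scrapyd.cn']
    rules = (
        # Follow every archive link and hand each response to parse_item.
        Rule(LinkExtractor(allow=r'/archives/\d+\.html'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response.url)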
Using logs
Scrapy provides logging through Python's logging module.
Add the following two lines anywhere in settings.py and the output becomes much cleaner:
LOG_FILE = "TencentSpider.log" LOG_LEVEL = "INFO"
Log levels
Scrapy provides five logging levels:
CRITICAL - critical errors
ERROR - regular errors
WARNING - warning messages
INFO - informational messages
DEBUG - debugging messages
Logging settings
The following settings in settings.py configure logging:
LOG_ENABLED (default: True): enables logging
LOG_ENCODING (default: 'utf-8'): the encoding used for logging
LOG_FILE (default: None): file name for the log output, created in the current directory
LOG_LEVEL (default: 'DEBUG'): the minimum level to log
LOG_STDOUT (default: False): if True, all standard output (and errors) of the process is redirected to the log, so e.g. print("hello") will show up in the Scrapy log
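Inside a spider, messages can be written to the same log through the spider's built-in logger; a quick sketch:

import scrapy


class TencentSpider(scrapy.Spider):
    name = "tencent"

    def parse(self, response):
        # self.logger is a standard logging.Logger named after the spider
        self.logger.info("parsed %s", response.url)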
Saving to a MongoDB database
import pymongo
from scrapy.conf import settings


class DoubanPipeline(object):
    def __init__(self):
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        sheetname = settings["MONGODB_SHEETNAME"]
        # Create the MongoDB connection.
        client = pymongo.MongoClient(host=host, port=port)
        # Select the database.
        mydb = client[dbname]
        # The collection that stores the data.
        self.sheet = mydb[sheetname]

    def process_item(self, item, spider):
        data = dict(item)
        self.sheet.insert(data)  # insert_one(data) in pymongo 3+
        return item
The settings file:
# MongoDB host
MONGODB_HOST = "127.0.0.1"
# MongoDB port
MONGODB_PORT = 27017
# database name
MONGODB_DBNAME = "Douban"
# name of the collection that stores the data
MONGODB_SHEETNAME = "doubanmovies"
Downloader middleware: rotating the User-Agent and proxy IP
import random
import base64

from settings import USER_AGENTS
from settings import PROXIES


# Pick a random User-Agent for every request.
class RandomUserAgent(object):
    def process_request(self, request, spider):
        useragent = random.choice(USER_AGENTS)
        # print(useragent)
        request.headers.setdefault("User-Agent", useragent)


# Pick a random proxy for every request.
class RandomProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)

        if proxy['user_passwd'] is None:
            # Proxy that needs no account verification.
            request.meta['proxy'] = "http://" + proxy['ip_port']
        else:
            # Base64-encode the account:password pair...
            base64_userpasswd = base64.b64encode(proxy['user_passwd'].encode()).decode()
            # ...and pass it in the header format the proxy server expects.
            request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
            request.meta['proxy'] = "http://" + proxy['ip_port']
The settings file:
USER_AGENTS = [
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13'
]

PROXIES = [
    {"ip_port": "121.42.140.113:16816", "user_passwd": "mr_mao_hacker:sffqry9r"},
    # {"ip_port": "121.42.140.113:16816", "user_passwd": ""},
    # {"ip_port": "121.42.140.113:16816", "user_passwd": ""},
    # {"ip_port": "121.42.140.113:16816", "user_passwd": ""},
]
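The two middlewares only take effect once enabled in settings.py; a sketch, assuming they live in the project's middlewares module (the xxoo.middlewares path is an assumption):

DOWNLOADER_MIDDLEWARES = {
    'xxoo.middlewares.RandomUserAgent': 543,
    'xxoo.middlewares.RandomProxy': 544,
}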
Three ways to log in
1. The simplest: find the login endpoint and submit the account and password directly.
2. Sometimes hidden values must be pulled from the login page and submitted along with the form; Zhihu, for example, requires the _xsrf value from its login page.
3. The most troublesome: the site piles on encryption and verification, so we log in with cookies instead.
Three reference implementations:
1. Easy
# -*- coding: utf-8 -*-
import scrapy


# Any login that only needs POST data can use this approach; here the POST
# data is the account and password.
class Renren1Spider(scrapy.Spider):
    name = "renren1"
    allowed_domains = ["renren.com"]

    def start_requests(self):
        url = 'http://www.renren.com/PLogin.do'
        yield scrapy.FormRequest(
            url=url,
            formdata={"email": "mr_mao_hacker@163.com", "password": "alarmchime"},
            callback=self.parse_page)

    def parse_page(self, response):
        with open("mao2.html", "wb") as filename:
            filename.write(response.body)
2. Intermediate
# -*- coding: utf-8 -*-
import scrapy


# The orthodox way to simulate a login:
# first GET the login page and extract the parameters the login requires
# (e.g. Zhihu's _xsrf), then POST them to the server together with the
# account and password.
class Renren2Spider(scrapy.Spider):
    name = "renren2"
    allowed_domains = ["renren.com"]
    start_urls = (
        "http://www.renren.com/PLogin.do",
    )

    def parse(self, response):
        # _xsrf = response.xpath("//_xsrf").extract()[0]
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"email": "mr_mao_hacker@163.com", "password": "alarmchime"},
            # formdata={..., "_xsrf": _xsrf},
            callback=self.parse_page
        )

    def parse_page(self, response):
        print("=========1===" + response.url)
        # with open("mao.html", "wb") as filename:
        #     filename.write(response.body)
        url = "http://www.renren.com/422167102/profile"
        yield scrapy.Request(url, callback=self.parse_newpage)

    def parse_newpage(self, response):
        print("===========2====" + response.url)
        with open("xiao.html", "wb") as filename:
            filename.write(response.body)
3. Hard
# -*- coding: utf-8 -*-
import scrapy


# When nothing else works, simulate the login with cookies. More tedious,
# but the success rate is 100%.
class RenrenSpider(scrapy.Spider):
    name = "renren"
    allowed_domains = ["renren.com"]
    start_urls = (
        'http://www.renren.com/xxxxx',
        'http://www.renren.com/11111',
        'http://www.renren.com/xx',
    )

    cookies = {
        "anonymid": "ixrna3fysufnwv",
        "_r01_": "1",
        "ap": "327550029",
        "JSESSIONID": "abciwg61A_RvtaRS3GjOv",
        "depovince": "GW",
        "springskin": "set",
        "jebe_key": "f6fb270b-d06d-42e6-8b53-e67c3156aa7e%7Cc13c37f53bca9e1e7132d4b58ce00fa3%7C1484060607478%7C1%7C1486198619601",
        "ver": "7.0",
        "XNESSESSIONID": "e703b11f8809",
        "jebecookies": "98c7c881-779f-4da8-a57c-7464175cd469|||||",
        "ick_login": "4b4a254a-9f25-4d4a-b686-a41fda73e173",
        "_de": "BF09EE3A28DED52E6B65F6A4705D973F1383380866D39FF5",
        "p": "ea5541736f993365a23d04c0946c10e29",
        "first_login_flag": "1",
        "ln_uact": "mr_mao_hacker@163.com",
        "ln_hurl": "http://hdn.xnimg.cn/photos/hdn521/20140529/1055/h_main_9A3Z_e0c300019f6a195a.jpg",
        "t": "691808127750a83d33704a565d8340ae9",
        "societyguester": "691808127750a83d33704a565d8340ae9",
        "id": "327550029",
        "xnsid": "f42b25cf",
        "loginfrom": "syshome",
    }

    def start_requests(self):
        for url in self.start_urls:
            # yield scrapy.Request(url, callback=self.parse)
            # url = "http://www.renren.com/410043129/profile"
            yield scrapy.FormRequest(url, cookies=self.cookies, callback=self.parse_page)

    def parse_page(self, response):
        print("===========" + response.url)
        with open("deng.html", "wb") as filename:
            filename.write(response.body)