- 帖子
- 3
- 精华
- 0
- 积分
- 22
- 阅读权限
- 10
- 注册时间
- 2019-4-11
- 最后登录
- 2019-5-15
|
win10 python3.70
请crossin老师和大佬们看看
scrapy 框架 scrapy crawl xxxx -o xxx.json -t json
import scrapy
class RunnoobSpider(scrapy.Spider):
name = 'runnoob_spider'
allowed_domains = ['runoob.com']
start_urls = ['http://www.runoob.com/w3cnote']
def parse(self, response):
for LinkTitle in response.css('div.post-intro'):
yield {
'title': LinkTitle.xpath('h2/a/@title').get().encode('utf-8').decode('unicode_escape'),
## 这里尝试过 latin-1 utf-8 unicode_escape 排列组合9种应该都试过了 输出
'link': response.urljoin(LinkTitle.xpath('h2/a/@href').get()),
'desc': LinkTitle.xpath('p/text()').get(default='no content').strip(),
}
for next_page in response.css('li.next-page > a::attr(href)'):
if int(next_page.get().split('/')[-1]) < 2:
yield response.follow(next_page, self.parse)
输出的json (这里用的 encode('utf-8').decode('unicode_escape')
{"title": "localstorage \u00e5\u00bf\u0085\u00e7\u009f\u00a5\u00e5\u00bf\u0085\u00e4\u00bc\u009a", "link": "http://www.runoob.com/w3cnote/localstorage-spec.html", "desc": "HTML API\r\nlocalstorage \u5728\u6d4f\u89c8\u5668\u7684 API \u6709\u4e24\u4e2a\uff1alocalStorage \u548csessionStorage\uff0c\u5b58\u5728\u4e8e window \u5bf9\u8c61\u4e2d\uff1alocalStorage \u5bf9\u5e94 window.localStor..."},
{"title": "Mac OS SSH \u00e4\u00bd\u00bf\u00e7\u0094\u00a8 PEM \u00e6\u0096\u0087\u00e4\u00bb\u00b6\u00e7\u0099\u00bb\u00e5\u00bd\u0095", "link": "http://www.runoob.com/w3cnote/mac-os-ssh-pem.html", "desc": "\u9996\u5148\u4fee\u6539 PEM \u7684\u6743\u9650\uff1a\r\nsudo chmod 600 key.pem\r\nMac OS \u8fde\u63a5\u670d\u52a1\u5668\u4f7f\u7528 PEM \u6587\u4ef6\u7684\u547d\u4ee4\u5982\u4e0b\uff1a\r\n\r\n\r\nssh -i key.pem root@IP\r\n\r\n\r\n\u4f60\u4e5f\u53ef\u4ee5\u4f7f..."},
{"title": "\u00e5\u0085\u00b3\u00e4\u00ba\u008e\u00e7\u00a8\u008b\u00e5\u00ba\u008f\u00e5\u0091\u0098\u00e9\u0084\u0099\u00e8\u00a7\u0086\输出在python shell 里面就会正常展示为中文字符 真是逼疯我了。。
|
|