设为首页收藏本站

Crossin的编程教室

 找回密码
 立即加入
楼主: crossin先生
打印 上一主题 下一主题

Python 实战(5):拿来主义

[复制链接]

2

主题

0

好友

476

积分

中级会员

Rank: 3Rank: 3

楼主
发表于 2018-5-19 17:18:55 |显示全部楼层
#-*-coding: utf-8 -*-
#D:\ProgramData\Anaconda3\envs\py36\python.exe code.py 127.0.0.1
import web
import urllib.request
import json
import time

urls = (
    '/', 'index',
    '/movie/(\d+)', 'movie',
)

render = web.template.render('templates/')

#web.py 连接 SQLite
db = web.database(dbn='sqlite', db='MovieSite.db')

def add_movie(data):
    movie = json.loads(data)
    #print (movie['title'])
db.insert('movie',
        id = int(movie['id']),
        title = movie['title'],
        origin = movie['original_title'],
        url = movie['alt'],
        rating = movie['rating']['average'],
        image = movie['images']['large'],
        directors = ','.join([d['name'] for d in movie['directors']]),
        casts = ','.join([c['name'] for c in movie['casts']]),
        year = movie['year'],
        genres = ','.join(movie['genres']),
        countries = ','.join(movie['countries']),
        summary = movie['summary'],
    )

def movie_exist(data):
    movie = json.loads(data)
    n_id = int(movie['id'])
    pass
class index:
    def GET(self):
        movies = db.select('movie')
        return render.index(movies)

    def POST(self):
        data = web.input()
        condition = r'title like "%' + data.title + r'%"'
movies = db.select('movie', where=condition)
        return render.index(movies)

class movie:
    def GET(self, movie_id):
        condition = 'id=' + movie_id
        movie = db.select('movie', where=condition)[0]
        return render.movie(movie)


movie_ids = []
for index in range(0, 250, 50):
    response = urllib.request.urlopen('http://api.douban.com/v2/movie/top250?start=%d&count=50' % index)
    data = response.read()
    data_json = json.loads(data)
    movie250 = data_json['subjects']
    for movie in movie250:
        movie_ids.append(movie['id'])
        #print(movie['id'], movie['title'])
time.sleep(3)
#print(movie_ids)
count = 0
for mid in movie_ids:
    #print (count, mid)
try:
        response = urllib.request.urlopen('http://api.douban.com/v2/movie/subject/%s' % mid)
        data = response.read()
        add_movie(data)
        count += 1
        time.sleep(3)
    except:
        print('movie %s is not found' % mid)

if __name__ == "__main__":
    app = web.application(urls, globals())
    app.run()

第一次从头开始sqlite3 MovieSite.db
create table movie (id, title, origin, url, rating, image, directors, casts, year, genres, countries, summary);然后运行上面的程序,进入网页报错
无标题5.jpg 无标题4.jpg
第二次把获取movie_ids和存入数据库的两段注释掉,再运行,就可以正常显示网页了(虽然也没全部抓取到,只有90多个)
麻烦老师看下这会是什么问题?



回复

使用道具 举报

2

主题

0

好友

476

积分

中级会员

Rank: 3Rank: 3

沙发
发表于 2018-5-20 20:29:25 |显示全部楼层
crossin先生 发表于 2018-5-19 17:30
可能只是达到豆瓣接口限制被暂时拒绝了

你要调试的话,把输出的url 给 print 出来看一下 ...

一头雾水...
稍微改了下想减少点数量,调试方便一点
#-*-coding: utf-8 -*-
#D:\ProgramData\Anaconda3\envs\py36\python.exe code.py 127.0.0.1
import web
import urllib.request
import json
import time

urls = (
    '/', 'index',
    '/movie/(\d+)', 'movie',
)

render = web.template.render('templates/')

#web.py 连接 SQLite
db = web.database(dbn='sqlite', db='MovieSite.db')

def add_movie(data):
    movie = json.loads(data)
    #print (movie['title'])
    db.insert('movie',
        id = int(movie['id']),
        title = movie['title'],
        origin = movie['original_title'],
        url = movie['alt'],
        rating = movie['rating']['average'],
        image = movie['images']['large'],
        directors = ','.join([d['name'] for d in movie['directors']]),
        casts = ','.join([c['name'] for c in movie['casts']]),
        year = movie['year'],
        genres = ','.join(movie['genres']),
        countries = ','.join(movie['countries']),
        summary = movie['summary'],
    )

def movie_exist(data):
    movie = json.loads(data)
    n_id = int(movie['id'])
    pass

class index:
    def GET(self):
        movies = db.select('movie')
        return render.index(movies)

    def POST(self):
        data = web.input()
        condition = r'title like "%' + data.title + r'%"'
        movies = db.select('movie', where=condition)
        return render.index(movies)

class movie:
    def GET(self, movie_id):
        condition = 'id=' + movie_id
        movie = db.select('movie', where=condition)[0]
        return render.movie(movie)

def get_movie_ids(total_num, step_num):
    movie_ids = []
    for index in range(0, total_num, step_num):
        response = urllib.request.urlopen('http://api.douban.com/v2/movie/top250?start=%d&count=step_num' % index)
        data = response.read()
        data_json = json.loads(data)
        movie_total = data_json['subjects']
        #print(len(movie_total))
        for movie in movie_total:
            movie_ids.append(movie['id'])
            print(movie['id'], movie['title'])
        time.sleep(3)
    #print(movie_ids)
    #print(len(movie_ids))
    return  movie_ids

def store_movie(movie_ids):
    count = 0
    for mid in movie_ids:
        #print (count, mid)
        try:
            response = urllib.request.urlopen('http://api.douban.com/v2/movie/subject/%s' % mid)
            data = response.read()
            add_movie(data)
            count += 1
            time.sleep(3)
        except:
            print('movie %s is not found' % mid)

movie_ids = get_movie_ids(30, 10)
store_movie(movie_ids)

if __name__ == "__main__":
    app = web.application(urls, globals())
    app.run()

第一次运行,看到数据库已经导入记录成功了,终端里显示http://127.0.0.1:8080/
但是浏览器打开网页之后,页面一直没有显示,这时候看到终端里又自动重新运行了一遍,直到被豆瓣接口拒绝
老师能不能看下这是什么问题?
回复

使用道具 举报

2

主题

0

好友

476

积分

中级会员

Rank: 3Rank: 3

板凳
发表于 2018-5-21 20:26:50 |显示全部楼层
crossin先生 发表于 2018-5-21 10:57
movie_ids = get_movie_ids(30, 10)
store_movie(movie_ids)

好了,现在差不多了,把movie_ids的结果放到一个文件里,每次从文件读取,如果文件不存在再做get_movie_ids
不过MovieSite.db里仍然可能重复保存,浪费时间,而且弄不好又被豆瓣封掉了,我找找怎么样可以做db.insert之前先检测一下相同的id是否已经存在
回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即加入

QQ|手机版|Archiver|Crossin的编程教室 ( 苏ICP备15063769号  

GMT+8, 2024-5-3 13:52 , Processed in 0.020301 second(s), 24 queries .

Powered by Discuz! X2.5

© 2001-2012 Comsenz Inc.

回顶部