- 帖子
- 1
- 精华
- 0
- 积分
- 17
- 阅读权限
- 10
- 注册时间
- 2013-11-8
- 最后登录
- 2013-11-20
|
回帖奖励 +3 金钱
我也写了一份自己的版本,但还比较简陋:只抓取了豆瓣主要类别标签页中前9页的电影,然后按评价人数排了一个序,总数大概4000多部。有时候会有点问题,每次运行得到的数量不太一致,估计和网络环境以及有些编码问题没解决好有关(欢迎大家拍砖):
#!/usr/bin/python
#coding:utf-8
import sys
import os
import re
'''
@author:wxfengyun
新浪围脖:http://weibo.com/u/1490668587
'''
import urllib2
from bs4 import BeautifulSoup
def FindMovie(mysoup, movies=None):
    """Collect movie titles and rating counts from one parsed tag page.

    Every ``<a class="nbg">`` element is treated as one movie and its
    ``title`` attribute is used as the movie name.  The rating count is the
    first run of digits found in the text of the next ``<span class="pl">``
    that follows that anchor in the document.

    Args:
        mysoup: parsed page (a BeautifulSoup object or tag) to search.
        movies: dict updated in place with ``title -> rating count``.
            Defaults to the module-level ``g_Movie`` table.

    Returns:
        None.  Results are written into ``movies``; a title that is seen
        again overwrites its earlier count.
    """
    if movies is None:
        movies = g_Movie

    count_pattern = re.compile(r'\d+')
    for anchor in mysoup.find_all('a', {"class": "nbg"}):
        title = anchor.get('title')
        if not title:
            # An anchor without a title cannot be used as a key.
            continue

        # Search for the count starting from this anchor instead of walking
        # a second, independent chain of spans: one missing or extra span
        # then affects a single movie rather than shifting all later pairs.
        count_span = anchor.find_next('span', {"class": "pl"})
        if count_span is None:
            continue

        match = count_pattern.search(count_span.get_text() or u'')
        if match:
            movies[title] = int(match.group())
# Python 2 only: re-expose sys.setdefaultencoding and make implicit
# str <-> unicode conversions use UTF-8, so the Chinese tag names and titles
# can be formatted into URLs and printed without UnicodeError.
reload(sys)
sys.setdefaultencoding('utf8')

# Result table filled in by FindMovie: movie title -> number of ratings.
g_Movie = {}


def _fetch_soup(url):
    """Download *url* and return it parsed, or None if the request fails."""
    try:
        html = urllib2.urlopen(url).read()
    except IOError as err:
        # urllib2.URLError/HTTPError and socket errors all derive from
        # IOError; one unreachable page should not abort the whole crawl.
        sys.stderr.write("skipped %s: %s\n" % (url, err))
        return None
    return BeautifulSoup(html)


# The index page lists every tag as a link inside its first <tbody>.
index_soup = _fetch_soup("http://movie.douban.com/tag/?view=type")
tag_table = index_soup.tbody if index_soup is not None else None
tag_links = tag_table.find_all('a') if tag_table is not None else []

# Visit each tag page.
for tag_link in tag_links:
    tag_soup = _fetch_soup(
        "http://movie.douban.com/tag/%s" % tag_link.get_text())
    if tag_soup is None:
        continue

    # The paginator marks the current page with <span class="thispage">;
    # the links that follow it lead to the subsequent pages.
    # NOTE(review): the tag's first page itself is never passed to FindMovie
    # (that call was commented out in the original) -- confirm this is
    # intended.
    page_link = tag_soup.find('span', {"class": "thispage"})
    for _ in range(9):
        # Follow up to nine links after the current-page marker.
        if page_link is None:
            break
        page_link = page_link.find_next('a')
        if page_link is None or not page_link.get('href'):
            break
        page_soup = _fetch_soup(page_link['href'])
        if page_soup is not None:
            FindMovie(page_soup)

# Print the ranking, most-rated movie first.
print(u"\t\t\t 豆瓣电影排行榜\n")
ranking = sorted(g_Movie.items(), key=lambda e: e[1], reverse=True)
for rank, (title, votes) in enumerate(ranking, 1):
    print(u"%d: %s(%s)" % (rank, title, votes))
|
|