1 23 / 3 页

「一道大数据习题」豆瓣评论最多的三千部电影


21 金钱	回复本帖可获得 3 金钱奖励! 每人限 1 次

wxfengyun

0 主题	0 好友	17 积分

新手上路

Rank: 1

发消息

21^#

发表于 2013-11-9 00:01:58 |只看该作者

回帖奖励 +3 金钱

我也写了一份自己的版本，但是还比较简陋，只是抓取了豆瓣主要类别标签页中前9页的电影，然后排了一个序，总数大概4000多，有时候会有点问题，数量每次运行不太一致，估计和网络环境，以及有些编码问题没解决好有关（欢迎大家拍砖）：

#!/usr/bin/python
#coding:utf-8
import sys
import os
import re

'''
@author：wxfengyun
新浪围脖：http://weibo.com/u/1490668587
'''
import urllib2
from bs4 import BeautifulSoup

def FindMovie(mysoup):
    """Record every movie found on one parsed listing page into ``g_Movie``.

    The page is scanned for two parallel sequences of elements:
    ``<a class="nbg">`` (whose ``title`` attribute is the movie name) and
    ``<span class="pl">`` (whose text contains the rating count).  The n-th
    anchor is paired with the n-th span.
    NOTE(review): this lock-step pairing assumes each movie entry has
    exactly one of each element -- confirm against the live page markup.

    Args:
        mysoup: a parsed page (BeautifulSoup object) to search.

    Returns:
        None.  Results are stored in the module-level ``g_Movie`` dict as
        ``g_Movie[title] = int(first run of digits in the span text)``.
    """
    nameFind = mysoup.find('a', {"class": "nbg"})
    pinglunFind = mysoup.find('span', {"class": "pl"})

    # Stop as soon as either sequence runs out.  Continuing with only one
    # of them would pair a title with the previous movie's count (or crash
    # on None), so a page without entries is simply a no-op.
    while nameFind is not None and pinglunFind is not None:
        # Movie title; entries without a title attribute are skipped.
        name = nameFind.get('title')

        # Rating count: the first run of digits in the span text.  Spans
        # whose text carries no digits produce no entry.
        match = re.search(r'\d+', pinglunFind.get_text())

        if name and match:
            g_Movie[name] = int(match.group())

        nameFind = nameFind.find_next('a', {"class": "nbg"})
        pinglunFind = pinglunFind.find_next('span', {"class": "pl"})

# Python 2 only: re-import sys so that setdefaultencoding is reachable, then
# make implicit str <-> unicode conversions use UTF-8.
# NOTE(review): this changes the default encoding for the whole process;
# presumably it is here so the Chinese titles and tag names can be
# formatted and printed without explicit encode/decode calls -- confirm.
reload(sys)
sys.setdefaultencoding('utf8')

# Global result table filled by FindMovie: movie title -> rating count (int).
g_Movie = {}

# Local import: urllib.quote is needed to percent-encode tag names below.
import urllib


def _fetch_soup(url):
    """Download *url* and return it parsed, or None if the request failed.

    urllib2.URLError/HTTPError and socket errors are IOError subclasses in
    Python 2, so one failing page is reported on stderr and skipped instead
    of aborting the whole crawl.
    """
    try:
        response = urllib2.urlopen(url)
        try:
            html = response.read()
        finally:
            # Always release the connection, even when read() fails.
            response.close()
    except IOError as err:
        sys.stderr.write("skip %r: %r\n" % (url, err))
        return None
    return BeautifulSoup(html)


# Index page that lists the category tags.
url = "http://movie.douban.com/tag/?view=type"
soup = _fetch_soup(url)

# The tag links are read from the first <tbody> of the index page; if the
# page could not be fetched or has no <tbody>, there is nothing to crawl.
div_hot = soup.tbody if soup is not None else None
tag_links = div_hot.find_all('a') if div_hot is not None else []

# Visit the listing page of every tag.
for i in tag_links:

    tag = i.get_text().strip()
    if not tag:
        continue

    # Tag names may be non-ASCII; percent-encode the UTF-8 bytes so the
    # request URL is valid.
    url = "http://movie.douban.com/tag/%s" % urllib.quote(tag.encode('utf-8'))

    soup = _fetch_soup(url)
    if soup is None:
        continue

    # Pagination marker of the current page.
    # NOTE: the tag's own landing page is not passed to FindMovie; only the
    # pages reached through the links that follow this marker are scraped
    # (same as the original script, where that call was commented out).
    nameFind = soup.find('span', {"class": "thispage"})

    for j in range(0, 9):
        # Follow up to 9 links that come after the pagination marker.
        if nameFind is None:
            break

        nameFind = nameFind.find_next('a')
        if nameFind is None or not nameFind.get('href'):
            break

        page = _fetch_soup(nameFind['href'])
        if page is not None:
            FindMovie(page)

print u"\t\t\t 豆瓣电影排行榜\n"

ff = sorted(g_Movie.items(), key=lambda e:e[1], reverse=True)

count = 1

for t in ff:
print u"%d: %s(%s)" % (count, t[0], t[1])
count += 1

使用道具举报