查看: 23932|回复: 18

抓取美女图片的爬虫小程序

1 主题	0 好友	50 积分

注册会员

Rank: 2

发消息

电梯直达

楼主

发表于 2014-6-27 18:51:22 |只看该作者 |倒序浏览

一个python爬虫小程序，爬的是www.22mm.cc里面的美女图片，该如何把它改成多线程呢？？

#!/usr/bin/env python
#coding:UTF-8
import urllib
import re
import os
import os.path
# Running counter used by saveImg() to number the downloaded image files.
index=0
def getPage(url):
    """Fetch *url* and return the raw response body.

    The response handle is closed once the body has been read, even if
    reading fails, so repeated calls do not leak open connections.
    """
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()
def getSortLinkInfo(html):
    """Collect the gallery links found in the home page *html*.

    Returns a dict that maps each absolute gallery URL to the value of the
    anchor's ``title`` attribute. An empty dict is returned when nothing
    matches.
    """
    regex = re.compile(r'<a href="/[^\s]+?\.html" title="[^\s]+?"')
    links = {}
    for anchor in regex.findall(html):
        # Splitting on double quotes puts the href at index 1 and the
        # title at index 3.
        parts = anchor.split('"')
        links['http://www.22mm.cc' + parts[1]] = parts[3]
    return links
def getBeautyLinkInfo(link):
    """Return the relative path of the last page of the gallery at *link*.

    The gallery page is downloaded and searched for numbered pagination
    anchors; the href of the last one is returned. When the page has no
    such anchors an empty string is returned, so the result is always a
    string and callers can test it with ``len()`` or truthiness.
    """
    page = getPage(link)
    regex = re.compile(r"<a href='[^\s]+?-\d+?\.html'>\d+?</a>")
    anchors = regex.findall(page)
    if not anchors:
        return ''
    # The href is the text between the first pair of single quotes.
    return anchors[-1].split("'")[1]
def getImgLinks(lastLink):
    """Return the image URLs embedded in the page at *lastLink*.

    The page is downloaded and every ``arrayImg[0]="http://....jpg"``
    assignment is collected; the result is a (possibly empty) list of the
    captured URLs.
    """
    page = getPage(lastLink)
    # Raw string so the regex escapes are not read as string escapes.
    regex = re.compile(r'arrayImg\[0\]="(http://[^\s]+?\.jpg)"')
    return regex.findall(page)
def saveImg(imgLinks, dirname, base_dir='D:\\pics'):
    """Download every URL in *imgLinks* into a per-gallery folder.

    Args:
        imgLinks: iterable of image URLs to download.
        dirname: gallery title used as the folder name; a UTF-8 byte
            string is decoded, text is used unchanged.
        base_dir: root download folder. Defaults to the location that was
            previously hard-coded.

    Files are named ``<index>.jpg`` using the module-level ``index``
    counter, which is incremented after each download.
    """
    global index
    if isinstance(dirname, bytes):
        dirname = dirname.decode('utf8')
    target_dir = os.path.join(base_dir, dirname)
    # makedirs also creates base_dir when it is missing; an existing
    # folder (e.g. from an earlier run) is reused instead of raising.
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    for img_url in imgLinks:
        urllib.urlretrieve(img_url, os.path.join(target_dir, '%d.jpg' % index))
        print('%s has been downloaded and saved successfully.' % (img_url,))
        index += 1
indexURL='http://www.22mm.cc'
def start():
    """Crawl the home page and download images for every gallery on it.

    For each gallery link found on the home page, the gallery's last page
    is located, the image URLs on it are extracted (with 'big' replaced by
    'pic'), and the images are saved under the gallery's title.
    """
    homePage = getPage(indexURL)
    links = getSortLinkInfo(homePage)
    for url, dirname in links.items():
        relPath = getBeautyLinkInfo(url)
        if not relPath:
            # No pagination anchors on this gallery page: nothing to fetch.
            continue
        segments = url.split('/')
        # 'http://host/a/b/c.html'.split('/') puts the second path segment
        # at index 4 (presumably the category under /mm/). Shorter URLs
        # are skipped instead of raising IndexError.
        if len(segments) < 5:
            continue
        lastLink = 'http://www.22mm.cc/mm/' + segments[4] + '/' + relPath
        imgLinks = [re.sub('big', 'pic', src) for src in getImgLinks(lastLink)]
        saveImg(imgLinks, dirname)
# Kick off the crawl.
start()

复制代码

收藏2

使用道具举报

crossin先生

174 主题	45 好友	11万积分

管理员

Rank: 9 Rank: 9 Rank: 9

发消息

沙发

发表于 2014-6-28 16:57:06 |只看该作者

这个程序有意思
去看一下 thread 模块相关的用法

#==== Crossin的编程教室 ====#
微信ID：crossincode
网站：http://crossincode.com

使用道具举报

creek

1 主题	0 好友	50 积分

注册会员

Rank: 2

发消息

板凳

发表于 2014-6-30 23:19:22 |只看该作者

修改了一下，之前的只能抓取首页的图片，下面这个应该能爬取全站的

#!/usr/bin/env python
#coding:UTF-8
import urllib2
import urllib
import re
import os
def get_page(url):
    """Fetch *url* and return the response body, or False on failure.

    False is returned whenever the request cannot be completed (HTTP
    errors such as 404, or connection-level failures), so callers can
    simply test the result for truthiness.
    """
    req = urllib2.Request(url)
    try:
        response = urllib2.urlopen(req)
    except urllib2.URLError:
        # HTTPError is a subclass of URLError. A plain URLError has no
        # ``code`` attribute, so the exception is not inspected here.
        return False
    try:
        return response.read()
    finally:
        response.close()
def get_fen_lei_link(html):
    """Return the absolute URLs of the first four category links in *html*.

    A falsy *html* (for example the False that get_page() returns for a
    failed request) yields an empty list instead of raising.
    """
    if not html:
        return []
    regex = re.compile(r'<a href="/mm/[^\s]+?/" >')
    # The href is the text between the first pair of double quotes.
    return ['http://www.22mm.cc' + anchor.split('"')[1]
            for anchor in regex.findall(html)[:4]]
def get_taotu_link(specific_page):
    """Rebuild the global ``taotu_links`` from a listing page.

    The global dict is replaced by a fresh mapping of absolute gallery
    URL -> anchor title for every matching link in *specific_page*. The
    same dict is also returned so callers need not read the global. A
    falsy page leaves the mapping empty.
    """
    global taotu_links
    taotu_links = {}
    if not specific_page:
        return taotu_links
    regex = re.compile(r'<a href="/[^\s]+?\.html" title=".+?"')
    for anchor in regex.findall(specific_page):
        # Splitting on double quotes puts the href at index 1 and the
        # title at index 3.
        parts = anchor.split('"')
        taotu_links['http://www.22mm.cc' + parts[1]] = parts[3]
    return taotu_links
def get_taotu_last_link(taotu_link):
    """Return the relative path of the last page of the gallery.

    The gallery page at *taotu_link* is downloaded and searched for
    numbered pagination anchors; the href of the last one is returned.
    An empty string is returned when the page could not be fetched or has
    no pagination anchors, instead of raising IndexError/TypeError.
    """
    taotu_page = get_page(taotu_link)
    if not taotu_page:
        return ''
    regex = re.compile(r"<a href='[^\s]+?-\d+?\.html'>\d+?</a>")
    anchors = regex.findall(taotu_page)
    if not anchors:
        return ''
    # The href is the text between the first pair of single quotes.
    return anchors[-1].split("'")[1]
def get_temp_image_link(taotu_last_link):
    """Return the image URLs embedded in the page at *taotu_last_link*.

    Every ``arrayImg[<digit>]="http://....jpg"`` assignment in the page is
    collected. An empty list is returned when the page could not be
    fetched.
    """
    taotu_last_page = get_page(taotu_last_link)
    if not taotu_last_page:
        return []
    # NOTE(review): ``\d`` matches a single digit only, so entries with an
    # index of 10 or more are not picked up -- confirm this is intended.
    regex = re.compile(r'arrayImg\[\d\]="(http://[^\s]+?\.jpg)"')
    return regex.findall(taotu_last_page)
def save_image(image_links, dirname, base_dir='D:\\pic'):
    """Download every URL in *image_links* into a per-gallery folder.

    Args:
        image_links: iterable of image URLs to download.
        dirname: gallery title used as the folder name; a UTF-8 byte
            string is decoded, text is used unchanged.
        base_dir: root download folder. Defaults to the location that was
            previously hard-coded.

    Files are named ``<index>.jpg`` using the module-level ``index``
    counter, which is incremented after each download.
    """
    global index
    if isinstance(dirname, bytes):
        dirname = dirname.decode('utf8')
    target_dir = os.path.join(base_dir, dirname)
    # makedirs also creates base_dir when it is missing; an existing
    # folder (e.g. from an earlier run) is reused instead of raising.
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    for img_link in image_links:
        urllib.urlretrieve(img_link, os.path.join(target_dir, '%d.jpg' % index))
        print('%s has been downloaded and saved successfully.' % (img_link,))
        index += 1
def start():
    """Crawl the category listings and download each gallery's images.

    For every category link on the home page, up to three listing pages
    are visited. For each gallery on a listing page, the gallery's last
    page is located, its image URLs are extracted (with 'big' replaced by
    'pic') and the images are saved under the gallery's title.
    """
    print('Waiting............')
    url = 'http://www.22mm.cc'
    html = get_page(url)
    if not html:
        # Home page unavailable: nothing to crawl.
        return
    for category_url in get_fen_lei_link(html):
        # There are many galleries, so only the first three listing pages
        # of each category are crawled for now.
        for page_index in range(1, 4):
            if page_index == 1:
                page_url = category_url
            else:
                page_url = category_url + 'index_%d.html' % page_index
            specific_page = get_page(page_url)
            if not specific_page:
                break
            # Rebinds the module-level taotu_links for this listing page.
            get_taotu_link(specific_page)
            for key, dirname in taotu_links.items():
                segments = key.split('/')
                # 'http://host/a/b/c.html'.split('/') puts the second path
                # segment at index 4; skip URLs too short to have one.
                if len(segments) < 5:
                    continue
                # A gallery without pagination anchors has no last-page
                # path (IndexError or an empty result); skip it rather
                # than abort the whole crawl.
                try:
                    temp_taotu_last_link = get_taotu_last_link(key)
                except IndexError:
                    temp_taotu_last_link = ''
                if not temp_taotu_last_link:
                    continue
                taotu_last_link = ('http://www.22mm.cc/mm/' + segments[4]
                                   + '/' + temp_taotu_last_link)
                image_links = [re.sub('big', 'pic', each)
                               for each in get_temp_image_link(taotu_last_link)]
                save_image(image_links, dirname)
# taotu_links maps each gallery URL to its title; get_taotu_link() rebinds it.
taotu_links={}
# index is the running number used to name the downloaded image files.
index=0
# Kick off the crawl.
start()

复制代码

对于Python的多线程，我有好些疑问，搜索了也没得到满意的答案
1.一个进程里面同一时刻只能运行一个线程么？
2.倘若1成立，那么sleep()在实际的应用程序中不是反而拉低了效率？
提了两个很菜的问题，希望crossin先生看到了能顺手解答一下，多谢

使用道具举报

crossin先生

174 主题	45 好友	11万积分

管理员

Rank: 9 Rank: 9 Rank: 9

发消息

地板

发表于 2014-7-1 11:50:26 |只看该作者

creek 发表于 2014-6-30 23:19
修改了一下，之前的只能抓取首页的图片，下面这个应该能爬取全站的对于Python的多线程，我有好些疑问，搜索 ...

是的。如果cpu只能同时执行一个进程，那么多线程的计算没有什么好处，甚至还耽误了线程切换的时间。
但是多线程可以避免一个线程被阻塞住，导致其他任务无法进行的情况，这在有网络请求或者文件读写的时候很有用。相当于把等待对方服务器响应和下载的时间节省了下来。
另外，我不是很确定，多核cpu是否可以并行多个python线程。

#==== Crossin的编程教室 ====#
微信ID：crossincode
网站：http://crossincode.com

使用道具举报

liu-pengfei

0 主题	0 好友	558 积分

高级会员

Rank: 4

发消息

5^#

发表于 2014-9-22 19:07:59 |只看该作者

楼主，我复制了你的改进后的代码，运行的时候只是在命令行输出了wait...其他什么都没有，D盘没有图片。于是我又在D盘下手动新建一个文件夹，名字是pic，再次运行，可以了，但是下载了两组图片后，大概有十多张，就报错停止了，说是下标越界。重新运行，又是只是在命令行输出了wait...其他什么都没有。这是怎么回事啊？

学习，纯粹。

使用道具举报

小燕smile

0 主题	1 好友	38 积分

新手上路

Rank: 1

发消息

6^#

发表于 2015-9-7 13:13:52 |只看该作者

@crossin先生我自己也写了一个抓取图片的脚本，但是在下载图片到电脑的时候发生urllib.error.ContentTooShortError: <urlopen error retrieval incomplete: got only 36211 out of 508217 bytes>的错误，再次执行，却没有任何问题，怀疑是不是和网络有关？由于网速较慢，导致下载图片到内存中的部分较少或者内存占用率较高导致？有没有什么办法能够避免这种错误发生？

使用道具举报