您好,欢迎访问一九零五行业门户网

Python实现多线程抓取妹子图

心血来潮写了个多线程抓妹子图,虽然代码还是有一些瑕疵,但是还是记录下来,分享给大家。
pic_downloader.py
# -*- coding: utf-8 -*-
"""Multi-threaded gallery image downloader (reconstructed from a mangled paste).

Scans a fixed number of paginated listing pages, collects every ``.jpg`` URL
found in the page source, then downloads the images concurrently with a
thread pool.  Originally written for Python 2 (``urllib2``); ported here to
Python 3 (``urllib.request``).
"""
import os
import random
import sys
import time
import urllib.request
from multiprocessing.dummy import Pool as ThreadPool

# --- configuration -------------------------------------------------------
MAX_PAGE = 2          # number of listing pages to scan
TIME_OUT = 60         # per-request timeout in seconds
THREAD_NUM = 30       # worker count for each thread pool
URL_ADDRESS = 'http://sexy.faceks.com/?page='
# Hosts whose image URLs are skipped (presumably thumbnail/ad hosts —
# TODO confirm the original author's intent).
FILTER_LIST = ['imgsize.ph.126.net', 'img.ph.126.net', 'img2.ph.126.net']

type_ = sys.getfilesystemencoding()
count = 0             # images saved so far (shared by download workers)
pic_list = []         # collected image URLs (appended by page workers)


def rename():
    """Return a timestamp string used as the download directory name."""
    return time.strftime('%Y%m%d%H%M%S')


def rename_2(name):
    """Zero-pad *name* to at least three digits and append ``.jpg``."""
    # Equivalent to the original 1/2/other-length branches.
    return name.zfill(3) + '.jpg'


def is_wanted(content):
    """Return True when *content* contains none of the blocked hosts.

    The original (named ``filter``, shadowing the builtin) returned True as
    soon as ONE blocklist entry was absent, which let almost every URL
    through; require ALL entries to be absent instead.
    """
    return all(host.strip('\n') not in content for host in FILTER_LIST)


def download_pic(i):
    """Download one image URL *i* into the current working directory."""
    global count
    if not is_wanted(i):
        return
    try:
        content = urllib.request.urlopen(i, timeout=TIME_OUT)
        # Random prefix reduces filename collisions between threads.
        name = repr(random.randint(10000, 999999999)) + '_' + rename_2(repr(count))
        with open(name, 'wb') as f:       # with-block closes the file on error
            f.write(content.read())
        count += 1
    except Exception:
        print(i + ' 下载超时,跳过!')


def get_pic(url_address):
    """Collect every ``.jpg`` URL found in the page at *url_address*."""
    try:
        page = urllib.request.urlopen(url_address, timeout=TIME_OUT).read()
        text = page.decode('utf-8', 'ignore')
        # Assumes image URLs sit inside quoted attributes — the delimiter was
        # lost in the original paste; TODO confirm against the live page.
        for token in text.split('"'):
            if token.find('.jpg') != -1:
                pic_list.append(token)
    except Exception:
        print('获取图片超时,跳过!')


def main():
    """Create the target directory, scan all pages, download every image."""
    dir_name = 'c:\\photos\\' + rename()
    os.makedirs(dir_name)
    os.chdir(dir_name)
    start_time = time.time()

    page_list = [URL_ADDRESS + repr(i) for i in range(1, MAX_PAGE + 1)]
    page_pool = ThreadPool(THREAD_NUM)
    page_pool.map(get_pic, page_list)
    print('获取到', len(pic_list), '张图片,开始下载!')

    pool = ThreadPool(THREAD_NUM)
    pool.map(download_pic, pic_list)
    pool.close()
    pool.join()
    print(count, '张图片保存在' + dir_name)
    print('共耗时', time.time() - start_time, 's')


if __name__ == '__main__':
    main()
我们来看下一个网友的作品
#coding: utf-8 ############################################################## file name: main.py# author: mylonly# mail: mylonly@gmail.com# created time: wed 11 jun 2014 08:22:12 pm cst##########################################################################!/usr/bin/pythonimport re,urllib2,htmlparser,threading,queue,time#各图集入口链接htmldoorlist = []#包含图片的hmtl链接htmlurllist = []#图片url链接queueimageurllist = queue.queue(0)#捕获图片数量imagegetcount = 0#已下载图片数量imagedownloadcount = 0#每个图集的起始地址,用于判断终止nexthtmlurl = ''#本地保存路径localsavepath = '/data/1920x1080/'#如果你想下你需要的分辨率的,请修改replace_str,有如下分辨率可供选择1920x1200,1980x1920,1680x1050,1600x900,1440x900,1366x768,1280x1024,1024x768,1280x800replace_str = '1920x1080'replaced_str = '960x600'#内页分析处理类class imagehtmlparser(htmlparser.htmlparser):def __init__(self):self.nexturl = ''htmlparser.htmlparser.__init__(self)def handle_starttag(self,tag,attrs):global imageurllistif(tag == 'img' and len(attrs) > 2 ):if(attrs[0] == ('id','bigimg')):url = attrs[1][1]url = url.replace(replaced_str,replace_str)imageurllist.put(url)global imagegetcountimagegetcount = imagegetcount + 1print urlelif(tag == 'a' and len(attrs) == 4):if(attrs[0] == ('id','pagenext') and attrs[1] == ('class','next')):global nexthtmlurlnexthtmlurl = attrs[2][1];#首页分析类class indexhtmlparser(htmlparser.htmlparser):def __init__(self):self.urllist = []self.index = 0self.nexturl = ''self.taglist = ['li','a']self.classlist = ['photo-list-padding','pic']htmlparser.htmlparser.__init__(self)def handle_starttag(self,tag,attrs):if(tag == self.taglist[self.index]):for attr in attrs:if (attr[1] == self.classlist[self.index]):if(self.index == 0):#第一层找到了self.index = 1else:#第二层找到了self.index = 0print attrs[1][1]self.urllist.append(attrs[1][1])breakelif(tag == 'a'):for attr in attrs:if (attr[0] == 'id' and attr[1] == 'pagenext'):self.nexturl = attrs[1][1]print 'nexturl:',self.nexturlbreak#首页hmtl解析器indexparser = indexhtmlparser()#内页html解析器imageparser = imagehtmlparser()#根据首页得到所有入口链接print '开始扫描首页...'host = 
'http://desk.zol.com.cn'indexurl = '/meinv/'while (indexurl != ''):print '正在抓取网页:',host+indexurlrequest = urllib2.request(host+indexurl)try:m = urllib2.urlopen(request)con = m.read()indexparser.feed(con)if (indexurl == indexparser.nexturl):breakelse:indexurl = indexparser.nexturlexcept urllib2.urlerror,e:print e.reasonprint '首页扫描完成,所有图集链接已获得:'htmldoorlist = indexparser.urllist#根据入口链接得到所有图片的urlclass getimageurl(threading.thread):def __init__(self):threading.thread.__init__(self)def run(self):for door in htmldoorlist:print '开始获取图片地址,入口地址为:',doorglobal nexthtmlurlnexthtmlurl = ''while(door != ''):print '开始从网页%s获取图片...'% (host+door)if(nexthtmlurl != ''):request = urllib2.request(host+nexthtmlurl)else:request = urllib2.request(host+door)try:m = urllib2.urlopen(request)con = m.read()imageparser.feed(con)print '下一个页面地址为:',nexthtmlurlif(door == nexthtmlurl):breakexcept urllib2.urlerror,e:print e.reasonprint '所有图片地址均已获得:',imageurllistclass getimage(threading.thread):def __init__(self):threading.thread.__init__(self)def run(self):global imageurllistprint '开始下载图片...'while(true):print '目前捕获图片数量:',imagegetcountprint '已下载图片数量:',imagedownloadcountimage = imageurllist.get()print '下载文件路径:',imagetry:cont = urllib2.urlopen(image).read()patter = '[0-9]*\.jpg';match = re.search(patter,image);if match:print '正在下载文件:',match.group()filename = localsavepath+match.group()f = open(filename,'wb')f.write(cont)f.close()global imagedownloadcountimagedownloadcount = imagedownloadcount + 1else:print 'no match'if(imageurllist.empty()):breakexcept urllib2.urlerror,e:print e.reasonprint '文件全部下载完成...'get = getimageurl()get.start()print '获取图片链接线程启动:'time.sleep(2)download = getimage()download.start()print '下载图片链接线程启动:'
批量抓取指定网页上的所有图片
# -*- coding: utf-8 -*-
"""Batch-download every image referenced on a web page (reconstructed).

Fetches a fixed URL (a Baidu image-search results page for "python"),
extracts every image URL with a regex, and saves each image under
``outpath`` with a page/sequence file name.  Originally Python 2
(``urllib``/``urllib2``); ported to Python 3.
"""
import os
import re
import urllib.parse
import urllib.request

# Target page and local output directory.
url = ('http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592'
       '&cl=2&lm=-1&st=-1&fm=index&fr=&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1'
       '&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=python'
       '&oq=python&rsp=-1')
outpath = 't:\\'

# Matches http/https URLs ending in a known image extension.  The original
# spelled out each scheme/extension pair by hand and accidentally listed
# https+jpeg twice while omitting https+jpg, so https JPGs never matched;
# a single `https?` alternation covers both schemes uniformly.
IMG_URL_RE = re.compile(r'(https?:\/\/[^\s,]*\.(?:jpg|jpeg|png|gif|bmp))')


def gethtml(url):
    """Fetch *url* and return the page source as text."""
    with urllib.request.urlopen(url) as webfile:
        outhtml = webfile.read().decode('utf-8', 'ignore')
    print(outhtml)
    return outhtml


def getimagelist(html):
    """Return every image URL (http or https; jpg/jpeg/png/gif/bmp) in *html*."""
    imglist = IMG_URL_RE.findall(html)
    print(imglist)
    return imglist


def download(imglist, page):
    """Save each URL in *imglist* under ``outpath``.

    File names are ``pic_<page>_<sequence><original extension>``, lowercased.
    """
    for x, imgurl in enumerate(imglist, start=1):
        # Keep the source file's extension; drop any percent-encoding first.
        ext = os.path.splitext(urllib.parse.unquote(imgurl).split('/')[-1])[1]
        filepathname = (outpath + 'pic_%09d_%010d' % (page, x) + ext).lower()
        print('[debug] download file :' + imgurl + ' >> ' + filepathname)
        urllib.request.urlretrieve(imgurl, filepathname)


def downimagenum(pagenum):
    """Fetch the target page *pagenum* times, downloading all images each pass."""
    for page in range(1, pagenum + 1):
        html = gethtml(url)              # page source of the target URL
        imagelist = getimagelist(html)   # all image URLs, as a list
        download(imagelist, page)        # save every image


if __name__ == '__main__':
    downimagenum(1)
以上就是给大家汇总的3款python实现的批量抓取妹纸图片的代码了,希望对大家学习python爬虫能够有所帮助。
其它类似信息

推荐信息