今天用selenium和casperjs2种对https://class.coursera.org/nlp/lecture网站的ppt、pdf、srt、mp4的下载地址进行数据抓取 1、pythonselenium #!/usr/bin/python# -*- coding: utf-8 -*-from selenium import webdriverfrom bs4 import beautifulsoupimport t
今天分别用 Selenium 和 CasperJS 两种方式,抓取 https://class.coursera.org/nlp/lecture 页面上 ppt、pdf、srt、mp4 的下载地址。
1、python+selenium
#!/usr/bin/python# -*- coding: utf-8 -*-from selenium import webdriverfrom bs4 import beautifulsoupimport timeimport sysreload(sys)sys.setdefaultencoding('utf-8')def catchdate(s): 页面数据提取 soup = beautifulsoup(s) z = [] m = soup.findall(ul,class_=course-item-list-div-list) for obj in m: try: print obj.previous_sibling.find('h3').get_text() tmp = obj.findall('li', class_=unviewed) for eachli in tmp: titleli = eachli.find('a').get_text() print ' '+titleli allaineachdiv = eachli.find('div', class_=course-lecture-item-resource).findall('a') for eacha in allaineachdiv: print ' '+eacha['href'] except exception, e: continue if(tmp != ): z.append(tmp) return zstarttime = time.time()driver = webdriver.phantomjs(executable_path='c:\phantomjs-1.9.7-windows\phantomjs.exe')driver.get(https://class.coursera.org/nlp/lecture)html = driver.page_sourcecontent = catchdate(html)endtime = time.time()print endtime - starttimedriver.quit
// 2. CasperJS
//
// Scrapes the lecture list of the Coursera NLP class and writes every section
// title, lecture title and resource link to content.txt.
//
// NOTE: CasperJS runs this script inside PhantomJS, whose JavaScript engine
// predates ES2015, so the code deliberately sticks to `var` and plain
// function expressions.
var casper = require('casper').create({
    clientScripts: ['jquery-1.7.js'],
    stepTimeout: 120 * 1000,
    pageSettings: {
        loadImages: false
    },
    verbose: true,
    logLevel: 'error'
});
var fs = require('fs');

var filename = 'content.txt';
var numberOfLinks = 0;
var fullContent = '';
var startTime = new Date();
var endTime;

/**
 * Builds the text dump of the lecture list.
 *
 * Runs in the page context through casper.evaluate(), so it must not reference
 * anything from this script's scope; it relies on the jQuery copy injected via
 * `clientScripts`.
 *
 * @returns {string} Section titles, lecture titles and resource hrefs.
 */
function collectContent() {
    var content = '';
    jQuery('.course-item-list-div-list').each(function () {
        var section = jQuery(this);
        content += section.prev().find('h3').text() + '\r\n';
        section.find('li').each(function () {
            var item = jQuery(this);
            content += item.find('a').first().text() + '\r';
            item.find('div a').each(function () {
                content += jQuery(this).attr('href') + '\r';
            });
            content += '\r\n';
        });
        content += '\r\n\r\n';
    });
    return content;
}

casper.start('https://class.coursera.org/nlp/lecture', function () {
    numberOfLinks = this.evaluate(function () {
        return __utils__.findAll('.course-item-list-div-list').length;
    });
    this.echo(numberOfLinks + ' items found');
});

// Steps are registered in the order they run, instead of each step scheduling
// the next one through an undeclared global function.
casper.then(function getStartTime() {
    this.echo(startTime);
});

casper.then(function getContent() {
    // evaluate() yields null when the in-page function fails; fall back to an
    // empty string so the write step still gets a string.
    fullContent = this.evaluate(collectContent) || '';
});

casper.then(function writeFile() {
    this.echo('writing to ' + filename);
    fs.write(filename, fullContent, 'w');
});

casper.then(function getEndTime() {
    endTime = new Date();
});

casper.then(function exitSystem() {
    // Elapsed time in milliseconds.
    this.echo(endTime - startTime);
    this.exit();
});

casper.run();
因为不熟练,感觉写得不太好,求大神对方法进行指导!!!
参考:
https://gist.github.com/imjared/5201405
http://casperjs.readthedocs.org/en/latest/modules/casper.html#evaluate
http://blog.csdn.net/u012577500/article/details/18185399
http://stackoverflow.com/questions/14894311/casperjs-windows-installation-how-is-it-done-the-correct-way-please
http://blog.csdn.net/sagomilk/article/details/20800543