Python crawler source code share: a Tumblr crawler that downloads a blogger's videos and photos

2018-10-30 小惟

This is a spider that can crawl every video and photo a Tumblr blogger has posted and save them to your local disk.

Personally tested and working in 2018; whether what it pulls down is safe for work is another question.

I didn't write this crawler myself, I'm just putting it to use. The details are all in the code:

# -*- coding: utf-8 -*-

from __future__ import print_function  # keep print() consistent on Python 2 and 3

import os
import sys
import requests
import xmltodict
from six.moves import queue as Queue
from six.moves import input  # raw_input on Python 2, input on Python 3
from threading import Thread
import re
import json
import random



# Setting timeout
TIMEOUT = 15

# Retry times
RETRY = 5

# Medium Index Number that Starts from
START = 0

# Numbers of photos/videos per page
MEDIA_NUM = 20

# Numbers of downloading threads concurrently
THREADS = 5



def video_hd_match():
    hd_pattern = re.compile(r'.*"hdUrl":("([^\s,]*)"|false),')

    def match(video_player):
        hd_match = hd_pattern.match(video_player)
        try:
            if hd_match is not None and hd_match.group(1) != 'false':
                return hd_match.group(2).replace('\\', '')
        except:
            return None
    return match


def video_default_match():
    default_pattern = re.compile(r'.*src="(\S*)" ', re.DOTALL)

    def match(video_player):
        default_match = default_pattern.match(video_player)
        if default_match is not None:
            try:
                return default_match.group(1)
            except:
                return None
    return match



class DownloadWorker(Thread):
    def __init__(self, queue, proxies=None):
        Thread.__init__(self)
        self.queue = queue
        self.proxies = proxies
        self._register_regex_match_rules()

    def run(self):
        while True:
            medium_type, post, target_folder = self.queue.get()
            self.download(medium_type, post, target_folder)
            self.queue.task_done()

    def download(self, medium_type, post, target_folder):
        try:
            medium_url = self._handle_medium_url(medium_type, post)
            if medium_url is not None:
                self._download(medium_type, medium_url, target_folder)
        except TypeError:
            pass

    # can register different regex match rules
    def _register_regex_match_rules(self):
        # will iterate all the rules
        # the first matched result will be returned
        self.regex_rules = [video_hd_match(), video_default_match()]

    def _handle_medium_url(self, medium_type, post):
        try:
            if medium_type == "photo":
                return post["photo-url"][0]["#text"]

            if medium_type == "video":
                video_player = post["video-player"][1]["#text"]
                for regex_rule in self.regex_rules:
                    matched_url = regex_rule(video_player)
                    if matched_url is not None:
                        return matched_url
                else:
                    raise Exception
        except:
            raise TypeError("Unable to find the right url for downloading. "
                            "Please open a new issue on "
                            "https://github.com/dixudx/tumblr-crawler/"
                            "issues/new attached with below information:\n\n"
                            "%s" % post)

    def _download(self, medium_type, medium_url, target_folder):
        medium_name = medium_url.split("/")[-1].split("?")[0]
        if medium_type == "video":
            if not medium_name.startswith("tumblr"):
                medium_name = "_".join([medium_url.split("/")[-2],
                                        medium_name])

            medium_name += ".mp4"
        file_path = os.path.join(target_folder, medium_name)
        if not os.path.isfile(file_path):
            print("Downloading %s from %s.\n" % (medium_name,
                                                 medium_url))
            retry_times = 0
            while retry_times < RETRY:
                try:
                    resp = requests.get(medium_url,
                                        headers=headers,
                                        stream=True,
                                        proxies=self.proxies,
                                        timeout=TIMEOUT)
                    if resp.status_code == 403:
                        retry_times = RETRY
                        print("Access Denied when retrieve %s.\n" % medium_url)
                        raise Exception("Access Denied")
                    with open(file_path, 'wb') as fh:
                        for chunk in resp.iter_content(chunk_size=1024):
                            fh.write(chunk)
                    break
                except:
                    # try again
                    pass
                retry_times += 1
            else:
                try:
                    os.remove(file_path)
                except OSError:
                    pass
                print("Failed to retrieve %s from %s.\n" % (medium_type,
                                                            medium_url))



class CrawlerScheduler(object):

    def __init__(self, sites, proxies=None):
        self.sites = sites
        self.proxies = proxies
        self.queue = Queue.Queue()
        self.scheduling()

    def scheduling(self):
        # create workers
        for x in range(THREADS):
            worker = DownloadWorker(self.queue,
                                    proxies=self.proxies)
            # Setting daemon to True will let the main thread exit
            # even though the workers are blocking
            worker.daemon = True
            worker.start()

        for site in self.sites:
            self.download_media(site)

    def download_media(self, site):
        self.download_photos(site)
        self.download_videos(site)

    # To download videos before photos, just swap the two calls above!

    def download_videos(self, site):
        self._download_media(site, "video", START)
        # wait for the queue to finish processing all the tasks from one
        # single site
        self.queue.join()
        print("Finish Downloading All the videos from %s" % site)

    def download_photos(self, site):
        self._download_media(site, "photo", START)
        # wait for the queue to finish processing all the tasks from one
        # single site
        self.queue.join()
        print("Finish Downloading All the photos from %s" % site)

    def _download_media(self, site, medium_type, start):
        current_folder = os.getcwd()
        target_folder = os.path.join(current_folder, site)
        if not os.path.isdir(target_folder):
            os.mkdir(target_folder)

        base_url = "http://{0}.tumblr.com/api/read?type={1}&num={2}&start={3}"
        start = START
        while True:
            media_url = base_url.format(site, medium_type, MEDIA_NUM, start)
            response = requests.get(media_url,
                                    proxies=self.proxies)
            if response.status_code == 404:
                print("Site %s does not exist" % site)
                break

            try:
                xml_cleaned = re.sub(u'[^\x20-\x7f]+',
                                     u'', response.content.decode('utf-8'))
                data = xmltodict.parse(xml_cleaned)
                posts = data["tumblr"]["posts"]["post"]
                for post in posts:
                    try:
                        # if post has photoset, walk into photoset for each photo
                        photoset = post["photoset"]["photo"]
                        for photo in photoset:
                            self.queue.put((medium_type, photo, target_folder))
                    except:
                        # select the largest resolution
                        # usually in the first element
                        self.queue.put((medium_type, post, target_folder))
                start += MEDIA_NUM
            except KeyError:
                break
            except UnicodeDecodeError:
                print("Cannot decode response data from URL %s" % media_url)
                continue
            except:
                print("Unknown xml-vulnerabilities from URL %s" % media_url)
                continue



def usage():
    print("1. Please create file sites.txt under this same directory.\n"
          "2. In sites.txt, you can specify tumblr sites separated by "
          "comma/space/tab/CR. Accept multiple lines of text\n"
          "3. Save the file and retry.\n\n"
          "Sample File Content:\nsite1,site2\n\n"
          "Or use command line options:\n\n"
          "Sample:\npython tumblr-photo-video-ripper.py site1,site2\n\n\n")
    print(u"未找到sites.txt文件,请创建.\n"
          u"请在文件中指定Tumblr站点名,并以 逗号/空格/tab/表格鍵/回车符 分割,支持多行.\n"
          u"保存文件并重试.\n\n"
          u"例子: site1,site2\n\n"
          u"或者直接使用命令行参数指定站点\n"
          u"例子: python tumblr-photo-video-ripper.py site1,site2")


def illegal_json():
    print("Illegal JSON format in file 'proxies.json'.\n"
          "Please refer to 'proxies_sample1.json' and 'proxies_sample2.json'.\n"
          "And go to http://jsonlint.com/ for validation.\n\n\n")
    print(u"文件proxies.json格式非法.\n"
          u"请参照示例文件'proxies_sample1.json'和'proxies_sample2.json'.\n"
          u"然后去 http://jsonlint.com/ 进行验证.")


def parse_sites(filename):
    with open(filename, "r") as f:
        raw_sites = f.read().rstrip().lstrip()

    raw_sites = raw_sites.replace("\t", ",") \
                         .replace("\r", ",") \
                         .replace("\n", ",") \
                         .replace(" ", ",")
    raw_sites = raw_sites.split(",")

    sites = list()
    for raw_site in raw_sites:
        site = raw_site.lstrip().rstrip()
        if site:
            sites.append(site)
    return sites



if __name__ == "__main__":
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    sites = None
    useragent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4620.400 QQBrowser/9.7.13014.400"
    ]

    headers = {
        'User-Agent': random.choice(useragent_list)}
    proxies = None
    proxy_path = os.path.join(cur_dir, "proxies.json")
    if os.path.exists(proxy_path):
        with open(proxy_path, "r") as fj:
            try:
                proxies = json.load(fj)
                if proxies is not None and len(proxies) > 0:
                    print("You are using proxies.\n%s" % proxies)
            except:
                illegal_json()
                sys.exit(1)

    if len(sys.argv) < 2:
        # check the sites file
        filename = os.path.join(cur_dir, "sites.txt")
        if os.path.exists(filename):
            sites = parse_sites(filename)
        else:
            usage()
            sys.exit(1)
    else:
        sites = sys.argv[1].split(",")

    if len(sites) == 0 or sites[0] == "":
        usage()
        sys.exit(1)
    try:
        CrawlerScheduler(sites, proxies=proxies)
    except:
        print("Unexpected error:", sys.exc_info())
        input('press enter key to exit')

As shown above, all you need to do is save the .py file and run it from the command line on Windows or Linux: python ****.py ???

Here ??? is the blogger name you want to crawl, that is, the *** part of ***.tumblr.com; you can chain several bloggers together with commas or spaces.
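For concreteness, here is what the two ways of pointing the script at its targets look like. I'm assuming the file was saved as tumblr_ripper.py; that name is just an example and is not fixed by the script.

# pass the blog names directly on the command line
python tumblr_ripper.py blog1,blog2

# or create sites.txt next to the script and run it with no arguments;
# inside sites.txt the names may be separated by commas, spaces, tabs or newlines
python tumblr_ripper.py

Each blog gets its own folder, named after the blog, under the directory you run the command from.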

Of course, you can get all of this from reading the code; complete beginners won't manage to use it either way.

 

As stated at the start, this isn't my code; it comes from https://github.com/dixudx/tumblr-crawler

 

The only thing I added is a randomly chosen User-Agent header, because I kept getting blocked while crawling; adding the header solved it.
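The script above picks one random User-Agent at startup and reuses it for every download. Below is a minimal sketch of the same idea taken a step further, choosing a fresh header on every request; the fetch helper and the abridged USER_AGENTS list are mine, not part of the original script.

import random
import requests

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
]

def fetch(url, proxies=None, timeout=15):
    # rotate the User-Agent on every single request instead of once at startup
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    return requests.get(url, headers=headers, stream=True,
                        proxies=proxies, timeout=timeout)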

When crawling, just turn on a system-wide Shadowsocks (ss) proxy; to me that is the fastest and most convenient option. If you would rather wire a proxy into the code, go tinker with that yourself.
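If you do want the script itself to use a proxy instead, note that it simply json.load()s a file called proxies.json from its own directory and hands the resulting dict straight to requests. Assuming a local SOCKS5 proxy such as a Shadowsocks client listening on 127.0.0.1:1080, the file could look like the sketch below (SOCKS support also requires installing the optional requests[socks] extra):

{
    "http": "socks5h://127.0.0.1:1080",
    "https": "socks5h://127.0.0.1:1080"
}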

Finally, I wrapped the entry point in a try statement, because there's no telling what strange errors show up once this gets packaged into an exe. That's about it.

Since a crawler like this needs such frequent maintenance, I won't publish my packaged exe; the more people hammer it, the sooner the code stops working.

In short, the code is here for anyone who wants to learn from it. Even after Tumblr redesigns its site, the overall logic will stay the same and a few small tweaks will keep it working, but I won't be updating it.

If anyone needs an up-to-date working version later on, leave me a comment, or look on that author's GitHub linked above; there are plenty of open-source crawlers all over GitHub!

If you don't feel like writing your own, make good use of what's already out there.

