
fimfic downloader

By: waterapple on Jul 20th, 2013  |  syntax: Python
#!/usr/bin/env python
#-------------------------------------------------------------------------------
# Name:        FimFiction_downloader
# Purpose:     Automated bulk download of stories from fimfiction.net
#
# Author:      new
#
# Created:     10/07/2013
# Copyright:   (c) new 2013
# Licence:     <your licence>
#-------------------------------------------------------------------------------


import mechanize
import cookielib
import re
import os
import time
import random
import logging
from urllib2 import URLError, HTTPError
from httplib import BadStatusLine

# Tuning constants for getwithinfo(): request pacing, retry backoff, retry cap
GET_REQUEST_DELAY = 0
GET_REQUEST_DELAY_UPPER_RANDOM = 1
GET_RETRY_DELAY = 30
GET_MAX_ATTEMPTS = 20

LOGGING_LOG_PATH = "FimFiction_Downloader_log.txt"

def delay(basetime, upperrandom=10):
    # Replacement for a bare time.sleep(); adds a random offset to be sneaky
    sleeptime = basetime + random.randint(0, upperrandom)
    logger.debug("pausing for "+str(sleeptime)+" ...")
    time.sleep(sleeptime)


def get(url):
    # Try to retrieve a url; returns None if unable.
    # Example usage:
    #   html = get(url)
    #   if html:
    #       ...
    gettuple = getwithinfo(url)
    if gettuple:
        reply, info = gettuple
        return reply

def getwithinfo(url):
    """Try to retrieve a url; returns (reply, info), or None if unable."""
    attemptcount = 0
    while attemptcount < GET_MAX_ATTEMPTS:
        attemptcount = attemptcount + 1
        if attemptcount > 1:
            logger.debug('Attempt '+str(attemptcount))
        try:
            r = br.open(url)
            info = r.info()
            reply = r.read()
            delay(GET_REQUEST_DELAY, GET_REQUEST_DELAY_UPPER_RANDOM)
            # Save html responses for debugging
            if "html" in info["content-type"]:
                savefile("debug\\get_last_html.htm", reply, True)
            return reply, info
        except HTTPError, err:
            logger.debug(str(err))
            if err.code == 404:
                logger.debug('404 error')
                return
            elif err.code == 410:
                logger.debug('410 error, GONE')
                return
            else:
                savefile("debug\\error.htm", err.fp.read(), True)
        except URLError, err:
            logger.debug(str(err))
            if 'unknown url type:' in str(err.reason):
                return
        except BadStatusLine, err:
            logger.debug(str(err))
        delay(GET_RETRY_DELAY)

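# Usage sketch for getwithinfo() (hypothetical url): a failed fetch yields None
# rather than a tuple, so guard before unpacking; info acts like a dict of headers.
#   gettuple = getwithinfo("https://www.fimfiction.net/")
#   if gettuple:
#       reply, info = gettuple
#       print info["content-type"]
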
def savefile(filenamein, data, force_save=False):
    if not force_save:
        if os.path.exists(filenamein):
            logger.debug("file already exists! "+str(filenamein))
            return
    sanitizedpath = sanitizepath(filenamein)
    foldername = os.path.dirname(sanitizedpath)
    # Guard against an empty dirname; os.makedirs("") would raise
    if foldername and not os.path.isdir(foldername):
        os.makedirs(foldername)
    outfile = open(sanitizedpath, "wb")  # renamed so it doesn't shadow the builtin "file"
    outfile.write(data)
    outfile.close()

def sanitizepath(pathin):
    # From pathsanitizer: sanitize a filepath for use on Windows.
    # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247%28v=vs.85%29.aspx
    assert(type(pathin) == type(""))
    segments = []
    workingpath = pathin  # make a copy for easier debugging
    # Split the path into segments
    while True:
        workingpath, segment = os.path.split(workingpath)
        segments.append(segment)
        if len(workingpath) <= 0:
            break
    segments.reverse()
    # Sanitize each segment
    processedsegments = []
    for segment in segments:
        s0 = re.sub(r'[^A-Za-z0-9 ._]+', '-', segment)  # replace runs of disallowed characters
        s1 = s0.strip()  # strip whitespace so it doesn't get turned into hyphens
        s2 = re.sub(r'[<>:"/\\|?*]+', '-', s1)  # replace Windows-forbidden characters
        s3 = s2.strip()  # strip whitespace again
        s4 = s3.strip(".-")  # strip characters that shouldn't be at the ends of filenames
        s5 = re.sub(r" +", " ", s4)  # collapse repeated spaces
        s6 = re.sub(r"-+", "-", s5)  # collapse repeated hyphens
        s7 = re.sub(r"_+", "_", s6)  # collapse repeated underscores
        processedsegments.append(s7)
    # Join the segments back into a path
    pathout = os.path.join(*processedsegments)
    assert(type(pathout) == type(""))
    return pathout

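# Example of what sanitizepath() produces, assuming Windows path semantics
# (hypothetical filename; the separator in the result differs by platform):
#   sanitizepath('debug/bad:name?.htm')  ->  'debug\\bad-name-.htm'
# Each run of forbidden characters collapses to a single '-'.
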
def setupbrowser():
    # Initialize browser object to global variable "br" using cookie jar "cj"
    # Browser
    global br
    br = mechanize.Browser()
    br.set_cookiejar(cj)
    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Follow refresh 0 but don't hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # User-Agent (this is cheating, ok?)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]


def extract_filename_from_http_metadata(metadata):
    # Read the filename from the Content-Disposition header, slicing off the
    # surrounding quotes
    filename = metadata['content-disposition'].split('filename=')[1][1:][:-1]
    return filename

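# Example with a hypothetical header value, showing the quote slicing:
#   metadata = {'content-disposition': 'attachment; filename="story.epub"'}
#   extract_filename_from_http_metadata(metadata)  ->  'story.epub'

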
def download_story(download_dir_path, story_id):
    """Download a story using FimFiction's download urls"""
    # Change the story id integer into a zero-padded string suitable for download links
    story_id_string = str(story_id).rjust(6, "0")
    # Form download urls
    story_html_download_url = "https://www.fimfiction.net/download_story.php?story="+story_id_string+"&html"
    story_txt_download_url = "https://www.fimfiction.net/download_story.php?story="+story_id_string
    story_epub_download_url = "https://www.fimfiction.net/download_epub.php?story="+story_id_string
    download_urls = [story_html_download_url, story_txt_download_url, story_epub_download_url]

    # Download story urls
    for download_url in download_urls:
        # Load url; getwithinfo() returns None on failure, so guard before unpacking
        gettuple = getwithinfo(download_url)
        if not gettuple:
            return False
        download_data, download_info = gettuple
        remote_filename = extract_filename_from_http_metadata(download_info)
        save_filename = story_id_string+" - "+remote_filename
        save_path = os.path.join(download_dir_path, save_filename)
        savefile(save_path, download_data)
    return True

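# For example, story id 42 pads to "000042", giving download urls like
# (scheme as formed above; the live site's urls may have changed since 2013):
#   https://www.fimfiction.net/download_story.php?story=000042&html
#   https://www.fimfiction.net/download_epub.php?story=000042
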
def download_stories(download_dir_path, start_number, stop_number):
    logger.debug("downloading stories from "+str(start_number)+" to "+str(stop_number))
    story_counter = start_number
    # Stop early if too many ids fail, either overall or in one unbroken run
    max_total_skipped = 10000
    max_sequential_skipped = 100
    total_skipped_counter = 0
    sequential_skipped = 0
    while (story_counter <= stop_number) and (total_skipped_counter <= max_total_skipped) and (sequential_skipped <= max_sequential_skipped):
        logger.debug("downloading story id: "+str(story_counter))
        download_success = download_story(download_dir_path, story_counter)
        if not download_success:
            total_skipped_counter += 1
            sequential_skipped += 1
        else:
            sequential_skipped = 0
        story_counter += 1

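# Usage sketch: fetch a small test range into a scratch folder (the folder name
# and id range here are hypothetical; setupbrowser() must run first so "br" exists):
#   cj = cookielib.LWPCookieJar()
#   setupbrowser()
#   download_stories(os.path.join("download", "test"), 1000, 1010)
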
def main():
    date_of_run_start = time.strftime('%Y %m %d')
    download_folder = os.path.join("download", date_of_run_start)
    download_stories(download_folder, 1, 1000000)

# Setup logging to both a file and the console (before running any other code)
# http://inventwithpython.com/blog/2012/04/06/stop-using-print-for-debugging-a-5-minute-quickstart-guide-to-pythons-logging-module/
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
fh = logging.FileHandler(LOGGING_LOG_PATH)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
logger.addHandler(ch)
logger.debug('Logging started.')

if __name__ == '__main__':
    try:
        cj = cookielib.LWPCookieJar()
        setupbrowser()
        main()
        logger.debug("End of run.")
    # Log any unhandled exceptions before dying
    except Exception, e:
        logger.critical("Unhandled exception!")
        logger.exception(e)