#!/usr/bin/env python
#-------------------------------------------------------------------------------
# Name: FimFiction_downloader
# Purpose: Automated bulk download of stories from fimfiction.com
#
# Author: new
#
# Created: 10/07/2013
# Copyright: (c) new 2013
# Licence: <your licence>
#-------------------------------------------------------------------------------
import re
import os
import time
import random
import logging
import mechanize
import cookielib
from urllib2 import URLError, HTTPError
from httplib import BadStatusLine
# Tuning constants for getwithinfo()
GET_REQUEST_DELAY = 0  # base pause after every successful request (seconds)
GET_REQUEST_DELAY_UPPER_RANDOM = 1  # extra random pause, 0..N seconds
GET_RETRY_DELAY = 30  # base pause after a failed request (seconds)
GET_MAX_ATTEMPTS = 20  # give up on a URL after this many tries
LOGGING_LOG_PATH = "FimFiction_Downloader_log.txt"
def delay(basetime, upperrandom=10):
    """Replacement for a bare time.sleep(); adds a random extra delay to be sneaky."""
    sleeptime = basetime + random.randint(0, upperrandom)
    logger.debug("pausing for "+str(sleeptime)+" ...")
    time.sleep(sleeptime)
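# For example (with the constants above), delay(GET_RETRY_DELAY) sleeps for
# somewhere between 30 and 40 seconds.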
def get(url):
    """Try to retrieve a url; return None if unable to.
    Example usage:
        html = get(url)
        if html:
            ...
    """
    gettuple = getwithinfo(url)
    if gettuple:
        reply, info = gettuple
        return reply
def getwithinfo(url):
    """Try to retrieve a url; return a (reply, info) tuple, or None if unable to.
    Example usage:
        result = getwithinfo(url)
        if result:
            ...
    """
    attemptcount = 0
    while attemptcount < GET_MAX_ATTEMPTS:
        attemptcount += 1
        if attemptcount > 1:
            print 'Attempt', attemptcount
        try:
            r = br.open(url)
            info = r.info()
            reply = r.read()
            delay(GET_REQUEST_DELAY, GET_REQUEST_DELAY_UPPER_RANDOM)
            # Save html responses for debugging
            if "html" in info["content-type"]:
                savefile(os.path.join("debug", "get_last_html.htm"), reply, True)
            return reply, info
        except HTTPError, err:
            logger.debug(str(err))
            if err.code == 404:
                logger.debug('404 error')
                return
            elif err.code == 410:
                logger.debug('410 error, GONE')
                return
            else:
                savefile(os.path.join("debug", "error.htm"), err.fp.read(), True)
        except URLError, err:
            logger.debug(str(err))
            if 'unknown url type:' in str(err.reason):
                return
        except BadStatusLine, err:
            logger.debug(str(err))
        delay(GET_RETRY_DELAY)
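# Illustrative usage (a sketch): getwithinfo() returns None on a permanent
# failure (404/410) or after GET_MAX_ATTEMPTS tries, so unpack defensively:
#   result = getwithinfo("https://www.fimfiction.net/")
#   if result:
#       html, info = result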
def savefile(filenamein, data, force_save=False):
    """Write data to a sanitized path, creating folders as needed.
    Existing files are only overwritten when force_save is True."""
    if not force_save:
        if os.path.exists(filenamein):
            logger.debug("file already exists! "+str(filenamein))
            return
    sanitizedpath = sanitizepath(filenamein)
    foldername = os.path.dirname(sanitizedpath)
    if foldername and not os.path.isdir(foldername):
        os.makedirs(foldername)
    f = open(sanitizedpath, "wb")
    f.write(data)
    f.close()
def sanitizepath(pathin):
    """Sanitize a relative filepath for use on Windows (from pathsanitizer).
    http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247%28v=vs.85%29.aspx
    """
    assert isinstance(pathin, str)
    segments = []
    workingpath = pathin  # work on a copy for easier debugging
    # Split the path into segments
    while True:
        previouspath = workingpath
        workingpath, segment = os.path.split(workingpath)
        segments.append(segment)
        if len(workingpath) <= 0:
            break
        if workingpath == previouspath:
            # os.path.split() makes no further progress on a root or drive
            # (e.g. "\\" or "C:\\"); stop here to avoid an infinite loop.
            break
    segments.reverse()
    # Sanitize each segment
    processedsegments = []
    for segment in segments:
        s0 = re.sub(r'[^A-Za-z0-9 ._]+', '-', segment)  # replace runs of non-alphanumerics with a hyphen
        s1 = s0.strip()  # strip whitespace so it doesn't get turned into hyphens
        s2 = re.sub(r'[<>:"/\\|?*]+', '-', s1)  # replace forbidden characters
        s3 = s2.strip()  # strip whitespace
        s4 = s3.strip(".-")  # strip characters that shouldn't be at the ends of filenames
        s5 = re.sub(r" +", " ", s4)  # collapse repeated spaces
        s6 = re.sub(r"\-+", "-", s5)  # collapse repeated hyphens
        s7 = re.sub(r"\_+", "_", s6)  # collapse repeated underscores
        processedsegments.append(s7)
    # Rejoin the segments
    pathout = os.path.join(*processedsegments)
    assert isinstance(pathout, str)
    return pathout
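# Illustrative example (a sketch, assuming a Windows-style relative path):
#   sanitizepath("download\\My Story: Part 1?.txt")
#   -> "download\\My Story- Part 1-.txt"
# The ":" and "?" are forbidden on Windows, so each becomes a hyphen.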
def setupbrowser():
    """Initialize a mechanize browser to the global "br", using cookie jar "cj"."""
    global br
    br = mechanize.Browser()
    br.set_cookiejar(cj)
    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Follow refresh 0, but don't hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # User-Agent (this is cheating, ok?)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
def extract_filename_from_http_metadata(metadata):
    """Read a filename from the Content-Disposition header, cropping it to
    100 characters (plus extension) if it is too long."""
    # Assumes a header of the form: attachment; filename="..."
    filename = metadata['content-disposition'].split('filename=')[1][1:][:-1]
    filename = re.sub(r"[/\\]", r"", filename)  # remove slashes to stop path confusion
    # Crop the filename to 100 chars plus extension if it is over 100 chars
    if len(filename) > 100:
        filename_no_extension, file_extension = os.path.splitext(filename)
        length_to_crop_to = 100
        marker_string = "---"
        allowed_number_of_filename_chars = (length_to_crop_to - len(file_extension) - len(marker_string))
        cropped_filename_no_extension = filename_no_extension[0:allowed_number_of_filename_chars]
        cropped_filename = cropped_filename_no_extension + marker_string + file_extension
        filename = cropped_filename
    return filename
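# Illustrative example (assumed header shape; the live headers may differ):
#   extract_filename_from_http_metadata(
#       {'content-disposition': 'attachment; filename="My Story.epub"'})
#   -> 'My Story.epub'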
def make_story_filename(story_id_string, download_dir_path, download_metadata):
    """Build the local save path: "<dir>/<id> - <remote filename>"."""
    remote_filename = extract_filename_from_http_metadata(download_metadata)
    save_filename = story_id_string+" - "+remote_filename
    save_path = os.path.join(download_dir_path, save_filename)
    logger.debug(save_path)
    return save_path
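# Continuing the example above (hypothetical values):
#   make_story_filename("025110", "download", download_metadata)
#   -> os.path.join("download", "025110 - My Story.epub")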
def download_story(download_dir_path, story_id):
    """Download a story using fimfic's download urls."""
    # Change the storyid integer into a zero-padded string suitable for download links
    story_id_string = str(story_id).zfill(6)
    # Form download urls
    story_html_download_url = "https://www.fimfiction.net/download_story.php?story="+story_id_string+"&html"
    story_txt_download_url = "https://www.fimfiction.net/download_story.php?story="+story_id_string
    story_epub_download_url = "https://www.fimfiction.net/download_epub.php?story="+story_id_string
    download_urls = [story_html_download_url, story_txt_download_url, story_epub_download_url]
    # Download each story url
    for download_url in download_urls:
        # getwithinfo() returns None on a permanent failure, so check before unpacking
        gettuple = getwithinfo(download_url)
        if not gettuple:
            return False
        download_data, download_info = gettuple
        save_path = make_story_filename(story_id_string, download_dir_path, download_info)
        savefile(save_path, download_data)
    return True
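# Illustrative usage (a sketch): download_story("download", 25110) tries to
# save the HTML, TXT and EPUB renditions of story 25110 under "download/",
# returning False as soon as any one of the three downloads fails.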
def download_stories(download_dir_path, start_number, stop_number):
    """Walk story ids from start_number to stop_number, stopping early after
    too many failed (skipped) ids."""
    logger.debug("downloading stories from "+str(start_number)+" to "+str(stop_number))
    story_counter = start_number
    max_total_skipped = 10000
    max_sequential_skipped = 100
    total_skipped_counter = 0
    sequential_skipped = 0
    while (story_counter <= stop_number) and (total_skipped_counter <= max_total_skipped) and (sequential_skipped <= max_sequential_skipped):
        logger.debug("downloading story id: "+str(story_counter))
        download_success = download_story(download_dir_path, story_counter)
        if not download_success:
            total_skipped_counter += 1
            sequential_skipped += 1
            logger.debug("download failed, sequential fails: "+str(sequential_skipped)+" total fails: "+str(total_skipped_counter))
        else:
            sequential_skipped = 0
            logger.debug("download succeeded.")
        story_counter += 1
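# Illustrative usage: download_stories("download", 1, 1000) would sweep ids
# 1..1000, giving up after 100 consecutive or 10000 total failed ids.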
def main():
    # Group each run's downloads into a folder named for the year and month
    date_of_run_start = time.strftime('%Y %m')
    download_folder = os.path.join("download", date_of_run_start)
    download_stories(download_folder, 25110, 2000000)
# Setup logging (before running any other code)
# http://inventwithpython.com/blog/2012/04/06/stop-using-print-for-debugging-a-5-minute-quickstart-guide-to-pythons-logging-module/
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
# Log to a file
fh = logging.FileHandler(LOGGING_LOG_PATH)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)
# Log to the console as well
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
logger.addHandler(ch)
logger.debug('Logging started.')
if __name__ == '__main__':
    try:
        cj = cookielib.LWPCookieJar()
        setupbrowser()
        main()
        logger.debug("End of run.")
    # Log any unhandled exception before dying
    except Exception, e:
        logger.critical("Unhandled exception!")
        logger.exception(e)