#!/usr/bin/env python
#-------------------------------------------------------------------------------
# Name: FimFiction_downloader
# Purpose: Automated bulk download of stories from fimfiction.net
#
# Author: new
#
# Created: 10/07/2013
# Copyright: (c) new 2013
# Licence: <your licence>
#-------------------------------------------------------------------------------
import mechanize
import cookielib
import re
import os
import time
from urllib2 import URLError, HTTPError
from httplib import BadStatusLine
import random
import logging
# Delay/retry constants used by getwithinfo(), in seconds
GET_REQUEST_DELAY = 0
GET_REQUEST_DELAY_UPPER_RANDOM = 1
GET_RETRY_DELAY = 30
GET_MAX_ATTEMPTS = 20
LOGGING_LOG_PATH = "FimFiction_Downloader_log.txt"
def delay(basetime, upperrandom=10):
    """Sleep for basetime seconds plus a random extra, so requests don't hit
    the server at a predictable rate."""
    sleeptime = basetime + random.randint(0, upperrandom)
    logger.debug("pausing for "+str(sleeptime)+" ...")
    time.sleep(sleeptime)
def get(url):
    """Try to retrieve a url, returning just the response body.
    Returns None if the url could not be retrieved.
    Example usage:
    html = get(url)
    if html:
        ...
    """
    gettuple = getwithinfo(url)
    if gettuple:
        reply, info = gettuple
        return reply
def getwithinfo(url):
    """Try to retrieve a url, returning a (reply, info) tuple.
    Returns None if the url could not be retrieved.
    Example usage:
    gettuple = getwithinfo(url)
    if gettuple:
        reply, info = gettuple
    """
    attemptcount = 0
    while attemptcount < GET_MAX_ATTEMPTS:
        attemptcount = attemptcount + 1
        if attemptcount > 1:
            logger.debug("Attempt "+str(attemptcount))
        try:
            r = br.open(url)
            info = r.info()
            reply = r.read()
            delay(GET_REQUEST_DELAY, GET_REQUEST_DELAY_UPPER_RANDOM)
            # Save html responses for debugging
            if "html" in info["content-type"]:
                savefile("debug\\get_last_html.htm", reply, True)
            return reply, info
        except HTTPError, err:
            logger.debug(str(err))
            if err.code == 404:
                logger.debug('404 error')
                return
            elif err.code == 410:
                logger.debug('410 error, GONE')
                return
            else:
                savefile("debug\\error.htm", err.fp.read(), True)
        except URLError, err:
            logger.debug(str(err))
            # err.reason may be a socket.error rather than a string
            if 'unknown url type:' in str(err.reason):
                return
        except BadStatusLine, err:
            logger.debug(str(err))
        delay(GET_RETRY_DELAY)
    logger.error("Too many failed attempts, giving up on "+url)
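# Hedged usage sketch, not called anywhere: fetch a page and inspect its
# headers. The url is only a placeholder, and setupbrowser() must have been
# run first so the global "br" exists.
def example_getwithinfo():
    gettuple = getwithinfo("https://www.fimfiction.net/")
    if gettuple:
        reply, info = gettuple
        logger.debug("content-type: "+str(info["content-type"]))
        logger.debug("reply length: "+str(len(reply)))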
def savefile(filenamein, data, force_save=False):
    """Write data to a sanitized version of filenamein, creating parent
    directories as needed. Existing files are skipped unless force_save."""
    sanitizedpath = sanitizepath(filenamein)
    if not force_save:
        if os.path.exists(sanitizedpath):
            logger.debug("file already exists! "+str(sanitizedpath))
            return
    foldername = os.path.dirname(sanitizedpath)
    if foldername and not os.path.isdir(foldername):
        os.makedirs(foldername)
    with open(sanitizedpath, "wb") as f:
        f.write(data)
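# Hedged usage sketch, not called anywhere: savefile skips existing files
# unless force_save is set. The path and payloads are made-up placeholders.
def example_savefile():
    savefile("debug\\example.txt", "first version")  # written
    savefile("debug\\example.txt", "second version")  # skipped, file exists
    savefile("debug\\example.txt", "third version", True)  # overwritten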
def sanitizepath(pathin):
    """Sanitize a filepath for use on Windows.
    http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247%28v=vs.85%29.aspx
    """
    assert isinstance(pathin, str)
    segments = []
    workingpath = pathin  # Make a copy for easier debugging
    # Split the path into segments
    while True:
        previouspath = workingpath
        workingpath, segment = os.path.split(workingpath)
        segments.append(segment)
        if (len(workingpath) <= 0) or (workingpath == previouspath):
            break  # Fully split, or stuck at a root such as "/" or "C:\\"
    segments.reverse()
    # Sanitize each segment
    processedsegments = []
    for segment in segments:
        s0 = re.sub(r'[^A-Za-z0-9 ._]+', '-', segment)  # Replace runs of non-alphanumeric characters with hyphens
        s1 = s0.strip()  # Strip whitespace so it doesn't get turned into hyphens
        s2 = re.sub(r'[<>:"/\\|?*]+', '-', s1)  # Replace characters forbidden in Windows filenames
        s3 = s2.strip()  # Strip whitespace again
        s4 = s3.strip(".-")  # Strip characters that shouldn't be at the ends of filenames
        s5 = re.sub(r" +", " ", s4)  # Collapse repeated spaces
        s6 = re.sub(r"-+", "-", s5)  # Collapse repeated hyphens
        s7 = re.sub(r"_+", "_", s6)  # Collapse repeated underscores
        processedsegments.append(s7)
    # Rejoin the sanitized segments
    pathout = os.path.join(*processedsegments)
    assert isinstance(pathout, str)
    return pathout
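# Hedged sketch, not called anywhere: the filename below is a made-up example.
# On Windows this should print: download\my -story-.txt
def example_sanitizepath():
    print sanitizepath("download\\my <story>??.txt")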
def setupbrowser():
    """Initialize browser object into the global variable "br" using the
    cookie jar "cj"."""
    global br
    br = mechanize.Browser()
    br.set_cookiejar(cj)
    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Follow refresh 0 but don't hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # Spoof a regular browser User-Agent so requests aren't rejected
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
def extract_filename_from_http_metadata(metadata):
    """Read the filename out of a Content-Disposition header, stripping the
    surrounding quotes."""
    filename = metadata['content-disposition'].split('filename=')[1][1:][:-1]
    return filename
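# Hedged sketch, not called anywhere: the header value below is a made-up
# example of the shape this parser expects, a quoted filename= parameter.
def example_extract_filename():
    metadata = {"content-disposition": 'attachment; filename="story.epub"'}
    print extract_filename_from_http_metadata(metadata)  # prints: story.epub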
def download_story(download_dir_path, story_id):
    """Download a story in all three formats using FimFiction's download urls."""
    # Turn the story_id integer into the zero-padded string the links expect
    story_id_string = str(story_id).rjust(6, "0")
    # Form download urls
    story_html_download_url = "https://www.fimfiction.net/download_story.php?story="+story_id_string+"&html"
    story_txt_download_url = "https://www.fimfiction.net/download_story.php?story="+story_id_string
    story_epub_download_url = "https://www.fimfiction.net/download_epub.php?story="+story_id_string
    download_urls = [story_html_download_url, story_txt_download_url, story_epub_download_url]
    # Download each format
    for download_url in download_urls:
        gettuple = getwithinfo(download_url)
        if not gettuple:
            return False
        download_data, download_info = gettuple
        remote_filename = extract_filename_from_http_metadata(download_info)
        save_filename = story_id_string+" - "+remote_filename
        save_path = os.path.join(download_dir_path, save_filename)
        savefile(save_path, download_data)
    return True
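# Hedged usage sketch, not called anywhere: grab a single story by id. The id
# and folder name are arbitrary placeholders.
def example_download_story():
    if download_story(os.path.join("download", "single"), 1):
        logger.debug("got story 1 as html, txt, and epub")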
def download_stories(download_dir_path, start_number, stop_number):
    """Download story ids start_number through stop_number, giving up early
    after max_total_skipped failures overall or max_sequential_skipped in a row."""
    logger.debug("downloading stories from "+str(start_number)+" to "+str(stop_number))
story_counter = start_number
max_total_skipped = 10000
max_sequential_skipped = 100
total_skipped_counter = 0
sequential_skipped = 0
while (story_counter <= stop_number) and (total_skipped_counter <= max_total_skipped) and (sequential_skipped <= max_sequential_skipped):
logger.debug("downloading story id: "+str(story_counter))
download_success = download_story(download_dir_path,story_counter)
if not download_success:
total_skipped_counter += 1
sequential_skipped += 1
else:
sequential_skipped = 0
story_counter += 1
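# Hedged usage sketch, not called anywhere: fetch a small id range instead of
# the full 1..1000000 sweep that main() performs. The range is arbitrary.
def example_download_stories():
    download_stories(os.path.join("download", "sample"), 100, 110)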
def main():
    # Save each run into a dated folder and sweep the whole story id space;
    # the skip limits in download_stories() end the run once ids dry up
    date_of_run_start = time.strftime('%Y %m %d')
    download_folder = os.path.join("download", date_of_run_start)
    download_stories(download_folder, 1, 1000000)
# Set up logging before any downloading code runs
# http://inventwithpython.com/blog/2012/04/06/stop-using-print-for-debugging-a-5-minute-quickstart-guide-to-pythons-logging-module/
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
fh = logging.FileHandler(LOGGING_LOG_PATH)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
logger.addHandler(ch)
logger.debug('Logging started.')
if __name__ == '__main__':
try:
cj = cookielib.LWPCookieJar()
setupbrowser()
main()
logger.debug("End of run.")
    # Log any unhandled exception before exiting
    except Exception, e:
        logger.critical("Unhandled exception!")
        logger.exception(e)