#       4chan thread image collector
#
#       Author:
#       Date:   9/Aug/2013
#       License: GNU General Public License
#
#       Purpose:
#               Supply a valid 4chan thread URL, and optionally a download directory.
#               Will then find every image in the specified thread and download it.
#
#       Usage:
#               4search.py thread_url [-d download location] [-h{elp}]

import sys

from os import chdir, getcwd, mkdir, remove
from os.path import isdir, isfile
from platform import system
from re import finditer, match
from subprocess import call, check_call, check_output
from sys import argv

try:
    from urllib import urlretrieve              # Python 2
except ImportError:
    from urllib.request import urlretrieve      # Python 3

# The class names start with an underscore only because an identifier cannot
# start with a digit; list them explicitly so they are still exported.
__all__ = [
    'imgboard_info',
    '_4chan_thread_info',
    'mlpchan_thread_info',
    '_4chan_thread_collector',
]


class imgboard_info(object):
    """Interface every supported imageboard description must implement.

    Subclasses set ``site``, ``board``, ``thread``, ``thread_pattern`` and
    ``image_pattern`` in their constructor and raise ``NameError`` when the
    URL does not belong to their imageboard.
    """

    def __init__(self, thread_url):
        raise NotImplementedError("imgboard_info.__init__ not overridden")

    def download_from_match(self, image_match):
        raise NotImplementedError(
            "imgboard_info.download_from_match not overridden")


# Class holding the board name and thread ID number of a 4chan thread
class _4chan_thread_info(imgboard_info):

    def __init__(self, thread_url):
        """Parse *thread_url* as a 4chan thread.

        Raises:
            NameError: carrying the URL, if it is not a 4chan thread URL.
        """
        self.site = '4chan'
        print("Checking URL against 4chan format...")

        # Attempt to pull the board name and thread ID from a URL using regex.
        # Group 1 is the optional scheme, group 2 the board, group 3 the thread.
        self.thread_pattern = (
            r'(http://|https://)?boards\.4chan\.org/([^/]*)/res/(.*)')
        thread_match = match(self.thread_pattern, thread_url)

        # If the thread is invalid, print so and throw an exception
        if thread_match is None:
            print("\tNot a 4chan thread")
            raise NameError(thread_url)

        # Otherwise, pull the correct groups
        print("\tURL is a 4chan thread")
        self.image_pattern = (
            r'a class="fileThumb'       # All images start with this
            r'( imgspoiler)?" '         # Catches spoilered images
            r'(target="_blank" )?'      # This can come in random places
            r'href="//'                 # HTML tag for link location
            r'(images\.4chan\.org/'     # The start of the image URL
            r'[^/]*'                    # The board
            r'/src/)'                   # All images are stored in this directory
            r'([^\.]*\.[^"]*)'          # The filename is the final part of the URL
        )
        self.board = thread_match.group(2)
        self.thread = thread_match.group(3)

    def download_from_match(self, image_match):
        """Download the image described by a match of ``image_pattern``.

        The image is saved in the current directory under its own filename;
        nothing is fetched when a file of that name already exists.

        Returns:
            The image filename.
        """
        image_name = image_match.group(4)
        image_url = image_match.group(3) + image_name
        if not isfile(image_name):
            urlretrieve('http://' + image_url, image_name)
        return image_name


# Class holding the board name and thread ID number of an mlpchan thread
class mlpchan_thread_info(imgboard_info):

    def __init__(self, thread_url):
        """Parse *thread_url* as an mlpchan thread.

        Raises:
            NameError: carrying the URL, if it is not an mlpchan thread URL.
        """
        self.site = 'mlpchan'
        print("Checking URL against mlpchan format...")

        # Attempt to pull the board name and thread ID from a URL using regex.
        # Group 1 is the optional scheme, group 2 the board, group 3 the thread.
        self.thread_pattern = (
            r'(http://|https://)?mlpchan\.net/([^/]*)/res/(.*)')
        thread_match = match(self.thread_pattern, thread_url)

        # If the thread is invalid, print so and throw an exception
        if thread_match is None:
            print("\tNot a mlpchan thread")
            raise NameError(thread_url)

        # Otherwise, pull the correct groups
        print("\tURL is an mlpchan thread")
        # Group 1 is the site-relative directory, group 2 the filename
        self.image_pattern = r'a href="(/[^/]*/src/)([^"]*)"'
        self.board = thread_match.group(2)
        self.thread = thread_match.group(3)

    def download_from_match(self, image_match):
        """Download the image described by a match of ``image_pattern``.

        The image is saved in the current directory under its own filename;
        nothing is fetched when a file of that name already exists.

        Returns:
            The image filename.
        """
        image_name = image_match.group(2)
        image_url = image_match.group(1) + image_name
        if not isfile(image_name):
            urlretrieve('http://mlpchan.net' + image_url, image_name)
        return image_name


# Class used to download all images from an imageboard thread
class _4chan_thread_collector:

    def __init__(self, argv):
        """Remember the command line (program name first) for later parsing."""
        # A tuple, so the imageboards are always probed in the same order
        self.info_types = (_4chan_thread_info, mlpchan_thread_info)

        # Record a copy of the command line arguments: process_arguments
        # removes entries, and the caller's list (normally sys.argv) must
        # not be altered behind its back.
        self.args = list(argv)

    def run_downloader(self):
        """Parse the arguments, then download every image of the thread.

        Exits the interpreter (``SystemExit``) when the arguments are unusable
        or the URL matches no known imageboard.
        """
        print("Starting")
        # Record the starting directory
        self.starting_dir = getcwd()

        # Process command line arguments to figure out the correct thread URL
        #       and download location
        thread_url = self.process_arguments()

        # Loop through each of the possible imageboards to find if any
        #       match the given URL
        self.thread_info = None
        for next_type in self.info_types:
            try:
                self.thread_info = next_type(thread_url)
                break
            except NameError:
                self.thread_info = None

        if self.thread_info is None:
            print("URL matches no known imageboard, exiting")
            sys.exit()

        # The thread patterns accept a URL without a scheme, but urlretrieve
        # needs one to treat the string as a web address.
        if not thread_url.startswith(('http://', 'https://')):
            thread_url = 'http://' + thread_url

        try:
            # Create the directory for the board and thread,
            #       and move into that directory
            self.create_directories(self.thread_info)

            html_file = self.thread_info.thread
            try:
                # Download the HTML source for the thread
                urlretrieve(thread_url, html_file)

                # Search the HTML source for every image in the thread
                #       and download them
                self.download_images(thread_url, self.thread_info)
            finally:
                # Delete the HTML source file, even after a failed download
                if isfile(html_file):
                    remove(html_file)
        finally:
            # Navigate back to where we started
            chdir(self.starting_dir)

    def print_help(self):
        """Print a help message."""
        print('Usage:')
        print('4search.py thread_url [-d download_location] [-h to display help]')

    def process_arguments(self):
        """Evaluate the command line flags and return the thread URL.

        ``-h`` prints the help message.  ``-d DIR`` changes into DIR, creating
        it first if necessary.  Exits the interpreter when no argument was
        given, when ``-d`` has no value, or when anything other than exactly
        one URL remains after the flags are removed.
        """
        print(self.args)
        # If there are no arguments, print the help message and exit
        if len(self.args) == 1:
            self.print_help()
            sys.exit()

        # If the -h flag is present, print the help message
        if "-h" in self.args:
            self.args.remove("-h")
            self.print_help()

        # If the -d flag is present, navigate to the given directory
        #       create it if necessary
        if "-d" in self.args:
            dswitch = self.args.index("-d")
            self.args.pop(dswitch)
            # -d given as the very last argument: there is no directory
            if dswitch >= len(self.args):
                print('Bad arg list')
                sys.exit()
            directory = self.args.pop(dswitch)
            if not isdir(directory):
                mkdir(directory)
            chdir(directory)

        # After evaluating any flags, if there are 2 arguments,
        #       the 2nd is the thread URL
        if len(self.args) == 2:
            return self.args[1]

        # If there are any other number of arguments, exit
        print('Bad arg list')
        sys.exit()

    def create_directories(self, thread_info):
        """Create site/board/thread below the current directory and enter it."""
        for folder in (thread_info.site, thread_info.board, thread_info.thread):
            if not isdir(folder):
                mkdir(folder)
            chdir(folder)

    def download_images(self, thread_url, thread_info):
        """Download every image referenced by the saved HTML source.

        The source is read from the file named ``thread_info.thread`` in the
        current directory.  *thread_url* is accepted for interface
        compatibility and is not used.
        """
        # The file is closed on leaving the block, so the caller can delete it
        with open(thread_info.thread, 'r') as html_source:
            # Read each line of the source
            for line in html_source:
                # Iterate through each match of the pattern on that line,
                #       and download that image
                for image_match in finditer(thread_info.image_pattern, line):
                    image_name = thread_info.download_from_match(image_match)
                    print(image_name + ' Successfully Retrieved')
        print('Finished')


# Actual Script:
# Create an instance of the collector
# Pass the command line arguments to its constructor
# Call the download method
if __name__ == "__main__":
    _4chan_thread_collector(argv).run_downloader()