#       4chan thread image collector
#
#
#
#       Author:
#       Date:   9/Aug/2013
#       License: GNU General Public License
#
#       Purpose:
#               Supply a valid 4chan thread URL, and optionally a download directory.
#               Will then find every image in the specified thread and download it.
#
#       Usage:
#               4search.py <thread url> [-d download location] [-h{elp}]
 
 
 
 
from os                 import chdir,getcwd,mkdir,remove
from os.path    import isdir,isfile
from platform   import system
from re                 import finditer,match
from subprocess import call,check_call,check_output
from sys                import argv
from urllib     import urlretrieve
 
 
class imgboard_info:
        """Interface shared by every imageboard description in this file.

        Both methods are placeholders that subclasses are expected to
        override; calling either of them directly always raises.
        """

        def __init__(self,thread_url):
                """Parse ``thread_url``.

                Raises:
                        NotImplementedError: always, because a subclass must
                                provide the real constructor.
                """
                # NotImplementedError is the conventional signal for an abstract
                # method and is still an Exception, so broad handlers keep working.
                raise NotImplementedError("imgboard_info.__init__ not overridden")

        def download_from_match(self,image_match):
                """Download the image described by the regex match ``image_match``.

                Raises:
                        NotImplementedError: always, because a subclass must
                                provide the real implementation.
                """
                raise NotImplementedError("imgboard_info.download_from_match not overridden")
 
# Class holding the board name and thread ID number of a 4chan thread
class _4chan_thread_info(imgboard_info):
        """Description of one 4chan thread, built from its URL.

        Attributes set by the constructor:
                site            -- always '4chan'
                thread_pattern  -- regex a thread URL has to match
                image_pattern   -- regex locating image links in the thread's
                                   HTML (only set when the URL matched)
                board           -- board name taken from the URL
                thread          -- everything that follows '/res/' in the URL
        """

        # Constructor
        def __init__(self,thread_url):
                """Parse ``thread_url`` as a 4chan thread address.

                Raises:
                        NameError: carrying ``thread_url`` when the URL does not
                                match ``thread_pattern``.
                """
                self.site = '4chan'

                print("Checking URL against 4chan format...")

                # Attempt to pull the board name and thread ID from a URL using regex.
                # The scheme is optional and may be http or https.
                self.thread_pattern = r'(https?://)?boards\.4chan\.org/([^/]*)/res/(.*)'
                thread_match = match(self.thread_pattern, thread_url)

                # If the thread is invalid, print so and throw an exception
                if thread_match is None:
                        print("\tNot a 4chan thread")
                        raise NameError(thread_url)

                # Otherwise, pull the correct groups
                print("\tURL is a 4chan thread")
                self.image_pattern = (
                        r'a class="fileThumb'       # All images start with this
                        r'( imgspoiler)?" '         # Catches spoilered images
                        # Optional target attribute ahead of href.  The closing
                        # quote is optional so that both the well-formed
                        # target="_blank" and the quote-less text the previous
                        # pattern looked for are accepted.
                        r'(target="_blank"? )?'
                        r'href="//'                 # HTML attribute for link location
                        r'(images\.4chan\.org/'     # The start of the image URL
                        r'[^/]*'                    # The board
                        r'/src/)'                   # All images are stored in this directory
                        r'([^\.]*\.[^"]*)'          # The filename is the final part of the URL
                )
                self.board = thread_match.group(2)
                self.thread = thread_match.group(3)
        # End of __init__

        def download_from_match(self, image_match):
                """Fetch one image unless a file with its name already exists.

                ``image_match`` is a match of ``image_pattern``: group 3 holds
                the host and directory part of the image URL and group 4 the
                filename.  The image is saved under that filename relative to
                the current working directory.

                Returns:
                        The filename of the image.
                """
                image_name = image_match.group(4)
                image_url = image_match.group(3)+image_name
                # An existing file of the same name is taken as already downloaded
                if not isfile(image_name):
                        urlretrieve('http://'+image_url, image_name)
                return image_name

        # End of download_from_match


# End of class _4chan_thread_info
 
 
class mlpchan_thread_info(imgboard_info):
        """Description of one mlpchan thread, built from its URL.

        Attributes set by the constructor:
                site            -- always 'mlpchan'
                thread_pattern  -- regex a thread URL has to match
                image_pattern   -- regex locating image links in the thread's
                                   HTML (only set when the URL matched)
                board           -- board name taken from the URL
                thread          -- everything that follows '/res/' in the URL
        """

        # Constructor
        def __init__(self,thread_url):
                """Parse ``thread_url`` as an mlpchan thread address.

                Raises:
                        NameError: carrying ``thread_url`` when the URL does not
                                match ``thread_pattern``.
                """
                self.site = 'mlpchan'

                print("Checking URL against mlpchan format...")

                # Attempt to pull the board name and thread ID from a URL using regex
                self.thread_pattern = r'(http://|https://)?mlpchan\.net/([^/]*)/res/(.*)'
                thread_match = match(self.thread_pattern, thread_url)

                # If the thread is invalid, print so and throw an exception
                if thread_match is None:
                        print("\tNot a mlpchan thread")
                        raise NameError(thread_url)

                # Otherwise, pull the correct groups
                print("\tURL is an mlpchan thread")
                # Group 1 is the site-relative directory, group 2 the filename
                self.image_pattern = r'a href="(/[^/]*/src/)([^"]*)"'
                self.board = thread_match.group(2)
                self.thread = thread_match.group(3)

        # End of __init__


        def download_from_match(self, image_match):
                """Fetch one image unless a file with its name already exists.

                ``image_match`` is a match of ``image_pattern``: group 1 holds
                the site-relative directory and group 2 the filename.  The image
                is always requested from http://mlpchan.net and saved under its
                filename relative to the current working directory.

                Returns:
                        The filename of the image.
                """
                image_name = image_match.group(2)
                image_url = image_match.group(1)+image_name
                # An existing file of the same name is taken as already downloaded
                if not isfile(image_name):
                        urlretrieve('http://mlpchan.net'+image_url, image_name)
                return image_name

        # End of download_from_match


# End of class mlpchan_thread_info
 
 
 
 
# Class used to download all images from a 4chan thread
class _4chan_thread_collector:
 
        # Constructor
        def __init__(self,argv):
 
                self.info_types = {_4chan_thread_info, mlpchan_thread_info}
               
                # Record the command line arguments
                self.args = argv
               
        # end of __init__
       
       
        # Run the downloader
        def run_downloader(self):
                print "Starting"
                # Record the starting directory
                self.starting_dir = getcwd()
               
                # Process command line arguments to figure out the correct thread URL
                #       and download location
                thread_url = self.process_arguments()
               
                # Loop through each of the possible imageboards to fine if any
                #       match the given URL
                for next_type in self.info_types:
                        try:
                                self.thread_info = next_type(thread_url)
                                break;
                        except NameError:
                                self.thread_info = None
               
               
                if self.thread_info == None:
                        print "URL matches no known imageboard, exiting"
                        quit()
               
                # Create the directory for the board and thread,
                #       and move into that directory
                self.create_directories(self.thread_info)
               
                # Download the HTML source for the thread
                urlretrieve(thread_url,self.thread_info.thread)
               
                # Search the HTML source for every image in the thread and download them
                self.download_images(thread_url,self.thread_info)
               
                # Delete the HTML source file and navigate back to where we started
                remove(self.thread_info.thread)
                chdir(self.starting_dir)
               
        # End of run_downloader
       
       
        # Print a help message
        def print_help(self):
                print 'Usage:'
                print '4search.py thread_url [-d download_location] [-h to display help]'
               
        # End of print_help
       
       
        # Process the incoming command line arguments
        def process_arguments(self):
                print self.args
                # If there are no argument, print the help message and exit
                if len(self.args)==1:
                        self.print_help()
                        quit()
                       
                # If the -h flag is present, print the help message
                if "-h" in self.args:
                        hswitch = self.args.index("-h")
                        self.args.pop(hswitch)
                        self.print_help()
                       
                # if the -d flag is present, navigate to the given directory
                #       create it if necessary
                if "-d" in self.args:
                        dswitch = self.args.index("-d")
                        self.args.pop(dswitch)
                        directory = self.args.pop(dswitch)
                        if not isdir(directory):
                                mkdir(directory)
                        chdir(directory)
                # After evaluating any flags, if there are 2 arguments,
                #       the 2nd is the thread URL
                if len(self.args)==2:
                        return self.args[1]
                       
                # If there are any other number of arguments, exit
                else:
                        print 'Bad arg list'
                        quit()
               
        # End of process_arguments
       
       
        # Create directories for the board and thread
        def create_directories(self,thread_info):
                site = thread_info.site
                board = thread_info.board
                thread = thread_info.thread
                if not isdir(site):
                        mkdir(site)
                chdir(site)
                if not isdir(board):
                        mkdir(board)
                chdir(board)
                if not isdir(thread):
                        mkdir(thread)
                chdir(thread)
               
        # End of create_directories
       
       
        # Parse a downloaded HTML source for thread images and download each
        def download_images(self,thread_url,thread_info):
               
                # Open the HTML source
                f = open(thread_info.thread, 'r')
               
                # Read each line of the source
                while True:
                        line = f.readline()
                        if line=="":
                                break
                       
                        # Create an iterator for each match of the pattern on that line
                        images = finditer(thread_info.image_pattern, line)
                        # Iterate through each match, pull the image URL from the pattern
                        #       and download that image
                        try:
                                while True:
                                        print thread_info.download_from_match(images.next())+' Successfully Retrieved'
                        except StopIteration:
                                print 'Finished'
                                pass
               
        # End of download_images
 
       
# End of class _4chan_thread_collector
 
# Actual Script:
# Create an instance of the collector
# Pass the command line arguments to its constructor
# Call the download method
# Only done when this file is executed as a program, so that importing it
#       does not start a download
if __name__ == '__main__':
        _4chan_thread_collector(argv).run_downloader()