# 4chan thread image collector
#
#
#
# Author: Andrew Melnick
# Date: 9/Aug/2013
# License: GNU General Public License
#
# Purpose:
# Supply a valid 4chan thread URL, and optionally a download directory.
# Will then find every image in the specified thread and download it.
#
# Usage:
# 4search.py <thread url> [-d <download location>] [-h]
from os import chdir,getcwd,mkdir,remove
from os.path import isdir,isfile
from platform import system
from re import finditer,match
from subprocess import call,check_call,check_output
from sys import argv
from urllib import urlretrieve
class imgboard_info:
    """Interface that every per-site thread descriptor has to provide.

    Both methods here only exist to fail loudly when a subclass forgets to
    override them.
    """

    def __init__(self, thread_url):
        """Build the descriptor from *thread_url*; subclasses must override.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # NotImplementedError is still an Exception, so existing
        # `except Exception` handlers keep working.
        raise NotImplementedError("imgboard_info.__init__ not overridden")

    def download_from_match(self, image_match):
        """Download the image described by *image_match*; subclasses must override.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError(
            "imgboard_info.download_from_match not overridden")
# Class holding the site name, board name and thread ID of a 4chan thread,
# together with the regular expressions used to recognise the thread URL
# and the image links inside the thread's HTML
class _4chan_thread_info(imgboard_info):
    # Constructor
    def __init__(self, thread_url):
        """Parse *thread_url* as a 4chan thread URL.

        On success sets ``site``, ``thread_pattern``, ``image_pattern``,
        ``board`` (the path component before ``/res/``) and ``thread``
        (everything after ``/res/``).

        Raises:
            NameError: carrying *thread_url*, when the URL does not match
                the 4chan thread format.
        """
        self.site = '4chan'
        print("Checking URL against 4chan format...")
        # Attempt to pull the board name and thread ID from the URL.
        # The scheme is optional, and both http and https are accepted,
        # in line with the mlpchan pattern.
        self.thread_pattern = (
            r'(https?://)?boards\.4chan\.org/([^/]*)/res/(.*)'
        )
        thread_match = match(self.thread_pattern, thread_url)
        # If the URL is not a 4chan thread, say so and signal the caller
        if thread_match is None:
            print("\tNot a 4chan thread")
            raise NameError(thread_url)
        print("\tURL is a 4chan thread")
        # Group 3 is the host and directory of the image,
        # group 4 is the image's file name
        self.image_pattern = (
            'a class="fileThumb'      # All images start with this
            '( imgspoiler)?" '        # Catches spoilered images
            # Optional target attribute before href. The closing quote is
            # optional, so both `target="_blank ` and the well-formed
            # `target="_blank" ` are accepted.
            '(target="_blank"? )?'
            'href="//'                # HTML attribute for the link location
            r'(images\.4chan\.org/'   # The start of the image URL
            '[^/]*'                   # The board
            '/src/)'                  # All images are stored in this directory
            r'([^\.]*\.[^"]*)'        # The filename is the final part of the URL
        )
        self.board = thread_match.group(2)
        self.thread = thread_match.group(3)
    # End of __init__

    def download_from_match(self, image_match):
        """Download the image described by one ``image_pattern`` match.

        The image is saved under its own file name, relative to the current
        working directory. Nothing is fetched if that file already exists.

        Args:
            image_match: match object produced by ``image_pattern``.

        Returns:
            The image's file name.
        """
        image_name = image_match.group(4)
        image_url = image_match.group(3) + image_name
        if not isfile(image_name):
            urlretrieve('http://' + image_url, image_name)
        return image_name
    # End of download_from_match
# End of class _4chan_thread_info
class mlpchan_thread_info(imgboard_info):
    """Site name, board and thread ID of an mlpchan thread.

    Also carries the regular expressions used to recognise the thread URL
    and the image links in the thread's HTML.
    """

    def __init__(self, thread_url):
        """Parse *thread_url* as an mlpchan thread URL.

        Raises:
            NameError: carrying *thread_url*, when the URL does not match
                the mlpchan thread format.
        """
        self.site = 'mlpchan'
        print("Checking URL against mlpchan format...")
        # Optional scheme, then the board (group 2) and thread (group 3)
        self.thread_pattern = (
            r'(http://|https://)?mlpchan\.net/([^/]*)/res/(.*)'
        )
        url_parts = match(self.thread_pattern, thread_url)
        # Bail out early when the URL belongs to some other site
        if url_parts is None:
            print("\tNot a mlpchan thread")
            raise NameError(thread_url)
        print("\tURL is an mlpchan thread")
        # Group 1 is the image's directory on the site, group 2 its file name
        self.image_pattern = 'a href="(/[^/]*/src/)([^"]*)"'
        self.board, self.thread = url_parts.group(2, 3)

    def download_from_match(self, image_match):
        """Fetch the image for one ``image_pattern`` match.

        The image is stored under its own file name, relative to the current
        working directory. An existing file of that name is left alone.

        Returns:
            The image's file name.
        """
        src_dir, image_name = image_match.group(1, 2)
        if isfile(image_name):
            return image_name
        urlretrieve('http://mlpchan.net' + src_dir + image_name, image_name)
        return image_name
# Class used to download all images from a 4chan thread
class _4chan_thread_collector:
# Constructor
def __init__(self,argv):
self.info_types = {_4chan_thread_info, mlpchan_thread_info}
# Record the command line arguments
self.args = argv
# end of __init__
# Run the downloader
def run_downloader(self):
print "Starting"
# Record the starting directory
self.starting_dir = getcwd()
# Process command line arguments to figure out the correct thread URL
# and download location
thread_url = self.process_arguments()
# Loop through each of the possible imageboards to fine if any
# match the given URL
for next_type in self.info_types:
try:
self.thread_info = next_type(thread_url)
break;
except NameError:
self.thread_info = None
if self.thread_info == None:
print "URL matches no known imageboard, exiting"
quit()
# Create the directory for the board and thread,
# and move into that directory
self.create_directories(self.thread_info)
# Download the HTML source for the thread
urlretrieve(thread_url,self.thread_info.thread)
# Search the HTML source for every image in the thread and download them
self.download_images(thread_url,self.thread_info)
# Delete the HTML source file and navigate back to where we started
remove(self.thread_info.thread)
chdir(self.starting_dir)
# End of run_downloader
# Print a help message
def print_help(self):
print 'Usage:'
print '4search.py thread_url [-d download_location] [-h to display help]'
# End of print_help
# Process the incoming command line arguments
def process_arguments(self):
print self.args
# If there are no argument, print the help message and exit
if len(self.args)==1:
self.print_help()
quit()
# If the -h flag is present, print the help message
if "-h" in self.args:
hswitch = self.args.index("-h")
self.args.pop(hswitch)
self.print_help()
# if the -d flag is present, navigate to the given directory
# create it if necessary
if "-d" in self.args:
dswitch = self.args.index("-d")
self.args.pop(dswitch)
directory = self.args.pop(dswitch)
if not isdir(directory):
mkdir(directory)
chdir(directory)
# After evaluating any flags, if there are 2 arguments,
# the 2nd is the thread URL
if len(self.args)==2:
return self.args[1]
# If there are any other number of arguments, exit
else:
print 'Bad arg list'
quit()
# End of process_arguments
# Create directories for the board and thread
def create_directories(self,thread_info):
site = thread_info.site
board = thread_info.board
thread = thread_info.thread
if not isdir(site):
mkdir(site)
chdir(site)
if not isdir(board):
mkdir(board)
chdir(board)
if not isdir(thread):
mkdir(thread)
chdir(thread)
# End of create_directories
# Parse a downloaded HTML source for thread images and download each
def download_images(self,thread_url,thread_info):
# Open the HTML source
f = open(thread_info.thread, 'r')
# Read each line of the source
while True:
line = f.readline()
if line=="":
break
# Create an iterator for each match of the pattern on that line
images = finditer(thread_info.image_pattern, line)
# Iterate through each match, pull the image URL from the pattern
# and download that image
try:
while True:
print thread_info.download_from_match(images.next())+' Successfully Retrieved'
except StopIteration:
print 'Finished'
pass
# End of download_images
# End of class _4chan_thread_collector
# Script entry point:
# build a collector from the command line arguments,
# then start the download.
collector = _4chan_thread_collector(argv)
collector.run_downloader()