Pastebin launched a little side project called HostCabi.net, check it out ;-) Don't like ads? PRO users don't see any ads ;-)

Untitled2 sec ago
Untitled6 sec ago
Untitled6 sec ago
Untitled11 sec ago
Untitled13 sec ago
Untitled15 sec ago
Untitled24 sec ago
Untitled20 sec ago

4search.py

By: ChallengerAppeared on Sep 12th, 2013 | syntax: Python | size: 7.48 KB | hits: 27 | expires: Never

download | raw | embed | report abuse | print

Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)

# 4chan thread image collector
#
#
#
# Author: Andrew Melnick
# Date: 9/Aug/2013
# License: GNU General Public License
#
# Purpose:
# Supply a valid 4chan or mlpchan thread URL, and optionally a download directory.
# The script will then find every image in the specified thread and download it.
#
# Usage:
# 4search.py <thread url> [-d download location] [-h{elp}]
from os import chdir,getcwd,mkdir,remove
from os.path import isdir,isfile
from platform import system
from re import finditer,match
from subprocess import call,check_call,check_output
from sys import argv
from urllib import urlretrieve
class imgboard_info(object):
    """Abstract description of one imageboard thread.

    Subclasses must override both methods; the versions here only raise.
    """

    def __init__(self, thread_url):
        """Refuse direct instantiation.

        thread_url -- the thread URL a subclass would parse.

        Always raises NotImplementedError (a subclass of Exception).
        """
        raise NotImplementedError("imgboard_info.__init__ not overridden")

    def download_from_match(self, image_match):
        """Placeholder for the per-board image download.

        image_match -- a regex match object for one image link.

        Always raises NotImplementedError (a subclass of Exception).
        """
        raise NotImplementedError("imgboard_info.download_from_match not overridden")
# Class holding the board name and thread ID number of a thread
class _4chan_thread_info(imgboard_info):
    """Board name, thread ID and image-link regex for one 4chan thread."""

    # Constructor
    def __init__(self, thread_url):
        """Parse thread_url as a 4chan thread URL.

        Sets self.site and self.thread_pattern, and on a successful match
        also self.image_pattern, self.board and self.thread.

        Raises NameError(thread_url) when the URL does not match.
        """
        self.site = '4chan'

        print("Checking URL against 4chan format...")

        # Group 2 is the board and group 3 is everything after /res/.
        # The scheme is optional and may be http or https, as in the
        # mlpchan pattern.
        self.thread_pattern = r'(http://|https://)?boards\.4chan\.org/([^/]*)/res/(.*)'
        thread_match = match(self.thread_pattern, thread_url)

        # If the thread is invalid, print so and throw an exception
        if thread_match is None:
            print("\tNot a 4chan thread")
            raise NameError(thread_url)

        # Otherwise, record the image pattern and the matched groups
        print("\tURL is a 4chan thread")
        # NOTE(review): the optional target="_blank piece has no closing
        # quote in the pattern -- confirm against the live markup.
        self.image_pattern = (
            r'a class="fileThumb'       # All images start with this
            r'( imgspoiler)?" '         # Catches spoilered images
            r'(target="_blank )?'       # This can come in random places
            r'href="//'                 # HTML attribute for link location
            r'(images\.4chan\.org/'     # The start of the image URL
            r'[^/]*'                    # The board
            r'/src/)'                   # All images are stored in this directory
            r'([^\.]*\.[^"]*)'          # The filename is the final part of the URL
        )
        self.board = thread_match.group(2)
        self.thread = thread_match.group(3)
    # End of __init__

    def download_from_match(self, image_match):
        """Download the image described by one image_pattern match.

        image_match -- match object; group 3 is the URL up to and including
                       /src/, group 4 is the file name.

        The image is saved under its file name in the current directory,
        unless a file of that name already exists. Returns the file name.
        """
        image_name = image_match.group(4)
        image_url = image_match.group(3) + image_name
        if not isfile(image_name):
            urlretrieve('http://' + image_url, image_name)
        return image_name
    # End of download_from_match

# End of class _4chan_thread_info
class mlpchan_thread_info(imgboard_info):
    """Board name, thread ID and image-link regex for one mlpchan thread."""

    # Constructor
    def __init__(self, thread_url):
        """Parse thread_url as an mlpchan thread URL.

        Sets self.site and self.thread_pattern, and on a successful match
        also self.image_pattern, self.board and self.thread.

        Raises NameError(thread_url) when the URL does not match.
        """
        self.site = 'mlpchan'

        print("Checking URL against mlpchan format...")

        # Group 2 is the board and group 3 is everything after /res/
        self.thread_pattern = r'(http://|https://)?mlpchan\.net/([^/]*)/res/(.*)'
        thread_match = match(self.thread_pattern, thread_url)

        # If the thread is invalid, print so and throw an exception
        if thread_match is None:
            print("\tNot a mlpchan thread")
            raise NameError(thread_url)

        # Otherwise, record the image pattern and the matched groups
        print("\tURL is an mlpchan thread")
        # Group 1 is the site-relative path up to /src/, group 2 the file name
        self.image_pattern = r'a href="(/[^/]*/src/)([^"]*)"'
        self.board = thread_match.group(2)
        self.thread = thread_match.group(3)
    # End of __init__

    def download_from_match(self, image_match):
        """Download the image described by one image_pattern match.

        image_match -- match object; group 1 is the site-relative path up to
                       and including /src/, group 2 is the file name.

        The image is saved under its file name in the current directory,
        unless a file of that name already exists. Returns the file name.
        """
        image_name = image_match.group(2)
        image_url = image_match.group(1) + image_name
        if not isfile(image_name):
            urlretrieve('http://mlpchan.net' + image_url, image_name)
        return image_name
    # End of download_from_match

# End of class mlpchan_thread_info
# Class used to download all images from a 4chan thread
class _4chan_thread_collector:
# Constructor
def __init__(self,argv):
self.info_types = {_4chan_thread_info, mlpchan_thread_info}
# Record the command line arguments
self.args = argv
# end of __init__
# Run the downloader
def run_downloader(self):
print "Starting"
# Record the starting directory
self.starting_dir = getcwd()
# Process command line arguments to figure out the correct thread URL
# and download location
thread_url = self.process_arguments()
# Loop through each of the possible imageboards to fine if any
# match the given URL
for next_type in self.info_types:
try:
self.thread_info = next_type(thread_url)
break;
except NameError:
self.thread_info = None
if self.thread_info == None:
print "URL matches no known imageboard, exiting"
quit()
# Create the directory for the board and thread,
# and move into that directory
self.create_directories(self.thread_info)
# Download the HTML source for the thread
urlretrieve(thread_url,self.thread_info.thread)
# Search the HTML source for every image in the thread and download them
self.download_images(thread_url,self.thread_info)
# Delete the HTML source file and navigate back to where we started
remove(self.thread_info.thread)
chdir(self.starting_dir)
# End of run_downloader
# Print a help message
def print_help(self):
print 'Usage:'
print '4search.py thread_url [-d download_location] [-h to display help]'
# End of print_help
# Process the incoming command line arguments
def process_arguments(self):
print self.args
# If there are no argument, print the help message and exit
if len(self.args)==1:
self.print_help()
quit()
# If the -h flag is present, print the help message
if "-h" in self.args:
hswitch = self.args.index("-h")
self.args.pop(hswitch)
self.print_help()
# if the -d flag is present, navigate to the given directory
# create it if necessary
if "-d" in self.args:
dswitch = self.args.index("-d")
self.args.pop(dswitch)
directory = self.args.pop(dswitch)
if not isdir(directory):
mkdir(directory)
chdir(directory)
# After evaluating any flags, if there are 2 arguments,
# the 2nd is the thread URL
if len(self.args)==2:
return self.args[1]
# If there are any other number of arguments, exit
else:
print 'Bad arg list'
quit()
# End of process_arguments
# Create directories for the board and thread
def create_directories(self,thread_info):
site = thread_info.site
board = thread_info.board
thread = thread_info.thread
if not isdir(site):
mkdir(site)
chdir(site)
if not isdir(board):
mkdir(board)
chdir(board)
if not isdir(thread):
mkdir(thread)
chdir(thread)
# End of create_directories
# Parse a downloaded HTML source for thread images and download each
def download_images(self,thread_url,thread_info):
# Open the HTML source
f = open(thread_info.thread, 'r')
# Read each line of the source
while True:
line = f.readline()
if line=="":
break
# Create an iterator for each match of the pattern on that line
images = finditer(thread_info.image_pattern, line)
# Iterate through each match, pull the image URL from the pattern
# and download that image
try:
while True:
print thread_info.download_from_match(images.next())+' Successfully Retrieved'
except StopIteration:
print 'Finished'
pass
# End of download_images
# End of class _4chan_thread_collector
# Script entry point:
# build a collector from the command line arguments,
# then let it download the thread's images
collector = _4chan_thread_collector(argv)
collector.run_downloader()

create a new version of this paste RAW Paste Data

#	4chan thread image collector
#
#
#
#	Author:	Andrew Melnick
#	Date:	9/Aug/2013
#	License: GNU General Public License
#
#	Purpose:
#		Supply a valid 4chan or mlpchan thread URL, and optionally a download directory.
#		The script will then find every image in the specified thread and download it.
#
#	Usage:
#		4search.py <thread url> [-d download location] [-h{elp}]




from os 		import chdir,getcwd,mkdir,remove
from os.path 	import isdir,isfile
from platform 	import system
from re 		import finditer,match
from subprocess import call,check_call,check_output
from sys		import argv
from urllib 	import urlretrieve


class imgboard_info(object):
	"""Abstract description of one imageboard thread.

	Subclasses must override both methods; the versions here only raise.
	"""

	def __init__(self,thread_url):
		"""Refuse direct instantiation.

		thread_url -- the thread URL a subclass would parse.

		Always raises NotImplementedError (a subclass of Exception).
		"""
		raise NotImplementedError("imgboard_info.__init__ not overridden")

	def download_from_match(self,image_match):
		"""Placeholder for the per-board image download.

		image_match -- a regex match object for one image link.

		Always raises NotImplementedError (a subclass of Exception).
		"""
		raise NotImplementedError("imgboard_info.download_from_match not overridden")

# Class holding the board name and thread ID number of a thread
class _4chan_thread_info(imgboard_info):
	"""Board name, thread ID and image-link regex for one 4chan thread."""

	# Constructor
	def __init__(self,thread_url):
		"""Parse thread_url as a 4chan thread URL.

		Sets self.site and self.thread_pattern, and on a successful match
		also self.image_pattern, self.board and self.thread.

		Raises NameError(thread_url) when the URL does not match.
		"""
		self.site = '4chan'

		print("Checking URL against 4chan format...")

		# Group 2 is the board and group 3 is everything after /res/.
		# The scheme is optional and may be http or https, as in the
		# mlpchan pattern.
		self.thread_pattern = r'(http://|https://)?boards\.4chan\.org/([^/]*)/res/(.*)'
		thread_match = match(self.thread_pattern, thread_url)

		# If the thread is invalid, print so and throw an exception
		if thread_match is None:
			print("\tNot a 4chan thread")
			raise NameError(thread_url)

		# Otherwise, record the image pattern and the matched groups
		print("\tURL is a 4chan thread")
		# NOTE(review): the optional target="_blank piece has no closing
		# quote in the pattern -- confirm against the live markup.
		self.image_pattern = (
			r'a class="fileThumb'		# All images start with this
			r'( imgspoiler)?" '			# Catches spoilered images
			r'(target="_blank )?'		# This can come in random places
			r'href="//'					# HTML attribute for link location
			r'(images\.4chan\.org/'		# The start of the image URL
			r'[^/]*'					# The board
			r'/src/)'					# All images are stored in this directory
			r'([^\.]*\.[^"]*)'			# The filename is the final part of the URL
		)
		self.board = thread_match.group(2)
		self.thread = thread_match.group(3)
	# End of __init__

	def download_from_match(self, image_match):
		"""Download the image described by one image_pattern match.

		image_match -- match object; group 3 is the URL up to and including
		               /src/, group 4 is the file name.

		The image is saved under its file name in the current directory,
		unless a file of that name already exists. Returns the file name.
		"""
		image_name = image_match.group(4)
		image_url = image_match.group(3)+image_name
		if not isfile(image_name):
			urlretrieve('http://'+image_url, image_name)
		return image_name
	# End of download_from_match

# End of class _4chan_thread_info


class mlpchan_thread_info(imgboard_info):
	"""Board name, thread ID and image-link regex for one mlpchan thread."""

	# Constructor
	def __init__(self,thread_url):
		"""Parse thread_url as an mlpchan thread URL.

		Sets self.site and self.thread_pattern, and on a successful match
		also self.image_pattern, self.board and self.thread.

		Raises NameError(thread_url) when the URL does not match.
		"""
		self.site = 'mlpchan'

		print("Checking URL against mlpchan format...")

		# Group 2 is the board and group 3 is everything after /res/
		self.thread_pattern = r'(http://|https://)?mlpchan\.net/([^/]*)/res/(.*)'
		thread_match = match(self.thread_pattern, thread_url)

		# If the thread is invalid, print so and throw an exception
		if thread_match is None:
			print("\tNot a mlpchan thread")
			raise NameError(thread_url)

		# Otherwise, record the image pattern and the matched groups
		print("\tURL is an mlpchan thread")
		# Group 1 is the site-relative path up to /src/, group 2 the file name
		self.image_pattern = r'a href="(/[^/]*/src/)([^"]*)"'
		self.board = thread_match.group(2)
		self.thread = thread_match.group(3)
	# End of __init__

	def download_from_match(self, image_match):
		"""Download the image described by one image_pattern match.

		image_match -- match object; group 1 is the site-relative path up to
		               and including /src/, group 2 is the file name.

		The image is saved under its file name in the current directory,
		unless a file of that name already exists. Returns the file name.
		"""
		image_name = image_match.group(2)
		image_url = image_match.group(1)+image_name
		if not isfile(image_name):
			urlretrieve('http://mlpchan.net'+image_url, image_name)
		return image_name
	# End of download_from_match

# End of class mlpchan_thread_info




# Class used to download all images from a 4chan thread
class _4chan_thread_collector:

	# Constructor
	def __init__(self,argv):

		self.info_types = {_4chan_thread_info, mlpchan_thread_info}
		
		# Record the command line arguments
		self.args = argv
		
	# end of __init__
	
	
	# Run the downloader
	def run_downloader(self):
		print "Starting"
		# Record the starting directory
		self.starting_dir = getcwd()
		
		# Process command line arguments to figure out the correct thread URL
		# 	and download location
		thread_url = self.process_arguments()
		
		# Loop through each of the possible imageboards to fine if any
		#	match the given URL
		for next_type in self.info_types:
			try:
				self.thread_info = next_type(thread_url)
				break;
			except NameError:
				self.thread_info = None
                
		
		if self.thread_info == None:
			print "URL matches no known imageboard, exiting"
			quit()
		
		# Create the directory for the board and thread,
		# 	and move into that directory
		self.create_directories(self.thread_info)
		
		# Download the HTML source for the thread
		urlretrieve(thread_url,self.thread_info.thread)
		
		# Search the HTML source for every image in the thread and download them
		self.download_images(thread_url,self.thread_info)
		
		# Delete the HTML source file and navigate back to where we started
		remove(self.thread_info.thread)
		chdir(self.starting_dir)
		
	# End of run_downloader
	
	
	# Print a help message
	def print_help(self):
		print 'Usage:'
		print '4search.py thread_url [-d download_location] [-h to display help]'
		
	# End of print_help
	
	
	# Process the incoming command line arguments
	def process_arguments(self):
		print self.args
		# If there are no argument, print the help message and exit
		if len(self.args)==1:
			self.print_help()
			quit()
			
		# If the -h flag is present, print the help message
		if "-h" in self.args:
			hswitch = self.args.index("-h")
			self.args.pop(hswitch)
			self.print_help()
			
		# if the -d flag is present, navigate to the given directory
		#	create it if necessary
		if "-d" in self.args:
			dswitch = self.args.index("-d")
			self.args.pop(dswitch)
			directory = self.args.pop(dswitch)
			if not isdir(directory):
				mkdir(directory)
			chdir(directory)
		# After evaluating any flags, if there are 2 arguments,
		#	the 2nd is the thread URL
		if len(self.args)==2:
			return self.args[1]
			
		# If there are any other number of arguments, exit
		else:
                        print 'Bad arg list'
			quit()
		
	# End of process_arguments
	
	
	# Create directories for the board and thread
	def create_directories(self,thread_info):
		site = thread_info.site
		board = thread_info.board
		thread = thread_info.thread
		if not isdir(site):
			mkdir(site)
		chdir(site)
		if not isdir(board):
			mkdir(board)
		chdir(board)
		if not isdir(thread):
			mkdir(thread)
		chdir(thread)
		
	# End of create_directories
	
	
	# Parse a downloaded HTML source for thread images and download each
	def download_images(self,thread_url,thread_info):
		
		# Open the HTML source
		f = open(thread_info.thread, 'r')
		
		# Read each line of the source
		while True:
			line = f.readline()
			if line=="":
				break
			
			# Create an iterator for each match of the pattern on that line
			images = finditer(thread_info.image_pattern, line)
			# Iterate through each match, pull the image URL from the pattern
			#	and download that image
			try:
				while True:
					print thread_info.download_from_match(images.next())+' Successfully Retrieved'
			except StopIteration:
                                print 'Finished'
				pass
		
	# End of download_images

	
# End of class _4chan_thread_collector

# Script entry point:
# build a collector from the command line arguments,
# then let it download the thread's images
collector = _4chan_thread_collector(argv)
collector.run_downloader()

Pastebin.com Tools & Applications

iPhone/iPad

Windows

Firefox

Chrome

WebOS

Android

Mac

Opera

Click.to

UNIX

WinPhone