Title: [Perl] SiMalizer
Author: Anonymous
Pastebin link: http://pastebin.com/MWTGvNgP
First Edit: Saturday 11th of April 2015 02:00:16 AM CDT
Last Edit: Saturday 11th of April 2015 02:00:16 AM CDT
#!/usr/bin/perl
use warnings;
use strict;
use LWP::Simple;
use HTML::Entities;
use MIME::Base64;
use Encode;
#########################
# --- config starts --- #
#########################
# Directory the per-thread CSV files are written to; the script creates it
# at startup when it does not exist yet.
my $data_directory = "./datas";
# Threads to fetch. Each entry is a hash ref with three keys:
#   thread_num  - label used in the output file name and in progress messages
#   source_type - selects the handler sub, which is named handle_<source_type>
#   thread_id   - id handed to the handler; also part of the output file name
my @data_sources = (
{'thread_num' => '56', 'source_type' => 'moe', 'thread_id' => '22223717'},
{'thread_num' => '55', 'source_type' => 'moe', 'thread_id' => '22056180'},
{'thread_num' => '54', 'source_type' => 'moe', 'thread_id' => '21904753'},
{'thread_num' => '53', 'source_type' => 'moe', 'thread_id' => '21764308'},
{'thread_num' => '52.1', 'source_type' => 'moe', 'thread_id' => '21594261'},
{'thread_num' => '52', 'source_type' => 'moe', 'thread_id' => '21512193'},
{'thread_num' => '51.1', 'source_type' => 'moe', 'thread_id' => '21356468'},
{'thread_num' => '51', 'source_type' => 'moe', 'thread_id' => '21325472'},
{'thread_num' => '50', 'source_type' => 'moe', 'thread_id' => '21172471'},
{'thread_num' => '49', 'source_type' => 'moe', 'thread_id' => '21059280'},
{'thread_num' => '48', 'source_type' => 'moe', 'thread_id' => '20950338'},
{'thread_num' => '47', 'source_type' => 'moe', 'thread_id' => '20842499'},
{'thread_num' => '46', 'source_type' => 'moe', 'thread_id' => '20790264'},
{'thread_num' => '45', 'source_type' => 'moe', 'thread_id' => '20752370'},
{'thread_num' => '44.1', 'source_type' => 'moe', 'thread_id' => '20752275'},
{'thread_num' => '44', 'source_type' => 'moe', 'thread_id' => '20698725'},
{'thread_num' => '43', 'source_type' => 'moe', 'thread_id' => '20587927'},
{'thread_num' => '42', 'source_type' => 'moe', 'thread_id' => '20510451'},
{'thread_num' => '41', 'source_type' => 'moe', 'thread_id' => '20394858'},
{'thread_num' => '40', 'source_type' => 'moe', 'thread_id' => '20298241'},
{'thread_num' => '39', 'source_type' => 'moe', 'thread_id' => '20158458'},
{'thread_num' => '38', 'source_type' => 'moe', 'thread_id' => '20046947'},
{'thread_num' => '37', 'source_type' => 'moe', 'thread_id' => '19948544'},
{'thread_num' => '36', 'source_type' => 'moe', 'thread_id' => '19789936'},
{'thread_num' => '35', 'source_type' => 'moe', 'thread_id' => '19732633'},
{'thread_num' => '34', 'source_type' => 'moe', 'thread_id' => '19619101'},
{'thread_num' => '33', 'source_type' => 'moe', 'thread_id' => '19555102'},
{'thread_num' => '32', 'source_type' => 'moe', 'thread_id' => '19464712'},
{'thread_num' => '31', 'source_type' => 'moe', 'thread_id' => '19327661'},
{'thread_num' => '30', 'source_type' => 'moe', 'thread_id' => '19168550'},
{'thread_num' => '29', 'source_type' => 'moe', 'thread_id' => '19019961'},
{'thread_num' => '28', 'source_type' => 'moe', 'thread_id' => '18880638'},
{'thread_num' => '27', 'source_type' => 'moe', 'thread_id' => '18819253'},
{'thread_num' => '26', 'source_type' => 'moe', 'thread_id' => '18801160'},
{'thread_num' => '25', 'source_type' => 'moe', 'thread_id' => '18696442'},
{'thread_num' => '24', 'source_type' => 'moe', 'thread_id' => '18544500'},
{'thread_num' => '23', 'source_type' => 'moe', 'thread_id' => '18470791'},
{'thread_num' => '22', 'source_type' => 'moe', 'thread_id' => '18283512'},
{'thread_num' => '21', 'source_type' => 'moe', 'thread_id' => '18136462'},
{'thread_num' => '20', 'source_type' => 'moe', 'thread_id' => '18024823'},
{'thread_num' => '19', 'source_type' => 'moe', 'thread_id' => '18020372'},
{'thread_num' => '18', 'source_type' => 'moe', 'thread_id' => '17892238'},
{'thread_num' => '17', 'source_type' => 'moe', 'thread_id' => '17797463'},
{'thread_num' => '16', 'source_type' => 'moe', 'thread_id' => '17775480'},
{'thread_num' => '15', 'source_type' => 'moe', 'thread_id' => '17720880'},
{'thread_num' => '14', 'source_type' => 'moe', 'thread_id' => '17633979'},
{'thread_num' => '13', 'source_type' => 'moe', 'thread_id' => '17561647'},
{'thread_num' => '12', 'source_type' => 'moe', 'thread_id' => '17468561'},
{'thread_num' => '11', 'source_type' => 'moe', 'thread_id' => '17434563'},
{'thread_num' => '10', 'source_type' => 'moe', 'thread_id' => '17366686'},
{'thread_num' => '09.2', 'source_type' => 'moe', 'thread_id' => '17353816'},
{'thread_num' => '09.1', 'source_type' => 'moe', 'thread_id' => '17328655'},
{'thread_num' => '09', 'source_type' => 'moe', 'thread_id' => '17319438'},
{'thread_num' => '08', 'source_type' => 'moe', 'thread_id' => '17259685'},
{'thread_num' => '07', 'source_type' => 'moe', 'thread_id' => '17221214'},
{'thread_num' => '06', 'source_type' => 'moe', 'thread_id' => '17196576'},
{'thread_num' => '05', 'source_type' => 'moe', 'thread_id' => '17168092'},
{'thread_num' => '04', 'source_type' => 'moe', 'thread_id' => '17125021'},
{'thread_num' => '03', 'source_type' => 'moe', 'thread_id' => '17064170'},
{'thread_num' => '02', 'source_type' => 'moe', 'thread_id' => '17034786'},
{'thread_num' => '01.2', 'source_type' => 'moe', 'thread_id' => '16981347'},
{'thread_num' => '01.1', 'source_type' => 'moe', 'thread_id' => '16965745'},
{'thread_num' => '01', 'source_type' => 'moe', 'thread_id' => '16952866'},
{'thread_num' => '00', 'source_type' => 'moe', 'thread_id' => '16901518'}
);
# Pause, in seconds, after each thread that was actually processed
# (threads skipped because their output file exists do not wait).
my $grab_delay_sec = 5;
#######################
# --- config ends --- #
#######################
# Make the output directory if it doesn't exist.
# Low-precedence `or` is required here: `mkdir $dir || die ...` parses as
# `mkdir($dir || die ...)`, so a failed mkdir would never be reported.
unless (-d $data_directory)
{
    mkdir $data_directory or die "Couldn't make $data_directory: $!";
}

# Go through our sources and do our thing: every configured thread is written
# to its own CSV file, and threads whose file already exists are skipped so
# the script can be re-run without downloading everything again.
for my $data_source (@data_sources)
{
    # particulars
    my $thread_num  = $data_source->{thread_num};
    my $source_type = $data_source->{source_type};
    my $thread_id   = $data_source->{thread_id};

    # figure out name for output file
    my $output_file = $data_directory . "/" . $thread_num . "__" . $thread_id . ".csv";

    # skip if it already exists
    if (-e $output_file)
    {
        print "Skipping thread number $thread_num as $output_file already exists\n";
        next;
    }

    print "Processing thread number $thread_num to $output_file\n";

    # Look the handler up by name instead of building code for a string eval:
    # an unknown source type now gets a clear message, and an exception thrown
    # inside the handler propagates with its own text (string eval put it in
    # $@, while the old code reported the unrelated $!).
    my $handler_name = 'handle_' . $source_type;
    my $handler_sub  = __PACKAGE__->can($handler_name)
        or die "No handler '$handler_name' for source type '$source_type'";

    # Handlers signal success with a true return value.
    $handler_sub->($thread_id, $output_file)
        or die "$handler_name failed for thread $thread_id";

    # be polite to the archive between grabs
    print "Waiting $grab_delay_sec seconds\n";
    sleep($grab_delay_sec);
}
# Handler for archive.moe (moe).
#
# Downloads one thread page and writes one CSV line per extracted post:
#   thread id, post id, author, tripcode, post text (base64), post html (base64)
#
# Parameters:
#   $thread_id   - thread id; appended to the archive.moe /mlp/ thread URL
#   $output_file - path of the CSV file to create (an existing file is truncated)
#
# Returns 1 on success. Dies when the page cannot be fetched or when the
# output file cannot be opened or closed.
sub handle_moe
{
    # params
    my ($thread_id, $output_file) = @_;

    # Grab the page before creating the output file: the caller skips threads
    # whose file already exists, so a failed download must not leave a
    # header-only file behind.
    my $url  = "https://archive.moe/mlp/thread/" . $thread_id;
    my $html = get($url);
    die "Couldn't fetch $url" unless defined $html;

    # NOTE(review): every pattern in this sub appears to have lost its HTML tag
    # literals (the author/tripcode/text patterns are now identical and two
    # patterns are empty) -- presumably the paste was run through a tag
    # stripper. They are kept exactly as found; restore them from the original
    # script or the site markup before relying on the output.

    # first we extract the "posts" section
    my ($posts_section_html) = $html =~ m::si;

    # now we extract the "articles" and go through them;
    # each article represents a post
    my @articles;
    if (defined $posts_section_html)
    {
        @articles = $posts_section_html =~ m:(.*?):sig;
    }
    else
    {
        warn "No posts section found in $url\n";
    }
    # scalar(@articles) is the count; $#articles would be the last index (count - 1)
    print scalar(@articles) . " posts found.\n";

    # open up our output file and write the header line
    open(my $output_fh, '>', $output_file) or die "Couldn't open $output_file: $!";
    print $output_fh "thread id,post id,author,tripcode,post text (base64),post html (base64)\n";

    for my $article (@articles)
    {
        # pull the post id from the article tag itself
        my ($post_id) = $article =~ m::si;
        # pull author
        my ($author) = $article =~ m:(.*?):si;
        # pull tripcode
        my ($tripcode) = $article =~ m:(.*?):si;
        # grab the post contents (in html form)
        my ($post_text_html) = $article =~ m:(.*?):si;

        # a pattern that did not match leaves undef; write an empty column instead
        for my $field ($post_id, $author, $tripcode, $post_text_html)
        {
            $field = '' unless defined $field;
        }

        # clean it up
        my $post_text = html_to_text($post_text_html);

        # Build the csv line. Text is encoded to UTF-8 bytes first so print
        # does not warn about wide characters; the two post bodies are
        # additionally base64-encoded (on a single line).
        my $output_line = join(",",
            $thread_id,
            $post_id,
            _csv_field(encode("utf-8", $author)),
            _csv_field(encode("utf-8", $tripcode)),
            encode_base64(encode("utf-8", $post_text), ""),
            encode_base64(encode("utf-8", $post_text_html), ""),
        ) . "\n";
        print $output_fh $output_line;
    }

    # buffered write errors only surface at close, so check it
    close($output_fh) or die "Couldn't close $output_file: $!";
    return 1;
}

# Quote a CSV field when it contains a comma, double quote or line break
# (embedded quotes are doubled); any other value is returned unchanged.
sub _csv_field
{
    my ($value) = @_;
    return $value unless $value =~ /[",\r\n]/;
    $value =~ s/"/""/g;
    return '"' . $value . '"';
}
# Convert an HTML fragment to plain text.
#
# Parameters:
#   $html - HTML fragment; undef is accepted and treated as empty
#
# Returns the text with linebreak tags turned into "\n", every other tag
# removed and HTML entities decoded. Returns '' for undef input.
sub html_to_text
{
    my ($html) = @_;
    # nothing to convert (e.g. the caller's capture did not match)
    return '' unless defined $html;
    # turn linebreak tags (<br>, <br/>, <br />) into actual linebreaks;
    # the pattern was empty as found, which does not match linebreak tags
    $html =~ s:<br\s*/?>:\n:sig;
    # just remove all other tags
    $html =~ s:<.*?>::sig;
    # decode html entities (like &gt; and &lt;)
    $html = decode_entities($html);
    return $html;
}