#!/usr/bin/perl
#
# Title:        [Perl] SiMalizer
# Author:       Anonymous
# Pastebin link: http://pastebin.com/MWTGvNgP
# First Edit:   Saturday 11th of April 2015 02:00:16 AM CDT
# Last Edit:    Saturday 11th of April 2015 02:00:16 AM CDT
#
# Downloads each configured archive thread, extracts every post and writes
# one CSV file per thread into $data_directory. Threads whose CSV file
# already exists are skipped, so the script can be re-run to resume.
#
# NOTE(review): the copy of this script that was reviewed had the HTML tag
# literals stripped out of its regex patterns (they appeared as empty
# patterns such as m::si). The patterns in handle_moe() and html_to_text()
# are reconstructions based on the surrounding comments; confirm them
# against a saved thread page before trusting the output.

use warnings;
use strict;

use LWP::Simple;
use HTML::Entities;
use MIME::Base64;
use Encode;

#########################
# --- config starts --- #
#########################

# where we put the datas
my $data_directory = "./datas";

# where we get the datas
my @data_sources = (
    {'thread_num' => '56',   'source_type' => 'moe', 'thread_id' => '22223717'},
    {'thread_num' => '55',   'source_type' => 'moe', 'thread_id' => '22056180'},
    {'thread_num' => '54',   'source_type' => 'moe', 'thread_id' => '21904753'},
    {'thread_num' => '53',   'source_type' => 'moe', 'thread_id' => '21764308'},
    {'thread_num' => '52.1', 'source_type' => 'moe', 'thread_id' => '21594261'},
    {'thread_num' => '52',   'source_type' => 'moe', 'thread_id' => '21512193'},
    {'thread_num' => '51.1', 'source_type' => 'moe', 'thread_id' => '21356468'},
    {'thread_num' => '51',   'source_type' => 'moe', 'thread_id' => '21325472'},
    {'thread_num' => '50',   'source_type' => 'moe', 'thread_id' => '21172471'},
    {'thread_num' => '49',   'source_type' => 'moe', 'thread_id' => '21059280'},
    {'thread_num' => '48',   'source_type' => 'moe', 'thread_id' => '20950338'},
    {'thread_num' => '47',   'source_type' => 'moe', 'thread_id' => '20842499'},
    {'thread_num' => '46',   'source_type' => 'moe', 'thread_id' => '20790264'},
    {'thread_num' => '45',   'source_type' => 'moe', 'thread_id' => '20752370'},
    {'thread_num' => '44.1', 'source_type' => 'moe', 'thread_id' => '20752275'},
    {'thread_num' => '44',   'source_type' => 'moe', 'thread_id' => '20698725'},
    {'thread_num' => '43',   'source_type' => 'moe', 'thread_id' => '20587927'},
    {'thread_num' => '42',   'source_type' => 'moe', 'thread_id' => '20510451'},
    {'thread_num' => '41',   'source_type' => 'moe', 'thread_id' => '20394858'},
    {'thread_num' => '40',   'source_type' => 'moe', 'thread_id' => '20298241'},
    {'thread_num' => '39',   'source_type' => 'moe', 'thread_id' => '20158458'},
    {'thread_num' => '38',   'source_type' => 'moe', 'thread_id' => '20046947'},
    {'thread_num' => '37',   'source_type' => 'moe', 'thread_id' => '19948544'},
    {'thread_num' => '36',   'source_type' => 'moe', 'thread_id' => '19789936'},
    {'thread_num' => '35',   'source_type' => 'moe', 'thread_id' => '19732633'},
    {'thread_num' => '34',   'source_type' => 'moe', 'thread_id' => '19619101'},
    {'thread_num' => '33',   'source_type' => 'moe', 'thread_id' => '19555102'},
    {'thread_num' => '32',   'source_type' => 'moe', 'thread_id' => '19464712'},
    {'thread_num' => '31',   'source_type' => 'moe', 'thread_id' => '19327661'},
    {'thread_num' => '30',   'source_type' => 'moe', 'thread_id' => '19168550'},
    {'thread_num' => '29',   'source_type' => 'moe', 'thread_id' => '19019961'},
    {'thread_num' => '28',   'source_type' => 'moe', 'thread_id' => '18880638'},
    {'thread_num' => '27',   'source_type' => 'moe', 'thread_id' => '18819253'},
    {'thread_num' => '26',   'source_type' => 'moe', 'thread_id' => '18801160'},
    {'thread_num' => '25',   'source_type' => 'moe', 'thread_id' => '18696442'},
    {'thread_num' => '24',   'source_type' => 'moe', 'thread_id' => '18544500'},
    {'thread_num' => '23',   'source_type' => 'moe', 'thread_id' => '18470791'},
    {'thread_num' => '22',   'source_type' => 'moe', 'thread_id' => '18283512'},
    {'thread_num' => '21',   'source_type' => 'moe', 'thread_id' => '18136462'},
    {'thread_num' => '20',   'source_type' => 'moe', 'thread_id' => '18024823'},
    {'thread_num' => '19',   'source_type' => 'moe', 'thread_id' => '18020372'},
    {'thread_num' => '18',   'source_type' => 'moe', 'thread_id' => '17892238'},
    {'thread_num' => '17',   'source_type' => 'moe', 'thread_id' => '17797463'},
    {'thread_num' => '16',   'source_type' => 'moe', 'thread_id' => '17775480'},
    {'thread_num' => '15',   'source_type' => 'moe', 'thread_id' => '17720880'},
    {'thread_num' => '14',   'source_type' => 'moe', 'thread_id' => '17633979'},
    {'thread_num' => '13',   'source_type' => 'moe', 'thread_id' => '17561647'},
    {'thread_num' => '12',   'source_type' => 'moe', 'thread_id' => '17468561'},
    {'thread_num' => '11',   'source_type' => 'moe', 'thread_id' => '17434563'},
    {'thread_num' => '10',   'source_type' => 'moe', 'thread_id' => '17366686'},
    {'thread_num' => '09.2', 'source_type' => 'moe', 'thread_id' => '17353816'},
    {'thread_num' => '09.1', 'source_type' => 'moe', 'thread_id' => '17328655'},
    {'thread_num' => '09',   'source_type' => 'moe', 'thread_id' => '17319438'},
    {'thread_num' => '08',   'source_type' => 'moe', 'thread_id' => '17259685'},
    {'thread_num' => '07',   'source_type' => 'moe', 'thread_id' => '17221214'},
    {'thread_num' => '06',   'source_type' => 'moe', 'thread_id' => '17196576'},
    {'thread_num' => '05',   'source_type' => 'moe', 'thread_id' => '17168092'},
    {'thread_num' => '04',   'source_type' => 'moe', 'thread_id' => '17125021'},
    {'thread_num' => '03',   'source_type' => 'moe', 'thread_id' => '17064170'},
    {'thread_num' => '02',   'source_type' => 'moe', 'thread_id' => '17034786'},
    {'thread_num' => '01.2', 'source_type' => 'moe', 'thread_id' => '16981347'},
    {'thread_num' => '01.1', 'source_type' => 'moe', 'thread_id' => '16965745'},
    {'thread_num' => '01',   'source_type' => 'moe', 'thread_id' => '16952866'},
    {'thread_num' => '00',   'source_type' => 'moe', 'thread_id' => '16901518'}
);

# delay between grabs (seconds)
my $grab_delay_sec = 5;

#######################
# --- config ends --- #
#######################

# make directory if it doesn't exist
# ("or", not "||": "||" would bind to $data_directory and never reach die)
unless (-d $data_directory) {
    mkdir $data_directory or die "Couldn't make $data_directory: $!";
}

# go through our sources and do our thing
for my $data_source (@data_sources) {

    # particulars
    my $thread_num  = $data_source->{thread_num};
    my $source_type = $data_source->{source_type};
    my $thread_id   = $data_source->{thread_id};

    # figure out name for output file
    my $output_file = $data_directory . "/" . $thread_num . "__" . $thread_id . ".csv";

    # skip if it already exists
    if (-e $output_file) {
        print "Skipping thread number $thread_num as $output_file already exists\n";
        next;
    }

    # otherwise call the appropriate handler: handle_<source_type>.
    # Looked up by name with can() rather than a string eval, so a missing
    # handler gives a clear message and a handler failure propagates its
    # own die text.
    print "Processing thread number $thread_num to $output_file\n";
    my $handler = __PACKAGE__->can('handle_' . $source_type)
        or die "No handler defined for source type '$source_type'\n";
    $handler->($thread_id, $output_file);

    print "Waiting $grab_delay_sec seconds\n";
    sleep($grab_delay_sec);
}

# handler for archive.moe (moe)
#
# Params:
#   $thread_id   - numeric id of the thread, appended to the archive URL
#   $output_file - path of the CSV file to create
#
# Fetches the thread page, extracts each post and writes one CSV row per post:
#   thread id, post id, author, tripcode, post text (base64), post html (base64)
#
# Returns 1 on success. Dies if the page cannot be fetched, the posts section
# cannot be located, or the output file cannot be written.
sub handle_moe {
    # params
    my ($thread_id, $output_file) = @_;

    # NOTE(review): the tag literals below are reconstructed; the reviewed
    # copy had them stripped. They presumably target FoolFuuka-style markup
    # (an <aside class="posts"> wrapper with one <article> per post) --
    # TODO confirm each against a real page.
    my $posts_section_re = qr{<aside class="posts">(.*?)</aside>}si;
    my $article_re       = qr{(<article\b.*?</article>)}si;
    my $post_id_re       = qr{<article\b[^>]*\bid="(\d+)"}si;
    my $author_re        = qr{<span class="post_author">(.*?)</span>}si;
    my $tripcode_re      = qr{<span class="post_tripcode">(.*?)</span>}si;
    my $post_text_re     = qr{<div class="text">(.*?)</div>}si;

    # grab it -- before touching the output file, so that a failed download
    # does not leave behind a file that later runs would skip
    my $url  = "https://archive.moe/mlp/thread/" . $thread_id;
    my $html = get($url);
    die "Couldn't fetch $url\n" unless defined $html;

    # first we extract the "posts" section
    my ($posts_section_html) = $html =~ $posts_section_re;
    die "Couldn't find the posts section in $url\n"
        unless defined $posts_section_html;

    # now we extract the "articles" and go through them
    # each article represents a post
    my @articles = $posts_section_html =~ m/$article_re/g;
    print scalar(@articles) . " posts found.\n";

    # write to a temporary name and rename at the end, so the final file
    # only ever exists when it is complete
    my $partial_file = $output_file . ".part";
    open(my $output_fh, '>', $partial_file)
        or die "Couldn't open $partial_file: $!";

    # write header line
    print $output_fh "thread id,post id,author,tripcode,post text (base64),post html (base64)\n";

    for my $article (@articles) {
        # pull the post id from the article tag itself
        my ($post_id) = $article =~ $post_id_re;
        unless (defined $post_id) {
            warn "Skipping a post without a recognisable id in thread $thread_id\n";
            next;
        }

        # pull author, tripcode and the post contents (in html form);
        # each is an empty string when the post has no such element
        my $author         = _first_capture($article, $author_re);
        my $tripcode       = _first_capture($article, $tripcode_re);
        my $post_text_html = _first_capture($article, $post_text_re);

        # clean it up
        my $post_text = html_to_text($post_text_html);

        # build our csv line and write to file
        # everything is encoded to utf-8 bytes first so that print does not
        # warn about wide characters
        my @fields = (
            $thread_id,
            $post_id,
            _csv_field(encode("utf-8", $author)),
            _csv_field(encode("utf-8", $tripcode)),
            encode_base64(encode("utf-8", $post_text), ""),
            encode_base64(encode("utf-8", $post_text_html), ""),
        );
        print $output_fh join(",", @fields) . "\n";
    }

    # close our file (buffered write errors surface here) and publish it
    close($output_fh) or die "Couldn't close $partial_file: $!";
    rename($partial_file, $output_file)
        or die "Couldn't rename $partial_file to $output_file: $!";

    return 1;
}

# Returns the first capture group of $pattern matched against $text,
# or an empty string when there is no match.
sub _first_capture {
    my ($text, $pattern) = @_;
    my ($captured) = $text =~ $pattern;
    return defined $captured ? $captured : '';
}

# Returns $value ready to be placed in a CSV row: unchanged when it has no
# special characters, otherwise wrapped in double quotes with embedded
# double quotes doubled.
sub _csv_field {
    my ($value) = @_;
    return $value unless $value =~ /[",\r\n]/;
    (my $quoted = $value) =~ s/"/""/g;
    return '"' . $quoted . '"';
}

# Converts an HTML fragment to plain text: linebreak tags become newlines,
# all other tags are dropped and HTML entities are decoded.
# Returns an empty string for undefined input.
sub html_to_text {
    my ($html) = @_;
    return '' unless defined $html;

    # turn linebreak tags into actual linebreaks
    # NOTE(review): reconstructed pattern; the reviewed copy had it stripped
    $html =~ s:<br\s*/?>:\n:sig;

    # just remove all other tags
    $html =~ s:<.*?>::sig;

    # decode other html elements (like &gt; and &lt;)
    $html = decode_entities($html);

    return $html;
}