import urllib2
import re
import sys
import operator


def getmostcharacters(l1, l2):
    """Return the (up to) three most-mentioned names from l1, given the
    parallel list of mention counts l2."""
    highchars = []
    s = sum(l2)
    dictionary = dict(zip(l1, l2))
    dictionary[""] = 0  # sentinel so the empty placeholders compare as zero
    h1 = ""
    h2 = ""
    h3 = ""
    # Single pass keeping the top three counts ordered h1 >= h2 >= h3.
    for i in range(len(l1)):
        if l2[i] > dictionary[h1]:
            h3 = h2
            h2 = h1
            h1 = l1[i]
        elif l2[i] > dictionary[h2]:
            h3 = h2
            h2 = l1[i]
        elif l2[i] > dictionary[h3]:
            h3 = l1[i]
    if h1 != "":
        highchars.append(h1)
    if h2 != "":
        highchars.append(h2)
    if h3 != "":
        highchars.append(h3)
    # Originally the "return highchars" only ran when all three slots were
    # filled, so pastes with one or two characters came back "uncategorized".
    if highchars:
        return highchars
    return ["uncategorized"]


def isnsfw(words, nsfwwords):
    """Label a word list NSFW if it contains any flagged word."""
    for word in nsfwwords:
        if words.count(word) >= 1:
            return "NSFW"
    return "SFW"


api_key = "1bd4b82919f280d3e449cab6f169da33"  # unused below; presumably meant for the Pastebin API
raw_url = "http://pastebin.com/raw.php?i="
url = "http://pastebin.com/"

f = open("pasteids.txt", "r")
frl = f.readlines()

characters = ["Celestia", "Luna", "Cadance", "Velvet", "Twilight", "Maud", "Pinkie", "Applejack", "Rainbow", "Fluttershy", "Nurse Redheart",
              "Sweetie Belle", "Scootaloo", "Apple Bloom", "Spike", "Babs Seed", "Granny", "Tiara", "Silver Spoon",
              "Sunset", "Nightmare", "Chrysalis", "Adagio", "Sonata", "Aria", "Trixie", "Gilda", "Fleur", "Sugar Belle",
              "Spitfire", "Aloe", "Lotus", "Coco Pommel", "Tree Hugger", "Cloudchaser", "Flitter", "Blossomforth", "Lightning Dust"]
nsfw = ["pussy", "balls", "cock", "dick", "penis", "semen", "cum", "vagina", "horsepussy", "dripping"]

b = open("somanypastes.txt", "w")
massbin = open("megapaste.txt", "w")  # opened for a combined dump but never written to

x = 0
print(len(frl))
for i in frl[::-1]:
    print(x)  # progress counter
    x += 1
    charamounts = [0] * len(characters)
    paste_id = i.strip()
    if paste_id != "":
        a = raw_url + paste_id
        try:
            request = urllib2.Request(a)
            resp = urllib2.urlopen(request)
            html = resp.read()
            # Split on spaces, then strip punctuation and lowercase each word.
            # (The original loop rebound its loop variable, so the cleanup
            # never actually modified the word list.)
            allwords = [re.sub('[^0-9a-zA-Z]+', '', w).lower() for w in html.split(" ")]
            for z in range(len(characters)):
                # Compare in lowercase to match the normalised words.  Note
                # that multi-word names ("Nurse Redheart") can never match a
                # single space-split token and always count as zero.
                charamounts[z] = allwords.count(characters[z].lower())
            rating = isnsfw(allwords, nsfw)
            b.write(url + paste_id + " " + rating + " " + ", ".join(getmostcharacters(characters, charamounts)) + "\n")
            ##print(paste_id + " " + rating + " " + ", ".join(getmostcharacters(characters, charamounts)))
        except urllib2.HTTPError as e:
            ##print(e.code)
            pass
        except urllib2.URLError as e:
            ##print(e)
            pass
        except KeyboardInterrupt:
            f.close()
            b.close()
            massbin.close()
            print("Exit.")
            sys.exit()

f.close()
b.close()
massbin.close()