#!/usr/bin/env python
#-------------------------------------------------------------------------------
# Name:        module1
# Purpose:     Scan saved HTML/text files for pastebin.com links and export
#              the deduplicated list to foundlinks.txt.
#
# Author:      new
#
# Created:     15/12/2011
# Copyright:   (c) new 2011
# Licence:     <your licence>
#-------------------------------------------------------------------------------


import re
import os
import sys
import fnmatch
import scanpastebin
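# scanpastebin is a companion module kept alongside this script; it is assumed
# to provide check_if_pastebin_html() and grab_paste(), as used in main() below.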



def scantext(text):
    # Find pastebin.com links of two kinds:
    #   pastebin.com/u/<anything>   - user pages
    #   pastebin.com/<8 word chars> - individual pastes
    regexes = [
        r'(?:http://)?(?:www\.)?pastebin\.com/u/\w+',   # user pages
        r'(?:http://)?(?:www\.)?pastebin\.com/\w{8}',   # individual pastes
        ]
    cleanedmatches = []
    for regex in regexes:
        matches = re.findall(regex, text, re.IGNORECASE | re.DOTALL)
        cleanedmatches.extend(matches)
    return cleanedmatches
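
# Illustrative sketch of scantext() behaviour (hypothetical input and output):
#   scantext('see http://pastebin.com/u/someuser or pastebin.com/aB3dE6gH')
#   -> ['http://pastebin.com/u/someuser', 'pastebin.com/aB3dE6gH']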


def walk():
    # Recursively collect HTML and text files under the folder the script is in.
    rootPath = sys.path[0]
    patterns = ['*.html', '*.htm', '*.txt']
    matches = []
    for root, dirs, files in os.walk(rootPath):
        for pattern in patterns:
            for filename in fnmatch.filter(files, pattern):
                matches.append(os.path.join(root, filename))
    print matches
    return matches
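
# Note: fnmatch.filter() normalises case with os.path.normcase, so the pattern
# match above is case-insensitive on Windows but case-sensitive on POSIX.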


def readfile(path):
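    # 'rU' opens the file in universal-newline mode (Python 2), so '\r\n' and
    # '\r' line endings both come back as '\n'.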
    f = open(path, 'rU')
    data = f.read()
    f.close()
    return data


def exportlist(listin):
    # Take a list of strings and save it to a file with a newline between items.
    data = '\n'.join(listin)
    filepath = 'foundlinks.txt'
    print 'Savefile: Saving file: ', filepath
    f = open(filepath, 'wb')  # binary mode avoids newline translation on Windows
    f.write(data)
    f.close()
    return


def removeduplicates(listin):
    #take a list of strings and return a list with duplicate items removed
    #http://love-python.blogspot.com.au/2008/09/remove-duplicate-items-from-list-using.html
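    # Note: converting to a set discards the original ordering, so the output
    # list comes back in arbitrary order.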
    listout = list(set(listin))
    return listout

def crop_middle_filenames(filepathlist):
    # Take a list of filepaths and return a similar list keeping only the first
    # and last file (by alphabetical order) from each folder.
    # Group filepaths by folder.
    sortingdict = {}
    for filepath in filepathlist:
        prefix = os.path.dirname(filepath)
        filename = os.path.basename(filepath)
        print prefix, filename
        try:
            sortingdict[prefix].append(filepath)
        except KeyError:
            sortingdict[prefix] = [filepath]
    # For each folder, keep the first and last items.
    outputitems = []
    for key in sortingdict.keys():
        folderitems = sortingdict[key]
        folderitems.sort()
        outputitems.append(folderitems[0])
        outputitems.append(folderitems[-1])
    return outputitems
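
# Illustrative sketch of crop_middle_filenames() (hypothetical paths):
#   crop_middle_filenames(['Threads/a.html', 'Threads/b.html', 'Threads/c.html'])
#   -> ['Threads/a.html', 'Threads/c.html']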


def main():
    linklist = []
    filecounter = 0
    foundfilepaths = walk()
    # Only crop when the files came from per-thread folders.
    if any(os.sep + 'Threads' + os.sep in path for path in foundfilepaths):
        keptpaths = crop_middle_filenames(foundfilepaths)
    else:
        keptpaths = foundfilepaths
    for filepath in keptpaths:
        filecounter += 1
        print '\nReading file', filecounter, ':', filepath
        data = readfile(filepath)
        if scanpastebin.check_if_pastebin_html(data):
            data = scanpastebin.grab_paste(data)
        links = scantext(data)
        if len(links) != 0:
            linklist = linklist + links
    print '\nRemoving duplicates\n'
    outputlist = removeduplicates(linklist)
    print 'outputlist:\n', outputlist
    exportlist(outputlist)

if __name__ == '__main__':
    main()
    print 'Job done.'
    raw_input('Press enter to exit')