#!/usr/bin/env python
#-------------------------------------------------------------------------------
# Name: module1
# Purpose:
#
# Author: new
#
# Created: 22/04/2013
# Copyright: (c) new 2013
# Licence: <your licence>
#-------------------------------------------------------------------------------
import logging
import re

from bs4 import BeautifulSoup, SoupStrainer
import lxml.html
from selenium import webdriver
# Setup logging (Before running any other code)
# http://inventwithpython.com/blog/2012/04/06/stop-using-print-for-debugging-a-5-minute-quickstart-guide-to-pythons-logging-module/
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
fh = logging.FileHandler('Log.txt')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
logger.addHandler(ch)
logger.debug('Logging started.')
# End logging setup
def login(username, password):
    # Log in to writing.com using the module-level selenium driver.
    logger.debug("Logging in")
    # Load login page
    driver.get("http://writing.com")
    # Select username box
    usernamebox = driver.find_element_by_name("login_username")
    # Enter username
    usernamebox.send_keys(username)
    # Select password box
    passwordbox = driver.find_element_by_name("login_password")
    # Enter password
    passwordbox.send_keys(password)
    # Submit form
    passwordbox.submit()
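
# A minimal sketch of an explicit wait helper (an addition, not called anywhere
# in this script): instead of assuming the next page has loaded, block until an
# element with the given name attribute is present. Uses Selenium's standard
# WebDriverWait/expected_conditions helpers; the timeout value is an arbitrary
# choice and the helper name is made up for illustration.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_element_named(name, timeout=10):
    # Poll until an element with this name attribute appears, or raise
    # TimeoutException after `timeout` seconds.
    # Example use: wait_for_element_named("login_password") after driver.get(...)
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.NAME, name)))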
def findlinks(html):
    # Copied from:
    # http://stackoverflow.com/questions/520031/whats-the-cleanest-way-to-extract-urls-from-a-string-using-python
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    links = re.findall(url_regex, html, re.DOTALL)
    return links
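
# Alternative sketch for link extraction (an addition, not used below): let
# BeautifulSoup pull href attributes out of anchor tags instead of regex-matching
# the raw HTML. Assumes the links of interest are absolute URLs inside
# <a href="..."> tags, as they appear on the search result pages.
def findlinks_bs4(html):
    # Parse only the anchor tags to keep the soup small.
    anchors = BeautifulSoup(html, "lxml", parse_only=SoupStrainer("a"))
    return [a["href"] for a in anchors.find_all("a", href=True)]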
def main():
    login(username, password)
if __name__ == '__main__':
    # Init selenium browser
    driver = webdriver.Firefox()
    # Set username and password to use
    username = "drymarker"
    password = "notrouble"
    main()
    #def searchInteractives(searchstring, itemtype, sortmode, maxsearchpages):
    # Run search for items
    searchstring = "vore"
    itemtype = "Interactives"
    sortmode = "Newest Last"
    maxsearchpages = 100
    foundlinks = []
    # Load search page
    driver.get("http://www.writing.com/main/search?")
    # Set view to detailed
    viewdetailed = driver.find_element_by_name("lp2")
    viewdetailed.click()
    # Fill in search form
    searchbox = driver.find_element_by_name("search_for")
    searchbox.clear()
    searchbox.send_keys(searchstring)
    itemtypemenu = driver.find_element_by_name("ps_type")
    for typeoption in itemtypemenu.find_elements_by_tag_name("option"):
        if itemtype in typeoption.text:
            typeoption.click()
    # Run search page one
    searchbox.submit()
    # Change sort mode
    logger.debug("Changing sort mode to: " + sortmode)
    sortmenu = driver.find_element_by_name("sort_by")
    for sortoption in sortmenu.find_elements_by_tag_name("option"):
        if sortmode in sortoption.text:
            sortoption.click()
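    # Alternative sketch: Selenium's Select helper can pick drop-down options
    # directly. Left commented out because it requires an exact visible-text
    # match, which is unverified against the live site:
    # from selenium.webdriver.support.ui import Select
    # Select(sortmenu).select_by_visible_text(sortmode)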
    nextpagebutton = driver.find_element_by_xpath('//*[@id="Content_Column_Inner"]/font/table[1]/tbody/tr/td/form/table/tbody/tr[2]/td/input[5]')
    nextpagebutton.click()
    # Iterate over search result pages
    for searchloopcounter in xrange(maxsearchpages):
        # Get page HTML
        pagehtml = driver.page_source
        # TODO: Run search for more pages, collecting html for all
        # Find all item links inside center pane
        # Extract center pane (assumes both markers appear in the page source)
        centerpanesearch = re.search(r"Content_Column_Inner.+Footer_Wrapper", pagehtml, re.IGNORECASE|re.DOTALL)
        centerpanehtml = centerpanesearch.group()
        # Grab links to items
        pagelinks = findlinks(centerpanehtml)
        # Remove anything not an item link
        for link in pagelinks:
            if "view_item" in link:
                foundlinks.append(link)
        # Grab page number
        # "Viewing page <b>54</b> of <b>53</b>"
        pagenumbersearch = re.search(r"Viewing page <b>(\d+)</b> of <b>(\d+)</b>", pagehtml, re.IGNORECASE|re.DOTALL)
        if pagenumbersearch:
            currentpage = int(pagenumbersearch.group(1))
            totalpages = int(pagenumbersearch.group(2))
            logger.debug(pagenumbersearch.group(0))
            # Check if there is a next page
            if currentpage < totalpages:
                # Load next page of results
                nextpagebutton = driver.find_element_by_xpath('//*[@id="Content_Column_Inner"]/font/table[1]/tbody/tr/td/form/table/tbody/tr[2]/td/input[5]')
                nextpagebutton.click()
            else:
                break
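    # Note: the absolute XPath used for the next-page button above is brittle;
    # if writing.com changes its search page layout it will stop matching. A
    # looser locator could be substituted, e.g. (assumed button labelling,
    # unverified against the live site):
    # nextpagebutton = driver.find_element_by_xpath('//input[contains(@value, "Next")]')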
    #def favitems(favurllist):
    logger.debug("Starting to add items to favs.")
    favurllist = foundlinks[:]
    logger.debug(str(favurllist))
    # Add items to favs
    for favpageurl in favurllist:
        logger.debug("Loading url: " + favpageurl)
        # Open item page
        driver.get(favpageurl)
        # Get page HTML
        favpagehtml = driver.page_source
        # Check if already a fav
        favchecksearch = re.search(r"Item #\d+ is a favorite of yours", favpagehtml, re.IGNORECASE|re.DOTALL)
        isfav = favchecksearch is not None
        # Add to favs if needed
        if not isfav:
            logger.debug("Item is not faved, adding to favs.")
            # Click add to favs link
            favxpath = '//*[@id="Content_Column_Inner"]/font/div[3]/table/tbody/tr/td[2]/div[1]/div[1]/table/tbody/tr/td[2]/a[2]'
            favbutton = driver.find_element_by_xpath(favxpath)
            favbutton.click()
            # Confirm fav add worked
            favconfirmhtml = driver.page_source
            favconfirmsearch = re.search(r"Item #\d+ is a favorite of yours", favconfirmhtml, re.IGNORECASE|re.DOTALL)
            assert favconfirmsearch
            if favconfirmsearch:
                logger.debug("Added to favs")
        else:
            logger.debug("Item is already a fav")