#!/usr/bin/env python3
import argparse
import logging
import re
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Manager

import coloredlogs
from bs4 import BeautifulSoup

from scraper.catbox.rentry import CrawlerThread
from scraper.catbox.scrape import search_for_catbox_png
from scraper.proxied_http import handle_request

# Shared module logger. It is bound at import time so the helper functions can
# log even when main() has not run (e.g. when this module is imported);
# main() rebinds it after configuring logging.
logger: logging.Logger = logging.getLogger('MAIN')


def parse_rentry(url):
    """Download a page and run ``search_for_catbox_png`` over its text.

    Args:
        url: Address passed to ``handle_request``; it is also logged at
            INFO level.

    Returns:
        The result of ``search_for_catbox_png`` for the response text.
    """
    logger.info(url)
    result = handle_request(url)
    # main() unpacks handle_request() as a ``(response, url)`` pair, in which
    # case reading ``.text`` from the raw return value would fail on a tuple.
    # Accept either shape so this helper agrees with that call site.
    # NOTE(review): confirm the actual return type of handle_request.
    response = result[0] if isinstance(result, tuple) else result
    return search_for_catbox_png(response.text)


def search_for_rentry(text):
    """Collect the rentry.org links found in an HTML document.

    Only ``<a>`` tags that carry an ``href`` and match ``rel='nofollow'`` and
    ``target='_blank'`` are inspected. Links whose ``href`` starts with
    ``https://rentry.org/`` are kept, with any ``#...`` suffix removed.

    Args:
        text: HTML markup to parse.

    Returns:
        A set of the matching URLs (duplicates collapse naturally).
    """
    document = BeautifulSoup(text, 'html.parser')
    rentry_urls = set()
    for anchor in document.find_all('a', href=True, rel='nofollow', target='_blank'):
        href = anchor['href']
        if not href.startswith('https://rentry.org/'):
            continue
        # Strip the fragment so the same page is not recorded more than once.
        cleaned = re.sub(r'#.*?$', '', href)
        logger.debug(cleaned)
        rentry_urls.add(cleaned)
    return rentry_urls


def main():
    """Search desuarchive for rentry.org links, crawl them, and save the URLs.

    Steps:
      1. Parse the command-line arguments and configure logging.
      2. Walk ``--pages`` desuarchive search-result pages and gather every
         rentry.org link via ``search_for_rentry``.
      3. Run one ``CrawlerThread.crawl`` per gathered link on a thread pool
         sized by ``--threads``; crawler failures are logged.
      4. Write the gathered links plus the crawler results, de-duplicated and
         one per line, to ``./rentry-spider.txt``.
    """
    global logger
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--threads', type=int, default=10, help='Number of threads to use.')
    parser.add_argument('--pages', type=int, default=301, help='How many pages to parse on desuarchive.')
    parser.add_argument('--rentry-limit', type=int, default=301, help='Limit passed to each rentry crawler.')
    # NOTE(review): --output and --list-file are accepted but nothing in this
    # script reads them; wire them up or remove them.
    parser.add_argument('--output', default='./tavern-catbox-found', help='Directory to save the files.')
    parser.add_argument('--list-file', default=None, help='File to save the latest scrape info to.')
    parser.add_argument('-d', '--debug', action='store_true', help='Enable debug logging.')
    args = parser.parse_args()

    # Honour --debug instead of always logging at DEBUG.
    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.getLogger().setLevel(logging.WARNING)
    logger = logging.getLogger('MAIN')
    logger.setLevel(log_level)
    coloredlogs.install(level=log_level)
    # Keep the HTTP libraries quiet regardless of the chosen level.
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)

    rentry_links = set()
    for page in range(1, args.pages + 1):
        logger.info(f'Page {page}')
        response, _ = handle_request(f"https://desuarchive.org/g/search/text/https%3A%2F%2Frentry.org%2F/{page}/")
        rentry_links.update(search_for_rentry(response.text))

    logger.info(f'{len(rentry_links)} to crawl.')

    with Manager() as manager:
        found = manager.list()
        futures = {}
        # ThreadPoolExecutor rejects max_workers <= 0, so clamp the user value.
        with ThreadPoolExecutor(max_workers=max(1, args.threads)) as executor:
            for url in rentry_links:
                crawler = CrawlerThread(url, args.rentry_limit, found)
                futures[executor.submit(crawler.crawl, url)] = url

        # Leaving the executor block waits for every task, so all futures are
        # finished here. Surface failures instead of dropping them silently.
        for future, url in futures.items():
            error = future.exception()
            if error is not None:
                logger.error(f'Crawling {url} failed: {error}')

        # Copy the shared list while the manager process is still alive.
        crawled = found[:]

    # NOTE(review): assumes CrawlerThread appends URL strings to the shared
    # list - confirm against scraper.catbox.rentry. The seed links are kept so
    # the output is a superset of what was written before.
    found_clean = set(rentry_links)
    found_clean.update(crawled)
    print(f'Found {len(found_clean)} URLs')
    with open("./rentry-spider.txt", "w", encoding="utf-8") as file:
        file.writelines(f'{item}\n' for item in found_clean)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
