import threading
from urllib.parse import urlparse

from bs4 import BeautifulSoup

import scraper
from scraper import http_queue
from scraper.catbox.scrape import search_for_catbox_png
from scraper.log import root_logger

# Number of slots in the crawl semaphore below.
MAX_ACTIVE_CRAWL_THREADS = 50

# Module-wide semaphore acquired by crawl_rentry_page to cap concurrent crawl work.
semaphore = threading.Semaphore(MAX_ACTIVE_CRAWL_THREADS)

# Child of the project's root logger, named 'CATBOX.RENTRY'.
_logger = root_logger.get_child('CATBOX.RENTRY')


def crawl_rentry_page(url, depth: int = 2, crawled: dict = None):
    """
    Crawl a rentry.org page up to a certain depth.

    The page is fetched through ``http_queue``; every absolute ``https://rentry.org/``
    link found on it is then crawled in its own thread with ``depth - 1``.

    :param url: The rentry.org page to start on
    :param depth: How many pages deep to crawl. 2 pages means crawl the starting page and one level below.
    :param crawled: Internal accumulator shared by the recursive calls; callers should leave it unset.
    :return: Dict mapping each successfully fetched page URL to the set of links found on it,
        as filtered by ``remove_internal_links``.
    """
    if crawled is None:
        crawled = {}

    if depth == 0 or url in crawled:
        return crawled

    # Hold a semaphore slot only while this page is fetched and parsed. Keeping the slot
    # while joining child threads (which need slots of their own) deadlocks as soon as
    # every slot is owned by a caller that is waiting on its children.
    with semaphore:
        response = http_queue.add(url)
        if response.status_code != 200:
            return crawled

        soup = BeautifulSoup(response.text, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', href=True)]

    crawled[url] = remove_internal_links(links)

    if depth == 1:
        # Children would be started with depth 0 and return immediately; skip the threads.
        return crawled

    # dict.fromkeys drops repeated hrefs (keeping page order) so the same link is not
    # handed to several threads at once.
    rentry_links = dict.fromkeys(link for link in links if link.startswith('https://rentry.org/'))

    threads = []
    for link in rentry_links:
        if link not in crawled:
            thread = threading.Thread(target=crawl_rentry_page, args=(link, depth - 1, crawled))
            thread.start()
            threads.append(thread)

    # Wait for the whole subtree so the returned dict is complete.
    for thread in threads:
        thread.join()

    return crawled


def remove_internal_links(links: list):
    """
    Deduplicate ``links`` and drop the ones this module treats as internal.

    Dropped entries are the fixed paths '/', '/what', '/how', '/langs' and
    '/what#contacts', any link starting with '#', and any link ending in
    '/edit', '/raw', '/pdf' or '/png'.

    :param links: Link strings (hrefs) collected from a page.
    :return: A set containing the remaining links.
    """
    fixed_paths = ('/', '/what', '/how', '/langs', '/what#contacts')
    dropped_suffixes = ('/edit', '/raw', '/pdf', '/png')
    return {
        href
        for href in set(links)
        if href not in fixed_paths
        and not href.startswith('#')
        and not href.endswith(dropped_suffixes)
    }


def parse_rentry(show, url):
    """
    Fetch ``url`` through ``scraper.http_queue`` and scan the response body.

    :param show: When truthy, the URL is logged at info level after the fetch.
    :param url: The page to fetch.
    :return: Whatever ``search_for_catbox_png`` returns for the response text.
    """
    fetched = scraper.http_queue.add(url)
    if show:
        _logger.info(url)
    page_text = fetched.text
    return search_for_catbox_png(page_text)


def extract_catbox_from_crawled_rentry(rentry_pages: dict):
    """
    Split a crawl result into the crawled pages and the catbox PNG links found on them.

    :param rentry_pages: Dict mapping a page URL to an iterable of links found on that page
        (the shape returned by ``crawl_rentry_page``).
    :return: Tuple ``(sub_rentry_links, catbox_links)``: the set of page URLs (the dict keys)
        and the set of links whose host is a subdomain of catbox.moe and whose path ends in '.png'.
    """
    sub_rentry_links = set(rentry_pages)
    catbox_links = set()
    for links in rentry_pages.values():
        for link in links:
            try:
                url_parts = urlparse(link)
                # hostname is lower-cased and stripped of port/userinfo; None for relative links.
                host = url_parts.hostname or ''
            except ValueError:
                # Malformed href scraped from a page (e.g. unbalanced '['); skip it
                # rather than aborting the whole extraction.
                continue
            # Suffix match: a substring test would also accept look-alike hosts such as
            # 'files.catbox.moe.evil.com'.
            if host.endswith('.catbox.moe') and url_parts.path.endswith('.png'):
                catbox_links.add(link)
    return sub_rentry_links, catbox_links
