import functools
import re
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Union
from urllib.parse import urlparse

from bs4 import BeautifulSoup

import scraper
from scraper.log import root_logger

_logger = root_logger.get_child('CATBOX.DESU')


def scrape_archive_for_domain(archive: str, board: str, domain: str, pages: int, desu_threads: int, search_func):
    """Search an archive for posts linking to ``domain`` and collect the extracted links.

    Two searches are run through ``search_links``: one restricted to ``board``
    and one using the ``/_/`` path, which is treated as the archive-wide
    (global) search.

    Args:
        archive: Hostname of the archive; interpolated into the search URLs.
        board: Board name used as the first path segment of the board search.
        domain: Domain whose ``https://`` links are searched for.
        pages: Number of result pages to request for each search.
        desu_threads: Worker thread count handed to ``search_links``.
        search_func: Called with the HTML of each result page; passed through
            to ``search_links`` unchanged.

    Returns:
        The union of the link sets produced by both searches.
    """
    # Honour the requested board instead of a hard-coded one, and search it
    # only once: running the same query twice just doubles the request load.
    _logger.info(f"Searching for '{domain}' on {archive}/{board}/")
    board_links = search_links(pages, f"https://{archive}/{board}/search/text/https%3A%2F%2F{domain}%2F/page/",
                               search_func, desu_threads)

    # NOTE(review): the global query has no trailing %2F after the domain,
    # unlike the board query. Kept exactly as before; confirm it is intended.
    _logger.info(f"Searching for '{domain}' on {archive} (global)")
    global_links = search_links(pages, f"https://{archive}/_/search/text/https%3A%2F%2F{domain}/page/",
                                search_func, desu_threads)

    return board_links | global_links


def search_for_rentry(text):
    """Extract rentry.org links from an HTML document.

    Only ``<a>`` tags that carry an ``href``, ``rel="nofollow"`` and
    ``target="_blank"`` are considered. The URL fragment (everything from
    ``#`` onwards) is removed before the link is recorded.

    Args:
        text: HTML markup to scan.

    Returns:
        A set of the fragment-free rentry.org URLs found in ``text``.
    """
    rentry_prefix = 'https://rentry.org/'
    anchors = BeautifulSoup(text, 'html.parser').find_all('a', href=True, rel='nofollow', target='_blank')

    rentry_urls = set()
    for anchor in anchors:
        href = anchor['href']
        if not href.startswith(rentry_prefix):
            continue
        # Drop the fragment so the same page is not recorded once per anchor.
        cleaned = re.sub(r'#.*?$', '', href)
        _logger.debug(cleaned)
        rentry_urls.add(cleaned)
    return rentry_urls


def process_desu_page(search_url, search_func: Union[Callable[[str], set], Callable[[str], str], set], page_num,
                      *, retries: int = 3, retry_delay: float = 10):
    """Fetch one search result page and run ``search_func`` on its body.

    The page URL is ``search_url`` followed by ``page_num`` and a trailing
    slash. The request is submitted through ``scraper.http_queue.add`` and
    retried when the response is falsy or its status code is not 200.

    Args:
        search_url: URL prefix the page number is appended to.
        search_func: Called with the response text of a successful request.
        page_num: Page number to fetch.
        retries: Maximum number of attempts (default 3).
        retry_delay: Seconds to wait between attempts (default 10).

    Returns:
        A ``(page_num, result)`` tuple, where ``result`` is whatever
        ``search_func`` returned, or an empty set if every attempt failed.
    """
    u = urlparse(search_url)
    for attempt in range(1, retries + 1):
        response = scraper.http_queue.add(f"{search_url}{page_num}/")
        if response and response.status_code == 200:
            _logger.info(f'{u.hostname} page {page_num}')
            return page_num, search_func(response.text)

        if not response:
            _logger.debug(f'{u.hostname} page {page_num} returned nothing')
        else:
            # Truthy response with a status other than 200.
            _logger.debug(f'{u.hostname} page {page_num} got {response.status_code}!')

        # Only wait when another attempt follows; sleeping after the last
        # failure would just delay the error report.
        if attempt < retries:
            time.sleep(retry_delay)

    _logger.error(f'{u.hostname} page {page_num} failed!')
    return page_num, set()


def search_links(pages, search_url, search_func: Union[Callable[[str], set], Callable[[str], str], set], threads):
    """Fetch result pages ``1..pages`` concurrently and merge what was found.

    Each page is handled by ``process_desu_page`` on a thread pool. If
    ``search_url`` does not already end with ``/page/``, that suffix is
    appended so the page number can be added directly after it.

    Args:
        pages: Number of result pages to fetch, starting at page 1.
        search_url: Search URL, with or without the trailing ``/page/``.
        search_func: Called with each page's HTML; its result must be
            iterable so the items can be merged into one set.
        threads: Maximum number of worker threads.

    Returns:
        A set containing every item returned for any page.
    """
    u = urlparse(search_url)
    if not search_url.endswith('/page/'):
        search_url = f'{search_url.strip("/")}/page/'

    fetch_page = functools.partial(process_desu_page, search_url, search_func)
    with ThreadPoolExecutor(max_workers=threads) as executor:
        # executor.map returns a one-shot iterator. Materialise it so the
        # results can be walked twice: once to merge, once to log per page.
        results = list(executor.map(fetch_page, range(1, pages + 1)))

    try:
        links = {link for _, page_links in results for link in page_links}
    except Exception:
        # Record what was being processed before propagating the error.
        _logger.error(f'Failed to merge results for {search_url} ({pages} pages): {results}')
        raise

    for page_num, page_links in results:
        _logger.info(f'{u.hostname} - Page {page_num} - {len(page_links)} links')
    return links
