import functools
import os
import re
import traceback
from concurrent.futures import ThreadPoolExecutor
from typing import List, Callable, Union

from bs4 import BeautifulSoup

from .desu import search_links
from ..log import root_logger

# Logger used by every helper in this module; obtained from the project's root
# logger under the 'CATBOX.SCRAPE' name.
_logger = root_logger.get_child('CATBOX.SCRAPE')

# Substring that marks a URL as pointing at catbox. Because it begins with a
# dot, only hosts that end in ".catbox.moe" (i.e. subdomains) contain it.
BASE_URL = '.catbox.moe/'
# Extensions (as returned by os.path.splitext, leading dot included) that the
# HTML scraper accepts as images. The comparison is exact and case-sensitive.
VALID_EXTENSIONS = ['.png', '.webp']


def _is_catbox_image(url: str) -> bool:
    """Return True when *url* is a non-thumbnail catbox link with an accepted extension."""
    if BASE_URL not in url:
        return False
    # Thumbnails live under "<host>.catbox.moe/thumbs/". The URL is absolute
    # (scheme and subdomain come first), so this has to be a substring test;
    # a startswith() check against BASE_URL can never match such a URL.
    if BASE_URL + 'thumbs/' in url:
        return False
    _, ext = os.path.splitext(url)
    return ext in VALID_EXTENSIONS


def search_for_catbox_png(text: str):
    """Collect catbox image URLs referenced by an HTML document.

    Two sources are inspected:

    * ``<a>`` tags that carry an ``href`` together with ``rel="nofollow"`` and
      ``target="_blank"``;
    * ``<img>`` tags, using ``src`` and falling back to ``data-src`` when
      ``src`` is missing or empty.

    A URL is kept when it contains ``BASE_URL``, is not under the ``thumbs/``
    path, and its extension (per ``os.path.splitext``) is listed in
    ``VALID_EXTENSIONS``. Every kept URL is also written to the debug log.

    Args:
        text: HTML markup to scan.

    Returns:
        A set of matching URL strings. The set is empty when nothing matches
        or when the markup could not be parsed (a warning is logged then).
    """
    found = set()
    try:
        soup = BeautifulSoup(text, 'html.parser')
    except Exception:
        # Best-effort scraping: a page that cannot be parsed is logged and
        # skipped. ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        _logger.warning(f'Failed to parse with BeautifulSoup: {traceback.format_exc()}')
        return found

    for link in soup.find_all('a', href=True, rel='nofollow', target='_blank'):
        href = link['href']
        if _is_catbox_image(href):
            found.add(href)
            _logger.debug(href)

    for img in soup.find_all('img'):
        src = img.get('src') or img.get('data-src')
        if src and _is_catbox_image(src):
            found.add(src)
            _logger.debug(src)

    return found


def search_for_regex(regex_pattern: str, html_text: str):
    """Return the distinct, non-empty first-group captures of a pattern.

    Args:
        regex_pattern: Regular expression containing at least one capture
            group; ``group(1)`` of every match is what gets collected.
        html_text: Text to scan.

    Returns:
        A set with the truthy ``group(1)`` value of each match. Matches whose
        first group is empty or did not participate are ignored.

    Raises:
        IndexError: If the pattern matches but defines no capture group.
    """
    # Lazily pull group 1 from each match so captures are logged in match
    # order, one at a time, as they are discovered.
    first_groups = (hit.group(1) for hit in re.finditer(regex_pattern, html_text))
    captures = set()
    for captured in filter(None, first_groups):
        _logger.debug(captured)
        captures.add(captured)
    return captures


def parse_links(threads, links, parse_func, show: bool = False):
    """Run ``parse_func`` over every link on a thread pool and merge the output.

    Args:
        threads: Maximum number of worker threads for the pool.
        links: Iterable of items handed to ``parse_func`` one at a time.
        parse_func: Callable invoked as ``parse_func(show, link)``; it must
            return an iterable of hashable values.
        show: Value bound as the first positional argument of ``parse_func``.

    Returns:
        A set holding every value produced by every ``parse_func`` call.
    """
    worker = functools.partial(parse_func, show)
    with ThreadPoolExecutor(max_workers=threads) as pool:
        per_link = pool.map(worker, links)

    # Leaving the ``with`` block waits for all workers; the results (and any
    # exception raised by a worker) surface while iterating here.
    merged = set()
    for batch in per_link:
        merged.update(batch)
    return merged


def perform_archive_scrape(pages: int, domains: List[str], desu_threads: int, search_func: Union[Callable[[str], set], Callable[[str], str], set]):
    """Search the archive sites for posts that mention each domain.

    For every domain, four text-search URLs are queried in order: the
    desuarchive.org ``g`` and ``_`` paths, then the arch.b4k.co ``vg`` and
    ``_`` paths. Each URL is passed to ``search_links`` together with
    ``pages``, ``search_func`` and ``desu_threads``.

    Args:
        pages: Forwarded unchanged to ``search_links``.
        domains: Host names embedded (after a URL-encoded ``https://``) in the
            search query.
        desu_threads: Forwarded unchanged to ``search_links``.
        search_func: Forwarded unchanged to ``search_links``.

    Returns:
        A set containing the union of everything ``search_links`` returned.
    """
    collected = set()
    for domain in domains:
        search_urls = (
            f"https://desuarchive.org/g/search/text/https%3A%2F%2F{domain}%2F/page/",
            f"https://desuarchive.org/_/search/text/https%3A%2F%2F{domain}%2F/page/",
            f"https://arch.b4k.co/vg/search/text/https%3A%2F%2F{domain}%2F/page/",
            f"https://arch.b4k.co/_/search/text/https%3A%2F%2F{domain}%2F/page/",
        )
        for url in search_urls:
            collected.update(search_links(pages, url, search_func, desu_threads))
    return collected
