import sys
import threading
import time
from multiprocessing import Process, Manager
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.signalmanager import dispatcher
from scrapy.spiders import CrawlSpider, Rule

from scraper import http_queue
from scraper.globals import GLOBALS
from scraper.log import root_logger
from scraper.stop_event import global_stop_event

# Module-level logger, created as a child of the project's root logger.
logger = root_logger.get_child('WEBRING.SCRAPE')

# Hosts whose links are never reported: PngSpider.parse_item skips any
# discovered link that points at one of these.
EXCLUDED_DOMAINS = ['beta.character.ai', 'c.ai', 'avatars.charhub.io', 'images.characterhub.org']


class PngSpider(CrawlSpider):
    """
    Crawl a single site and yield the URL of every PNG it links to or embeds.

    The spider starts at ``start_url``, follows every link the default
    ``LinkExtractor`` finds (restricted to the start URL's host through
    ``allowed_domains``) and emits one ``{'image_url': <absolute url>}`` item
    for each ``<img src>`` / ``<a href>`` whose path ends in ``.png``.
    """
    name = 'png_spider'

    # Follow every extracted link and hand each response to parse_item().
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def __init__(self, start_url, *a, **kw):
        """
        :param start_url: URL the crawl starts from; its host is the only host crawled.
        """
        super().__init__(*a, **kw)
        self.start_urls = [start_url]

        # Use the hostname rather than the raw netloc: netloc keeps any port,
        # userinfo and upper-case letters, and Scrapy's offsite filtering
        # ignores allowed_domains entries that carry a port, which would make
        # every followed link count as offsite. Fall back to netloc so a URL
        # without a parsable hostname keeps its previous value.
        parsed_start = urlparse(start_url)
        self.allowed_domains = [parsed_start.hostname or parsed_start.netloc]

    def parse_start_url(self, response, **kwargs):
        """Run the start page through parse_item() too."""
        # NOTE(review): this raises SystemExit from inside a Scrapy callback;
        # confirm it really stops the crawl (scrapy.exceptions.CloseSpider is
        # the documented mechanism).
        if global_stop_event.is_set():
            sys.exit(0)

        # Handle on one-page sites as well
        return self.parse_item(response)

    def parse_item(self, response):
        """
        Yield ``{'image_url': url}`` for every PNG referenced by an HTML response.

        Non-HTML responses yield nothing. Errors raised while extracting links
        are logged and end processing of this response.

        :param response: the Scrapy response to inspect.
        """
        if global_stop_event.is_set():
            sys.exit(0)

        # The header may be absent (None); undecodable bytes must not abort the callback.
        content_type = response.headers.get('Content-Type') or b''
        if 'text/html' not in content_type.decode('utf-8', errors='replace').lower():
            return

        try:
            img_srcs = response.css('img::attr(src)').getall()
            a_hrefs = response.css('a::attr(href)').getall()
            found_imgs = set(img_srcs + a_hrefs)

            logger.debug(f'Found {len(found_imgs)} image links on {response.url}')

            for link in found_imgs:
                if not link:
                    continue  # Skip empty links

                absolute_url = response.urljoin(link)
                parsed_url = urlparse(absolute_url)

                # Compare the lower-cased, port-less hostname so variants such
                # as "C.AI" or "c.ai:443" are excluded as well.
                host = parsed_url.hostname or parsed_url.netloc
                if host in EXCLUDED_DOMAINS:
                    logger.debug(f'Excluded domain: {parsed_url.netloc}')
                    continue  # Skip excluded domains

                if parsed_url.path.lower().endswith('.png'):
                    logger.debug(f'Found PNG: "{absolute_url}"')
                    yield {'image_url': absolute_url}

        except Exception as e:
            logger.error(f'PngSpider parse_item() - {e.__class__.__name__}: {e} - {response.url}')


def run_spider(target_site, results):
    """
    Crawl ``target_site`` with PngSpider and append every scraped item to ``results``.

    Blocks until the crawl has finished.

    :param target_site: URL the spider starts from.
    :param results: object with an ``append`` method that receives each scraped
        item (crawl_site passes a ``Manager().list()`` proxy).
    :return: None
    """
    crawler_settings = {
        'USER_AGENT': GLOBALS.headers['user-agent'],
        'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
        'LOG_ENABLED': False,  # scrapy logging breaks our logger
        'ROBOTSTXT_OBEY': True,
    }

    # Parameter names are kept as-is because the signal dispatcher passes its
    # arguments by keyword.
    def _collect_item(signal, sender, item, response, spider):
        results.append(item)

    crawler = CrawlerProcess(crawler_settings)
    dispatcher.connect(_collect_item, signal=signals.item_scraped)
    crawler.crawl(PngSpider, start_url=target_site)

    # Blocks here until the crawling is finished.
    crawler.start()


def crawl_site(target_site):
    """
    Run a scraper in a separate process to get around the `twisted.internet.error.ReactorNotRestartable` error.

    A watchdog thread kills the child process if the global stop event is set
    while the crawl is running, and exits once the crawl has finished.

    :param target_site: URL of the site to crawl.
    :return: set of image URLs found by the spider (empty if the stop event was already set).
    """

    if global_stop_event.is_set():
        return set()

    # The response is not used any more: it only fed the
    # `chub-archive-exclude` meta tag check, which is no longer respected since
    # nothing else is excluded. The request itself is still made so behaviour
    # is unchanged.
    http_queue.add(target_site)

    with Manager() as manager:
        results = manager.list()

        p = Process(target=run_spider, args=(target_site, results))
        crawl_finished = threading.Event()

        def watchdog():
            # wait() serves as the poll delay and returns True as soon as the
            # crawl is over, so the thread does not keep running afterwards.
            while not crawl_finished.wait(0.01):
                if global_stop_event.is_set():
                    try:
                        p.kill()
                    except Exception as e:
                        # Best effort: the process may already have exited.
                        logger.debug(f'crawl_site() watchdog could not kill the crawler - {e.__class__.__name__}: {e}')
                    return

        watchdog_thread = threading.Thread(target=watchdog, daemon=True)

        # Start the process before the watchdog so kill() never targets a
        # process that has not been started yet.
        p.start()
        watchdog_thread.start()
        try:
            p.join()
        finally:
            crawl_finished.set()

        return {result['image_url'] for result in results}


def fetch_sites_list():
    """
    Fetch the list of member sites from https://chatbots.neocities.org.

    :return: set of the ``href`` values of every link inside the page's
        ``<div id="sites">``; an empty set if that element is missing.
    """
    html = http_queue.add('https://chatbots.neocities.org').text
    soup = BeautifulSoup(html, 'html.parser')
    sites_div = soup.find('div', id='sites')
    if sites_div is None:
        # find() returns None when nothing matches; report it instead of
        # failing with an AttributeError on the next line.
        logger.error('fetch_sites_list() - no <div id="sites"> element found on https://chatbots.neocities.org')
        return set()
    return {a['href'] for a in sites_div.find_all('a', href=True)}
