#!/usr/bin/env python3
import argparse
import functools
import logging
import os
import signal
import traceback
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse

import scraper
from scraper import log
from scraper.config.load import load_cfg
from scraper.database.connection import Database
from scraper.suicide import signal_handler, watchdog_expired
from scraper.time import calculate_elapsed_time
from scraper.webring.download import webring_download_card
from scraper.webring.scape import fetch_sites_list, crawl_site

# Directory containing this script (symlinks resolved); used to build the default --config path.
_SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
# Captured at import time so main() can report the total process run time.
_START_TIMESTAMP = datetime.now()

# Route SIGINT (Ctrl+C) to the project's handler imported from scraper.suicide.
signal.signal(signal.SIGINT, signal_handler)


# URL prefixes that clean_images() always discards.
_IGNORED_URL_PREFIXES = (
    'https://gfycat.com/',
    'https://web.archive.org/',
    'https://x.',
    'https://mega.nz',
    'https://danbooru.donmai.us',
    'https://twitter.com',
)


def clean_images(found_images, domain, site_domains):
    """Filter and normalise the image URLs found while crawling one site.

    A URL is kept when all of the following hold:

    * it starts with ``http``;
    * it does not start with one of ``_IGNORED_URL_PREFIXES``;
    * its host is either ``domain`` itself or a host that is not one of the
      other entries in ``site_domains``.

    Backslashes are converted to forward slashes *before* the URL is parsed,
    so a link such as ``https://other-site\\img.png`` is attributed to
    ``other-site`` instead of to a bogus host containing a backslash.

    :param found_images: iterable of URL strings found on the site.
    :param domain: network location (host) of the site being crawled.
    :param site_domains: iterable of the hosts of every site in the crawl list.
    :return: set of normalised URL strings that passed the filter.
    """
    # Hosts whose images must be skipped: every listed site except the one
    # being crawled. A set keeps the per-URL membership test O(1).
    other_site_domains = set(site_domains)
    other_site_domains.discard(domain)

    cleaned = set()
    for raw_url in found_images:
        img_url = raw_url.replace('\\', '/')
        if not img_url.startswith('http') or img_url.startswith(_IGNORED_URL_PREFIXES):
            continue
        try:
            netloc = urlparse(img_url).netloc
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. unbalanced "[" in
            # the host); one bad link must not abort the whole crawl.
            continue
        if netloc not in other_site_domains:
            cleaned.add(img_url)
    return cleaned


def main(args):
    """Crawl every site in the fetched list and download newly found cards.

    Sets up logging (publishing the logger as the module-global ``_logger``),
    loads the configuration, initialises the database connection pool, then
    for each site crawls its root page, filters the discovered image URLs with
    ``clean_images`` and hands them to ``webring_download_card`` on a thread
    pool. Download errors other than ``'Invalid PNG metadata'`` are logged;
    new cards are logged and counted.

    :param args: parsed command-line arguments; ``args.debug`` selects the log
        level and the whole object is passed to ``load_cfg``.
    """
    global _logger
    log.root_logger.init(logging.DEBUG if args.debug else logging.INFO)
    _logger = log.root_logger.logger

    cfg = load_cfg(args)
    Database.initialise(
        minconn=1,
        maxconn=100,
        host=cfg.database.host,
        database=cfg.database.database,
        user=cfg.database.user,
        password=cfg.database.password,
    )

    ring_sites = fetch_sites_list()
    ring_domains = [urlparse(site_url).netloc for site_url in ring_sites]
    _logger.info(f'Crawling {len(ring_sites)} sites...')
    crawl_started = datetime.now()

    new_card_count = 0

    # Only the host of each listed site is used, so walk the hosts directly.
    for domain in ring_domains:
        _logger.info(f'Crawling {domain}')

        crawled_urls = crawl_site(f'https://{domain}/')
        image_urls = clean_images(crawled_urls, domain, ring_domains)
        if not image_urls:
            continue

        with ThreadPoolExecutor(max_workers=cfg.webring.download_threads) as pool:
            download_card = functools.partial(webring_download_card, domain, cfg.hashed_data_path)
            for is_new, img_url, error in pool.map(download_card, image_urls):
                if error and error != 'Invalid PNG metadata':
                    _logger.error(f'{domain} -> {img_url} - {error}')
                elif is_new:
                    _logger.info(f'NEW: {domain} -> "{img_url}"')
                    new_card_count += 1

    _logger.info(f'Downloaded {new_card_count} new cards in {calculate_elapsed_time(crawl_started)}')
    _logger.info(f'Process completed in {calculate_elapsed_time(_START_TIMESTAMP)}')


if __name__ == '__main__':
    # Script entry point: parse the CLI, arm the run-time watchdog, run the
    # crawl, always shut the HTTP queue down, and exit non-zero on failure.
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--config', default=Path(_SCRIPT_DIR, 'config', 'chub.yml'), help='Path to the config file. Default: config/chub.yml in the config directory next to the script.')
    parser.add_argument('-d', '--debug', action='store_true', help='Enable debug logging.')
    parser.add_argument('--log-requests', action='store_true', help='Log all HTTP requests when debugging is enabled.')
    parser.add_argument('--run-time-limit', type=int, default=3600, help='If the program runs longer than this, kill it. This helps prevent cases where the scraper gets stuck. Value in seconds. Default: 3600 (one hour)')
    args = parser.parse_args()

    signal.signal(signal.SIGALRM, watchdog_expired)
    # Installing the handler alone never delivers SIGALRM; the alarm has to be
    # scheduled for --run-time-limit to take effect. A non-positive limit
    # leaves the watchdog disarmed.
    if args.run_time_limit > 0:
        signal.alarm(args.run_time_limit)

    exit_code = 0
    try:
        main(args)
    except BaseException:
        # Catch BaseException on purpose: KeyboardInterrupt / SystemExit
        # raised inside main() must still be reported and must still reach
        # the queue shutdown below.
        traceback.print_exc()
        exit_code = 1
    finally:
        scraper.http_queue.quit()
    if exit_code:
        # raise SystemExit rather than calling quit(): quit() is injected by
        # the site module and is not guaranteed to be available.
        raise SystemExit(exit_code)
