#!/usr/bin/env python3
import argparse
import functools
import logging
import os
import signal
import time
import traceback
from datetime import datetime
from pathlib import Path

import scraper
from scraper import log
from scraper.catbox.database import load_stored_rentry_links, insert_rentry_links
from scraper.catbox.process import catbox_download_pngs
from scraper.catbox.rentry import crawl_rentry_page, parse_rentry, extract_catbox_from_crawled_rentry
from scraper.catbox.scrape import parse_links, perform_archive_scrape, search_for_catbox_png, search_for_regex
from scraper.config.load import load_cfg
from scraper.database.connection import Database
from scraper.helpers import is_service_running
from scraper.paths import create_directory
from scraper.suicide import signal_handler, watchdog_expired, watchdog_suicide
from scraper.time import calculate_elapsed_time

# Route Ctrl-C (SIGINT) to the project's handler as soon as the module is loaded.
signal.signal(signal.SIGINT, signal_handler)

# Module-wide logger. Only declared here; main() binds it after the root logger
# has been initialised, so it must not be used before main() runs.
_logger: logging.Logger

# Directory containing this script, with symlinks resolved by realpath().
_SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
# Captured at import time; main() uses it to report elapsed scrape/process time.
_START_TIMESTAMP = datetime.now()


# TODO: make sure --exit-when-service works


def main(args):
    """Collect Catbox image links and download the PNGs they point to.

    Steps, in order: initialise logging, optionally return early when the
    service named by ``--exit-when-service`` is reported as running, load the
    config, initialise the database pool, gather links according to the
    command-line flags, then pass them to ``catbox_download_pngs``.

    Args:
        args: Parsed command-line namespace built by the argument parser at
            the bottom of this script.
    """
    global _logger
    log.root_logger.init(logging.DEBUG if args.debug else logging.INFO)
    _logger = log.root_logger.logger

    if args.exit_when_service:
        if is_service_running(args.exit_when_service):
            _logger.error(f'Service {args.exit_when_service} is running, exiting...')
            return
        _logger.info(f'Service {args.exit_when_service} is not running, continuing...')

    cfg = load_cfg(args)
    Database.initialise(minconn=1, maxconn=100, host=cfg.database.host, database=cfg.database.database, user=cfg.database.user, password=cfg.database.password)
    hashed_data_path = create_directory(cfg.hashed_data_path)
    rentry_threads = cfg.catbox.rentry_threads
    desu_threads = cfg.catbox.desu_threads
    catbox_threads = cfg.catbox.catbox_threads

    catbox_images = _collect_catbox_links(args, desu_threads, rentry_threads)

    _logger.info(f'Scraping completed in {calculate_elapsed_time(_START_TIMESTAMP)}')
    time.sleep(1)

    dl_start = datetime.now()
    _logger.info(f'Downloading {len(catbox_images)} PNGs...')

    new_cards, new_chub = catbox_download_pngs(catbox_threads, hashed_data_path, catbox_images)

    # NOTE: this logs the number of links handed to the downloader, not a
    # count returned by it.
    _logger.info(f'Downloaded {len(catbox_images)} cards in {calculate_elapsed_time(dl_start)}')
    _logger.info(f'Process completed in {calculate_elapsed_time(_START_TIMESTAMP)}')
    _logger.info(f'New cards: {new_cards}')
    _logger.info(f'Found {new_chub} chub.ai cards')


def _collect_catbox_links(args, desu_threads, rentry_threads):
    """Return the set of Catbox links selected by the command-line flags.

    ``--url`` takes precedence over ``--litterbox``; when neither is given the
    archive scrape and the rentry crawl each run unless disabled by their
    ``--skip-*`` flag.
    """
    if args.url:
        _logger.info(f'Scraping {args.url}')
        return {args.url}

    if args.litterbox:
        _logger.info('Scraping archives for litterbox...')
        return set(perform_archive_scrape(args.pages, ['litterbox.catbox.moe', 'litter.catbox.moe'], desu_threads, search_for_catbox_png))

    catbox_images = set()
    if not args.skip_catbox:
        _logger.info('Scraping archives for catbox and litterbox...')
        catbox_images.update(perform_archive_scrape(args.pages, ['files.catbox.moe', 'litterbox.catbox.moe'], desu_threads, search_for_catbox_png))
    if not args.skip_rentry_crawl:
        catbox_images.update(_crawl_rentry_for_catbox(args, desu_threads, rentry_threads))
    return catbox_images


def _crawl_rentry_for_catbox(args, desu_threads, rentry_threads):
    """Find rentry.org pages, cache their URLs and return the Catbox links on them."""
    stored_rentry_links = set()
    if not args.skip_rentry_crawl_cached:
        _logger.info('Loading cached rentry links...')
        stored_rentry_links = load_stored_rentry_links()

    _logger.info('Scraping archives for rentry pages...')
    rentry_pages = perform_archive_scrape(args.pages, ['rentry.org'], desu_threads, functools.partial(search_for_regex, r'<a href="(https://rentry\.org/.*?)" target="_blank" rel="nofollow">'))

    # Independent empty defaults: a chained `a = b = set()` would bind every
    # name to one shared object, so an in-place update of one would leak into
    # the others.
    charcard_rentry_links, charcard_catbox_links = set(), set()
    meta_bot_list_rentry_links, meta_bot_list_catbox_links = set(), set()

    # This single flag gates both index pages (charcardrentrylist and meta_bot_list).
    if not args.skip_character_rentry:
        _logger.info('Crawling rentry.org/charcardrentrylist')
        charcard_pages = crawl_rentry_page('https://rentry.org/charcardrentrylist', 2)
        charcard_rentry_links, charcard_catbox_links = extract_catbox_from_crawled_rentry(charcard_pages)
        _logger.info(f'Found {len(charcard_catbox_links)} Catbox links on rentry.org/charcardrentrylist')
        _logger.info('Crawling rentry.org/meta_bot_list')
        meta_bot_list_pages = crawl_rentry_page('https://rentry.org/meta_bot_list', 2)
        meta_bot_list_rentry_links, meta_bot_list_catbox_links = extract_catbox_from_crawled_rentry(meta_bot_list_pages)
        _logger.info(f'Found {len(meta_bot_list_catbox_links)} Catbox links on rentry.org/meta_bot_list')

    # Everything discovered during this run, computed once and reused below.
    discovered_rentry_links = rentry_pages | charcard_rentry_links | meta_bot_list_rentry_links
    all_rentry_links = discovered_rentry_links | stored_rentry_links
    new_found_rentry = discovered_rentry_links - stored_rentry_links
    insert_rentry_links(all_rentry_links)
    _logger.info(f'Cached {len(new_found_rentry)} new rentry.org URLs for a total of {len(all_rentry_links)} found links.')

    _logger.info(f'Crawling {len(new_found_rentry)} rentry pages for catbox.moe URLs...')
    rentry_catbox_images = parse_links(rentry_threads, new_found_rentry, parse_rentry, show=args.debug)
    _logger.info(f'Found {len(rentry_catbox_images)} Catbox links on those rentry pages.')

    # new_found_rentry already excludes every stored link, so the stored set
    # is exactly what has not been crawled yet in this run.
    uncrawled_rentry_links = stored_rentry_links
    _logger.info(f'Crawling {len(uncrawled_rentry_links)} stored rentry pages...')
    stored_rentry_catbox = parse_links(rentry_threads, uncrawled_rentry_links, parse_rentry, show=args.debug)

    return rentry_catbox_images | charcard_catbox_links | meta_bot_list_catbox_links | stored_rentry_catbox


if __name__ == '__main__':
    # Command-line entry point: parse the flags, arm the run-time watchdog,
    # run main() and always shut the HTTP queue down afterwards.
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--config', default=Path(_SCRIPT_DIR, 'config', 'chub.yml'), help='Path to the config file. Default: config/chub.yml in the config directory next to the script.')
    parser.add_argument('--pages', type=int, default=205, help='How many pages to parse on desuarchive.')
    parser.add_argument('-d', '--debug', action='store_true', help='Enable debug logging.')
    parser.add_argument('--log-requests', action='store_true', help='Log all HTTP requests when debugging is enabled.')
    parser.add_argument('--skip-rentry-crawl', action='store_true', help='Skip crawling for rentry.org pages.')
    parser.add_argument('--skip-rentry-crawl-cached', action='store_true', help='Skip crawling for cached rentry.org pages.')
    parser.add_argument('--skip-catbox', action='store_true', help='Skip crawling the archives for catbox.moe URLs.')
    parser.add_argument('--litterbox', action='store_true', help='Crawl the archives for litterbox only.')
    parser.add_argument('--skip-character-rentry', action='store_true', help='Skip crawling rentry.org/charcardrentrylist')
    parser.add_argument('--run-time-limit', type=int, default=3600, help='If the program runs longer than this, kill it. This helps prevent cases where the scraper gets stuck. Value in seconds. Default: 3600 (one hour)')
    parser.add_argument('--exit-when-service', type=str, default=None, help='If this systemctl service is running, exit. Checks services running in --user mode.')
    parser.add_argument('--url', type=str, default=None, help='Only scrape this URL.')
    args = parser.parse_args()

    signal.signal(signal.SIGALRM, watchdog_expired)
    # Arm the watchdog. Installing the SIGALRM handler alone never triggers
    # it; signal.alarm() schedules the signal after --run-time-limit seconds.
    # Non-positive values leave the watchdog disarmed.
    if args.run_time_limit > 0:
        signal.alarm(args.run_time_limit)

    bad = False
    try:
        main(args)
    except BaseException:
        # Deliberately catches everything (including SystemExit) so that the
        # failure path below always runs.
        traceback.print_exc()
        bad = True
    finally:
        scraper.http_queue.quit()
    if bad:
        # NOTE(review): presumably force-terminates the process after a
        # failure - confirm against scraper.suicide.
        watchdog_suicide()
