#!/usr/bin/env python3
import argparse
import logging
import os
import signal
import sys
import time
import traceback
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from pathlib import Path

import elastic_transport
import yaml

script_dir = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, str(Path(script_dir).parent))

from scraper.config.models import ConfigModel
from scraper.globals import GLOBALS
from scraper.database.connection import Database, CursorFromConnectionFromPool
from scraper.stop_event import global_stop_event
from scraper.suicide import signal_handler, watchdog_expired
from scraper.paths import resolve_path
import scraper
from helpers.scrape import parse_space_results, scraper_validate_proxy
from scraper import RequestQueueManager, log
from helpers.elastic import ELASTIC_CLIENT

# Install the project's SIGINT handler in place of Python's default one.
signal.signal(signal.SIGINT, signal_handler)

# NOTE(review): the database and Elasticsearch credentials below are hard-coded in source.
# Consider loading them from the environment or the config file, and rotating these values.
Database.initialise(minconn=1, maxconn=100, host='172.0.3.101', database='proxy_stats', user='char_archive', password='hei3ucheet5oochohjongeisahV3mei0')

# Elasticsearch is connected at import time. Connection errors are retried a fixed number
# of times with a fixed pause; the process exits with status 1 if every attempt fails.
_ELASTIC_CONNECT_ATTEMPTS = 6
_ELASTIC_RETRY_DELAY_SEC = 5

elastic_connected = False
t_l = logging.getLogger(__name__)
for i in range(_ELASTIC_CONNECT_ATTEMPTS):
    try:
        ELASTIC_CLIENT.connect('https://172.0.3.105:9200', 'proxy_stats', 'RU9FNWRZOEJod0d0ZGl3YjlRMGM6NS1OV0dTbl9ROGVDZkNqSnJId3c0Zw==')
        elastic_connected = True
        break
    except elastic_transport.ConnectionError as e:
        # The logged delay and the actual sleep share one constant so they cannot drift apart
        # (the message previously claimed 10 seconds while the code slept for 5).
        t_l.error(f'Failed to connect to Elasticsearch - {e} - sleeping {_ELASTIC_RETRY_DELAY_SEC} sec... - retry #{i}')
        time.sleep(_ELASTIC_RETRY_DELAY_SEC)

if not elastic_connected:
    t_l.critical('Failed to connect to Elasticsearch')
    sys.exit(1)
# The temporary logger is only needed for the connection phase.
del t_l


def main(args):
    """Validate every proxy URL stored in the database and record the working ones.

    Loads the YAML config, arms the run-time alarm, initialises the proxy pool and
    the HTTP request queue, reads the candidate URLs from the ``proxies`` table,
    validates them concurrently with ``scraper_validate_proxy`` and hands each
    successful result to ``parse_space_results``. A summary line with the elapsed
    time and the number of working proxies is logged at the end.

    Args:
        args: Parsed command-line namespace. Reads ``debug``, ``config``,
            ``run_time_limit`` and ``log_requests``.

    Returns:
        None. Returns early, without logging the summary, if ``global_stop_event``
        is set while results are being processed.
    """
    # =======================================================================================
    # Setup

    log_level = logging.DEBUG if args.debug else logging.INFO

    log.root_logger.init(log_level)
    logger = log.root_logger.logger

    cfg_data = yaml.load(resolve_path(args.config).read_text(), Loader=yaml.FullLoader)
    cfg = ConfigModel(**cfg_data)

    logger.info(f'Run time limit: {args.run_time_limit} seconds.')
    # Schedules SIGALRM after the limit; the caller is expected to have installed the handler.
    signal.alarm(args.run_time_limit)

    GLOBALS.log_http_requests = cfg.log_http_requests or args.log_requests
    GLOBALS.request_timeout = (cfg.request_connect_timeout, cfg.request_read_timeout)

    GLOBALS.proxies = cfg.proxies
    scraper.init_proxy_pool()
    scraper.http_queue.http_queue = RequestQueueManager(
        num_workers_per_proxy=cfg.chub.http_workers
    )

    scrape_start = datetime.now()
    working_proxies = 0

    # =======================================================================================
    # Database setup

    ELASTIC_CLIENT.create_index()

    # =======================================================================================

    logger.info('Fetching data from MySQL...')
    with CursorFromConnectionFromPool() as cursor:
        cursor.execute('SELECT url FROM proxies')
        # A set removes duplicate URLs; the first column of each row is the URL.
        space_links = {row[0] for row in cursor.fetchall()}

    # =======================================================================================
    # Have to update Evulid after the other spaces so Elastic doesn't get confused.

    # logger.info(f'Checking proxy.chub-archive.example.com...')
    # try:
    #     get_proxy_evulid()
    # except:
    #     logger.error(f'proxy.chub-archive.example.com FAILED!!\n{traceback.format_exc()}')

    # =======================================================================================

    logger.info('Checking URLs for active proxies...')
    with ThreadPoolExecutor(max_workers=cfg.proxy_tracker.http_workers) as executor:
        # Results arrive in the same (sorted) order as the submitted URLs.
        for sql_proxy_json, space_name, err in executor.map(scraper_validate_proxy, sorted(space_links)):
            if global_stop_event.is_set():
                return
            if not sql_proxy_json:
                logger.info(f'NO - {space_name} - {err}')
            else:
                parse_space_results(sql_proxy_json)
                logger.info(f'YES - {space_name}')
                working_proxies += 1

    elapsed = datetime.now() - scrape_start
    # total_seconds() includes whole days; timedelta.seconds alone wraps around every 24 hours,
    # which would under-report long runs.
    hours, remainder = divmod(int(elapsed.total_seconds()), 3600)
    minutes, seconds = divmod(remainder, 60)
    logger.info(f'Scraping took {hours:02}:{minutes:02}:{seconds:02} and found {working_proxies} working proxies.')


if __name__ == '__main__':
    # Command-line entry point: parse options, install the watchdog handler, run the
    # scrape, and always shut down the HTTP queue afterwards.
    parser = argparse.ArgumentParser(description='Scrape data from chub.ai.')
    parser.add_argument('--config', default=resolve_path(script_dir, '..', 'config', 'chub.yml'), help='Path to the config file. Default: config/chub.yml in the config directory next to the script.')
    parser.add_argument('--debug', '-d', '-v', action='store_true', help='Debug logging.')
    parser.add_argument('--update', '-u', action='store_true', help="Update existing proxies, don't search for new ones.")
    parser.add_argument('--log-requests', '-r', action='store_true', help='Log all HTTP requests when debugging is enabled.')
    parser.add_argument('--run-time-limit', type=int, default=3600, help='If the program runs longer than this, kill it. This helps prevent cases where the scraper gets stuck. Value in seconds. Default: 3600 (1 hour)')
    args = parser.parse_args()

    # main() arms signal.alarm(), so the SIGALRM handler has to be registered before it runs.
    signal.signal(signal.SIGALRM, watchdog_expired)
    try:
        main(args)
    except Exception:
        # Top-level boundary: report the failure and fall through to cleanup.
        # SystemExit and KeyboardInterrupt are intentionally not caught here so an explicit
        # exit request keeps its exit status instead of being printed as a traceback.
        traceback.print_exc()
    finally:
        # Runs on success, on failure, and while an exit request is propagating.
        scraper.http_queue.quit()
