import concurrent
import re
import time
import traceback
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from typing import List

import requests

from ..globals import GLOBALS
from ..http_queue import http_queue
from ..log import root_logger
from ..suicide import watchdog_suicide

_logger = root_logger.get_child('CHUB.SCRAPE')


def fetch_api_node(card_type: str, full_path: str, full_path_pretty: str = None):
    """Fetch the full API record for a single chub.ai node.

    A leading ``lorebooks/`` prefix is stripped from ``full_path`` before it
    is placed in the API URL.

    :param card_type: path segment placed directly after ``/api/`` in the URL.
    :param full_path: node path; used in the URL and, when no pretty name is
        given, in the error log.
    :param full_path_pretty: optional display name used only in the error log.
    :return: the decoded JSON body on HTTP 200, otherwise ``None`` (after
        logging an error).
    """
    api_path = re.sub(r"^lorebooks/", "", full_path)
    url = f'https://api.chub.ai/api/{card_type}/{api_path}?full=true'
    response = http_queue.add(url)

    if response.status_code == 200:
        return response.json()

    # Prefer the human-friendly name in logs when the caller supplied one.
    display_name = full_path_pretty if full_path_pretty else full_path
    _logger.error(f'Failed to fetch API node {display_name}: {response.status_code} -> {url}')
    return None


def scrape_nodes(node_type: str, token: str, sort_latest: bool = True, since_date: datetime = None, test_mode: bool = False) -> List[dict]:
    """Collect search-result nodes for ``node_type`` by paging through ``_fetch_data``.

    Pages are requested in concurrent batches of ``BATCH_SIZE``, starting at
    page 1, until a batch contains at least one empty page.

    :param node_type: namespace passed through to ``_fetch_data``.
    :param token: API token passed through to ``_fetch_data``.
    :param sort_latest: passed through to ``_fetch_data``; also enables the
        early stop described under ``since_date``.
    :param since_date: optional cut-off. It is compared against UTC-aware
        parsed timestamps, so it should be timezone-aware. When set together
        with ``sort_latest``, pagination stops as soon as the last node of a
        page has a ``createdAt`` older than the cut-off, and only nodes with
        ``createdAt`` newer than the cut-off are returned. If pagination ends
        normally instead, nodes are filtered and sorted (newest first) by
        ``lastActivityAt``.
    :param test_mode: when true, fetch only page 1 with 20 results and return
        its nodes unfiltered.
    :return: list of node dicts.
    """
    BATCH_SIZE = 10  # Number of pages to fetch concurrently
    TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%SZ'

    def parse_timestamp(raw: str) -> datetime:
        # API timestamps carry a literal 'Z'; attach UTC explicitly so they
        # can be compared with an aware ``since_date``.
        return datetime.strptime(raw, TIMESTAMP_FORMAT).replace(tzinfo=timezone.utc)

    def fetch_page_nodes(page_number):
        response = _fetch_data(page_number, node_type, token, sort_latest=sort_latest)
        return response.get('data', {}).get('nodes', [])

    if test_mode:
        _logger.info('Fetching nodes from test mode...')
        data = _fetch_data(1, node_type, token, sort_latest=sort_latest, search_count=20)
        return data['data']['nodes']

    nodes = []

    with ThreadPoolExecutor(max_workers=100) as executor:
        current_page = 1
        while True:
            batch_pages = range(current_page, current_page + BATCH_SIZE)
            # executor.map yields results in submission order, so zipping with
            # batch_pages pairs every result with its page number.
            page_results = executor.map(fetch_page_nodes, batch_pages)

            any_empty = False  # Set when any page of this batch returned no nodes

            for page, page_nodes in zip(batch_pages, page_results):
                if not page_nodes:
                    _logger.debug(f"Page {page} returned empty. No more pages to fetch.")
                    any_empty = True
                    # Nothing to collect, and indexing page_nodes[-1] below
                    # would raise on an empty page, so skip it entirely.
                    continue

                try:
                    nodes.extend(page_nodes)

                    if sort_latest and since_date:
                        # Assumes the last node on a page is the oldest one.
                        # NOTE(review): the search is sorted by last_activity_at when
                        # sort_latest is set, yet this early stop inspects 'createdAt'
                        # while the final filter below uses 'lastActivityAt' — confirm
                        # which field is intended.
                        earliest_node_date = parse_timestamp(page_nodes[-1]['createdAt'])
                        if earliest_node_date < since_date:
                            _logger.debug(f"Earliest node date {earliest_node_date} is older than since_date {since_date}. Stopping pagination.")
                            # Drop everything at or before the cut-off and stop paging.
                            return [d for d in nodes if parse_timestamp(d['createdAt']) > since_date]
                except Exception as e:
                    # Best effort: a malformed page is logged and skipped.
                    _logger.error(f"Error fetching page {page}: {e}\n{traceback.format_exc()}")

            if any_empty:
                break  # Exit the loop if any page in the batch returned empty

            current_page += BATCH_SIZE

    if since_date:
        nodes = [d for d in nodes if parse_timestamp(d['lastActivityAt']) > since_date]
        nodes.sort(key=lambda d: parse_timestamp(d['lastActivityAt']), reverse=True)
        if not nodes:
            _logger.error(f'Bad `since` filter for chub {node_type}, all nodes were filtered out')

    return nodes


def _fetch_data(page: int, namespace: str, token: str, sort_latest: bool = False, search_count: int = None) -> dict:
    """Request one page of search results from the chub.ai gateway.

    Up to three attempts are made. A 429 response sleeps for the
    ``Retry-After`` value (60 s when missing or not a plain integer) plus
    120 s before retrying; a failed request sleeps 5 s before retrying.
    Any other 4xx response, an undecodable JSON body, or exhausting all
    attempts calls ``watchdog_suicide()``.

    :param page: page number to request.
    :param namespace: value of the ``namespace`` query parameter.
    :param token: API token, sent in the ``CH-API-KEY`` and ``samwise`` headers.
    :param sort_latest: sort by ``last_activity_at`` when true, otherwise by
        ``star_count``.
    :param search_count: results per page; ``None`` or anything above 500 is
        replaced by 500.
    :return: the decoded JSON body of the first successful response. If
        ``watchdog_suicide()`` returns control (presumably it terminates the
        process — confirm), the function falls through and returns ``None``.
    """
    if search_count is None or search_count > 500:
        search_count = 500

    url = f'https://gateway.chub.ai/search?excludetopics=&first={search_count}&page={page}&namespace={namespace}&search=&include_forks=true&nsfw=true&nsfw_only=false&require_custom_prompt=false&require_example_dialogues=false&require_images=false&require_expressions=false&nsfl=true&asc=false&min_ai_rating=0&min_tokens=50&max_tokens=100000&chub=true&require_lore=false&exclude_mine=true&require_lore_embedded=false&require_lore_linked=false&language=&sort={"star_count" if not sort_latest else "last_activity_at"}&min_tags=2&topics=&inclusive_or=false&recommended_verified=false&require_alternate_greetings=false&count=false'
    special_headers = {
        'CH-API-KEY': token,
        'samwise': token
    }

    for i in range(3):
        try:
            r = http_queue.add(
                url,
                method='POST',
                timeout=GLOBALS.request_timeout_longer,
                suicide_on_429=False,
                headers=special_headers,
            )
        except Exception as e:
            _logger.error(f"Request failed due to exception: {e}")
            r = None

        if r is not None and r.status_code == 429:
            # Normalise to str first: a missing header yields the int default,
            # which has no .isdigit() and would otherwise raise here.
            retry_after_raw = str(r.headers.get('Retry-After', 60)).strip()
            retry_after = (int(retry_after_raw) if retry_after_raw.isdigit() else 60) + 120
            _logger.warning(f'Ratelimited! Retry after: {retry_after}. Retry attempt: {i + 1}')
            time.sleep(retry_after)
        elif r is not None and 400 <= r.status_code < 500:
            _logger.critical(f'Got code {r.status_code} when fetching nodes for namespace {namespace}, page {page}. Terminating...')
            _logger.critical(r.text)
            _logger.critical(url)
            watchdog_suicide()
        elif r is None:
            _logger.warning(f'Failed to fetch nodes for namespace {namespace}, page {page}. Sleeping 5s...')
            time.sleep(5)
        else:
            try:
                return r.json()
            except requests.exceptions.JSONDecodeError:
                _logger.critical(f'Failed to parse search JSON:\n{r.text}')
                watchdog_suicide()

    _logger.critical('Failed to fetch chub.ai nodes after retries.')
    watchdog_suicide()
