import pytest
from dotmap import DotMap
from icecream import ic
from transformers import AutoTokenizer
import base64

import lm_node.base
from lm_node.sanitize import *

# Shared GPT-2 tokenizer, loaded once at import time and reused by the tests
# in this module.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# TODO: Creating a tokenizer through our own helper functions is too cumbersome; make it easier.
# Sanitization should also not need a config object and other unrelated setup.
# Request parameters wrapped in a DotMap and attached to the request object
# that test_tokenize passes to tokenize().
req_params = {
    "top_k": 1,
    "top_p": 0.1,
    "temperature": 0.1,
    "min_length": 1,
    "max_length": 1,
    "repetition_penalty": 1.105,
    "repetition_penalty_range": 2,
    "tail_free_sampling": 1,
    "pad_token_id": 50256,
    "use_cache": True,
    "do_sample": True,
    "generate_until_sentence": False,
}

# Base64 payload; the same string is decoded in test_base64_to_list, where it
# is expected to yield the token ids listed for it below.
text2 = "9DNkAcoC0gGsAQsAXmENANFKbzU9AAAAyHArEx8A"

# Cases for test_tokenize. Each entry is [flag, input, expected_token_ids]:
#   flag               - forwarded as the second argument of tokenize()
#                        (presumably plain text vs. base64 input; confirm
#                        against lm_node.sanitize)
#   input              - value assigned to the request's `input` field
#   expected_token_ids - ids the tokenized result must equal
honkers = [
    [
        True,
        "Nestled between the winding dunes",
        [45, 395, 992, 1022, 262, 28967, 288, 4015],
    ],
    [
        False,
        text2,
        [13300, 356, 714, 466, 428, 11, 24926, 13, 19153, 13679, 61, 0, 28872, 4907, 31],
    ],
]

# TODO: add tests for hash2text and text2hash.

def test_tokenize():
    """Check that tokenize() yields the expected token ids for each case in honkers."""
    # One request object is reused for all cases; only its input changes.
    request = DotMap()
    request.parameters = DotMap(req_params)

    for flag, payload, expected_ids in honkers:
        request.input = payload
        encoded = tokenize(request, flag, tokenizer)
        # The ids under test sit at [0][0] of the result and expose .tolist().
        assert encoded[0][0].tolist() == expected_ids
        

def test_base64_to_list():
    """Check base64_to_list() on one rejected payload and one valid payload.

    Each case pairs a base64 string with either the expected list of token
    ids or False when decoding is expected to fail. The result is indexed as
    (success_flag, decoded_ids).
    """
    cases = [
        # Contains an integer bigger than 50k; expected to be rejected.
        ("DQBg6s7qDAA=", False),
        # Normal payload.
        (
            "9DNkAcoC0gGsAQsAXmENANFKbzU9AAAAyHArEx8A",
            [13300, 356, 714, 466, 428, 11, 24926, 13, 19153, 13679, 61, 0, 28872, 4907, 31],
        ),
    ]
    for encoded, expected in cases:
        result = base64_to_list(tokenizer, encoded)
        ic(result)
        if not expected:
            # Failure case: only the success flag is checked.
            assert result[0] == False  # noqa: E712
            continue
        assert result[0] == True  # noqa: E712
        assert result[1] == expected

def test_sanitize():
    """Placeholder: no sanitize checks are implemented yet, so this always passes."""
    return None

# Allow running the base64 decoding test directly as a script, without pytest.
if __name__ == "__main__":
    test_base64_to_list()