# TODO: replace these hard-coded paths with command-line argument parsing.

# Folder that is listed with os.listdir(); each file in it is read as
# zlib-compressed, UTF-8 encoded JSON containing post records.
metadata_folder = "../danbooru_metadata/danbooru_metadata_new/"
# Folder of the existing dataset. The names "danbooru.ds", "danbooru.index"
# and "danbooru.metadata" are appended by plain string concatenation, so the
# trailing "/" is required.
existing_dataset = "/home/xuser/nvme1/dataset/danbooru/"
# Folder passed to ImageDatasetBuilder as folder_path for the rebuilt dataset.
new_dataset_folder = "/home/xuser/nvme1/dataset/danbooru_updated/"

from tqdm import tqdm
import os
import json
from codecs import decode
from basedformer.dataset import ShardedImageDataset, ImageDatasetBuilder


# Maps post id -> post record, merged across every file in metadata_folder.
# When the same id appears more than once, the record read last wins.
all_metadata = {}

for file_name in tqdm(os.listdir(metadata_folder)):
    try:
        # Read through a context manager so the handle is closed immediately
        # instead of lingering until garbage collection.
        with open(os.path.join(metadata_folder, file_name), 'rb') as handle:
            compressed = handle.read()
        # Each file is zlib-compressed, UTF-8 encoded JSON.
        posts = json.loads(decode(compressed, 'zlib').decode("UTF-8"))
        for post in posts:
            # Records without an "id" cannot be indexed and are skipped.
            if "id" in post:
                all_metadata[post["id"]] = post
    except Exception as err:
        # Best effort: a broken file is reported and skipped. Catching
        # Exception (not a bare except) keeps Ctrl-C and SystemExit working.
        print("Error parsing " + file_name + ": " + repr(err))


print("Loading old dataset...")
# The existing dataset is described by three files that live side by side in
# the existing_dataset folder.
old_dataset = ShardedImageDataset(
    existing_dataset + "danbooru.ds",
    existing_dataset + "danbooru.index",
    existing_dataset + "danbooru.metadata",
)
# Shortcuts used below for membership tests on post ids.
old_meta, old_lookup = old_dataset.metadata, old_dataset.pointer_lookup

print("Calculating entries that can be copied from existing db...")
# Posts with these file extensions are never reused from the old dataset;
# they end up in the scrape list instead.
bad_types = ['jpg', 'jpeg']
# can_keep is used as a set of reusable ids (value is always True);
# can_keep_list holds the same ids in insertion order for chunked copying.
can_keep = {}
can_keep_list = []
for post in all_metadata.values():
    if post['file_ext'] in bad_types:
        continue
    post_id = post['id']
    # Reusable only when the old dataset has both metadata and a pointer.
    if post_id not in old_meta or post_id not in old_lookup:
        continue
    can_keep[post_id] = True
    can_keep_list.append(post_id)

keep_stats = (len(can_keep), len(old_meta), len(all_metadata))
print("can keep %d of %d with new db of %d" % keep_stats)

# Estimate the size of the rebuilt dataset. Three contributions are summed:
#   1. entries copied from the old dataset, at the old average entry size;
#   2. posts with an extension in bad_types, at their reported file_size;
#   3. all remaining posts, again at the old average entry size.
old_ds_bytes = os.path.getsize(existing_dataset + "danbooru.ds")
old_entry_count = len(old_meta)
# Guard against an empty old dataset: without it the division raises
# ZeroDivisionError. With no old entries, the average falls back to 0.
avg_old_entry_bytes = old_ds_bytes / old_entry_count if old_entry_count else 0

total_new_bytes = avg_old_entry_bytes * len(can_keep)
total_new_jpegs = 0
for k, data in all_metadata.items():
    if k not in can_keep and data['file_ext'] in bad_types:
        total_new_bytes += data['file_size']
        total_new_jpegs += 1

# Posts that are neither copied nor counted above have no usable size of
# their own here, so they are approximated with the old average.
remaining_posts = len(all_metadata) - (len(can_keep) + total_new_jpegs)
total_new_bytes += int(avg_old_entry_bytes * remaining_posts)
print("Estimated new dataset size: %dGiB" % (total_new_bytes / 1024 ** 3))


print("Copying old db data...")

# The builder's block_size is set to the system memory page size. It is
# queried directly via os.sysconf instead of spawning `getconf PAGE_SIZE` in a
# shell, which left the pipe open and failed with an opaque ValueError when
# the command printed nothing.
# NOTE(review): an earlier variant (`stat -fc %s <new_dataset_folder>`) used
# the filesystem block size instead; confirm which one the builder expects.
block_size = os.sysconf("SC_PAGE_SIZE")
new_dataset = ImageDatasetBuilder(folder_path=new_dataset_folder, name="danbooru_updated", threads=32, block_size=block_size, align_fs_blocks=True)
new_dataset.build()

# how many operations to run at once
copy_chunk_size = 4096
for start in tqdm(range(0, len(can_keep_list), copy_chunk_size)):
    chunk = can_keep_list[start:start + copy_chunk_size]
    # Read each entry with decode=False from the old dataset and hand it to
    # the builder. The chunk of ids is passed twice, presumably once as the
    # operation inputs and once as the ids to store under (confirm against
    # ImageDatasetBuilder.operate).
    new_dataset.operate(lambda post_id: old_dataset.read_from_id(post_id, decode=False), chunk, chunk, use_tqdm=True)

# Persist data, index and metadata once the copy phase is complete.
new_dataset.flush()
new_dataset.flush_index()
new_dataset.flush_metadata()

print("Scraping...")
# Every post that was not marked as reusable from the old dataset must be
# fetched; order follows all_metadata.
to_scrape = [post_id for post_id in all_metadata if post_id not in can_keep]

def download_danbooru(id):
    """Build the URL operation for the post with the given id.

    Looks the post up in ``all_metadata`` and picks ``large_file_url`` when
    the post's ``has_large`` flag is truthy, otherwise ``file_url``. The
    chosen URL and the post's ``md5`` are passed to ``new_dataset.url_op``,
    whose result is returned unchanged.

    Raises KeyError if the id or one of the accessed fields is missing.
    """
    post = all_metadata[id]
    if post["has_large"]:
        url = post["large_file_url"]
    else:
        url = post["file_url"]
    return new_dataset.url_op(url, post["md5"])
    
def _save_progress():
    """Write pending data, then the index, then the metadata of new_dataset."""
    new_dataset.flush()
    new_dataset.flush_index()
    new_dataset.flush_metadata()


# Checkpoint after chunk 0 and then after every save_every-th chunk.
save_every = 25
chunk_starts = range(0, len(to_scrape), copy_chunk_size)
for chunk_index, start in enumerate(tqdm(chunk_starts)):
    batch = to_scrape[start:start + copy_chunk_size]
    new_dataset.operate(download_danbooru, batch, batch, use_tqdm=True)
    if chunk_index % save_every == 0:
        _save_progress()

# Final save so chunks processed after the last checkpoint are persisted.
_save_progress()
