unit_test / data /preprocess /process_flickr_caption_json.py

Upload 259 files

32b542e about 3 years ago

1.46 kB

	import json
	from collections import defaultdict
	import jsonlines

	# Dataset splits to convert, in the order they are processed.
	subsets = ['train', 'val', 'test']
	# Directory created up front by the makedirs call below.
	savepath = "flickr30k/annotations"

	# Input: one JSON-lines annotation file per split.
	set2jsonline = {
	    'train': 'flickr30k/all_data_final_train_2014.jsonline',
	    'val': 'flickr30k/all_data_final_val_set0_2014.jsonline',
	    'test': 'flickr30k/all_data_final_test_set0_2014.jsonline',
	}

	import os
	# exist_ok=True makes the call idempotent and removes the race between a
	# separate "does it exist?" check and the creation itself.
	os.makedirs(savepath, exist_ok=True)


	# Output: one caption JSON file per split.
	# NOTE(review): these files are written directly under flickr30k/, not
	# inside `savepath` -- confirm whether anything downstream relies on the
	# flickr30k/annotations directory existing.
	savename = {
	    'train': "flickr30k/captions_train.json",
	    'val': "flickr30k/captions_val.json",
	    'test': "flickr30k/captions_test.json",
	}

	# Convert each split's JSON-lines annotations into a single JSON file with
	# an "images" list and an "annotations" (captions) list.
	for subset in subsets:
	    # Accumulators are rebuilt per split, so caption ids restart at 0
	    # for every output file.
	    image_records = []
	    caption_records = []
	    next_caption_id = 0
	    with jsonlines.open(set2jsonline[subset]) as reader:
	        for annotation in reader:
	            sentences = annotation["sentences"]
	            # One image entry per input record.
	            image_records.append({
	                "filename": annotation["img_path"],
	                "id": annotation['id'],
	            })
	            # One caption entry per sentence, linked back to its image
	            # through the record's 'id'.
	            for sentence in sentences:
	                caption_records.append({
	                    "image_id": annotation['id'],
	                    "id": next_caption_id,
	                    "caption": sentence,
	                })
	                next_caption_id += 1

	    data = {
	        "images": image_records,
	        "annotations": caption_records,
	    }
	    # The context manager flushes and closes the output file even if
	    # serialization fails, instead of leaking the handle.
	    with open(savename[subset], "w") as out_file:
	        json.dump(data, out_file)