diff --git a/dataset/.gitignore b/dataset/.gitignore
new file mode 100644
index 0000000..a1703ab
--- /dev/null
+++ b/dataset/.gitignore
@@ -0,0 +1,5 @@
+derek-thomas*
+*.lock
+speechcolab*
+lmms-lab*
+downloads/*
\ No newline at end of file
diff --git a/dataset/OCR-VQA-200K/.gitignore b/dataset/OCR-VQA-200K/.gitignore
new file mode 100644
index 0000000..3e9e6c4
--- /dev/null
+++ b/dataset/OCR-VQA-200K/.gitignore
@@ -0,0 +1,2 @@
+images/*
+dataset.json
\ No newline at end of file
diff --git a/dataset/OCR-VQA-200K/download.py b/dataset/OCR-VQA-200K/download.py
new file mode 100644
index 0000000..6d8351b
--- /dev/null
+++ b/dataset/OCR-VQA-200K/download.py
@@ -0,0 +1,49 @@
+import os
+import json
+import urllib.request as ureq
+import urllib.error
+import concurrent.futures
+import threading
+
+# Set the local file paths for the dataset
+dataset_path = './dataset.json'
+images_path = './images'
+download = 1  # Set to 0 if images are already downloaded
+
+# Load the dataset JSON file
+with open(dataset_path, 'r') as fp:
+    data = json.load(fp)
+
+# Initialize a counter and a lock for thread-safe counting
+downloaded_count = 0
+count_lock = threading.Lock()
+
+# Function to download a single image
+def download_image(k):
+    global downloaded_count
+    imageURL = data[k]['imageURL']
+    ext = os.path.splitext(imageURL)[1]
+    outputFile = os.path.join(images_path, f'{k}{ext}')
+
+    # Only download the image if it doesn't exist yet
+    if not os.path.exists(outputFile):
+        try:
+            ureq.urlretrieve(imageURL, outputFile)
+
+            with count_lock:
+                downloaded_count += 1
+                if downloaded_count % 100 == 0:
+                    print(f'{downloaded_count} images downloaded.')
+        except urllib.error.URLError as e:
+            print(f'Error downloading {outputFile}: {e}')
+
+# Download images using multiple threads
+if download == 1:
+    if not os.path.exists(images_path):
+        os.makedirs(images_path)
+
+    # Create a thread pool and download the images in parallel.
+    # Increase max_workers to potentially speed up downloads for many small files.
+    # The optimal number may vary based on your network and the server's capacity.
+    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
+        executor.map(download_image, data.keys())
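
Note: urlretrieve has no timeout parameter, so a single stalled connection can pin a worker thread indefinitely. A minimal hardening sketch, not part of the patch; download_with_retry is a hypothetical helper and the timeout/attempt values are arbitrary:

# Illustrative hardening for download.py (not part of the patch).
import socket
import urllib.error
import urllib.request as ureq

socket.setdefaulttimeout(30)  # urllib applies this default to new connections

def download_with_retry(url: str, path: str, attempts: int = 3) -> bool:
    """Retry transient failures; give up after `attempts` tries."""
    for _ in range(attempts):
        try:
            ureq.urlretrieve(url, path)
            return True
        except (urllib.error.URLError, socket.timeout):
            continue
    return False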
diff --git a/dataset/TextVQA/.gitignore b/dataset/TextVQA/.gitignore
new file mode 100644
index 0000000..f3e4424
--- /dev/null
+++ b/dataset/TextVQA/.gitignore
@@ -0,0 +1,5 @@
+images/test_images/*
+images/train_images/*
+TextVQA_0.5.1_test.json
+TextVQA_0.5.1_train.json
+TextVQA_0.5.1_val.json
diff --git a/dataset/vizwiz/Annotations/.gitignore b/dataset/vizwiz/Annotations/.gitignore
new file mode 100644
index 0000000..95cd600
--- /dev/null
+++ b/dataset/vizwiz/Annotations/.gitignore
@@ -0,0 +1,3 @@
+train.json
+test.json
+val.json
\ No newline at end of file
diff --git a/dataset/vizwiz/images/.gitignore b/dataset/vizwiz/images/.gitignore
new file mode 100644
index 0000000..20e2e79
--- /dev/null
+++ b/dataset/vizwiz/images/.gitignore
@@ -0,0 +1,3 @@
+val/*
+train/*
+test/*
\ No newline at end of file
diff --git a/src/dataset_library/GigaspeechDataset.py b/src/dataset_library/GigaspeechDataset.py
index 08ffab0..ee32c38 100644
--- a/src/dataset_library/GigaspeechDataset.py
+++ b/src/dataset_library/GigaspeechDataset.py
@@ -18,8 +18,9 @@ class GigaspeechDataset(Dataset):
         self.audio_processor = audio_processor
         self.text_processor = text_processor
 
-        gs = load_dataset("speechcolab/gigaspeech", "xs")
-        self.data = gs[split]
+        from .format import dataset_dir
+        gs = load_dataset("speechcolab/gigaspeech", "xs", cache_dir=dataset_dir)  # type: ignore
+        self.data = gs[split]  # type: ignore
 
     def __len__(self):
         return len(self.data)
@@ -54,7 +55,7 @@
             audios=[(audio, sampling_rate)],
             chat=chat,
             original=sample,
-        )
+        )  # type: ignore
 
 
 class GigaspeechDatasetForGeneration(GigaspeechDataset):
@@ -87,7 +88,7 @@
             chat=chat,
             answer=text,
             original=sample,
-        )
+        )  # type: ignore
 
 
 def test_gigaspeech():
@@ -103,3 +104,6 @@ def test_gigaspeech():
     print(dataset[0])
    assert len(dataset) > 0
     assert len(dataset[0]["chat"]) > 0
+
+if __name__ == "__main__":
+    test_gigaspeech()
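
Note: a quick smoke test for the new cache_dir wiring, sketched under the assumption that src is on PYTHONPATH so the package imports as dataset_library (as factory.py does); speechcolab/gigaspeech is gated on the Hub, so the download needs accepted terms and a Hugging Face login first.

# Illustrative smoke test (not part of the patch).
from dataset_library.format import dataset_dir
from dataset_library.GigaspeechDataset import GigaspeechDataset

print(dataset_dir)  # resolves to <repo>/dataset; load_dataset creates it if missing
ds = GigaspeechDataset(split="train")
print(len(ds), ds[0]["chat"][0]["role"])  # expect a positive length and "user"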
diff --git a/src/dataset_library/OCRVQA200KDataset.py b/src/dataset_library/OCRVQA200KDataset.py
index 2e04398..f95f254 100644
--- a/src/dataset_library/OCRVQA200KDataset.py
+++ b/src/dataset_library/OCRVQA200KDataset.py
@@ -22,8 +22,10 @@ class OCRVQADataset(Dataset):
         """
         self.vis_root = vis_root
 
+        from .vis_processor import size_processor
+
         self.vis_processor = (
-            vis_processor if vis_processor is not None else self._vis_processor
+            vis_processor if vis_processor is not None else size_processor
         )
         self.text_processor = text_processor
         if split == "train":
@@ -64,24 +66,6 @@
     def __len__(self):
         return len(self.data)
 
-    def _vis_processor(self, image: Image.Image):
-        width, height = image.size
-        if width > 500 or height > 500:
-            max_size = max(width, height)
-            ratio = 500 / max_size
-            new_width = int(width * ratio)
-            new_height = int(height * ratio)
-            image = image.resize((new_width, new_height), Image.Resampling.BILINEAR)
-
-        if width < 28 or height < 28:
-            min_size = min(width, height)
-            ratio = 28 / min_size + 1
-            new_width = int(width * ratio)
-            new_height = int(height * ratio)
-            image = image.resize((new_width, new_height), Image.Resampling.BILINEAR)
-
-        return image
-
     def __getitem__(self, index):
         sample = self.data[index]
         image: Image.Image = Image.open(
@@ -117,7 +101,7 @@
             chat=chat,
             original=sample["original"],
             images=[image],
-        )
+        )  # type: ignore
 
 
 class OCRVQADatasetForGeneration(OCRVQADataset):
@@ -153,4 +137,4 @@ class OCRVQADatasetForGeneration(OCRVQADataset):
             chat=chat,
             answer=answer,
             original=sample["original"],
-        )
+        )  # type: ignore
diff --git a/src/dataset_library/RefCOCODataset.py b/src/dataset_library/RefCOCODataset.py
new file mode 100644
index 0000000..c7578fc
--- /dev/null
+++ b/src/dataset_library/RefCOCODataset.py
@@ -0,0 +1,121 @@
+from .format import (
+    Conversation,
+    ConverstationAudio,
+    ConverstationImage,
+    ConverstationText,
+    DatasetOutput,
+)
+from torch.utils.data import Dataset
+from datasets import load_dataset, DatasetDict
+from typing import Literal
+
+
+class RefCOCODataset(Dataset):
+    def __init__(
+        self,
+        vis_processor=None,
+        text_processor=None,
+        split: Literal["val", "test"] = "val",
+    ):
+        """
+        vis_processor (callable, optional): transform applied to each image
+        text_processor (callable, optional): transform applied to each question
+        """
+
+        self.vis_processor = vis_processor
+        self.text_processor = text_processor
+        from .format import dataset_dir
+        ds = load_dataset("lmms-lab/RefCOCO", cache_dir=dataset_dir)  # type: ignore
+        self.data = ds[split]  # type: ignore
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, index):
+        sample = self.data[index]
+        # print(sample)
+        images = sample["image"]
+        question = sample["question"]
+        answer = sample["answer"]
+
+        if self.vis_processor is not None:
+            images = self.vis_processor(images)
+        if self.text_processor is not None:
+            question = self.text_processor(question)
+
+        chat = [
+            Conversation(
+                role="user",
+                content=[
+                    ConverstationImage(type="image", image_url=""),
+                    ConverstationText(
+                        type="text",
+                        text=question,
+                    ),
+                ],
+            ),
+            Conversation(
+                role="assistant",
+                content=[ConverstationText(type="text", text=answer)],
+            ),
+        ]
+
+        return DatasetOutput(
+            images=[images],
+            chat=chat,
+            original=sample,
+        )  # type: ignore
+
+
+class RefCOCODatasetForGeneration(RefCOCODataset):
+
+    def __getitem__(self, index):
+        sample = self.data[index]
+        # print(sample)
+        images = sample["image"]
+        question = sample["question"]
+        answer = sample["answer"]
+
+        if self.vis_processor is not None:
+            images = self.vis_processor(images)
+        if self.text_processor is not None:
+            question = self.text_processor(question)
+
+        chat = [
+            Conversation(
+                role="user",
+                content=[
+                    ConverstationImage(type="image", image_url=""),
+                    ConverstationText(
+                        type="text",
+                        text=f"{question}",
+                    ),
+                ],
+            ),
+        ]
+
+        return DatasetOutput(
+            images=[images],
+            chat=chat,
+            answer=answer,
+            original=sample,
+        )  # type: ignore
+
+
+def test_RefCOCO():
+    dataset = RefCOCODataset(
+        split="val",
+    )
+    print(dataset[3])
+    assert len(dataset) > 0
+    assert len(dataset[0]["chat"]) > 0
+    dataset = RefCOCODatasetForGeneration(
+        split="test",
+    )
+    print(dataset[3])
+    assert len(dataset) > 0
+    assert len(dataset[0]["chat"]) > 0
+
+
+if __name__ == "__main__":
+    test_RefCOCO()
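
Note: the chat layout emitted above (an image placeholder followed by text parts) follows the multimodal chat-template convention, so a processor can consume it more or less directly. A sketch, assuming a Qwen2-VL-style processor whose template accepts these content dicts; the checkpoint name is illustrative:

# Illustrative consumer of the RefCOCO chat format (not part of the patch).
from transformers import AutoProcessor
from dataset_library.RefCOCODataset import RefCOCODataset

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
sample = RefCOCODataset(split="val")[0]
text = processor.apply_chat_template(sample["chat"], tokenize=False)
inputs = processor(text=[text], images=sample["images"], return_tensors="pt")
print(inputs.keys())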
diff --git a/src/dataset_library/RefCOCOPlusDataset.py b/src/dataset_library/RefCOCOPlusDataset.py
new file mode 100644
index 0000000..8d574b3
--- /dev/null
+++ b/src/dataset_library/RefCOCOPlusDataset.py
@@ -0,0 +1,121 @@
+from .format import (
+    Conversation,
+    ConverstationAudio,
+    ConverstationImage,
+    ConverstationText,
+    DatasetOutput,
+)
+from torch.utils.data import Dataset
+from datasets import load_dataset, DatasetDict
+from typing import Literal
+
+
+class RefCOCOplusDataset(Dataset):
+    def __init__(
+        self,
+        vis_processor=None,
+        text_processor=None,
+        split: Literal["val", "testA"] = "val",
+    ):
+        """
+        vis_processor (callable, optional): transform applied to each image
+        text_processor (callable, optional): transform applied to each question
+        """
+
+        self.vis_processor = vis_processor
+        self.text_processor = text_processor
+        from .format import dataset_dir
+        ds = load_dataset("lmms-lab/RefCOCOplus", cache_dir=dataset_dir)  # type: ignore
+        self.data = ds[split]  # type: ignore
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, index):
+        sample = self.data[index]
+        # print(sample)
+        images = sample["image"]
+        question = sample["question"]
+        answer = sample["answer"]
+
+        if self.vis_processor is not None:
+            images = self.vis_processor(images)
+        if self.text_processor is not None:
+            question = self.text_processor(question)
+
+        chat = [
+            Conversation(
+                role="user",
+                content=[
+                    ConverstationImage(type="image", image_url=""),
+                    ConverstationText(
+                        type="text",
+                        text=question,
+                    ),
+                ],
+            ),
+            Conversation(
+                role="assistant",
+                content=[ConverstationText(type="text", text=answer)],
+            ),
+        ]
+
+        return DatasetOutput(
+            images=[images],
+            chat=chat,
+            original=sample,
+        )  # type: ignore
+
+
+class RefCOCOplusDatasetForGeneration(RefCOCOplusDataset):
+
+    def __getitem__(self, index):
+        sample = self.data[index]
+        # print(sample)
+        images = sample["image"]
+        question = sample["question"]
+        answer = sample["answer"]
+
+        if self.vis_processor is not None:
+            images = self.vis_processor(images)
+        if self.text_processor is not None:
+            question = self.text_processor(question)
+
+        chat = [
+            Conversation(
+                role="user",
+                content=[
+                    ConverstationImage(type="image", image_url=""),
+                    ConverstationText(
+                        type="text",
+                        text=f"{question}",
+                    ),
+                ],
+            ),
+        ]
+
+        return DatasetOutput(
+            images=[images],
+            chat=chat,
+            answer=answer,
+            original=sample,
+        )  # type: ignore
+
+
+def test_RefCOCOplus():
+    dataset = RefCOCOplusDataset(
+        split="val",
+    )
+    print(dataset[3])
+    assert len(dataset) > 0
+    assert len(dataset[0]["chat"]) > 0
+    dataset = RefCOCOplusDatasetForGeneration(
+        split="testA",
+    )
+    print(dataset[3])
+    assert len(dataset) > 0
+    assert len(dataset[0]["chat"]) > 0
+
+
+if __name__ == "__main__":
+    test_RefCOCOplus()
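
Note: RefCOCO+ conventionally ships val/testA/testB splits, which is presumably why this class pins Literal["val", "testA"]. The actual split names on the Hub can be confirmed before wiring the factory; a small check, assuming network access:

# Confirm the available splits on the Hub (illustrative).
from datasets import get_dataset_split_names

print(get_dataset_split_names("lmms-lab/RefCOCOplus"))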
diff --git a/src/dataset_library/RefCOCOgDataset.py b/src/dataset_library/RefCOCOgDataset.py
new file mode 100644
index 0000000..2121491
--- /dev/null
+++ b/src/dataset_library/RefCOCOgDataset.py
@@ -0,0 +1,121 @@
+from .format import (
+    Conversation,
+    ConverstationAudio,
+    ConverstationImage,
+    ConverstationText,
+    DatasetOutput,
+)
+from torch.utils.data import Dataset
+from datasets import load_dataset, DatasetDict
+from typing import Literal
+
+
+class RefCOCOgDataset(Dataset):
+    def __init__(
+        self,
+        vis_processor=None,
+        text_processor=None,
+        split: Literal["val", "test"] = "val",
+    ):
+        """
+        vis_processor (callable, optional): transform applied to each image
+        text_processor (callable, optional): transform applied to each question
+        """
+
+        self.vis_processor = vis_processor
+        self.text_processor = text_processor
+        from .format import dataset_dir
+        ds = load_dataset("lmms-lab/RefCOCOg", cache_dir=dataset_dir)  # type: ignore
+        self.data = ds[split]  # type: ignore
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, index):
+        sample = self.data[index]
+        # print(sample)
+        images = sample["image"]
+        question = sample["question"]
+        answer = sample["answer"]
+
+        if self.vis_processor is not None:
+            images = self.vis_processor(images)
+        if self.text_processor is not None:
+            question = self.text_processor(question)
+
+        chat = [
+            Conversation(
+                role="user",
+                content=[
+                    ConverstationImage(type="image", image_url=""),
+                    ConverstationText(
+                        type="text",
+                        text=question,
+                    ),
+                ],
+            ),
+            Conversation(
+                role="assistant",
+                content=[ConverstationText(type="text", text=answer)],
+            ),
+        ]
+
+        return DatasetOutput(
+            images=[images],
+            chat=chat,
+            original=sample,
+        )  # type: ignore
+
+
+class RefCOCOgDatasetForGeneration(RefCOCOgDataset):
+
+    def __getitem__(self, index):
+        sample = self.data[index]
+        # print(sample)
+        images = sample["image"]
+        question = sample["question"]
+        answer = sample["answer"]
+
+        if self.vis_processor is not None:
+            images = self.vis_processor(images)
+        if self.text_processor is not None:
+            question = self.text_processor(question)
+
+        chat = [
+            Conversation(
+                role="user",
+                content=[
+                    ConverstationImage(type="image", image_url=""),
+                    ConverstationText(
+                        type="text",
+                        text=f"{question}",
+                    ),
+                ],
+            ),
+        ]
+
+        return DatasetOutput(
+            images=[images],
+            chat=chat,
+            answer=answer,
+            original=sample,
+        )  # type: ignore
+
+
+def test_RefCOCOg():
+    dataset = RefCOCOgDataset(
+        split="val",
+    )
+    print(dataset[3])
+    assert len(dataset) > 0
+    assert len(dataset[0]["chat"]) > 0
+    dataset = RefCOCOgDatasetForGeneration(
+        split="test",
+    )
+    print(dataset[3])
+    assert len(dataset) > 0
+    assert len(dataset[0]["chat"]) > 0
+
+
+if __name__ == "__main__":
+    test_RefCOCOg()
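
Note: the three Ref* modules are identical except for the Hub id, the split literals, and the class names. If the duplication ever becomes a maintenance burden, a shared base is one option. A sketch only, not part of this patch; _RefBase is a hypothetical name and the shared __getitem__ body (elided here) would be the chat-building code above:

# Hypothetical consolidation of the Ref* datasets (illustrative only).
from datasets import load_dataset
from torch.utils.data import Dataset


class _RefBase(Dataset):
    HUB_ID = ""  # pinned by each subclass

    def __init__(self, vis_processor=None, text_processor=None, split="val"):
        from dataset_library.format import dataset_dir

        self.vis_processor = vis_processor
        self.text_processor = text_processor
        self.data = load_dataset(self.HUB_ID, cache_dir=dataset_dir)[split]

    def __len__(self):
        return len(self.data)

    # __getitem__ would hold the shared chat-building body shown above.


class RefCOCODataset(_RefBase):
    HUB_ID = "lmms-lab/RefCOCO"


class RefCOCOplusDataset(_RefBase):
    HUB_ID = "lmms-lab/RefCOCOplus"


class RefCOCOgDataset(_RefBase):
    HUB_ID = "lmms-lab/RefCOCOg"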
diff --git a/src/dataset_library/ScienceQADataset.py b/src/dataset_library/ScienceQADataset.py
index 77f92ec..6f3c17e 100644
--- a/src/dataset_library/ScienceQADataset.py
+++ b/src/dataset_library/ScienceQADataset.py
@@ -6,20 +6,21 @@ from .format import (
     DatasetOutput,
 )
 from torch.utils.data import Dataset
-from datasets import load_dataset
+from datasets import load_dataset, DatasetDict
 
 
 class ScienceQADataset(Dataset):
-    def __init__(self, audio_processor=None, text_processor=None, split="train"):
+    def __init__(self, vis_processor=None, text_processor=None, split="train"):
         """
         vis_root (string): Root directory of images (e.g. coco/images/)
         ann_root (string): directory to store the annotation file
         """
 
-        self.vis_processor = audio_processor
+        self.vis_processor = vis_processor
         self.text_processor = text_processor
-        ds = load_dataset("derek-thomas/ScienceQA")
-        self.data = ds[split]
+        from .format import dataset_dir
+        ds = load_dataset("derek-thomas/ScienceQA", cache_dir=dataset_dir)
+        self.data = ds[split]  # type: ignore
 
     def __len__(self):
         return len(self.data)
@@ -60,7 +61,7 @@
             images=[images],
             chat=chat,
             original=sample,
-        )
+        )  # type: ignore
 
 
 class ScienceQADatasetForGeneration(ScienceQADataset):
@@ -98,7 +99,7 @@
             chat=chat,
             answer=choices[answer],
             original=sample,
-        )
+        )  # type: ignore
 
 
 def test_scienceQA():
diff --git a/src/dataset_library/TextVQADataset.py b/src/dataset_library/TextVQADataset.py
index e7c7506..51fe8d8 100644
--- a/src/dataset_library/TextVQADataset.py
+++ b/src/dataset_library/TextVQADataset.py
@@ -124,7 +124,7 @@ class TextVQADataset(Dataset):
             chat=chat,
             original=sample["original"],
             images=[image],
-        )
+        )  # type: ignore
 
 
 class TextVQADatasetForGeneration(TextVQADataset):
@@ -158,16 +158,5 @@ class TextVQADatasetForGeneration(TextVQADataset):
             chat=chat,
             answer=answer,
             original=sample["original"],
-        )
+        )  # type: ignore
 
-
-def test_dataset():
-    vis_root = "/home/zyy/dataset/TextVQA/images"
-    ann_path = "/home/zyy/dataset/TextVQA"
-    dataset = TextVQADataset(vis_root, ann_path)
-    for i in range(10):
-        print(dataset[i])
-
-
-if __name__ == "__main__":
-    test_dataset()
diff --git a/src/dataset_library/factory.py b/src/dataset_library/factory.py
index eba5842..3f6e4b9 100644
--- a/src/dataset_library/factory.py
+++ b/src/dataset_library/factory.py
@@ -1,10 +1,11 @@
 from torch.utils.data import Dataset
 from typing import Literal
 from pathlib import Path
+from dataset_library.format import dataset_dir
 
 
 def get_dataset(
-    dataset_name, base_path="/home/zyy/dataset"
+    dataset_name, base_path=dataset_dir
 ) -> dict[Literal["train", "test", "generation"], Dataset]:
     dataset: dict[Literal["train", "test", "generation"], Dataset] = {}
     match dataset_name:
@@ -92,4 +93,40 @@ def get_dataset(
                 "generation": ScienceQADatasetForGeneration(split="test"),
             }
 
+        case "refcoco":
+            from .RefCOCODataset import (
+                RefCOCODataset,
+                RefCOCODatasetForGeneration,
+            )
+
+            dataset = {
+                "train": RefCOCODataset(split="val"),
+                "test": RefCOCODataset(split="test"),
+                "generation": RefCOCODatasetForGeneration(split="test"),
+            }
+
+        case "refcocog":
+            from .RefCOCOgDataset import (
+                RefCOCOgDataset,
+                RefCOCOgDatasetForGeneration,
+            )
+
+            dataset = {
+                "train": RefCOCOgDataset(split="val"),
+                "test": RefCOCOgDataset(split="test"),
+                "generation": RefCOCOgDatasetForGeneration(split="test"),
+            }
+
+        case "refcocoplus":
+            from .RefCOCOPlusDataset import (
+                RefCOCOplusDataset,
+                RefCOCOplusDatasetForGeneration,
+            )
+
+            dataset = {
+                "train": RefCOCOplusDataset(split="val"),
+                "test": RefCOCOplusDataset(split="testA"),
+                "generation": RefCOCOplusDatasetForGeneration(split="testA"),
+            }
+
     return dataset
diff --git a/src/dataset_library/format.py b/src/dataset_library/format.py
index f4c6efc..f983eec 100644
--- a/src/dataset_library/format.py
+++ b/src/dataset_library/format.py
@@ -1,6 +1,9 @@
 from typing import Any, Tuple, TypedDict, Literal, Optional
 import numpy as np
 from PIL import Image
+from pathlib import Path
+
+dataset_dir = Path(__file__).resolve().parent.parent.parent / "dataset"
 
 
 class ConverstationText(TypedDict):
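
Note: for the three new entries, "train" is served from the val split, presumably because the lmms-lab Ref* repos publish only evaluation splits; also, get_dataset silently returns an empty dict for an unknown name, since the match statement has no default case. Typical usage, assuming src is on PYTHONPATH:

# Illustrative use of the new factory entries (not part of the patch).
from dataset_library.factory import get_dataset

splits = get_dataset("refcoco")
print(len(splits["train"]), len(splits["test"]), len(splits["generation"]))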
a/src/dataset_library/test_dataset.py b/src/dataset_library/test_dataset.py
index 516abc7..21690e9 100644
--- a/src/dataset_library/test_dataset.py
+++ b/src/dataset_library/test_dataset.py
@@ -1,46 +1,70 @@
 from .factory import get_dataset
 
 
-def test_gigaspeech():
-    dataset = get_dataset("gigaspeech")
-    assert len(dataset["train"]) > 0
+# def test_gigaspeech():
+#     dataset = get_dataset("gigaspeech")
+#     assert len(dataset["train"]) > 0  # type: ignore
+#     assert len(dataset["train"][0]["chat"]) > 0
+
+#     assert len(dataset["test"]) > 0  # type: ignore
+#     assert len(dataset["test"][0]["chat"]) > 0
+
+
+# def test_chem():
+#     dataset = get_dataset("chem")
+#     assert len(dataset["train"]) > 0  # type: ignore
+#     assert len(dataset["train"][0]["chat"]) > 0
+
+#     assert len(dataset["test"]) > 0  # type: ignore
+#     assert len(dataset["test"][0]["chat"]) > 0
+
+
+# def test_ocrvqa200k():
+#     dataset = get_dataset("ocrvqa200k")
+#     assert len(dataset["train"]) > 0  # type: ignore
+#     assert len(dataset["train"][0]["chat"]) > 0
+
+#     assert len(dataset["test"]) > 0  # type: ignore
+#     assert len(dataset["test"][0]["chat"]) > 0
+
+
+# def test_textvqa():
+#     dataset = get_dataset("textvqa")
+#     assert len(dataset["train"]) > 0  # type: ignore
+#     assert len(dataset["train"][0]["chat"]) > 0
+
+#     assert len(dataset["test"]) > 0  # type: ignore
+#     assert len(dataset["test"][0]["chat"]) > 0
+
+
+# def test_scienceqa():
+#     dataset = get_dataset("scienceqa")
+#     assert len(dataset["train"]) > 0  # type: ignore
+#     assert len(dataset["train"][0]["chat"]) > 0
+
+#     assert len(dataset["test"]) > 0  # type: ignore
+#     assert len(dataset["test"][0]["chat"]) > 0
+
+def test_refcoco():
+    dataset = get_dataset("refcoco")
+    assert len(dataset["train"]) > 0  # type: ignore
     assert len(dataset["train"][0]["chat"]) > 0
 
-    assert len(dataset["test"]) > 0
+    assert len(dataset["test"]) > 0  # type: ignore
     assert len(dataset["test"][0]["chat"]) > 0
 
-
-def test_chem():
-    dataset = get_dataset("chem")
-    assert len(dataset["train"]) > 0
+
+def test_refcocog():
+    dataset = get_dataset("refcocog")
+    assert len(dataset["train"]) > 0  # type: ignore
     assert len(dataset["train"][0]["chat"]) > 0
 
-    assert len(dataset["test"]) > 0
+    assert len(dataset["test"]) > 0  # type: ignore
     assert len(dataset["test"][0]["chat"]) > 0
 
-
-def test_ocrvqa200k():
-    dataset = get_dataset("ocrvqa200k")
-    assert len(dataset["train"]) > 0
+
+def test_refcocoplus():
+    dataset = get_dataset("refcocoplus")
+    assert len(dataset["train"]) > 0  # type: ignore
     assert len(dataset["train"][0]["chat"]) > 0
 
-    assert len(dataset["test"]) > 0
-    assert len(dataset["test"][0]["chat"]) > 0
-
-
-def test_textvqa():
-    dataset = get_dataset("textvqa")
-    assert len(dataset["train"]) > 0
-    assert len(dataset["train"][0]["chat"]) > 0
-
-    assert len(dataset["test"]) > 0
-    assert len(dataset["test"][0]["chat"]) > 0
-
-
-def test_scienceqa():
-    dataset = get_dataset("scienceqa")
-    assert len(dataset["train"]) > 0
-    assert len(dataset["train"][0]["chat"]) > 0
-
-    assert len(dataset["test"]) > 0
+    assert len(dataset["test"]) > 0  # type: ignore
     assert len(dataset["test"][0]["chat"]) > 0
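
Note: the earlier tests are commented out rather than removed, which drops them from pytest collection entirely. If keeping them visible in test reports is preferred, marking them skipped is a lighter-touch alternative; a sketch, not part of the patch:

# Alternative to commenting out (illustrative): the test stays collected
# and is reported as skipped instead of disappearing from the suite.
import pytest

from dataset_library.factory import get_dataset


@pytest.mark.skip(reason="requires locally downloaded dataset")
def test_gigaspeech():
    dataset = get_dataset("gigaspeech")
    assert len(dataset["train"]) > 0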
diff --git a/src/dataset_library/vis_processor.py b/src/dataset_library/vis_processor.py
new file mode 100644
index 0000000..e452143
--- /dev/null
+++ b/src/dataset_library/vis_processor.py
@@ -0,0 +1,18 @@
+from PIL import Image
+def size_processor(image: Image.Image):
+    width, height = image.size
+    if width > 500 or height > 500:
+        max_size = max(width, height)
+        ratio = 500 / max_size
+        new_width = int(width * ratio)
+        new_height = int(height * ratio)
+        image = image.resize((new_width, new_height), Image.Resampling.BILINEAR)
+
+    if width < 28 or height < 28:
+        min_size = min(width, height)
+        ratio = 28 / min_size + 1
+        new_width = int(width * ratio)
+        new_height = int(height * ratio)
+        image = image.resize((new_width, new_height), Image.Resampling.BILINEAR)
+
+    return image
\ No newline at end of file
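
Note: size_processor reproduces the removed OCRVQADataset._vis_processor verbatim, so the move itself does not change behavior. Both branches read the original width/height, though: a 600x20 image is first shrunk to 500x16 by the >500 rule, then the <28 rule fires on the original dimensions with ratio = 28 / min_size + 1 and rescales to 1440x48, putting the long side back above 500. A quick check, assuming src is on PYTHONPATH:

# Demonstrates the branch interaction in size_processor (illustrative).
from PIL import Image

from dataset_library.vis_processor import size_processor

out = size_processor(Image.new("RGB", (600, 20)))
print(out.size)  # (1440, 48): both resizes fire; the second uses original dims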