feat✨: 添加多个数据集的支持，包括Gigaspeech、TextVQA、OCR-VQA-200K、RefCOCO系列，更新数据集工厂和处理逻辑，优化图像处理功能

2025-05-15 20:33:29 +08:00
parent 9ca588224d
commit 24a6c3c114
17 changed files with 568 additions and 78 deletions
@@ -0,0 +1,2 @@
+images/*
+dataset.json
@@ -0,0 +1,49 @@
+import os
+import json
+import urllib.request as ureq
+import urllib.error
+import concurrent.futures
+import threading
+
+# Locations of the dataset index (JSON) and of the directory the images are
+# written to. Both are relative to the current working directory.
+dataset_path = './dataset.json'
+images_path = './images'
+download = 1  # Set to 0 if images are already downloaded
+
+# Load dataset json file. The encoding is passed explicitly: JSON is normally
+# UTF-8, whereas open() would otherwise decode with the platform's locale
+# encoding and may fail or mis-decode on systems where that is not UTF-8.
+with open(dataset_path, 'r', encoding='utf-8') as fp:
+    data = json.load(fp)
+
+# Number of images fetched during this run. It is shared by all worker
+# threads, so it is only modified while count_lock is held.
+downloaded_count = 0
+count_lock = threading.Lock()
+
+# Function to download an image
+def download_image(k):
+    """Download the image of dataset entry ``k`` unless it is already on disk.
+
+    The URL is read from ``data[k]['imageURL']`` and the image is stored as
+    ``<images_path>/<k><ext>``, where ``<ext>`` is whatever
+    ``os.path.splitext`` returns for the URL. Download failures are printed
+    and swallowed so that one bad entry does not stop the other downloads.
+
+    Args:
+        k: Key of the entry in the module-level ``data`` mapping.
+    """
+    import http.client  # local import: only needed by the except clause
+
+    global downloaded_count
+    imageURL = data[k]['imageURL']
+    ext = os.path.splitext(imageURL)[1]
+    outputFile = os.path.join(images_path, f'{k}{ext}')
+
+    # Only download the image if it doesn't exist
+    if os.path.exists(outputFile):
+        return
+
+    # Download under a temporary name and rename on success. Writing straight
+    # to outputFile would leave a truncated image behind when the transfer
+    # fails midway, and the exists() check above would then skip it on every
+    # later run.
+    partFile = f'{outputFile}.part'
+    try:
+        ureq.urlretrieve(imageURL, partFile)
+        os.replace(partFile, outputFile)
+    except (OSError, ValueError, http.client.HTTPException) as e:
+        # URLError is an OSError subclass. The wider tuple also reports
+        # connection resets, malformed URLs and HTTP protocol errors, which
+        # would otherwise leave this function without any message.
+        print(f'Error downloading {outputFile}: {e}')
+        try:
+            os.remove(partFile)
+        except OSError:
+            pass  # nothing was written, or the leftover cannot be removed
+    else:
+        with count_lock:
+            downloaded_count += 1
+            if downloaded_count % 100 == 0:
+                print(f'{downloaded_count} images downloaded.')
+
+# Download images using multiple threads
+if download == 1:
+    # exist_ok makes this a no-op when the directory is already there and
+    # avoids the gap between a separate existence check and the creation.
+    os.makedirs(images_path, exist_ok=True)
+
+    # Create a thread pool and download the images in parallel
+    # Increase max_workers to potentially speed up downloads for many small files.
+    # The optimal number may vary based on your network and the server's capacity.
+    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
+        futures = {executor.submit(download_image, k): k for k in data}
+
+        # An exception raised inside a worker is stored on its future. Unless
+        # it is read here, the failure passes without leaving any trace.
+        for future in concurrent.futures.as_completed(futures):
+            error = future.exception()
+            if error is not None:
+                print(f'Unexpected error for {futures[future]}: {error!r}')