Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions data_toolkit/asset_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,19 @@
if os.path.exists(os.path.join(opt.pbr_dump_root, 'pbr_dumps', 'metadata.csv')):
metadata = metadata.combine_first(pd.read_csv(os.path.join(opt.pbr_dump_root, 'pbr_dumps', 'metadata.csv')).set_index('sha256'))
metadata = metadata.reset_index()
if 'mesh_dumped' not in metadata.columns:
metadata['mesh_dumped'] = False
dump_path = os.path.join(opt.mesh_dump_root, 'mesh_dumps')
if os.path.exists(dump_path):
dumped_files = [f.replace('.pickle', '') for f in os.listdir(dump_path) if f.endswith('.pickle')]
metadata.loc[metadata['sha256'].isin(dumped_files), 'mesh_dumped'] = True

if 'pbr_dumped' not in metadata.columns:
metadata['pbr_dumped'] = False
dump_path = os.path.join(opt.pbr_dump_root, 'pbr_dumps')
if os.path.exists(dump_path):
dumped_files = [f.replace('.pickle', '') for f in os.listdir(dump_path) if f.endswith('.pickle')]
metadata.loc[metadata['sha256'].isin(dumped_files), 'pbr_dumped'] = True
if opt.instances is None:
if 'num_faces' in metadata.columns:
metadata = metadata[metadata['num_faces'].isnull()]
Expand Down
94 changes: 94 additions & 0 deletions data_toolkit/datasets/ObjaverseXL.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import os
import argparse
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import pandas as pd
import objaverse.xl as oxl
from utils import get_file_hash
import tempfile
import zipfile



def add_args(parser: argparse.ArgumentParser):
    """Register the ObjaverseXL-specific command-line options on *parser*."""
    parser.add_argument(
        '--source',
        type=str,
        default='sketchfab',
        help='Data source to download annotations from (github, sketchfab)',
    )


def get_metadata(source, **kwargs):
    """Load the TRELLIS-500K ObjaverseXL metadata table for *source*.

    source must be 'sketchfab' or 'github'; any other value raises ValueError.
    Extra keyword arguments are accepted and ignored.
    """
    # Dispatch table: one dataset CSV per supported source.
    csv_urls = {
        'sketchfab': "hf://datasets/JeffreyXiang/TRELLIS-500K/ObjaverseXL_sketchfab.csv",
        'github': "hf://datasets/JeffreyXiang/TRELLIS-500K/ObjaverseXL_github.csv",
    }
    if source not in csv_urls:
        raise ValueError(f"Invalid source: {source}")
    return pd.read_csv(csv_urls[source])


def download(metadata, output_dir, **kwargs):
    """Download the objects listed in *metadata* into ``output_dir/raw``.

    Returns a DataFrame with columns ['sha256', 'local_path'], where
    local_path is relative to output_dir. Extra keyword arguments are ignored.
    """
    raw_dir = os.path.join(output_dir, 'raw')
    os.makedirs(raw_dir, exist_ok=True)

    # Restrict the full ObjaverseXL annotation table to the requested objects.
    annotations = oxl.get_annotations()
    annotations = annotations[annotations['sha256'].isin(metadata['sha256'].values)]

    # Fetch the objects; GitHub repositories are saved as zip archives.
    file_paths = oxl.download_objects(
        annotations,
        download_dir=raw_dir,
        save_repo_format="zip",
    )

    # Map each downloaded file back to its sha256 via the file identifier.
    by_identifier = metadata.set_index("file_identifier")
    downloaded = {
        by_identifier.loc[identifier, "sha256"]: os.path.relpath(path, output_dir)
        for identifier, path in file_paths.items()
    }

    return pd.DataFrame(downloaded.items(), columns=['sha256', 'local_path'])

def foreach_instance(metadata, output_dir, func, max_workers=None, desc='Processing objects', no_file=False):
    """Apply *func* to every object listed in *metadata* using a thread pool.

    Parameters:
        metadata: DataFrame with a 'sha256' column and — unless no_file —
            a 'local_path' column relative to output_dir.
        output_dir: root directory containing the downloaded objects.
        func: callable(file_path_or_None, metadatum_dict) -> record or None.
            None results are dropped.
        max_workers: thread count; defaults to os.cpu_count() when None or <= 0.
        desc: tqdm progress-bar description.
        no_file: when True, func is called with file=None (no path resolution).

    Returns:
        DataFrame built from the non-None records returned by func. Row order
        follows thread completion order, not the order of *metadata*.
    """
    records = []
    if max_workers is None or max_workers <= 0:
        max_workers = os.cpu_count()

    def worker(metadatum):
        try:
            if no_file:
                record = func(None, metadatum)
            else:
                local_path = metadatum['local_path']
                if local_path.startswith('raw/github/repos/'):
                    # GitHub repos were saved zipped (save_repo_format="zip"):
                    # path_parts[:5] is the archive, path_parts[5:] the member.
                    path_parts = local_path.split('/')
                    file_name = os.path.join(*path_parts[5:])
                    zip_file = os.path.join(output_dir, *path_parts[:5])
                    with tempfile.TemporaryDirectory() as tmp_dir:
                        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                            zip_ref.extractall(tmp_dir)
                        # Call func while tmp_dir still exists.
                        record = func(os.path.join(tmp_dir, file_name), metadatum)
                else:
                    record = func(os.path.join(output_dir, local_path), metadatum)
            if record is not None:
                records.append(record)  # list.append is atomic in CPython
        except Exception as e:
            # Best-effort: log and continue with the remaining objects.
            print(f"Error processing object {metadatum.get('sha256', 'unknown')}: {e}")
        finally:
            # Single progress tick per object, on success or failure alike.
            pbar.update()

    try:
        # Exiting the `with` block waits for all submitted work, so no
        # explicit executor.shutdown() is needed.
        with ThreadPoolExecutor(max_workers=max_workers) as executor, \
                tqdm(total=len(metadata), desc=desc) as pbar:
            for metadatum in metadata.to_dict('records'):
                executor.submit(worker, metadatum)
    except Exception as e:
        print(f"Error happened during processing: {e}")

    return pd.DataFrame.from_records(records)
3 changes: 3 additions & 0 deletions data_toolkit/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
parser = argparse.ArgumentParser()
parser.add_argument('--root', type=str, required=True,
help='Directory to save the metadata')
parser.add_argument('--output_dir', type=str, default=None,
help='Directory to save the metadata')
parser.add_argument('--download_root', type=str, default=None,
help='Directory to download the objects')
parser.add_argument('--filter_low_aesthetic_score', type=float, default=None,
Expand All @@ -25,6 +27,7 @@
parser.add_argument('--world_size', type=int, default=1)
opt = parser.parse_args(sys.argv[2:])
opt = edict(vars(opt))
opt.output_dir = opt.output_dir or opt.root
opt.download_root = opt.download_root or opt.root

os.makedirs(opt.root, exist_ok=True)
Expand Down