-
Notifications
You must be signed in to change notification settings - Fork 0
Description
On three independent machines I ran into problems iterating over the file, which I hadn't had with the 2.2 file.
After investigating, I found a corrupted chunk in column 776627, rows 56000:58000.
I patched over it by erasing the data there:
#| export
def fix_arch_2_5(path, col=776627, row_start=56000, row_end=58000):
    """Overwrite an unreadable slab of ``data/expression`` with zeros, in place.

    The defaults target the region reported as corrupted in this file
    (column 776627, rows 56000:58000), so ``fix_arch_2_5(path)`` behaves
    exactly as before. The original values in that region are lost.

    Args:
        path: Path to the HDF5 file; it is opened in ``'r+'`` mode and
            modified on disk.
        col: Index of the single column to overwrite.
        row_start: First row of the slab (inclusive).
        row_end: End row of the slab (exclusive).
    """
    # Opened read/write with HDF5 file locking disabled, as in the original.
    with h5py.File(path, 'r+', locking=False) as f:
        d = f['data/expression']
        # NOTE(review): presumably the region has to line up with whole
        # storage chunks, otherwise HDF5 would need to decompress the damaged
        # chunk to merge the write - confirm against ``d.chunks``.
        buf = np.zeros((row_end - row_start, 1), dtype=d.dtype)
        d[row_start:row_end, col:col + 1] = buf
Here are some of the loops I used for testing (testing was done on a single machine):
# Probe one column of 'data/expression' a row-chunk at a time and report,
# per slab, whether it can be read or which exception it raises.
path = path_dict.data.arch4_data
col = 776627

with h5py.File(path, 'r') as f:
    dset = f['data/expression']
    chunk_rows, chunk_cols = dset.chunks
    n_rows = dset.shape[0]
    print(f"testing column {col} in {int(np.ceil(n_rows/chunk_rows))} row-chunks of size {chunk_rows}")

    row_starts = range(0, n_rows, chunk_rows)
    for row_start in tqdm(row_starts, desc="Row-chunks"):
        # Clamp the last slab so it never runs past the final row.
        row_end = min(n_rows, row_start + chunk_rows)
        height = row_end - row_start

        # Destination buffer holding exactly one slab of the column.
        buf = np.empty((height, 1), dtype=dset.dtype)
        column_slab = np.s_[row_start:row_end, col:col+1]
        whole_buf = np.s_[:, :]

        try:
            # Read only this slab straight into the preallocated buffer.
            dset.read_direct(buf, source_sel=column_slab, dest_sel=whole_buf)
            print(f" rows {row_start:5d}-{row_end:5d}: OK (sum={int(buf.sum())})")
        except Exception as e:
            # Keep going after a failure so every slab gets reported.
            print(f"❌ rows {row_start:5d}-{row_end:5d}: FAILED → {type(e).__name__}: {e}")
rows 48000-50000: OK (sum=80394)
rows 50000-52000: OK (sum=196096)
rows 52000-54000: OK (sum=129561)
rows 54000-56000: OK (sum=132451)
❌ rows 56000-58000: FAILED → OSError: Can't synchronously read data (inflate() failed)
rows 58000-60000: OK (sum=237353)
rows 60000-62000: OK (sum=94529)
rows 62000-64000: OK (sum=106239)
rows 64000-66000: OK (sum=65025)
# Read the suspect column and its two neighbours in full, reopening the file
# for each attempt, and report which of them can be read.
for idx in (776626, 776627, 776628):
    try:
        with h5py.File(path, 'r') as f:
            expression = f['data/expression']
            # Pull the entire column; a failure raises from this read.
            col = expression[:, idx]
            print(f"{idx}: OK")
    except Exception as e:
        # Report the failure and move on to the next column.
        print(f"{idx}: FAIL → {e}")
776626: OK
776627: FAIL → Can't synchronously read data (inflate() failed)
776628: OK