Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 31 additions & 7 deletions kmod/src/data.c
Original file line number Diff line number Diff line change
Expand Up @@ -296,36 +296,49 @@ static s64 truncate_extents(struct super_block *sb, struct inode *inode,
* and offline blocks. If it's not provided then the inode is being
* destroyed and isn't reachable, we don't need to update it.
*
* If 'pause' is set, then we are destroying the inode and we should take
* breaks occasionally to allow other nodes access to this inode lock shard.
*
* The caller is in charge of locking the inode and data, but we may
* have to modify far more items than fit in a transaction so we're in
* charge of batching updates into transactions. If the inode is
* provided then we're responsible for updating its item as we go.
*/
int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
u64 ino, u64 iblock, u64 last, bool offline,
struct scoutfs_lock *lock)
u64 ino, u64 *iblock, u64 last, bool offline,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this function has gotten to the point where it's no longer helpful shared code with some differences, but actually two different things interleaved with conditionals. Instead of also letting it make partial progress, let's split it into two functions: this one, which truncates live inodes and requires the inode pointer, and a new _data_free_items that is specifically built to remove a transaction's worth of extents and then return. It wouldn't need any of the arguments -- it'd always start at 0 and remove whatever it finds.

struct scoutfs_lock *lock, bool pause)
{
struct scoutfs_inode_info *si = NULL;
LIST_HEAD(ind_locks);
u64 cur_seq;
s64 ret = 0;

WARN_ON_ONCE(inode && !inode_is_locked(inode));

/*
* If the inode is provided, then we aren't destroying it. So it's not
* safe to pause while removing items- it needs to be done in one chunk.
*/
if (WARN_ON_ONCE(pause && inode))
return -EINVAL;

/* clamp last to the last possible block? */
if (last > SCOUTFS_BLOCK_SM_MAX)
last = SCOUTFS_BLOCK_SM_MAX;

trace_scoutfs_data_truncate_items(sb, iblock, last, offline);
trace_scoutfs_data_truncate_items(sb, *iblock, last, offline, pause);

if (WARN_ON_ONCE(last < iblock))
if (WARN_ON_ONCE(last < *iblock))
return -EINVAL;

if (inode) {
si = SCOUTFS_I(inode);
down_write(&si->extent_sem);
}

while (iblock <= last) {
cur_seq = scoutfs_trans_sample_seq(sb);

while (*iblock <= last) {
if (inode)
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
else
Expand All @@ -339,7 +352,7 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
ret = 0;

if (ret == 0)
ret = truncate_extents(sb, inode, ino, iblock, last,
ret = truncate_extents(sb, inode, ino, *iblock, last,
offline, lock);

if (inode)
Expand All @@ -351,8 +364,19 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
if (ret <= 0)
break;

iblock = ret;
*iblock = ret;
ret = 0;

/*
* We know there's more to do because truncate_extents()
* pauses every EXTENTS_PER_HOLD extents and it returned the
* next starting block. Our caller might also want us to pause,
* which we will do whenever we cross a transaction boundary.
*/
if (pause && (cur_seq != scoutfs_trans_sample_seq(sb))) {
ret = -EINPROGRESS;
break;
}
}

if (si)
Expand Down
4 changes: 2 additions & 2 deletions kmod/src/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ int scoutfs_get_block_write(struct inode *inode, sector_t iblock, struct buffer_
int create);

int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
u64 ino, u64 iblock, u64 last, bool offline,
struct scoutfs_lock *lock);
u64 ino, u64 *iblock, u64 last, bool offline,
struct scoutfs_lock *lock, bool pause);
int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len);
Expand Down
40 changes: 30 additions & 10 deletions kmod/src/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -468,8 +468,8 @@ int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock)
start = (i_size_read(inode) + SCOUTFS_BLOCK_SM_SIZE - 1) >>
SCOUTFS_BLOCK_SM_SHIFT;
ret = scoutfs_data_truncate_items(inode->i_sb, inode,
scoutfs_ino(inode), start, ~0ULL,
false, lock);
scoutfs_ino(inode), &start, ~0ULL,
false, lock, false);
err = clear_truncate_flag(inode, lock);

return ret ? ret : err;
Expand Down Expand Up @@ -1635,7 +1635,8 @@ int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_
* partial deletion until all deletion is complete and the orphan item
* is removed.
*/
static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_inode *sinode,
static int delete_inode_items(struct super_block *sb, u64 ino,
struct scoutfs_inode *sinode, u64 *start,
struct scoutfs_lock *lock, struct scoutfs_lock *orph_lock)
{
struct scoutfs_key key;
Expand All @@ -1654,8 +1655,8 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_in

/* remove data items in their own transactions */
if (S_ISREG(mode)) {
ret = scoutfs_data_truncate_items(sb, NULL, ino, 0, ~0ULL,
false, lock);
ret = scoutfs_data_truncate_items(sb, NULL, ino, start, ~0ULL,
false, lock, true);
if (ret)
goto out;
}
Expand Down Expand Up @@ -1803,16 +1804,23 @@ static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lo
*/
static int try_delete_inode_items(struct super_block *sb, u64 ino)
{
struct inode_deletion_lock_data *ldata = NULL;
struct scoutfs_lock *orph_lock = NULL;
struct scoutfs_lock *lock = NULL;
struct inode_deletion_lock_data *ldata;
struct scoutfs_lock *orph_lock;
struct scoutfs_lock *lock;
struct scoutfs_inode sinode;
struct scoutfs_key key;
bool clear_trying = false;
bool more = false;
u64 group_nr;
u64 start = 0;
int bit_nr;
int ret;

again:
ldata = NULL;
orph_lock = NULL;
lock = NULL;

ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &lock);
if (ret < 0)
goto out;
Expand All @@ -1824,11 +1832,12 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
goto out;

/* only one local attempt per inode at a time */
if (test_and_set_bit(bit_nr, ldata->trying)) {
if (!more && test_and_set_bit(bit_nr, ldata->trying)) {
ret = 0;
goto out;
}
clear_trying = true;
more = false;

/* can't delete if it's cached in local or remote mounts */
if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) {
Expand All @@ -1853,14 +1862,25 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
if (ret < 0)
goto out;

ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
ret = delete_inode_items(sb, ino, &sinode, &start, lock, orph_lock);

if (ret == -EINPROGRESS) {
more = true;
clear_trying = false;
} else {
more = false;
}

out:
if (clear_trying)
clear_bit(bit_nr, ldata->trying);

scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);

if (more)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding this loop without clearing more at the start makes me nervous. It certainly looks like it'll loop forever if any of the errors before more is cleared are returned (perhaps from shutdown or forced unmount returning blanket errors from net/io paths). I'd rather not bother with this additional loop and would instead make sure to do enough work per iteration that we don't mind returning to the caller and letting them use their existing retry mechanisms.

goto again;

return ret;
}

Expand Down
8 changes: 4 additions & 4 deletions kmod/src/ioctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -372,9 +372,9 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
sblock = args.offset >> SCOUTFS_BLOCK_SM_SHIFT;
eblock = (args.offset + args.length - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
ret = scoutfs_data_truncate_items(sb, inode, scoutfs_ino(inode),
sblock,
&sblock,
eblock, true,
lock);
lock, false);
if (ret == 0) {
scoutfs_inode_get_onoff(inode, &online, &offline);
isize = i_size_read(inode);
Expand All @@ -383,8 +383,8 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
>> SCOUTFS_BLOCK_SM_SHIFT;
ret = scoutfs_data_truncate_items(sb, inode,
scoutfs_ino(inode),
sblock, U64_MAX,
false, lock);
&sblock, U64_MAX,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These changes make me nervous. This used to obviously always truncate (s,e) and (s, ~0). Now it probably truncates from (s, e) and (e, ~0), but we'd have to go diving into the implementation to make sure there isn't some accidental final increment before returning that means it actually truncates (e + 1, ~0), or whatever. Let's avoid this pattern as much as possible. (It's why truncate_extent_items() returns the block number instead of updating an arg. This lesson was learned over and over in the kernel with the *ppos arguments to write functions that sometimes updated write pointers, or not, and was a nightmare.)

false, lock, false);
}
}

Expand Down
11 changes: 7 additions & 4 deletions kmod/src/scoutfs_trace.h
Original file line number Diff line number Diff line change
Expand Up @@ -378,26 +378,29 @@ DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_fiemap_extent,
);

TRACE_EVENT(scoutfs_data_truncate_items,
TP_PROTO(struct super_block *sb, __u64 iblock, __u64 last, int offline),
TP_PROTO(struct super_block *sb, __u64 iblock, __u64 last, int offline, bool pause),

TP_ARGS(sb, iblock, last, offline),
TP_ARGS(sb, iblock, last, offline, pause),

TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, iblock)
__field(__u64, last)
__field(int, offline)
__field(bool, pause)
),

TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->iblock = iblock;
__entry->last = last;
__entry->offline = offline;
__entry->pause = pause;
),

TP_printk(SCSBF" iblock %llu last %llu offline %u", SCSB_TRACE_ARGS,
__entry->iblock, __entry->last, __entry->offline)
TP_printk(SCSBF" iblock %llu last %llu offline %u pause %d",
SCSB_TRACE_ARGS, __entry->iblock, __entry->last,
__entry->offline, __entry->pause)
);

TRACE_EVENT(scoutfs_data_wait_check,
Expand Down
1 change: 0 additions & 1 deletion tests/golden/large-fragmented-free
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
== setting longer hung task timeout
== creating fragmented extents
== unlink file with moved extents to free extents per block
== cleanup
24 changes: 0 additions & 24 deletions tests/tests/large-fragmented-free.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,30 +10,6 @@ EXTENTS_PER_BTREE_BLOCK=600
EXTENTS_PER_LIST_BLOCK=8192
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))

#
# This test specifically creates a pathologically sparse file that will
# be as expensive as possible to free. This is usually fine on
# dedicated or reasonable hardware, but trying to run this in
# virtualized debug kernels can take a very long time. This test is
# about making sure that the server doesn't fail, not that the platform
# can handle the scale of work that our btree formats happen to require
# while execution is bogged down with use-after-free memory reference
# tracking. So we give the test a lot more breathing room before
# deciding that its hung.
#
echo "== setting longer hung task timeout"
if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then
secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)
test "$secs" -gt 0 || \
t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
restore_hung_task_timeout()
{
echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
}
trap restore_hung_task_timeout EXIT
echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs
fi

echo "== creating fragmented extents"
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"

Expand Down