Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 31 additions & 7 deletions kmod/src/data.c
Original file line number Diff line number Diff line change
Expand Up @@ -296,36 +296,49 @@ static s64 truncate_extents(struct super_block *sb, struct inode *inode,
* and offline blocks. If it's not provided then the inode is being
* destroyed and isn't reachable, we don't need to update it.
*
* If 'pause' is set, then we are destroying the inode and we should take
* breaks occasionally to allow other nodes access to this inode lock shard.
*
* The caller is in charge of locking the inode and data, but we may
* have to modify far more items than fit in a transaction so we're in
* charge of batching updates into transactions. If the inode is
* provided then we're responsible for updating its item as we go.
*/
int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
u64 ino, u64 iblock, u64 last, bool offline,
struct scoutfs_lock *lock)
u64 ino, u64 *iblock, u64 last, bool offline,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this function has gotten to the point where it's no longer helpful shared code with some differences, but actually two different things interleaved with conditionals. Instead of also letting it make partial progress, let's split it into two functions: this one, which truncates live inodes and requires the inode pointer, and a new _data_free_items that is specifically built to remove a transaction's worth of extents and then return. It wouldn't need any of the arguments -- it'd always start at 0 and remove whatever it finds.

struct scoutfs_lock *lock, bool pause)
{
struct scoutfs_inode_info *si = NULL;
LIST_HEAD(ind_locks);
u64 cur_seq;
s64 ret = 0;

WARN_ON_ONCE(inode && !inode_is_locked(inode));

/*
* If the inode is provided, then we aren't destroying it. So it's not
* safe to pause while removing items- it needs to be done in one chunk.
*/
if (WARN_ON_ONCE(pause && inode))
return -EINVAL;

/* clamp last to the last possible block? */
if (last > SCOUTFS_BLOCK_SM_MAX)
last = SCOUTFS_BLOCK_SM_MAX;

trace_scoutfs_data_truncate_items(sb, iblock, last, offline);
trace_scoutfs_data_truncate_items(sb, *iblock, last, offline, pause);

if (WARN_ON_ONCE(last < iblock))
if (WARN_ON_ONCE(last < *iblock))
return -EINVAL;

if (inode) {
si = SCOUTFS_I(inode);
down_write(&si->extent_sem);
}

while (iblock <= last) {
cur_seq = scoutfs_trans_sample_seq(sb);

while (*iblock <= last) {
if (inode)
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
else
Expand All @@ -339,7 +352,7 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
ret = 0;

if (ret == 0)
ret = truncate_extents(sb, inode, ino, iblock, last,
ret = truncate_extents(sb, inode, ino, *iblock, last,
offline, lock);

if (inode)
Expand All @@ -351,8 +364,19 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
if (ret <= 0)
break;

iblock = ret;
*iblock = ret;
ret = 0;

/*
* We know there's more to do because truncate_extents()
* pauses every EXTENTS_PER_HOLD extents and it returned the
* next starting block. Our caller might also want us to pause,
* which we will do whenever we cross a transaction boundary.
*/
if (pause && (cur_seq != scoutfs_trans_sample_seq(sb))) {
ret = -EINPROGRESS;
break;
}
}

if (si)
Expand Down
4 changes: 2 additions & 2 deletions kmod/src/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ int scoutfs_get_block_write(struct inode *inode, sector_t iblock, struct buffer_
int create);

int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
u64 ino, u64 iblock, u64 last, bool offline,
struct scoutfs_lock *lock);
u64 ino, u64 *iblock, u64 last, bool offline,
struct scoutfs_lock *lock, bool pause);
int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len);
Expand Down
40 changes: 30 additions & 10 deletions kmod/src/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -468,8 +468,8 @@ int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock)
start = (i_size_read(inode) + SCOUTFS_BLOCK_SM_SIZE - 1) >>
SCOUTFS_BLOCK_SM_SHIFT;
ret = scoutfs_data_truncate_items(inode->i_sb, inode,
scoutfs_ino(inode), start, ~0ULL,
false, lock);
scoutfs_ino(inode), &start, ~0ULL,
false, lock, false);
err = clear_truncate_flag(inode, lock);

return ret ? ret : err;
Expand Down Expand Up @@ -1635,7 +1635,8 @@ int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_
* partial deletion until all deletion is complete and the orphan item
* is removed.
*/
static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_inode *sinode,
static int delete_inode_items(struct super_block *sb, u64 ino,
struct scoutfs_inode *sinode, u64 *start,
struct scoutfs_lock *lock, struct scoutfs_lock *orph_lock)
{
struct scoutfs_key key;
Expand All @@ -1654,8 +1655,8 @@ static int delete_inode_items(struct super_block *sb, u64 ino, struct scoutfs_in

/* remove data items in their own transactions */
if (S_ISREG(mode)) {
ret = scoutfs_data_truncate_items(sb, NULL, ino, 0, ~0ULL,
false, lock);
ret = scoutfs_data_truncate_items(sb, NULL, ino, start, ~0ULL,
false, lock, true);
if (ret)
goto out;
}
Expand Down Expand Up @@ -1803,16 +1804,23 @@ static int get_current_lock_data(struct super_block *sb, struct scoutfs_lock *lo
*/
static int try_delete_inode_items(struct super_block *sb, u64 ino)
{
struct inode_deletion_lock_data *ldata = NULL;
struct scoutfs_lock *orph_lock = NULL;
struct scoutfs_lock *lock = NULL;
struct inode_deletion_lock_data *ldata;
struct scoutfs_lock *orph_lock;
struct scoutfs_lock *lock;
struct scoutfs_inode sinode;
struct scoutfs_key key;
bool clear_trying = false;
bool more = false;
u64 group_nr;
u64 start = 0;
int bit_nr;
int ret;

again:
ldata = NULL;
orph_lock = NULL;
lock = NULL;

ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &lock);
if (ret < 0)
goto out;
Expand All @@ -1824,11 +1832,12 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
goto out;

/* only one local attempt per inode at a time */
if (test_and_set_bit(bit_nr, ldata->trying)) {
if (!more && test_and_set_bit(bit_nr, ldata->trying)) {
ret = 0;
goto out;
}
clear_trying = true;
more = false;

/* can't delete if it's cached in local or remote mounts */
if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) {
Expand All @@ -1853,14 +1862,25 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
if (ret < 0)
goto out;

ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
ret = delete_inode_items(sb, ino, &sinode, &start, lock, orph_lock);

if (ret == -EINPROGRESS) {
more = true;
clear_trying = false;
} else {
more = false;
}

out:
if (clear_trying)
clear_bit(bit_nr, ldata->trying);

scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
scoutfs_unlock(sb, orph_lock, SCOUTFS_LOCK_WRITE_ONLY);

if (more)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding this loop without clearing more at the start makes me nervous. It certainly looks like it'll loop forever if any of the errors before more is cleared are returned (perhaps from shutdown or forced unmount returning blanket errors from net/io paths). I'd rather not bother with this additional loop and would instead make sure to do enough work per iteration that we don't mind returning to the caller and letting them use their existing retry mechanisms.

goto again;

return ret;
}

Expand Down
8 changes: 4 additions & 4 deletions kmod/src/ioctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -372,9 +372,9 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
sblock = args.offset >> SCOUTFS_BLOCK_SM_SHIFT;
eblock = (args.offset + args.length - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
ret = scoutfs_data_truncate_items(sb, inode, scoutfs_ino(inode),
sblock,
&sblock,
eblock, true,
lock);
lock, false);
if (ret == 0) {
scoutfs_inode_get_onoff(inode, &online, &offline);
isize = i_size_read(inode);
Expand All @@ -383,8 +383,8 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg)
>> SCOUTFS_BLOCK_SM_SHIFT;
ret = scoutfs_data_truncate_items(sb, inode,
scoutfs_ino(inode),
sblock, U64_MAX,
false, lock);
&sblock, U64_MAX,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These changes make me nervous. This used to obviously always truncate (s,e) and (s, ~0). Now it probably truncates from (s, e) and (e, ~0), but we'd have to go diving into the implementation to make sure there isn't some accidental final increment before returning that means it actually truncates (e + 1, ~0), or whatever. Let's avoid this pattern as much as possible. (It's why truncate_extent_items() returns the block number instead of updating an arg. This lesson was learned over and over in the kernel with the *ppos arguments to write functions that sometimes updated write pointers, or not, and was a nightmare.)

false, lock, false);
}
}

Expand Down
11 changes: 7 additions & 4 deletions kmod/src/scoutfs_trace.h
Original file line number Diff line number Diff line change
Expand Up @@ -378,26 +378,29 @@ DEFINE_EVENT(scoutfs_data_file_extent_class, scoutfs_data_fiemap_extent,
);

TRACE_EVENT(scoutfs_data_truncate_items,
TP_PROTO(struct super_block *sb, __u64 iblock, __u64 last, int offline),
TP_PROTO(struct super_block *sb, __u64 iblock, __u64 last, int offline, bool pause),

TP_ARGS(sb, iblock, last, offline),
TP_ARGS(sb, iblock, last, offline, pause),

TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, iblock)
__field(__u64, last)
__field(int, offline)
__field(bool, pause)
),

TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->iblock = iblock;
__entry->last = last;
__entry->offline = offline;
__entry->pause = pause;
),

TP_printk(SCSBF" iblock %llu last %llu offline %u", SCSB_TRACE_ARGS,
__entry->iblock, __entry->last, __entry->offline)
TP_printk(SCSBF" iblock %llu last %llu offline %u pause %d",
SCSB_TRACE_ARGS, __entry->iblock, __entry->last,
__entry->offline, __entry->pause)
);

TRACE_EVENT(scoutfs_data_wait_check,
Expand Down
1 change: 0 additions & 1 deletion tests/golden/large-fragmented-free
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
== setting longer hung task timeout
== creating fragmented extents
== unlink file with moved extents to free extents per block
== cleanup
24 changes: 0 additions & 24 deletions tests/tests/large-fragmented-free.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,30 +10,6 @@ EXTENTS_PER_BTREE_BLOCK=600
EXTENTS_PER_LIST_BLOCK=8192
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))

#
# This test specifically creates a pathologically sparse file that will
# be as expensive as possible to free. This is usually fine on
# dedicated or reasonable hardware, but trying to run this in
# virtualized debug kernels can take a very long time. This test is
# about making sure that the server doesn't fail, not that the platform
# can handle the scale of work that our btree formats happen to require
# while execution is bogged down with use-after-free memory reference
# tracking. So we give the test a lot more breathing room before
# deciding that its hung.
#
echo "== setting longer hung task timeout"
if [ -w /proc/sys/kernel/hung_task_timeout_secs ]; then
secs=$(cat /proc/sys/kernel/hung_task_timeout_secs)
test "$secs" -gt 0 || \
t_fail "confusing value '$secs' from /proc/sys/kernel/hung_task_timeout_secs"
restore_hung_task_timeout()
{
echo "$secs" > /proc/sys/kernel/hung_task_timeout_secs
}
trap restore_hung_task_timeout EXIT
echo "$((secs * 5))" > /proc/sys/kernel/hung_task_timeout_secs
fi

echo "== creating fragmented extents"
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"

Expand Down