Skip to content

Conversation

@kkysen
Copy link
Collaborator

@kkysen kkysen commented May 20, 2025

Previously, two adjacent 8-bit stores were done. By doing them as `u16`s instead of `[u8]`s/`[u8; 2]`s, we can get a single 16-bit store to be emitted instead.

This gets a single `strh` emitted in place of each pair of `strb`s that was emitted previously.

Based on what @iximeow found in iximeow@e133a89 and #1401 (comment).

Before:

❯ cargo asm -p rav1d --lib --rust rav1d_create_lf_mask_intra --color | rg filter_level -C 10
                if start <= end && end <= len {
        cmp x13, x12
        b.hi .LBB440_79
                // /home/khyber/work/rav1d/src/lf_mask.rs : 458
                let lvl = &mut *level_cache.index_mut((idx + 0.., ..2));
        ldr x12, [x1, #8]
                // /home/khyber/.rustup/toolchains/nightly-2025-05-01-aarch64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/cmp.rs : 1849
                fn lt(&self, other: &Self) -> bool { *self <  *other }
        subs x7, x7, #1
                // /home/khyber/work/rav1d/src/lf_mask.rs : 459
                lvl[0] = filter_level[0][0][0];
        add x12, x12, x11
                // /home/khyber/.rustup/toolchains/nightly-2025-05-01-aarch64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/cmp.rs : 1849
                fn lt(&self, other: &Self) -> bool { *self <  *other }
        add x11, x11, #4
                // /home/khyber/work/rav1d/src/lf_mask.rs : 459
                lvl[0] = filter_level[0][0][0];
        strb w15, [x12]
                // /home/khyber/work/rav1d/src/lf_mask.rs : 460
                lvl[1] = filter_level[1][0][0];
        strb w16, [x12, #1]
                // /home/khyber/work/rav1d/src/lf_mask.rs : 455
                for x in 0..bw4 {
        b.ne .LBB440_4
                // /home/khyber/.rustup/toolchains/nightly-2025-05-01-aarch64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/cmp.rs : 1849
                fn lt(&self, other: &Self) -> bool { *self <  *other }
        cmp x6, x20
                // /home/khyber/work/rav1d/src/lf_mask.rs : 454
                for _y in 0..bh4 {
        add x17, x17, x18
--
                // /home/khyber/work/rav1d/src/disjoint_mut.rs : 729
                if start <= end && end <= len {
        add x20, x3, x19
        add x21, x20, #3
        cmp x21, x2
        b.hs .LBB440_80
                // /home/khyber/work/rav1d/src/lf_mask.rs : 496
                let lvl = &mut *level_cache.index_mut((idx + 2.., ..2));
        ldr x2, [x1, #8]
                // /home/khyber/work/rav1d/src/lf_mask.rs : 497
                lvl[0] = filter_level[2][0][0];
        add x2, x2, x3
        add x2, x2, x19
                // /home/khyber/.rustup/toolchains/nightly-2025-05-01-aarch64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/cmp.rs : 1849
                fn lt(&self, other: &Self) -> bool { *self <  *other }
        add x19, x19, #4
                // /home/khyber/work/rav1d/src/lf_mask.rs : 493
                for x in 0..cbw4 {
        cmp x18, x19
                // /home/khyber/work/rav1d/src/lf_mask.rs : 497
                lvl[0] = filter_level[2][0][0];
        strb w16, [x2, #2]
                // /home/khyber/work/rav1d/src/lf_mask.rs : 498
                lvl[1] = filter_level[3][0][0];
        strb w17, [x2, #3]
                // /home/khyber/work/rav1d/src/lf_mask.rs : 493
                for x in 0..cbw4 {
        b.ne .LBB440_72
                // /home/khyber/.rustup/toolchains/nightly-2025-05-01-aarch64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/cmp.rs : 1849
                fn lt(&self, other: &Self) -> bool { *self <  *other }
        cmp x6, x4
                // /home/khyber/work/rav1d/src/lf_mask.rs : 492
                for _y in 0..cbh4 {
        add x3, x3, x5

After:

❯ cargo asm -p rav1d --lib --rust rav1d_create_lf_mask_intra --color | rg filter_level -C 10
        ldr x24, [sp, #224]
        ldrb w10, [sp, #184]
                // /home/khyber/work/rav1d/src/lf_mask.rs : 447
                let b_dim = b_dim.map(|it| it as usize);
        ldr w8, [x9, x8, lsl #2]
                // /home/khyber/work/rav1d/src/lf_mask.rs : 448
                let bw4 = cmp::min(iw - bx, b_dim[0]);
        sub x9, x25, x28
        ldrb w6, [sp, #176]
                // /home/khyber/work/rav1d/src/lf_mask.rs : 453
                let filter_level_yuv = filter_level.0.map(|a| a[0][0]);
        ldrb w23, [x3, #32]
                // /home/khyber/.rustup/toolchains/nightly-2025-05-01-aarch64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/array/mod.rs : 160
                }
        ldrb w14, [x3, #48]
                // /home/khyber/work/rav1d/src/lf_mask.rs : 450
                let bx4 = bx & 31;
        and x30, x28, #0x1f
                // /home/khyber/work/rav1d/src/lf_mask.rs : 447
                let b_dim = b_dim.map(|it| it as usize);
        and x26, x8, #0xff
--
                let lvl = &mut *level_cache.index_mut((idx + 0.., ..2));
        ldr x12, [x1, #8]
        add w13, w11, w12
                // /home/khyber/.rustup/toolchains/nightly-2025-05-01-aarch64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/option.rs : 1001
                match self {
        tbnz w13, #0, .LBB440_21
                // /home/khyber/.rustup/toolchains/nightly-2025-05-01-aarch64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/cmp.rs : 1849
                fn lt(&self, other: &Self) -> bool { *self <  *other }
        subs x3, x3, #1
                // /home/khyber/work/rav1d/src/lf_mask.rs : 463
                *u16::mut_from(lvl).unwrap() = filter_level_y;
        strh w15, [x12, x11]
                // /home/khyber/.rustup/toolchains/nightly-2025-05-01-aarch64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/cmp.rs : 1849
                fn lt(&self, other: &Self) -> bool { *self <  *other }
        add x11, x11, #4
                // /home/khyber/work/rav1d/src/lf_mask.rs : 459
                for x in 0..bw4 {
        b.ne .LBB440_4
                // /home/khyber/.rustup/toolchains/nightly-2025-05-01-aarch64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/cmp.rs : 1849
                fn lt(&self, other: &Self) -> bool { *self <  *other }
        cmp x18, x20
--
        tbnz w19, #0, .LBB440_82
        add x2, x2, x17
        add x2, x2, x21
                // /home/khyber/.rustup/toolchains/nightly-2025-05-01-aarch64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/cmp.rs : 1849
                fn lt(&self, other: &Self) -> bool { *self <  *other }
        add x21, x21, #4
                // /home/khyber/work/rav1d/src/lf_mask.rs : 496
                for x in 0..cbw4 {
        cmp x16, x21
                // /home/khyber/work/rav1d/src/lf_mask.rs : 500
                *u16::mut_from(lvl).unwrap() = filter_level_uv;
        strh w15, [x2, #2]
                // /home/khyber/work/rav1d/src/lf_mask.rs : 496
                for x in 0..cbw4 {
        b.ne .LBB440_76
                // /home/khyber/.rustup/toolchains/nightly-2025-05-01-aarch64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/cmp.rs : 1849
                fn lt(&self, other: &Self) -> bool { *self <  *other }
        cmp x5, x4
                // /home/khyber/work/rav1d/src/lf_mask.rs : 495
                for _y in 0..cbh4 {
        add x17, x17, x18

Copy link
Collaborator Author

@kkysen kkysen left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@iximeow, @coderkalyan, if you want to review.

Copy link

@coderkalyan coderkalyan left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pending the open comments, LGTM.

@kkysen kkysen changed the title perf: fn rav1d_create_lf_mask_intra: store as u16s perf: fn rav1d_create_lf_mask_{intra,inter}: store as u16s May 27, 2025
@kkysen kkysen requested review from coderkalyan, djc and iximeow May 27, 2025 08:34
@coderkalyan
Copy link

Will try to take a look at this tomorrow!

kkysen added 2 commits June 13, 2025 03:12
Previously, two adjacent 8-bit stores were done.
By doing them as `u16`s instead of `[u8]`s/`[u8; 2]`s,
we can get a single 16-bit store to be emitted instead.
Same as for `fn rav1d_create_lf_mask_intra`.
@kkysen kkysen force-pushed the kkysen/fn-rav1d_create_lf_mask_intra-u16-stores branch from 10da56b to f7fd7df Compare June 13, 2025 07:13
Copy link

@coderkalyan coderkalyan left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops, forgot about this - but with the AlignedVec2, LGTM!

@kkysen kkysen merged commit 08ef052 into main Jun 16, 2025
28 checks passed
@kkysen kkysen deleted the kkysen/fn-rav1d_create_lf_mask_intra-u16-stores branch June 16, 2025 12:01
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants