6 changes: 4 additions & 2 deletions flashinfer/autotuner.py
@@ -717,8 +717,10 @@ def _get_cache_key(
         input_shapes: Tuple[torch.Size],
         tuning_config: TuningConfig,
     ) -> Tuple:
-        if hasattr(input_shapes, '__len__'):
-            shapes_tuple = tuple(tuple(s) if hasattr(s, '__iter__') else s for s in input_shapes)
+        if hasattr(input_shapes, "__len__"):
+            shapes_tuple = tuple(
+                tuple(s) if hasattr(s, "__iter__") else s for s in input_shapes
+            )
         else:
             shapes_tuple = input_shapes
         return (
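
The change above is a black-style reformat of the shape-normalization logic in `_get_cache_key`. A minimal standalone sketch of what that logic does (hypothetical helper name, not the library's code): shape inputs that may arrive as `torch.Size` objects, lists, or plain ints are flattened into nested tuples so the result is hashable and can index the tuning cache.

from typing import Any, Tuple

def normalize_shapes(input_shapes: Any) -> Tuple:
    # Anything sized (a sequence of shapes) becomes a tuple whose iterable
    # elements (e.g. torch.Size) are converted to plain tuples.
    if hasattr(input_shapes, "__len__"):
        return tuple(
            tuple(s) if hasattr(s, "__iter__") else s for s in input_shapes
        )
    return input_shapes

cache = {}
cache[normalize_shapes([(8, 4096), (4096, 4096)])] = "best tactic"  # fully hashable key
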
1 change: 1 addition & 0 deletions flashinfer/fused_moe/core.py
@@ -13,6 +13,7 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
+
 import paddle
 import functools
 from enum import IntEnum
8 changes: 3 additions & 5 deletions flashinfer/gemm.py
@@ -562,9 +562,7 @@ def forward(
         # tgv_gemm takes mat1 as weights and mat2 as input tensor
         # from [m,k]x[k,n]+[n,] to [n,k]x[k,m]+[n,]
         gemm_fn = module.tgv_gemm
-        c = torch.empty(
-            (a.shape[0], b.shape[1]), dtype=a.dtype, device=a.place
-        )
+        c = torch.empty((a.shape[0], b.shape[1]), dtype=a.dtype, device=a.place)
         gemm_fn(b.t(), a.t(), bias, tactic, c, pdl)
         return c

@@ -2078,12 +2076,12 @@ def bmm_fp8(
     if out is None:
         out = torch.empty(
             (A.shape[0], A.shape[1], B.shape[2]),
-            device=a.place,
+            device=A.place,
             dtype=dtype,
         )

     workspace_buffer = _get_cache_buf(
-        "bmm_fp8_workspace", DEFAULT_WORKSPACE_SIZE, a.place
+        "bmm_fp8_workspace", DEFAULT_WORKSPACE_SIZE, A.place
     )

     if backend == "cudnn":
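
The two one-character fixes above matter because `bmm_fp8` takes its tensors as `A` and `B`; the lowercase `a` does not name those arguments, so the output and workspace placement were derived from the wrong variable (and would fail outright if no `a` is in scope). A minimal sketch of the intended output allocation, written against the plain PyTorch API for clarity (`.device` plays the role that `.place` plays for Paddle tensors in the diff; the helper name is hypothetical):

import torch

def alloc_bmm_fp8_out(A: torch.Tensor, B: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    # Batched matmul shape rule: [batch, m, k] x [batch, k, n] -> [batch, m, n],
    # allocated on the same device as the operand A.
    assert A.shape[0] == B.shape[0] and A.shape[2] == B.shape[1]
    return torch.empty((A.shape[0], A.shape[1], B.shape[2]), dtype=dtype, device=A.device)

A = torch.randn(4, 128, 64)
B = torch.randn(4, 64, 256)
out = alloc_bmm_fp8_out(A, B, torch.bfloat16)  # shape [4, 128, 256]
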
4 changes: 3 additions & 1 deletion tests/attention/test_attention_sink_blackwell.py
@@ -13,7 +13,9 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
+
 import paddle
+
 paddle.compat.enable_torch_proxy()
 import einops
 import pytest
@@ -22,7 +24,7 @@
 from tests.test_helpers.sink_attention_reference import sink_attention_unified

 import flashinfer
-from flashinfer.utils import get_compute_capability
+# from flashinfer.utils import get_compute_capability


 # @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
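
The test files in this PR share the same preamble: `paddle` is imported and `paddle.compat.enable_torch_proxy()` is called before the remaining imports. A sketch of the pattern, with the ordering rationale stated as an assumption (the diff itself does not explain it):

import paddle

paddle.compat.enable_torch_proxy()  # assumed: must run before any `import torch` below

import torch       # presumably served by Paddle's torch-compatibility layer
import flashinfer  # its internal torch usage goes through the same proxy
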
11 changes: 6 additions & 5 deletions tests/conftest.py
@@ -5,14 +5,15 @@
 from typing import Any, Dict, Set

 import paddle
+
 paddle.compat.enable_torch_proxy()
 import pytest
 import torch
 # from torch.torch_version import TorchVersion
 # from torch.torch_version import __version__ as torch_version

 import flashinfer
-from flashinfer.jit import MissingJITCacheError
+# from flashinfer.jit import MissingJITCacheError

 # Global tracking for JIT cache coverage
 # Store tuples of (test_name, module_name, spec_info)
@@ -128,8 +129,8 @@ def wrapper(*args, **kwargs):

 def pytest_configure(config):
     if os.environ.get("FLASHINFER_TEST_TORCH_COMPILE", "0") == "1":
-        if torch_version < TorchVersion("2.4"):
-            pytest.skip("torch.compile requires torch >= 2.4")
+        # if torch_version < TorchVersion("2.4"):
+        #     pytest.skip("torch.compile requires torch >= 2.4")
         _set_torch_compile_options()
         for fn in TORCH_COMPILE_FNS:
             _monkeypatch_add_torch_compile(fn)
@@ -144,8 +145,8 @@ def pytest_runtest_call(item):
     # skip OOM error and missing JIT cache errors
     try:
         item.runtest()
-    except:
-        assert(False)
+    except Exception:
+        raise
     # try:
     #     item.runtest()
     # except (torch.cuda.OutOfMemoryError, RuntimeError) as e:
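
A side-by-side sketch of why the hook change is an improvement (illustrative only, not the project's code): the old bare `except` converted every failure, including `KeyboardInterrupt`, into an anonymous `AssertionError` and discarded the original message, while `except Exception: raise` re-raises the original exception unchanged. That is equivalent to calling `item.runtest()` directly, but keeps the try/except scaffold that the commented-out OOM / missing-JIT-cache handling could later reuse.

def run_old(item):
    try:
        item.runtest()
    except:            # catches everything, even KeyboardInterrupt
        assert False   # real traceback and message are lost

def run_new(item):
    try:
        item.runtest()
    except Exception:  # re-raising the same exception ...
        raise          # ... lets the real failure propagate intact
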
11 changes: 6 additions & 5 deletions tests/moe/test_trtllm_gen_fused_moe.py
@@ -13,7 +13,9 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
+
 import paddle
+
 paddle.compat.enable_torch_proxy()
 import functools
 from typing import Tuple
@@ -48,7 +50,9 @@
     get_w2_permute_indices_with_cache,
     _maybe_get_cached_w3_w1_permute_indices,
 )
-from flashinfer.utils import calculate_tile_tokens_dim, get_compute_capability
+
+# from flashinfer.utils import calculate_tile_tokens_dim, get_compute_capability
+from flashinfer.utils import calculate_tile_tokens_dim


 @functools.cache
@@ -2504,10 +2508,7 @@ def test_moe_quantization_classes(
     else:
         # Other routing methods (Renormalize, RenormalizeNaive, Llama4) use bfloat16
         expert_logits = torch.randn((num_tokens, num_experts), device="cuda")
-        print("oringingin expert_logits:", expert_logits)
-        expert_logits = expert_logits.to(
-            torch.bfloat16
-        )
+        expert_logits = expert_logits.to(torch.bfloat16)
         # torch.set_printoptions(edgeitems=1000)  # show more edge items
         # torch.set_printoptions(linewidth=1000)  # widen each printed line
         print("expert_logits:", expert_logits)
5 changes: 3 additions & 2 deletions tests/test_helpers/sink_attention_reference.py
@@ -311,8 +311,9 @@ def sink_attention_unified(
         # mask = torch.arange(kv_len - qo_len, kv_len, device=q.device).unsqueeze(
         #     1
         # ) >= torch.arange(0, kv_len, device=q.device).unsqueeze(0)
-        mask = torch.arange(kv_len - qo_len, kv_len).unsqueeze(
-            1) >= torch.arange(0, kv_len).unsqueeze(0)
+        mask = torch.arange(kv_len - qo_len, kv_len).unsqueeze(1) >= torch.arange(
+            0, kv_len
+        ).unsqueeze(0)
         if window_left >= 0:
             row_idx = torch.arange(qo_len, dtype=torch.int32, device=q.device)[
                 :, None
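
The reformatted expression above builds the standard causal mask for qo_len queries appended to a kv_len-long sequence: query row i sits at absolute position kv_len - qo_len + i and may attend to key column j exactly when that position is >= j. A small worked example (illustrative only):

import torch

qo_len, kv_len = 3, 5
q_pos = torch.arange(kv_len - qo_len, kv_len).unsqueeze(1)  # [[2], [3], [4]]   shape [qo_len, 1]
k_pos = torch.arange(0, kv_len).unsqueeze(0)                # [[0, 1, 2, 3, 4]] shape [1, kv_len]
mask = q_pos >= k_pos                                       # broadcasts to [qo_len, kv_len]
print(mask.long())
# tensor([[1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1]])

Note that the rewritten line, unlike the commented-out version directly above it, omits device=q.device, so the mask is created on the default (CPU) device; the diff does not say whether that is intentional.
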