diff --git a/.gitignore b/.gitignore index 7ebbf0d..264b65e 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,8 @@ dist/ target/ .aider* .gradle +.venv +bench*.py # Go WASM - use compressed version only go/wasm/evmole.wasm diff --git a/Cargo.toml b/Cargo.toml index ef1b7ab..ea01d69 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,11 +33,13 @@ wasm = ["serde", "dep:serde_json"] trace_selectors = [] trace_arguments = [] trace_mutability = [] +trace_events = [] trace_storage = [] trace = [ "trace_selectors", "trace_arguments", "trace_mutability", + "trace_events", "trace_storage", ] diff --git a/benchmark/Makefile b/benchmark/Makefile index 7d81060..fd15aa8 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -1,5 +1,6 @@ PROVIDER_BASE = etherscan PROVIDERS_SELECTORS ?= whatsabi evm-hound-rs sevm evmole-rs evmole-js evmole-py evmole-go +PROVIDERS_EVENTS ?= evmole-rs PROVIDERS_ARGUMENTS ?= evmole-rs evmole-js evmole-py evmole-go PROVIDERS_MUTABILITY ?= whatsabi sevm evmole-rs evmole-js evmole-py evmole-go PROVIDERS_STORAGE ?= evmole-rs evmole-js evmole-py evmole-go smlxl @@ -18,9 +19,11 @@ RSYNC_EXCLUDES := --exclude benchmark --exclude target --exclude dist \ PROVIDERS_SELECTORS := $(PROVIDER_BASE) $(PROVIDERS_SELECTORS) PROVIDERS_ARGUMENTS := $(PROVIDER_BASE) $(PROVIDERS_ARGUMENTS) +PROVIDERS_EVENTS := $(PROVIDER_BASE) $(PROVIDERS_EVENTS) PROVIDERS_MUTABILITY := $(PROVIDER_BASE) $(PROVIDERS_MUTABILITY) PROVIDERS_STORAGE := $(PROVIDER_BASE) $(PROVIDERS_STORAGE) -PROVIDERS_UNIQ := $(sort $(PROVIDERS_SELECTORS) $(PROVIDERS_ARGUMENTS) $(PROVIDERS_MUTABILITY) $(PROVIDERS_STORAGE) $(PROVIDERS_BLOCKS) $(PROVIDERS_FLOW)) +PROVIDERS_EVENTS := $(sort $(PROVIDER_BASE) $(PROVIDERS_EVENTS)) +PROVIDERS_UNIQ := $(sort $(PROVIDERS_SELECTORS) $(PROVIDERS_EVENTS) $(PROVIDERS_ARGUMENTS) $(PROVIDERS_MUTABILITY) $(PROVIDERS_STORAGE) $(PROVIDERS_BLOCKS) $(PROVIDERS_FLOW)) DATASET := $(shell pwd)/datasets RES := $(shell pwd)/results @@ -28,15 +31,17 @@ RES := $(shell pwd)/results 
BUILD_TARGETS := $(addsuffix .build, $(PROVIDERS_UNIQ)) UNPACK_TARGETS := $(foreach d,$(DATASETS) $(DATASETS_STORAGE),$(addprefix datasets/, $(d))) RUN_SELECTORS_TARGETS := $(foreach p,$(PROVIDERS_SELECTORS),$(addprefix $(p).selectors/, $(DATASETS))) +RUN_EVENTS_TARGETS := $(foreach p,$(PROVIDERS_EVENTS),$(addprefix $(p).events/, $(DATASETS))) RUN_ARGUMENTS_TARGETS := $(foreach p,$(PROVIDERS_ARGUMENTS),$(addprefix $(p).arguments/, $(DATASETS))) RUN_MUTABILITY_TARGETS := $(foreach p,$(PROVIDERS_MUTABILITY),$(addprefix $(p).mutability/, $(DATASETS))) RUN_STORAGE_TARGETS := $(foreach p,$(PROVIDERS_STORAGE),$(addprefix $(p).storage/, $(DATASETS_STORAGE))) RUN_BLOCKS_TARGETS := $(foreach p,$(PROVIDERS_BLOCKS),$(addprefix $(p).blocks/, $(DATASETS_FLOW))) RUN_FLOW_TARGETS := $(foreach p,$(PROVIDERS_FLOW),$(addprefix $(p).flow/, $(DATASETS_FLOW))) -RUN_TARGETS := $(RUN_SELECTORS_TARGETS) $(RUN_ARGUMENTS_TARGETS) $(RUN_MUTABILITY_TARGETS) $(RUN_STORAGE_TARGETS) $(RUN_BLOCKS_TARGETS) $(RUN_FLOW_TARGETS) +RUN_TARGETS := $(RUN_SELECTORS_TARGETS) $(RUN_EVENTS_TARGETS) $(RUN_ARGUMENTS_TARGETS) $(RUN_MUTABILITY_TARGETS) $(RUN_STORAGE_TARGETS) $(RUN_BLOCKS_TARGETS) $(RUN_FLOW_TARGETS) benchmark-selectors: $(addsuffix .build, $(PROVIDERS_SELECTORS)) run-selectors +benchmark-events: $(addsuffix .build, $(PROVIDERS_EVENTS)) run-events benchmark-arguments: $(addsuffix .build, $(PROVIDERS_ARGUMENTS)) run-arguments benchmark-mutability: $(addsuffix .build, $(PROVIDERS_MUTABILITY)) run-mutability benchmark-storage: $(addsuffix .build, $(PROVIDERS_STORAGE)) run-storage @@ -44,6 +49,7 @@ benchmark-flow: $(addsuffix .build, $(PROVIDERS_FLOW)) run-blocks run-flow build: $(BUILD_TARGETS) run-selectors: $(RUN_SELECTORS_TARGETS) +run-events: $(RUN_EVENTS_TARGETS) run-arguments: $(RUN_ARGUMENTS_TARGETS) run-mutability: $(RUN_MUTABILITY_TARGETS) run-storage: $(RUN_STORAGE_TARGETS) @@ -74,13 +80,13 @@ $(UNPACK_TARGETS): .SECONDEXPANSION: $(RUN_TARGETS): datasets/$$(notdir $$@) $(info [*] Running 
$@...) - $(DOCKER) run --init --network=none --cpus=$(DOCKER_CPUS) --rm \ + $(DOCKER) run --init --network=none --cpus=$(DOCKER_CPUS) --rm \ -v $(DATASET)/$(notdir $@):/dataset \ -v $(RES):/mnt \ $(DOCKER_PREFIX)-$(basename $(subst /,,$(dir $@))) \ $(subst .,,$(suffix $(subst /,,$(dir $@)))) \ /dataset \ /mnt/$(subst /,_,$@).json \ - /mnt/$(PROVIDER_BASE).selectors_$(notdir $@).json + /mnt/$(PROVIDER_BASE).$(subst .,,$(suffix $(subst /,,$(dir $@))))_$(notdir $@).json -.PHONY: benchmark-selectors benchmark-arguments benchmark-mutability build run-selectors run-arguments run-mutability $(BUILD_TARGETS) $(RUN_TARGETS) +.PHONY: benchmark-selectors benchmark-events benchmark-arguments benchmark-mutability build run-selectors run-events run-arguments run-mutability $(BUILD_TARGETS) $(RUN_TARGETS) diff --git a/benchmark/README.md b/benchmark/README.md index 8c6e9c4..f79e133 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -62,6 +62,9 @@ python3 compare.py --mode=arguments --normalize-args fixed-size-array tuples str # Output markdown tables python3 compare.py --mode=selectors --markdown + +# Events mode with GT denoise (runtime evidence check) +python3 compare.py --mode=events ``` ## Control Flow Graph Analysis diff --git a/benchmark/compare.py b/benchmark/compare.py index 4e8d976..96766bd 100644 --- a/benchmark/compare.py +++ b/benchmark/compare.py @@ -5,6 +5,7 @@ import re import subprocess from collections import defaultdict +from typing import Optional def get_mode_defaults() -> dict: @@ -36,6 +37,10 @@ def get_mode_defaults() -> dict: 'datasets': makefile_vars.get('DATASETS', []), 'providers': makefile_vars.get('PROVIDERS_SELECTORS', []), }, + 'events': { + 'datasets': makefile_vars.get('DATASETS', []), + 'providers': makefile_vars.get('PROVIDERS_EVENTS', []), + }, 'arguments': { 'datasets': makefile_vars.get('DATASETS', []), 'providers': makefile_vars.get('PROVIDERS_ARGUMENTS', []), @@ -75,10 +80,80 @@ def load_data(btype: str, dname: str, providers: 
list[str], results_dir: str) -> times.append({'total': total_time, 'p50': ptimes[50], 'p99': ptimes[99]}) return data, times -def process_selectors(dname: str, providers: list[str], results_dir: str): - pdata, ptimes = load_data('selectors', dname, providers, results_dir) +def normalize_hex_token(s: str) -> str: + s = s.strip().lower() + if s.startswith('0x'): + s = s[2:] + return s + + +def load_runtime_code_hex(dataset_file: pathlib.Path) -> str: + try: + with dataset_file.open('r') as fh: + data = json.load(fh) + except (OSError, json.JSONDecodeError): + return '' + code = data.get('runtimeBytecode') or data.get('code') + if not isinstance(code, str): + return '' + return normalize_hex_token(code) + + +def has_event_hash_evidence(code_hex: str, h32: str) -> bool: + return h32 in code_hex + + +def build_events_uncertain_hashes( + dname: str, + dataset_dir: pathlib.Path, + ground_truth_provider: dict, +) -> tuple[dict[str, set[str]], dict]: + code_cache = {} + uncertain_by_file = {} + uncertain_contracts = 0 + uncertain_signatures = 0 + + for fname, (_, ground_truth) in ground_truth_provider.items(): + dataset_file = dataset_dir / dname / fname + if fname not in code_cache: + code_cache[fname] = load_runtime_code_hex(dataset_file) + code_hex = code_cache[fname] + uncertain = set() + for h in ground_truth: + h_norm = normalize_hex_token(h) + if len(h_norm) != 64 or not has_event_hash_evidence(code_hex, h_norm): + uncertain.add(h) + uncertain_by_file[fname] = uncertain + uncertain_signatures += len(uncertain) + if len(uncertain) > 0: + uncertain_contracts += 1 + + meta = { + 'mode': 'substring', + 'uncertain_contracts': uncertain_contracts, + 'uncertain_signatures': uncertain_signatures, + } + return uncertain_by_file, meta + + +def process_selectors( + dname: str, + providers: list[str], + results_dir: str, + btype: str = 'selectors', + datasets_dir: pathlib.Path = pathlib.Path(__file__).parent / 'datasets', +): + pdata, ptimes = load_data(btype, dname, providers, 
results_dir) results = [] ground_truth_provider = pdata[0] + events_denoise_meta = None + uncertain_by_file = {} + if btype == 'events': + uncertain_by_file, events_denoise_meta = build_events_uncertain_hashes( + dname, + pathlib.Path(datasets_dir), + ground_truth_provider, + ) for fname, (_, ground_truth) in ground_truth_provider.items(): ground_truth_set = set(ground_truth) provider_comparisons = [] @@ -87,7 +162,12 @@ def process_selectors(dname: str, providers: list[str], results_dir: str): provider_set = set(provider_data[fname][1]) false_positives = list(provider_set - ground_truth_set) false_negatives = list(ground_truth_set - provider_set) - provider_comparisons.append([false_positives, false_negatives]) + if btype == 'events': + uncertain_set = uncertain_by_file.get(fname, set()) + false_negatives_denoised = list((ground_truth_set - uncertain_set) - provider_set) + provider_comparisons.append([false_positives, false_negatives, false_negatives_denoised]) + else: + provider_comparisons.append([false_positives, false_negatives]) results.append({ 'addr': fname[2:-5], # '0xFF.json' => 'FF' @@ -95,7 +175,10 @@ def process_selectors(dname: str, providers: list[str], results_dir: str): 'data': provider_comparisons, }) - return { 'dataset': dname, 'results': results, 'timings': ptimes[1:] } + ret = {'dataset': dname, 'results': results, 'timings': ptimes[1:]} + if events_denoise_meta is not None: + ret['events_denoise'] = events_denoise_meta + return ret def format_time_val(val_us: int) -> str: @@ -124,22 +207,38 @@ def markdown_selectors(providers: list[str], all_results: list): print(f' {name}') print(' ') for dataset_idx, dataset_result in enumerate(all_results): + has_denoised_fn = 'events_denoise' in dataset_result dataset_name = dataset_result['dataset'] cnt_contracts = len(dataset_result['results']) cnt_funcs = sum(len(x['ground_truth']) for x in dataset_result['results']) + rowspan = 7 if has_denoised_fn else 5 print(' ') - print(f' {dataset_name}
{cnt_contracts}
addresses

{cnt_funcs}
functions
') + if has_denoised_fn: + denoise = dataset_result['events_denoise'] + print( + f' {dataset_name}
{cnt_contracts}
addresses

{cnt_funcs}
signatures

uncertain={denoise["uncertain_signatures"]}
' + ) + else: + print(f' {dataset_name}
{cnt_contracts}
addresses

{cnt_funcs}
functions
') print(' FP addrs') for idx in range(0, len(providers) - 1): # skip ground_truth provider fp_contracts = sum(len(x['data'][idx][0]) > 0 for x in dataset_result['results']) print(f' {fp_contracts}') print(' ') print(' ') - print(' FN addrs') + fn_addr_label = 'FN(raw)' if has_denoised_fn else 'FN' + print(f' {fn_addr_label} addrs') for idx in range(0, len(providers) - 1): # skip ground_truth provider fn_contracts = sum(len(x['data'][idx][1]) > 0 for x in dataset_result['results']) print(f' {fn_contracts}') print(' ') + if has_denoised_fn: + print(' ') + print(' FN(denoised) addrs') + for idx in range(0, len(providers) - 1): # skip ground_truth provider + fn_contracts = sum(len(x['data'][idx][2]) > 0 for x in dataset_result['results']) + print(f' {fn_contracts}') + print(' ') print(' ') print(' FP funcs') for idx in range(0, len(providers) - 1): # skip ground_truth provider @@ -147,11 +246,19 @@ def markdown_selectors(providers: list[str], all_results: list): print(f' {fp_signatures}') print(' ') print(' ') - print(' FN funcs') + fn_funcs_label = 'FN(raw)' if has_denoised_fn else 'FN' + print(f' {fn_funcs_label} funcs') for idx in range(0, len(providers) - 1): # skip ground_truth provider fn_signatures = sum(len(x['data'][idx][1]) for x in dataset_result['results']) print(f' {fn_signatures}') print(' ') + if has_denoised_fn: + print(' ') + print(' FN(denoised) funcs') + for idx in range(0, len(providers) - 1): # skip ground_truth provider + fn_signatures = sum(len(x['data'][idx][2]) for x in dataset_result['results']) + print(f' {fn_signatures}') + print(' ') print(' ') print(' Time') for idx in range(0, len(providers) - 1): # skip ground_truth provider @@ -161,7 +268,11 @@ def markdown_selectors(providers: list[str], all_results: list): print(f' ') print('') -def markdown_arguments_or_mutability(providers: list[str], all_results: list, second_results: list|None): +def markdown_arguments_or_mutability( + providers: list[str], + all_results: list, + second_results: 
Optional[list], +): print('') print(' ') print(' ') @@ -197,8 +308,16 @@ def markdown_arguments_or_mutability(providers: list[str], all_results: list, se def show_selectors(providers: list[str], all_results: list, show_errors: bool): for dataset_result in all_results: + has_denoised_fn = 'events_denoise' in dataset_result cnt_contracts = len(dataset_result['results']) cnt_funcs = sum(len(x['ground_truth']) for x in dataset_result['results']) + if has_denoised_fn: + denoise = dataset_result['events_denoise'] + print( + f'dataset {dataset_result["dataset"]}: events denoise={denoise["mode"]}, ' + f'uncertain_signatures={denoise["uncertain_signatures"]}, ' + f'uncertain_contracts={denoise["uncertain_contracts"]}' + ) for provider_idx, name in enumerate(providers[1:]): fp_signatures = sum(len(x['data'][provider_idx][0]) for x in dataset_result['results']) fn_signatures = sum(len(x['data'][provider_idx][1]) for x in dataset_result['results']) @@ -207,7 +326,16 @@ def show_selectors(providers: list[str], all_results: list, show_errors: bool): print(f'dataset {dataset_result["dataset"]} ({cnt_contracts} contracts, {cnt_funcs} signatures), {name}:') print(f' time: {format_time(dataset_result["timings"][provider_idx])}') print(f' False Positive: {fp_signatures} signatures, {fp_contracts} contracts') - print(f' False Negative: {fn_signatures} signatures, {fn_contracts} contracts') + if has_denoised_fn: + print(f' False Negative(raw): {fn_signatures} signatures, {fn_contracts} contracts') + else: + print(f' False Negative: {fn_signatures} signatures, {fn_contracts} contracts') + if has_denoised_fn: + fn_signatures_denoised = sum(len(x['data'][provider_idx][2]) for x in dataset_result['results']) + fn_contracts_denoised = sum(len(x['data'][provider_idx][2]) > 0 for x in dataset_result['results']) + print( + f' False Negative(denoised): {fn_signatures_denoised} signatures, {fn_contracts_denoised} contracts' + ) if show_errors is not True: continue print(' errors:') @@ -215,14 
+343,17 @@ def show_selectors(providers: list[str], all_results: list, show_errors: bool): want = sorted(x['ground_truth']) fp = sorted(x['data'][provider_idx][0]) fn = sorted(x['data'][provider_idx][1]) + fn_denoised = sorted(x['data'][provider_idx][2]) if has_denoised_fn else [] if len(fp) > 0 or len(fn) > 0: print(' ', x['addr']) print(f' want: {want}') print(f' FP : {fp}') - print(f' FN : {fn}') + print(f' FN(raw) : {fn}') + if has_denoised_fn: + print(f' FN(denoised) : {fn_denoised}') print('') -def normalize_args(args: str, rules: set[str]|None) -> str: +def normalize_args(args: str, rules: Optional[set[str]]) -> str: if rules is None: return args @@ -500,7 +631,8 @@ def show_arguments_or_mutability(providers: list[str], all_results: list, show_e if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--results-dir', type=str, default=pathlib.Path(__file__).parent / 'results', help='results directory') - parser.add_argument('--mode', choices=['selectors', 'arguments', 'mutability', 'storage', 'flow'], default='selectors', help='mode') + parser.add_argument('--datasets-dir', type=pathlib.Path, default=pathlib.Path(__file__).parent / 'datasets', help='datasets directory (for event denoise)') + parser.add_argument('--mode', choices=['selectors', 'events', 'arguments', 'mutability', 'storage', 'flow'], default='selectors', help='mode') parser.add_argument('--providers', nargs='+', default=None) parser.add_argument('--datasets', nargs='+', default=None) parser.add_argument('--markdown', nargs='?', default=False, const=True, help='show markdown output') @@ -527,6 +659,22 @@ def show_arguments_or_mutability(providers: list[str], all_results: list, show_e else: show_selectors(cfg.providers, results, cfg.show_errors) + if cfg.mode == 'events': + results = [ + process_selectors( + d, + cfg.providers, + cfg.results_dir, + 'events', + cfg.datasets_dir, + ) for d in cfg.datasets + ] + + if cfg.markdown: + markdown_selectors(cfg.providers, 
results) + else: + show_selectors(cfg.providers, results, cfg.show_errors) + if cfg.mode == 'arguments': results = [process_arguments(d, cfg.providers, cfg.results_dir, cfg.normalize_args) for d in cfg.datasets] if cfg.markdown: diff --git a/benchmark/providers/etherscan/main.py b/benchmark/providers/etherscan/main.py index 1a1c130..1e11402 100644 --- a/benchmark/providers/etherscan/main.py +++ b/benchmark/providers/etherscan/main.py @@ -10,6 +10,10 @@ def sign(inp: bytes) -> str: return keccak.new(digest_bits=256, data=inp).digest()[:4].hex() + +def sign32(inp: bytes) -> str: + return keccak.new(digest_bits=256, data=inp).digest().hex() + def join_inputs(inputs) -> str: if len(inputs) == 0: return '' @@ -119,6 +123,17 @@ def process_storage(sl): return ret def process(data, mode): + if mode == 'events': + ret = {} + for x in data['abi']: + if x['type'] != 'event': + continue + args = join_inputs(x['inputs']) + n = f'{x["name"]}({args})' + ret[sign32(n.encode('ascii'))] = True + + return list(ret.keys()) + if mode == 'storage': return process_storage(data['storageLayout']) ret = {} diff --git a/benchmark/providers/evmole-rs/src/main.rs b/benchmark/providers/evmole-rs/src/main.rs index d96dc28..6904d8c 100644 --- a/benchmark/providers/evmole-rs/src/main.rs +++ b/benchmark/providers/evmole-rs/src/main.rs @@ -20,6 +20,7 @@ enum Mode { Selectors, Arguments, Mutability, + Events, Storage, Blocks, Flow, @@ -113,6 +114,21 @@ fn main() -> Result<(), Box> { ), ); } + Mode::Events => { + let (info, dur) = timeit(evmole::ContractInfoArgs::new(&code).with_events()); + ret_selectors.insert( + fname, + ( + dur, + info + .events + .unwrap_or_default() + .into_iter() + .map(|e| hex::encode(e)) + .collect(), + ), + ); + } Mode::Arguments => { let fsel = if !only_selector.is_empty() { &only_selector @@ -248,7 +264,7 @@ fn main() -> Result<(), Box> { let file = fs::File::create(cfg.output_file)?; let mut bw = BufWriter::new(file); - if cfg.mode == Mode::Selectors { + if cfg.mode == 
Mode::Selectors || cfg.mode == Mode::Events { let _ = serde_json::to_writer(&mut bw, &ret_selectors); } else if cfg.mode == Mode::Blocks || cfg.mode == Mode::Flow { let _ = serde_json::to_writer(&mut bw, &ret_flow); diff --git a/evmole.pyi b/evmole.pyi index 7f755a7..8e5643d 100644 --- a/evmole.pyi +++ b/evmole.pyi @@ -103,6 +103,8 @@ class Contract: Attributes: functions (Optional[List[Function]]): List of detected contract functions. None if no functions were extracted + events (Optional[List[str]]): List of event selectors found in the contract bytecode as hex strings. + None if events were not extracted storage (Optional[List[StorageRecord]]): List of contract storage records. None if storage layout was not extracted disassembled (Optional[List[Tuple[int, str]]]): List of bytecode instructions, where each element is [offset, instruction]. @@ -114,6 +116,7 @@ class Contract: """ functions: Optional[List[Function]] + events: Optional[List[str]] storage: Optional[List[StorageRecord]] disassembled: Optional[List[Tuple[int, str]]] basic_blocks: Optional[List[Tuple[int, int]]] @@ -125,6 +128,7 @@ def contract_info( selectors: bool = False, arguments: bool = False, state_mutability: bool = False, + events: bool = False, storage: bool = False, disassemble: bool = False, basic_blocks: bool = False, @@ -140,6 +144,8 @@ def contract_info( arguments (bool, optional): When True, extracts function arguments. Defaults to False. state_mutability (bool, optional): When True, extracts function state mutability. Defaults to False. + events (bool, optional): When True, extracts event selectors found in the contract bytecode. + Defaults to False. storage (bool, optional): When True, extracts the contract's storage layout. Defaults to False. disassemble (bool, optional): When True, includes disassembled bytecode. @@ -154,3 +160,4 @@ def contract_info( weren't requested to be extracted will be None. """ ... 
+ diff --git a/src/bin/cfg_reach_debug.rs b/src/bin/cfg_reach_debug.rs new file mode 100644 index 0000000..6d59fcd --- /dev/null +++ b/src/bin/cfg_reach_debug.rs @@ -0,0 +1,215 @@ +use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; +use std::{env, fs}; + +use evmole::control_flow_graph::{Block, BlockType}; +use evmole::{ContractInfoArgs, contract_info}; + +fn block_for_pc(blocks: &BTreeMap, pc: usize) -> Option { + let (start, block) = blocks.range(..=pc).next_back()?; + if pc <= block.end { Some(*start) } else { None } +} + +fn btype_name(b: &BlockType) -> &'static str { + match b { + BlockType::Terminate { .. } => "Terminate", + BlockType::Jump { .. } => "Jump", + BlockType::Jumpi { .. } => "Jumpi", + BlockType::DynamicJump { .. } => "DynamicJump", + BlockType::DynamicJumpi { .. } => "DynamicJumpi", + } +} + +fn main() { + let mut code_file: Option = None; + let mut targets: Vec = Vec::new(); + let mut args = env::args().skip(1); + while let Some(arg) = args.next() { + match arg.as_str() { + "--code-file" => code_file = args.next(), + "--target-pc" => { + if let Some(v) = args.next() { + let h = v.trim_start_matches("0x"); + targets.push(usize::from_str_radix(h, 16).unwrap()); + } + } + _ => {} + } + } + + let path = code_file.expect("--code-file required"); + let text = fs::read_to_string(path).expect("read code file"); + let hex = text.trim().trim_start_matches("0x"); + let code = alloy_primitives::hex::decode(hex).expect("decode hex"); + + let info = contract_info( + ContractInfoArgs::new(&code) + .with_selectors() + .with_control_flow_graph(), + ); + let functions = info.functions.unwrap_or_default(); + let cfg = info.control_flow_graph.expect("cfg"); + + println!("functions={}", functions.len()); + println!("blocks={}", cfg.blocks.len()); + + let mut succ: HashMap> = HashMap::new(); + let mut pred: HashMap> = HashMap::new(); + for (start, block) in &cfg.blocks { + let mut add = |to: usize| { + if cfg.blocks.contains_key(&to) { + 
succ.entry(*start).or_default().push(to); + pred.entry(to).or_default().push(*start); + } + }; + match &block.btype { + BlockType::Terminate { .. } => {} + BlockType::Jump { to } => add(*to), + BlockType::Jumpi { true_to, false_to } => { + add(*true_to); + add(*false_to); + } + BlockType::DynamicJump { to } => { + for dj in to { + if let Some(dst) = dj.to { + add(dst) + } + } + } + BlockType::DynamicJumpi { true_to, false_to } => { + add(*false_to); + for dj in true_to { + if let Some(dst) = dj.to { + add(dst) + } + } + } + } + } + + let context_blocks: Vec<(String, usize, usize)> = functions + .iter() + .filter_map(|f| { + block_for_pc(&cfg.blocks, f.bytecode_offset).map(|b| { + ( + alloy_primitives::hex::encode(f.selector), + f.bytecode_offset, + b, + ) + }) + }) + .collect(); + + println!("context_blocks={}", context_blocks.len()); + + for tpc in targets { + let tblock = block_for_pc(&cfg.blocks, tpc); + println!("\nTARGET pc=0x{tpc:x} block={:?}", tblock); + let Some(tb) = tblock else { + continue; + }; + + // reachable contexts + let mut reached_by: Vec<(String, usize, usize)> = Vec::new(); + for (sel, off, cblock) in &context_blocks { + let mut q = VecDeque::new(); + let mut vis = HashSet::new(); + q.push_back(*cblock); + vis.insert(*cblock); + let mut ok = false; + while let Some(n) = q.pop_front() { + if n == tb { + ok = true; + break; + } + if let Some(nexts) = succ.get(&n) { + for &nx in nexts { + if vis.insert(nx) { + q.push_back(nx); + } + } + } + } + if ok { + reached_by.push((sel.clone(), *off, *cblock)); + } + } + println!("reachable_contexts={}", reached_by.len()); + + // predecessor chain (depth 2) + let mut lvl1 = pred.get(&tb).cloned().unwrap_or_default(); + lvl1.sort_unstable(); + lvl1.dedup(); + println!("pred_l1_count={}", lvl1.len()); + for p1 in lvl1.iter().take(12) { + let b1 = cfg.blocks.get(p1).unwrap(); + println!( + " p1=0x{p1:x} type={} end=0x{:x}", + btype_name(&b1.btype), + b1.end + ); + let mut lvl2 = 
pred.get(p1).cloned().unwrap_or_default(); + lvl2.sort_unstable(); + lvl2.dedup(); + for p2 in lvl2.iter().take(5) { + let b2 = cfg.blocks.get(p2).unwrap(); + println!( + " p2=0x{p2:x} type={} end=0x{:x}", + btype_name(&b2.btype), + b2.end + ); + } + if lvl2.len() > 5 { + println!(" ... {} more p2", lvl2.len() - 5); + } + } + if lvl1.len() > 12 { + println!(" ... {} more p1", lvl1.len() - 12); + } + + // shortest path from first context + if let Some((sel, off, src)) = reached_by.first() { + let mut q = VecDeque::new(); + let mut vis = HashSet::new(); + let mut prev: HashMap = HashMap::new(); + q.push_back(*src); + vis.insert(*src); + while let Some(n) = q.pop_front() { + if n == tb { + break; + } + if let Some(nexts) = succ.get(&n) { + for &nx in nexts { + if vis.insert(nx) { + prev.insert(nx, n); + q.push_back(nx); + } + } + } + } + if vis.contains(&tb) { + let mut path = vec![tb]; + let mut cur = tb; + while let Some(p) = prev.get(&cur) { + path.push(*p); + cur = *p; + } + path.reverse(); + println!( + "shortest_path_from selector=0x{sel} off=0x{off:x} len={}", + path.len() + ); + for b in path.iter().take(30) { + let bb = cfg.blocks.get(b).unwrap(); + println!( + " block=0x{b:x} type={} end=0x{:x}", + btype_name(&bb.btype), + bb.end + ); + } + if path.len() > 30 { + println!(" ... 
{} more blocks", path.len() - 30); + } + } + } + } +} diff --git a/src/bin/events_debug.rs b/src/bin/events_debug.rs new file mode 100644 index 0000000..a8197f7 --- /dev/null +++ b/src/bin/events_debug.rs @@ -0,0 +1,170 @@ +use std::{fs, path::PathBuf, time::Instant}; + +use evmole::{EventSelector, contract_events}; + +#[derive(Debug, Default)] +struct Args { + code_hex: Option, + code_file: Option, + raw_file: Option, + iters: usize, + warmup: usize, + show_events: bool, +} + +fn usage() -> &'static str { + "Usage: + cargo run --release --bin events_debug -- [OPTIONS] + +Options: + --code-hex Bytecode hex string (with or without 0x prefix) + --code-file Text file containing bytecode hex + --raw-file Raw bytecode file + --iters Timed iterations (default: 1) + --warmup Warmup iterations (default: 0) + --show-events Print extracted event selectors + -h, --help Show this help + +Exactly one of --code-hex / --code-file / --raw-file must be provided." +} + +fn parse_args() -> Result { + let mut args = Args { + iters: 1, + warmup: 0, + show_events: false, + ..Default::default() + }; + + let mut it = std::env::args().skip(1); + while let Some(arg) = it.next() { + match arg.as_str() { + "-h" | "--help" => { + println!("{}", usage()); + std::process::exit(0); + } + "--code-hex" => { + args.code_hex = Some(it.next().ok_or("--code-hex requires a value")?); + } + "--code-file" => { + args.code_file = Some(PathBuf::from( + it.next().ok_or("--code-file requires a value")?, + )); + } + "--raw-file" => { + args.raw_file = Some(PathBuf::from( + it.next().ok_or("--raw-file requires a value")?, + )); + } + "--iters" => { + let v = it.next().ok_or("--iters requires a value")?; + args.iters = v.parse().map_err(|_| format!("invalid --iters: {v}"))?; + } + "--warmup" => { + let v = it.next().ok_or("--warmup requires a value")?; + args.warmup = v.parse().map_err(|_| format!("invalid --warmup: {v}"))?; + } + "--show-events" => args.show_events = true, + _ => return Err(format!("unknown 
argument: {arg}")), + } + } + + let inputs = [ + args.code_hex.is_some(), + args.code_file.is_some(), + args.raw_file.is_some(), + ] + .into_iter() + .filter(|v| *v) + .count(); + if inputs != 1 { + return Err("provide exactly one of --code-hex / --code-file / --raw-file".to_string()); + } + if args.iters == 0 { + return Err("--iters must be >= 1".to_string()); + } + Ok(args) +} + +fn decode_hex(input: &str) -> Result, String> { + let s = input.trim(); + let s = s.strip_prefix("0x").unwrap_or(s); + alloy_primitives::hex::decode(s).map_err(|e| format!("hex decode failed: {e}")) +} + +fn load_code(args: &Args) -> Result, String> { + if let Some(hex) = &args.code_hex { + return decode_hex(hex); + } + if let Some(path) = &args.code_file { + let text = fs::read_to_string(path) + .map_err(|e| format!("failed to read code file '{}': {e}", path.display()))?; + return decode_hex(&text); + } + if let Some(path) = &args.raw_file { + return fs::read(path) + .map_err(|e| format!("failed to read raw file '{}': {e}", path.display())); + } + Err("no input provided".to_string()) +} + +fn fmt_selector(s: &EventSelector) -> String { + alloy_primitives::hex::encode(s) +} + +fn main() { + let args = match parse_args() { + Ok(v) => v, + Err(e) => { + eprintln!("error: {e}"); + eprintln!("{}", usage()); + std::process::exit(2); + } + }; + + let code = match load_code(&args) { + Ok(v) => v, + Err(e) => { + eprintln!("error: {e}"); + std::process::exit(2); + } + }; + if code.is_empty() { + eprintln!("error: empty bytecode"); + std::process::exit(2); + } + + for _ in 0..args.warmup { + let _ = contract_events(&code); + } + + let mut last_events = Vec::new(); + let mut elapsed_ms = Vec::with_capacity(args.iters); + + for _ in 0..args.iters { + let t0 = Instant::now(); + let events = contract_events(&code); + let dt = t0.elapsed().as_secs_f64() * 1000.0; + elapsed_ms.push(dt); + last_events = events; + } + + let total_ms: f64 = elapsed_ms.iter().sum(); + let avg_ms = total_ms / 
elapsed_ms.len() as f64; + let min_ms = elapsed_ms.iter().copied().fold(f64::INFINITY, f64::min); + let max_ms = elapsed_ms.iter().copied().fold(0.0, f64::max); + + println!("code_len: {}", code.len()); + println!( + "time_ms: avg={avg_ms:.3} min={min_ms:.3} max={max_ms:.3} (iters={})", + args.iters + ); + println!("events: {}", last_events.len()); + + if args.show_events { + last_events.sort_unstable(); + for evt in &last_events { + println!("event: {}", fmt_selector(evt)); + } + } +} diff --git a/src/contract_info.rs b/src/contract_info.rs index 365d082..7f2acd3 100644 --- a/src/contract_info.rs +++ b/src/contract_info.rs @@ -3,6 +3,7 @@ use crate::{ arguments::function_arguments, control_flow_graph::basic_blocks, control_flow_graph::{ControlFlowGraph, control_flow_graph}, + events::{EventSelector, contract_events}, evm::code_iterator::disassemble, selectors::function_selectors, state_mutability::function_state_mutability, @@ -49,6 +50,16 @@ pub struct Contract { /// List of contract functions with their metadata pub functions: Option>, + /// Event selectors found in the contract bytecode + #[cfg_attr( + feature = "serde", + serde( + skip_serializing_if = "Option::is_none", + serialize_with = "crate::serialize::events" + ) + )] + pub events: Option>, + /// Contract storage layout pub storage: Option>, @@ -74,6 +85,7 @@ pub struct ContractInfoArgs<'a> { need_selectors: bool, need_arguments: bool, need_state_mutability: bool, + need_events: bool, need_storage: bool, need_disassemble: bool, need_basic_blocks: bool, @@ -113,6 +125,12 @@ impl<'a> ContractInfoArgs<'a> { self } + /// Enables the extraction of event selectors from the contract bytecode + pub fn with_events(mut self) -> Self { + self.need_events = true; + self + } + /// Enables the extraction of the contract's storage layout pub fn with_storage(mut self) -> Self { self.need_selectors = true; @@ -231,8 +249,15 @@ pub fn contract_info(args: ContractInfoArgs) -> Contract { (None, None) }; + let events = if 
args.need_events { + Some(contract_events(args.code)) + } else { + None + }; + Contract { functions, + events, storage, disassembled, basic_blocks, diff --git a/src/control_flow_graph/mod.rs b/src/control_flow_graph/mod.rs index cffe274..d04f8a4 100644 --- a/src/control_flow_graph/mod.rs +++ b/src/control_flow_graph/mod.rs @@ -16,7 +16,7 @@ use resolver::resolve_dynamic_jumps; mod initial; mod reachable; mod resolver; -mod state; +pub(crate) mod state; /// Constant used to mark invalid jump destinations (jumps not to JUMPDEST). /// Any jump destination value equal to or greater than this constant should be considered invalid. diff --git a/src/events/classify.rs b/src/events/classify.rs new file mode 100644 index 0000000..2df62c5 --- /dev/null +++ b/src/events/classify.rs @@ -0,0 +1,217 @@ +use std::collections::BTreeMap; + +use crate::collections::{HashMap, HashSet}; +use crate::control_flow_graph::{ + Block, BlockType, INVALID_JUMP_START, basic_blocks, control_flow_graph, + state::{StackSym, State}, +}; +use crate::evm::{code_iterator::iterate_code, op}; +use crate::selectors::function_selectors; + +#[derive(Clone, Copy, Debug)] +pub(super) struct LogSite { + pub pc: usize, + pub block_start: usize, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(super) enum LogSiteClass { + /// Sub-class a: topic0 produced by PUSH32 in the same block. + Push32 { topic_pc: usize }, + /// Sub-class b: topic0 produced by PUSH5..PUSH31 in the same block. + PushN { topic_pc: usize }, + /// Sub-class c: topic0 produced by MLOAD (preceded by CODECOPY pattern). + MloadCodecopy { mload_pc: usize }, + /// Sub-class e/f: topic0 comes from a predecessor block (Before(n)). 
+ CrossBlock { init_sym_n: usize }, +} + +#[derive(Clone, Debug)] +pub(super) struct ClassifiedLogSite { + pub site: LogSite, + pub class: LogSiteClass, +} + +pub(super) struct CfgIndex { + pub blocks: BTreeMap, + pub preds_by_block: HashMap>, + pub contexts_reaching_block: HashMap>, +} + +pub(super) fn classify_log_sites(code: &[u8]) -> (CfgIndex, Vec) { + let index = build_cfg_index(code); + if index.blocks.is_empty() { + return (index, Vec::new()); + } + + let mut out = Vec::new(); + for site in collect_log_sites(code, &index.blocks) { + if let Some(class) = classify_one(code, &index.blocks, site) { + out.push(ClassifiedLogSite { site, class }); + } + } + out.sort_unstable_by(|a, b| a.site.pc.cmp(&b.site.pc)); + (index, out) +} + +fn classify_one( + code: &[u8], + blocks: &BTreeMap, + site: LogSite, +) -> Option { + let sym = topic0_symbol_at_log(code, blocks, site)?; + match sym { + StackSym::Other(pc) => { + let &opcode = code.get(pc)?; + match opcode { + op::PUSH32 => Some(LogSiteClass::Push32 { topic_pc: pc }), + op::PUSH5..=op::PUSH31 => Some(LogSiteClass::PushN { topic_pc: pc }), + op::MLOAD => Some(LogSiteClass::MloadCodecopy { mload_pc: pc }), + _ => None, + } + } + StackSym::Before(n) => Some(LogSiteClass::CrossBlock { init_sym_n: n }), + StackSym::Pushed(_) | StackSym::Jumpdest(_) => None, + } +} + +// --------------------------------------------------------------------------- +// CFG construction +// --------------------------------------------------------------------------- + +// TODO: control_flow_graph() prunes blocks unreachable from PC=0 via resolved edges. +// Internal functions called through unresolved dynamic jumps (e.g. Solidity internal +// _transfer/_mint) are lost. This causes ~33 FN on largest1k where PUSH32+LOG exist +// in pruned blocks. Fix requires improving dynamic jump resolution in +// control_flow_graph/resolver.rs so these blocks enter the reachable set. 
+fn build_cfg_index(code: &[u8]) -> CfgIndex { + let cfg = control_flow_graph(code, basic_blocks(code)); + + let mut succ_by_block: HashMap> = HashMap::default(); + let mut preds_by_block: HashMap> = HashMap::default(); + + let mut add_edge = |from: usize, to: usize| { + if to >= INVALID_JUMP_START || !cfg.blocks.contains_key(&to) { + return; + } + succ_by_block.entry(from).or_default().insert(to); + preds_by_block.entry(to).or_default().insert(from); + }; + + for (start, block) in &cfg.blocks { + match &block.btype { + BlockType::Terminate { .. } => {} + BlockType::Jump { to } => add_edge(*start, *to), + BlockType::Jumpi { true_to, false_to } => { + add_edge(*start, *true_to); + add_edge(*start, *false_to); + } + BlockType::DynamicJump { to } => { + for dj in to { + if let Some(dst) = dj.to { + add_edge(*start, dst); + } + } + } + BlockType::DynamicJumpi { true_to, false_to } => { + add_edge(*start, *false_to); + for dj in true_to { + if let Some(dst) = dj.to { + add_edge(*start, dst); + } + } + } + } + } + + let contexts = collect_contexts(code); + let mut contexts_reaching_block: HashMap> = HashMap::default(); + for context in contexts { + let Some(entry) = find_block_start(&cfg.blocks, context) else { + continue; + }; + let mut stack = vec![entry]; + let mut seen: HashSet = HashSet::default(); + while let Some(block) = stack.pop() { + if !seen.insert(block) { + continue; + } + contexts_reaching_block + .entry(block) + .or_default() + .insert(context); + if let Some(nexts) = succ_by_block.get(&block) { + stack.extend(nexts.iter().copied()); + } + } + } + + CfgIndex { + blocks: cfg.blocks, + preds_by_block, + contexts_reaching_block, + } +} + +fn collect_contexts(code: &[u8]) -> Vec { + let (selectors, _) = function_selectors(code, 0); + let mut set: HashSet = HashSet::default(); + set.insert(0); + set.extend(selectors.into_values()); + let mut out: Vec = set.into_iter().collect(); + out.sort_unstable(); + out +} + +// 
--------------------------------------------------------------------------- +// LOG site collection & symbolic helpers +// --------------------------------------------------------------------------- + +fn collect_log_sites(code: &[u8], blocks: &BTreeMap) -> Vec { + let mut out = Vec::new(); + for (start, block) in blocks { + for (pc, cop) in iterate_code(code, *start, Some(block.end)) { + if (op::LOG1..=op::LOG4).contains(&cop.op) { + out.push(LogSite { + pc, + block_start: *start, + }); + } + } + } + out +} + +fn topic0_symbol_at_log( + code: &[u8], + blocks: &BTreeMap, + log_site: LogSite, +) -> Option { + let block = blocks.get(&log_site.block_start)?; + let mut state = State::new(); + if let Some(prev_pc) = find_prev_instruction_pc(code, block.start, log_site.pc) { + let _ = state.exec(code, block.start, Some(prev_pc)); + } + // LOG1..LOG4: stack is [offset, size, topic0, ...]; topic0 is at position 2 + Some(state.get_stack(2)) +} + +pub(super) fn find_prev_instruction_pc( + code: &[u8], + start_pc: usize, + target_pc: usize, +) -> Option { + let mut prev = None; + for (pc, _) in iterate_code(code, start_pc, Some(target_pc)) { + if pc == target_pc { + return prev; + } + prev = Some(pc); + } + None +} + +pub(super) fn find_block_start(blocks: &BTreeMap, pc: usize) -> Option { + let (start, block) = blocks.range(..=pc).next_back()?; + if pc <= block.end { Some(*start) } else { None } +} diff --git a/src/events/mod.rs b/src/events/mod.rs new file mode 100644 index 0000000..028dc16 --- /dev/null +++ b/src/events/mod.rs @@ -0,0 +1,676 @@ +mod classify; +mod resolve; + +/// Event selector is a 32-byte keccak256 hash of the event signature +pub type EventSelector = [u8; 32]; + +/// Coarse-grained category for `LOGx` topic0 extraction complexity. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(serde::Serialize))] +pub enum EventLogClass { + /// Topic0 resolves to a single PUSH32 inside the same basic block. 
+ SameBlockSinglePush32, + /// Topic0 resolves to PUSH32 in the same block, but multiple PUSH32 exist before LOG. + SameBlockMultiPush32, + /// Topic0 comes from predecessor blocks (symbol is Before(n) at LOG site). + CrossBlockBefore, + /// Any other source (non-PUSH32 producer or unresolved pattern). + Other, +} + +/// Per-`LOGx` classification record. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(serde::Serialize))] +pub struct EventLogClassRecord { + pub log_pc: usize, + pub block_start: usize, + pub class: EventLogClass, +} + +// --------------------------------------------------------------------------- +// Shared helpers +// --------------------------------------------------------------------------- + +/// Checks if a 32-byte value looks like a keccak256 hash (event selector). +/// +/// Heuristics (empirically tuned to filter non-event constants while preserving real selectors): +/// - All-zero rejected (null value) +/// - First 6 bytes all zero → likely an address or small integer, not a hash +/// - Last 6 bytes all zero → likely a bit-mask or padded constant +/// - Known non-event constants (role hashes, EIP-712 type hashes) → blocklist +/// - 4+ consecutive 0x00 or 0xFF bytes → structured constant, not a hash +fn is_plausible_event_hash(val: &[u8; 32]) -> bool { + if val == &[0u8; 32] { + return false; + } + if val[..6] == [0u8; 6] { + return false; + } + if val[26..] == [0u8; 6] { + return false; + } + if is_known_non_event_constant(val) { + return false; + } + let mut zero_run = 0u8; + let mut ff_run = 0u8; + for &b in val { + if b == 0 { + zero_run += 1; + if zero_run >= 4 { + return false; + } + } else { + zero_run = 0; + } + if b == 0xff { + ff_run += 1; + if ff_run >= 4 { + return false; + } + } else { + ff_run = 0; + } + } + true +} + +macro_rules! 
/// Decodes a 64-digit hexadecimal string literal into a `[u8; 32]` at compile time.
///
/// Both lower- and upper-case digits are accepted. Compilation fails if the
/// literal is not exactly 64 characters long or contains a non-hex character,
/// so a mistyped constant can neither be silently truncated nor produce an
/// unrelated out-of-bounds error.
macro_rules! hex_bytes32 {
    ($s:literal) => {{
        const BYTES: [u8; 32] = {
            const fn hex_val(c: u8) -> u8 {
                match c {
                    b'0'..=b'9' => c - b'0',
                    b'a'..=b'f' => c - b'a' + 10,
                    b'A'..=b'F' => c - b'A' + 10,
                    _ => panic!("invalid hex char"),
                }
            }
            let s = $s.as_bytes();
            // Reject short and over-long literals up front instead of reading
            // only the first 64 characters.
            assert!(
                s.len() == 64,
                "hex_bytes32! expects exactly 64 hex digits"
            );
            let mut out = [0u8; 32];
            let mut i = 0;
            while i < 32 {
                out[i] = (hex_val(s[i * 2]) << 4) | hex_val(s[i * 2 + 1]);
                i += 1;
            }
            out
        };
        BYTES
    }};
}

/// Well-known non-event keccak256 constants that commonly appear as PUSH32
/// but are NOT event selectors. These include:
/// - OpenZeppelin AccessControl role hashes (PAUSER_ROLE, MINTER_ROLE, etc.)
/// - EIP-712 type hashes (domain separator, Permit, Delegation)
/// - EIP-712 version/name hashes (keccak256("1"), keccak256(""))
///
/// Curated from production false-positive analysis across 1730+ contracts.
/// Each entry eliminates FP in 20-65 contracts with zero TP loss.
#[rustfmt::skip]
const KNOWN_NON_EVENT_HASHES: &[[u8; 32]] = &[
    // keccak256("PAUSER_ROLE")
    hex_bytes32!("65d7a28e3265b37a6474929f336521b332c1681b933f6cb9f3376673440d862a"),
    // keccak256("MINTER_ROLE")
    hex_bytes32!("9f2df0fed2c77648de5860a4cc508cd0818c85b8b8a1ab4ceeef8d981c8956a6"),
    // keccak256("EIP712Domain(string name,string version,uint256 chainId,address verifyingContract)")
    hex_bytes32!("8b73c3c69bb8fe3d512ecc4cf759cc79239f7b179b0ffacaa9a75d522b39400f"),
    // keccak256("ADMIN_ROLE") — OpenZeppelin AccessControl (not DEFAULT_ADMIN_ROLE which is 0x00)
    hex_bytes32!("a49807205ce4d355092ef5a8a18f56e8913cf4a201fbe287825b095693c21775"),
    // keccak256("1") — EIP-712 version hash
    hex_bytes32!("c89efdaa54c0f20c7adf612882df0950f5a951637e0307cdcb4c672f298b8bc6"),
    // keccak256("") — empty hash, used for EXTCODEHASH sentinel
    hex_bytes32!("c5d2460186f7233c927e7db2dcc703c0e500b653ca82273b7bfad8045d85a470"),
    // keccak256("Permit(address owner,address spender,uint256 value,uint256 nonce,uint256 deadline)")
    hex_bytes32!("6e71edae12b1b97f4d1f60370fef10105fa2faae0126114a169c64845d6126c9"),
    // keccak256("UPGRADER_ROLE")
    hex_bytes32!("189ab7a9244df0848122154315af71fe140f3db0fe014031783b0946b8c9d2e3"),
    // keccak256("OPERATOR_ROLE")
    hex_bytes32!("97667070c54ef182b0f5858b034beac1b6f3089aa2d3188bb1e8929f4fa9b929"),
    // keccak256("SNAPSHOT_ROLE")
    hex_bytes32!("5fdbd35e8da83ee755d5e62a539e5ed7f47126abede0b8b10f9ea43dc6eed07f"),
    // keccak256("BURNER_ROLE")
    hex_bytes32!("3c11d16cbaffd01df69ce1c404f6340ee057498f5f00246190ea54220576a848"),
    // keccak256("EXECUTOR_ROLE")
    hex_bytes32!("d8aa0f3194971a2a116679f7c2090f6939c8d4e01a2a8d7e41d55e5351469e63"),
    // keccak256("CANCELLER_ROLE")
    hex_bytes32!("fd643c72710c63c0180259aba6b2d05451e3591a24e58b62239378085726f783"),
    // keccak256("PROPOSER_ROLE")
    hex_bytes32!("b09aa5aeb3702cfd50b6b62bc4532604938f21248a27a1d5ca736082b6819cc1"),
    // keccak256("TIMELOCK_ADMIN_ROLE")
    hex_bytes32!("5f58e3a2316349923ce3780f8d587db2d72378aed66a8261c916544fa6846ca5"),
    // keccak256("PREDICATE_ROLE")
    hex_bytes32!("12ff340d0cd9c652c747ca35727e68c547d0f0bfa7758c2e59b9aadc721a202b"),
    // keccak256("DEPOSITOR_ROLE")
    hex_bytes32!("8f4f2da22e8ac8f11e15f9fc141cddbb5deea8800186560abb6e68c5496619a9"),
    // keccak256("URI_SETTER_ROLE")
    hex_bytes32!("7804d923f43a17d325d77e781528e0793b2edd7d8aa4a317c18bf4cd7da5db7e"),
    // keccak256("MANAGER_ROLE")
    hex_bytes32!("241ecf16d79d0f8dbfb92cbc07fe17840425976cf0667f022fe9877caa831b08"),
    // keccak256("GOVERNANCE_ROLE")
    hex_bytes32!("71840dc4906352362b0cdaf79870196c8e42acafade72d5d5a6d59291253ceb1"),
    // keccak256("Delegation(address delegatee,uint256 nonce,uint256 expiry)")
    hex_bytes32!("e48329057bfd03d55e49b547132e39cffd9c1820ad7b9d4c5307691425d15adf"),
    // keccak256("MetaTransaction(uint256 nonce,address from,bytes functionSignature)")
    hex_bytes32!("2d0335ab174d301747ad37e568a4556fead940e3d2551a80ae05629fc44e80b0"),
    // keccak256("EIP712Domain(string name,string version,uint256 chainId,address verifyingContract,bytes32 salt)")
    hex_bytes32!("d87cd6ef79d4e2b95e15ce8abf732db51ec771f1ca2edccf22a46c729ac56472"),
    // keccak256("EIP712Domain(string name,uint256 chainId,address verifyingContract)")
    hex_bytes32!("b1188b85de397c4c89df42e52e9bb3e936e8e7a3983bbb543b71ba9ea5234396"),
    // keccak256("KEEPER_ROLE")
    hex_bytes32!("fc8737ab85eb45125971625a9ebdb75cc78e01d5c1fa80c4c6e5203f47bc4fab"),
    // keccak256("GUARDIAN_ROLE")
    hex_bytes32!("55435dd261a4b9b3364963f7738a7a662ad9c84396d64be3365284bb7f0a5041"),
    // keccak256("RELAYER_ROLE")
    hex_bytes32!("e2b7fb3b832174769106daebcfd6d1970523240dda11281102db9363b83b0dc4"),
];

/// Returns `true` when `val` is one of the entries of [`KNOWN_NON_EVENT_HASHES`].
fn is_known_non_event_constant(val: &[u8; 32]) -> bool {
    KNOWN_NON_EVENT_HASHES.contains(val)
}

// ---------------------------------------------------------------------------
// Vyper detection & bypass
// ---------------------------------------------------------------------------

/// Detects Vyper-compiled bytecode from bytecode prefix patterns.
///
/// Vyper uses internal function call patterns that produce dynamic JUMPs the CFG
/// resolver cannot follow, causing LOG-containing blocks to appear unreachable.
/// The code is considered Vyper when it starts with one of three byte prefixes:
///
/// - `60 04 36` — `PUSH1 4; CALLDATASIZE` (annotated as Vyper 0.2–0.3). Only these
///   three bytes are compared; the `LT` that usually follows is not checked.
/// - `60 03 36 11` — `PUSH1 3; CALLDATASIZE; GT` (annotated as Vyper 0.4+).
/// - `34 15 61 00` — `CALLVALUE; ISZERO; PUSH2 0x00xx` (non-payable entry).
///
/// Empty or shorter input never matches.
fn is_likely_vyper(code: &[u8]) -> bool {
    const PREFIXES: [&[u8]; 3] = [
        &[0x60, 0x04, 0x36],
        &[0x60, 0x03, 0x36, 0x11],
        &[0x34, 0x15, 0x61, 0x00],
    ];
    PREFIXES.iter().any(|prefix| code.starts_with(prefix))
}
+fn vyper_scan_all_log_sites(code: &[u8]) -> Vec { + use crate::control_flow_graph::state::{StackSym, State}; + use crate::evm::{code_iterator::iterate_code, op}; + + let mut out: crate::collections::HashSet = + crate::collections::HashSet::default(); + + // Pass 1: collect all LOG1–LOG4 sites with their pseudo-block starts. + // Split pseudo-blocks at LOG instructions: after a LOG consumes items from the + // stack, the next LOG's topic0 is built from scratch, so we must restart symbolic + // execution from after the previous LOG to avoid stale stack values. + let mut block_start = 0usize; + let mut log_sites: Vec<(usize, usize)> = Vec::new(); + for (pc, cop) in iterate_code(code, 0, None) { + if cop.op == op::JUMPDEST { + block_start = pc; + } + if (op::LOG0..=op::LOG4).contains(&cop.op) { + if (op::LOG1..=op::LOG4).contains(&cop.op) { + log_sites.push((pc, block_start)); + } + // After any LOG, the next instruction starts a new pseudo-block + block_start = pc + 1; + } + } + + // Pass 2: symbolically execute each pseudo-block to resolve topic0. + // + // State::exec terminates at JUMP/JUMPI/REVERT/RETURN/STOP (it's designed for + // CFG basic blocks). In Vyper pseudo-blocks, these can appear mid-block (e.g., + // a require guard's JUMPI before the event emit). To handle this, find the last + // such "barrier" before the LOG and start execution from the instruction after it. + for (log_pc, bs) in log_sites { + // Find the last barrier instruction in [bs, log_pc) to use as exec start. 
+ let exec_start = { + let mut last_barrier_end = bs; + for (pc, cop) in iterate_code(code, bs, Some(log_pc)) { + if pc == log_pc { + break; + } + if matches!( + cop.op, + op::JUMP + | op::JUMPI + | op::STOP + | op::RETURN + | op::REVERT + | op::INVALID + | op::SELFDESTRUCT + ) { + last_barrier_end = pc + cop.opi.size; + } + } + last_barrier_end + }; + + let prev_pc = classify::find_prev_instruction_pc(code, exec_start, log_pc); + let mut state = State::new(); + if let Some(prev) = prev_pc { + let _ = state.exec(code, exec_start, Some(prev)); + } + + // topic0 is always at stack position 2 for LOG1–LOG4 + let mut sym = state.get_stack(2); + + // If topic0 is Before(n), the value came from before the barrier. + // Try resolving through a parent state (block_start → barrier). + if matches!(sym, StackSym::Before(_)) && exec_start > bs { + let mut parent = State::new(); + if let Some(prev) = classify::find_prev_instruction_pc(code, bs, exec_start) { + let _ = parent.exec(code, bs, Some(prev)); + } + let resolved = state.resolve_with_parent(&parent); + sym = resolved.get_stack(2); + } + + if let StackSym::Other(pc) = sym { + let Some(&opcode) = code.get(pc) else { + continue; + }; + match opcode { + op::PUSH32 => { + if pc + 33 <= code.len() { + let mut topic = [0u8; 32]; + topic.copy_from_slice(&code[pc + 1..pc + 33]); + if is_plausible_event_hash(&topic) { + out.insert(topic); + } + } + } + op::PUSH5..=op::PUSH31 => { + let n = (opcode - op::PUSH1 + 1) as usize; + let start = pc + 1; + if let Some(end) = start.checked_add(n) + && end <= code.len() + { + let mut topic = [0u8; 32]; + topic[32 - n..].copy_from_slice(&code[start..end]); + if is_plausible_event_hash(&topic) { + out.insert(topic); + } + } + } + _ => {} + } + } + } + + out.into_iter().collect() +} + +// --------------------------------------------------------------------------- +// Entry points +// --------------------------------------------------------------------------- + +fn 
contract_events_internal(code: &[u8]) -> Vec { + if code.is_empty() { + return Vec::new(); + } + let Ok((index, classified)) = std::panic::catch_unwind(|| classify::classify_log_sites(code)) + else { + return Vec::new(); + }; + let mut events = resolve::resolve_classified_log_sites(code, &index, &classified); + + if is_likely_vyper(code) { + let supplement = vyper_scan_all_log_sites(code); + if !supplement.is_empty() { + let mut set: crate::collections::HashSet = events.drain(..).collect(); + set.extend(supplement); + events = set.into_iter().collect(); + events.sort_unstable(); + } + } + + events +} + +/// Extracts all event selectors from contract bytecode. +pub fn contract_events(code: &[u8]) -> Vec { + contract_events_internal(code) +} + +/// Classifies each `LOGx` site by topic0 source complexity. +/// +/// This is a lightweight diagnostic helper intended for analysis/demo usage. +pub fn contract_event_log_classes(code: &[u8]) -> Vec { + if code.is_empty() { + return Vec::new(); + } + let Ok((_, classified)) = std::panic::catch_unwind(|| classify::classify_log_sites(code)) + else { + return Vec::new(); + }; + classified + .into_iter() + .map(|v| EventLogClassRecord { + log_pc: v.site.pc, + block_start: v.site.block_start, + class: map_log_site_class(code, &v), + }) + .collect() +} + +fn map_log_site_class(code: &[u8], site: &classify::ClassifiedLogSite) -> EventLogClass { + match site.class { + classify::LogSiteClass::Push32 { .. } => { + use crate::evm::{code_iterator::iterate_code, op}; + let count = iterate_code(code, site.site.block_start, Some(site.site.pc)) + .filter(|(_, cop)| cop.op == op::PUSH32) + .count(); + if count <= 1 { + EventLogClass::SameBlockSinglePush32 + } else { + EventLogClass::SameBlockMultiPush32 + } + } + classify::LogSiteClass::PushN { .. } | classify::LogSiteClass::MloadCodecopy { .. } => { + EventLogClass::Other + } + classify::LogSiteClass::CrossBlock { .. 
} => EventLogClass::CrossBlockBefore, + } +} + +#[cfg(test)] +mod tests { + use std::collections::BTreeSet; + + use super::*; + use crate::evm::op; + + fn append_log1(code: &mut Vec, selector: [u8; 32]) { + code.push(op::PUSH32); + code.extend_from_slice(&selector); + // stack: [topic0] + code.extend_from_slice(&[op::PUSH1, 0x00, op::PUSH1, 0x00, op::LOG1]); + } + + fn append_single_selector_dispatch(code: &mut Vec, selector: [u8; 4]) -> usize { + code.extend_from_slice(&[ + op::PUSH1, + 0x00, + op::CALLDATALOAD, + op::PUSH1, + 0xE0, + op::SHR, + op::PUSH4, + ]); + code.extend_from_slice(&selector); + code.push(op::EQ); + code.extend_from_slice(&[op::PUSH1, 0x00]); + let entry_patch = code.len() - 1; + code.push(op::JUMPI); + code.push(op::STOP); + entry_patch + } + + fn make_plausible_hash() -> [u8; 32] { + // A value that passes is_plausible_event_hash: no long zero/ff runs, non-zero prefix/suffix. + [0xabu8; 32] + } + + // --- Public API tests --- + + #[test] + fn test_simple_log1() { + let selector = [0xab; 32]; + let mut code = Vec::new(); + append_log1(&mut code, selector); + code.push(op::STOP); + + let events = contract_events(&code); + assert_eq!(events, vec![selector]); + } + + #[test] + fn test_require_guarded_event() { + let function_selector = [0xaa, 0xbb, 0xcc, 0xdd]; + let event_selector = [0x42; 32]; + + let mut code = Vec::new(); + let entry_patch = append_single_selector_dispatch(&mut code, function_selector); + + let function_entry = code.len(); + code[entry_patch] = u8::try_from(function_entry).unwrap(); + code.push(op::JUMPDEST); + + // Emulate a require guard: + // if (!cond) revert(); else emit LOG1(topic0) + code.extend_from_slice(&[op::PUSH1, 0x00]); // cond = 0 + code.extend_from_slice(&[op::PUSH1, 0x00]); // destination (patched below) + let emit_patch = code.len() - 1; + code.extend_from_slice(&[op::JUMPI, op::PUSH1, 0x00, op::PUSH1, 0x00, op::REVERT]); + let emit_pc = code.len(); + code[emit_patch] = u8::try_from(emit_pc).unwrap(); + + 
code.push(op::JUMPDEST); + append_log1(&mut code, event_selector); + code.push(op::STOP); + + let events = contract_events(&code); + assert_eq!(events, vec![event_selector]); + } + + #[test] + fn test_forks_when_both_branches_are_alive() { + let function_selector = [0xaa, 0xbb, 0xcc, 0xdd]; + let event_true = [0x11; 32]; + let event_false = [0x22; 32]; + + let mut code = Vec::new(); + let entry_patch = append_single_selector_dispatch(&mut code, function_selector); + + let function_entry = code.len(); + code[entry_patch] = u8::try_from(function_entry).unwrap(); + code.push(op::JUMPDEST); + + // Always-false condition. VM takes fallthrough branch, but both branches emit, + // so branch classifier should fork and collect both events. + code.extend_from_slice(&[op::PUSH1, 0x00]); // cond = 0 + code.extend_from_slice(&[op::PUSH1, 0x00]); // true destination (patched below) + let true_patch = code.len() - 1; + code.push(op::JUMPI); + + code.push(op::JUMPDEST); + append_log1(&mut code, event_false); + code.push(op::STOP); + + let true_pc = code.len(); + code[true_patch] = u8::try_from(true_pc).unwrap(); + + code.push(op::JUMPDEST); + append_log1(&mut code, event_true); + code.push(op::STOP); + + let events = contract_events(&code); + let found: BTreeSet<_> = events.into_iter().collect(); + let expected: BTreeSet<_> = [event_true, event_false].into_iter().collect(); + assert_eq!(found, expected); + } + + #[test] + fn test_no_events() { + let code = alloy_primitives::hex::decode("6080604052348015600e575f80fd5b50").unwrap(); + let events = contract_events(&code); + assert!(events.is_empty()); + } + + #[test] + fn test_push32_no_log() { + let mut code = Vec::new(); + code.push(op::PUSH32); + code.extend_from_slice(&[0xab; 32]); + code.push(op::POP); + code.push(op::STOP); + + let events = contract_events(&code); + assert!(events.is_empty()); + } + + // --- CC module tests (migrated from cc/mod.rs) --- + + /// Sub-class a: single PUSH32 + LOG1 in one block. 
+ #[test] + fn cc_push32_extracts_event() { + let selector = make_plausible_hash(); + let mut code = Vec::new(); + append_log1(&mut code, selector); + code.push(op::STOP); + + let events = contract_events(&code); + assert_eq!(events, vec![selector]); + } + + /// Sub-class e/f: topic0 pushed in a predecessor block, consumed via JUMP. + #[test] + fn cc_cross_block_extracts_event() { + let selector = [0x11u8; 32]; + let mut code = Vec::new(); + // Block 0: push selector, jump to JUMPDEST + code.push(op::PUSH32); + code.extend_from_slice(&selector); + // PUSH1 + code.extend_from_slice(&[ + op::PUSH1, + 0x24, // target = 0x24 + op::JUMP, + ]); + // Block 1 at 0x24: JUMPDEST, then emit LOG1 + code.push(op::JUMPDEST); + code.extend_from_slice(&[op::PUSH1, 0x00, op::PUSH1, 0x00, op::LOG1]); + code.push(op::STOP); + + let events = contract_events(&code); + assert_eq!(events, vec![selector]); + } + + /// Sub-class d: Other(pc) pointing at a non-PUSH instruction → skip. + #[test] + fn cc_other_small_push1_returns_empty() { + let mut code = Vec::new(); + // PUSH1 0x01 is a small value, not a plausible event hash + code.extend_from_slice(&[ + op::PUSH1, + 0x01, // topic0 — small, not PUSH32/PUSHn(5+) + op::PUSH1, + 0x00, + op::PUSH1, + 0x00, + op::LOG1, + op::STOP, + ]); + + let events = contract_events(&code); + assert!(events.is_empty()); + } + + /// Sub-class b: PUSH31 with a plausible hash → extract. + #[test] + fn cc_push31_extracts_event() { + // Build a 32-byte value with leading zero (since PUSH31 only pushes 31 bytes). + // The first byte will be 0x00 after right-aligning. + // For is_plausible_event_hash: first 6 bytes must not all be zero. + // PUSH31 → [0x00, b1..b31] where b1..b6 are non-zero. + let mut expected = [0u8; 32]; + for i in 1..32 { + expected[i] = 0xab; + } + // expected[0] = 0x00, expected[1..] 
= 0xab + // is_plausible_event_hash checks val[..6] != [0;6] — first 6 bytes are [0,ab,ab,ab,ab,ab] → OK + + let mut code = Vec::new(); + code.push(op::PUSH31); + code.extend_from_slice(&expected[1..]); // 31 bytes + code.extend_from_slice(&[op::PUSH1, 0x00, op::PUSH1, 0x00, op::LOG1]); + code.push(op::STOP); + + let events = contract_events(&code); + assert_eq!(events, vec![expected]); + } + + /// Sub-class c: CODECOPY + MLOAD pattern. + #[test] + fn cc_mload_codecopy_extracts_event() { + // Layout: + // [header: PUSH + PUSH + PUSH + CODECOPY + PUSH + MLOAD + PUSH + PUSH + LOG1 + STOP] + // Then at some offset, place the 32-byte event hash in the bytecode. + // + // CODECOPY copies code[src..src+32] into memory[dst..dst+32], + // then MLOAD reads memory[dst] to get the topic. + let selector = make_plausible_hash(); + + let mut code = Vec::new(); + // We'll put the selector at code offset = 0x40 (after the instruction sequence). + let selector_offset: u8 = 0x40; + + // PUSH1 0x20 (length = 32) + code.extend_from_slice(&[op::PUSH1, 0x20]); + // PUSH1 (source offset in code) + code.extend_from_slice(&[op::PUSH1, selector_offset]); + // PUSH1 0x00 (dest offset in memory) + code.extend_from_slice(&[op::PUSH1, 0x00]); + // CODECOPY + code.push(op::CODECOPY); + // PUSH1 0x00 (memory offset to load) + code.extend_from_slice(&[op::PUSH1, 0x00]); + // MLOAD — loads 32 bytes from memory[0] + code.push(op::MLOAD); + // Now topic0 is on stack. Push offset+size for LOG1. 
+ code.extend_from_slice(&[op::PUSH1, 0x00, op::PUSH1, 0x00]); + code.push(op::LOG1); + code.push(op::STOP); + + // Pad to selector_offset + while code.len() < selector_offset as usize { + code.push(0x00); + } + // Place the selector at the expected offset + code.extend_from_slice(&selector); + + let events = contract_events(&code); + assert_eq!(events, vec![selector]); + } + + // --- Vyper bypass tests --- + + #[test] + fn vyper_detection() { + // Vyper 0.2–0.3 prefix: PUSH1 4; CALLDATASIZE; LT + assert!(super::is_likely_vyper(&[0x60, 0x04, 0x36, 0x10, 0x00])); + // Vyper 0.4+ prefix: PUSH1 3; CALLDATASIZE; GT + assert!(super::is_likely_vyper(&[0x60, 0x03, 0x36, 0x11, 0x00])); + // Vyper non-payable: CALLVALUE; ISZERO; PUSH2 0x00xx + assert!(super::is_likely_vyper(&[0x34, 0x15, 0x61, 0x00, 0x0e])); + // Solidity: not detected + assert!(!super::is_likely_vyper(&[0x60, 0x80, 0x60, 0x40, 0x52])); + // Empty + assert!(!super::is_likely_vyper(&[])); + } + + /// Vyper-like contract with a LOG1 in an unreachable block (dynamic JUMP return). + /// The main CFG pipeline misses it, but the Vyper supplement recovers it. 
+ #[test] + fn vyper_unreachable_log_recovered() { + let selector = make_plausible_hash(); + let mut code = Vec::new(); + + // Vyper prefix: PUSH1 4; CALLDATASIZE; LT → triggers is_likely_vyper() + code.extend_from_slice(&[0x60, 0x04, 0x36]); + code.push(op::STOP); + + // Unreachable block: JUMPDEST + PUSH32 + offset + size + LOG1 + STOP + code.push(op::JUMPDEST); + code.push(op::PUSH32); + code.extend_from_slice(&selector); + code.extend_from_slice(&[op::PUSH1, 0x00, op::PUSH1, 0x00, op::LOG1]); + code.push(op::STOP); + + let events = contract_events(&code); + assert_eq!(events, vec![selector]); + } +} diff --git a/src/events/resolve.rs b/src/events/resolve.rs new file mode 100644 index 0000000..4f0503c --- /dev/null +++ b/src/events/resolve.rs @@ -0,0 +1,441 @@ +use std::collections::VecDeque; + +use crate::collections::{HashMap, HashSet}; +use crate::control_flow_graph::state::{StackSym, State}; +use crate::evm::{code_iterator::iterate_code, op}; + +use super::classify::{ + CfgIndex, ClassifiedLogSite, LogSiteClass, find_block_start, find_prev_instruction_pc, +}; +use super::{EventSelector, is_plausible_event_hash}; + +const MAX_STATES_PER_LOG: usize = 500_000; +const MAX_PRED_STEPS_PER_LOG: usize = 500_000; +const BLOCK_STATE_CACHE_MAX_ENTRIES: usize = 2_048; +const CONTINUATION_CACHE_MAX_ENTRIES: usize = 4_096; + +// --------------------------------------------------------------------------- +// Block state cache (LRU-ish, shared across all LOG sites) +// --------------------------------------------------------------------------- + +struct BlockStateCache { + map: HashMap, + insertion_order: VecDeque, + max_entries: usize, +} + +impl BlockStateCache { + fn new(max_entries: usize) -> Self { + Self { + map: HashMap::default(), + insertion_order: VecDeque::new(), + max_entries, + } + } + + fn insert(&mut self, block_start: usize, state: State) { + if self.max_entries == 0 || self.map.contains_key(&block_start) { + return; + } + self.map.insert(block_start, 
state); + self.insertion_order.push_back(block_start); + while self.map.len() > self.max_entries { + let Some(old) = self.insertion_order.pop_front() else { + break; + }; + if self.map.remove(&old).is_some() { + break; + } + } + } + + fn get_exit_symbol( + &mut self, + code: &[u8], + index: &CfgIndex, + block_start: usize, + slot: usize, + ) -> Option { + if !self.map.contains_key(&block_start) { + let block = index.blocks.get(&block_start)?; + let mut state = State::new(); + let _ = state.exec(code, block.start, Some(block.end)); + self.insert(block_start, state); + } + self.map + .get(&block_start) + .map(|state| state.get_stack(slot)) + } +} + +// --------------------------------------------------------------------------- +// Continuation cache (shared across all LOG sites) +// --------------------------------------------------------------------------- + +struct ContinuationCache { + // Key: (block_start, slot) → Value: [(pred_block, exit_sym)] + map: HashMap<(usize, usize), Vec<(usize, StackSym)>>, + insertion_order: VecDeque<(usize, usize)>, + max_entries: usize, +} + +impl ContinuationCache { + fn new(max_entries: usize) -> Self { + Self { + map: HashMap::default(), + insertion_order: VecDeque::new(), + max_entries, + } + } + + fn get_or_compute( + &mut self, + code: &[u8], + index: &CfgIndex, + state_cache: &mut BlockStateCache, + block_start: usize, + slot: usize, + ) -> &[(usize, StackSym)] { + let key = (block_start, slot); + if !self.map.contains_key(&key) { + let result = Self::compute(code, index, state_cache, block_start, slot); + self.insert(key, result); + } + self.map.get(&key).map(|v| v.as_slice()).unwrap_or(&[]) + } + + fn compute( + code: &[u8], + index: &CfgIndex, + state_cache: &mut BlockStateCache, + block_start: usize, + slot: usize, + ) -> Vec<(usize, StackSym)> { + let Some(preds) = index.preds_by_block.get(&block_start) else { + return Vec::new(); + }; + preds + .iter() + .filter_map(|&pred| { + state_cache + .get_exit_symbol(code, index, 
pred, slot) + .map(|sym| (pred, sym)) + }) + .collect() + } + + fn insert(&mut self, key: (usize, usize), result: Vec<(usize, StackSym)>) { + if self.max_entries == 0 || self.map.contains_key(&key) { + return; + } + self.map.insert(key, result); + self.insertion_order.push_back(key); + while self.map.len() > self.max_entries { + let Some(old) = self.insertion_order.pop_front() else { + break; + }; + if self.map.remove(&old).is_some() { + break; + } + } + } +} + +// --------------------------------------------------------------------------- +// Backtrack types (BFS dedup) +// --------------------------------------------------------------------------- + +#[derive(Clone, Hash, PartialEq, Eq)] +struct BacktrackKey { + block_start: usize, + context: usize, + sym: StackSym, +} + +#[derive(Clone)] +struct BacktrackState { + block_start: usize, + context: usize, + sym: StackSym, +} + +// --------------------------------------------------------------------------- +// Dispatch entry +// --------------------------------------------------------------------------- + +pub(super) fn resolve_classified_log_sites( + code: &[u8], + index: &CfgIndex, + sites: &[ClassifiedLogSite], +) -> Vec { + let mut out: HashSet = HashSet::default(); + let mut state_cache = BlockStateCache::new(BLOCK_STATE_CACHE_MAX_ENTRIES); + let mut cont_cache = ContinuationCache::new(CONTINUATION_CACHE_MAX_ENTRIES); + + for site in sites { + match site.class { + LogSiteClass::Push32 { topic_pc } => { + resolve_push32(code, topic_pc, &mut out); + } + LogSiteClass::PushN { topic_pc } => { + resolve_pushn(code, topic_pc, &mut out); + } + LogSiteClass::MloadCodecopy { mload_pc } => { + resolve_mload_codecopy(code, mload_pc, site.site.block_start, &mut out); + } + LogSiteClass::CrossBlock { init_sym_n } => { + resolve_cross_block( + code, + index, + site, + init_sym_n, + &mut state_cache, + &mut cont_cache, + &mut out, + ); + } + } + } + + let mut events: Vec = out.into_iter().collect(); + events.sort_unstable(); + 
events +} + +// --------------------------------------------------------------------------- +// Sub-class a: PUSH32 +// --------------------------------------------------------------------------- + +fn resolve_push32(code: &[u8], pc: usize, out: &mut HashSet) { + if let Some(topic) = push32_value(code, pc) { + out.insert(topic); + } +} + +fn push32_value(code: &[u8], pc: usize) -> Option<[u8; 32]> { + if code.get(pc).copied()? != op::PUSH32 { + return None; + } + let end = pc.checked_add(33)?; + if end > code.len() { + return None; + } + let mut topic = [0u8; 32]; + topic.copy_from_slice(&code[pc + 1..end]); + Some(topic) +} + +// --------------------------------------------------------------------------- +// Sub-class b: PUSH5..PUSH31 (right-aligned into 32 bytes) +// --------------------------------------------------------------------------- + +fn resolve_pushn(code: &[u8], pc: usize, out: &mut HashSet) { + if let Some(topic) = pushn_value(code, pc) { + out.insert(topic); + } +} + +fn pushn_value(code: &[u8], pc: usize) -> Option<[u8; 32]> { + let opcode = *code.get(pc)?; + if !(op::PUSH1..op::PUSH32).contains(&opcode) { + return None; + } + let n = (opcode - op::PUSH1 + 1) as usize; + let start = pc + 1; + let end = start.checked_add(n)?; + if end > code.len() { + return None; + } + let mut topic = [0u8; 32]; + topic[32 - n..].copy_from_slice(&code[start..end]); + Some(topic) +} + +// --------------------------------------------------------------------------- +// Sub-class c: MLOAD preceded by CODECOPY +// --------------------------------------------------------------------------- + +fn resolve_mload_codecopy( + code: &[u8], + mload_pc: usize, + block_start: usize, + out: &mut HashSet, +) { + if let Some(topic) = mload_codecopy_value(code, mload_pc, block_start) { + out.insert(topic); + } +} + +fn mload_codecopy_value(code: &[u8], mload_pc: usize, block_start: usize) -> Option<[u8; 32]> { + // Use symbolic execution to precisely identify CODECOPY's `offset` 
argument. + // CODECOPY pops (destOffset, offset, size) from stack. + let instrs: Vec<(usize, u8)> = iterate_code(code, block_start, Some(mload_pc)) + .map(|(pc, cop)| (pc, cop.op)) + .collect(); + + // Find the last CODECOPY before the MLOAD. + let (codecopy_pc, _) = *instrs.iter().rev().find(|&&(_, op)| op == op::CODECOPY)?; + + // Run symbolic execution up to (but not including) CODECOPY to read its arguments. + let prev_pc = find_prev_instruction_pc(code, block_start, codecopy_pc)?; + let mut state = State::new(); + let _ = state.exec(code, block_start, Some(prev_pc)); + + // Helper: extract a concrete usize from a stack symbol. + let sym_to_usize = |sym: &StackSym| -> Option { + match sym { + // PUSH1..PUSH4 → Pushed([u8; 4]) with value stored big-endian + StackSym::Pushed(bytes) => Some(u32::from_be_bytes(*bytes) as usize), + // PUSH5..PUSH32 → Other(pc), read concrete value from bytecode + StackSym::Other(pc) => { + let opcode = *code.get(*pc)?; + if !(op::PUSH1..=op::PUSH32).contains(&opcode) { + return None; + } + let n = (opcode - op::PUSH1 + 1) as usize; + // Only PUSH1..PUSH8 can fit in a usize; larger pushes + // (PUSH9..PUSH32) cannot represent a valid code offset. + if n > 8 { + return None; + } + let arg_start = pc + 1; + let arg_end = arg_start.checked_add(n)?; + if arg_end > code.len() { + return None; + } + let mut buf = [0u8; 8]; + let copy_start = 8 - n; + buf[copy_start..].copy_from_slice(&code[arg_start..arg_end]); + Some(u64::from_be_bytes(buf) as usize) + } + _ => None, + } + }; + + // Stack at CODECOPY: [destOffset(0), offset(1), size(2), ...] + // Reject if size argument isn't exactly 0x20 (32). + // Larger copies are bulk data loads (role hash tables, etc.) that produce FP. + let size_sym = state.get_stack(2); + if sym_to_usize(&size_sym)? != 0x20 { + return None; + } + + // Extract the code offset from the `offset` argument (stack position 1). + let offset = sym_to_usize(&state.get_stack(1))?; + + // Read 32 bytes at the code offset. 
+ if offset.checked_add(32).is_some_and(|end| end <= code.len()) { + let mut topic = [0u8; 32]; + topic.copy_from_slice(&code[offset..offset + 32]); + if is_plausible_event_hash(&topic) { + return Some(topic); + } + } + None +} + +// --------------------------------------------------------------------------- +// Sub-class e/f: CrossBlock (BFS backtrack through predecessor blocks) +// --------------------------------------------------------------------------- + +fn resolve_cross_block( + code: &[u8], + index: &CfgIndex, + site: &ClassifiedLogSite, + init_sym_n: usize, + state_cache: &mut BlockStateCache, + cont_cache: &mut ContinuationCache, + out: &mut HashSet, +) { + let Some(contexts) = index.contexts_reaching_block.get(&site.site.block_start) else { + return; + }; + + let init_sym = StackSym::Before(init_sym_n); + let mut queue: VecDeque = VecDeque::new(); + queue.extend(contexts.iter().copied().map(|context| BacktrackState { + block_start: site.site.block_start, + context, + sym: init_sym.clone(), + })); + + let mut visited: HashSet = HashSet::default(); + let mut processed_states = 0usize; + let mut pred_steps = 0usize; + + while let Some(state) = queue.pop_front() { + if processed_states >= MAX_STATES_PER_LOG || pred_steps >= MAX_PRED_STEPS_PER_LOG { + break; + } + processed_states += 1; + + let key = BacktrackKey { + block_start: state.block_start, + context: state.context, + sym: state.sym.clone(), + }; + if !visited.insert(key) { + continue; + } + + match state.sym { + StackSym::Other(pc) => { + resolve_topic_at_pc(code, pc, index, out); + } + StackSym::Before(n) => { + let continuations = + cont_cache.get_or_compute(code, index, state_cache, state.block_start, n); + for &(pred, ref exit_sym) in continuations { + if pred_steps >= MAX_PRED_STEPS_PER_LOG { + break; + } + pred_steps += 1; + + let context_reaches_pred = index + .contexts_reaching_block + .get(&pred) + .is_some_and(|set| set.contains(&state.context)); + if !context_reaches_pred { + continue; + 
} + + queue.push_back(BacktrackState { + block_start: pred, + context: state.context, + sym: exit_sym.clone(), + }); + } + } + StackSym::Pushed(_) | StackSym::Jumpdest(_) => {} + } + } +} + +/// Unified topic extraction: dispatches to push32/pushn/mload_codecopy based on opcode at `pc`. +fn resolve_topic_at_pc(code: &[u8], pc: usize, index: &CfgIndex, out: &mut HashSet) { + let Some(&opcode) = code.get(pc) else { + return; + }; + match opcode { + op::PUSH32 => { + if let Some(topic) = push32_value(code, pc) + && is_plausible_event_hash(&topic) + { + out.insert(topic); + } + } + op::PUSH5..=op::PUSH31 => { + if let Some(topic) = pushn_value(code, pc) + && is_plausible_event_hash(&topic) + { + out.insert(topic); + } + } + op::MLOAD => { + if let Some(block_start) = find_block_start(&index.blocks, pc) { + resolve_mload_codecopy(code, pc, block_start, out); + } + } + _ => {} + } +} diff --git a/src/evm/vm.rs b/src/evm/vm.rs index 411228e..7958f0e 100644 --- a/src/evm/vm.rs +++ b/src/evm/vm.rs @@ -81,6 +81,15 @@ where T: std::fmt::Debug + Clone + Eq, U: CallData, { + fn merge_labels(lhs: &Option, rhs: &Option) -> Option { + match (lhs, rhs) { + (Some(l), Some(r)) if l == r => Some(l.clone()), + (Some(l), None) => Some(l.clone()), + (None, Some(r)) => Some(r.clone()), + _ => None, + } + } + pub fn new(code: &'a [u8], calldata: &'a U) -> Self { Self { code, @@ -131,8 +140,11 @@ where let s1: U256 = (&raws1).into(); let (gas_used, res) = f(&raws0, s0, &raws1, s1); - - self.stack.push_uint(res); + let label = Self::merge_labels(&raws0.label, &raws1.label); + self.stack.push(Element { + data: res.to_be_bytes(), + label, + }); let mut ret = StepResult::new(op, gas_used); ret.args[0] = raws0; ret.args[1] = raws1; @@ -297,10 +309,13 @@ where op::ISZERO => { let raws0 = self.stack.pop()?; - self.stack.push_data(if raws0.data == VAL_0_B { - VAL_1_B - } else { - VAL_0_B + self.stack.push(Element { + data: if raws0.data == VAL_0_B { + VAL_1_B + } else { + VAL_0_B + }, + label: 
raws0.label.clone(), }); let mut ret = StepResult::new(op, 3); ret.args[0] = raws0; @@ -316,7 +331,10 @@ where op::NOT => { let raws0 = self.stack.pop()?; let v: U256 = (&raws0).into(); - self.stack.push_uint(!v); + self.stack.push(Element { + data: (!v).to_be_bytes(), + label: raws0.label.clone(), + }); let mut ret = StepResult::new(op, 3); ret.args[0] = raws0; Ok(ret) @@ -828,4 +846,40 @@ mod tests { assert_eq!(r, expected); } } + + #[test] + fn test_bop_label_propagation_single_source() { + let code = [op::EQ]; + let mut vm = Vm::new(&code, &DummyCallData {}); + vm.stack.push(Element { + data: U256::from(1).to_be_bytes(), + label: Some(7u8), + }); + vm.stack.push(Element { + data: U256::from(1).to_be_bytes(), + label: None, + }); + + assert!(vm.step().is_ok()); + let out = vm.stack.pop().unwrap(); + assert_eq!(out.label, Some(7u8)); + } + + #[test] + fn test_bop_label_propagation_conflict_clears_label() { + let code = [op::EQ]; + let mut vm = Vm::new(&code, &DummyCallData {}); + vm.stack.push(Element { + data: U256::from(1).to_be_bytes(), + label: Some(1u8), + }); + vm.stack.push(Element { + data: U256::from(1).to_be_bytes(), + label: Some(2u8), + }); + + assert!(vm.step().is_ok()); + let out = vm.stack.pop().unwrap(); + assert_eq!(out.label, None); + } } diff --git a/src/interface_js.rs b/src/interface_js.rs index 7ada504..2f30e8f 100644 --- a/src/interface_js.rs +++ b/src/interface_js.rs @@ -18,6 +18,7 @@ const DOC_CONTRACT: &'static str = r#" /** * Contains the analysis results of a contract * @property functions - Array of functions found in the contract. Not present if no functions were extracted. + * @property events - Array of event selectors found in the contract bytecode as hex strings. Not present if events were not extracted. * @property storage - Array of storage records found in the contract. Not present if storage layout was not extracted. 
* @property disassembled - Array of bytecode instructions, where each element is a tuple of [offset: number, instruction: string] * @property basicBlocks - Array of basic blocks found in the contract. Not present if basic blocks were not analyzed. @@ -27,6 +28,7 @@ const DOC_CONTRACT: &'static str = r#" */ export type Contract = { functions?: ContractFunction[], + events?: string[], storage?: StorageRecord[], disassembled?: [number, string][], basicBlocks?: [number, number][], @@ -36,6 +38,7 @@ export type Contract = { /// @typedef {Object} Contract /// @description Contains the analysis results of a contract /// @property {ContractFunction[]} [functions] - Array of functions found in the contract. Not present if no functions were extracted +/// @property {string[]} [events] - Array of event selectors found in the contract bytecode as hex strings. Not present if events were not extracted /// @property {StorageRecord[]} [storage] - Array of storage records found in the contract. Not present if storage layout was not extracted /// @property {Array>} [disassembled] - Array of bytecode instructions, where each element is [offset, instruction] /// @property {Array>} [basicBlocks] - Array of basic blocks found in the contract. Not present if basic blocks were not analyzed. 
@@ -238,6 +241,9 @@ struct ContractInfoArgs { #[serde(default, rename = "stateMutability")] state_mutability: bool, + #[serde(default)] + events: bool, + #[serde(default)] storage: bool, @@ -261,6 +267,7 @@ const DOC_CONTRACT_INFO: &'static str = r#" * @param args.selectors - When true, includes function selectors in the output * @param args.arguments - When true, includes function arguments information * @param args.stateMutability - When true, includes state mutability information for functions + * @param args.events - When true, includes event selectors found in the contract bytecode * @param args.storage - When true, includes contract storage layout information * @param args.disassemble - When true, includes disassembled bytecode * @param args.basicBlocks - When true, includes basic block analysis @@ -271,6 +278,7 @@ export function contractInfo(code: string, args: { selectors?: boolean, arguments?: boolean, stateMutability?: boolean, + events?: boolean, storage?: boolean, disassemble?: boolean, basicBlocks?: boolean, @@ -284,6 +292,7 @@ export function contractInfo(code: string, args: { /// @param {boolean} [args.selectors] - When true, includes function selectors in the output /// @param {boolean} [args.arguments] - When true, includes function arguments information /// @param {boolean} [args.stateMutability] - When true, includes state mutability information for functions +/// @param {boolean} [args.events] - When true, includes event selectors found in the contract bytecode /// @param {boolean} [args.storage] - When true, includes contract storage layout information /// @param {boolean} [args.disassemble] - When true, includes disassembled bytecode /// @param {boolean} [args.basicBlocks] - When true, includes basic block analysis @@ -305,6 +314,9 @@ pub fn contract_info(code: &str, args: JsValue) -> Result { if args.state_mutability { cargs = cargs.with_state_mutability(); } + if args.events { + cargs = cargs.with_events(); + } if args.storage { cargs = 
cargs.with_storage(); } diff --git a/src/interface_py.rs b/src/interface_py.rs index a6ca923..c8fd29d 100644 --- a/src/interface_py.rs +++ b/src/interface_py.rs @@ -205,6 +205,7 @@ mod evmole { #[pyclass(name = "Contract", get_all)] struct PyContract { functions: Option>, + events: Option>, storage: Option>, disassembled: Option>, basic_blocks: Option>, @@ -215,7 +216,7 @@ mod evmole { impl PyContract { fn __repr__(&self) -> String { format!( - "Contract(functions={}, storage={}, disassembled={}, basic_blocks={}, control_flow_graph={})", + "Contract(functions={}, events={}, storage={}, disassembled={}, basic_blocks={}, control_flow_graph={})", self.functions.as_ref().map_or_else( || "None".to_string(), |v| format!( @@ -226,6 +227,9 @@ mod evmole { .join(", ") ) ), + self.events + .as_ref() + .map_or_else(|| "None".to_string(), |v| format!("{v:?}")), self.storage.as_ref().map_or_else( || "None".to_string(), |v| format!( @@ -252,13 +256,14 @@ mod evmole { // {{{ contract_info #[pyfunction] - #[pyo3(signature = (code, *, selectors=false, arguments=false, state_mutability=false, storage=false, disassemble=false, basic_blocks=false, control_flow_graph=false))] + #[pyo3(signature = (code, *, selectors=false, arguments=false, state_mutability=false, events=false, storage=false, disassemble=false, basic_blocks=false, control_flow_graph=false))] #[allow(clippy::too_many_arguments)] fn contract_info( code: &Bound<'_, PyAny>, selectors: bool, arguments: bool, state_mutability: bool, + events: bool, storage: bool, disassemble: bool, basic_blocks: bool, @@ -276,6 +281,9 @@ mod evmole { if state_mutability { args = args.with_state_mutability(); } + if events { + args = args.with_events(); + } if storage { args = args.with_storage(); } @@ -359,8 +367,13 @@ mod evmole { .collect(), }); + let events = info + .events + .map(|evts| evts.into_iter().map(hex::encode).collect()); + Ok(PyContract { functions, + events, storage, disassembled: info.disassembled, basic_blocks: 
info.basic_blocks, diff --git a/src/interface_wasm.rs b/src/interface_wasm.rs index 28c0877..f49373a 100644 --- a/src/interface_wasm.rs +++ b/src/interface_wasm.rs @@ -38,6 +38,7 @@ const OPT_STORAGE: u32 = 8; const OPT_DISASSEMBLE: u32 = 16; const OPT_BASIC_BLOCKS: u32 = 32; const OPT_CONTROL_FLOW_GRAPH: u32 = 64; +const OPT_EVENTS: u32 = 128; /// Analyze EVM bytecode and return contract information as JSON. /// @@ -72,6 +73,9 @@ pub extern "C" fn contract_info(code_ptr: *const u8, code_len: usize, opts: u32) if opts & OPT_STATE_MUTABILITY != 0 { args = args.with_state_mutability(); } + if opts & OPT_EVENTS != 0 { + args = args.with_events(); + } if opts & OPT_STORAGE != 0 { args = args.with_storage(); } @@ -120,6 +124,8 @@ struct ContractResult { #[serde(skip_serializing_if = "Option::is_none")] functions: Option>, #[serde(skip_serializing_if = "Option::is_none")] + events: Option>, + #[serde(skip_serializing_if = "Option::is_none")] storage: Option>, #[serde(skip_serializing_if = "Option::is_none")] disassembled: Option>, @@ -259,8 +265,13 @@ impl ContractResult { .collect(), }); + let events = info + .events + .map(|evts| evts.into_iter().map(hex::encode).collect()); + ContractResult { functions, + events, storage, disassembled: info.disassembled, basic_blocks: info.basic_blocks, diff --git a/src/lib.rs b/src/lib.rs index 59abb63..17291ad 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,12 +7,16 @@ pub use contract_info::contract_info; pub use contract_info::{Contract, ContractInfoArgs, Function}; +pub use events::{ + EventLogClass, EventLogClassRecord, EventSelector, contract_event_log_classes, contract_events, +}; pub use storage::StorageRecord; mod arguments; mod collections; mod contract_info; pub mod control_flow_graph; +mod events; mod evm; mod selectors; mod state_mutability; diff --git a/src/serialize.rs b/src/serialize.rs index 1b329df..dffaec7 100644 --- a/src/serialize.rs +++ b/src/serialize.rs @@ -3,7 +3,9 @@ use std::collections::BTreeMap; use 
alloy_primitives::hex; use serde::{Serializer, ser::SerializeSeq}; -use crate::{DynSolType, Selector, Slot, StateMutability, control_flow_graph::Block}; +use crate::{ + DynSolType, Selector, Slot, StateMutability, control_flow_graph::Block, events::EventSelector, +}; pub fn selector(val: &Selector, serializer: S) -> Result { serializer.serialize_str(&hex::encode(val)) @@ -48,6 +50,22 @@ pub fn vec_selector(val: &Vec, serializer: S) -> Result s.end() } +pub fn events( + val: &Option>, + serializer: S, +) -> Result { + match val { + Some(evts) => { + let mut s = serializer.serialize_seq(Some(evts.len()))?; + for evt in evts { + s.serialize_element(&hex::encode(evt))?; + } + s.end() + } + None => serializer.serialize_none(), + } +} + pub fn blocks( val: &BTreeMap, serializer: S,
Dataset