From 04b5ae5ca35baf42d539f8d826361259b9a652be Mon Sep 17 00:00:00 2001
From: Hongwei
Date: Sun, 22 Feb 2026 18:45:51 -0500
Subject: [PATCH 1/2] fix: handle adjacent @@ variable tokens in split_words()

When variables appear adjacent without spaces in decompiled code
(e.g., func(a,b,c)), the @@ placeholder tokens merge into one word.
re.search() only matched the first occurrence, silently losing the
rest and causing a holder/mask count mismatch that discards all
predictions. Replace re.search() with re.finditer() to extract all
@@ patterns.
---
 varbert/model.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/varbert/model.py b/varbert/model.py
index e04c59c..0234e64 100644
--- a/varbert/model.py
+++ b/varbert/model.py
@@ -107,7 +107,7 @@ def varec_init(self):
             str(self.model_base_dir),
             avar_vocab_size = self.vocab_size,
             from_tf=False,
-            config=config
+            config=config
         )
         model.to(device)
 
@@ -124,13 +124,16 @@ def split_words(text: str):
     words = text.replace("\n", " ").split(" ")
     r = []
     for w in words:
-        m = re.search(r"@@[^\s@]+@@[^\s@]+@@", w)
-        if m is not None:
-            if m.start() > 0:
-                r.append(w[: m.start()])
-            r.append(w[m.start(): m.end()])
-            if m.end() < len(w):
-                r.append(w[m.end():])
+        matches = list(re.finditer(r"@@[^\s@]+@@[^\s@]+@@", w))
+        if matches:
+            pos = 0
+            for m in matches:
+                if m.start() > pos:
+                    r.append(w[pos: m.start()])
+                r.append(w[m.start(): m.end()])
+                pos = m.end()
+            if pos < len(w):
+                r.append(w[pos:])
         else:
             r.append(w)
     r = [w for w in r if len(w) > 0]
@@ -206,7 +209,7 @@ def preprocess_word_mask(self, ftext, tokenizer):
                 tpwords.append(vocab[t])
                 towords.append(vocab[t])
                 pos += 1
-
+
     assert len(tpwords) == len(towords)
     assert None not in tpwords
     assert None not in towords
@@ -411,4 +414,3 @@ def forward(
     "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
     "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
 }
-
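A quick way to see the failure mode this patch fixes is to run the regex
from split_words() over a word containing two fused placeholders. This is
a minimal standalone sketch; the word string and the @@var_1@@/@@var_2@@
names are hypothetical, and only the regex is taken from the hunk above.

    import re

    # Regex copied from split_words(); everything else in this sketch
    # is a hypothetical reproduction of the bug.
    PATTERN = r"@@[^\s@]+@@[^\s@]+@@"

    # Two placeholder tokens fused into one word, as decompiled code
    # like func(a,b) produces when arguments have no spaces.
    w = "func(@@var_1@@a@@,@@var_2@@b@@)"

    # Old behavior: re.search() finds only the first placeholder, so
    # the remainder ",@@var_2@@b@@)" stays as one unsplit chunk and
    # the second placeholder is never counted against the masks.
    m = re.search(PATTERN, w)
    print(m.group())  # @@var_1@@a@@

    # New behavior: re.finditer() yields every placeholder in the word.
    print([x.group() for x in re.finditer(PATTERN, w)])
    # ['@@var_1@@a@@', '@@var_2@@b@@']
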
From 09d3e8cebd5fc21598cbb32f7085795cea8c1c36 Mon Sep 17 00:00:00 2001
From: mahaloz
Date: Sun, 22 Feb 2026 17:06:33 -0700
Subject: [PATCH 2/2] fix broken tests with pin

---
 setup.cfg           | 2 +-
 varbert/__init__.py | 2 +-
 varbert/model.py    | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 38eb4b5..13bd5f6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -15,7 +15,7 @@ long_description_content_type = text/markdown
 [options]
 install_requires =
     torch
-    transformers
+    transformers>=5.2.0
     tqdm
     dailalib
     libbs>=1.18.1
diff --git a/varbert/__init__.py b/varbert/__init__.py
index e0d4da6..8fd0b41 100644
--- a/varbert/__init__.py
+++ b/varbert/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "2.3.0"
+__version__ = "2.3.1"
 
 import importlib.resources
 import tarfile
diff --git a/varbert/model.py b/varbert/model.py
index 0234e64..7545aba 100644
--- a/varbert/model.py
+++ b/varbert/model.py
@@ -116,7 +116,7 @@ def varec_init(self):
     @staticmethod
     def create_inputs_for_model(code_txt, tokenizer):
         input_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(code_txt))
-        input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)
+        input_ids = [tokenizer.bos_token_id] + input_ids + [tokenizer.eos_token_id]
         return torch.tensor(input_ids, dtype=torch.long)
 
     @staticmethod
@@ -283,7 +283,7 @@ def process(self, code: str):
         # _code = "\n".join(_code_lines)
         input_ids = self.preprocess_word_mask(_code, tokenizer)[0]
-        input_ids_with_special_tokens = tokenizer.build_inputs_with_special_tokens(input_ids)
+        input_ids_with_special_tokens = [tokenizer.bos_token_id] + input_ids + [tokenizer.eos_token_id]
 
         if len(input_ids_with_special_tokens) < 800:
             # padding
             padded_input_ids = input_ids_with_special_tokens[:-1] + [1] * 800 + [2]
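For context on the model.py change: on RoBERTa-style tokenizers,
build_inputs_with_special_tokens() wrapped a single sequence as
<s> ... </s>, which is exactly bos + ids + eos, so the manual
construction preserves the old behavior while dropping the helper call.
A minimal sketch, assuming a RoBERTa-style tokenizer; the roberta-base
checkpoint and sample string are hypothetical stand-ins for VarBERT's
bundled tokenizer.

    from transformers import AutoTokenizer

    # Hypothetical checkpoint standing in for VarBERT's local tokenizer.
    tok = AutoTokenizer.from_pretrained("roberta-base")

    ids = tok.convert_tokens_to_ids(tok.tokenize("int x = 0;"))

    # Manual construction from this patch: <s> + tokens + </s>.
    manual = [tok.bos_token_id] + ids + [tok.eos_token_id]

    # On transformers versions that still expose the helper for this
    # tokenizer, the two forms agree for a single sequence.
    assert manual == tok.build_inputs_with_special_tokens(ids)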