I am having a go at running inference and evaluation for this model, and I am running into a TypeError in GPTLMHeadModel:
In [1]: import torch
...: from transformers import AutoTokenizer
...: from based.models.gpt import GPTLMHeadModel
...:
...: tokenizer = AutoTokenizer.from_pretrained("gpt2")
...: model = GPTLMHeadModel.from_pretrained_hf("hazyresearch/based-360m").to("cuda", dtype=torch.float16)
tokenizer_config.json: 100%|███████████████████████████████████████████| 26.0/26.0 [00:00<00:00, 260kB/s]
config.json: 100%|██████████████████████████████████████████████████████| 665/665 [00:00<00:00, 8.64MB/s]
vocab.json: 100%|███████████████████████████████████████████████████| 1.04M/1.04M [00:00<00:00, 12.1MB/s]
merges.txt: 100%|█████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 8.99MB/s]
tokenizer.json: 100%|███████████████████████████████████████████████| 1.36M/1.36M [00:00<00:00, 17.8MB/s]
config.json: 100%|██████████████████████████████████████████████████| 2.86k/2.86k [00:00<00:00, 36.7MB/s]
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[1], line 6
3 from based.models.gpt import GPTLMHeadModel
5 tokenizer = AutoTokenizer.from_pretrained("gpt2")
----> 6 model = GPTLMHeadModel.from_pretrained_hf("hazyresearch/based-360m").to("cuda", dtype=torch.float16)
File /based/models/gpt.py:468, in GPTPreTrainedModel.from_pretrained_hf(cls, pretrained_model_name, device, **kwargs)
466 config_data = load_config_hf(pretrained_model_name)
467 config = GPT2Config(**config_data)
--> 468 model = cls(config, device=device, **kwargs)
469 state_dict = load_state_dict_hf(pretrained_model_name, device=device)
471 # remove the 'model.' prefix from the keys
File /based/models/gpt.py:741, in GPTLMHeadModel.__init__(self, config, process_group, device, dtype)
739 super().__init__(config)
740 self.process_group = process_group
--> 741 self.transformer = GPTModel(config, process_group=process_group, **factory_kwargs)
742 self.tie_word_embeddings = getattr(config, "tie_word_embeddings", True)
743 lm_head_bias = getattr(config, "lm_head_bias", False)
File /based/models/gpt.py:585, in GPTModel.__init__(self, config, process_group, device, dtype)
569 self.embeddings = ParallelGPT2Embeddings(
570 config.hidden_size,
571 vocab_size,
(...)
575 **factory_kwargs,
576 )
578 # We change the order of dropout, residual and layer norm:
579 # Instead of LN -> Attn / MLP -> Dropout -> Add, we do:
580 # Dropout -> Add -> LN -> Attn / MLP, returning both the residual branch (output of Add) and
581 # the main branch (output of MLP). The model definition is unchanged, but the mapping of the
582 # nn.Dropout probabilities are changed.
583 # This is for performance reason: we can fuse dropout + add + layer_norm.
584 self.layers = nn.ModuleList(
--> 585 [
586 create_block(config, layer_idx=i, process_group=process_group, **factory_kwargs)
587 for i in range(config.num_hidden_layers)
588 ]
589 )
590 self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False)
591 if self.fused_dropout_add_ln:
File /based/models/gpt.py:586, in <listcomp>(.0)
569 self.embeddings = ParallelGPT2Embeddings(
570 config.hidden_size,
571 vocab_size,
(...)
575 **factory_kwargs,
576 )
578 # We change the order of dropout, residual and layer norm:
579 # Instead of LN -> Attn / MLP -> Dropout -> Add, we do:
580 # Dropout -> Add -> LN -> Attn / MLP, returning both the residual branch (output of Add) and
581 # the main branch (output of MLP). The model definition is unchanged, but the mapping of the
582 # nn.Dropout probabilities are changed.
583 # This is for performance reason: we can fuse dropout + add + layer_norm.
584 self.layers = nn.ModuleList(
585 [
--> 586 create_block(config, layer_idx=i, process_group=process_group, **factory_kwargs)
587 for i in range(config.num_hidden_layers)
588 ]
589 )
590 self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False)
591 if self.fused_dropout_add_ln:
File /based/models/gpt.py:371, in create_block(config, layer_idx, process_group, device, dtype, **kwargs)
369 mlp_cls = create_mlp_cls(config, layer_idx, process_group=process_group, **factory_kwargs)
370 use_rms_norm = getattr(config, "rms_norm", False)
--> 371 norm_cls = partial(
372 nn.LayerNorm if not use_rms_norm else RMSNorm,
373 eps=config.layer_norm_epsilon,
374 **factory_kwargs,
375 )
376 # TD [2022-07-30]: Force residual in fp32, seems to make fp16 training more stable
377 residual_in_fp32 = getattr(config, "residual_in_fp32", False)
TypeError: the first argument must be callable
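Looking at the failing line, partial(nn.LayerNorm if not use_rms_norm else RMSNorm, ...) should only raise "the first argument must be callable" if RMSNorm is not actually a class at that point, i.e. if it is None. My (unverified) guess is that gpt.py imports the fused RMSNorm kernel inside a try/except and silently falls back to None when the optional dependency is missing, so a config with rms_norm enabled then hands None to partial. A minimal standalone sketch of that failure mode (the import path and fallback here are my assumptions, not necessarily the repo's actual code):

from functools import partial
import torch.nn as nn

# Hypothetical optional import, mirroring the fallback style I suspect
# gpt.py uses; the real module path in the repo may differ.
try:
    from flash_attn.ops.rms_norm import RMSNorm
except ImportError:
    RMSNorm = None  # swallowed silently if the fused kernels aren't installed

use_rms_norm = True  # what the based-360m config appears to request

# Reproduces the same error: functools.partial rejects None as its first argument.
norm_cls = partial(
    nn.LayerNorm if not use_rms_norm else RMSNorm,
    eps=1e-5,
)
# TypeError: the first argument must be callable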
For reproducibility, I have been running this in a Docker container built from the following Dockerfile:
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
RUN apt-get update && apt-get install -y \
apt-utils \
python3.10 \
python3-pip \
git \
&& rm -rf /var/lib/apt/lists/*
RUN pip install --upgrade pip
RUN pip install \
torch==2.1.2 \
torchvision==0.16.2 \
torchaudio==2.1.2 \
--index-url https://download.pytorch.org/whl/cu118 # due to observed causal-conv1d dependency
RUN pip install \
jupyter==1.0.0 \
hydra-core==1.3.2
RUN pip install jupyter
COPY . .
RUN pip install .
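In case it helps narrow things down, this is the check I was planning to run inside the container to see whether the optional fused-norm symbols resolved at import time (the attribute names are guesses based on the traceback, not something I have confirmed in the repo, so the getattr default keeps it harmless either way):

import importlib

# Inspect the module the traceback points at and report whether the
# optional fused-norm symbols resolved; a None here would explain the TypeError.
gpt = importlib.import_module("based.models.gpt")
for name in ("RMSNorm", "dropout_add_layer_norm"):
    print(name, "->", getattr(gpt, name, "<not defined>"))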
Any idea what could be going wrong here?