
TypeError in GPTLMHeadModel #3

@axelmagn

Description

I am having a go at running inference and evaluation for this model, and running into a TypeError in GPTLMHeadModel:

In [1]: import torch
   ...: from transformers import AutoTokenizer
   ...: from based.models.gpt import GPTLMHeadModel
   ...: 
   ...: tokenizer = AutoTokenizer.from_pretrained("gpt2")
   ...: model = GPTLMHeadModel.from_pretrained_hf("hazyresearch/based-360m").to("cuda", dtype=torch.float16)
tokenizer_config.json: 100%|███████████████████████████████████████████| 26.0/26.0 [00:00<00:00, 260kB/s]
config.json: 100%|██████████████████████████████████████████████████████| 665/665 [00:00<00:00, 8.64MB/s]
vocab.json: 100%|███████████████████████████████████████████████████| 1.04M/1.04M [00:00<00:00, 12.1MB/s]
merges.txt: 100%|█████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 8.99MB/s]
tokenizer.json: 100%|███████████████████████████████████████████████| 1.36M/1.36M [00:00<00:00, 17.8MB/s]
config.json: 100%|██████████████████████████████████████████████████| 2.86k/2.86k [00:00<00:00, 36.7MB/s]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[1], line 6
      3 from based.models.gpt import GPTLMHeadModel
      5 tokenizer = AutoTokenizer.from_pretrained("gpt2")
----> 6 model = GPTLMHeadModel.from_pretrained_hf("hazyresearch/based-360m").to("cuda", dtype=torch.float16)

File /based/models/gpt.py:468, in GPTPreTrainedModel.from_pretrained_hf(cls, pretrained_model_name, device, **kwargs)
    466 config_data = load_config_hf(pretrained_model_name)
    467 config = GPT2Config(**config_data)
--> 468 model = cls(config, device=device, **kwargs)
    469 state_dict = load_state_dict_hf(pretrained_model_name, device=device)
    471 # remove the 'model.' prefix from the keys

File /based/models/gpt.py:741, in GPTLMHeadModel.__init__(self, config, process_group, device, dtype)
    739 super().__init__(config)
    740 self.process_group = process_group
--> 741 self.transformer = GPTModel(config, process_group=process_group, **factory_kwargs)
    742 self.tie_word_embeddings = getattr(config, "tie_word_embeddings", True)
    743 lm_head_bias = getattr(config, "lm_head_bias", False)

File /based/models/gpt.py:585, in GPTModel.__init__(self, config, process_group, device, dtype)
    569     self.embeddings = ParallelGPT2Embeddings(
    570         config.hidden_size,
    571         vocab_size,
   (...)
    575         **factory_kwargs,
    576     )
    578 # We change the order of dropout, residual and layer norm:
    579 # Instead of LN -> Attn / MLP -> Dropout -> Add, we do:
    580 # Dropout -> Add -> LN -> Attn / MLP, returning both the residual branch (output of Add) and
    581 # the main branch (output of MLP). The model definition is unchanged, but the mapping of the
    582 # nn.Dropout probabilities are changed.
    583 # This is for performance reason: we can fuse dropout + add + layer_norm.
    584 self.layers = nn.ModuleList(
--> 585     [
    586         create_block(config, layer_idx=i, process_group=process_group, **factory_kwargs)
    587         for i in range(config.num_hidden_layers)
    588     ]
    589 )
    590 self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False)
    591 if self.fused_dropout_add_ln:

File /based/models/gpt.py:586, in <listcomp>(.0)
    569     self.embeddings = ParallelGPT2Embeddings(
    570         config.hidden_size,
    571         vocab_size,
   (...)
    575         **factory_kwargs,
    576     )
    578 # We change the order of dropout, residual and layer norm:
    579 # Instead of LN -> Attn / MLP -> Dropout -> Add, we do:
    580 # Dropout -> Add -> LN -> Attn / MLP, returning both the residual branch (output of Add) and
    581 # the main branch (output of MLP). The model definition is unchanged, but the mapping of the
    582 # nn.Dropout probabilities are changed.
    583 # This is for performance reason: we can fuse dropout + add + layer_norm.
    584 self.layers = nn.ModuleList(
    585     [
--> 586         create_block(config, layer_idx=i, process_group=process_group, **factory_kwargs)
    587         for i in range(config.num_hidden_layers)
    588     ]
    589 )
    590 self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False)
    591 if self.fused_dropout_add_ln:

File /based/models/gpt.py:371, in create_block(config, layer_idx, process_group, device, dtype, **kwargs)
    369 mlp_cls = create_mlp_cls(config, layer_idx, process_group=process_group, **factory_kwargs)
    370 use_rms_norm = getattr(config, "rms_norm", False)
--> 371 norm_cls = partial(
    372     nn.LayerNorm if not use_rms_norm else RMSNorm,
    373     eps=config.layer_norm_epsilon,
    374     **factory_kwargs,
    375 )
    376 # TD [2022-07-30]: Force residual in fp32, seems to make fp16 training more stable
    377 residual_in_fp32 = getattr(config, "residual_in_fp32", False)

TypeError: the first argument must be callable
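
Digging into the error itself: functools.partial raises this exact TypeError when its first positional argument is not callable, so my reading (an assumption on my part, not something I have verified against the repo) is that RMSNorm ends up as None at gpt.py:371, e.g. because an optional fused-kernel import silently failed:

import torch.nn as nn
from functools import partial

# Sketch of my reading of gpt.py:371, not the repo's actual code: if the optional
# RMSNorm import failed, RMSNorm would be None while the config requests rms_norm=True.
RMSNorm = None
use_rms_norm = True

norm_cls = partial(
    nn.LayerNorm if not use_rms_norm else RMSNorm,  # evaluates to None here
    eps=1e-5,
)
# -> TypeError: the first argument must be callable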

For reproducibility, I have been running this in a Docker container:

FROM nvidia/cuda:11.8.0-devel-ubuntu22.04

RUN apt-get update && apt-get install -y \
    apt-utils \
    python3.10 \
    python3-pip \
    git \
    && rm -rf /var/lib/apt/lists/*

RUN pip install --upgrade pip
RUN pip install \
    torch==2.1.2 \
    torchvision==0.16.2 \
    torchaudio==2.1.2 \
    --index-url https://download.pytorch.org/whl/cu118 # due to observed causal-conv1d dependency

RUN pip install \
    jupyter==1.0.0 \
    hydra-core==1.3.2

RUN pip install jupyter
COPY . .
RUN pip install .
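
As an additional datapoint, here is a small probe I can run inside the container to see which optional fused-kernel packages are importable (the module names are my guess at the optional dependencies involved, not something I have confirmed from the repo):

# Probe the optional CUDA-kernel dependencies; the module names are assumptions.
for mod in ("flash_attn", "flash_attn.ops.rms_norm", "causal_conv1d", "dropout_layer_norm"):
    try:
        __import__(mod)
        print(f"{mod}: OK")
    except ImportError as exc:
        print(f"{mod}: MISSING ({exc})")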

Any idea what could be going wrong here?
