From 41aabcebf6446a761ce2f469228343b7c0f271cf Mon Sep 17 00:00:00 2001
From: rakaytariq
Date: Fri, 4 Apr 2025 01:44:25 +0500
Subject: [PATCH] added example of how to pass text and padding in NLP processing.

---
Review note: the example body was revised during review (work moved into
main() behind a __main__ guard, comments clarified). The blob hash on the
"index" line below is from the original submission and was not regenerated.

 ffcv/tokenizer_example.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 ffcv/tokenizer_example.py

diff --git a/ffcv/tokenizer_example.py b/ffcv/tokenizer_example.py
new file mode 100644
index 00000000..08a93177
--- /dev/null
+++ b/ffcv/tokenizer_example.py
@@ -0,0 +1,31 @@
+"""Example: tokenize a batch of sentences with padding and truncation.
+
+Uses the Hugging Face ``transformers`` tokenizer for ``bert-base-uncased``
+and prints the resulting batch encoding.
+"""
+from transformers import AutoTokenizer
+
+
+def main():
+    """Tokenize two sample sentences and print the padded batch."""
+    # Load a pre-trained tokenizer (BERT in this case)
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+
+    # Sample input text
+    text = ["Hello, how are you?", "I am learning NLP with Hugging Face!"]
+
+    # Tokenize text with padding
+    tokenized_output = tokenizer(
+        text,
+        padding=True,         # Pad to the longest sequence in the batch
+        truncation=True,      # Truncate sequences longer than max_length
+        max_length=16,        # Upper bound on sequence length
+        return_tensors="pt",  # Return PyTorch tensors (requires torch)
+    )
+
+    # Print tokenized output
+    print(tokenized_output)
+
+
+if __name__ == "__main__":
+    main()