diff --git a/.gitignore b/.gitignore index f6086cc..6d03ca1 100644 --- a/.gitignore +++ b/.gitignore @@ -12,7 +12,7 @@ examples/archive */experimental.py docs build - +.env # gemini-cli settings .gemini/ # GitHub App credentials diff --git a/consent/__init__.py b/consent/__init__.py index 5134d3b..8d609e9 100644 --- a/consent/__init__.py +++ b/consent/__init__.py @@ -1 +1 @@ -from .consent import * +from .consent import ConSent, OpenAIEncoder, Config, HYPERPARAMETERS, wandb diff --git a/consent/consent.py b/consent/consent.py index ce2072d..bdbaa57 100644 --- a/consent/consent.py +++ b/consent/consent.py @@ -20,6 +20,7 @@ from wandb.keras import WandbCallback import consent.utils as utils +from consent.openai_encoder import OpenAIEncoder physical_devices = tf.config.list_physical_devices('GPU') @@ -37,7 +38,8 @@ 'sentence-transformers/LaBSE', 'https://tfhub.dev/google/LEALLA/LEALLA-small/1', 'https://tfhub.dev/google/LEALLA/LEALLA-base/1', - 'https://tfhub.dev/google/LEALLA/LEALLA-large/1' + 'https://tfhub.dev/google/LEALLA/LEALLA-large/1', + 'openai/text-embedding-3-small' ] HYPERPARAMETERS = [ @@ -213,7 +215,8 @@ def make_model( consent_hl_units: int ): # Ensure language_featurizer is compatible with current implementation - assert language_featurizer in SUPPORTED_LANGUAGE_FEATURIZERS, \ + is_openai = language_featurizer.startswith('openai/') + assert is_openai or language_featurizer in SUPPORTED_LANGUAGE_FEATURIZERS, \ "`language_featurizer` not supported " + \ f"(available: {SUPPORTED_LANGUAGE_FEATURIZERS})." 
@@ -254,6 +257,11 @@ def make_model( layer.trainable=False for w in layer.weights: w._trainable=False encoder = SBert(tokenizer, model)(text_input) + elif is_openai: + if not os.environ.get("OPENAI_API_KEY"): + raise RuntimeError("OPENAI_API_KEY not set") + model_name = language_featurizer.split('openai/')[-1] + encoder = OpenAIEncoder(model_name, name="sent_encoder")(text_input) else: encoder = HubWrapper(language_featurizer, name="sent_encoder")(text_input) @@ -289,6 +297,7 @@ def make_model( outputs=[sent_output, consent_output]) + def prepare_inputs(self, dialog_data: pd.DataFrame): """ Extracts texts and contexts (inputs for ConSent.model) from diff --git a/consent/openai_encoder.py b/consent/openai_encoder.py new file mode 100644 index 0000000..94f3cc2 --- /dev/null +++ b/consent/openai_encoder.py @@ -0,0 +1,43 @@ +import os +import openai +import tensorflow as tf +import numpy as np + +class OpenAIEncoder(tf.keras.layers.Layer): + def __init__(self, model_name, **kwargs): + super(OpenAIEncoder, self).__init__(**kwargs) + self.model_name = model_name + self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) + # Assuming text-embedding-3-small, which has a dimension of 1536 + self.embedding_dim = 1536 + + def _get_embeddings(self, inputs): + embeddings = [] + for text_tensor in inputs: + # NOTE(review): texts are silently truncated to 140 chars before embedding -- confirm this limit is intended + text = text_tensor.numpy().decode('utf-8')[:140] + response = self.client.embeddings.create( + input=text, + model=self.model_name + ) + embeddings.append(response.data[0].embedding) + return np.array(embeddings, dtype=np.float32) + + def call(self, inputs): + embeddings = tf.py_function( + self._get_embeddings, + [inputs], + tf.float32 + ) + embeddings.set_shape((None, self.embedding_dim)) + return embeddings + + def compute_output_shape(self, input_shape): + return (input_shape[0], self.embedding_dim) + + def get_config(self): + config = super().get_config() + config.update({ + "model_name": self.model_name + }) + return config diff --git a/requirements.txt b/requirements.txt index 
fdab7fb..e3bb4fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,5 @@ tensorflow-text==2.19.0 tensorflow-hub wandb==0.12.11 protobuf<=3.20.3 +openai +tiktoken diff --git a/tests/test_consent.py b/tests/test_consent.py index b5b99b2..34bd424 100644 --- a/tests/test_consent.py +++ b/tests/test_consent.py @@ -2,7 +2,10 @@ +import os import time import json import unittest +from unittest.mock import patch, MagicMock import pandas as pd +import numpy as np from consent import Config, ConSent import consent.utils as utils @@ -11,7 +14,7 @@ class TestConSent(unittest.TestCase): def setUp(self): - self.data_df = pd.read_csv(\ + self.data_df = pd.read_csv( "tests/test_data/Chats-EN-ConSent_dummy_data.csv") self.data_df = self.data_df.drop(columns=['Unnamed: 0']) self.data_df = self.data_df.rename(columns={ @@ -71,7 +74,44 @@ def test_training_and_inference(self): print("\n\nGenerating 'sent' and 'consent' predictions using consent.predict_proba()...\n ", pred_message) - + @patch('consent.openai_encoder.openai.OpenAI') + def test_train_with_openai_featurizer(self, mock_openai_class): + # Mock the OpenAI client and its response + mock_client = MagicMock() + mock_embedding = MagicMock() + mock_embedding.embedding = np.random.rand(1536).tolist() # text-embedding-3-small dimension + mock_response = MagicMock() + mock_response.data = [mock_embedding] + mock_client.embeddings.create.return_value = mock_response + mock_openai_class.return_value = mock_client + + # Set a dummy API key + os.environ['OPENAI_API_KEY'] = 'test_key' + + # Define config + config = Config(**{ + "dataset_name": "Chats-EN-ConSent_dummy_data", + "code_name": "L1", + "codes": ["OFF", "COO", "DOM"], + "default_code": "OFF", + "language_featurizer": "openai/text-embedding-3-small", + "sent_hl_units": 10, + "sent_dropout": 0.5, + "consent_hl_units": 5, + "lags": 2, + "max_epochs": 1, + "callback_patience": 1, + "learning_rate": 1e-3, + "batch_size": 32}) + + # Initialize and train + consent_model = 
ConSent(config) + consent_model.train(self.data_df.head(10)) + + # Check if the mock was called + self.assertTrue(mock_client.embeddings.create.called) + if __name__ == '__main__': unittest.main() +