Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ examples/archive
*/experimental.py
docs
build

.env
# gemini-cli settings
.gemini/
# GitHub App credentials
Expand Down
2 changes: 1 addition & 1 deletion consent/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .consent import *
from .consent import ConSent, OpenAIEncoder, Config, HYPERPARAMETERS, wandb
12 changes: 10 additions & 2 deletions consent/consent.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from wandb.keras import WandbCallback

import consent.utils as utils
from consent.openai_encoder import OpenAIEncoder


physical_devices = tf.config.list_physical_devices('GPU')
Expand All @@ -37,7 +38,8 @@
'sentence-transformers/LaBSE',
'https://tfhub.dev/google/LEALLA/LEALLA-small/1',
'https://tfhub.dev/google/LEALLA/LEALLA-base/1',
'https://tfhub.dev/google/LEALLA/LEALLA-large/1'
'https://tfhub.dev/google/LEALLA/LEALLA-large/1',
'openai'
]

HYPERPARAMETERS = [
Expand Down Expand Up @@ -213,7 +215,8 @@ def make_model(
consent_hl_units: int
):
# Ensure language_featurizer is compatible with current implementation
assert language_featurizer in SUPPORTED_LANGUAGE_FEATURIZERS, \
is_openai = language_featurizer.startswith('openai/')
assert is_openai or language_featurizer in SUPPORTED_LANGUAGE_FEATURIZERS, \
"`language_featurizer` not supported " + \
f"(available: {SUPPORTED_LANGUAGE_FEATURIZERS})."

Expand Down Expand Up @@ -254,6 +257,10 @@ def make_model(
layer.trainable=False
for w in layer.weights: w._trainable=False
encoder = SBert(tokenizer, model)(text_input)
elif is_openai:
assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY not set"
model_name = language_featurizer.split('openai/')[-1]
encoder = OpenAIEncoder(model_name, name="sent_encoder")(text_input)
else:
encoder = HubWrapper(language_featurizer, name="sent_encoder")(text_input)

Expand Down Expand Up @@ -289,6 +296,7 @@ def make_model(
outputs=[sent_output, consent_output])



def prepare_inputs(self, dialog_data: pd.DataFrame):
"""
Extracts texts and contexts (inputs for ConSent.model) from
Expand Down
42 changes: 42 additions & 0 deletions consent/openai_encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import os
import openai
import tensorflow as tf
import numpy as np

class OpenAIEncoder(tf.keras.layers.Layer):
    """Keras layer that embeds string tensors via the OpenAI embeddings API.

    Maps a (batch,) tensor of UTF-8 strings to a (batch, embedding_dim)
    float32 tensor by calling the OpenAI embeddings endpoint. Reads the
    API key from the OPENAI_API_KEY environment variable at construction.
    """

    def __init__(self, model_name, **kwargs):
        """
        Args:
            model_name: OpenAI embedding model id
                (e.g. "text-embedding-3-small").
            **kwargs: forwarded to tf.keras.layers.Layer (e.g. ``name``).
        """
        super().__init__(**kwargs)
        self.model_name = model_name
        self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
        # NOTE(review): dimension is hard-coded for text-embedding-3-small;
        # other models (e.g. text-embedding-3-large -> 3072) would require
        # this to be parameterized.
        self.embedding_dim = 1536

    def _get_embeddings(self, inputs):
        """Embed each scalar string tensor in `inputs`; returns float32 array.

        One API request is issued per text so result order trivially matches
        input order. TODO(review): the embeddings endpoint accepts a list of
        inputs, so a single batched request would cut round-trips from
        O(batch) to 1 — deferred because current callers/tests assume one
        embedding per response.
        """
        embeddings = []
        for text_tensor in inputs:
            # Truncate to 140 characters to bound request size/cost.
            text = text_tensor.numpy().decode('utf-8')[:140]
            response = self.client.embeddings.create(
                input=text,
                model=self.model_name
            )
            embeddings.append(response.data[0].embedding)
        # reshape keeps an empty batch at shape (0, embedding_dim);
        # np.array([]) alone is shape (0,) and would break set_shape below.
        return np.array(embeddings, dtype=np.float32).reshape(
            -1, self.embedding_dim)

    def call(self, inputs):
        # py_function escapes graph mode so arbitrary Python (network I/O)
        # can run per batch; static shape info is lost and must be restored.
        embeddings = tf.py_function(
            self._get_embeddings,
            [inputs],
            tf.float32
        )
        embeddings.set_shape((None, self.embedding_dim))
        return embeddings

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.embedding_dim)

    def get_config(self):
        """Serialize constructor args so the layer can be re-created.

        The client is rebuilt from the environment in __init__, so only
        `model_name` needs to be stored.
        """
        config = super().get_config()
        config.update({
            "model_name": self.model_name
        })
        return config
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@ tensorflow-text==2.19.0
tensorflow-hub
wandb==0.12.11
protobuf<=3.20.3
openai
tiktoken
43 changes: 41 additions & 2 deletions tests/test_consent.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
import time
import json
import unittest
from unittest.mock import patch, MagicMock
import pandas as pd
import numpy as np

from consent import Config, ConSent
import consent.utils as utils
Expand All @@ -11,7 +13,7 @@
class TestConSent(unittest.TestCase):

def setUp(self):
self.data_df = pd.read_csv(\
self.data_df = pd.read_csv(
"tests/test_data/Chats-EN-ConSent_dummy_data.csv")
self.data_df = self.data_df.drop(columns=['Unnamed: 0'])
self.data_df = self.data_df.rename(columns={
Expand Down Expand Up @@ -71,7 +73,44 @@ def test_training_and_inference(self):

print("\n\nGenerating 'sent' and 'consent' predictions using consent.predict_proba()...\n ", pred_message)


@patch('consent.openai_encoder.openai.OpenAI')
def test_train_with_openai_featurizer(self, mock_openai_class):
    """Train end-to-end with an 'openai/...' featurizer, API fully mocked."""
    # Mock the OpenAI client so no network traffic occurs: every
    # embeddings.create() call returns a single fixed-size vector.
    mock_client = MagicMock()
    mock_embedding = MagicMock()
    # text-embedding-3-small embedding dimension
    mock_embedding.embedding = np.random.rand(1536).tolist()
    mock_response = MagicMock()
    mock_response.data = [mock_embedding]
    mock_client.embeddings.create.return_value = mock_response
    mock_openai_class.return_value = mock_client

    # Provide a dummy API key without leaking it into other tests:
    # patch.dict restores the original environment on exit, unlike a bare
    # `os.environ[...] = ...` assignment, which persists for the whole
    # test process.
    with patch.dict(os.environ, {'OPENAI_API_KEY': 'test_key'}):
        # Define config
        config = Config(**{
            "dataset_name": "Chats-EN-ConSent_dummy_data",
            "code_name": "L1",
            "codes": ["OFF", "COO", "DOM"],
            "default_code": "OFF",
            "language_featurizer": "openai/text-embedding-3-small",
            "sent_hl_units": 10,
            "sent_dropout": 0.5,
            "consent_hl_units": 5,
            "lags": 2,
            "max_epochs": 1,
            "callback_patience": 1,
            "learning_rate": 1e-3,
            "batch_size": 32})

        # Initialize and train
        consent_model = ConSent(config)
        consent_model.train(self.data_df.head(10))

    # Check if the mock was called
    self.assertTrue(mock_client.embeddings.create.called)


if __name__ == '__main__':
unittest.main()