Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ examples/archive
*/experimental.py
docs
build

.env
# gemini-cli settings
.gemini/
# GitHub App credentials
Expand Down
2 changes: 1 addition & 1 deletion consent/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .consent import *
from .consent import ConSent, OpenAIEncoder, Config, HYPERPARAMETERS, wandb
12 changes: 10 additions & 2 deletions consent/consent.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from wandb.keras import WandbCallback

import consent.utils as utils
from consent.openai_encoder import OpenAIEncoder


physical_devices = tf.config.list_physical_devices('GPU')
Expand All @@ -37,7 +38,8 @@
'sentence-transformers/LaBSE',
'https://tfhub.dev/google/LEALLA/LEALLA-small/1',
'https://tfhub.dev/google/LEALLA/LEALLA-base/1',
'https://tfhub.dev/google/LEALLA/LEALLA-large/1'
'https://tfhub.dev/google/LEALLA/LEALLA-large/1',
'openai'
]

HYPERPARAMETERS = [
Expand Down Expand Up @@ -213,7 +215,8 @@ def make_model(
consent_hl_units: int
):
# Ensure language_featurizer is compatible with current implementation
assert language_featurizer in SUPPORTED_LANGUAGE_FEATURIZERS, \
is_openai = language_featurizer.startswith('openai/')
assert is_openai or language_featurizer in SUPPORTED_LANGUAGE_FEATURIZERS, \
"`language_featurizer` not supported " + \
f"(available: {SUPPORTED_LANGUAGE_FEATURIZERS})."

Expand Down Expand Up @@ -254,6 +257,10 @@ def make_model(
layer.trainable=False
for w in layer.weights: w._trainable=False
encoder = SBert(tokenizer, model)(text_input)
elif is_openai:
assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY not set"
model_name = language_featurizer.split('openai/')[-1]
encoder = OpenAIEncoder(model_name, name="sent_encoder")(text_input)
else:
encoder = HubWrapper(language_featurizer, name="sent_encoder")(text_input)

Expand Down Expand Up @@ -289,6 +296,7 @@ def make_model(
outputs=[sent_output, consent_output])



def prepare_inputs(self, dialog_data: pd.DataFrame):
"""
Extracts texts and contexts (inputs for ConSent.model) from
Expand Down
42 changes: 42 additions & 0 deletions consent/openai_encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import os
import openai
import tensorflow as tf
import numpy as np

class OpenAIEncoder(tf.keras.layers.Layer):
    """Keras layer that embeds string tensors via the OpenAI embeddings API.

    Maps a (batch,) tensor of UTF-8 strings to a (batch, embedding_dim)
    float32 tensor by calling the OpenAI embeddings endpoint. Reads the
    API key from the OPENAI_API_KEY environment variable at construction.
    """

    def __init__(self, model_name, **kwargs):
        """
        Args:
            model_name: OpenAI embedding model id
                (e.g. "text-embedding-3-small").
            **kwargs: forwarded to tf.keras.layers.Layer (e.g. ``name``).
        """
        super().__init__(**kwargs)
        self.model_name = model_name
        self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
        # NOTE(review): dimension is hard-coded for text-embedding-3-small;
        # other models (e.g. text-embedding-3-large -> 3072) would require
        # this to be parameterized.
        self.embedding_dim = 1536

    def _get_embeddings(self, inputs):
        """Embed each scalar string tensor in `inputs`; returns float32 array.

        One API request is issued per text so result order trivially matches
        input order. TODO(review): the embeddings endpoint accepts a list of
        inputs, so a single batched request would cut round-trips from
        O(batch) to 1 — deferred because current callers/tests assume one
        embedding per response.
        """
        embeddings = []
        for text_tensor in inputs:
            # Truncate to 140 characters to bound request size/cost.
            text = text_tensor.numpy().decode('utf-8')[:140]
            response = self.client.embeddings.create(
                input=text,
                model=self.model_name
            )
            embeddings.append(response.data[0].embedding)
        # reshape keeps an empty batch at shape (0, embedding_dim);
        # np.array([]) alone is shape (0,) and would break set_shape below.
        return np.array(embeddings, dtype=np.float32).reshape(
            -1, self.embedding_dim)

    def call(self, inputs):
        # py_function escapes graph mode so arbitrary Python (network I/O)
        # can run per batch; static shape info is lost and must be restored.
        embeddings = tf.py_function(
            self._get_embeddings,
            [inputs],
            tf.float32
        )
        embeddings.set_shape((None, self.embedding_dim))
        return embeddings

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.embedding_dim)

    def get_config(self):
        """Serialize constructor args so the layer can be re-created.

        The client is rebuilt from the environment in __init__, so only
        `model_name` needs to be stored.
        """
        config = super().get_config()
        config.update({
            "model_name": self.model_name
        })
        return config
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@ tensorflow-text==2.19.0
tensorflow-hub
wandb==0.12.11
protobuf<=3.20.3
openai
tiktoken
43 changes: 41 additions & 2 deletions tests/test_consent.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
import time
import json
import unittest
from unittest.mock import patch, MagicMock
import pandas as pd
import numpy as np

from consent import Config, ConSent
import consent.utils as utils
Expand All @@ -11,7 +13,7 @@
class TestConSent(unittest.TestCase):

def setUp(self):
self.data_df = pd.read_csv(\
self.data_df = pd.read_csv(
"tests/test_data/Chats-EN-ConSent_dummy_data.csv")
self.data_df = self.data_df.drop(columns=['Unnamed: 0'])
self.data_df = self.data_df.rename(columns={
Expand Down Expand Up @@ -71,7 +73,44 @@ def test_training_and_inference(self):

print("\n\nGenerating 'sent' and 'consent' predictions using consent.predict_proba()...\n ", pred_message)


@patch('consent.openai_encoder.openai.OpenAI')
def test_train_with_openai_featurizer(self, mock_openai_class):
    """Train end-to-end with an 'openai/...' featurizer, API fully mocked."""
    # Mock the OpenAI client so no network traffic occurs: every
    # embeddings.create() call returns a single fixed-size vector.
    mock_client = MagicMock()
    mock_embedding = MagicMock()
    # text-embedding-3-small embedding dimension
    mock_embedding.embedding = np.random.rand(1536).tolist()
    mock_response = MagicMock()
    mock_response.data = [mock_embedding]
    mock_client.embeddings.create.return_value = mock_response
    mock_openai_class.return_value = mock_client

    # Provide a dummy API key without leaking it into other tests:
    # patch.dict restores the original environment on exit, unlike a bare
    # `os.environ[...] = ...` assignment, which persists for the whole
    # test process.
    with patch.dict(os.environ, {'OPENAI_API_KEY': 'test_key'}):
        # Define config
        config = Config(**{
            "dataset_name": "Chats-EN-ConSent_dummy_data",
            "code_name": "L1",
            "codes": ["OFF", "COO", "DOM"],
            "default_code": "OFF",
            "language_featurizer": "openai/text-embedding-3-small",
            "sent_hl_units": 10,
            "sent_dropout": 0.5,
            "consent_hl_units": 5,
            "lags": 2,
            "max_epochs": 1,
            "callback_patience": 1,
            "learning_rate": 1e-3,
            "batch_size": 32})

        # Initialize and train
        consent_model = ConSent(config)
        consent_model.train(self.data_df.head(10))

    # Check if the mock was called
    self.assertTrue(mock_client.embeddings.create.called)


if __name__ == '__main__':
unittest.main()