Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions apps/organizations/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,7 @@ class TermsAndConditions(HasAutoTranslatedFields, OrganizationRelated, models.Mo

auto_translated_fields: list[str] = ["html:content"]
auto_translate_all_languages: bool = True
auto_translate_instantly: bool = True

organization = models.OneToOneField(
"organizations.Organization",
Expand Down
5 changes: 4 additions & 1 deletion services/translator/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ class HasAutoTranslatedFields(metaclass=TranslatedModelMeta):

auto_translated_fields: list[str] = []
auto_translate_all_languages: bool = False
auto_translate_instantly: bool = False

_auto_translated_fields: list[str] = []
_html_auto_translated_fields: list[str] = []
Expand Down Expand Up @@ -103,12 +104,14 @@ def update_translated_fields(self, force_update: bool = True):
or getattr(self, field)
!= self._original_auto_translated_fields_values[field]
):
AutoTranslatedField.objects.update_or_create(
auto_translated_field, _ = AutoTranslatedField.objects.update_or_create(
content_type=content_type,
object_id=str(self.pk),
field_name=field,
defaults={"up_to_date": False, "field_type": field_type},
)
if self.auto_translate_instantly:
auto_translated_field.update_translation()

def _delete_auto_translated_fields(self):
AutoTranslatedField.objects.filter(
Expand Down
137 changes: 137 additions & 0 deletions services/translator/models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
import re

from bs4 import BeautifulSoup
from django.conf import settings
from django.contrib.contenttypes.models import ContentType
from django.db import models

from apps.commons.mixins import OrganizationRelated

from .interface import AzureTranslatorService

AZURE_MAX_LENGTH = 50000


class AutoTranslatedField(models.Model):
"""
Expand Down Expand Up @@ -37,3 +47,130 @@ class Meta:
def instance(self) -> models.Model:
"""Return the related instance."""
return self.content_type.get_object_for_this_type(pk=self.object_id)

@staticmethod
def split_content(
content: str, max_length: int, text_type: str = "plain"
) -> list[str] | list[BeautifulSoup]:
"""
Split content into chunks of max_length, trying to split at html tags.

Maximum length for Azure Translator is 50 000 characters per request accross all
languages.

For example, sending a translate request of 3 000 characters to translate to three
different languages results in a request size of 3 000 x 3 = 9 000 characters

This function splits the content into chunks either at the end of a html tag or
at the last space before max_length.
"""
if text_type == "html":
soup = BeautifulSoup(content, "html.parser")
return soup.find_all(recursive=False)

if len(content) <= max_length:
return [content]
chunks = []
start = 0
while start < len(content):
end = start + max_length
if end >= len(content):
chunks.append(content[start:])
break
split_at = content.rfind(" ", start, end)
if split_at == -1 or split_at <= start:
split_at = end
chunks.append(content[start:split_at])
start = split_at
return chunks

def update_translation(self):
    """
    Translate the tracked field of the related instance and mark it up to date.

    The target languages are `settings.REQUIRED_LANGUAGES` when the instance sets
    `auto_translate_all_languages`, otherwise the languages of its related
    organizations. In both cases only organizations with `auto_translate_content`
    enabled are taken into account; without any, nothing is translated.

    The content is split with `split_content` and each chunk is handled as follows:

    - blank html chunks, and every chunk of a content embedding a base64 image,
      are copied as is;
    - chunks fitting in `max_length` are translated to all languages at once;
    - bigger chunks still below the safety margin are translated one language
      at a time;
    - anything bigger is copied as is.

    The results are concatenated per language and written to the
    `<field_name>_<language>` fields of the instance with a queryset update, along
    with `<field_name>_detected_language` when a language was detected.

    Raises
    ------
    - ValueError: if the related instance is not an `OrganizationRelated`.
    """
    instance = self.instance
    field_name = self.field_name
    content = getattr(instance, field_name, "")
    if not isinstance(instance, OrganizationRelated):
        raise ValueError(
            f"{instance._meta.model.__name__} does not support translations. "
            "`OrganizationRelated` mixin is required for automatic translations."
        )
    translating_organizations = [
        o for o in instance.get_related_organizations() if o.auto_translate_content
    ]
    if getattr(instance, "auto_translate_all_languages", False):
        languages = settings.REQUIRED_LANGUAGES if translating_organizations else []
    else:
        # iter over languages in set (remove duplicate language)
        languages = {
            lang for org in translating_organizations for lang in org.languages
        }
    if languages:
        base_max_length = AZURE_MAX_LENGTH * 0.8  # Safety margin
        max_length = int(base_max_length // len(languages))
        if content:
            chunks = self.split_content(
                content, max_length, text_type=self.field_type
            )
            translations = {}
            detected_languages = []
            # Looked up in the whole content, not per chunk: a single embedded
            # image leaves every chunk untranslated.
            has_inline_image = bool(
                re.search(r'data:image\/[a-zA-Z]+;base64,[^"\']+', content)
            )
            for chunk in chunks:
                is_blank_html = self.field_type == "html" and (
                    not str(chunk).strip() or not chunk.get_text(strip=True)
                )
                chunk = str(chunk)
                if is_blank_html or has_inline_image:
                    chunk_translations = [
                        {"to": lang, "text": chunk} for lang in languages
                    ]
                elif len(chunk) <= max_length:
                    chunk_translations, detected_language = (
                        AzureTranslatorService.translate_text_content(
                            chunk, languages, self.field_type
                        )
                    )
                    detected_languages.append(detected_language)
                elif len(chunk) < base_max_length:
                    # Too big for all languages at once: one request per language.
                    # The list must start empty for every chunk, otherwise it is
                    # unbound on the first chunk or still holds the previous one.
                    chunk_translations = []
                    for lang in languages:
                        lang_chunk_translation, detected_language = (
                            AzureTranslatorService.translate_text_content(
                                chunk, [lang], self.field_type
                            )
                        )
                        chunk_translations.append(
                            {
                                "to": lang,
                                "text": lang_chunk_translation[0]["text"],
                            }
                        )
                        detected_languages.append(detected_language)
                else:
                    # Too big even for a single language: keep the original text.
                    chunk_translations = [
                        {"to": lang, "text": chunk} for lang in languages
                    ]
                for translation in chunk_translations:
                    key = f"{field_name}_{translation['to']}"
                    translations[key] = translations.get(key, "") + translation["text"]
            # Use the most common detected language among chunks
            if detected_languages:
                detected_language = max(
                    set(detected_languages), key=detected_languages.count
                )
                translations[f"{field_name}_detected_language"] = detected_language
        else:
            translations = {f"{field_name}_{lang}": content for lang in languages}
        # Queryset update: the values are written without calling the instance's
        # save() method.
        instance._meta.model.objects.filter(pk=instance.pk).update(**translations)
    self.up_to_date = True
    self.save()
5 changes: 2 additions & 3 deletions services/translator/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from projects.celery import app

from .models import AutoTranslatedField
from .utils import update_auto_translated_field

logger = logging.getLogger(__name__)

Expand All @@ -16,7 +15,7 @@
def automatic_translations():
for field in AutoTranslatedField.objects.filter(up_to_date=False):
try:
update_auto_translated_field(field)
field.update_translation()
except Exception as e: # noqa: PIE786
logger.error(f"Error updating auto-translated field {field.id}: {e}")

Expand Down Expand Up @@ -48,7 +47,7 @@ def translate_object(

for field in queryset:
try:
update_auto_translated_field(field)
field.update_translation()
except Exception as e: # noqa: PIE786
logger.error(
f"Error updating model-translated {model} field {field.id}: {e}"
Expand Down
33 changes: 33 additions & 0 deletions services/translator/testcases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from types import SimpleNamespace

from apps.commons.test import JwtAPITestCase


class MockTranslateTestCase(JwtAPITestCase):
    """Test case base class providing a fake Azure translator response."""

    @classmethod
    def translator_side_effect(
        cls, body: list[str], to_language: list[str], text_type: str = "plain"
    ) -> list[SimpleNamespace]:
        """
        This side effect is meant to be used with unittest mock. It will mock every call
        made to the Azure translator API.

        Arguments
        ---------
        - body (list of str): The texts sent for translation. Only the first one is used.
        - to_language (list of str): The target languages for translation.
        - text_type (str): Accepted to match the mocked call; it does not change the
          response.

        Returns
        -------
        - A list of SimpleNamespace objects that simulates the Azure translator API response.
          It holds a single item whose detected language is "en" and whose translations
          are "<language> : <first text>" for each target language.
        """

        return [
            SimpleNamespace(
                detected_language=SimpleNamespace(language="en", score=1.0),
                translations=[
                    SimpleNamespace(text=f"{lang} : {body[0]}", to=lang)
                    for lang in to_language
                ],
            )
        ]
49 changes: 10 additions & 39 deletions services/translator/tests/tasks/test_update_translations_task.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from types import SimpleNamespace
from unittest.mock import call, patch

from django.conf import settings
Expand All @@ -9,7 +8,6 @@
from apps.accounts.factories import PeopleGroupFactory, UserFactory
from apps.accounts.models import ProjectUser
from apps.announcements.factories import AnnouncementFactory
from apps.commons.test import JwtAPITestCase
from apps.feedbacks.factories import CommentFactory, ReviewFactory
from apps.files.factories import (
AttachmentFileFactory,
Expand Down Expand Up @@ -45,45 +43,19 @@
)
from services.translator.models import AutoTranslatedField
from services.translator.tasks import automatic_translations
from services.translator.utils import update_auto_translated_field
from services.translator.tests.tasks.test_update_translations_task import (
MockTranslateTestCase,
)

faker = Faker()


class MockTranslateTestCase(JwtAPITestCase):
    """Test case base class providing a fake Azure translator response."""

    @classmethod
    def translator_side_effect(
        cls, body: list[str], to_language: list[str], text_type: str = "plain"
    ) -> list[SimpleNamespace]:
        """
        This side effect is meant to be used with unittest mock. It will mock every call
        made to the Azure translator API.

        Arguments
        ---------
        - body (list of str): The texts sent for translation. Only the first one is used.
        - to_language (list of str): The target languages for translation.
        - text_type (str): Accepted to match the mocked call; it does not change the
          response.

        Returns
        -------
        - A list of SimpleNamespace objects that simulates the Azure translator API response.
          It holds a single item whose detected language is "en" and whose translations
          are "<language> : <first text>" for each target language.
        """

        return [
            SimpleNamespace(
                detected_language=SimpleNamespace(language="en", score=1.0),
                translations=[
                    SimpleNamespace(text=f"{lang} : {body[0]}", to=lang)
                    for lang in to_language
                ],
            )
        ]


class UpdateTranslationsTestCase(MockTranslateTestCase):
@classmethod
def setUpTestData(cls) -> None:
@patch("azure.ai.translation.text.TextTranslationClient.translate")
def setUpTestData(cls, mock_translate) -> None:
super().setUpTestData()
mock_translate.side_effect = cls.translator_side_effect
cls.organization_data = {
field: (
f"<p>{faker.word()}</p>"
Expand Down Expand Up @@ -214,9 +186,8 @@ def setUpTestData(cls) -> None:
cls.organization_2,
cls.organization_3,
]:
TermsAndConditions.objects.update_or_create(
organization=organization, defaults=data
)
TermsAndConditions.objects.get_or_create(organization=organization)
TermsAndConditions.objects.filter(organization=organization).update(**data)
cls.instances.append(
{
"model": TermsAndConditions,
Expand Down Expand Up @@ -457,7 +428,7 @@ def test_safe_translation_with_base64_image(self, mock_translate):
field_name="description",
)

update_auto_translated_field(field)
field.update_translation()
mock_translate.assert_has_calls([])

@patch("azure.ai.translation.text.TextTranslationClient.translate")
Expand All @@ -482,7 +453,7 @@ def test_split_content_html(self, mock_translate):
object_id=project.pk,
field_name="description",
)
update_auto_translated_field(field)
field.update_translation()
mock_translate.assert_has_calls(
[
call(
Expand Down
Loading
Loading