From 627943b6a1746ed141067616c579f1f77f2bff23 Mon Sep 17 00:00:00 2001 From: Jordan Clive Date: Fri, 28 Apr 2023 12:10:27 +0100 Subject: [PATCH] Update metadata_utils.py --- bsmetadata/metadata_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bsmetadata/metadata_utils.py b/bsmetadata/metadata_utils.py index f3f942e7..04f12166 100644 --- a/bsmetadata/metadata_utils.py +++ b/bsmetadata/metadata_utils.py @@ -209,6 +209,9 @@ def random_sample_metadata_v2( A new collection of examples, with some metadata dropped. """ only_metadata_types = [key for key in metadata_type_sample_weights.keys() if f"metadata_{key}" in examples] + # Drop the 'html' metadata type 3/4 of the time; decided once per call, so it applies to every example below. + if random.random() < 0.75 and 'html' in only_metadata_types: + only_metadata_types.remove('html') for i in range(len(examples["text"])): example = {k: v[i] for k, v in examples.items()} metadata_types = [key for key in only_metadata_types if example[f"metadata_{key}"]]