From 82a06fc8a64dcd04f8c466a54585871ab1fde74a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9C=D0=B0=D0=B7=D1=83=D1=80=20=D0=93=D1=80=D0=B5=D1=82?=
 =?UTF-8?q?=D0=B0=20=D0=95=D0=B2=D0=B3=D0=B5=D0=BD=D1=8C=D0=B5=D0=B2=D0=BD?=
 =?UTF-8?q?=D0=B0?= <gemazur_1@edu.hse.ru>
Date: Thu, 27 Mar 2025 03:24:35 +0300
Subject: [PATCH] pereobuch2: augment_text returns [text] instead of [] for short or non-string input

---
 .ipynb_checkpoints/ULTRAMegaOB-checkpoint.py | 14 +++++---------
 ULTRAMegaOB.py                               | 14 +++++---------
 2 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/.ipynb_checkpoints/ULTRAMegaOB-checkpoint.py b/.ipynb_checkpoints/ULTRAMegaOB-checkpoint.py
index 103a15b..66ae391 100644
--- a/.ipynb_checkpoints/ULTRAMegaOB-checkpoint.py
+++ b/.ipynb_checkpoints/ULTRAMegaOB-checkpoint.py
@@ -180,13 +180,12 @@ def compute_metrics(p):
 
 def augment_text(text, num_augments):
     """Р“РµРЅРµСЂР°С†РёСЏ Р°СѓРіРјРµРЅС‚РёСЂРѕРІР°РЅРЅС‹С… РїСЂРёРјРµСЂРѕРІ СЃ РїСЂРѕРІРµСЂРєР°РјРё"""
-    
-    if len(text) > 1000:  # РЎР»РёС€РєРѕРј РґР»РёРЅРЅС‹Рµ С‚РµРєСЃС‚С‹ РїР»РѕС…Рѕ Р°СѓРіРјРµРЅС‚РёСЂСѓСЋС‚СЃСЏ
+    if len(text) > 1000:
         logger.debug(f"РўРµРєСЃС‚ СЃР»РёС€РєРѕРј РґР»РёРЅРЅС‹Р№ РґР»СЏ Р°СѓРіРјРµРЅС‚Р°С†РёРё: {len(text)} СЃРёРјРІРѕР»РѕРІ")
         return [text]
     
     if not isinstance(text, str) or len(text.strip()) < 10:
-        return []
+        return [text]
         
     text = text.replace('\n', ' ').strip()
     
@@ -208,12 +207,9 @@ def augment_text(text, num_augments):
         # РћР±СЂР°С‚РЅС‹Р№ РїРµСЂРµРІРѕРґ
         if len(augmented) < num_augments:
             try:
-                # РћРїСЂРµРґРµР»СЏРµРј СЏР·С‹Рє С‚РµРєСЃС‚Р°
                 if any(cyr_char in text for cyr_char in 'Р°Р±РІРіРґРµС‘Р¶Р·РёР№РєР»РјРЅРѕРїСЂСЃС‚СѓС„С…С†С‡С€С‰СЉС‹СЊСЌСЋСЏ'):
-                    # Р”Р»СЏ СЂСѓСЃСЃРєРёС… С‚РµРєСЃС‚РѕРІ
                     tr_augs = translation_aug_ru.augment(text, n=num_augments-len(augmented))
                 else:
-                    # Р”Р»СЏ Р°РЅРіР»РёР№СЃРєРёС…/РґСЂСѓРіРёС… С‚РµРєСЃС‚РѕРІ
                     tr_augs = translation_aug.augment(text, n=num_augments-len(augmented))
                     
                 if tr_augs:
@@ -227,7 +223,7 @@ def augment_text(text, num_augments):
             logger.debug(f"РќРµ СѓРґР°Р»РѕСЃСЊ Р°СѓРіРјРµРЅС‚РёСЂРѕРІР°С‚СЊ С‚РµРєСЃС‚: {text[:50]}...")
             return [text]
             
-        augmented = list(set(augmented))  # РЈРґР°Р»РµРЅРёРµ РґСѓР±Р»РёРєР°С‚РѕРІ
+        augmented = list(set(augmented))
         return list(augmented)[:num_augments] if augmented else [text]
     except Exception as e:
         logger.error(f"РљСЂРёС‚РёС‡РµСЃРєР°СЏ РѕС€РёР±РєР° Р°СѓРіРјРµРЅС‚Р°С†РёРё: {str(e)}")
@@ -259,8 +255,8 @@ def balance_attack_types(unsafe_data):
             
             augmented = subset.sample(n=num_augments, replace=True)
             augmented['prompt'] = augmented['prompt'].apply(
-                lambda x: (augs := augment_text(x, 1)) and augs[0] if augs else x
-            )
+            lambda x: augment_text(x, 1)[0]  # Просто берем первый элемент возвращаемого списка
+                )
             
             # Р›РѕРіРёСЂРѕРІР°РЅРёРµ Р°СѓРіРјРµРЅС‚РёСЂРѕРІР°РЅРЅС‹С… РїСЂРёРјРµСЂРѕРІ
             logger.info(f"\nРђСѓРіРјРµРЅС‚Р°С†РёСЏ РґР»СЏ {attack_type}:")
diff --git a/ULTRAMegaOB.py b/ULTRAMegaOB.py
index 103a15b..66ae391 100644
--- a/ULTRAMegaOB.py
+++ b/ULTRAMegaOB.py
@@ -180,13 +180,12 @@ def compute_metrics(p):
 
 def augment_text(text, num_augments):
     """Р“РµРЅРµСЂР°С†РёСЏ Р°СѓРіРјРµРЅС‚РёСЂРѕРІР°РЅРЅС‹С… РїСЂРёРјРµСЂРѕРІ СЃ РїСЂРѕРІРµСЂРєР°РјРё"""
-    
-    if len(text) > 1000:  # РЎР»РёС€РєРѕРј РґР»РёРЅРЅС‹Рµ С‚РµРєСЃС‚С‹ РїР»РѕС…Рѕ Р°СѓРіРјРµРЅС‚РёСЂСѓСЋС‚СЃСЏ
+    if len(text) > 1000:
         logger.debug(f"РўРµРєСЃС‚ СЃР»РёС€РєРѕРј РґР»РёРЅРЅС‹Р№ РґР»СЏ Р°СѓРіРјРµРЅС‚Р°С†РёРё: {len(text)} СЃРёРјРІРѕР»РѕРІ")
         return [text]
     
     if not isinstance(text, str) or len(text.strip()) < 10:
-        return []
+        return [text]
         
     text = text.replace('\n', ' ').strip()
     
@@ -208,12 +207,9 @@ def augment_text(text, num_augments):
         # РћР±СЂР°С‚РЅС‹Р№ РїРµСЂРµРІРѕРґ
         if len(augmented) < num_augments:
             try:
-                # РћРїСЂРµРґРµР»СЏРµРј СЏР·С‹Рє С‚РµРєСЃС‚Р°
                 if any(cyr_char in text for cyr_char in 'Р°Р±РІРіРґРµС‘Р¶Р·РёР№РєР»РјРЅРѕРїСЂСЃС‚СѓС„С…С†С‡С€С‰СЉС‹СЊСЌСЋСЏ'):
-                    # Р”Р»СЏ СЂСѓСЃСЃРєРёС… С‚РµРєСЃС‚РѕРІ
                     tr_augs = translation_aug_ru.augment(text, n=num_augments-len(augmented))
                 else:
-                    # Р”Р»СЏ Р°РЅРіР»РёР№СЃРєРёС…/РґСЂСѓРіРёС… С‚РµРєСЃС‚РѕРІ
                     tr_augs = translation_aug.augment(text, n=num_augments-len(augmented))
                     
                 if tr_augs:
@@ -227,7 +223,7 @@ def augment_text(text, num_augments):
             logger.debug(f"РќРµ СѓРґР°Р»РѕСЃСЊ Р°СѓРіРјРµРЅС‚РёСЂРѕРІР°С‚СЊ С‚РµРєСЃС‚: {text[:50]}...")
             return [text]
             
-        augmented = list(set(augmented))  # РЈРґР°Р»РµРЅРёРµ РґСѓР±Р»РёРєР°С‚РѕРІ
+        augmented = list(set(augmented))
         return list(augmented)[:num_augments] if augmented else [text]
     except Exception as e:
         logger.error(f"РљСЂРёС‚РёС‡РµСЃРєР°СЏ РѕС€РёР±РєР° Р°СѓРіРјРµРЅС‚Р°С†РёРё: {str(e)}")
@@ -259,8 +255,8 @@ def balance_attack_types(unsafe_data):
             
             augmented = subset.sample(n=num_augments, replace=True)
             augmented['prompt'] = augmented['prompt'].apply(
-                lambda x: (augs := augment_text(x, 1)) and augs[0] if augs else x
-            )
+            lambda x: augment_text(x, 1)[0]  # Просто берем первый элемент возвращаемого списка
+                )
             
             # Р›РѕРіРёСЂРѕРІР°РЅРёРµ Р°СѓРіРјРµРЅС‚РёСЂРѕРІР°РЅРЅС‹С… РїСЂРёРјРµСЂРѕРІ
             logger.info(f"\nРђСѓРіРјРµРЅС‚Р°С†РёСЏ РґР»СЏ {attack_type}:")
-- 
GitLab