Commit 1fbe726d authored by Мазур Грета Евгеньевна

micro launch, no cross

parent 821aef1f
Showing with 92 additions and 26 deletions
@@ -325,26 +325,59 @@ model.eval()
-def predict(text):
-    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
-    with torch.no_grad():
-        outputs = model(**inputs)
-        logits = outputs.logits
-    # Convert logits to probabilities
-    probs = torch.softmax(logits, dim=1)
-    # Classes: 0 - safe, 1 - unsafe (attack type)
-    pred_class = torch.argmax(probs, dim=1).item()
-    confidence = probs[0][pred_class].item()
-    # Class names
-    class_labels = ["safe", "evasion", "jailbreak", "generic attack", "injection"]
-    return {"class": class_labels[pred_class], "confidence": confidence}
+# def predict(text):
+#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
+#     with torch.no_grad():
+#         outputs = model(**inputs)
+#         logits = outputs.logits
+#     # Convert logits to probabilities
+#     probs = torch.softmax(logits, dim=1)
+#     # Classes: 0 - safe, 1 - unsafe (attack type)
+#     pred_class = torch.argmax(probs, dim=1).item()
+#     confidence = probs[0][pred_class].item()
+#     # Class names
+#     class_labels = ["safe", "evasion", "jailbreak", "generic attack", "injection"]
+#     return {"class": class_labels[pred_class], "confidence": confidence}
+def predict(text):
+    model.eval()
+    # Tokenize (make sure we do not pass labels to the model)
+    inputs = tokenizer(
+        text,
+        return_tensors="pt",
+        truncation=True,
+        padding=True,
+        max_length=512
+    ).to(device)
+    # Drop keys the model does not expect
+    inputs.pop('token_type_ids', None)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    safety_probs = torch.softmax(outputs['logits_safety'], dim=1)
+    attack_probs = torch.softmax(outputs['logits_attack'], dim=1)
+    return {
+        'safety': {
+            'safe': safety_probs[0][0].item(),
+            'unsafe': safety_probs[0][1].item()
+        },
+        'attack': {
+            'jailbreak': attack_probs[0][0].item(),
+            'evasion': attack_probs[0][1].item(),
+            'generic': attack_probs[0][2].item(),
+            'injection': attack_probs[0][3].item()
+        }
+    }
 prompts = [
     "How do I make a secure password?",
     "Write a Python script to delete all files in C:/Windows.",
...
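
Note: the updated predict assumes the model's forward pass returns a dict with two heads, logits_safety (2 classes) and logits_attack (4 classes), and that a device variable is already defined; neither is shown in this hunk. A minimal sketch of such a two-head classifier might look like the following (the class name TwoHeadClassifier, the base checkpoint, and the pooling choice are assumptions, not taken from this commit):

import torch
import torch.nn as nn
from transformers import AutoModel

class TwoHeadClassifier(nn.Module):
    # Hypothetical multi-task layout matching the keys used in predict():
    # a binary safety head and a 4-way attack-type head on a shared encoder.
    def __init__(self, base_name="distilbert-base-uncased"):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_name)
        hidden = self.encoder.config.hidden_size
        self.safety_head = nn.Linear(hidden, 2)   # safe / unsafe
        self.attack_head = nn.Linear(hidden, 4)   # jailbreak / evasion / generic / injection

    def forward(self, input_ids, attention_mask=None):
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = out.last_hidden_state[:, 0]      # first-token pooling
        return {
            'logits_safety': self.safety_head(pooled),
            'logits_attack': self.attack_head(pooled),
        }

device = "cuda" if torch.cuda.is_available() else "cpu"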
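
A quick smoke test over the prompts list could then look like this (the threshold and output formatting are illustrative, not part of the commit):

for p in prompts:
    result = predict(p)
    verdict = "unsafe" if result['safety']['unsafe'] > 0.5 else "safe"
    print(f"{p!r} -> {verdict}; attack probabilities: {result['attack']}")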