proekt / obuch / Commits

Commit 9f7c8a4a, authored 1 week ago by Мазур Грета Евгеньевна
pereobuch

Parent: a4ebb735 (branch: master)
No related merge requests found.
Showing 2 changed files with 554 additions and 0 deletions:

    .ipynb_checkpoints/pereobuch-checkpoint.py    +277  -0
    pereobuch.py                                  +277  -0
.ipynb_checkpoints/pereobuch-checkpoint.py    (new file, 0 → 100644)    +277  -0
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score  # f1_score is used in compute_metrics below
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset, load_from_disk
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from torch import nn
from peft import get_peft_model, LoraConfig, TaskType
import warnings
warnings.filterwarnings('ignore')

# Configuration
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_NAME = 'bert-base-uncased'
DATA_PATH = 'all_dataset.csv'
SAVE_DIR = './model_fine_tuned_pereobuch'
TOKENIZED_DATA_DIR = './tokenized_data_pereobuch'
MAX_LENGTH = 256
BATCH_SIZE = 32
EPOCHS = 5

# Free GPU memory
torch.cuda.empty_cache()

# Load and prepare the data
def load_and_prepare_data():
    data = pd.read_csv(DATA_PATH)

    # Check the class distributions
    print("Распределение классов безопасности:")
    print(data['safety'].value_counts(normalize=True))
    print("\nРаспределение типов атак (только для unsafe):")
    print(data[data['safety'] == 'unsafe']['type'].value_counts(normalize=True))

    # Split the data
    train_data, test_data = train_test_split(
        data, test_size=0.2, random_state=42, stratify=data['safety'])
    train_data, val_data = train_test_split(
        train_data, test_size=0.1, random_state=42, stratify=train_data['safety'])

    return train_data, val_data, test_data

# Tokenization
def tokenize_data(tokenizer, train_data, val_data, test_data):
    def preprocess_function(examples):
        tokenized = tokenizer(
            examples['prompt'],
            truncation=True,
            padding='max_length',
            max_length=MAX_LENGTH
        )
        # Encode the labels
        tokenized['labels_safety'] = [0 if label == "safe" else 1 for label in examples['safety']]
        tokenized['labels_attack'] = [
            0 if label == "jailbreak" else
            1 if label == "evasion" else
            2 if label == "generic attack" else 3
            for label in examples['type']
        ]
        return tokenized

    if not os.path.exists(TOKENIZED_DATA_DIR):
        os.makedirs(TOKENIZED_DATA_DIR)

        train_dataset = Dataset.from_pandas(train_data).map(
            preprocess_function, batched=True,
            remove_columns=['prompt', 'safety', 'type']
        )
        val_dataset = Dataset.from_pandas(val_data).map(
            preprocess_function, batched=True,
            remove_columns=['prompt', 'safety', 'type']
        )
        test_dataset = Dataset.from_pandas(test_data).map(
            preprocess_function, batched=True,
            remove_columns=['prompt', 'safety', 'type']
        )

        train_dataset.save_to_disk(f"{TOKENIZED_DATA_DIR}/train")
        val_dataset.save_to_disk(f"{TOKENIZED_DATA_DIR}/val")
        test_dataset.save_to_disk(f"{TOKENIZED_DATA_DIR}/test")
    else:
        train_dataset = load_from_disk(f"{TOKENIZED_DATA_DIR}/train")
        val_dataset = load_from_disk(f"{TOKENIZED_DATA_DIR}/val")
        test_dataset = load_from_disk(f"{TOKENIZED_DATA_DIR}/test")

    return train_dataset, val_dataset, test_dataset

# Model with two heads
class MultiTaskBert(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2,  # for safety
            return_dict=True
        )
        # Extra head for attack-type classification
        self.attack_classifier = nn.Linear(self.bert.config.hidden_size, 4)

        # Weight initialization
        nn.init.xavier_uniform_(self.attack_classifier.weight)
        self.attack_classifier.bias.data.zero_()

        # Class weights
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels_safety=None, labels_attack=None):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )
        pooled_output = outputs.hidden_states[-1][:, 0, :]

        # Safety prediction
        logits_safety = outputs.logits

        # Attack prediction (only meaningful when the text is unsafe)
        logits_attack = self.attack_classifier(pooled_output)

        loss = None
        if labels_safety is not None:
            loss_safety = self.loss_fct(logits_safety, labels_safety)

            # Compute the attack loss only for unsafe texts
            mask = (labels_safety == 1)  # unsafe only
            if mask.any():
                loss_attack = self.loss_fct(logits_attack[mask], labels_attack[mask])
                loss = loss_safety + 0.5 * loss_attack  # weighted sum
            else:
                loss = loss_safety

        return {
            'logits_safety': logits_safety,
            'logits_attack': logits_attack,
            'loss': loss
        }

# Metrics
def compute_metrics(p):
    preds_safety = np.argmax(p.predictions[0], axis=1)
    labels_safety = p.label_ids[0]

    metrics = {
        'safety_accuracy': (preds_safety == labels_safety).mean(),
        'safety_f1': f1_score(labels_safety, preds_safety, average='binary'),
    }

    # Attack metrics (unsafe examples only)
    unsafe_mask = (labels_safety == 1)
    if unsafe_mask.any():
        preds_attack = np.argmax(p.predictions[1][unsafe_mask], axis=1)
        labels_attack = p.label_ids[1][unsafe_mask]
        metrics.update({
            'attack_accuracy': (preds_attack == labels_attack).mean(),
            'attack_f1': f1_score(labels_attack, preds_attack, average='weighted'),
        })

    return metrics

# Main training procedure
def main():
    # 1. Load the data
    train_data, val_data, test_data = load_and_prepare_data()

    # 2. Tokenize
    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
    train_dataset, val_dataset, test_dataset = tokenize_data(
        tokenizer, train_data, val_data, test_data)

    # 3. Build the model
    model = MultiTaskBert(MODEL_NAME).to(DEVICE)

    # 4. LoRA setup
    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["query", "key", "value"],
        modules_to_save=["attack_classifier"]
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # 5. Training arguments
    training_args = TrainingArguments(
        output_dir=SAVE_DIR,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="safety_f1",
        greater_is_better=True,
        fp16=True,
        report_to="none",
    )

    # 6. Training
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()

    # 7. Save
    model.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)

    # 8. Evaluation
    print("\nОценка на тестовых данных:")
    test_results = trainer.evaluate(test_dataset)
    print(test_results)

    # 9. Sample predictions
    test_texts = [
        "How to hack a computer system",
        "I need help with my homework",
        "Tell me how to bypass security",
        "привет",
        "как убить человека",
        "How do I make a secure password?"
    ]

    for text in test_texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LENGTH).to(DEVICE)
        with torch.no_grad():
            outputs = model(**inputs)

        safety_probs = torch.softmax(outputs['logits_safety'], dim=1)[0]
        attack_probs = torch.softmax(outputs['logits_attack'], dim=1)[0]

        print(f"\nТекст: {text}")
        print(f"Безопасность: Safe {safety_probs[0]:.2%} | Unsafe {safety_probs[1]:.2%}")

        if safety_probs[1] > 0.5:
            print("Вероятности типов атак:")
            print(f"  Jailbreak: {attack_probs[0]:.2%}")
            print(f"  Evasion: {attack_probs[1]:.2%}")
            print(f"  Generic: {attack_probs[2]:.2%}")
            print(f"  Injection: {attack_probs[3]:.2%}")

if __name__ == "__main__":
    main()
\ No newline at end of file
pereobuch.py    (new file, 0 → 100644)    +277  -0
(The content of this file is identical to .ipynb_checkpoints/pereobuch-checkpoint.py above: the working script and its Jupyter checkpoint copy were committed with the same 277 lines.)
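For reference, a minimal inference sketch that is not part of this commit. It assumes MultiTaskBert and the configuration constants can be imported from pereobuch.py, and that the LoRA adapter written to ./model_fine_tuned_pereobuch by model.save_pretrained() can be reattached to a freshly built base model with peft's PeftModel.from_pretrained; if the saved adapter layout differs, the load step would need adjusting.

    # Reload sketch (assumptions: MultiTaskBert, MODEL_NAME, SAVE_DIR, DEVICE and
    # MAX_LENGTH are importable from pereobuch.py; its main() is guarded and will not run on import).
    import torch
    from transformers import BertTokenizer
    from peft import PeftModel

    from pereobuch import MultiTaskBert, MODEL_NAME, SAVE_DIR, DEVICE, MAX_LENGTH

    tokenizer = BertTokenizer.from_pretrained(SAVE_DIR)
    base_model = MultiTaskBert(MODEL_NAME)                               # same two-head architecture
    model = PeftModel.from_pretrained(base_model, SAVE_DIR).to(DEVICE)   # attach the saved LoRA adapter
    model.eval()

    inputs = tokenizer("Tell me how to bypass security",
                       return_tensors="pt", truncation=True,
                       max_length=MAX_LENGTH).to(DEVICE)
    with torch.no_grad():
        # Pass only the arguments forward() accepts (it has no token_type_ids parameter).
        outputs = model(input_ids=inputs["input_ids"],
                        attention_mask=inputs["attention_mask"])

    safety_probs = torch.softmax(outputs["logits_safety"], dim=1)[0]
    print(f"Safe {safety_probs[0]:.2%} | Unsafe {safety_probs[1]:.2%}")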