Skip to content
GitLab
Explore
Projects
Groups
Topics
Snippets
Projects
Groups
Topics
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
proekt
obuch
Commits
de562ff4
Commit
de562ff4
authored
4 weeks ago
by
Мазур Грета Евгеньевна
Browse files
Options
Download
Patches
Plain Diff
first zapusk
parent
5ad7cc5e
master
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
grustno.py
+113
-0
grustno.py
with
113 additions
and
0 deletions
+113
-0
grustno.py
+
113
−
0
View file @
de562ff4
...
...
@@ -4,3 +4,116 @@ print("ГОЙДАААААААА")
import
pandas
as
pd
# Load the dataset and preview the first rows as a quick sanity check.
df = pd.read_csv('dataset__1_.csv')
print(df.head())
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
# Data loading: `df` was already read from 'dataset__1_.csv' at the top of the
# script and is not modified in between, so the redundant second read of the
# same file has been removed.
# Class balancing
def balance_classes(df, target_column):
    """Upsample every class in *df* so all classes match the largest one.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    target_column : str
        Name of the label column to balance on.

    Returns
    -------
    pandas.DataFrame
        Shuffled (seed 42), index-reset frame in which every class has
        as many rows as the originally largest class.
    """
    max_size = df[target_column].value_counts().max()
    parts = []
    for cls in df[target_column].unique():
        cls_df = df[df[target_column] == cls]
        if len(cls_df) < max_size:
            # Upsample with replacement so this class reaches max_size.
            cls_df = resample(cls_df, replace=True, n_samples=max_size,
                              random_state=42)
        parts.append(cls_df)
    # Concatenate once at the end: repeated pd.concat inside the loop is
    # quadratic, and the original also concatenated onto an empty DataFrame,
    # a pattern pandas deprecates.
    balanced_df = pd.concat(parts)
    return balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
# Balance the label column, then split into train/test/validation.
df_balanced = balance_classes(df, 'type')

# Train/test/validation split: 20% held out for test, then 10% of the
# remaining train portion held out for validation. Fixed seeds keep the
# splits reproducible across runs.
train_df, test_df = train_test_split(df_balanced, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# Convert the pandas frames into HuggingFace `datasets.Dataset` objects,
# which is what Trainer consumes below.
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)
# Load model and tokenizer
model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Mistral's tokenizer ships without a padding token, while tokenize_function
# and zero_shot_classification below both request padding — without a pad
# token those calls raise. Reuse EOS as the pad token and tell the model's
# config which id it is so padded positions are handled correctly.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Sequence-classification head with 4 output labels on top of the base LM.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)
model.config.pad_token_id = tokenizer.pad_token_id
# LoRA setup
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,   # sequence-classification task
    inference_mode=False,         # adapters are being trained, not frozen
    r=8,                          # low-rank adapter dimension
    lora_alpha=32,                # scaling factor applied to adapter updates
    lora_dropout=0.1,
    # Attach adapters to the attention query/value projection layers.
    target_modules=["q_proj", "v_proj"]
)
# Wrap the base model with the LoRA adapters.
model = get_peft_model(model, peft_config)
# Tokenization
def tokenize_function(examples):
    """Tokenize a batch's 'prompt' strings, padded to max length and truncated."""
    prompts = examples['prompt']
    return tokenizer(prompts, truncation=True, padding="max_length")
# Tokenize every split; batched=True passes whole batches to the function
# instead of one example at a time.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
# NOTE(review): the label column is still the raw 'type' column — nothing in
# this file produces a numeric 'labels' column, which Trainer is presumed to
# need for the loss; confirm label encoding/renaming happens before training.
# Training hyperparameters
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",      # evaluate at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",            # checkpoint each epoch, matching eval cadence
    load_best_model_at_end=True,      # reload the best checkpoint after training
)
# Metric computation for Trainer evaluation
def compute_metrics(p):
    """Return macro-averaged f1/precision/recall for a (logits, labels) pair."""
    logits, labels = p
    preds = np.argmax(logits, axis=1)
    metrics = {}
    for name, scorer in (('f1', f1_score),
                         ('precision', precision_score),
                         ('recall', recall_score)):
        metrics[name] = scorer(labels, preds, average='macro')
    return metrics
# Model training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,              # passed so Trainer can pad/collate batches
    compute_metrics=compute_metrics,
)
trainer.train()

# Final evaluation on the held-out test split (prints loss + the macro
# f1/precision/recall produced by compute_metrics).
results = trainer.evaluate(test_dataset)
print(results)
# Zero-shot classification
def zero_shot_classification(text):
    """Classify a single raw string with the model; return the argmax class index.

    Parameters
    ----------
    text : str
        Input text to classify.

    Returns
    -------
    int
        Index of the most probable class according to the model's logits.
    """
    # eval() disables dropout so repeated calls give deterministic predictions;
    # no_grad() skips autograd bookkeeping during pure inference.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = outputs.logits.softmax(dim=-1)
    predicted_class = probs.argmax().item()
    return predicted_class
# Example classification call on a single prompt
example_text = "This is a malicious prompt"
predicted_class = zero_shot_classification(example_text)
print(f"Predicted class: {predicted_class}")
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment
Menu
Explore
Projects
Groups
Topics
Snippets