Commit de562ff4 authored by Мазур Грета Евгеньевна

first run

parent 5ad7cc5e
@@ -4,3 +4,116 @@ print("ГОЙДАААААААА")
import pandas as pd
df = pd.read_csv('dataset__1_.csv')
print(df.head())
import numpy as np
import torch
from datasets import Dataset
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
# Load the data
df = pd.read_csv('dataset__1_.csv')
# Balance classes by upsampling minority classes to the majority class size
def balance_classes(df, target_column):
    classes = df[target_column].unique()
    max_size = df[target_column].value_counts().max()
    balanced_df = pd.DataFrame()
    for cls in classes:
        cls_df = df[df[target_column] == cls]
        if len(cls_df) < max_size:
            # Upsample with replacement so every class reaches max_size rows
            cls_df = resample(cls_df, replace=True, n_samples=max_size, random_state=42)
        balanced_df = pd.concat([balanced_df, cls_df])
    # Shuffle so classes are interleaved rather than grouped
    return balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
df_balanced = balance_classes(df, 'type')
# Encode the string labels in 'type' as integer ids in a 'labels' column,
# which is the column name Trainer expects for classification
label2id = {label: i for i, label in enumerate(sorted(df_balanced['type'].unique()))}
df_balanced['labels'] = df_balanced['type'].map(label2id)
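# Illustrative sanity check (not part of the original commit): after
# balancing, every class in 'type' should have the same row count
print(df['type'].value_counts())
print(df_balanced['type'].value_counts())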
# Split into train/test, then carve a validation set out of train
train_df, test_df = train_test_split(df_balanced, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)
# Convert to Hugging Face Dataset objects (drop the pandas index so it
# does not leak in as an extra column)
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)
# Load the model and tokenizer
model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Mistral's tokenizer ships without a pad token; reuse EOS so padding works
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label2id))
model.config.pad_token_id = tokenizer.pad_token_id
# Configure LoRA: adapt only the attention query/value projections
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,                # rank of the low-rank update matrices
    lora_alpha=32,      # scaling factor applied to the update
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"]
)
model = get_peft_model(model, peft_config)
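# Optional check (not part of the original commit): PeftModel provides
# print_trainable_parameters(), which reports the small fraction of
# weights LoRA actually trains versus the frozen 7B base
model.print_trainable_parameters()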
# Tokenize the prompts; the 512-token cap is an assumption, added so that
# padding="max_length" does not pad every example to Mistral's full context window
def tokenize_function(examples):
    return tokenizer(examples['prompt'], padding="max_length", truncation=True, max_length=512)
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
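# Illustrative check (not part of the original commit): each mapped example
# now carries input_ids and attention_mask alongside the original columns
print(train_dataset.column_names)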
# Training arguments
training_args = TrainingArguments(
output_dir="./results",
evaluation_strategy="epoch",
learning_rate=2e-5,
per_device_train_batch_size=4,
per_device_eval_batch_size=4,
num_train_epochs=3,
weight_decay=0.01,
save_strategy="epoch",
load_best_model_at_end=True,
)
# Metric function: macro-averaged F1, precision, and recall
def compute_metrics(p):
predictions, labels = p
predictions = np.argmax(predictions, axis=1)
return {
'f1': f1_score(labels, predictions, average='macro'),
'precision': precision_score(labels, predictions, average='macro'),
'recall': recall_score(labels, predictions, average='macro')
}
# Train the model
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=val_dataset,
tokenizer=tokenizer,
compute_metrics=compute_metrics,
)
trainer.train()
# Evaluate the model on the held-out test set
results = trainer.evaluate(test_dataset)
print(results)
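# Optional (not part of the original commit): saving the PEFT model persists
# only the LoRA adapter weights, a few megabytes instead of the full 7B base;
# the output path here is hypothetical
model.save_pretrained("./lora-prompt-classifier")
tokenizer.save_pretrained("./lora-prompt-classifier")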
# Inference helper (the original comment called this "zero-shot", but after
# trainer.train() it runs the fine-tuned classifier on new text)
def zero_shot_classification(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Move inputs to the model's device and skip gradient tracking
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    probs = outputs.logits.softmax(dim=-1)
    predicted_class = probs.argmax().item()
    return predicted_class
# Example: classify a single prompt with the fine-tuned model
example_text = "This is a malicious prompt"
predicted_class = zero_shot_classification(example_text)
print(f"Predicted class: {predicted_class}")
\ No newline at end of file