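"""Fine-tune mistralai/Mistral-7B-v0.1 with 8-bit quantization and LoRA
for 4-class prompt classification.

Pipeline: balance classes by oversampling, split into train/val/test,
tokenize, train with the Hugging Face Trainer, evaluate with macro
F1/precision/recall, then run single-example inference.
"""
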
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np
import torch

# Load the data
df = pd.read_csv('dataset__1_.csv')

# Balance classes by oversampling each minority class up to the majority size
def balance_classes(df, target_column):
    max_size = df[target_column].value_counts().max()
    parts = []
    for cls in df[target_column].unique():
        cls_df = df[df[target_column] == cls]
        if len(cls_df) < max_size:
            cls_df = resample(cls_df, replace=True, n_samples=max_size, random_state=42)
        parts.append(cls_df)
    return pd.concat(parts).sample(frac=1, random_state=42).reset_index(drop=True)

df_balanced = balance_classes(df, 'type')

# Encode the string labels in 'type' as integer ids; the Trainer expects an
# integer 'labels' column (assumes 'type' holds the class names)
label2id = {label: i for i, label in enumerate(sorted(df_balanced['type'].unique()))}
id2label = {i: label for label, i in label2id.items()}
df_balanced['labels'] = df_balanced['type'].map(label2id)
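
# Sanity check: after oversampling, every class should appear equally often
print(df_balanced['type'].value_counts())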

# Split into train/test/validation (note: oversampling before the split means
# duplicated rows can land in both train and test, so test metrics may be optimistic)
train_df, test_df = train_test_split(df_balanced, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# Convert to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# 8-bit quantization settings
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,  # enable 8-bit quantization
    llm_int8_threshold=6.0  # outlier threshold for int8 matmul
)

# Load model and tokenizer
model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Mistral ships without a pad token
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,  # number of classes
    quantization_config=quantization_config,  # apply 8-bit quantization
    device_map="auto"  # place layers on GPU/CPU automatically
)
model.config.pad_token_id = tokenizer.pad_token_id

# Prepare the 8-bit model for LoRA fine-tuning (casts norms to fp32 and
# enables input gradients, as required for k-bit training)
model = prepare_model_for_kbit_training(model)

# LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # sequence classification
    inference_mode=False,
    r=8,  # LoRA rank
    lora_alpha=32,  # LoRA scaling factor
    lora_dropout=0.1,  # dropout on LoRA layers
    target_modules=["q_proj", "v_proj"]  # attention projections to adapt
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)
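
# Optional: report how few parameters LoRA actually trains (peft helper)
model.print_trainable_parameters()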

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['prompt'], padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",  # where checkpoints and results are written
    evaluation_strategy="epoch",  # evaluate after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=2,  # training batch size per device
    per_device_eval_batch_size=2,  # evaluation batch size per device
    num_train_epochs=3,
    weight_decay=0.01,  # L2 regularization strength
    save_strategy="epoch",  # checkpoint after every epoch
    load_best_model_at_end=True,  # reload the best checkpoint when training ends
    logging_dir="./logs",
    logging_steps=10,  # log every 10 steps
    fp16=True  # mixed precision (requires a CUDA GPU)
)

# Metrics for the Trainer (macro-averaged over the 4 classes)
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {
        'f1': f1_score(labels, predictions, average='macro'),
        'precision': precision_score(labels, predictions, average='macro'),
        'recall': recall_score(labels, predictions, average='macro')
    }

# Set up the Trainer and train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
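
# Optionally persist the LoRA adapter and tokenizer for later reuse (the
# adapter path is an arbitrary example); PeftModel.save_pretrained stores
# only the adapter weights, not the full base model
model.save_pretrained("./lora_adapter")
tokenizer.save_pretrained("./lora_adapter")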

# Evaluate on the held-out test set
results = trainer.evaluate(test_dataset)
print(results)

# Single-example inference with the fine-tuned model
def classify_text(text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}  # move inputs to the model's device
    with torch.no_grad():
        outputs = model(**inputs)
    probs = outputs.logits.softmax(dim=-1)
    return probs.argmax().item()

# Example classification
example_text = "This is a malicious prompt"
predicted_class = classify_text(example_text)
print(f"Predicted class: {predicted_class}")

