import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import get_peft_model, prepare_model_for_kbit_training, LoraConfig, TaskType
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np
import torch

# Load the data
df = pd.read_csv('dataset__1_.csv')

# Balance the classes by oversampling each minority class up to the size of the largest class
def balance_classes(df, target_column):
    classes = df[target_column].unique()
    max_size = max(df[target_column].value_counts())
    balanced_df = pd.DataFrame()
    for cls in classes:
        cls_df = df[df[target_column] == cls]
        if len(cls_df) < max_size:
            cls_df = resample(cls_df, replace=True, n_samples=max_size, random_state=42)
        balanced_df = pd.concat([balanced_df, cls_df])
    return balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

df_balanced = balance_classes(df, 'type')

# Encode the class labels as integer ids in a 'labels' column, which Trainer expects
# (assumes the 'type' column holds the class labels)
label2id = {label: i for i, label in enumerate(sorted(df_balanced['type'].unique()))}
df_balanced['labels'] = df_balanced['type'].map(label2id)

# Split into train/test/validation
train_df, test_df = train_test_split(df_balanced, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# Convert to Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Quantization settings (8-bit)
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,       # enable 8-bit quantization
    llm_int8_threshold=6.0   # outlier threshold for int8 quantization
)

# Load the model and tokenizer
model_name = "mistralai/Mistral-7B-v0.1"  # Mistral-7B model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Mistral's tokenizer has no padding token, so reuse the EOS token for padding
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,                             # number of classes
    quantization_config=quantization_config,  # apply quantization
    device_map="auto"                         # place layers on GPU/CPU automatically
)
model.config.pad_token_id = tokenizer.pad_token_id

# Prepare the quantized model for adapter training
model = prepare_model_for_kbit_training(model)

# LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,          # task type (sequence classification)
    inference_mode=False,
    r=8,                                 # LoRA rank
    lora_alpha=32,                       # LoRA alpha
    lora_dropout=0.1,                    # LoRA dropout
    target_modules=["q_proj", "v_proj"]  # modules to adapt
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['prompt'], padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",         # directory for checkpoints and results
    evaluation_strategy="epoch",    # evaluate after every epoch
    learning_rate=2e-5,             # learning rate
    per_device_train_batch_size=2,  # training batch size
    per_device_eval_batch_size=2,   # evaluation batch size
    num_train_epochs=3,             # number of epochs
    weight_decay=0.01,              # L2 regularization weight
    save_strategy="epoch",          # save after every epoch
    load_best_model_at_end=True,    # load the best checkpoint at the end
    logging_dir="./logs",           # directory for logs
    logging_steps=10,               # logging frequency
    fp16=True                       # mixed precision (if available)
)

# Metrics
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {
        'f1': f1_score(labels, predictions, average='macro'),
        'precision': precision_score(labels, predictions, average='macro'),
        'recall': recall_score(labels, predictions, average='macro')
    }

# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

# Evaluate the model on the test set
results = trainer.evaluate(test_dataset)
print(results)

# Zero-shot classification of an arbitrary text
def zero_shot_classification(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = outputs.logits.softmax(dim=-1)
    predicted_class = probs.argmax().item()
    return predicted_class

# Zero-shot classification example
example_text = "This is a malicious prompt"
predicted_class = zero_shot_classification(example_text)
print(f"Predicted class: {predicted_class}")