179 lines
7.7 KiB
Python
179 lines
7.7 KiB
Python
import random
|
||
import re
|
||
|
||
from sentence_transformers import InputExample
|
||
|
||
from Generator.Enums.RandomType import RandomType
|
||
from Generator.Models.ConstLiteral import ConstLiteral
|
||
from Generator.Models.Term import Term
|
||
from Generator.Models.VariableLiteral import VariableLiteral
|
||
from Generator.UniversalRandomizer import UniversalRandomizer
|
||
|
||
|
||
class LogGenerator:
    """Generate synthetic log-line templates and triplet training data.

    A template is a ``Term`` made of constant literals (log-like words) and
    variable literals (typed placeholders with an optional prefix/postfix
    wrapper). ``generate_training_data`` turns templates into
    ``InputExample`` triplets of the form (anchor, positive, negative).
    """

    # Separators a generated Term may use between its literals.
    _SEPARATORS = (" ", ";", "|")

    # Masking rules, applied strictly in this order by ``mask_log_structure``.
    # Order matters: each rule must run before any later rule that could
    # consume part of its match (IP before floats, hex before plain numbers).
    _MASK_RULES = (
        # GUID / UUID, e.g. 123e4567-e89b-12d3-a456-426614174000
        (re.compile(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}'), '<GUID>'),
        # IPv4 address, e.g. 192.168.0.1 (before floats, or 192.168 would
        # be taken as a float).
        (re.compile(r'\d{1,3}(?:\.\d{1,3}){3}'), '<IP>'),
        # Hex literal, e.g. 0x7fff5fbff (before the numeric rules, or the
        # leading "0" and embedded digits would already be replaced by <NUM>).
        (re.compile(r'0x[0-9a-fA-F]+'), '<HEX>'),
        # Floating point number, e.g. 0.05, 123.45, -3.14
        (re.compile(r'-?\d+\.\d+'), '<NUM>'),
        # Integer, e.g. 404, 500, -1
        (re.compile(r'-?\d+'), '<NUM>'),
    )

    def __init__(self):
        """Set up the wrapper pairs and the keyword vocabulary."""
        # (prefix, postfix) wrappers for variables: id=..., [ip], 'user'.
        # The empty pair is listed twice, so it is drawn twice as often.
        self.wrappers = [("", ""), ("", ""), ("id=", ""), ("user:", ""), ("[", "]"), ("'", "'")]

        # Vocabulary for constant literals (imitates real log wording).
        self.log_keywords = [
            # Log levels
            "INFO", "ERROR", "WARN", "DEBUG", "TRACE", "CRITICAL", "FATAL", "NOTICE",

            # Actions (verbs)
            "started", "stopped", "failed", "completed", "aborted", "retrying",
            "connecting", "disconnected", "listening", "resolving", "binding",
            "parsing", "rendering", "authenticating", "authorizing", "validated",
            "rejected", "accepted", "dropped", "created", "deleted", "updated",
            "fetching", "sending", "receiving", "waiting", "killing", "spawning",

            # Entities (nouns)
            "System", "Kernel", "Thread", "Process", "Worker", "Daemon", "Job",
            "Connection", "Session", "User", "Client", "Server", "Proxy", "Gateway",
            "Database", "Table", "Index", "Query", "Transaction", "Commit", "Rollback",
            "Cache", "Buffer", "Heap", "Stack", "Memory", "Disk", "Volume",
            "Network", "Port", "Socket", "Interface", "Protocol", "Packet",
            "Request", "Response", "Header", "Body", "Payload", "Token", "Key",
            "File", "Directory", "Path", "Config", "Module", "Plugin", "Component",
            "Exception", "Error", "Timeout", "Latency", "HealthCheck", "Status",

            # HTTP and web
            "GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "HEAD",
            "HTTP/1.1", "HTTP/2", "API", "Endpoint", "Route", "URI", "URL",
            "JSON", "XML", "YAML", "HTML", "CSS", "JS",

            # Prepositions and connectives
            "at", "in", "on", "to", "from", "by", "with", "for", "via", "through",

            # Adjectives and states
            "successful", "failed", "denied", "allowed", "active", "inactive",
            "pending", "queued", "blocked", "locked", "corrupted", "invalid",
            "missing", "found", "available", "unavailable", "busy", "idle",
            "secure", "insecure", "public", "private", "local", "remote"
        ]

    def generate(self, min_literals=15, max_literals=25) -> "Term":
        """Build one random log template.

        Args:
            min_literals: Smallest number of literals in the template.
            max_literals: Largest number of literals in the template
                (inclusive).

        Returns:
            A ``Term`` whose literal count is drawn from
            ``random.randint(min_literals, max_literals)`` and whose
            separator is one of ``" "``, ``";"`` or ``"|"``.

        Raises:
            ValueError: Propagated from ``random.randint`` when
                ``min_literals`` is greater than ``max_literals``.
        """
        count = random.randint(min_literals, max_literals)
        literals = []

        for i in range(count):
            # 60% constant, 40% variable.
            if random.random() < 0.6:
                # 80% a vocabulary keyword, otherwise a random word.
                if random.random() < 0.8:
                    txt = random.choice(self.log_keywords)
                else:
                    txt = UniversalRandomizer.fake.text.word()
                literals.append(ConstLiteral(text=txt))
            else:
                r_type = random.choice(list(RandomType))
                pref, post = random.choice(self.wrappers)
                # The variable name carries the literal's position in the term.
                literals.append(VariableLiteral(name=f"v{i}", type=r_type, prefix=pref, postfix=post))

        return Term(literals=literals, separator=random.choice(self._SEPARATORS))

    def generate_training_data(self, count=100):
        """Create (anchor, positive, negative) triplets for Sentence Transformers.

        For each of ``count`` generated templates three ``InputExample``
        objects are appended, in this order: hard negative, easy negative,
        very hard negative. Every text is passed through
        ``mask_log_structure`` before being stored.

        Args:
            count: Number of anchor templates to generate.

        Returns:
            A list of ``3 * count`` ``InputExample`` objects.
        """
        train_examples = []

        for _ in range(count):
            anchor_term = self.generate()

            # 1. Anchor and 2. positive: two separate renders of the same
            # template (presumably with freshly drawn variable values --
            # verify against Term.render).
            anchor_text = anchor_term.render().text
            positive_text = anchor_term.render().text

            # 3. Hard negative: the same literals in shuffled order, joined
            # with the anchor's own separator.
            literals_copy = anchor_term.literals[:]
            random.shuffle(literals_copy)
            negative_hard_text = anchor_term.separator.join([lit.render().text for lit in literals_copy])

            # 4. Easy negative: a completely different template.
            random_other_term = self.generate()
            negative_easy_text = random_other_term.render().text

            # 5. Very hard negative: the same shuffled literals joined with a
            # randomly chosen separator.
            # NOTE(review): " " is listed twice, which weights the choice
            # towards a space; confirm whether another separator was meant.
            bad_sep = random.choice([" ", ";", "|", " "])
            negative_very_hard_text = bad_sep.join([lit.render().text for lit in literals_copy])

            # 6. Pack for Sentence Transformers. Anchor and positive are
            # masked once and shared by all three triplets.
            anchor_masked = self.mask_log_structure(anchor_text)
            positive_masked = self.mask_log_structure(positive_text)

            for negative_text in (
                negative_hard_text,       # shuffled, separator preserved
                negative_easy_text,       # a different log
                negative_very_hard_text,  # shuffled + random separator
            ):
                train_examples.append(InputExample(texts=[
                    anchor_masked,
                    positive_masked,
                    self.mask_log_structure(negative_text)
                ]))

        return train_examples

    def mask_log_structure(self, text: str) -> str:
        """Replace volatile values in a log line with placeholder tokens.

        Rules are applied in the order GUID, IPv4, hex literal, float,
        integer, producing ``<GUID>``, ``<IP>``, ``<HEX>`` and ``<NUM>``.
        The hex rule runs before the numeric rules so that a value such as
        ``0x7fff5fbff`` becomes ``<HEX>`` instead of being broken up into
        ``<NUM>`` fragments.

        Args:
            text: Raw log line.

        Returns:
            The log line with every match replaced by its token.
        """
        for pattern, token in self._MASK_RULES:
            text = pattern.sub(token, text)
        return text
|
||
|
||
|
||
|
||
if __name__ == "__main__":
    generator = LogGenerator()
    # Run the training-data pipeline once; its return value is not used here.
    generator.generate_training_data(count=1)

    print("Пример генерации датасета:\n")

    separators = [" ", ";", "|"]

    # Print 10 sample templates.
    for i in range(10):
        # Build a fresh template.
        term = generator.generate()

        print(f"--- Sample {i + 1} ---")
        # The original script renders once here and never reads the result;
        # the call is kept so the sequence of calls stays the same.
        term.render()
        print(f"{term.structure().text}")

        # Positives: repeated renders of the unchanged template.
        for j in range(5):
            result = term.render()
            print(f"Positive {j}: {result.text}")

        # Negatives: shuffle the literals in place and pick a new separator
        # before each render. The template is mutated cumulatively.
        for j in range(5):
            random.shuffle(term.literals)
            term.separator = random.choice(separators)
            result = term.render()
            print(f"Negative {j}: {result.text}")
|