Files
LogPatternExtractor/Generator/LogGenerator.py
2026-05-02 18:33:38 +03:00

179 lines
7.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import random
import re
from sentence_transformers import InputExample
from Generator.Enums.RandomType import RandomType
from Generator.Models.ConstLiteral import ConstLiteral
from Generator.Models.Term import Term
from Generator.Models.VariableLiteral import VariableLiteral
from Generator.UniversalRandomizer import UniversalRandomizer
class LogGenerator:
    """Generate synthetic log templates and training examples built from them.

    A template is a ``Term``: a sequence of constant and variable literals
    joined by a separator. ``generate_training_data`` renders templates into
    text and packs (anchor, positive, negative) groups into ``InputExample``
    objects; every text is first normalised by ``mask_log_structure``.
    """

    # Patterns used by mask_log_structure, compiled once for the whole class.
    _GUID_RE = re.compile(
        r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}'
    )
    _IPV4_RE = re.compile(r'\d{1,3}(?:\.\d{1,3}){3}')
    # \b keeps "100x5" from being read as "10" followed by the hex "0x5".
    _HEX_RE = re.compile(r'\b0x[0-9a-fA-F]+')
    _FLOAT_RE = re.compile(r'-?\d+\.\d+')
    _INT_RE = re.compile(r'-?\d+')

    def __init__(self):
        """Set up the variable wrappers and the constant-word vocabulary."""
        # (prefix, postfix) pairs put around variables: id=..., [ip], 'user'.
        # The empty pair is listed twice, so "no wrapper" is picked more often.
        self.wrappers = [("", ""), ("", ""), ("id=", ""), ("user:", ""), ("[", "]"), ("'", "'")]
        # Vocabulary for constant literals (imitates words found in logs).
        self.log_keywords = [
            # Log levels
            "INFO", "ERROR", "WARN", "DEBUG", "TRACE", "CRITICAL", "FATAL", "NOTICE",
            # Actions (verbs)
            "started", "stopped", "failed", "completed", "aborted", "retrying",
            "connecting", "disconnected", "listening", "resolving", "binding",
            "parsing", "rendering", "authenticating", "authorizing", "validated",
            "rejected", "accepted", "dropped", "created", "deleted", "updated",
            "fetching", "sending", "receiving", "waiting", "killing", "spawning",
            # Entities (nouns)
            "System", "Kernel", "Thread", "Process", "Worker", "Daemon", "Job",
            "Connection", "Session", "User", "Client", "Server", "Proxy", "Gateway",
            "Database", "Table", "Index", "Query", "Transaction", "Commit", "Rollback",
            "Cache", "Buffer", "Heap", "Stack", "Memory", "Disk", "Volume",
            "Network", "Port", "Socket", "Interface", "Protocol", "Packet",
            "Request", "Response", "Header", "Body", "Payload", "Token", "Key",
            "File", "Directory", "Path", "Config", "Module", "Plugin", "Component",
            "Exception", "Error", "Timeout", "Latency", "HealthCheck", "Status",
            # HTTP and web
            "GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "HEAD",
            "HTTP/1.1", "HTTP/2", "API", "Endpoint", "Route", "URI", "URL",
            "JSON", "XML", "YAML", "HTML", "CSS", "JS",
            # Prepositions and connectives
            "at", "in", "on", "to", "from", "by", "with", "for", "via", "through",
            # Adjectives and states
            "successful", "failed", "denied", "allowed", "active", "inactive",
            "pending", "queued", "blocked", "locked", "corrupted", "invalid",
            "missing", "found", "available", "unavailable", "busy", "idle",
            "secure", "insecure", "public", "private", "local", "remote"
        ]

    def generate(self, min_literals=15, max_literals=25) -> "Term":
        """Build one random log template.

        Args:
            min_literals: Smallest number of literals in the template (inclusive).
            max_literals: Largest number of literals in the template (inclusive).

        Returns:
            A ``Term`` whose literals are a random mix of constants and
            variables, with a separator chosen from ``" "``, ``";"`` and ``"|"``.

        Raises:
            ValueError: If ``min_literals`` is greater than ``max_literals``
                (raised by ``random.randint``).
        """
        count = random.randint(min_literals, max_literals)
        random_types = list(RandomType)  # loop-invariant, built once
        literals = []
        for i in range(count):
            # 60% constant, 40% variable.
            if random.random() < 0.6:
                # 80% a vocabulary word, otherwise a word from the randomizer.
                if random.random() < 0.8:
                    txt = random.choice(self.log_keywords)
                else:
                    txt = UniversalRandomizer.fake.text.word()
                literals.append(ConstLiteral(text=txt))
            else:
                r_type = random.choice(random_types)
                pref, post = random.choice(self.wrappers)
                literals.append(VariableLiteral(name=f"v{i}", type=r_type, prefix=pref, postfix=post))
        return Term(literals=literals, separator=random.choice([" ", ";", "|"]))

    def generate_training_data(self, count=100):
        """Produce masked (anchor, positive, negative) examples.

        For each of ``count`` random templates three ``InputExample`` objects
        are appended, all sharing the same anchor and positive and differing
        in the negative, in this order:

        1. hard negative - the template's literals shuffled, same separator;
        2. easy negative - a render of an unrelated template;
        3. very hard negative - the same shuffled literals joined with a
           randomly chosen separator.

        Args:
            count: Number of templates to generate.

        Returns:
            A list with ``3 * count`` ``InputExample`` objects.
        """
        train_examples = []
        for _ in range(count):
            # 1. Anchor: one render of a fresh template.
            anchor_term = self.generate()
            anchor_text = anchor_term.render().text

            # 2. Positive: another render of the same template.
            positive_text = anchor_term.render().text

            # 3. Hard negative: same literals in shuffled order, same separator.
            literals_copy = anchor_term.literals[:]
            random.shuffle(literals_copy)
            negative_hard_text = anchor_term.separator.join([lit.render().text for lit in literals_copy])

            # 4. Easy negative: a completely different template.
            random_other_term = self.generate()
            negative_easy_text = random_other_term.render().text

            # 5. Very hard negative: shuffled literals with a random separator.
            # NOTE(review): " " appears twice in this list - confirm whether the
            # extra weight is intended or a different character was meant.
            bad_sep = random.choice([" ", ";", "|", " "])
            negative_very_hard_text = bad_sep.join([lit.render().text for lit in literals_copy])

            # 6. Pack for Sentence Transformers. Masking is deterministic, so
            # the anchor and the positive are masked once and reused.
            masked_anchor = self.mask_log_structure(anchor_text)
            masked_positive = self.mask_log_structure(positive_text)
            for negative_text in (negative_hard_text, negative_easy_text, negative_very_hard_text):
                train_examples.append(InputExample(texts=[
                    masked_anchor,
                    masked_positive,
                    self.mask_log_structure(negative_text)
                ]))
        return train_examples

    def mask_log_structure(self, text: str) -> str:
        """Replace concrete values in ``text`` with placeholder tokens.

        Substitutions run from the most specific pattern to the most general
        one, because the later patterns would otherwise consume parts of the
        earlier ones:

        1. GUID/UUID, e.g. ``123e4567-e89b-12d3-a456-426614174000`` -> ``<GUID>``
        2. IPv4 address, e.g. ``192.168.0.1`` -> ``<IP>`` (before floats, or
           ``192.168`` would be taken as a float)
        3. ``0x``-prefixed hex, e.g. ``0x7fff5fbff`` -> ``<HEX>`` (before the
           numeric rules, or its digits would already be ``<NUM>``)
        4. Float, e.g. ``0.05``, ``-3.14`` -> ``<NUM>``
        5. Integer, e.g. ``404``, ``-1`` -> ``<NUM>``

        Args:
            text: Rendered log line.

        Returns:
            The line with the matched values replaced by their tokens.
        """
        text = self._GUID_RE.sub('<GUID>', text)
        text = self._IPV4_RE.sub('<IP>', text)
        text = self._HEX_RE.sub('<HEX>', text)
        text = self._FLOAT_RE.sub('<NUM>', text)
        text = self._INT_RE.sub('<NUM>', text)
        return text
if __name__ == "__main__":
    # Demo: build one batch of training examples (result is discarded), then
    # print renders of several random templates.
    generator = LogGenerator()
    generator.generate_training_data(count=1)
    print("Пример генерации датасета:\n")

    # Show ten sample templates.
    for sample_idx in range(10):
        # Build a fresh Term for this sample.
        term = generator.generate()
        print(f"--- Sample {sample_idx + 1} ---")
        # One render whose result is not used, kept before the structure print.
        term.render()
        print(f"{term.structure().text}")

        # Positives: repeated renders of the unchanged template.
        for attempt in range(5):
            print(f"Positive {attempt}: {term.render().text}")

        # Negatives: literals shuffled in place and a new separator picked
        # before every render.
        for attempt in range(5):
            random.shuffle(term.literals)
            term.separator = random.choice([" ", ";", "|"])
            print(f"Negative {attempt}: {term.render().text}")