import random
import re

from sentence_transformers import InputExample

from Generator.Enums.RandomType import RandomType
from Generator.Models.ConstLiteral import ConstLiteral
from Generator.Models.Term import Term
from Generator.Models.VariableLiteral import VariableLiteral
from Generator.UniversalRandomizer import UniversalRandomizer


class LogGenerator:
    """Generate synthetic log-line templates and triplet training data.

    A template is a ``Term`` made of constant literals (log-like keywords or
    random words) and variable literals (typed placeholders with an optional
    prefix/postfix wrapper). ``generate_training_data`` renders such templates
    into (anchor, positive, negative) text triplets wrapped in
    ``InputExample`` objects.
    """

    # Ordered (pattern, replacement) masking rules used by mask_log_structure.
    # Order matters:
    #   * IPv4 runs before floats, otherwise "192.168" is taken for a float;
    #   * hex literals run before floats/integers, otherwise the integer rule
    #     eats the digits of "0x7fff5fbff" and the hex rule can never match.
    # NOTE(review): every replacement is the empty string, as in the reviewed
    # source. That source looks like it lost angle-bracket text somewhere in
    # this method, so placeholder tokens may have been intended here --
    # confirm with the author and restore them in this table if so.
    _MASK_RULES = (
        # GUID / UUID (strict pattern), e.g. 123e4567-e89b-12d3-a456-426614174000
        (re.compile(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}'), ''),
        # IPv4 addresses, e.g. 192.168.0.1
        (re.compile(r'\d{1,3}(?:\.\d{1,3}){3}'), ''),
        # Hex strings (memory addresses, hashes), e.g. 0x7fff5fbff
        (re.compile(r'0x[0-9a-fA-F]+'), ''),
        # Floating point numbers, e.g. 0.05, 123.45, -3.14
        (re.compile(r'-?\d+\.\d+'), ''),
        # Integers, e.g. 404, 500, -1
        (re.compile(r'-?\d+'), ''),
    )

    def __init__(self):
        # Wrappers (prefix, postfix) for variables: id=..., [ip], 'user'.
        # The empty wrapper is listed twice, so it is picked twice as often.
        self.wrappers = [("", ""), ("", ""), ("id=", ""), ("user:", ""), ("[", "]"), ("'", "'")]

        # Vocabulary for constant literals (imitates real log lines).
        self.log_keywords = [
            # Log levels
            "INFO", "ERROR", "WARN", "DEBUG", "TRACE", "CRITICAL", "FATAL", "NOTICE",
            # Actions (verbs)
            "started", "stopped", "failed", "completed", "aborted", "retrying",
            "connecting", "disconnected", "listening", "resolving", "binding",
            "parsing", "rendering", "authenticating", "authorizing",
            "validated", "rejected", "accepted", "dropped", "created", "deleted", "updated",
            "fetching", "sending", "receiving", "waiting", "killing", "spawning",
            # Entities (nouns)
            "System", "Kernel", "Thread", "Process", "Worker", "Daemon", "Job",
            "Connection", "Session", "User", "Client", "Server", "Proxy", "Gateway",
            "Database", "Table", "Index", "Query", "Transaction", "Commit", "Rollback",
            "Cache", "Buffer", "Heap", "Stack", "Memory", "Disk", "Volume",
            "Network", "Port", "Socket", "Interface", "Protocol", "Packet",
            "Request", "Response", "Header", "Body", "Payload", "Token", "Key",
            "File", "Directory", "Path", "Config", "Module", "Plugin", "Component",
            "Exception", "Error", "Timeout", "Latency", "HealthCheck", "Status",
            # HTTP and Web
            "GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "HEAD",
            "HTTP/1.1", "HTTP/2", "API", "Endpoint", "Route", "URI", "URL",
            "JSON", "XML", "YAML", "HTML", "CSS", "JS",
            # Prepositions and connectives
            "at", "in", "on", "to", "from", "by", "with", "for", "via", "through",
            # Adjectives and states
            "successful", "failed", "denied", "allowed", "active", "inactive", "pending", "queued",
            "blocked", "locked", "corrupted", "invalid", "missing", "found",
            "available", "unavailable", "busy", "idle", "secure", "insecure",
            "public", "private", "local", "remote"
        ]

    def generate(self, min_literals=15, max_literals=25) -> Term:
        """Build a random log template.

        Args:
            min_literals: Smallest number of literals in the template (inclusive).
            max_literals: Largest number of literals in the template (inclusive).

        Returns:
            A ``Term`` holding the generated literals and a separator chosen at
            random from space, semicolon and pipe.
        """
        count = random.randint(min_literals, max_literals)
        literals = []

        for i in range(count):
            # 60% constant, 40% variable.
            if random.random() < 0.6:
                # 80%: a word from the vocabulary; otherwise a random word.
                if random.random() < 0.8:
                    txt = random.choice(self.log_keywords)
                else:
                    txt = UniversalRandomizer.fake.text.word()
                literals.append(ConstLiteral(text=txt))
            else:
                r_type = random.choice(list(RandomType))
                pref, post = random.choice(self.wrappers)
                literals.append(VariableLiteral(name=f"v{i}", type=r_type, prefix=pref, postfix=post))

        return Term(literals=literals, separator=random.choice([" ", ";", "|"]))

    def generate_training_data(self, count=100) -> list:
        """Generate masked (anchor, positive, negative) triplets.

        For each of ``count`` templates three ``InputExample`` objects are
        produced. They share the same anchor and positive (two independent
        renders of one template) and differ in the negative:

        * hard negative: the same literals shuffled, original separator;
        * easy negative: a render of a completely different template;
        * very hard negative: the same shuffled literals joined with a
          randomly chosen separator.

        Every text is passed through ``mask_log_structure``.

        Args:
            count: Number of templates to generate.

        Returns:
            A list of ``3 * count`` ``InputExample`` objects.
        """
        train_examples = []

        for _ in range(count):
            # 1. Anchor: one render of a fresh template.
            anchor_term = self.generate()
            anchor_text = anchor_term.render().text

            # 2. Positive: another render of the same template.
            positive_text = anchor_term.render().text

            # 3. Hard negative: shuffled literals, same separator.
            literals_copy = anchor_term.literals[:]
            random.shuffle(literals_copy)
            negative_hard_text = anchor_term.separator.join([lit.render().text for lit in literals_copy])

            # 4. Easy negative: a completely different template.
            random_other_term = self.generate()
            negative_easy_text = random_other_term.render().text

            # 5. Very hard negative: shuffled literals + random separator.
            # NOTE(review): the last choice duplicates the first (" ") in the
            # reviewed source; it may originally have been another whitespace
            # string (tab, double space) -- confirm.
            bad_sep = random.choice([" ", ";", "|", " "])
            negative_very_hard_text = bad_sep.join([lit.render().text for lit in literals_copy])

            # 6. Pack for Sentence Transformers. Masking is deterministic, so
            # the shared anchor/positive are masked once and reused.
            masked_anchor = self.mask_log_structure(anchor_text)
            masked_positive = self.mask_log_structure(positive_text)

            # Shuffled, separator preserved.
            train_examples.append(InputExample(texts=[
                masked_anchor,
                masked_positive,
                self.mask_log_structure(negative_hard_text)
            ]))

            # A different log line.
            train_examples.append(InputExample(texts=[
                masked_anchor,
                masked_positive,
                self.mask_log_structure(negative_easy_text)
            ]))

            # Shuffled + random separator.
            train_examples.append(InputExample(texts=[
                masked_anchor,
                masked_positive,
                self.mask_log_structure(negative_very_hard_text)
            ]))

        return train_examples

    def mask_log_structure(self, text: str) -> str:
        """Remove volatile values from a log line, keeping its structure.

        The rules in ``_MASK_RULES`` are applied in order: GUID/UUID, IPv4
        address, hex literal (``0x...``), float, integer. Each match is
        substituted with the rule's replacement string (currently empty).

        Args:
            text: Rendered log line.

        Returns:
            The text with all matched values replaced.
        """
        for pattern, replacement in self._MASK_RULES:
            text = pattern.sub(replacement, text)
        return text


if __name__ == "__main__":
    gen = LogGenerator()
    # Run the training-data pipeline once; the result is not used here.
    gen.generate_training_data(count=1)
    print("Пример генерации датасета:\n")

    # Generate 10 sample templates.
    for i in range(10):
        # 1. Get a Term object.
        term = gen.generate()

        print(f"--- Sample {i + 1} ---")
        # NOTE(review): this render's result is never used; kept in case
        # render() has side effects the following calls rely on -- verify.
        result = term.render()
        print(f"{term.structure().text}")

        for j in range(5):
            # 2. Positive: render the same template again.
            result = term.render()
            print(f"Positive {j}: {result.text}")

        for j in range(5):
            # 3. Negative: shuffle the literals in place, pick a random
            # separator, then render.
            random.shuffle(term.literals)
            term.separator = random.choice([" ", ";", "|"])
            result = term.render()
            print(f"Negative {j}: {result.text}")