Первый запуск

This commit is contained in:
KuzarinM
2026-05-02 18:33:38 +03:00
commit cb55eaef01
51 changed files with 2127373 additions and 0 deletions

View File

@@ -0,0 +1,13 @@
from enum import Enum, auto
class RandomType(Enum):
    """Kinds of random values that a variable placeholder can be filled with.

    The numeric values are assigned by ``auto()`` and carry no meaning of
    their own; members are meant to be compared by identity/name.
    """

    # Network / time / contact style values.
    IP = auto()
    DATE = auto()
    EMAIL = auto()
    # HTTP status code.
    STATUS_CODE = auto()
    # File-system path.
    PATH = auto()
    USERNAME = auto()
    # Plain integer.
    INT = auto()
    # Software version string.
    VERSION = auto()
    # Short identifier.
    ID = auto()

178
Generator/LogGenerator.py Normal file
View File

@@ -0,0 +1,178 @@
import random
import re
from sentence_transformers import InputExample
from Generator.Enums.RandomType import RandomType
from Generator.Models.ConstLiteral import ConstLiteral
from Generator.Models.Term import Term
from Generator.Models.VariableLiteral import VariableLiteral
from Generator.UniversalRandomizer import UniversalRandomizer
class LogGenerator:
    """Generate synthetic log-line templates and triplet training examples.

    A template is a ``Term`` made of constant words and typed variable
    placeholders. ``generate_training_data`` renders such templates into
    (anchor, positive, negative) text triplets wrapped in ``InputExample``.
    """

    # Masking rules, applied in this exact order by ``mask_log_structure``.
    # Order matters:
    #   * GUIDs first, so their digit groups are not eaten by later rules;
    #   * hex literals (0x...) before any numeric rule, otherwise the leading
    #     "0" and every other digit would already be replaced by <NUM> and
    #     the hex rule could never match;
    #   * IPv4 before floats, otherwise "192.168" would be read as a float;
    #   * floats before integers.
    _MASK_RULES = (
        (re.compile(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}'), '<GUID>'),
        (re.compile(r'\b0x[0-9a-fA-F]+'), '<HEX>'),
        (re.compile(r'\d{1,3}(?:\.\d{1,3}){3}'), '<IP>'),
        (re.compile(r'-?\d+\.\d+'), '<NUM>'),
        (re.compile(r'-?\d+'), '<NUM>'),
    )

    def __init__(self):
        # (prefix, postfix) wrappers for variables: id=..., [ip], 'user'.
        # The empty wrapper is listed twice, so a bare value is picked twice
        # as often as any other wrapper.
        self.wrappers = [("", ""), ("", ""), ("id=", ""), ("user:", ""), ("[", "]"), ("'", "'")]
        # Vocabulary for constant literals (imitates typical log wording).
        self.log_keywords = [
            # Log levels
            "INFO", "ERROR", "WARN", "DEBUG", "TRACE", "CRITICAL", "FATAL", "NOTICE",
            # Actions (verbs)
            "started", "stopped", "failed", "completed", "aborted", "retrying",
            "connecting", "disconnected", "listening", "resolving", "binding",
            "parsing", "rendering", "authenticating", "authorizing", "validated",
            "rejected", "accepted", "dropped", "created", "deleted", "updated",
            "fetching", "sending", "receiving", "waiting", "killing", "spawning",
            # Entities (nouns)
            "System", "Kernel", "Thread", "Process", "Worker", "Daemon", "Job",
            "Connection", "Session", "User", "Client", "Server", "Proxy", "Gateway",
            "Database", "Table", "Index", "Query", "Transaction", "Commit", "Rollback",
            "Cache", "Buffer", "Heap", "Stack", "Memory", "Disk", "Volume",
            "Network", "Port", "Socket", "Interface", "Protocol", "Packet",
            "Request", "Response", "Header", "Body", "Payload", "Token", "Key",
            "File", "Directory", "Path", "Config", "Module", "Plugin", "Component",
            "Exception", "Error", "Timeout", "Latency", "HealthCheck", "Status",
            # HTTP and web
            "GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "HEAD",
            "HTTP/1.1", "HTTP/2", "API", "Endpoint", "Route", "URI", "URL",
            "JSON", "XML", "YAML", "HTML", "CSS", "JS",
            # Prepositions and connectives
            "at", "in", "on", "to", "from", "by", "with", "for", "via", "through",
            # Adjectives and states
            "successful", "failed", "denied", "allowed", "active", "inactive",
            "pending", "queued", "blocked", "locked", "corrupted", "invalid",
            "missing", "found", "available", "unavailable", "busy", "idle",
            "secure", "insecure", "public", "private", "local", "remote"
        ]

    def generate(self, min_literals=15, max_literals=25) -> "Term":
        """Build a random log template.

        Args:
            min_literals: Smallest number of literals in the template.
            max_literals: Largest number of literals (inclusive).

        Returns:
            A ``Term`` whose literals are a random mix of constants and
            variables, joined by a randomly chosen separator.
        """
        count = random.randint(min_literals, max_literals)
        literals = []
        for i in range(count):
            # 60% constant, 40% variable.
            if random.random() < 0.6:
                # 80% a word from the vocabulary, otherwise a random word.
                txt = random.choice(self.log_keywords) if random.random() < 0.8 else UniversalRandomizer.fake.text.word()
                literals.append(ConstLiteral(text=txt))
            else:
                r_type = random.choice(list(RandomType))
                pref, post = random.choice(self.wrappers)
                literals.append(VariableLiteral(name=f"v{i}", type=r_type, prefix=pref, postfix=post))
        return Term(literals=literals, separator=random.choice([" ", ";", "|"]))

    def generate_training_data(self, count=100):
        """Produce masked (anchor, positive, negative) triplets.

        For each of ``count`` templates three ``InputExample`` objects are
        emitted, sharing the same anchor and positive and differing in the
        negative: shuffled literals, an unrelated template, and shuffled
        literals joined with a randomly chosen separator.

        Args:
            count: Number of templates to generate.

        Returns:
            A list with ``3 * count`` ``InputExample`` items.
        """
        train_examples = []
        for _ in range(count):
            # 1. Anchor: one rendering of a fresh template.
            anchor_term = self.generate()
            anchor_text = anchor_term.render().text
            # 2. Positive: the same template rendered again.
            positive_text = anchor_term.render().text
            # 3. Hard negative: same literals in shuffled order, same separator.
            literals_copy = anchor_term.literals[:]
            random.shuffle(literals_copy)
            negative_hard_text = anchor_term.separator.join([lit.render().text for lit in literals_copy])
            # 4. Easy negative: a completely different template.
            random_other_term = self.generate()
            negative_easy_text = random_other_term.render().text
            # 5. Very hard negative: shuffled literals with a random separator.
            # NOTE(review): " " appears twice in this list and the pick may
            # coincide with the anchor's separator - confirm this is intended.
            bad_sep = random.choice([" ", ";", "|", " "])
            negative_very_hard_text = bad_sep.join([lit.render().text for lit in literals_copy])
            # 6. Pack for Sentence Transformers. Anchor and positive are
            # masked once and reused across the three triplets.
            masked_anchor = self.mask_log_structure(anchor_text)
            masked_positive = self.mask_log_structure(positive_text)
            for negative_text in (negative_hard_text, negative_easy_text, negative_very_hard_text):
                train_examples.append(InputExample(texts=[
                    masked_anchor,
                    masked_positive,
                    self.mask_log_structure(negative_text)
                ]))
        return train_examples

    def mask_log_structure(self, text: str) -> str:
        """Replace concrete values in a log line with type placeholders.

        Rules are applied in order: GUID/UUID -> ``<GUID>``, ``0x``-prefixed
        hex -> ``<HEX>``, IPv4 -> ``<IP>``, floats and integers (optionally
        signed) -> ``<NUM>``.

        Args:
            text: Raw log line.

        Returns:
            The line with matched values replaced by placeholders.
        """
        for pattern, placeholder in self._MASK_RULES:
            text = pattern.sub(placeholder, text)
        return text
if __name__ == "__main__":
    # Demo: print a few generated templates with positive/negative renderings.
    gen = LogGenerator()
    # Exercise the training-data path once; the result is discarded
    # (presumably a smoke check - confirm).
    gen.generate_training_data(count=1)
    print("Пример генерации датасета:\n")
    # Generate 10 sample templates.
    for i in range(10):
        # Build a fresh template.
        term = gen.generate()
        print(f"--- Sample {i + 1} ---")
        # Placeholder view of the template.
        print(f"{term.structure().text}")
        for j in range(5):
            # Same template, freshly rendered values.
            print(f"Positive {j}: {term.render().text}")
        for j in range(5):
            # Shuffle the literals in place and pick a new separator, then render.
            random.shuffle(term.literals)
            term.separator = random.choice([" ", ";", "|"])
            print(f"Negative {j}: {term.render().text}")

View File

@@ -0,0 +1,15 @@
from dataclasses import dataclass
from Generator.Models.Literal import Literal
from Generator.Models.RenderResult import RenderResult
@dataclass
class ConstLiteral(Literal):
    """Literal whose text is fixed and never changes between renders."""

    # The constant text emitted by this literal.
    text: str

    def render(self, chanse: float = 1) -> RenderResult:
        """Return the constant text with no value spans.

        Args:
            chanse: Accepted for signature compatibility; not used here.
        """
        return RenderResult(text=self.text, spans=[])

    def structure(self) -> RenderResult:
        """Return the structural view, which equals the rendered one."""
        return self.render()

View File

@@ -0,0 +1,12 @@
from dataclasses import dataclass
from Generator.Models.RenderResult import RenderResult
@dataclass
class Literal:
    """Base literal: both views produce an empty text with no spans."""

    def render(self, chanse: float = 1) -> RenderResult:
        """Return an empty result.

        Args:
            chanse: Not used by the base implementation.
        """
        return RenderResult(text="", spans=[])

    def structure(self) -> RenderResult:
        """Return an empty structural result."""
        return RenderResult(text="", spans=[])

View File

@@ -0,0 +1,8 @@
from dataclasses import dataclass
from typing import List, Tuple
@dataclass
class RenderResult:
    """Rendered text together with labelled character spans."""

    # The produced string.
    text: str
    # (start, end, label) triples; start/end are character offsets into ``text``.
    spans: List[Tuple[int, int, str]]

53
Generator/Models/Term.py Normal file
View File

@@ -0,0 +1,53 @@
from dataclasses import dataclass
from typing import List
from Generator.Models.ConstLiteral import ConstLiteral
from Generator.Models.Literal import Literal
from Generator.Models.RenderResult import RenderResult
from Generator.Models.VariableLiteral import VariableLiteral
@dataclass
class Term:
    """An ordered sequence of literals joined by a separator."""

    # Literals in output order.
    literals: List[Literal]
    # String inserted between consecutive literals.
    separator: str = " "

    def render(self, chanse: float = 1) -> RenderResult:
        """Render every literal and join the results.

        Args:
            chanse: Forwarded unchanged to each literal's ``render``.

        Returns:
            The joined text, with each literal's spans shifted to its
            position in that text.
        """
        return self._assemble(literal.render(chanse) for literal in self.literals)

    def structure(self) -> RenderResult:
        """Join the structural view of every literal.

        Returns:
            The joined text, with each literal's spans shifted to its
            position in that text.
        """
        return self._assemble(literal.structure() for literal in self.literals)

    def _assemble(self, parts) -> RenderResult:
        """Join per-literal results and shift their spans into joined coordinates.

        Args:
            parts: Iterable of per-literal results, consumed once, in order.
        """
        pieces = []
        spans = []
        offset = 0
        for part in parts:
            # A separator precedes every piece except the first one.
            if pieces:
                offset += len(self.separator)
            pieces.append(part.text)
            # Shift span coordinates by the piece's start in the joined text.
            spans.extend((offset + start, offset + end, label) for (start, end, label) in part.spans)
            offset += len(part.text)
        return RenderResult(self.separator.join(pieces), spans)

View File

@@ -0,0 +1,45 @@
import random
from dataclasses import dataclass
from Generator.Enums.RandomType import RandomType
from Generator.Models.Literal import Literal
from Generator.Models.RenderResult import RenderResult
from Generator.UniversalRandomizer import UniversalRandomizer
@dataclass
class VariableLiteral(Literal):
    """Literal filled with a random value of a given type, with optional wrapping."""

    name: str
    # Kind of random value to generate.
    type: RandomType
    # Text placed immediately before / after the value.
    prefix: str = ""
    postfix: str = ""
    # Most recently generated value; None until the first render.
    last_value: str | None = None

    def render(self, chanse: float = 1) -> RenderResult:
        """Render the literal, regenerating its value with probability ``chanse``.

        A value is always generated on the first call. On later calls the
        previous value is reused unless the random draw falls below
        ``chanse``.

        Args:
            chanse: Probability of generating a new value.

        Returns:
            The wrapped value and one span covering the value itself.
        """
        # random.random() lies in [0.0, 1.0), so a strict "<" makes chanse=0
        # mean "never regenerate" while chanse=1 still always regenerates.
        if self.last_value is None or random.random() < chanse:
            self.last_value = str(UniversalRandomizer().get_random(self.type))
        return self._wrap(self.last_value)

    def structure(self) -> RenderResult:
        """Render the literal with a ``<TYPE>`` placeholder instead of a value."""
        return self._wrap(f"<{self.type.name}>")

    def _wrap(self, val: str) -> RenderResult:
        """Surround ``val`` with prefix/postfix and locate it in the result."""
        # The span covers only the value, not the prefix or postfix.
        start = len(self.prefix)
        end = start + len(val)
        return RenderResult(f"{self.prefix}{val}{self.postfix}", [(start, end, self.type.name)])

View File

@@ -0,0 +1,31 @@
import random
from typing import Any
from Generator.Enums.RandomType import RandomType
from mimesis import Generic
from mimesis.locales import Locale
class UniversalRandomizer:
    """Produce a random value for a requested ``RandomType``."""

    # Data provider shared by all instances (English locale).
    fake = Generic(locale=Locale.EN)

    def get_random(self, r_type: RandomType) -> Any:
        """Return a random value of the requested kind.

        Args:
            r_type: Which kind of value to produce.

        Returns:
            The generated value, or the string ``"UNKNOWN"`` when ``r_type``
            matches none of the known kinds.
        """
        # Ordered (kind, producer) pairs; producers are only called on a match.
        producers = (
            (RandomType.IP, lambda: self.fake.internet.ip_v4()),
            (RandomType.DATE, lambda: self.fake.datetime.date().isoformat()),
            (RandomType.EMAIL, lambda: self.fake.person.email()),
            (RandomType.STATUS_CODE, lambda: self.fake.internet.http_status_code()),
            (RandomType.PATH, lambda: f"/var/log/{self.fake.file.file_name()}"),
            (RandomType.USERNAME, lambda: self.fake.person.username()),
            (RandomType.INT, lambda: random.randint(1, 9999)),
            (RandomType.VERSION, lambda: self.fake.development.version()),
            # Only the first dash-separated group of the UUID is kept.
            (RandomType.ID, lambda: self.fake.cryptographic.uuid().split('-')[0]),
        )
        for kind, produce in producers:
            if r_type == kind:
                return produce()
        return "UNKNOWN"