Первый запуск
This commit is contained in:
13
Generator/Enums/RandomType.py
Normal file
13
Generator/Enums/RandomType.py
Normal file
@@ -0,0 +1,13 @@
|
||||
from enum import Enum, auto
|
||||
|
||||
|
||||
class RandomType(Enum):
    """Kinds of random values that a variable part of a log line can take.

    Member values are assigned by ``auto()`` in declaration order, so the
    order of the members below is significant.
    """

    IP = auto()           # IPv4 address
    DATE = auto()         # calendar date
    EMAIL = auto()        # e-mail address
    STATUS_CODE = auto()  # HTTP status code
    PATH = auto()         # file path
    USERNAME = auto()     # user name
    INT = auto()          # plain integer
    VERSION = auto()      # version string
    ID = auto()           # short identifier
|
||||
178
Generator/LogGenerator.py
Normal file
178
Generator/LogGenerator.py
Normal file
@@ -0,0 +1,178 @@
|
||||
import random
|
||||
import re
|
||||
|
||||
from sentence_transformers import InputExample
|
||||
|
||||
from Generator.Enums.RandomType import RandomType
|
||||
from Generator.Models.ConstLiteral import ConstLiteral
|
||||
from Generator.Models.Term import Term
|
||||
from Generator.Models.VariableLiteral import VariableLiteral
|
||||
from Generator.UniversalRandomizer import UniversalRandomizer
|
||||
|
||||
|
||||
class LogGenerator:
    """Generate synthetic log-line templates and contrastive training triplets.

    ``generate`` builds a random ``Term`` out of constant and variable
    literals; ``generate_training_data`` renders such terms into
    (anchor, positive, negative) text triplets wrapped in ``InputExample``;
    ``mask_log_structure`` replaces concrete values in a text with
    placeholder tokens.
    """

    # Ordered (pattern, replacement) rules used by ``mask_log_structure``.
    # Order matters: more specific shapes must be consumed before the generic
    # number rules, otherwise their digits are rewritten to <NUM> first.
    _MASK_RULES = (
        # GUID / UUID, e.g. 123e4567-e89b-12d3-a456-426614174000
        (re.compile(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}'), '<GUID>'),
        # IPv4 address, e.g. 192.168.0.1 (before floats, or 192.168 would be
        # taken as a float)
        (re.compile(r'\d{1,3}(?:\.\d{1,3}){3}'), '<IP>'),
        # Hex literal, e.g. 0x7fff5fbff. Must precede the number rules (the
        # leading "0" would otherwise become <NUM>); \b keeps "10x5" intact.
        (re.compile(r'\b0x[0-9a-fA-F]+'), '<HEX>'),
        # Floating point number, e.g. 0.05, 123.45, -3.14
        (re.compile(r'-?\d+\.\d+'), '<NUM>'),
        # Integer, e.g. 404, 500, -1
        (re.compile(r'-?\d+'), '<NUM>'),
    )

    def __init__(self):
        """Set up the wrapper pairs and the keyword vocabulary."""
        # (prefix, postfix) pairs put around variable values: id=..., [ip],
        # 'user'. The empty pair is listed twice, so it is picked twice as
        # often as any other pair by random.choice.
        self.wrappers = [("", ""), ("", ""), ("id=", ""), ("user:", ""), ("[", "]"), ("'", "'")]

        # Vocabulary for constant literals (imitation of real log wording).
        self.log_keywords = [
            # Log levels
            "INFO", "ERROR", "WARN", "DEBUG", "TRACE", "CRITICAL", "FATAL", "NOTICE",

            # Actions (verbs)
            "started", "stopped", "failed", "completed", "aborted", "retrying",
            "connecting", "disconnected", "listening", "resolving", "binding",
            "parsing", "rendering", "authenticating", "authorizing", "validated",
            "rejected", "accepted", "dropped", "created", "deleted", "updated",
            "fetching", "sending", "receiving", "waiting", "killing", "spawning",

            # Entities (nouns)
            "System", "Kernel", "Thread", "Process", "Worker", "Daemon", "Job",
            "Connection", "Session", "User", "Client", "Server", "Proxy", "Gateway",
            "Database", "Table", "Index", "Query", "Transaction", "Commit", "Rollback",
            "Cache", "Buffer", "Heap", "Stack", "Memory", "Disk", "Volume",
            "Network", "Port", "Socket", "Interface", "Protocol", "Packet",
            "Request", "Response", "Header", "Body", "Payload", "Token", "Key",
            "File", "Directory", "Path", "Config", "Module", "Plugin", "Component",
            "Exception", "Error", "Timeout", "Latency", "HealthCheck", "Status",

            # HTTP and Web
            "GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "HEAD",
            "HTTP/1.1", "HTTP/2", "API", "Endpoint", "Route", "URI", "URL",
            "JSON", "XML", "YAML", "HTML", "CSS", "JS",

            # Prepositions and connectives
            "at", "in", "on", "to", "from", "by", "with", "for", "via", "through",

            # Adjectives and states
            "successful", "failed", "denied", "allowed", "active", "inactive",
            "pending", "queued", "blocked", "locked", "corrupted", "invalid",
            "missing", "found", "available", "unavailable", "busy", "idle",
            "secure", "insecure", "public", "private", "local", "remote"
        ]

    def generate(self, min_literals=15, max_literals=25) -> "Term":
        """Build a random log template.

        Args:
            min_literals: Lower bound (inclusive) for the number of literals.
            max_literals: Upper bound (inclusive) for the number of literals.

        Returns:
            A ``Term`` whose separator is one of ``" "``, ``";"``, ``"|"``.

        Raises:
            ValueError: Propagated from ``random.randint`` when
                ``min_literals > max_literals``.
        """
        count = random.randint(min_literals, max_literals)
        literals = []

        for i in range(count):
            # 60% constant, 40% variable
            if random.random() < 0.6:
                # 80% a vocabulary keyword, otherwise a random word
                if random.random() < 0.8:
                    txt = random.choice(self.log_keywords)
                else:
                    txt = UniversalRandomizer.fake.text.word()
                literals.append(ConstLiteral(text=txt))
            else:
                r_type = random.choice(list(RandomType))
                pref, post = random.choice(self.wrappers)
                literals.append(VariableLiteral(name=f"v{i}", type=r_type, prefix=pref, postfix=post))

        return Term(literals=literals, separator=random.choice([" ", ";", "|"]))

    def generate_training_data(self, count=100):
        """Produce masked (anchor, positive, negative) triplets.

        For each of ``count`` generated templates three ``InputExample``
        objects are appended, all sharing the same anchor and positive and
        differing in the negative, in this order:

        1. hard negative: shuffled literals, original separator;
        2. easy negative: a rendering of an unrelated template;
        3. very hard negative: shuffled literals, randomly chosen separator.

        Args:
            count: Number of templates to generate.

        Returns:
            A list with ``3 * count`` ``InputExample`` objects whose ``texts``
            have already been passed through ``mask_log_structure``.
        """
        train_examples = []

        for _ in range(count):
            anchor_term = self.generate()

            # Anchor and positive are two independent renderings of the same
            # template. Masking is deterministic, so each is masked once and
            # reused in all three triplets.
            anchor_masked = self.mask_log_structure(anchor_term.render().text)
            positive_masked = self.mask_log_structure(anchor_term.render().text)

            # Hard negative: same literals in a different order, same separator.
            literals_copy = anchor_term.literals[:]
            random.shuffle(literals_copy)
            negative_hard_text = anchor_term.separator.join(lit.render().text for lit in literals_copy)

            # Easy negative: a completely different template.
            negative_easy_text = self.generate().render().text

            # Very hard negative: shuffled literals joined by a random separator.
            # NOTE(review): " " is listed twice (double weight) and the pick may
            # coincide with the anchor's separator - confirm this is intended.
            bad_sep = random.choice([" ", ";", "|", " "])
            negative_very_hard_text = bad_sep.join(lit.render().text for lit in literals_copy)

            # Pack for Sentence Transformers, keeping the original order of
            # the three examples.
            for negative_text in (negative_hard_text, negative_easy_text, negative_very_hard_text):
                train_examples.append(InputExample(texts=[
                    anchor_masked,
                    positive_masked,
                    self.mask_log_structure(negative_text),
                ]))

        return train_examples

    def mask_log_structure(self, text: str) -> str:
        """Replace concrete values in ``text`` with placeholder tokens.

        Rules are applied in the order given by ``_MASK_RULES``: GUIDs become
        ``<GUID>``, IPv4 addresses ``<IP>``, ``0x...`` literals ``<HEX>``, and
        remaining floats and integers ``<NUM>``.

        Args:
            text: The raw log line.

        Returns:
            The masked line; an empty string is returned unchanged.
        """
        for pattern, token in self._MASK_RULES:
            text = pattern.sub(token, text)
        return text
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual demo: prints a few generated templates with sample renderings.
    gen = LogGenerator()

    # Exercise the training-data path once; the returned list is not used.
    gen.generate_training_data(count=1)

    print("Пример генерации датасета:\n")

    # Ten sample templates.
    for i in range(10):
        term = gen.generate()

        print(f"--- Sample {i + 1} ---")
        # One rendering whose result is not printed (kept from the original
        # flow), followed by the placeholder form of the template.
        term.render()
        print(f"{term.structure().text}")

        # Positives: repeated renderings of the unchanged template.
        for j in range(5):
            print(f"Positive {j}: {term.render().text}")

        # Negatives: the term itself is mutated in place - literals are
        # reshuffled and a separator is re-drawn before every rendering.
        for j in range(5):
            random.shuffle(term.literals)
            term.separator = random.choice([" ", ";", "|"])
            print(f"Negative {j}: {term.render().text}")
|
||||
15
Generator/Models/ConstLiteral.py
Normal file
15
Generator/Models/ConstLiteral.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
from Generator.Models.Literal import Literal
|
||||
from Generator.Models.RenderResult import RenderResult
|
||||
|
||||
|
||||
@dataclass
class ConstLiteral(Literal):
    """A literal whose rendered text is always the fixed ``text`` value."""

    text: str

    def render(self, chanse: float = 1) -> RenderResult:
        """Return ``text`` with no spans.

        ``chanse`` is accepted for signature compatibility with ``Literal``
        and is not used here.
        """
        return RenderResult(text=self.text, spans=[])

    def structure(self) -> RenderResult:
        """Return the structural form, which for a constant equals ``render()``."""
        return self.render()
|
||||
12
Generator/Models/Literal.py
Normal file
12
Generator/Models/Literal.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
from Generator.Models.RenderResult import RenderResult
|
||||
|
||||
|
||||
@dataclass
class Literal:
    """Base literal; both methods yield an empty result with no spans."""

    def render(self, chanse: float = 1) -> RenderResult:
        """Return an empty ``RenderResult``; ``chanse`` is ignored here."""
        return RenderResult(text="", spans=[])

    def structure(self) -> RenderResult:
        """Return an empty ``RenderResult``."""
        return RenderResult(text="", spans=[])
|
||||
8
Generator/Models/RenderResult.py
Normal file
8
Generator/Models/RenderResult.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Tuple
|
||||
|
||||
|
||||
@dataclass
class RenderResult:
    """Rendered text together with labelled spans inside it."""

    # The rendered string.
    text: str
    # Triples of (start, end, label) referring to positions within ``text``.
    spans: List[Tuple[int, int, str]]
|
||||
53
Generator/Models/Term.py
Normal file
53
Generator/Models/Term.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
from Generator.Models.ConstLiteral import ConstLiteral
|
||||
from Generator.Models.Literal import Literal
|
||||
from Generator.Models.RenderResult import RenderResult
|
||||
from Generator.Models.VariableLiteral import VariableLiteral
|
||||
|
||||
|
||||
@dataclass
class Term:
    """An ordered sequence of literals joined by a separator."""

    literals: List[Literal]
    separator: str = " "

    def render(self, chanse: float = 1) -> RenderResult:
        """Render every literal and join the results.

        Args:
            chanse: Passed unchanged to each literal's ``render``.

        Returns:
            The joined text and all literal spans shifted to positions in it.
        """
        return self._combine(literal.render(chanse) for literal in self.literals)

    def structure(self) -> RenderResult:
        """Join the ``structure()`` form of every literal.

        Returns:
            The joined text and all literal spans shifted to positions in it.
        """
        return self._combine(literal.structure() for literal in self.literals)

    def _combine(self, parts) -> RenderResult:
        """Join per-literal results into one ``RenderResult``.

        ``parts`` is an iterable of ``RenderResult`` objects in output order.
        Each part's spans are shifted by the length of everything that
        precedes it (earlier texts plus one separator per earlier part).
        """
        texts = []
        spans = []
        offset = 0
        sep_len = len(self.separator)

        for part in parts:
            spans.extend(
                (offset + start, offset + end, label)
                for (start, end, label) in part.spans
            )
            texts.append(part.text)
            # The next part begins after this text and the following separator.
            offset += len(part.text) + sep_len

        # join() places the separator only between parts, never after the last.
        return RenderResult(self.separator.join(texts), spans)
|
||||
45
Generator/Models/VariableLiteral.py
Normal file
45
Generator/Models/VariableLiteral.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import random
|
||||
from dataclasses import dataclass
|
||||
|
||||
from Generator.Enums.RandomType import RandomType
|
||||
from Generator.Models.Literal import Literal
|
||||
from Generator.Models.RenderResult import RenderResult
|
||||
from Generator.UniversalRandomizer import UniversalRandomizer
|
||||
|
||||
|
||||
@dataclass
class VariableLiteral(Literal):
    """A literal whose value is randomly generated according to ``type``."""

    name: str
    type: RandomType
    prefix: str = ""
    postfix: str = ""
    # Most recently generated value; None until the first render.
    last_value: str | None = None

    def render(self, chanse: float = 1) -> RenderResult:
        """Render the literal, possibly with a freshly generated value.

        Args:
            chanse: Probability of generating a new value instead of reusing
                ``last_value``. A value is always generated on the first call.

        Returns:
            ``prefix + value + postfix`` with one span covering the value.
        """
        # random.random() lies in [0.0, 1.0), so a strict "<" makes chanse=0
        # never regenerate and chanse=1 always regenerate.
        if self.last_value is None or random.random() < chanse:
            self.last_value = str(UniversalRandomizer().get_random(self.type))

        return self._wrap(self.last_value)

    def structure(self) -> RenderResult:
        """Render the literal with a ``<TYPE_NAME>`` placeholder as the value.

        Returns:
            ``prefix + placeholder + postfix`` with one span covering the
            placeholder.
        """
        return self._wrap(f"<{self.type.name}>")

    def _wrap(self, val: str) -> RenderResult:
        """Surround ``val`` with prefix/postfix and label its span by type name."""
        # The span covers the bare value only, excluding prefix and postfix.
        start = len(self.prefix)
        end = start + len(val)
        return RenderResult(f"{self.prefix}{val}{self.postfix}", [(start, end, self.type.name)])
|
||||
31
Generator/UniversalRandomizer.py
Normal file
31
Generator/UniversalRandomizer.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import random
|
||||
from typing import Any
|
||||
|
||||
from Generator.Enums.RandomType import RandomType
|
||||
from mimesis import Generic
|
||||
from mimesis.locales import Locale
|
||||
|
||||
|
||||
class UniversalRandomizer:
|
||||
fake = Generic(locale=Locale.EN)
|
||||
|
||||
def get_random(self, r_type: RandomType) -> Any:
|
||||
if r_type == RandomType.IP:
|
||||
return self.fake.internet.ip_v4()
|
||||
if r_type == RandomType.DATE:
|
||||
return self.fake.datetime.date().isoformat()
|
||||
if r_type == RandomType.EMAIL:
|
||||
return self.fake.person.email()
|
||||
if r_type == RandomType.STATUS_CODE:
|
||||
return self.fake.internet.http_status_code()
|
||||
if r_type == RandomType.PATH:
|
||||
return f"/var/log/{self.fake.file.file_name()}"
|
||||
if r_type == RandomType.USERNAME:
|
||||
return self.fake.person.username()
|
||||
if r_type == RandomType.INT:
|
||||
return random.randint(1, 9999)
|
||||
if r_type == RandomType.VERSION:
|
||||
return self.fake.development.version()
|
||||
if r_type == RandomType.ID:
|
||||
return self.fake.cryptographic.uuid().split('-')[0]
|
||||
return "UNKNOWN"
|
||||
Reference in New Issue
Block a user