import difflib import os import re import time from typing import List, Dict, Any, Union, Optional import numpy as np from sentence_transformers import SentenceTransformer, util from Processor.Models.LogTemplate import LogTemplate from Processor.Models.LogVariable import LogVariable from Processor.TemplateDatabase import TemplateDatabase class StreamingLogCluster: # --- Константы класса для удобства настройки --- THRESHOLD_CREATE_NEW = 0.7 #0.70 SCORE_EXACT_MATCH = 0.85 SCORE_PARTIAL_MATCH = 0.6 MAX_VAR_LEN = 32 HARD_DELIMITERS = {'|', ';', ','} SOFT_DELIMITERS = {'=', ':', '-', '>', '<', '[', ']', '(', ')', '{', '}', '"', "'"} def __init__(self, model_path: str, db_path: str = "logs_knowledge.db"): self.model = SentenceTransformer(model_path) self.db = TemplateDatabase(db_path) # Компилируем регулярные выражения один раз self.mask_regex = { 'guid': re.compile(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-...'), 'ip': re.compile(r'\d{1,3}(?:\.\d{1,3}){3}'), 'ver': re.compile(r'\d{1,3}(?:\.\d{1,3}){2}'), 'num': re.compile(r'-?\d+(\.\d+)?'), 'base64': re.compile(r'(?\d{4}-\d{2}-\d{2}|\d{2}\.\d{2}\.\d{4}|\d{2}/\d{2}/\d{4})', r'(?P