From f956b85ed39191f94f3e45f03660c91c19fbe533 Mon Sep 17 00:00:00 2001 From: RookieCuzz <85008507+RookieCuzz@users.noreply.github.com> Date: Wed, 3 Sep 2025 16:37:43 +0800 Subject: [PATCH] =?UTF-8?q?=E6=95=8F=E6=84=9F=E8=AF=8D=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 18 +++++ cmd/server/main.go | 96 +++++++++++++++++++++++++ go.mod | 30 ++++++++ go.sum | 55 ++++++++++++++ internal/detect/service.go | 144 +++++++++++++++++++++++++++++++++++++ internal/lexicon/store.go | 141 ++++++++++++++++++++++++++++++++++++ 6 files changed, 484 insertions(+) create mode 100644 cmd/server/main.go create mode 100644 go.mod create mode 100644 go.sum create mode 100644 internal/detect/service.go create mode 100644 internal/lexicon/store.go diff --git a/README.md b/README.md index 6e29f6d..ffb1440 100644 --- a/README.md +++ b/README.md @@ -82,3 +82,21 @@ git clone https://github.com/Konsheng/Sensitive-lexicon.git Star History Chart + +## 运行敏感词检测服务(Fiber + fuzzy-patricia) + +```bash +# Windows PowerShell 示例 +$env:PORT="8080"; $env:LEXICON_DIR="Vocabulary"; $env:FUZZY_MAX_DISTANCE="1" + +# 构建并运行 +go mod tidy +go build -o bin\server.exe ./cmd/server +./bin/server.exe +``` + +- POST `/detect` + - 请求体: `{ "text": "待检测文本", "enable_fuzzy": true }` + - 响应: `{ "hits": [{"word":"...","type":"substring|fuzzy","distance":0}] }` +- POST `/reload` 重新加载 `Vocabulary` 目录 +- GET `/health` 存活探针 diff --git a/cmd/server/main.go b/cmd/server/main.go new file mode 100644 index 0000000..7f35123 --- /dev/null +++ b/cmd/server/main.go @@ -0,0 +1,96 @@ +package main + +import ( + "github.com/go-playground/validator/v10" + "github.com/gofiber/fiber/v2" + "log" + "os" + "sensitive-lexicon/internal/detect" + "sensitive-lexicon/internal/lexicon" + "strconv" + "time" +) + +type User struct { + Name string `json:"name" validate:"min=5,max=20"` + Age int `json:"age" validate:"gte=18"` + Enrollment time.Time `json:"enrollment" validate:"before_today"` + Graduation time.Time `json:"graduation" validate:"gtfield=Enrollment"` +} + +// BeforeToday 验证日期是否在今天之前 +func BeforeToday(fl validator.FieldLevel) bool { + fieldTime, ok := fl.Field().Interface().(time.Time) + if !ok { + return false + } + return fieldTime.Before(time.Now()) +} +func main() { + lexiconDir := getenv("LEXICON_DIR", "Vocabulary") + minNgram := getenvInt("FUZZY_MIN_NGRAM", 2) + maxNgram := getenvInt("FUZZY_MAX_NGRAM", 10) + maxDistance := getenvInt("FUZZY_MAX_DISTANCE", 1) + + store := lexicon.NewStore() + if err := store.LoadFromDir(lexiconDir); err != nil { + log.Fatalf("failed to load lexicon: %v", err) + } + + service := detect.NewService(store) + service.SetFuzzyConfig(detect.FuzzyConfig{MinNgramLen: minNgram, MaxNgramLen: maxNgram, MaxDistance: maxDistance}) + + app := fiber.New() + app.Get("/health", func(c *fiber.Ctx) error { + return c.JSON(fiber.Map{"status": "ok"}) + }) + + app.Post("/detect", func(c *fiber.Ctx) error { + var req detect.DetectRequest + if err := c.BodyParser(&req); err != nil { + return fiber.NewError(fiber.StatusBadRequest, err.Error()) + } + res := service.Detect(req) + return c.JSON(res) + }) + + app.Post("/contains", func(c *fiber.Ctx) error { + var req detect.ContainsRequest + if err := c.BodyParser(&req); err != nil { + return fiber.NewError(fiber.StatusBadRequest, err.Error()) + } + res := service.Contains(req) + return c.JSON(res) + }) + + app.Post("/reload", func(c *fiber.Ctx) error { + if err := store.LoadFromDir(lexiconDir); err != nil { + return fiber.NewError(fiber.StatusInternalServerError, err.Error()) + } + stats := store.Stats() + return c.JSON(stats) + }) + + port := getenv("PORT", "8080") + addr := ":" + port + log.Printf("listening on %s", addr) + if err := app.Listen(addr); err != nil { + log.Fatal(err) + } +} + +func getenv(k, def string) string { + if v := os.Getenv(k); v != "" { + return v + } + return def +} + +func getenvInt(k string, def int) int { + if v := os.Getenv(k); v != "" { + if i, err := strconv.Atoi(v); err == nil { + return i + } + } + return def +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..8fb7329 --- /dev/null +++ b/go.mod @@ -0,0 +1,30 @@ +module sensitive-lexicon + +go 1.22 + +require ( + github.com/go-playground/validator/v10 v10.27.0 + github.com/gofiber/fiber/v2 v2.52.5 + github.com/ozeidan/fuzzy-patricia v3.0.0+incompatible +) + +require ( + github.com/andybalholm/brotli v1.0.5 // indirect + github.com/gabriel-vasile/mimetype v1.4.8 // indirect + github.com/go-playground/locales v0.14.1 // indirect + github.com/go-playground/universal-translator v0.18.1 // indirect + github.com/google/uuid v1.5.0 // indirect + github.com/klauspost/compress v1.17.0 // indirect + github.com/leodido/go-urn v1.4.0 // indirect + github.com/mattn/go-colorable v0.1.13 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/mattn/go-runewidth v0.0.15 // indirect + github.com/rivo/uniseg v0.2.0 // indirect + github.com/valyala/bytebufferpool v1.0.0 // indirect + github.com/valyala/fasthttp v1.51.0 // indirect + github.com/valyala/tcplisten v1.0.0 // indirect + golang.org/x/crypto v0.33.0 // indirect + golang.org/x/net v0.34.0 // indirect + golang.org/x/sys v0.30.0 // indirect + golang.org/x/text v0.22.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..7889090 --- /dev/null +++ b/go.sum @@ -0,0 +1,55 @@ +github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs= +github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/gabriel-vasile/mimetype v1.4.8 h1:FfZ3gj38NjllZIeJAmMhr+qKL8Wu+nOoI3GqacKw1NM= +github.com/gabriel-vasile/mimetype v1.4.8/go.mod h1:ByKUIKGjh1ODkGM1asKUbQZOLGrPjydw3hYPU2YU9t8= +github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s= +github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= +github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA= +github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY= +github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY= +github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY= +github.com/go-playground/validator/v10 v10.27.0 h1:w8+XrWVMhGkxOaaowyKH35gFydVHOvC0/uWoy2Fzwn4= +github.com/go-playground/validator/v10 v10.27.0/go.mod h1:I5QpIEbmr8On7W0TktmJAumgzX4CA1XNl4ZmDuVHKKo= +github.com/gofiber/fiber/v2 v2.52.5 h1:tWoP1MJQjGEe4GB5TUGOi7P2E0ZMMRx5ZTG4rT+yGMo= +github.com/gofiber/fiber/v2 v2.52.5/go.mod h1:KEOE+cXMhXG0zHc9d8+E38hoX+ZN7bhOtgeF2oT6jrQ= +github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU= +github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/klauspost/compress v1.17.0 h1:Rnbp4K9EjcDuVuHtd0dgA4qNuv9yKDYKK1ulpJwgrqM= +github.com/klauspost/compress v1.17.0/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ= +github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI= +github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= +github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U= +github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= +github.com/ozeidan/fuzzy-patricia v3.0.0+incompatible h1:Pl61eMyfJqgY/wytiI4vamqPYribq6d8VxeP1CNyg9M= +github.com/ozeidan/fuzzy-patricia v3.0.0+incompatible/go.mod h1:zgvuCcYS7wB7fVCGblsaFFmEe8+aAH13dTYm8FbrpsM= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= +github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= +github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= +github.com/valyala/fasthttp v1.51.0 h1:8b30A5JlZ6C7AS81RsWjYMQmrZG6feChmgAolCl1SqA= +github.com/valyala/fasthttp v1.51.0/go.mod h1:oI2XroL+lI7vdXyYoQk03bXBThfFl2cVdIA3Xl7cH8g= +github.com/valyala/tcplisten v1.0.0 h1:rBHj/Xf+E1tRGZyWIWwJDiRY0zc1Js+CV5DqwacVSA8= +github.com/valyala/tcplisten v1.0.0/go.mod h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc= +golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus= +golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M= +golang.org/x/net v0.34.0 h1:Mb7Mrk043xzHgnRM88suvJFwzVrRfHEHJEl5/71CKw0= +golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k= +golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= +golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= +golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/detect/service.go b/internal/detect/service.go new file mode 100644 index 0000000..464dcfc --- /dev/null +++ b/internal/detect/service.go @@ -0,0 +1,144 @@ +package detect + +import ( + "sort" + "strings" + "unicode/utf8" + + "sensitive-lexicon/internal/lexicon" +) + +type FuzzyConfig struct { + MinNgramLen int + MaxNgramLen int + MaxDistance int +} + +type DetectRequest struct { + Text string `json:"text"` + // If true, enable fuzzy detection on n-grams within the text + EnableFuzzy bool `json:"enable_fuzzy"` +} + +type Match struct { + Word string `json:"word"` + Type string `json:"type"` // substring | fuzzy + Distance int `json:"distance,omitempty"` +} + +type DetectResponse struct { + Hits []Match `json:"hits"` +} + +type ContainsRequest struct { + Text string `json:"text"` +} + +type ContainsResponse struct { + Contains bool `json:"contains"` + Word string `json:"word,omitempty"` +} + +type Service struct { + store *lexicon.Store + fuzzyCfg FuzzyConfig +} + +func NewService(store *lexicon.Store) *Service { + return &Service{store: store, fuzzyCfg: FuzzyConfig{MinNgramLen: 2, MaxNgramLen: 10, MaxDistance: 1}} +} + +func (s *Service) SetFuzzyConfig(cfg FuzzyConfig) { + s.fuzzyCfg = cfg +} + +func (s *Service) Detect(req DetectRequest) DetectResponse { + text := strings.TrimSpace(req.Text) + if text == "" { + return DetectResponse{} + } + + unique := make(map[string]Match) + + // Substring hits: for each codepoint window from input, find lexicon entries containing it + s.store.ForEachSubstringMatch(text, func(word string) bool { + unique[word] = Match{Word: word, Type: "substring"} + return true + }) + + if req.EnableFuzzy { + for _, token := range generateNgrams(text, s.fuzzyCfg.MinNgramLen, s.fuzzyCfg.MaxNgramLen) { + s.store.ForEachFuzzyMatch(token, s.fuzzyCfg.MaxDistance, func(word string, d int) bool { + if old, ok := unique[word]; ok { + if old.Type == "substring" && d == 0 { + return true + } + } + unique[word] = Match{Word: word, Type: ternary(d == 0, "substring", "fuzzy"), Distance: d} + return true + }) + } + } + + res := DetectResponse{Hits: make([]Match, 0, len(unique))} + for _, v := range unique { + res.Hits = append(res.Hits, v) + } + sort.Slice(res.Hits, func(i, j int) bool { + if res.Hits[i].Type == res.Hits[j].Type { + if res.Hits[i].Distance == res.Hits[j].Distance { + return res.Hits[i].Word < res.Hits[j].Word + } + return res.Hits[i].Distance < res.Hits[j].Distance + } + return res.Hits[i].Type < res.Hits[j].Type + }) + return res +} + +func (s *Service) Contains(req ContainsRequest) ContainsResponse { + ok, w := s.store.HasAnyInText(strings.TrimSpace(req.Text)) + return ContainsResponse{Contains: ok, Word: w} +} + +func ternary[T any](cond bool, a, b T) T { + if cond { + return a + } + return b +} + +func generateNgrams(text string, minLen, maxLen int) []string { + if minLen < 1 { + minLen = 1 + } + if maxLen < minLen { + maxLen = minLen + } + // Work on rune boundaries for CJK safety + runes := []rune(text) + n := len(runes) + var out []string + for i := 0; i < n; i++ { + for l := minLen; l <= maxLen && i+l <= n; l++ { + out = append(out, string(runes[i:i+l])) + } + } + return dedupStrings(out) +} + +func dedupStrings(in []string) []string { + seen := make(map[string]struct{}, len(in)) + out := make([]string, 0, len(in)) + for _, s := range in { + if _, ok := seen[s]; ok { + continue + } + seen[s] = struct{}{} + out = append(out, s) + } + return out +} + +// Guard for unused import warning if utf8 not referenced elsewhere +var _ = utf8.RuneCountInString diff --git a/internal/lexicon/store.go b/internal/lexicon/store.go new file mode 100644 index 0000000..8d6ec6c --- /dev/null +++ b/internal/lexicon/store.go @@ -0,0 +1,141 @@ +package lexicon + +import ( + "bufio" + "errors" + "os" + "path/filepath" + "strings" + "sync" + + "github.com/ozeidan/fuzzy-patricia/patricia" +) + +// Store holds the trie and statistics for the loaded lexicon. +type Store struct { + mu sync.RWMutex + trie *patricia.Trie + cnt int +} + +func NewStore() *Store { + return &Store{trie: patricia.NewTrie()} +} + +// LoadFromDir loads all .txt files from dir into the trie. +func (s *Store) LoadFromDir(dir string) error { + s.mu.Lock() + defer s.mu.Unlock() + + newTrie := patricia.NewTrie() + count := 0 + + walkErr := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if info.IsDir() { + return nil + } + if strings.ToLower(filepath.Ext(info.Name())) != ".txt" { + return nil + } + f, err := os.Open(path) + if err != nil { + return err + } + defer f.Close() + scanner := bufio.NewScanner(f) + // Increase buffer for long lines + buf := make([]byte, 0, 1024*64) + scanner.Buffer(buf, 1024*1024) + for scanner.Scan() { + w := strings.TrimSpace(scanner.Text()) + if w == "" || strings.HasPrefix(w, "#") { + continue + } + newTrie.Insert(patricia.Prefix(w), struct{}{}) + count++ + } + return scanner.Err() + }) + if walkErr != nil { + return walkErr + } + if count == 0 { + return errors.New("no entries loaded") + } + // Swap in + s.trie = newTrie + s.cnt = count + return nil +} + +func (s *Store) Stats() map[string]interface{} { + s.mu.RLock() + defer s.mu.RUnlock() + return map[string]interface{}{ + "count": s.cnt, + } +} + +// ForEachSubstringMatch visits any keys that contain the given substring. +// It uses the library's substring search. +func (s *Store) ForEachSubstringMatch(query string, visit func(word string) bool) { + s.mu.RLock() + tr := s.trie + s.mu.RUnlock() + if tr == nil || query == "" { + return + } + // second argument is caseSensitive; we use false by default + tr.VisitSubstring(patricia.Prefix(query), false, func(prefix patricia.Prefix, _ patricia.Item) error { + // The library does not expose a public stop error in all versions; ignore early stop + _ = visit(string(prefix)) + return nil + }) +} + +// ForEachFuzzyMatch visits keys with fuzzy distance within maxDistance to query. +func (s *Store) ForEachFuzzyMatch(query string, maxDistance int, visit func(word string, distance int) bool) { + s.mu.RLock() + tr := s.trie + s.mu.RUnlock() + if tr == nil || query == "" { + return + } + // signature in current lib: VisitFuzzy(prefix, caseSensitive bool, visitor) + tr.VisitFuzzy(patricia.Prefix(query), false, func(prefix patricia.Prefix, _ patricia.Item, dist int) error { + if dist <= maxDistance { + _ = visit(string(prefix), dist) + } + return nil + }) +} + +// HasAnyInText returns true if any lexicon word is a substring of the given text. +// It scans each rune offset and visits prefixes against the trie. +func (s *Store) HasAnyInText(text string) (bool, string) { + s.mu.RLock() + tr := s.trie + s.mu.RUnlock() + if tr == nil || text == "" { + return false, "" + } + runes := []rune(text) + n := len(runes) + for i := 0; i < n; i++ { + suffix := string(runes[i:]) + foundWord := "" + tr.VisitPrefixes(patricia.Prefix(suffix), false, func(prefix patricia.Prefix, _ patricia.Item) error { + if foundWord == "" { + foundWord = string(prefix) + } + return nil + }) + if foundWord != "" { + return true, foundWord + } + } + return false, "" +}