Sensitive word detection
internal/detect/service.go (new file, +144)
@@ -0,0 +1,144 @@
package detect

import (
	"sort"
	"strings"

	"sensitive-lexicon/internal/lexicon"
)
// FuzzyConfig bounds the n-gram window and edit distance used for fuzzy detection.
type FuzzyConfig struct {
	MinNgramLen int
	MaxNgramLen int
	MaxDistance int
}

type DetectRequest struct {
	Text string `json:"text"`
	// If true, enable fuzzy detection on n-grams within the text.
	EnableFuzzy bool `json:"enable_fuzzy"`
}

// Match is a single detected lexicon entry.
type Match struct {
	Word     string `json:"word"`
	Type     string `json:"type"` // substring | fuzzy
	Distance int    `json:"distance,omitempty"`
}

type DetectResponse struct {
	Hits []Match `json:"hits"`
}

type ContainsRequest struct {
	Text string `json:"text"`
}

type ContainsResponse struct {
	Contains bool   `json:"contains"`
	Word     string `json:"word,omitempty"`
}

// Service wraps a lexicon store with detection operations.
type Service struct {
	store    *lexicon.Store
	fuzzyCfg FuzzyConfig
}

// NewService returns a Service with conservative fuzzy defaults.
func NewService(store *lexicon.Store) *Service {
	return &Service{
		store:    store,
		fuzzyCfg: FuzzyConfig{MinNgramLen: 2, MaxNgramLen: 10, MaxDistance: 1},
	}
}

func (s *Service) SetFuzzyConfig(cfg FuzzyConfig) {
	s.fuzzyCfg = cfg
}
func (s *Service) Detect(req DetectRequest) DetectResponse {
	text := strings.TrimSpace(req.Text)
	if text == "" {
		return DetectResponse{}
	}

	unique := make(map[string]Match)

	// Substring hits: lexicon entries that contain the query text as a substring.
	s.store.ForEachSubstringMatch(text, func(word string) bool {
		unique[word] = Match{Word: word, Type: "substring"}
		return true
	})

	if req.EnableFuzzy {
		for _, token := range generateNgrams(text, s.fuzzyCfg.MinNgramLen, s.fuzzyCfg.MaxNgramLen) {
			s.store.ForEachFuzzyMatch(token, s.fuzzyCfg.MaxDistance, func(word string, d int) bool {
				// An exact substring hit always wins; among fuzzy hits keep the
				// smallest distance rather than overwriting unconditionally.
				if old, ok := unique[word]; ok && (old.Type == "substring" || old.Distance <= d) {
					return true
				}
				unique[word] = Match{Word: word, Type: ternary(d == 0, "substring", "fuzzy"), Distance: d}
				return true
			})
		}
	}

	res := DetectResponse{Hits: make([]Match, 0, len(unique))}
	for _, v := range unique {
		res.Hits = append(res.Hits, v)
	}
	// Deterministic order: by type, then distance, then word.
	sort.Slice(res.Hits, func(i, j int) bool {
		if res.Hits[i].Type == res.Hits[j].Type {
			if res.Hits[i].Distance == res.Hits[j].Distance {
				return res.Hits[i].Word < res.Hits[j].Word
			}
			return res.Hits[i].Distance < res.Hits[j].Distance
		}
		return res.Hits[i].Type < res.Hits[j].Type
	})
	return res
}
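// Illustration only: given DetectRequest JSON like
//
//	{"text": "example text", "enable_fuzzy": true}
//
// the response marshals to something of the shape
//
//	{"hits": [{"word": "…", "type": "substring"}, {"word": "…", "type": "fuzzy", "distance": 1}]}
//
// (the words above are placeholders, not real lexicon entries).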
// Contains reports whether any lexicon word occurs in the given text.
func (s *Service) Contains(req ContainsRequest) ContainsResponse {
	ok, w := s.store.HasAnyInText(strings.TrimSpace(req.Text))
	return ContainsResponse{Contains: ok, Word: w}
}

// ternary returns a if cond is true, otherwise b.
func ternary[T any](cond bool, a, b T) T {
	if cond {
		return a
	}
	return b
}
// generateNgrams returns all unique rune-level n-grams of text with lengths in
// [minLen, maxLen]. It produces O(n * (maxLen-minLen+1)) candidates before dedup.
func generateNgrams(text string, minLen, maxLen int) []string {
	if minLen < 1 {
		minLen = 1
	}
	if maxLen < minLen {
		maxLen = minLen
	}
	// Work on rune boundaries for CJK safety.
	runes := []rune(text)
	n := len(runes)
	var out []string
	for i := 0; i < n; i++ {
		for l := minLen; l <= maxLen && i+l <= n; l++ {
			out = append(out, string(runes[i:i+l]))
		}
	}
	return dedupStrings(out)
}

// dedupStrings removes duplicates while preserving first-seen order.
func dedupStrings(in []string) []string {
	seen := make(map[string]struct{}, len(in))
	out := make([]string, 0, len(in))
	for _, s := range in {
		if _, ok := seen[s]; ok {
			continue
		}
		seen[s] = struct{}{}
		out = append(out, s)
	}
	return out
}
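A minimal usage sketch for the service above. The `main` package, directory path, and sample text are illustrative assumptions, not part of this commit:

package main

import (
	"fmt"
	"log"

	"sensitive-lexicon/internal/detect"
	"sensitive-lexicon/internal/lexicon"
)

func main() {
	store := lexicon.NewStore()
	// Hypothetical path: any directory of newline-delimited .txt word lists.
	if err := store.LoadFromDir("path/to/lexicon"); err != nil {
		log.Fatal(err)
	}

	svc := detect.NewService(store)
	resp := svc.Detect(detect.DetectRequest{Text: "some user input", EnableFuzzy: true})
	for _, h := range resp.Hits {
		fmt.Printf("%s type=%s distance=%d\n", h.Word, h.Type, h.Distance)
	}
}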
internal/lexicon/store.go (new file, +141)
@@ -0,0 +1,141 @@
package lexicon

import (
	"bufio"
	"errors"
	"os"
	"path/filepath"
	"strings"
	"sync"

	"github.com/ozeidan/fuzzy-patricia/patricia"
)

// Store holds the trie and statistics for the loaded lexicon.
type Store struct {
	mu   sync.RWMutex
	trie *patricia.Trie
	cnt  int
}

func NewStore() *Store {
	return &Store{trie: patricia.NewTrie()}
}
// LoadFromDir loads all .txt files from dir into the trie. The write lock is
// held for the whole rebuild, so readers block until the new trie is swapped in.
func (s *Store) LoadFromDir(dir string) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	newTrie := patricia.NewTrie()
	count := 0

	walkErr := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		if strings.ToLower(filepath.Ext(info.Name())) != ".txt" {
			return nil
		}
		f, err := os.Open(path)
		if err != nil {
			return err
		}
		defer f.Close()
		scanner := bufio.NewScanner(f)
		// Increase the buffer to tolerate long lines.
		buf := make([]byte, 0, 1024*64)
		scanner.Buffer(buf, 1024*1024)
		for scanner.Scan() {
			w := strings.TrimSpace(scanner.Text())
			if w == "" || strings.HasPrefix(w, "#") {
				continue
			}
			newTrie.Insert(patricia.Prefix(w), struct{}{})
			count++
		}
		return scanner.Err()
	})
	if walkErr != nil {
		return walkErr
	}
	if count == 0 {
		return errors.New("no entries loaded")
	}
	// Swap in the freshly built trie.
	s.trie = newTrie
	s.cnt = count
	return nil
}
func (s *Store) Stats() map[string]interface{} {
	s.mu.RLock()
	defer s.mu.RUnlock()
	return map[string]interface{}{
		"count": s.cnt,
	}
}

// ForEachSubstringMatch visits any keys that contain the given substring.
// It uses the library's substring search.
func (s *Store) ForEachSubstringMatch(query string, visit func(word string) bool) {
	s.mu.RLock()
	tr := s.trie
	s.mu.RUnlock()
	if tr == nil || query == "" {
		return
	}
	// The second argument is caseSensitive; we default to false.
	tr.VisitSubstring(patricia.Prefix(query), false, func(prefix patricia.Prefix, _ patricia.Item) error {
		// The library does not expose a public stop error in all versions; ignore early stop.
		_ = visit(string(prefix))
		return nil
	})
}

// ForEachFuzzyMatch visits keys within fuzzy distance maxDistance of query.
func (s *Store) ForEachFuzzyMatch(query string, maxDistance int, visit func(word string, distance int) bool) {
	s.mu.RLock()
	tr := s.trie
	s.mu.RUnlock()
	if tr == nil || query == "" {
		return
	}
	// Signature in the current library version: VisitFuzzy(prefix, caseSensitive, visitor).
	tr.VisitFuzzy(patricia.Prefix(query), false, func(prefix patricia.Prefix, _ patricia.Item, dist int) error {
		if dist <= maxDistance {
			_ = visit(string(prefix), dist)
		}
		return nil
	})
}
// HasAnyInText returns true if any lexicon word is a substring of the given text.
// It scans each rune offset and matches trie keys that are prefixes of the
// suffix starting there.
func (s *Store) HasAnyInText(text string) (bool, string) {
	s.mu.RLock()
	tr := s.trie
	s.mu.RUnlock()
	if tr == nil || text == "" {
		return false, ""
	}
	runes := []rune(text)
	n := len(runes)
	for i := 0; i < n; i++ {
		suffix := string(runes[i:])
		foundWord := ""
		tr.VisitPrefixes(patricia.Prefix(suffix), false, func(prefix patricia.Prefix, _ patricia.Item) error {
			if foundWord == "" {
				foundWord = string(prefix)
			}
			return nil
		})
		if foundWord != "" {
			return true, foundWord
		}
	}
	return false, ""
}
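A corresponding sketch exercising the store directly; the directory path, query, and sample text are placeholders assumed for illustration:

package main

import (
	"fmt"
	"log"

	"sensitive-lexicon/internal/lexicon"
)

func main() {
	store := lexicon.NewStore()
	if err := store.LoadFromDir("path/to/lexicon"); err != nil { // placeholder dir
		log.Fatal(err)
	}
	fmt.Println("stats:", store.Stats())

	// Words within edit distance 1 of a (placeholder) query.
	store.ForEachFuzzyMatch("query", 1, func(word string, d int) bool {
		fmt.Printf("fuzzy hit %q at distance %d\n", word, d)
		return true
	})

	// First lexicon word found anywhere inside a longer text.
	if ok, word := store.HasAnyInText("some longer text to screen"); ok {
		fmt.Println("contains:", word)
	}
}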