145 lines
3.2 KiB
Go
145 lines
3.2 KiB
Go
package detect
|
|
|
|
import (
|
|
"sort"
|
|
"strings"
|
|
"unicode/utf8"
|
|
|
|
"sensitive-lexicon/internal/lexicon"
|
|
)
|
|
|
|
// FuzzyConfig holds the tunables for fuzzy detection.
type FuzzyConfig struct {
	// MinNgramLen is the shortest n-gram, in runes, cut from the input text.
	MinNgramLen int
	// MaxNgramLen is the longest n-gram, in runes, cut from the input text.
	MaxNgramLen int
	// MaxDistance is the distance limit handed to the lexicon store's fuzzy
	// search for every n-gram.
	MaxDistance int
}
|
|
|
|
// DetectRequest is the input accepted by Service.Detect.
type DetectRequest struct {
	// Text is the content to scan.
	Text string `json:"text"`
	// EnableFuzzy additionally runs fuzzy detection on n-grams of Text.
	EnableFuzzy bool `json:"enable_fuzzy"`
}
|
|
|
|
// Match describes a single lexicon word found in the scanned text.
type Match struct {
	// Word is the lexicon entry that was hit.
	Word string `json:"word"`
	// Type is how the entry was found: "substring" or "fuzzy".
	Type string `json:"type"`
	// Distance is the distance reported by the fuzzy search. It is left out of
	// the JSON encoding when zero.
	Distance int `json:"distance,omitempty"`
}
|
|
|
|
type DetectResponse struct {
|
|
Hits []Match `json:"hits"`
|
|
}
|
|
|
|
// ContainsRequest is the input accepted by Service.Contains.
type ContainsRequest struct {
	// Text is the content to check.
	Text string `json:"text"`
}
|
|
|
|
// ContainsResponse is the result returned by Service.Contains.
type ContainsResponse struct {
	// Contains is the boolean verdict reported by the lexicon store.
	Contains bool `json:"contains"`
	// Word is the word reported by the lexicon store alongside the verdict.
	// It is left out of the JSON encoding when empty.
	Word string `json:"word,omitempty"`
}
|
|
|
|
type Service struct {
|
|
store *lexicon.Store
|
|
fuzzyCfg FuzzyConfig
|
|
}
|
|
|
|
func NewService(store *lexicon.Store) *Service {
|
|
return &Service{store: store, fuzzyCfg: FuzzyConfig{MinNgramLen: 2, MaxNgramLen: 10, MaxDistance: 1}}
|
|
}
|
|
|
|
func (s *Service) SetFuzzyConfig(cfg FuzzyConfig) {
|
|
s.fuzzyCfg = cfg
|
|
}
|
|
|
|
func (s *Service) Detect(req DetectRequest) DetectResponse {
|
|
text := strings.TrimSpace(req.Text)
|
|
if text == "" {
|
|
return DetectResponse{}
|
|
}
|
|
|
|
unique := make(map[string]Match)
|
|
|
|
// Substring hits: for each codepoint window from input, find lexicon entries containing it
|
|
s.store.ForEachSubstringMatch(text, func(word string) bool {
|
|
unique[word] = Match{Word: word, Type: "substring"}
|
|
return true
|
|
})
|
|
|
|
if req.EnableFuzzy {
|
|
for _, token := range generateNgrams(text, s.fuzzyCfg.MinNgramLen, s.fuzzyCfg.MaxNgramLen) {
|
|
s.store.ForEachFuzzyMatch(token, s.fuzzyCfg.MaxDistance, func(word string, d int) bool {
|
|
if old, ok := unique[word]; ok {
|
|
if old.Type == "substring" && d == 0 {
|
|
return true
|
|
}
|
|
}
|
|
unique[word] = Match{Word: word, Type: ternary(d == 0, "substring", "fuzzy"), Distance: d}
|
|
return true
|
|
})
|
|
}
|
|
}
|
|
|
|
res := DetectResponse{Hits: make([]Match, 0, len(unique))}
|
|
for _, v := range unique {
|
|
res.Hits = append(res.Hits, v)
|
|
}
|
|
sort.Slice(res.Hits, func(i, j int) bool {
|
|
if res.Hits[i].Type == res.Hits[j].Type {
|
|
if res.Hits[i].Distance == res.Hits[j].Distance {
|
|
return res.Hits[i].Word < res.Hits[j].Word
|
|
}
|
|
return res.Hits[i].Distance < res.Hits[j].Distance
|
|
}
|
|
return res.Hits[i].Type < res.Hits[j].Type
|
|
})
|
|
return res
|
|
}
|
|
|
|
func (s *Service) Contains(req ContainsRequest) ContainsResponse {
|
|
ok, w := s.store.HasAnyInText(strings.TrimSpace(req.Text))
|
|
return ContainsResponse{Contains: ok, Word: w}
|
|
}
|
|
|
|
// ternary returns a when cond is true and b otherwise. Both a and b are
// evaluated by the caller before the call, so neither is lazy.
func ternary[T any](cond bool, a, b T) T {
	if !cond {
		return b
	}
	return a
}
|
|
|
|
func generateNgrams(text string, minLen, maxLen int) []string {
|
|
if minLen < 1 {
|
|
minLen = 1
|
|
}
|
|
if maxLen < minLen {
|
|
maxLen = minLen
|
|
}
|
|
// Work on rune boundaries for CJK safety
|
|
runes := []rune(text)
|
|
n := len(runes)
|
|
var out []string
|
|
for i := 0; i < n; i++ {
|
|
for l := minLen; l <= maxLen && i+l <= n; l++ {
|
|
out = append(out, string(runes[i:i+l]))
|
|
}
|
|
}
|
|
return dedupStrings(out)
|
|
}
|
|
|
|
// dedupStrings returns the distinct elements of in, keeping the first
// occurrence of each and preserving their relative order. The result is a
// freshly allocated, non-nil slice; in is not modified.
func dedupStrings(in []string) []string {
	result := make([]string, 0, len(in))
	visited := make(map[string]struct{}, len(in))
	for i := range in {
		item := in[i]
		if _, dup := visited[item]; !dup {
			visited[item] = struct{}{}
			result = append(result, item)
		}
	}
	return result
}
|
|
|
|
// Blank-identifier reference that keeps the unicode/utf8 import in use when
// nothing else in the file refers to it; Go refuses to compile a file with an
// unused import.
var _ = utf8.RuneCountInString
|