10 Commits
main ... dev

Author SHA1 Message Date
RookieCuzz
b05a333e07 Update Jenkinsfile 2025-10-08 15:43:35 +08:00
RookieCuzz
ba1d8a32ff add: Jenkinscicd 2025-10-08 11:42:09 +08:00
RookieCuzz
dbf0ffc752 add: docker 打包 2025-10-08 10:43:58 +08:00
RookieCuzz
05c600b439 fix: ci 2025-10-08 10:37:31 +08:00
RookieCuzz
332db8f522 update: branch name 2025-10-08 10:35:43 +08:00
RookieCuzz
a2c61e116b update: 力工 2025-10-08 10:32:34 +08:00
RookieCuzz
f0a167af8f add: gitignore 2025-10-08 10:31:51 +08:00
RookieCuzz
80cfc45cb0 add: ci action 2025-10-08 10:31:37 +08:00
RookieCuzz
81a8126d68 Merge remote-tracking branch 'origin/dev' into dev 2025-10-08 10:26:45 +08:00
RookieCuzz
f956b85ed3 敏感词检测 2025-09-03 16:37:43 +08:00
12 changed files with 824 additions and 0 deletions

27
.dockerignore Normal file
View File

@@ -0,0 +1,27 @@
.git
.github
.idea
.vscode
Dockerfile
.dockerignore
# Build outputs
bin/
build/
dist/
out/
# Archives & temp
*.zip
*.tar
*.tar.gz
*.rar
*.7z
*.log
*.tmp
*.swp
~*
# Optional: exclude non-runtime assets
Organized/
ThirdPartyCompatibleFormats/

132
.github/workflows/server.yml vendored Normal file
View File

@@ -0,0 +1,132 @@
# Builds and tests the Go server, publishes release binaries for tags, and
# builds (and, outside pull requests, pushes) a multi-arch image to GHCR.
name: Server CI/CD

on:
  push:
    branches: [ "dev", "master" ]
    tags:
      - '*'
  pull_request:
    branches: [ "dev", "master" ]

# Workflow-wide default token scopes. A job that declares its own
# `permissions` block replaces these defaults for that job.
permissions:
  contents: read
  packages: write

jobs:
  build:
    name: Build server binaries
    runs-on: ubuntu-latest
    strategy:
      matrix:
        goos: [ linux, windows ]
        goarch: [ amd64 ]
    steps:
      - name: Checkout
        # Same major version as the checkout used by the docker job.
        uses: actions/checkout@v4
      - name: Setup Go
        uses: actions/setup-go@v4
        with:
          go-version: '1.22.x'
          cache: true
      - name: Download modules
        run: go mod download
      - name: Vet & Test
        run: |
          go vet ./...
          go test ./... -v
      - name: Build cmd/server
        shell: bash
        run: |
          mkdir -p build
          EXT=""
          if [ "${{ matrix.goos }}" = "windows" ]; then EXT=".exe"; fi
          OUT="server-${{ matrix.goos }}-${{ matrix.goarch }}${EXT}"
          echo "Building $OUT"
          GOOS=${{ matrix.goos }} GOARCH=${{ matrix.goarch }} \
            go build -trimpath -ldflags="-s -w" -o "build/${OUT}" ./cmd/server
      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: server-${{ matrix.goos }}-${{ matrix.goarch }}
          path: build/server-${{ matrix.goos }}-${{ matrix.goarch }}*

  release:
    name: Release binaries
    needs: build
    runs-on: ubuntu-latest
    if: startsWith(github.ref, 'refs/tags/')
    # Creating a release writes to the repository; the workflow-level
    # `contents: read` default is not sufficient for that.
    permissions:
      contents: write
    steps:
      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: dist
      - name: List artifacts
        run: ls -R dist
      - name: Create GitHub Release
        uses: softprops/action-gh-release@v1
        with:
          files: dist/**/*
          draft: false
          prerelease: false
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

  docker:
    name: Build and push Docker image
    needs: build
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup QEMU
        uses: docker/setup-qemu-action@v3
      - name: Setup Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Log in to GHCR
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Determine image name and tags
        id: imagetags
        shell: bash
        run: |
          # Image references must be lowercase, but owner names may contain
          # uppercase letters, so fold the owner before building the name.
          OWNER="${{ github.repository_owner }}"
          IMAGE="ghcr.io/${OWNER,,}/sensitive-lexicon-server"
          if [ "${{ github.event_name }}" = "pull_request" ]; then
            PUSH=false
            TAGS="${IMAGE}:pr-${{ github.event.pull_request.number }}"
          elif [ "${{ github.ref_type }}" = "tag" ]; then
            PUSH=true
            TAGS="${IMAGE}:${{ github.ref_name }},${IMAGE}:latest"
          else
            BRANCH="${{ github.ref_name }}"
            PUSH=true
            if [ "$BRANCH" = "dev" ] || [ "$BRANCH" = "master" ]; then
              TAGS="${IMAGE}:latest,${IMAGE}:sha-${{ github.sha }}"
            else
              TAGS="${IMAGE}:branch-${BRANCH},${IMAGE}:sha-${{ github.sha }}"
            fi
          fi
          # Values appended to GITHUB_ENV only become visible in later steps,
          # so keep shell variables for the log line below and export once.
          {
            echo "IMAGE=$IMAGE"
            echo "PUSH=$PUSH"
            echo "TAGS=$TAGS"
          } >> "$GITHUB_ENV"
          echo "Using tags: $TAGS"
      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile
          platforms: linux/amd64,linux/arm64
          push: ${{ env.PUSH }}
          tags: ${{ env.TAGS }}

45
.gitignore vendored Normal file
View File

@@ -0,0 +1,45 @@
# Go build artifacts
bin/
build/
dist/
out/
# Executables and shared libs
*.exe
*.exe~
*.dll
*.so
*.dylib
# Test and coverage outputs
*.test
coverage.out
*.coverprofile
# Logs
*.log
logs/
# Env files
.env
.env.*
# IDE settings
.vscode/
.idea/
*.iml
# OS files
.DS_Store
Thumbs.db
# Archives and temporary files
*.zip
*.tar
*.tar.gz
*.rar
*.7z
*.tmp
*.temp
*.swp
~*

37
Dockerfile Normal file
View File

@@ -0,0 +1,37 @@
# syntax=docker/dockerfile:1.4

# ---- Build stage: compile a static server binary ----
FROM golang:1.22-alpine AS builder
WORKDIR /src

# Copy only the module files first so the dependency layer stays cached
# until go.mod or go.sum change.
COPY go.mod go.sum ./
RUN go mod download

# Copy the rest of the source tree.
COPY . .

# go.mod/go.sum are intentionally not rewritten here (no `go mod tidy`):
# the image must be built from exactly the dependency set that was committed,
# and a stale go.sum should fail the build instead of being patched silently.

# Build a static binary for the platform requested by the builder.
ARG TARGETOS
ARG TARGETARCH
ENV CGO_ENABLED=0
RUN GOOS=$TARGETOS GOARCH=$TARGETARCH \
    go build -trimpath -ldflags="-s -w" -o /out/server ./cmd/server

# ---- Runtime stage: minimal image running as a non-root user ----
FROM gcr.io/distroless/static:nonroot
WORKDIR /app

# App binary
COPY --from=builder /out/server /app/server

# Default lexicon files; LEXICON_DIR below is resolved relative to WORKDIR.
COPY Vocabulary /app/Vocabulary

# Default envs
ENV PORT=8080
ENV LEXICON_DIR=Vocabulary
EXPOSE 8080

USER nonroot
ENTRYPOINT ["/app/server"]

98
Jenkinsfile vendored Normal file
View File

@@ -0,0 +1,98 @@
// Declarative pipeline: check out the sources, build and push the Docker
// image, then redeploy the compose project found in deploy/compose.
pipeline {
    agent { label 'dockeragent' }

    // Compilation now happens inside the Dockerfile; Jenkins no longer runs a local `go build`.
    environment {
        GO111MODULE = 'on' // enable Go modules mode
        CGO_ENABLED = '0'
        APP_NAME = 'sensitive-lexicon'
        REGISTRY = 'crpi-vqe38j3xeblrq0n4.cn-hangzhou.personal.cr.aliyuncs.com/go-mctown'
    }

    stages {
        stage('Checkout') {
            steps {
                checkout scm
            }
        }

        // The Dockerfile performs compilation and packaging; this stage only builds and pushes the image.
        stage('Docker Build & Push') {
            steps {
                withCredentials([usernamePassword(
                    credentialsId: 'aliyun-docker-login',
                    usernameVariable: 'DOCKER_USERNAME',
                    passwordVariable: 'DOCKER_PASSWORD'
                )]) {
                    // `\$` defers expansion to the shell so the secret is never
                    // interpolated into the Groovy string. Only the registry host
                    // (the part of REGISTRY before the first '/') is passed to login.
                    sh """
                        echo "\$DOCKER_PASSWORD" | docker login --username \$DOCKER_USERNAME --password-stdin ${env.REGISTRY.split('/')[0]}
                    """
                }
                script {
                    def imageTag = "${env.REGISTRY}/${env.APP_NAME}:${env.BUILD_NUMBER}"
                    def latestTag = "${env.REGISTRY}/${env.APP_NAME}:latest"
                    sh """
                        ls -l
                        docker build -t ${imageTag} --network=host .
                        docker tag ${imageTag} ${latestTag}
                        docker push ${imageTag}
                        docker push ${latestTag}
                    """
                }
            }
        }

        stage('Deploy All Compose Projects') {
            parallel {
                stage('Deploy compose1') {
                    agent { label 'dockeragent' }
                    steps {
                        checkout scm
                        sh """
                            pwd
                            ls -l
                        """
                        dir('deploy/compose') {
                            script {
                                withCredentials([usernamePassword(
                                    credentialsId: 'aliyun-docker-login',
                                    usernameVariable: 'DOCKER_USERNAME',
                                    passwordVariable: 'DOCKER_PASSWORD'
                                )]) {
                                    // Escaped like the login in the build stage: an unescaped
                                    // $DOCKER_PASSWORD would be expanded by Groovy, not the shell.
                                    sh """
                                        echo "\$DOCKER_PASSWORD" | docker login --username "\$DOCKER_USERNAME" --password-stdin ${env.REGISTRY.split('/')[0]}
                                    """
                                }
                                sh """
                                    pwd
                                    ls -l
                                    docker compose -f docker-compose.yml down || true
                                    docker compose -f docker-compose.yml pull
                                    docker compose -f docker-compose.yml up -d --remove-orphans
                                """
                            }
                        }
                    }
                }
            }
        }
    }

    post {
        always {
            cleanWs()
        }
        success {
            echo "✅ 构建成功!"
        }
        failure {
            echo "🔥 构建失败,请检查日志。"
        }
    }
}

View File

@@ -83,3 +83,21 @@ git clone https://github.com/Konsheng/Sensitive-lexicon.git
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=konsheng/Sensitive-lexicon&type=Date" />
</picture>
</a>
## 运行敏感词检测服务(Fiber + fuzzy-patricia)
```powershell
# Windows PowerShell 示例
$env:PORT="8080"; $env:LEXICON_DIR="Vocabulary"; $env:FUZZY_MAX_DISTANCE="1"
# 构建并运行
go mod tidy
go build -o bin\server.exe ./cmd/server
./bin/server.exe
```
- POST `/detect`
- 请求体: `{ "text": "待检测文本", "enable_fuzzy": true }`
- 响应: `{ "hits": [{"word":"...","type":"substring|fuzzy","distance":0}] }`
- POST `/reload` 重新加载 `Vocabulary` 目录
- GET `/health` 存活探针

View File

@@ -1,3 +1,4 @@
力工
穴海
协警
纳米比亚

96
cmd/server/main.go Normal file
View File

@@ -0,0 +1,96 @@
package main
import (
"github.com/go-playground/validator/v10"
"github.com/gofiber/fiber/v2"
"log"
"os"
"sensitive-lexicon/internal/detect"
"sensitive-lexicon/internal/lexicon"
"strconv"
"time"
)
// User is a sample payload whose fields carry go-playground/validator rules.
// NOTE(review): nothing visible in this file references User, and the custom
// "before_today" tag is not registered with a validator here — presumably
// leftover example code; confirm before relying on it.
type User struct {
Name string `json:"name" validate:"min=5,max=20"`
Age int `json:"age" validate:"gte=18"`
Enrollment time.Time `json:"enrollment" validate:"before_today"`
Graduation time.Time `json:"graduation" validate:"gtfield=Enrollment"`
}
// BeforeToday is a validator callback that reports whether the field holds a
// time.Time earlier than the current instant (time.Now). Fields of any other
// type are rejected.
func BeforeToday(fl validator.FieldLevel) bool {
	if ts, isTime := fl.Field().Interface().(time.Time); isTime {
		return ts.Before(time.Now())
	}
	return false
}
// main reads configuration from the environment, loads the lexicon, wires the
// detection service, registers the HTTP routes and serves until Listen fails.
func main() {
// Configuration, each value with a built-in default.
lexiconDir := getenv("LEXICON_DIR", "Vocabulary")
minNgram := getenvInt("FUZZY_MIN_NGRAM", 2)
maxNgram := getenvInt("FUZZY_MAX_NGRAM", 10)
maxDistance := getenvInt("FUZZY_MAX_DISTANCE", 1)
// The process refuses to start when the initial lexicon load fails.
store := lexicon.NewStore()
if err := store.LoadFromDir(lexiconDir); err != nil {
log.Fatalf("failed to load lexicon: %v", err)
}
service := detect.NewService(store)
service.SetFuzzyConfig(detect.FuzzyConfig{MinNgramLen: minNgram, MaxNgramLen: maxNgram, MaxDistance: maxDistance})
app := fiber.New()
// GET /health: liveness probe with a fixed JSON body.
app.Get("/health", func(c *fiber.Ctx) error {
return c.JSON(fiber.Map{"status": "ok"})
})
// POST /detect: parse the body into a DetectRequest (400 on parse failure)
// and return the service's result as JSON.
app.Post("/detect", func(c *fiber.Ctx) error {
var req detect.DetectRequest
if err := c.BodyParser(&req); err != nil {
return fiber.NewError(fiber.StatusBadRequest, err.Error())
}
res := service.Detect(req)
return c.JSON(res)
})
// POST /contains: same request handling as /detect, backed by service.Contains.
app.Post("/contains", func(c *fiber.Ctx) error {
var req detect.ContainsRequest
if err := c.BodyParser(&req); err != nil {
return fiber.NewError(fiber.StatusBadRequest, err.Error())
}
res := service.Contains(req)
return c.JSON(res)
})
// POST /reload: reload the lexicon from lexiconDir; 500 on failure, store
// statistics on success.
// NOTE(review): no authentication is applied to this route here — confirm it
// is only reachable from trusted callers.
app.Post("/reload", func(c *fiber.Ctx) error {
if err := store.LoadFromDir(lexiconDir); err != nil {
return fiber.NewError(fiber.StatusInternalServerError, err.Error())
}
stats := store.Stats()
return c.JSON(stats)
})
// Listen on all interfaces at the configured port.
port := getenv("PORT", "8080")
addr := ":" + port
log.Printf("listening on %s", addr)
if err := app.Listen(addr); err != nil {
log.Fatal(err)
}
}
// getenv returns the value of environment variable k, or def when the
// variable is unset or empty.
func getenv(k, def string) string {
	value := os.Getenv(k)
	if value == "" {
		return def
	}
	return value
}
// getenvInt returns environment variable k parsed as a base-10 integer.
// It returns def when the variable is unset or empty. A value that is set but
// not a valid integer also yields def, and is logged so that a configuration
// mistake does not go unnoticed.
func getenvInt(k string, def int) int {
	raw := os.Getenv(k)
	if raw == "" {
		return def
	}
	n, err := strconv.Atoi(raw)
	if err != nil {
		log.Printf("ignoring invalid integer %q for %s; using default %d", raw, k, def)
		return def
	}
	return n
}

30
go.mod Normal file
View File

@@ -0,0 +1,30 @@
module sensitive-lexicon
go 1.22
require (
github.com/go-playground/validator/v10 v10.27.0
github.com/gofiber/fiber/v2 v2.52.5
github.com/ozeidan/fuzzy-patricia v3.0.0+incompatible
)
require (
github.com/andybalholm/brotli v1.0.5 // indirect
github.com/gabriel-vasile/mimetype v1.4.8 // indirect
github.com/go-playground/locales v0.14.1 // indirect
github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/google/uuid v1.5.0 // indirect
github.com/klauspost/compress v1.17.0 // indirect
github.com/leodido/go-urn v1.4.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mattn/go-runewidth v0.0.15 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/valyala/bytebufferpool v1.0.0 // indirect
github.com/valyala/fasthttp v1.51.0 // indirect
github.com/valyala/tcplisten v1.0.0 // indirect
golang.org/x/crypto v0.33.0 // indirect
golang.org/x/net v0.34.0 // indirect
golang.org/x/sys v0.30.0 // indirect
golang.org/x/text v0.22.0 // indirect
)

55
go.sum Normal file
View File

@@ -0,0 +1,55 @@
github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs=
github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/gabriel-vasile/mimetype v1.4.8 h1:FfZ3gj38NjllZIeJAmMhr+qKL8Wu+nOoI3GqacKw1NM=
github.com/gabriel-vasile/mimetype v1.4.8/go.mod h1:ByKUIKGjh1ODkGM1asKUbQZOLGrPjydw3hYPU2YU9t8=
github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY=
github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY=
github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY=
github.com/go-playground/validator/v10 v10.27.0 h1:w8+XrWVMhGkxOaaowyKH35gFydVHOvC0/uWoy2Fzwn4=
github.com/go-playground/validator/v10 v10.27.0/go.mod h1:I5QpIEbmr8On7W0TktmJAumgzX4CA1XNl4ZmDuVHKKo=
github.com/gofiber/fiber/v2 v2.52.5 h1:tWoP1MJQjGEe4GB5TUGOi7P2E0ZMMRx5ZTG4rT+yGMo=
github.com/gofiber/fiber/v2 v2.52.5/go.mod h1:KEOE+cXMhXG0zHc9d8+E38hoX+ZN7bhOtgeF2oT6jrQ=
github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU=
github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/klauspost/compress v1.17.0 h1:Rnbp4K9EjcDuVuHtd0dgA4qNuv9yKDYKK1ulpJwgrqM=
github.com/klauspost/compress v1.17.0/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U=
github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/ozeidan/fuzzy-patricia v3.0.0+incompatible h1:Pl61eMyfJqgY/wytiI4vamqPYribq6d8VxeP1CNyg9M=
github.com/ozeidan/fuzzy-patricia v3.0.0+incompatible/go.mod h1:zgvuCcYS7wB7fVCGblsaFFmEe8+aAH13dTYm8FbrpsM=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
github.com/valyala/fasthttp v1.51.0 h1:8b30A5JlZ6C7AS81RsWjYMQmrZG6feChmgAolCl1SqA=
github.com/valyala/fasthttp v1.51.0/go.mod h1:oI2XroL+lI7vdXyYoQk03bXBThfFl2cVdIA3Xl7cH8g=
github.com/valyala/tcplisten v1.0.0 h1:rBHj/Xf+E1tRGZyWIWwJDiRY0zc1Js+CV5DqwacVSA8=
github.com/valyala/tcplisten v1.0.0/go.mod h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc=
golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus=
golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M=
golang.org/x/net v0.34.0 h1:Mb7Mrk043xzHgnRM88suvJFwzVrRfHEHJEl5/71CKw0=
golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

144
internal/detect/service.go Normal file
View File

@@ -0,0 +1,144 @@
package detect
import (
"sort"
"strings"
"unicode/utf8"
"sensitive-lexicon/internal/lexicon"
)
// FuzzyConfig tunes the fuzzy pass of Service.Detect.
// MinNgramLen and MaxNgramLen bound the length, in runes, of the n-grams cut
// from the input text; MaxDistance is the largest distance value a fuzzy hit
// may report and still be kept.
type FuzzyConfig struct {
MinNgramLen int
MaxNgramLen int
MaxDistance int
}
// DetectRequest is the JSON payload accepted by Service.Detect.
type DetectRequest struct {
// Text is the input to scan; Detect trims surrounding whitespace first.
Text string `json:"text"`
// If true, enable fuzzy detection on n-grams within the text
EnableFuzzy bool `json:"enable_fuzzy"`
}
// Match is a single lexicon entry reported by Service.Detect.
// Distance is omitted from the JSON encoding when it is zero (omitempty).
type Match struct {
Word string `json:"word"`
Type string `json:"type"` // substring | fuzzy
Distance int `json:"distance,omitempty"`
}
// DetectResponse is the JSON result of Service.Detect: the list of matches found.
type DetectResponse struct {
Hits []Match `json:"hits"`
}
// ContainsRequest is the JSON payload accepted by Service.Contains.
type ContainsRequest struct {
Text string `json:"text"`
}
// ContainsResponse is the JSON result of Service.Contains.
// Word carries the lexicon entry that was found and is omitted from the JSON
// encoding when empty.
type ContainsResponse struct {
Contains bool `json:"contains"`
Word string `json:"word,omitempty"`
}
// Service answers detection queries against a lexicon store.
type Service struct {
// store is the lexicon queried by Detect and Contains.
store *lexicon.Store
// fuzzyCfg holds the parameters used by Detect's fuzzy pass.
fuzzyCfg FuzzyConfig
}
// NewService builds a Service backed by store, using the default fuzzy
// settings: n-grams of 2 to 10 runes and a maximum distance of 1.
func NewService(store *lexicon.Store) *Service {
	defaults := FuzzyConfig{MinNgramLen: 2, MaxNgramLen: 10, MaxDistance: 1}
	svc := &Service{store: store, fuzzyCfg: defaults}
	return svc
}
// SetFuzzyConfig replaces the fuzzy-matching parameters used by Detect.
// The assignment is not synchronized.
func (s *Service) SetFuzzyConfig(cfg FuzzyConfig) {
s.fuzzyCfg = cfg
}
// Detect reports the lexicon entries matched by req.Text.
//
// The text is trimmed first. Blank input yields an empty, non-nil hit list so
// that the JSON encoding is always an array and never null.
//
// Matching runs in up to two passes:
//   - the whole trimmed text is handed to the store's substring search;
//   - when req.EnableFuzzy is set, every distinct n-gram of the text (lengths
//     taken from the fuzzy config) is handed to the store's fuzzy search.
//
// Each word is reported once with its best match: a substring hit is never
// downgraded by a later fuzzy hit, and among fuzzy hits the smallest distance
// wins, regardless of the order in which n-grams are processed.
//
// Hits are sorted by type, then distance, then word.
func (s *Service) Detect(req DetectRequest) DetectResponse {
	text := strings.TrimSpace(req.Text)
	if text == "" {
		return DetectResponse{Hits: []Match{}}
	}

	unique := make(map[string]Match)

	// NOTE(review): the complete text is passed as a single query here, not
	// individual windows of it — confirm that is the intended lookup.
	s.store.ForEachSubstringMatch(text, func(word string) bool {
		unique[word] = Match{Word: word, Type: "substring"}
		return true
	})

	if req.EnableFuzzy {
		for _, token := range generateNgrams(text, s.fuzzyCfg.MinNgramLen, s.fuzzyCfg.MaxNgramLen) {
			s.store.ForEachFuzzyMatch(token, s.fuzzyCfg.MaxDistance, func(word string, d int) bool {
				// Keep what is already recorded unless this hit is strictly better.
				if old, ok := unique[word]; ok && (old.Type == "substring" || old.Distance <= d) {
					return true
				}
				unique[word] = Match{Word: word, Type: ternary(d == 0, "substring", "fuzzy"), Distance: d}
				return true
			})
		}
	}

	res := DetectResponse{Hits: make([]Match, 0, len(unique))}
	for _, v := range unique {
		res.Hits = append(res.Hits, v)
	}
	// Map iteration order is random; sort for a deterministic response.
	sort.Slice(res.Hits, func(i, j int) bool {
		a, b := res.Hits[i], res.Hits[j]
		if a.Type != b.Type {
			return a.Type < b.Type
		}
		if a.Distance != b.Distance {
			return a.Distance < b.Distance
		}
		return a.Word < b.Word
	})
	return res
}
// Contains trims req.Text, asks the store whether any lexicon entry occurs in
// it, and returns the verdict together with the entry the store reported.
func (s *Service) Contains(req ContainsRequest) ContainsResponse {
	trimmed := strings.TrimSpace(req.Text)
	found, word := s.store.HasAnyInText(trimmed)
	return ContainsResponse{Contains: found, Word: word}
}
// ternary returns a when cond is true and b otherwise. Both arguments are
// evaluated by the caller before the call.
func ternary[T any](cond bool, a, b T) T {
	chosen := b
	if cond {
		chosen = a
	}
	return chosen
}
// generateNgrams returns every distinct substring of text whose length in
// runes lies between minLen and maxLen inclusive, in order of first
// appearance. minLen is raised to 1 when smaller, and maxLen is raised to
// minLen when smaller. Slicing is done on runes so multi-byte characters are
// never split.
func generateNgrams(text string, minLen, maxLen int) []string {
	if minLen < 1 {
		minLen = 1
	}
	if maxLen < minLen {
		maxLen = minLen
	}

	chars := []rune(text)
	total := len(chars)

	var grams []string
	for start := range chars {
		// end is exclusive; stop at the end of the text or at the longest allowed window.
		for end := start + minLen; end <= total && end-start <= maxLen; end++ {
			grams = append(grams, string(chars[start:end]))
		}
	}
	return dedupStrings(grams)
}
// dedupStrings returns the elements of in with duplicates removed, keeping the
// first occurrence of each value and the original order. The result is always
// a non-nil slice.
func dedupStrings(in []string) []string {
	unique := make([]string, 0, len(in))
	visited := make(map[string]struct{}, len(in))
	for _, item := range in {
		if _, dup := visited[item]; !dup {
			visited[item] = struct{}{}
			unique = append(unique, item)
		}
	}
	return unique
}
// Keeps the "unicode/utf8" import referenced so this file still compiles when
// no other code in it uses the package.
var _ = utf8.RuneCountInString

141
internal/lexicon/store.go Normal file
View File

@@ -0,0 +1,141 @@
package lexicon
import (
"bufio"
"errors"
"os"
"path/filepath"
"strings"
"sync"
"github.com/ozeidan/fuzzy-patricia/patricia"
)
// Store holds the trie and statistics for the loaded lexicon.
type Store struct {
// mu guards trie and cnt.
mu sync.RWMutex
// trie is the current lexicon; LoadFromDir replaces it wholesale.
trie *patricia.Trie
// cnt is the entry count recorded by the last successful load.
cnt int
}
// NewStore returns a Store that starts out with an empty trie.
func NewStore() *Store {
	st := new(Store)
	st.trie = patricia.NewTrie()
	return st
}
// LoadFromDir rebuilds the lexicon from every ".txt" file (extension compared
// case-insensitively) found under dir, including subdirectories.
//
// Each line is one entry. Surrounding whitespace is trimmed, a UTF-8 byte
// order mark at the start of a file is dropped, and blank lines and lines
// starting with "#" are skipped. Entries that occur more than once are stored
// and counted once.
//
// The new trie is built without holding the store lock, so readers are not
// blocked while files are read; the lock is taken only to swap the result in.
// On any error, or when no entry was loaded, the previously loaded lexicon is
// left untouched and the error is returned.
func (s *Store) LoadFromDir(dir string) error {
	newTrie := patricia.NewTrie()
	count := 0

	walkErr := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		if strings.ToLower(filepath.Ext(info.Name())) != ".txt" {
			return nil
		}

		f, err := os.Open(path)
		if err != nil {
			return err
		}
		// Runs when this callback returns, i.e. once per file.
		defer f.Close()

		scanner := bufio.NewScanner(f)
		// Allow lines of up to 1 MiB instead of the scanner's smaller default.
		scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

		firstLine := true
		for scanner.Scan() {
			line := scanner.Text()
			if firstLine {
				// TrimSpace does not remove a BOM, so strip it explicitly.
				line = strings.TrimPrefix(line, "\ufeff")
				firstLine = false
			}
			w := strings.TrimSpace(line)
			if w == "" || strings.HasPrefix(w, "#") {
				continue
			}
			// Insert reports false for a key that is already present.
			if newTrie.Insert(patricia.Prefix(w), struct{}{}) {
				count++
			}
		}
		return scanner.Err()
	})
	if walkErr != nil {
		return walkErr
	}
	if count == 0 {
		return errors.New("no entries loaded")
	}

	// Swap in.
	s.mu.Lock()
	s.trie = newTrie
	s.cnt = count
	s.mu.Unlock()
	return nil
}
// Stats returns a snapshot of store statistics; currently only "count", the
// entry count recorded by the last successful load.
func (s *Store) Stats() map[string]interface{} {
	s.mu.RLock()
	total := s.cnt
	s.mu.RUnlock()

	return map[string]interface{}{"count": total}
}
// ForEachSubstringMatch calls visit for each key the trie's substring search
// reports for query. Nothing is visited when query is empty.
//
// visit returns true to keep receiving keys and false to stop; once it has
// returned false it is not called again. The underlying traversal itself is
// not aborted, because no library stop signal is relied upon.
//
// The trie pointer is copied under the read lock and then used without it, so
// a concurrent reload does not block the traversal.
func (s *Store) ForEachSubstringMatch(query string, visit func(word string) bool) {
	s.mu.RLock()
	tr := s.trie
	s.mu.RUnlock()
	if tr == nil || query == "" {
		return
	}

	stopped := false
	// NOTE(review): the boolean argument is the library's case flag — confirm
	// in the fuzzy-patricia documentation which matching mode `false` selects.
	tr.VisitSubstring(patricia.Prefix(query), false, func(prefix patricia.Prefix, _ patricia.Item) error {
		if stopped {
			return nil
		}
		if !visit(string(prefix)) {
			stopped = true
		}
		return nil
	})
}
// ForEachFuzzyMatch calls visit for each key the trie's fuzzy search reports
// for query with a distance value of at most maxDistance. Nothing is visited
// when query is empty.
//
// visit returns true to keep receiving keys and false to stop; once it has
// returned false it is not called again. The underlying traversal itself is
// not aborted, because no library stop signal is relied upon.
//
// NOTE(review): the distance passed to visit is whatever integer the library
// hands to its fuzzy visitor — confirm its exact meaning in the fuzzy-patricia
// documentation.
func (s *Store) ForEachFuzzyMatch(query string, maxDistance int, visit func(word string, distance int) bool) {
	s.mu.RLock()
	tr := s.trie
	s.mu.RUnlock()
	if tr == nil || query == "" {
		return
	}

	stopped := false
	// NOTE(review): the boolean argument is the library's case flag — confirm
	// which matching mode `false` selects.
	tr.VisitFuzzy(patricia.Prefix(query), false, func(prefix patricia.Prefix, _ patricia.Item, dist int) error {
		if stopped || dist > maxDistance {
			return nil
		}
		if !visit(string(prefix), dist) {
			stopped = true
		}
		return nil
	})
}
// HasAnyInText reports whether a lexicon entry occurs somewhere in text and,
// if so, returns the entry found.
//
// For each rune offset, the remainder of the text from that offset is handed
// to the trie's prefix visitor; the first key reported at the earliest offset
// that reports any key is returned. Empty text never matches.
func (s *Store) HasAnyInText(text string) (bool, string) {
	s.mu.RLock()
	trie := s.trie
	s.mu.RUnlock()

	if text == "" || trie == nil {
		return false, ""
	}

	chars := []rune(text)
	for start := range chars {
		var hit string
		rest := patricia.Prefix(string(chars[start:]))
		trie.VisitPrefixes(rest, false, func(key patricia.Prefix, _ patricia.Item) error {
			// Remember only the first key reported for this offset.
			if hit == "" {
				hit = string(key)
			}
			return nil
		})
		if hit != "" {
			return true, hit
		}
	}
	return false, ""
}