10 Commits
main ... dev

Author SHA1 Message Date
RookieCuzz
b05a333e07 Update Jenkinsfile 2025-10-08 15:43:35 +08:00
RookieCuzz
ba1d8a32ff add: Jenkinscicd 2025-10-08 11:42:09 +08:00
RookieCuzz
dbf0ffc752 add: docker 打包 2025-10-08 10:43:58 +08:00
RookieCuzz
05c600b439 fix: ci 2025-10-08 10:37:31 +08:00
RookieCuzz
332db8f522 update: branch name 2025-10-08 10:35:43 +08:00
RookieCuzz
a2c61e116b update: 力工 2025-10-08 10:32:34 +08:00
RookieCuzz
f0a167af8f add: gitignore 2025-10-08 10:31:51 +08:00
RookieCuzz
80cfc45cb0 add: ci action 2025-10-08 10:31:37 +08:00
RookieCuzz
81a8126d68 Merge remote-tracking branch 'origin/dev' into dev 2025-10-08 10:26:45 +08:00
RookieCuzz
f956b85ed3 敏感词检测 2025-09-03 16:37:43 +08:00
14 changed files with 807 additions and 58 deletions

27
.dockerignore Normal file
View File

@@ -0,0 +1,27 @@
.git
.github
.idea
.vscode
Dockerfile
.dockerignore
# Build outputs
bin/
build/
dist/
out/
# Archives & temp
*.zip
*.tar
*.tar.gz
*.rar
*.7z
*.log
*.tmp
*.swp
~*
# Optional: exclude non-runtime assets
Organized/
ThirdPartyCompatibleFormats/

132
.github/workflows/server.yml vendored Normal file
View File

@@ -0,0 +1,132 @@
# CI/CD for the Go server: build and test, publish binaries for tags, build a Docker image for GHCR.
name: Server CI/CD
# Runs on pushes to dev/master, on every tag, and on pull requests targeting dev/master.
on:
push:
branches: [ "dev", "master" ]
tags:
- '*'
pull_request:
branches: [ "dev", "master" ]
# Workflow-level GITHUB_TOKEN permissions.
permissions:
contents: read
packages: write
jobs:
build:
  # Cross-compiles cmd/server for every goos/goarch pair in the matrix and
  # uploads each binary as its own artifact.
  name: Build server binaries
  runs-on: ubuntu-latest
  strategy:
    matrix:
      goos: [ linux, windows ]
      goarch: [ amd64 ]
  steps:
    - name: Checkout
      # Same major version as the checkout step in the docker job.
      uses: actions/checkout@v4
    - name: Setup Go
      uses: actions/setup-go@v4
      with:
        go-version: '1.22.x'
        cache: true
    - name: Download modules
      run: go mod download
    - name: Vet & Test
      run: |
        go vet ./...
        go test ./... -v
    - name: Build cmd/server
      shell: bash
      run: |
        mkdir -p build
        # Windows binaries need the .exe suffix.
        EXT=""
        if [ "${{ matrix.goos }}" = "windows" ]; then EXT=".exe"; fi
        OUT="server-${{ matrix.goos }}-${{ matrix.goarch }}${EXT}"
        echo "Building $OUT"
        GOOS=${{ matrix.goos }} GOARCH=${{ matrix.goarch }} \
          go build -trimpath -ldflags="-s -w" -o "build/${OUT}" ./cmd/server
    - name: Upload artifact
      uses: actions/upload-artifact@v4
      with:
        name: server-${{ matrix.goos }}-${{ matrix.goarch }}
        # Trailing wildcard picks up the optional .exe suffix.
        path: build/server-${{ matrix.goos }}-${{ matrix.goarch }}*
release:
  # Publishes the binaries produced by the build job as a GitHub Release.
  # Only runs for tag refs.
  name: Release binaries
  needs: build
  runs-on: ubuntu-latest
  if: startsWith(github.ref, 'refs/tags/')
  # Creating a release writes to repository contents; the workflow-level
  # default (contents: read) is not sufficient for that.
  permissions:
    contents: write
  steps:
    - name: Download artifacts
      uses: actions/download-artifact@v4
      with:
        path: dist
    - name: List artifacts
      run: ls -R dist
    - name: Create GitHub Release
      uses: softprops/action-gh-release@v1
      with:
        files: dist/**/*
        draft: false
        prerelease: false
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
docker:
  # Builds the multi-platform image and pushes it to GHCR.
  # Pull requests build the image but never push it.
  name: Build and push Docker image
  needs: build
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Setup QEMU
      uses: docker/setup-qemu-action@v3
    - name: Setup Docker Buildx
      uses: docker/setup-buildx-action@v3
    - name: Log in to GHCR
      # Nothing is pushed for pull requests, so no registry login is needed.
      if: github.event_name != 'pull_request'
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - name: Determine image name and tags
      id: imagetags
      shell: bash
      # Context values are passed through the environment instead of being
      # spliced into the script text, so a crafted ref name cannot inject
      # shell commands.
      env:
        OWNER: ${{ github.repository_owner }}
        EVENT_NAME: ${{ github.event_name }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
        REF_TYPE: ${{ github.ref_type }}
        REF_NAME: ${{ github.ref_name }}
        SHA: ${{ github.sha }}
      run: |
        # Image references must be lowercase; the owner name may contain capitals.
        IMAGE="ghcr.io/${OWNER,,}/sensitive-lexicon-server"
        if [ "$EVENT_NAME" = "pull_request" ]; then
          PUSH=false
          TAGS="${IMAGE}:pr-${PR_NUMBER}"
        elif [ "$REF_TYPE" = "tag" ]; then
          PUSH=true
          TAGS="${IMAGE}:${REF_NAME},${IMAGE}:latest"
        else
          PUSH=true
          if [ "$REF_NAME" = "dev" ] || [ "$REF_NAME" = "master" ]; then
            TAGS="${IMAGE}:latest,${IMAGE}:sha-${SHA}"
          else
            TAGS="${IMAGE}:branch-${REF_NAME},${IMAGE}:sha-${SHA}"
          fi
        fi
        {
          echo "IMAGE=$IMAGE"
          echo "PUSH=$PUSH"
          echo "TAGS=$TAGS"
        } >> "$GITHUB_ENV"
        # Values written to GITHUB_ENV only reach later steps, so print the
        # local shell variable here.
        echo "Using tags: $TAGS"
    - name: Build and push
      uses: docker/build-push-action@v6
      with:
        context: .
        file: ./Dockerfile
        platforms: linux/amd64,linux/arm64
        push: ${{ env.PUSH }}
        tags: ${{ env.TAGS }}

33
.gitignore vendored
View File

@@ -15,7 +15,6 @@ out/
*.test
coverage.out
*.coverprofile
cover/
# Logs
*.log
@@ -24,26 +23,6 @@ logs/
# Env files
.env
.env.*
.envrc
# Caches & temp
.cache/
tmp/
# Profiling & traces
*.prof
*.pprof
trace.out
# PID and backup files
*.pid
*.bak
*.orig
# Editor swap files
*.swp
*.swo
~*
# IDE settings
.vscode/
@@ -54,15 +33,13 @@ trace.out
.DS_Store
Thumbs.db
# Archives
# Archives and temporary files
*.zip
*.tar
*.tar.gz
*.rar
*.7z
# Keys & certs (avoid committing secrets)
*.pem
*.key
*.crt
*.cert
*.tmp
*.temp
*.swp
~*

37
Dockerfile Normal file
View File

@@ -0,0 +1,37 @@
# syntax=docker/dockerfile:1.4
# Two-stage build: compile the server in a Go image, then copy the binary and
# the default lexicon into a distroless runtime image.
FROM golang:1.22-alpine AS builder
WORKDIR /src
# Pre-fetch deps
# Only go.mod/go.sum are copied first so this layer is reused until they change.
COPY go.mod go.sum ./
RUN go mod download
# Copy source
COPY . .
# Normalize and tidy modules inside build context
# NOTE(review): `go mod tidy` may rewrite go.mod/go.sum during the image build,
# so the image can be built from dependencies that differ from the committed
# ones - confirm this is intended.
RUN go mod tidy
# Build static binary for target platform
# TARGETOS/TARGETARCH are build args; presumably filled in automatically by
# BuildKit for each requested platform - verify for the builders in use.
ARG TARGETOS
ARG TARGETARCH
# cgo is disabled so the binary does not link against libc.
ENV CGO_ENABLED=0
RUN GOOS=$TARGETOS GOARCH=$TARGETARCH \
go build -trimpath -ldflags="-s -w" -o /out/server ./cmd/server
# Runtime stage.
FROM gcr.io/distroless/static:nonroot
WORKDIR /app
# App binary
COPY --from=builder /out/server /app/server
# Default lexicon files
COPY Vocabulary /app/Vocabulary
# Default envs
# LEXICON_DIR is relative; with WORKDIR /app it points at the /app/Vocabulary copied above.
ENV PORT=8080
ENV LEXICON_DIR=Vocabulary
EXPOSE 8080
USER nonroot
ENTRYPOINT ["/app/server"]

98
Jenkinsfile vendored Normal file
View File

@@ -0,0 +1,98 @@
// Declarative pipeline: check out the sources, build and push the Docker image,
// then redeploy the compose project in deploy/compose.
pipeline {
    agent { label 'dockeragent' }

    // Compilation now happens inside the Dockerfile; Jenkins no longer runs a local go build.
    environment {
        GO111MODULE = 'on' // enable Go modules mode
        CGO_ENABLED = '0'
        APP_NAME = 'sensitive-lexicon'
        REGISTRY = 'crpi-vqe38j3xeblrq0n4.cn-hangzhou.personal.cr.aliyuncs.com/go-mctown'
    }

    stages {
        stage('Checkout') {
            steps {
                checkout scm
            }
        }

        // The Dockerfile compiles and packages the binary; this stage only builds and pushes the image.
        stage('Docker Build & Push') {
            steps {
                withCredentials([usernamePassword(
                    credentialsId: 'aliyun-docker-login',
                    usernameVariable: 'DOCKER_USERNAME',
                    passwordVariable: 'DOCKER_PASSWORD'
                )]) {
                    // \$ keeps the credential variables out of Groovy interpolation so the
                    // shell expands them at run time. Only the registry host (the part of
                    // REGISTRY before the first '/') is interpolated by Groovy.
                    sh """
                        echo "\$DOCKER_PASSWORD" | docker login --username "\$DOCKER_USERNAME" --password-stdin ${env.REGISTRY.split('/')[0]}
                    """
                }
                script {
                    // Every build is pushed under its build number and as :latest.
                    def imageTag = "${env.REGISTRY}/${env.APP_NAME}:${env.BUILD_NUMBER}"
                    def latestTag = "${env.REGISTRY}/${env.APP_NAME}:latest"
                    sh """
                        ls -l
                        docker build -t ${imageTag} --network=host .
                        docker tag ${imageTag} ${latestTag}
                        docker push ${imageTag}
                        docker push ${latestTag}
                    """
                }
            }
        }

        stage('Deploy All Compose Projects') {
            parallel {
                stage('Deploy compose1') {
                    agent { label 'dockeragent' }
                    steps {
                        checkout scm
                        sh """
                            pwd
                            ls -l
                        """
                        dir('deploy/compose') {
                            script {
                                withCredentials([usernamePassword(
                                    credentialsId: 'aliyun-docker-login',
                                    usernameVariable: 'DOCKER_USERNAME',
                                    passwordVariable: 'DOCKER_PASSWORD'
                                )]) {
                                    // The credentials must be escaped here as well: an unescaped
                                    // \$DOCKER_PASSWORD inside a Groovy double-quoted string is
                                    // interpolated by Groovy, which puts the secret into the
                                    // script text and breaks on passwords containing shell
                                    // metacharacters.
                                    sh """
                                        echo "\$DOCKER_PASSWORD" | docker login --username "\$DOCKER_USERNAME" --password-stdin ${env.REGISTRY.split('/')[0]}
                                    """
                                }
                                // Stop the old stack (ignoring failure), pull the new image, start again.
                                sh """
                                    pwd
                                    ls -l
                                    docker compose -f docker-compose.yml down || true
                                    docker compose -f docker-compose.yml pull
                                    docker compose -f docker-compose.yml up -d --remove-orphans
                                """
                            }
                        }
                    }
                }
            }
        }
    }

    post {
        always {
            cleanWs()
        }
        success {
            echo "✅ 构建成功!"
        }
        failure {
            echo "🔥 构建失败,请检查日志。"
        }
    }
}

View File

@@ -84,12 +84,20 @@ git clone https://github.com/Konsheng/Sensitive-lexicon.git
</picture>
</a>
## 敏感词检测服务(Go)
## 运行敏感词检测服务(Fiber + fuzzy-patricia)
- 提供基于 Go 的敏感词检测服务,支持模糊匹配与词库热加载。
- 服务代码路径:`./cmd/server`(包含 REST API:`/detect`、`/contains`、`/reload`、`/health`)。
- 分支导航:
- `dev`:开发版,服务与工程化更新更频繁:https://github.com/Konsheng/Sensitive-lexicon/tree/dev
- Docker 运行示例:
- `docker run -p 8080:8080 ghcr.io/<你的用户名>/sensitive-lexicon-server:latest`
- 环境变量:`PORT`, `LEXICON_DIR`, `FUZZY_MIN_NGRAM`, `FUZZY_MAX_NGRAM`, `FUZZY_MAX_DISTANCE`
```bash
# Windows PowerShell 示例
$env:PORT="8080"; $env:LEXICON_DIR="Vocabulary"; $env:FUZZY_MAX_DISTANCE="1"
# 构建并运行
go mod tidy
go build -o bin\server.exe ./cmd/server
./bin/server.exe
```
- POST `/detect`
- 请求体: `{ "text": "待检测文本", "enable_fuzzy": true }`
- 响应: `{ "hits": [{"word":"...","type":"substring|fuzzy","distance":0}] }`
- POST `/reload` 重新加载 `Vocabulary` 目录
- GET `/health` 存活探针

View File

@@ -1,3 +1,4 @@
力工
穴海
协警
纳米比亚

View File

@@ -1,10 +1,8 @@
习近平
平近习
xjp
JinPing Xi
习太子
习明泽
彭丽媛
老习
温家宝
温加宝
@@ -269,6 +267,7 @@ g产
g匪
共匪
仇共
政府
症腐
政腐
政付
@@ -279,6 +278,7 @@ zhengfu
政zhi
挡中央
档中央
中央领导
中国zf
中央zf
国wu院
@@ -286,24 +286,41 @@ zhengfu
gong和
大陆官方
北京政权
江泽民
胡锦涛
温家宝
习近平
习仲勋
贺国强
贺子珍
周永康
李长春
李德生
王岐山
姚依林
回良玉
李源潮
李干成
戴秉国
黄镇
刘延东
刘瑞龙
俞正声
黄敬
薄熙
薄一波
周小川
周建南
温云松
徐明
江泽慧
江绵恒
江绵康
李小鹏
李鹏
李小琳
朱云来
朱容基
法轮功
李洪志
新疆骚乱
李强
郭文贵
张又侠
老王来了
六四
反送中

View File

@@ -1,14 +0,0 @@
力工梭哈
力工梭哈定律
力工梭哈理论
供养者思维
梭哈结婚
梭哈婚姻
梭哈成家
梭哈买房
梭哈家庭
攒钱结婚梭哈
理工男脉冲
性压抑大一统
性压抑大一统理论
性压抑相对论

96
cmd/server/main.go Normal file
View File

@@ -0,0 +1,96 @@
package main
import (
"github.com/go-playground/validator/v10"
"github.com/gofiber/fiber/v2"
"log"
"os"
"sensitive-lexicon/internal/detect"
"sensitive-lexicon/internal/lexicon"
"strconv"
"time"
)
// User is a sample payload annotated with go-playground/validator tags.
// NOTE(review): nothing in the visible code references User or creates a
// validator instance, so the custom "before_today" tag is presumably registered
// elsewhere (if at all) - confirm, or remove this type.
type User struct {
Name string `json:"name" validate:"min=5,max=20"`
Age int `json:"age" validate:"gte=18"`
Enrollment time.Time `json:"enrollment" validate:"before_today"`
Graduation time.Time `json:"graduation" validate:"gtfield=Enrollment"`
}
// BeforeToday reports whether the validated field holds a time.Time that lies
// strictly before the current instant (time.Now). A field of any other type
// fails validation.
func BeforeToday(fl validator.FieldLevel) bool {
	if ts, isTime := fl.Field().Interface().(time.Time); isTime {
		return ts.Before(time.Now())
	}
	return false
}
// main loads the lexicon, configures the detection service, registers the HTTP
// routes and serves until the listener fails.
//
// Environment variables (with defaults): LEXICON_DIR ("Vocabulary"),
// FUZZY_MIN_NGRAM (2), FUZZY_MAX_NGRAM (10), FUZZY_MAX_DISTANCE (1) and
// PORT ("8080").
func main() {
	lexiconDir := getenv("LEXICON_DIR", "Vocabulary")
	fuzzyCfg := detect.FuzzyConfig{
		MinNgramLen: getenvInt("FUZZY_MIN_NGRAM", 2),
		MaxNgramLen: getenvInt("FUZZY_MAX_NGRAM", 10),
		MaxDistance: getenvInt("FUZZY_MAX_DISTANCE", 1),
	}

	store := lexicon.NewStore()
	// Without an initial lexicon the process cannot do anything useful, so a
	// failed first load is fatal.
	if loadErr := store.LoadFromDir(lexiconDir); loadErr != nil {
		log.Fatalf("failed to load lexicon: %v", loadErr)
	}

	service := detect.NewService(store)
	service.SetFuzzyConfig(fuzzyCfg)

	app := fiber.New()

	// Liveness probe.
	app.Get("/health", func(c *fiber.Ctx) error {
		return c.JSON(fiber.Map{"status": "ok"})
	})

	// Full detection: returns every lexicon hit for the posted text.
	app.Post("/detect", func(c *fiber.Ctx) error {
		req := detect.DetectRequest{}
		parseErr := c.BodyParser(&req)
		if parseErr != nil {
			return fiber.NewError(fiber.StatusBadRequest, parseErr.Error())
		}
		return c.JSON(service.Detect(req))
	})

	// Quick check: reports whether any lexicon word occurs in the posted text.
	app.Post("/contains", func(c *fiber.Ctx) error {
		req := detect.ContainsRequest{}
		parseErr := c.BodyParser(&req)
		if parseErr != nil {
			return fiber.NewError(fiber.StatusBadRequest, parseErr.Error())
		}
		return c.JSON(service.Contains(req))
	})

	// Re-reads the lexicon directory and answers with the store statistics.
	app.Post("/reload", func(c *fiber.Ctx) error {
		reloadErr := store.LoadFromDir(lexiconDir)
		if reloadErr != nil {
			return fiber.NewError(fiber.StatusInternalServerError, reloadErr.Error())
		}
		return c.JSON(store.Stats())
	})

	addr := ":" + getenv("PORT", "8080")
	log.Printf("listening on %s", addr)
	if serveErr := app.Listen(addr); serveErr != nil {
		log.Fatal(serveErr)
	}
}
// getenv returns the value of environment variable k, or def when the variable
// is unset or set to the empty string.
func getenv(k, def string) string {
	value := os.Getenv(k)
	if value == "" {
		return def
	}
	return value
}
// getenvInt returns environment variable k parsed as a base-10 int.
//
// def is returned when the variable is unset or empty. It is also returned when
// the value is not a valid integer; that case is logged so a mistyped setting
// does not silently revert to the default.
func getenvInt(k string, def int) int {
	raw := os.Getenv(k)
	if raw == "" {
		return def
	}
	parsed, err := strconv.Atoi(raw)
	if err != nil {
		log.Printf("ignoring invalid integer %q for %s; using default %d", raw, k, def)
		return def
	}
	return parsed
}

30
go.mod Normal file
View File

@@ -0,0 +1,30 @@
module sensitive-lexicon
go 1.22
require (
github.com/go-playground/validator/v10 v10.27.0
github.com/gofiber/fiber/v2 v2.52.5
github.com/ozeidan/fuzzy-patricia v3.0.0+incompatible
)
require (
github.com/andybalholm/brotli v1.0.5 // indirect
github.com/gabriel-vasile/mimetype v1.4.8 // indirect
github.com/go-playground/locales v0.14.1 // indirect
github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/google/uuid v1.5.0 // indirect
github.com/klauspost/compress v1.17.0 // indirect
github.com/leodido/go-urn v1.4.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mattn/go-runewidth v0.0.15 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/valyala/bytebufferpool v1.0.0 // indirect
github.com/valyala/fasthttp v1.51.0 // indirect
github.com/valyala/tcplisten v1.0.0 // indirect
golang.org/x/crypto v0.33.0 // indirect
golang.org/x/net v0.34.0 // indirect
golang.org/x/sys v0.30.0 // indirect
golang.org/x/text v0.22.0 // indirect
)

55
go.sum Normal file
View File

@@ -0,0 +1,55 @@
github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs=
github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/gabriel-vasile/mimetype v1.4.8 h1:FfZ3gj38NjllZIeJAmMhr+qKL8Wu+nOoI3GqacKw1NM=
github.com/gabriel-vasile/mimetype v1.4.8/go.mod h1:ByKUIKGjh1ODkGM1asKUbQZOLGrPjydw3hYPU2YU9t8=
github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY=
github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY=
github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY=
github.com/go-playground/validator/v10 v10.27.0 h1:w8+XrWVMhGkxOaaowyKH35gFydVHOvC0/uWoy2Fzwn4=
github.com/go-playground/validator/v10 v10.27.0/go.mod h1:I5QpIEbmr8On7W0TktmJAumgzX4CA1XNl4ZmDuVHKKo=
github.com/gofiber/fiber/v2 v2.52.5 h1:tWoP1MJQjGEe4GB5TUGOi7P2E0ZMMRx5ZTG4rT+yGMo=
github.com/gofiber/fiber/v2 v2.52.5/go.mod h1:KEOE+cXMhXG0zHc9d8+E38hoX+ZN7bhOtgeF2oT6jrQ=
github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU=
github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/klauspost/compress v1.17.0 h1:Rnbp4K9EjcDuVuHtd0dgA4qNuv9yKDYKK1ulpJwgrqM=
github.com/klauspost/compress v1.17.0/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U=
github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/ozeidan/fuzzy-patricia v3.0.0+incompatible h1:Pl61eMyfJqgY/wytiI4vamqPYribq6d8VxeP1CNyg9M=
github.com/ozeidan/fuzzy-patricia v3.0.0+incompatible/go.mod h1:zgvuCcYS7wB7fVCGblsaFFmEe8+aAH13dTYm8FbrpsM=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
github.com/valyala/fasthttp v1.51.0 h1:8b30A5JlZ6C7AS81RsWjYMQmrZG6feChmgAolCl1SqA=
github.com/valyala/fasthttp v1.51.0/go.mod h1:oI2XroL+lI7vdXyYoQk03bXBThfFl2cVdIA3Xl7cH8g=
github.com/valyala/tcplisten v1.0.0 h1:rBHj/Xf+E1tRGZyWIWwJDiRY0zc1Js+CV5DqwacVSA8=
github.com/valyala/tcplisten v1.0.0/go.mod h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc=
golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus=
golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M=
golang.org/x/net v0.34.0 h1:Mb7Mrk043xzHgnRM88suvJFwzVrRfHEHJEl5/71CKw0=
golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

144
internal/detect/service.go Normal file
View File

@@ -0,0 +1,144 @@
package detect
import (
"sort"
"strings"
"unicode/utf8"
"sensitive-lexicon/internal/lexicon"
)
// FuzzyConfig holds the tunables for fuzzy detection.
type FuzzyConfig struct {
// MinNgramLen is the shortest n-gram, counted in runes, cut from the input text.
MinNgramLen int
// MaxNgramLen is the longest n-gram, counted in runes, cut from the input text.
MaxNgramLen int
// MaxDistance is the largest distance value the store may report for a fuzzy hit to be kept.
MaxDistance int
}
// DetectRequest is the input of Service.Detect.
type DetectRequest struct {
// Text is the input to scan; Detect trims surrounding whitespace first.
Text string `json:"text"`
// If true, enable fuzzy detection on n-grams within the text
EnableFuzzy bool `json:"enable_fuzzy"`
}
// Match is one lexicon entry reported by Service.Detect.
type Match struct {
// Word is the lexicon entry that matched.
Word string `json:"word"`
Type string `json:"type"` // substring | fuzzy
// Distance is the value reported by the store's fuzzy search; it is omitted from JSON when zero.
Distance int `json:"distance,omitempty"`
}
// DetectResponse is the result of Service.Detect.
type DetectResponse struct {
// Hits holds at most one Match per lexicon word.
Hits []Match `json:"hits"`
}
// ContainsRequest is the input of Service.Contains.
type ContainsRequest struct {
// Text is the input to check; Contains trims surrounding whitespace first.
Text string `json:"text"`
}
// ContainsResponse is the result of Service.Contains.
type ContainsResponse struct {
// Contains is true when the store found a lexicon word in the text.
Contains bool `json:"contains"`
// Word is the word the store reported; it is omitted from JSON when empty.
Word string `json:"word,omitempty"`
}
// Service runs detection queries against a lexicon store.
type Service struct {
// store is the lexicon queried by Detect and Contains.
store *lexicon.Store
// fuzzyCfg controls n-gram generation and the fuzzy distance cut-off in Detect.
fuzzyCfg FuzzyConfig
}
// NewService builds a Service backed by store, preconfigured with the default
// fuzzy settings: n-grams of 2 to 10 runes and a maximum distance of 1.
func NewService(store *lexicon.Store) *Service {
	defaults := FuzzyConfig{
		MinNgramLen: 2,
		MaxNgramLen: 10,
		MaxDistance: 1,
	}
	svc := &Service{
		store:    store,
		fuzzyCfg: defaults,
	}
	return svc
}
// SetFuzzyConfig replaces the fuzzy-matching settings read by Detect.
// The assignment is not synchronized.
// NOTE(review): presumably only called during startup, before requests are served - confirm.
func (s *Service) SetFuzzyConfig(cfg FuzzyConfig) {
s.fuzzyCfg = cfg
}
// Detect scans req.Text against the lexicon and returns the distinct words hit.
//
// The text is whitespace-trimmed first; blank input yields an empty, non-nil
// hit list so the JSON response is always {"hits": [...]} and never null.
// The whole text is handed to the store's substring search and, when
// req.EnableFuzzy is set, every distinct n-gram of the text (sized by the
// fuzzy configuration) is handed to the store's fuzzy search.
//
// Each word is reported once with its best match: a substring hit is never
// replaced by a fuzzy one, and a fuzzy hit is only replaced by one with a
// smaller distance. Hits are ordered by Type (plain string order), then by
// Distance, then by Word.
func (s *Service) Detect(req DetectRequest) DetectResponse {
	text := strings.TrimSpace(req.Text)
	if text == "" {
		return DetectResponse{Hits: []Match{}}
	}

	best := make(map[string]Match)

	s.store.ForEachSubstringMatch(text, func(word string) bool {
		best[word] = Match{Word: word, Type: "substring"}
		return true
	})

	if req.EnableFuzzy {
		cfg := s.fuzzyCfg
		for _, token := range generateNgrams(text, cfg.MinNgramLen, cfg.MaxNgramLen) {
			s.store.ForEachFuzzyMatch(token, cfg.MaxDistance, func(word string, d int) bool {
				// Substring hits carry distance 0, so this one comparison both
				// protects them and keeps the closest fuzzy hit per word.
				if prev, seen := best[word]; seen && prev.Distance <= d {
					return true
				}
				kind := "fuzzy"
				if d == 0 {
					kind = "substring"
				}
				best[word] = Match{Word: word, Type: kind, Distance: d}
				return true
			})
		}
	}

	res := DetectResponse{Hits: make([]Match, 0, len(best))}
	for _, m := range best {
		res.Hits = append(res.Hits, m)
	}
	// Map iteration order is random; sort for a deterministic response.
	sort.Slice(res.Hits, func(i, j int) bool {
		a, b := res.Hits[i], res.Hits[j]
		if a.Type != b.Type {
			return a.Type < b.Type
		}
		if a.Distance != b.Distance {
			return a.Distance < b.Distance
		}
		return a.Word < b.Word
	})
	return res
}
// Contains reports whether the store finds any lexicon word in the
// whitespace-trimmed req.Text and, if so, which word it reported.
func (s *Service) Contains(req ContainsRequest) ContainsResponse {
	trimmed := strings.TrimSpace(req.Text)
	found, word := s.store.HasAnyInText(trimmed)
	return ContainsResponse{
		Contains: found,
		Word:     word,
	}
}
// ternary returns a when cond is true and b otherwise. Both arguments are
// evaluated by the caller before the call, so neither side is lazy.
func ternary[T any](cond bool, a, b T) T {
	result := b
	if cond {
		result = a
	}
	return result
}
// generateNgrams returns every distinct run of consecutive runes of text whose
// length lies between minLen and maxLen, in order of first occurrence.
//
// minLen is raised to 1 when smaller, and maxLen is raised to minLen when
// smaller. Slicing is done on runes rather than bytes so multi-byte characters
// (for example CJK) are never split.
func generateNgrams(text string, minLen, maxLen int) []string {
	lo := minLen
	if lo < 1 {
		lo = 1
	}
	hi := maxLen
	if hi < lo {
		hi = lo
	}

	chars := []rune(text)
	total := len(chars)

	var grams []string
	for start := range chars {
		// end is exclusive; stop at the end of the text or at the longest allowed gram.
		for end := start + lo; end <= total && end-start <= hi; end++ {
			grams = append(grams, string(chars[start:end]))
		}
	}
	return dedupStrings(grams)
}
// dedupStrings returns the elements of in with duplicates removed, keeping the
// first occurrence of each value and the original relative order. The result
// is always non-nil.
func dedupStrings(in []string) []string {
	unique := make([]string, 0, len(in))
	visited := make(map[string]struct{}, len(in))
	for i := range in {
		item := in[i]
		if _, dup := visited[item]; !dup {
			visited[item] = struct{}{}
			unique = append(unique, item)
		}
	}
	return unique
}
// Keeps the unicode/utf8 import referenced so the file compiles; nothing else
// in the visible code uses the package.
var _ = utf8.RuneCountInString

141
internal/lexicon/store.go Normal file
View File

@@ -0,0 +1,141 @@
package lexicon
import (
"bufio"
"errors"
"os"
"path/filepath"
"strings"
"sync"
"github.com/ozeidan/fuzzy-patricia/patricia"
)
// Store holds the trie and statistics for the loaded lexicon.
type Store struct {
// mu guards trie and cnt.
mu sync.RWMutex
// trie is the current lexicon; LoadFromDir replaces it with a freshly built trie.
trie *patricia.Trie
// cnt is the number of lines inserted by the last successful load (duplicates are counted).
cnt int
}
// NewStore returns a Store holding an empty trie; call LoadFromDir to fill it.
func NewStore() *Store {
	store := new(Store)
	store.trie = patricia.NewTrie()
	return store
}
// LoadFromDir rebuilds the lexicon from every ".txt" file (extension matched
// case-insensitively) found under dir, including subdirectories.
//
// Each line is whitespace-trimmed; blank lines and lines starting with "#" are
// skipped. A UTF-8 byte-order mark at the start of a file is dropped so it does
// not become part of the first word. Lines may be up to 1 MiB long.
//
// The new trie is built without holding the store lock; the lock is taken only
// to swap it in, so readers are not stalled by disk I/O during a reload. On any
// error, including "no entries loaded", the previously loaded lexicon is left
// untouched.
func (s *Store) LoadFromDir(dir string) error {
	newTrie := patricia.NewTrie()
	count := 0

	walkErr := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() || strings.ToLower(filepath.Ext(info.Name())) != ".txt" {
			return nil
		}

		f, err := os.Open(path)
		if err != nil {
			return err
		}
		// The callback runs once per file, so this defer closes each file
		// before the walk moves on to the next one.
		defer f.Close()

		scanner := bufio.NewScanner(f)
		// Increase buffer for long lines.
		scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

		firstLine := true
		for scanner.Scan() {
			line := scanner.Text()
			if firstLine {
				// strings.TrimSpace does not treat U+FEFF as whitespace.
				line = strings.TrimPrefix(line, "\ufeff")
				firstLine = false
			}
			w := strings.TrimSpace(line)
			if w == "" || strings.HasPrefix(w, "#") {
				continue
			}
			newTrie.Insert(patricia.Prefix(w), struct{}{})
			count++
		}
		return scanner.Err()
	})
	if walkErr != nil {
		return walkErr
	}
	if count == 0 {
		return errors.New("no entries loaded")
	}

	// Swap in: readers that already copied the old trie pointer keep using it.
	s.mu.Lock()
	s.trie = newTrie
	s.cnt = count
	s.mu.Unlock()
	return nil
}
// Stats returns a snapshot of the store statistics. The only key is "count",
// the entry count recorded by the last successful LoadFromDir.
func (s *Store) Stats() map[string]interface{} {
	s.mu.RLock()
	entries := s.cnt
	s.mu.RUnlock()

	stats := make(map[string]interface{}, 1)
	stats["count"] = entries
	return stats
}
// ForEachSubstringMatch calls visit with every key the trie's substring search
// reports for query. It does nothing when query is empty or no trie is loaded.
//
// The trie pointer is copied under the read lock and used after unlocking;
// LoadFromDir installs a new trie instead of mutating the current one, so the
// traversal works on a stable snapshot.
//
// The boolean returned by visit is ignored: the traversal always runs to the
// end.
// NOTE(review): the original author describes the `false` argument as a
// "caseSensitive" flag - confirm against the fuzzy-patricia API, since the
// opposite meaning would invert case handling.
func (s *Store) ForEachSubstringMatch(query string, visit func(word string) bool) {
	s.mu.RLock()
	current := s.trie
	s.mu.RUnlock()

	if query == "" || current == nil {
		return
	}

	onKey := func(key patricia.Prefix, _ patricia.Item) error {
		visit(string(key))
		return nil
	}
	current.VisitSubstring(patricia.Prefix(query), false, onKey)
}
// ForEachFuzzyMatch calls visit with every key the trie's fuzzy search reports
// for query, together with the distance value the library supplies, as long as
// that value does not exceed maxDistance. It does nothing when query is empty
// or no trie is loaded.
//
// The boolean returned by visit is ignored: the traversal always runs to the
// end.
// NOTE(review): the exact meaning of the library's distance value and of the
// `false` flag should be confirmed against the fuzzy-patricia API.
func (s *Store) ForEachFuzzyMatch(query string, maxDistance int, visit func(word string, distance int) bool) {
	s.mu.RLock()
	current := s.trie
	s.mu.RUnlock()

	if query == "" || current == nil {
		return
	}

	onKey := func(key patricia.Prefix, _ patricia.Item, dist int) error {
		if dist > maxDistance {
			return nil
		}
		visit(string(key), dist)
		return nil
	}
	current.VisitFuzzy(patricia.Prefix(query), false, onKey)
}
// HasAnyInText reports whether a lexicon word occurs anywhere in text, and
// returns the first such word found.
//
// For every rune offset, the remainder of the text from that offset is passed
// to the trie's prefix search; the first key that search reports is taken as
// the hit, and scanning stops at the first offset that produces one. Empty
// text or a missing trie yields (false, "").
func (s *Store) HasAnyInText(text string) (bool, string) {
	s.mu.RLock()
	current := s.trie
	s.mu.RUnlock()

	if text == "" || current == nil {
		return false, ""
	}

	chars := []rune(text)
	for start := range chars {
		var hit string
		remainder := patricia.Prefix(string(chars[start:]))
		current.VisitPrefixes(remainder, false, func(key patricia.Prefix, _ patricia.Item) error {
			// Keep only the first key reported for this offset.
			if hit == "" {
				hit = string(key)
			}
			return nil
		})
		if hit != "" {
			return true, hit
		}
	}
	return false, ""
}