A privacy-first, self-hosted, fully open source personal knowledge management system, written in TypeScript and Go. (PERSONAL FORK)
at lambda-fork/main 475 lines 12 kB view raw
1// SiYuan - Refactor your thinking 2// Copyright (c) 2020-present, b3log.org 3// 4// This program is free software: you can redistribute it and/or modify 5// it under the terms of the GNU Affero General Public License as published by 6// the Free Software Foundation, either version 3 of the License, or 7// (at your option) any later version. 8// 9// This program is distributed in the hope that it will be useful, 10// but WITHOUT ANY WARRANTY; without even the implied warranty of 11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12// GNU Affero General Public License for more details. 13// 14// You should have received a copy of the GNU Affero General Public License 15// along with this program. If not, see <https://www.gnu.org/licenses/>. 16 17package util 18 19import ( 20 "bytes" 21 "context" 22 "errors" 23 "fmt" 24 "os" 25 "os/exec" 26 "path/filepath" 27 "runtime/debug" 28 "strconv" 29 "strings" 30 "sync" 31 "sync/atomic" 32 "time" 33 34 "github.com/88250/go-humanize" 35 "github.com/88250/gulu" 36 "github.com/88250/lute/ast" 37 "github.com/88250/lute/html" 38 "github.com/siyuan-note/filelock" 39 "github.com/siyuan-note/logging" 40) 41 42var ( 43 TesseractBin = "tesseract" 44 TesseractEnabled bool 45 TesseractMaxSize = 2 * 1000 * uint64(1000) 46 TesseractLangs []string 47 48 assetsTexts = map[string]string{} 49 assetsTextsLock = sync.Mutex{} 50 assetsTextsChanged = atomic.Bool{} 51) 52 53func CleanNotExistAssetsTexts() { 54 assetsTextsLock.Lock() 55 defer assetsTextsLock.Unlock() 56 57 assetsPath := GetDataAssetsAbsPath() 58 var toRemoves []string 59 for asset, _ := range assetsTexts { 60 assetAbsPath := strings.TrimPrefix(asset, "assets") 61 assetAbsPath = filepath.Join(assetsPath, assetAbsPath) 62 if !filelock.IsExist(assetAbsPath) { 63 toRemoves = append(toRemoves, asset) 64 } 65 } 66 67 for _, asset := range toRemoves { 68 delete(assetsTexts, asset) 69 assetsTextsChanged.Store(true) 70 } 71 return 72} 73 74func LoadAssetsTexts() { 75 
assetsPath := GetDataAssetsAbsPath() 76 assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json") 77 if !filelock.IsExist(assetsTextsPath) { 78 return 79 } 80 81 start := time.Now() 82 data, err := filelock.ReadFile(assetsTextsPath) 83 if err != nil { 84 logging.LogErrorf("read assets texts failed: %s", err) 85 return 86 } 87 88 assetsTextsLock.Lock() 89 if err = gulu.JSON.UnmarshalJSON(data, &assetsTexts); err != nil { 90 logging.LogErrorf("unmarshal assets texts failed: %s", err) 91 if err = filelock.Remove(assetsTextsPath); err != nil { 92 logging.LogErrorf("removed corrupted assets texts failed: %s", err) 93 } 94 return 95 } 96 assetsTextsLock.Unlock() 97 debug.FreeOSMemory() 98 99 if elapsed := time.Since(start).Seconds(); 2 < elapsed { 100 logging.LogWarnf("read assets texts [%s] to [%s], elapsed [%.2fs]", humanize.BytesCustomCeil(uint64(len(data)), 2), assetsTextsPath, elapsed) 101 } 102 return 103} 104 105func SaveAssetsTexts() { 106 if !assetsTextsChanged.Load() { 107 return 108 } 109 110 start := time.Now() 111 112 assetsPath := GetDataAssetsAbsPath() 113 assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json") 114 115 assetsTextsLock.Lock() 116 // OCR 功能未开启且 ocr-texts.json 不存在时,如果 assetsTexts 为空则不创建文件 117 if !TesseractEnabled && !filelock.IsExist(assetsTextsPath) && 0 == len(assetsTexts) { 118 assetsTextsLock.Unlock() 119 assetsTextsChanged.Store(false) 120 return 121 } 122 data, err := gulu.JSON.MarshalIndentJSON(assetsTexts, "", " ") 123 if err != nil { 124 logging.LogErrorf("marshal assets texts failed: %s", err) 125 assetsTextsLock.Unlock() 126 return 127 } 128 assetsTextsLock.Unlock() 129 130 if err = filelock.WriteFile(assetsTextsPath, data); err != nil { 131 logging.LogErrorf("write assets texts failed: %s", err) 132 return 133 } 134 debug.FreeOSMemory() 135 136 if elapsed := time.Since(start).Seconds(); 2 < elapsed { 137 logging.LogWarnf("save assets texts [size=%s] to [%s], elapsed [%.2fs]", humanize.BytesCustomCeil(uint64(len(data)), 
2), assetsTextsPath, elapsed) 138 } 139 140 assetsTextsChanged.Store(false) 141} 142 143func SetAssetText(asset, text string) { 144 assetsTextsLock.Lock() 145 oldText, ok := assetsTexts[asset] 146 assetsTexts[asset] = text 147 assetsTextsLock.Unlock() 148 if !ok || oldText != text { 149 assetsTextsChanged.Store(true) 150 } 151} 152 153func ExistsAssetText(asset string) (ret bool) { 154 assetsTextsLock.Lock() 155 _, ret = assetsTexts[asset] 156 assetsTextsLock.Unlock() 157 return 158} 159 160func OcrAsset(asset string) (ret []map[string]interface{}, err error) { 161 if !TesseractEnabled { 162 err = fmt.Errorf(Langs[Lang][266]) 163 return 164 } 165 166 assetsPath := GetDataAssetsAbsPath() 167 assetAbsPath := strings.TrimPrefix(asset, "assets") 168 assetAbsPath = filepath.Join(assetsPath, assetAbsPath) 169 ret = Tesseract(assetAbsPath) 170 assetsTextsLock.Lock() 171 ocrText := GetOcrJsonText(ret) 172 assetsTexts[asset] = ocrText 173 assetsTextsLock.Unlock() 174 if "" != ocrText { 175 assetsTextsChanged.Store(true) 176 } 177 return 178} 179 180func GetAssetText(asset string) (ret string) { 181 assetsTextsLock.Lock() 182 ret = assetsTexts[asset] 183 assetsTextsLock.Unlock() 184 return 185} 186 187func RemoveAssetText(asset string) { 188 assetsTextsLock.Lock() 189 delete(assetsTexts, asset) 190 assetsTextsLock.Unlock() 191 assetsTextsChanged.Store(true) 192} 193 194var tesseractExts = []string{ 195 ".png", 196 ".jpg", 197 ".jpeg", 198 ".tif", 199 ".tiff", 200 ".bmp", 201 ".gif", 202 ".webp", 203 ".pbm", 204 ".pgm", 205 ".ppm", 206 ".pnm", 207} 208 209func IsTesseractExtractable(p string) bool { 210 lowerName := strings.ToLower(p) 211 for _, ext := range tesseractExts { 212 if strings.HasSuffix(lowerName, ext) { 213 return true 214 } 215 } 216 return false 217} 218 219// tesseractOCRLock 用于 Tesseract OCR 加锁串行执行提升稳定性 https://github.com/siyuan-note/siyuan/issues/7265 220var tesseractOCRLock = sync.Mutex{} 221 222func Tesseract(imgAbsPath string) (ret 
[]map[string]interface{}) { 223 if ContainerStd != Container || !TesseractEnabled { 224 return 225 } 226 227 defer logging.Recover() 228 tesseractOCRLock.Lock() 229 defer tesseractOCRLock.Unlock() 230 231 if !IsTesseractExtractable(imgAbsPath) { 232 return 233 } 234 235 info, err := os.Stat(imgAbsPath) 236 if err != nil { 237 return 238 } 239 240 if TesseractMaxSize < uint64(info.Size()) { 241 return 242 } 243 244 defer logging.Recover() 245 246 timeout := 7000 247 timeoutEnv := os.Getenv("SIYUAN_TESSERACT_TIMEOUT") 248 if "" != timeoutEnv { 249 if timeoutParsed, parseErr := strconv.Atoi(timeoutEnv); nil == parseErr { 250 timeout = timeoutParsed 251 } else { 252 logging.LogWarnf("parse tesseract timeout [%s] failed: %s", timeoutEnv, parseErr) 253 } 254 } 255 ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Millisecond) 256 defer cancel() 257 258 cmd := exec.CommandContext(ctx, TesseractBin, "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(TesseractLangs, "+"), "tsv") 259 gulu.CmdAttr(cmd) 260 output, err := cmd.CombinedOutput() 261 if errors.Is(ctx.Err(), context.DeadlineExceeded) { 262 logging.LogWarnf("tesseract [path=%s, size=%d] timeout [%dms]", imgAbsPath, info.Size(), timeout) 263 return 264 } 265 266 if err != nil { 267 logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err) 268 return 269 } 270 271 tsv := string(output) 272 //logging.LogInfof("tesseract [path=%s] success [%s]", imgAbsPath, tsv) 273 274 // 按行分割 TSV 数据 275 tsv = strings.ReplaceAll(tsv, "\r", "") 276 lines := strings.Split(tsv, "\n") 277 278 // 解析 TSV 数据 跳过标题行,从第二行开始处理 279 for _, line := range lines[1:] { 280 if line == "" { 281 continue // 跳过空行 282 } 283 // 分割每列数据 284 fields := strings.Split(line, "\t") 285 // 将字段名和字段值映射到一个 map 中 286 dataMap := make(map[string]interface{}) 287 headers := strings.Split(lines[0], "\t") 288 for i, header := range headers { 289 if i < len(fields) { 290 dataMap[header] = 
fields[i] 291 } else { 292 dataMap[header] = "" 293 } 294 } 295 ret = append(ret, dataMap) 296 } 297 298 tsv = RemoveInvalid(tsv) 299 tsv = RemoveRedundantSpace(tsv) 300 msg := fmt.Sprintf("OCR [%s] [%s]", html.EscapeString(info.Name()), html.EscapeString(GetOcrJsonText(ret))) 301 PushStatusBar(msg) 302 return 303} 304 305// GetOcrJsonText 提取并连接所有 text 字段的函数 306func GetOcrJsonText(jsonData []map[string]interface{}) (ret string) { 307 for _, dataMap := range jsonData { 308 // 检查 text 字段是否存在 309 if text, ok := dataMap["text"]; ok { 310 // 确保 text 是字符串类型 311 if textStr, ok := text.(string); ok { 312 ret += " " + strings.ReplaceAll(textStr, "\r", "") 313 } 314 } 315 } 316 ret = RemoveInvalid(ret) 317 ret = RemoveRedundantSpace(ret) 318 return ret 319} 320 321var tesseractInited = atomic.Bool{} 322 323func WaitForTesseractInit() { 324 for { 325 if tesseractInited.Load() { 326 return 327 } 328 time.Sleep(time.Second) 329 } 330} 331 332func InitTesseract() { 333 ver := getTesseractVer() 334 if "" == ver { 335 tesseractInited.Store(true) 336 return 337 } 338 339 langs := getTesseractLangs() 340 if 1 > len(langs) { 341 logging.LogWarnf("no tesseract langs found, disabling tesseract-ocr") 342 TesseractEnabled = false 343 tesseractInited.Store(true) 344 return 345 } 346 347 maxSizeVal := os.Getenv("SIYUAN_TESSERACT_MAX_SIZE") 348 if "" != maxSizeVal { 349 if maxSize, parseErr := strconv.ParseUint(maxSizeVal, 10, 64); nil == parseErr { 350 TesseractMaxSize = maxSize 351 } 352 } 353 354 // Supports via environment var `SIYUAN_TESSERACT_ENABLED=false` to close OCR https://github.com/siyuan-note/siyuan/issues/9619 355 if enabled := os.Getenv("SIYUAN_TESSERACT_ENABLED"); "" != enabled { 356 if enabledBool, parseErr := strconv.ParseBool(enabled); nil == parseErr { 357 TesseractEnabled = enabledBool 358 if !enabledBool { 359 logging.LogInfof("tesseract-ocr disabled by env") 360 tesseractInited.Store(true) 361 return 362 } 363 } 364 } 365 366 TesseractLangs = 
filterTesseractLangs(langs) 367 logging.LogInfof("tesseract-ocr enabled [ver=%s, maxSize=%s, langs=%s]", ver, humanize.BytesCustomCeil(TesseractMaxSize, 2), strings.Join(TesseractLangs, "+")) 368 tesseractInited.Store(true) 369} 370 371func filterTesseractLangs(langs []string) (ret []string) { 372 ret = []string{} 373 374 envLangsVal := os.Getenv("SIYUAN_TESSERACT_LANGS") 375 if "" != envLangsVal { 376 envLangs := strings.Split(envLangsVal, "+") 377 for _, lang := range langs { 378 if gulu.Str.Contains(lang, envLangs) { 379 ret = append(ret, lang) 380 } 381 } 382 } else { 383 for _, lang := range langs { 384 if "eng" == lang || strings.HasPrefix(lang, "chi") || "fra" == lang || "spa" == lang || "deu" == lang || 385 "rus" == lang || "jpn" == lang || "osd" == lang { 386 ret = append(ret, lang) 387 } 388 } 389 } 390 return ret 391} 392 393func getTesseractVer() (ret string) { 394 if ContainerStd != Container { 395 return 396 } 397 398 cmd := exec.Command(TesseractBin, "--version") 399 gulu.CmdAttr(cmd) 400 data, err := cmd.CombinedOutput() 401 if err != nil { 402 errMsg := strings.ToLower(err.Error()) 403 if strings.Contains(errMsg, "executable file not found") || strings.Contains(errMsg, "no such file or directory") { 404 // macOS 端 Tesseract OCR 安装后不识别 https://github.com/siyuan-note/siyuan/issues/7107 405 TesseractBin = "/usr/local/bin/tesseract" 406 cmd = exec.Command(TesseractBin, "--version") 407 gulu.CmdAttr(cmd) 408 data, err = cmd.CombinedOutput() 409 if err != nil { 410 errMsg = strings.ToLower(err.Error()) 411 if strings.Contains(errMsg, "executable file not found") || strings.Contains(errMsg, "no such file or directory") { 412 TesseractBin = "/opt/homebrew/bin/tesseract" 413 cmd = exec.Command(TesseractBin, "--version") 414 gulu.CmdAttr(cmd) 415 data, err = cmd.CombinedOutput() 416 } 417 } 418 } 419 } 420 if err != nil { 421 return 422 } 423 424 if strings.HasPrefix(string(data), "tesseract ") { 425 parts := bytes.Split(data, []byte("\n")) 426 if 0 < 
len(parts) { 427 ret = strings.TrimPrefix(string(parts[0]), "tesseract ") 428 ret = strings.TrimSpace(ret) 429 TesseractEnabled = true 430 } 431 return 432 } 433 return 434} 435 436func getTesseractLangs() (ret []string) { 437 if !TesseractEnabled { 438 return nil 439 } 440 441 cmd := exec.Command(TesseractBin, "--list-langs") 442 gulu.CmdAttr(cmd) 443 data, err := cmd.CombinedOutput() 444 if err != nil { 445 return nil 446 } 447 448 parts := bytes.Split(data, []byte("\n")) 449 if 0 < len(parts) { 450 parts = parts[1:] 451 } 452 for _, part := range parts { 453 part = bytes.TrimSpace(part) 454 if 0 == len(part) { 455 continue 456 } 457 ret = append(ret, string(part)) 458 } 459 return 460} 461 462var ( 463 NodeOCRQueue []string 464 NodeOCRQueueLock = sync.Mutex{} 465) 466 467func PushNodeOCRQueue(n *ast.Node) { 468 if nil == n { 469 return 470 } 471 472 NodeOCRQueueLock.Lock() 473 defer NodeOCRQueueLock.Unlock() 474 NodeOCRQueue = append(NodeOCRQueue, n.ID) 475}