A privacy-first, self-hosted, fully open-source personal knowledge management system, written in TypeScript and Go. (PERSONAL FORK)
1// SiYuan - Refactor your thinking
2// Copyright (c) 2020-present, b3log.org
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU Affero General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU Affero General Public License for more details.
13//
14// You should have received a copy of the GNU Affero General Public License
15// along with this program. If not, see <https://www.gnu.org/licenses/>.
16
17package util
18
19import (
20 "bytes"
21 "context"
22 "errors"
23 "fmt"
24 "os"
25 "os/exec"
26 "path/filepath"
27 "runtime/debug"
28 "strconv"
29 "strings"
30 "sync"
31 "sync/atomic"
32 "time"
33
34 "github.com/88250/go-humanize"
35 "github.com/88250/gulu"
36 "github.com/88250/lute/ast"
37 "github.com/88250/lute/html"
38 "github.com/siyuan-note/filelock"
39 "github.com/siyuan-note/logging"
40)
41
// OCR settings and the in-memory cache of text recognized from assets.
var (
	// TesseractBin is the name or path of the tesseract executable.
	TesseractBin = "tesseract"
	// TesseractEnabled reports whether OCR through tesseract is turned on.
	TesseractEnabled bool
	// TesseractMaxSize is the largest image size, in bytes, handed to tesseract.
	TesseractMaxSize uint64 = 2 * 1000 * 1000
	// TesseractLangs lists the languages passed to tesseract, joined with "+".
	TesseractLangs []string

	// assetsTexts maps an asset path to the text recognized for it.
	assetsTexts = make(map[string]string)
	// assetsTextsLock guards assetsTexts.
	assetsTextsLock sync.Mutex
	// assetsTextsChanged is set when assetsTexts holds changes not yet saved.
	assetsTextsChanged atomic.Bool
)
52
53func CleanNotExistAssetsTexts() {
54 assetsTextsLock.Lock()
55 defer assetsTextsLock.Unlock()
56
57 assetsPath := GetDataAssetsAbsPath()
58 var toRemoves []string
59 for asset, _ := range assetsTexts {
60 assetAbsPath := strings.TrimPrefix(asset, "assets")
61 assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
62 if !filelock.IsExist(assetAbsPath) {
63 toRemoves = append(toRemoves, asset)
64 }
65 }
66
67 for _, asset := range toRemoves {
68 delete(assetsTexts, asset)
69 assetsTextsChanged.Store(true)
70 }
71 return
72}
73
74func LoadAssetsTexts() {
75 assetsPath := GetDataAssetsAbsPath()
76 assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
77 if !filelock.IsExist(assetsTextsPath) {
78 return
79 }
80
81 start := time.Now()
82 data, err := filelock.ReadFile(assetsTextsPath)
83 if err != nil {
84 logging.LogErrorf("read assets texts failed: %s", err)
85 return
86 }
87
88 assetsTextsLock.Lock()
89 if err = gulu.JSON.UnmarshalJSON(data, &assetsTexts); err != nil {
90 logging.LogErrorf("unmarshal assets texts failed: %s", err)
91 if err = filelock.Remove(assetsTextsPath); err != nil {
92 logging.LogErrorf("removed corrupted assets texts failed: %s", err)
93 }
94 return
95 }
96 assetsTextsLock.Unlock()
97 debug.FreeOSMemory()
98
99 if elapsed := time.Since(start).Seconds(); 2 < elapsed {
100 logging.LogWarnf("read assets texts [%s] to [%s], elapsed [%.2fs]", humanize.BytesCustomCeil(uint64(len(data)), 2), assetsTextsPath, elapsed)
101 }
102 return
103}
104
105func SaveAssetsTexts() {
106 if !assetsTextsChanged.Load() {
107 return
108 }
109
110 start := time.Now()
111
112 assetsPath := GetDataAssetsAbsPath()
113 assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
114
115 assetsTextsLock.Lock()
116 // OCR 功能未开启且 ocr-texts.json 不存在时,如果 assetsTexts 为空则不创建文件
117 if !TesseractEnabled && !filelock.IsExist(assetsTextsPath) && 0 == len(assetsTexts) {
118 assetsTextsLock.Unlock()
119 assetsTextsChanged.Store(false)
120 return
121 }
122 data, err := gulu.JSON.MarshalIndentJSON(assetsTexts, "", " ")
123 if err != nil {
124 logging.LogErrorf("marshal assets texts failed: %s", err)
125 assetsTextsLock.Unlock()
126 return
127 }
128 assetsTextsLock.Unlock()
129
130 if err = filelock.WriteFile(assetsTextsPath, data); err != nil {
131 logging.LogErrorf("write assets texts failed: %s", err)
132 return
133 }
134 debug.FreeOSMemory()
135
136 if elapsed := time.Since(start).Seconds(); 2 < elapsed {
137 logging.LogWarnf("save assets texts [size=%s] to [%s], elapsed [%.2fs]", humanize.BytesCustomCeil(uint64(len(data)), 2), assetsTextsPath, elapsed)
138 }
139
140 assetsTextsChanged.Store(false)
141}
142
143func SetAssetText(asset, text string) {
144 assetsTextsLock.Lock()
145 oldText, ok := assetsTexts[asset]
146 assetsTexts[asset] = text
147 assetsTextsLock.Unlock()
148 if !ok || oldText != text {
149 assetsTextsChanged.Store(true)
150 }
151}
152
153func ExistsAssetText(asset string) (ret bool) {
154 assetsTextsLock.Lock()
155 _, ret = assetsTexts[asset]
156 assetsTextsLock.Unlock()
157 return
158}
159
160func OcrAsset(asset string) (ret []map[string]interface{}, err error) {
161 if !TesseractEnabled {
162 err = fmt.Errorf(Langs[Lang][266])
163 return
164 }
165
166 assetsPath := GetDataAssetsAbsPath()
167 assetAbsPath := strings.TrimPrefix(asset, "assets")
168 assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
169 ret = Tesseract(assetAbsPath)
170 assetsTextsLock.Lock()
171 ocrText := GetOcrJsonText(ret)
172 assetsTexts[asset] = ocrText
173 assetsTextsLock.Unlock()
174 if "" != ocrText {
175 assetsTextsChanged.Store(true)
176 }
177 return
178}
179
180func GetAssetText(asset string) (ret string) {
181 assetsTextsLock.Lock()
182 ret = assetsTexts[asset]
183 assetsTextsLock.Unlock()
184 return
185}
186
187func RemoveAssetText(asset string) {
188 assetsTextsLock.Lock()
189 delete(assetsTexts, asset)
190 assetsTextsLock.Unlock()
191 assetsTextsChanged.Store(true)
192}
193
// tesseractExts lists, in lower case, the file name suffixes accepted for OCR.
var tesseractExts = []string{
	".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp",
	".gif", ".webp", ".pbm", ".pgm", ".ppm", ".pnm",
}

// IsTesseractExtractable reports whether p ends, ignoring case, with one of
// the suffixes in tesseractExts.
func IsTesseractExtractable(p string) bool {
	name := strings.ToLower(p)
	for i := 0; i < len(tesseractExts); i++ {
		if strings.HasSuffix(name, tesseractExts[i]) {
			return true
		}
	}
	return false
}
218
// tesseractOCRLock serializes Tesseract OCR runs to improve stability https://github.com/siyuan-note/siyuan/issues/7265
var tesseractOCRLock = sync.Mutex{}
221
222func Tesseract(imgAbsPath string) (ret []map[string]interface{}) {
223 if ContainerStd != Container || !TesseractEnabled {
224 return
225 }
226
227 defer logging.Recover()
228 tesseractOCRLock.Lock()
229 defer tesseractOCRLock.Unlock()
230
231 if !IsTesseractExtractable(imgAbsPath) {
232 return
233 }
234
235 info, err := os.Stat(imgAbsPath)
236 if err != nil {
237 return
238 }
239
240 if TesseractMaxSize < uint64(info.Size()) {
241 return
242 }
243
244 defer logging.Recover()
245
246 timeout := 7000
247 timeoutEnv := os.Getenv("SIYUAN_TESSERACT_TIMEOUT")
248 if "" != timeoutEnv {
249 if timeoutParsed, parseErr := strconv.Atoi(timeoutEnv); nil == parseErr {
250 timeout = timeoutParsed
251 } else {
252 logging.LogWarnf("parse tesseract timeout [%s] failed: %s", timeoutEnv, parseErr)
253 }
254 }
255 ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Millisecond)
256 defer cancel()
257
258 cmd := exec.CommandContext(ctx, TesseractBin, "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(TesseractLangs, "+"), "tsv")
259 gulu.CmdAttr(cmd)
260 output, err := cmd.CombinedOutput()
261 if errors.Is(ctx.Err(), context.DeadlineExceeded) {
262 logging.LogWarnf("tesseract [path=%s, size=%d] timeout [%dms]", imgAbsPath, info.Size(), timeout)
263 return
264 }
265
266 if err != nil {
267 logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err)
268 return
269 }
270
271 tsv := string(output)
272 //logging.LogInfof("tesseract [path=%s] success [%s]", imgAbsPath, tsv)
273
274 // 按行分割 TSV 数据
275 tsv = strings.ReplaceAll(tsv, "\r", "")
276 lines := strings.Split(tsv, "\n")
277
278 // 解析 TSV 数据 跳过标题行,从第二行开始处理
279 for _, line := range lines[1:] {
280 if line == "" {
281 continue // 跳过空行
282 }
283 // 分割每列数据
284 fields := strings.Split(line, "\t")
285 // 将字段名和字段值映射到一个 map 中
286 dataMap := make(map[string]interface{})
287 headers := strings.Split(lines[0], "\t")
288 for i, header := range headers {
289 if i < len(fields) {
290 dataMap[header] = fields[i]
291 } else {
292 dataMap[header] = ""
293 }
294 }
295 ret = append(ret, dataMap)
296 }
297
298 tsv = RemoveInvalid(tsv)
299 tsv = RemoveRedundantSpace(tsv)
300 msg := fmt.Sprintf("OCR [%s] [%s]", html.EscapeString(info.Name()), html.EscapeString(GetOcrJsonText(ret)))
301 PushStatusBar(msg)
302 return
303}
304
305// GetOcrJsonText 提取并连接所有 text 字段的函数
306func GetOcrJsonText(jsonData []map[string]interface{}) (ret string) {
307 for _, dataMap := range jsonData {
308 // 检查 text 字段是否存在
309 if text, ok := dataMap["text"]; ok {
310 // 确保 text 是字符串类型
311 if textStr, ok := text.(string); ok {
312 ret += " " + strings.ReplaceAll(textStr, "\r", "")
313 }
314 }
315 }
316 ret = RemoveInvalid(ret)
317 ret = RemoveRedundantSpace(ret)
318 return ret
319}
320
// tesseractInited is set to true once InitTesseract has finished.
var tesseractInited atomic.Bool

// WaitForTesseractInit blocks until tesseractInited is true, checking once per
// second.
func WaitForTesseractInit() {
	for !tesseractInited.Load() {
		time.Sleep(time.Second)
	}
}
331
332func InitTesseract() {
333 ver := getTesseractVer()
334 if "" == ver {
335 tesseractInited.Store(true)
336 return
337 }
338
339 langs := getTesseractLangs()
340 if 1 > len(langs) {
341 logging.LogWarnf("no tesseract langs found, disabling tesseract-ocr")
342 TesseractEnabled = false
343 tesseractInited.Store(true)
344 return
345 }
346
347 maxSizeVal := os.Getenv("SIYUAN_TESSERACT_MAX_SIZE")
348 if "" != maxSizeVal {
349 if maxSize, parseErr := strconv.ParseUint(maxSizeVal, 10, 64); nil == parseErr {
350 TesseractMaxSize = maxSize
351 }
352 }
353
354 // Supports via environment var `SIYUAN_TESSERACT_ENABLED=false` to close OCR https://github.com/siyuan-note/siyuan/issues/9619
355 if enabled := os.Getenv("SIYUAN_TESSERACT_ENABLED"); "" != enabled {
356 if enabledBool, parseErr := strconv.ParseBool(enabled); nil == parseErr {
357 TesseractEnabled = enabledBool
358 if !enabledBool {
359 logging.LogInfof("tesseract-ocr disabled by env")
360 tesseractInited.Store(true)
361 return
362 }
363 }
364 }
365
366 TesseractLangs = filterTesseractLangs(langs)
367 logging.LogInfof("tesseract-ocr enabled [ver=%s, maxSize=%s, langs=%s]", ver, humanize.BytesCustomCeil(TesseractMaxSize, 2), strings.Join(TesseractLangs, "+"))
368 tesseractInited.Store(true)
369}
370
371func filterTesseractLangs(langs []string) (ret []string) {
372 ret = []string{}
373
374 envLangsVal := os.Getenv("SIYUAN_TESSERACT_LANGS")
375 if "" != envLangsVal {
376 envLangs := strings.Split(envLangsVal, "+")
377 for _, lang := range langs {
378 if gulu.Str.Contains(lang, envLangs) {
379 ret = append(ret, lang)
380 }
381 }
382 } else {
383 for _, lang := range langs {
384 if "eng" == lang || strings.HasPrefix(lang, "chi") || "fra" == lang || "spa" == lang || "deu" == lang ||
385 "rus" == lang || "jpn" == lang || "osd" == lang {
386 ret = append(ret, lang)
387 }
388 }
389 }
390 return ret
391}
392
393func getTesseractVer() (ret string) {
394 if ContainerStd != Container {
395 return
396 }
397
398 cmd := exec.Command(TesseractBin, "--version")
399 gulu.CmdAttr(cmd)
400 data, err := cmd.CombinedOutput()
401 if err != nil {
402 errMsg := strings.ToLower(err.Error())
403 if strings.Contains(errMsg, "executable file not found") || strings.Contains(errMsg, "no such file or directory") {
404 // macOS 端 Tesseract OCR 安装后不识别 https://github.com/siyuan-note/siyuan/issues/7107
405 TesseractBin = "/usr/local/bin/tesseract"
406 cmd = exec.Command(TesseractBin, "--version")
407 gulu.CmdAttr(cmd)
408 data, err = cmd.CombinedOutput()
409 if err != nil {
410 errMsg = strings.ToLower(err.Error())
411 if strings.Contains(errMsg, "executable file not found") || strings.Contains(errMsg, "no such file or directory") {
412 TesseractBin = "/opt/homebrew/bin/tesseract"
413 cmd = exec.Command(TesseractBin, "--version")
414 gulu.CmdAttr(cmd)
415 data, err = cmd.CombinedOutput()
416 }
417 }
418 }
419 }
420 if err != nil {
421 return
422 }
423
424 if strings.HasPrefix(string(data), "tesseract ") {
425 parts := bytes.Split(data, []byte("\n"))
426 if 0 < len(parts) {
427 ret = strings.TrimPrefix(string(parts[0]), "tesseract ")
428 ret = strings.TrimSpace(ret)
429 TesseractEnabled = true
430 }
431 return
432 }
433 return
434}
435
436func getTesseractLangs() (ret []string) {
437 if !TesseractEnabled {
438 return nil
439 }
440
441 cmd := exec.Command(TesseractBin, "--list-langs")
442 gulu.CmdAttr(cmd)
443 data, err := cmd.CombinedOutput()
444 if err != nil {
445 return nil
446 }
447
448 parts := bytes.Split(data, []byte("\n"))
449 if 0 < len(parts) {
450 parts = parts[1:]
451 }
452 for _, part := range parts {
453 part = bytes.TrimSpace(part)
454 if 0 == len(part) {
455 continue
456 }
457 ret = append(ret, string(part))
458 }
459 return
460}
461
var (
	// NodeOCRQueue holds the IDs of nodes queued through PushNodeOCRQueue.
	NodeOCRQueue []string
	// NodeOCRQueueLock guards NodeOCRQueue.
	NodeOCRQueueLock sync.Mutex
)
466
467func PushNodeOCRQueue(n *ast.Node) {
468 if nil == n {
469 return
470 }
471
472 NodeOCRQueueLock.Lock()
473 defer NodeOCRQueueLock.Unlock()
474 NodeOCRQueue = append(NodeOCRQueue, n.ID)
475}