Privacy-first, self-hosted, fully open-source personal knowledge management software, written in TypeScript and Go. (PERSONAL FORK)
1package model
2
3import (
4 "path/filepath"
5 "strings"
6 "time"
7
8 "github.com/siyuan-note/logging"
9 "github.com/siyuan-note/siyuan/kernel/cache"
10 "github.com/siyuan-note/siyuan/kernel/sql"
11 "github.com/siyuan-note/siyuan/kernel/task"
12 "github.com/siyuan-note/siyuan/kernel/util"
13)
14
15func OCRAssetsJob() {
16 util.WaitForTesseractInit()
17
18 if !util.TesseractEnabled {
19 return
20 }
21
22 task.AppendTaskWithTimeout(task.OCRImage, 30*time.Second, autoOCRAssets)
23}
24
25func autoOCRAssets() {
26 if !util.TesseractEnabled {
27 return
28 }
29
30 defer logging.Recover()
31
32 assetsPath := util.GetDataAssetsAbsPath()
33 assets := getUnOCRAssetsAbsPaths()
34 if 0 < len(assets) {
35 for i, assetAbsPath := range assets {
36 text := util.GetOcrJsonText(util.Tesseract(assetAbsPath))
37 p := strings.TrimPrefix(assetAbsPath, assetsPath)
38 p = "assets" + filepath.ToSlash(p)
39 util.SetAssetText(p, text)
40 if 7 <= i { // 一次任务中最多处理 7 张图片,防止长时间占用系统资源
41 break
42 }
43 }
44 }
45
46 util.CleanNotExistAssetsTexts()
47
48 // 刷新 OCR 结果到数据库
49 util.NodeOCRQueueLock.Lock()
50 defer util.NodeOCRQueueLock.Unlock()
51 for _, id := range util.NodeOCRQueue {
52 sql.IndexNodeQueue(id)
53 }
54 util.NodeOCRQueue = nil
55}
56
57func getUnOCRAssetsAbsPaths() (ret []string) {
58 var assetsPaths []string
59 assets := cache.GetAssets()
60 for _, asset := range assets {
61 if !util.IsTesseractExtractable(asset.Path) {
62 continue
63 }
64 assetsPaths = append(assetsPaths, asset.Path)
65 }
66
67 assetsPath := util.GetDataAssetsAbsPath()
68 for _, assetPath := range assetsPaths {
69 if util.ExistsAssetText(assetPath) {
70 continue
71 }
72 absPath := filepath.Join(assetsPath, strings.TrimPrefix(assetPath, "assets"))
73 ret = append(ret, absPath)
74 }
75 return
76}
77
78func FlushAssetsTextsJob() {
79 util.SaveAssetsTexts()
80}