commit 8bdcaa713d17ef0b5af6781196cd9bfe091b4849

+49

pkgs/applications/misc/k2pdfopt/0001-Fix-CMakeLists.patch

··· 1 + From 2629af4ed00d7ca65359178203d80fb146901cdb Mon Sep 17 00:00:00 2001 2 + From: Daniel Fullmer <danielrf12@gmail.com> 3 + Date: Fri, 3 Jul 2020 21:00:45 -0700 4 + Subject: [PATCH 1/2] Fix CMakeLists 5 + 6 + --- 7 + CMakeLists.txt | 12 ++++++++---- 8 + 1 file changed, 8 insertions(+), 4 deletions(-) 9 + 10 + diff --git a/CMakeLists.txt b/CMakeLists.txt 11 + index e218279..4341de9 100644 12 + --- a/CMakeLists.txt 13 + +++ b/CMakeLists.txt 14 + @@ -57,6 +57,7 @@ endif(JPEG_FOUND) 15 + include(FindJasper) 16 + if(JASPER_FOUND) 17 + set(HAVE_JASPER_LIB 1) 18 + + set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${JASPER_LIBRARY}) 19 + endif(JASPER_FOUND) 20 + 21 + # paths from willuslib/wgs.c 22 + @@ -71,9 +72,12 @@ else() 23 + message(STATUS "Could NOT find ghostscript executable") 24 + endif(GHOSTSCRIPT_EXECUTABLE) 25 + 26 + -# willus.h 27 + -# HAVE_GSL_LIB 28 + - 29 + +pkg_check_modules(GSL gsl) 30 + +if(GSL_FOUND) 31 + + set(HAVE_GSL_LIB 1) 32 + + include_directories(SYSTEM ${GSL_INCLUDEDIR}) 33 + + set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${GSL_LDFLAGS}) 34 + +endif(GSL_FOUND) 35 + 36 + # libfreetype6 (>= 2.3.9), libjbig2dec0, libjpeg8 (>= 8c), libx11-6, libxext6, zlib1g (>= 1:1.2.0) 37 + # MUPDF_STATIC_LDFLAGS misses mupdf-js-none, and doubles libs ... 38 + @@ -85,7 +89,7 @@ if(MUPDF_FOUND) 39 + include_directories(SYSTEM ${MUPDF_INCLUDEDIR}) 40 + message(STATUS "mupdf libraries: ${MUPDF_LDFLAGS}") 41 + set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${MUPDF_LDFLAGS} 42 + - -lmupdf-js-none -lopenjpeg -ljbig2dec -ljpeg -lfreetype 43 + + 44 + ) 45 + endif(MUPDF_FOUND) 46 + 47 + -- 48 + 2.27.0 49 +

+115 -27

pkgs/applications/misc/k2pdfopt/default.nix

··· 1 - { stdenv, fetchzip, fetchurl, fetchpatch, cmake, pkgconfig 2 - , zlib, libpng 1 + { stdenv, runCommand, fetchzip, fetchurl, fetchpatch, fetchFromGitHub 2 + , cmake, pkgconfig, zlib, libpng 3 3 , enableGSL ? true, gsl 4 4 , enableGhostScript ? true, ghostscript 5 5 , enableMuPDF ? true, mupdf ··· 11 11 12 12 with stdenv.lib; 13 13 14 - stdenv.mkDerivation rec { 15 - pname = "k2pdfopt"; 16 - version = "2.51a"; 14 + # k2pdfopt is a pain to package. It requires modified versions of mupdf, 15 + # leptonica, and tesseract. Instead of shipping patches for these upstream 16 + # packages, k2pdfopt includes just the modified source files for these 17 + # packages. The individual files from the {mupdf,leptonica,tesseract}_mod/ 18 + # directories are intended to replace the corresponding source files in the 19 + # upstream packages, for a particular version of that upstream package. 20 + # 21 + # There are a few ways we could approach packaging these modified versions of 22 + # mupdf, leptonica, and tesseract: 23 + # 1) Override the upstream source with a new derivation that involves copying 24 + # the modified source files from k2pdfopt and replacing the corresponding 25 + # source files in the upstream packages. Since the files are intended for a 26 + # particular version of the upstream package, this would not allow us to easily 27 + # use updates to those packages in nixpkgs. 28 + # 2) Manually produce patches which can be applied against the upstream 29 + # project, and have the same effect as replacing those files. This is what I 30 + # believe k2pdfopt should do for us anyway. The benefit of creating and 31 + # applying patches in this way is that minor updates (esp. security fixes) to 32 + # upstream packages might still allow these patches to apply successfully. 33 + # 3) Automatically produce these patches inside a nix derivation. This is the 34 + # approach taken here, using the "mkPatch" provided below. 
This has the 35 + # benefit of easier review and should hopefully be simpler to update in the 36 + # future. 37 + 38 + let 39 + # Create a patch against src based on changes applied in patchCommands 40 + mkPatch = { name, src, patchCommands }: runCommand "${name}-k2pdfopt.patch" { inherit src; } '' 41 + source $stdenv/setup 42 + unpackPhase 17 43 18 - src = (fetchzip { 19 - url = "http://www.willus.com/k2pdfopt/src/k2pdfopt_v2.51_src.zip"; 20 - sha256 = "133l7xkvi67s6sfk8cfh7rmavbsf7ib5fyksk1ci6b6sch3z2sw9"; 21 - }); 44 + orig=$sourceRoot 45 + new=$sourceRoot-modded 46 + cp -r $orig/. $new/ 22 47 23 - # Note: the v2.51a zip contains only files to be replaced in the v2.50 zip. 24 - v251a_src = (fetchzip { 25 - url = "http://www.willus.com/k2pdfopt/src/k2pdfopt_v2.51a_src.zip"; 26 - sha256 = "0vvwblii7kgdwfxw8dzk6jbmz4dv94d7rkv18i60y8wkayj6yhl6"; 27 - }); 48 + pushd $new >/dev/null 49 + ${patchCommands} 50 + popd >/dev/null 28 51 29 - postUnpack = '' 30 - cp -r ${v251a_src}/* $sourceRoot 52 + diff -Naur $orig $new > $out || true 31 53 ''; 32 54 33 - patches = [ ./k2pdfopt.patch ./k2pdfopt-mupdf-1.16.1.patch ]; 55 + pname = "k2pdfopt"; 56 + version = "2.53"; 57 + k2pdfopt_src = fetchzip { 58 + url = "http://www.willus.com/${pname}/src/${pname}_v${version}_src.zip"; 59 + sha256 = "1fna8bg3pascjfc3hmc6xn0xi2yh7f1qp0d344mw9hqanbnykyy8"; 60 + }; 61 + in stdenv.mkDerivation rec { 62 + inherit pname version; 63 + src = k2pdfopt_src; 64 + 65 + patches = [ 66 + ./0001-Fix-CMakeLists.patch 67 + ]; 68 + 69 + postPatch = '' 70 + substituteInPlace willuslib/bmpdjvu.c \ 71 + --replace "<djvu.h>" "<libdjvu/ddjvuapi.h>" 72 + ''; 34 73 35 74 nativeBuildInputs = [ cmake pkgconfig ]; 36 75 37 76 buildInputs = 38 77 let 39 - # The patches below were constructed by taking the files from k2pdfopt in 40 - # the {mupdf,leptonica,tesseract}_mod/ directories, replacing the 41 - # corresponding files in the respective source trees, resolving any errors 42 - # with more recent versions of these 
depencencies, and running diff. 43 - mupdf_modded = mupdf.overrideAttrs (attrs: { 44 - patches = attrs.patches ++ [ ./mupdf.patch ]; # Last verified with mupdf 1.16.1 78 + # We use specific versions of these sources below to match the versions 79 + # used in the k2pdfopt source. Note that this does _not_ need to match the 80 + # version used elsewhere in nixpkgs, since it is only used to create the 81 + # patch that can then be applied to the version in nixpkgs. 82 + mupdf_patch = mkPatch { 83 + name = "mupdf"; 84 + src = fetchurl { 85 + url = "https://mupdf.com/downloads/archive/mupdf-1.17.0-source.tar.gz"; 86 + sha256 = "13nl9nrcx2awz9l83mlv2psi1lmn3hdnfwxvwgwiwbxlkjl3zqq0"; 87 + }; 88 + patchCommands = '' 89 + cp ${k2pdfopt_src}/mupdf_mod/{filter-basic,font,stext-device,string}.c ./source/fitz/ 90 + cp ${k2pdfopt_src}/mupdf_mod/pdf-* ./source/pdf/ 91 + ''; 92 + }; 93 + mupdf_modded = mupdf.overrideAttrs ({ patches ? [], ... }: { 94 + patches = patches ++ [ mupdf_patch ]; 95 + # This function is missing in font.c, see font-win32.c 96 + postPatch = '' 97 + echo "void pdf_install_load_system_font_funcs(fz_context *ctx) {}" >> source/fitz/font.c 98 + ''; 45 99 }); 46 - leptonica_modded = leptonica.overrideAttrs (attrs: { 47 - patches = [ ./leptonica.patch ]; # Last verified with leptonica 1.78.0 100 + 101 + leptonica_patch = mkPatch { 102 + name = "leptonica"; 103 + src = fetchurl { 104 + url = "http://www.leptonica.org/source/leptonica-1.79.0.tar.gz"; 105 + sha256 = "1n004gv1dj3pq1fcnfdclvvx5nang80336aa67nvs3nnqp4ncn84"; 106 + }; 107 + patchCommands = "cp -r ${k2pdfopt_src}/leptonica_mod/. ./src/"; 108 + }; 109 + leptonica_modded = leptonica.overrideAttrs ({ patches ? [], ... 
}: { 110 + patches = patches ++ [ leptonica_patch ]; 48 111 }); 112 + 113 + tesseract_patch = mkPatch { 114 + name = "tesseract"; 115 + src = fetchFromGitHub { 116 + owner = "tesseract-ocr"; 117 + repo = "tesseract"; 118 + rev = "4.1.1"; 119 + sha256 = "1ca27zbjpx35nxh9fha410z3jskwyj06i5hqiqdc08s2d7kdivwn"; 120 + }; 121 + patchCommands = '' 122 + cp ${k2pdfopt_src}/tesseract_mod/{baseapi,tesscapi,tesseract}.* src/api/ 123 + cp ${k2pdfopt_src}/tesseract_mod/{tesscapi,tessedit,tesseract}.* src/ccmain/ 124 + cp ${k2pdfopt_src}/tesseract_mod/dotproduct{avx,fma,sse}.* src/arch/ 125 + cp ${k2pdfopt_src}/tesseract_mod/{intsimdmatrixsse,simddetect}.* src/arch/ 126 + cp ${k2pdfopt_src}/tesseract_mod/{errcode,genericvector,mainblk,params,serialis,tessdatamanager,tess_version,tprintf,unicharset}.* src/ccutil/ 127 + cp ${k2pdfopt_src}/tesseract_mod/{input,lstmrecognizer}.* src/lstm/ 128 + cp ${k2pdfopt_src}/tesseract_mod/openclwrapper.* src/opencl/ 129 + ''; 130 + }; 49 131 tesseract_modded = tesseract4.override { 50 - tesseractBase = tesseract4.tesseractBase.overrideAttrs (_: { 51 - patches = [ ./tesseract.patch ]; # Last verified with tesseract 1.4 132 + tesseractBase = tesseract4.tesseractBase.overrideAttrs ({ patches ? [], ... }: { 133 + patches = patches ++ [ tesseract_patch ]; 134 + # Additional compilation fixes 135 + postPatch = '' 136 + echo libtesseract_api_la_SOURCES += tesscapi.cpp >> src/api/Makefile.am 137 + substituteInPlace src/api/tesseract.h \ 138 + --replace "#include <leptonica.h>" "//#include <leptonica.h>" 139 + ''; 52 140 }); 53 141 }; 54 142 in

-151

pkgs/applications/misc/k2pdfopt/k2pdfopt-mupdf-1.16.1.patch

··· 1 - diff --git a/willuslib/wmupdf.c b/willuslib/wmupdf.c 2 - index 81627ef..f14a96c 100644 3 - --- a/willuslib/wmupdf.c 4 - +++ b/willuslib/wmupdf.c 5 - @@ -189,8 +189,6 @@ int wmupdf_remake_pdf(char *infile,char *outfile,WPDFPAGEINFO *pageinfo,int use_ 6 - pdf_write_opts.do_compress=1; 7 - pdf_write_opts.do_linear=0; 8 - pdf_write_opts.do_garbage=1; /* 2 and 3 don't work for this. */ 9 - - pdf_write_opts.continue_on_error=0; 10 - - pdf_write_opts.errors=NULL; 11 - write_failed=0; 12 - wpdfpageinfo_sort(pageinfo); 13 - xref=NULL; 14 - @@ -1687,8 +1685,8 @@ WPDFOUTLINE *wpdfoutline_read_from_pdf_file(char *filename) 15 - /* Sumatra version of MuPDF v1.4 -- use locally installed fonts */ 16 - pdf_install_load_system_font_funcs(ctx); 17 - fz_try(ctx) { doc=fz_open_document(ctx,filename); } 18 - - fz_catch(ctx) 19 - - { 20 - + fz_catch(ctx) 21 - + { 22 - fz_drop_context(ctx); 23 - return(NULL); 24 - } 25 - @@ -1890,5 +1888,5 @@ static pdf_obj *pdf_new_string_utf8(fz_context *ctx,char *string) 26 - willus_mem_free((double **)&utfbuf,funcname); 27 - return(pdfobj); 28 - } 29 - - 30 - + 31 - #endif /* HAVE_MUPDF_LIB */ 32 - diff --git a/willuslib/wmupdfinfo.c b/willuslib/wmupdfinfo.c 33 - index 5c7f38c..9b9e6fd 100644 34 - --- a/willuslib/wmupdfinfo.c 35 - +++ b/willuslib/wmupdfinfo.c 36 - @@ -237,23 +237,22 @@ static void showglobalinfo(fz_context *ctx, globals *glo,char *filename) 37 - pdf_obj *robj; 38 - 39 - robj=pdf_resolve_indirect(ctx,obj); 40 - - n=pdf_sprint_obj(ctx,NULL,0,robj,1); 41 - - buf=malloc(n+2); 42 - + buf=pdf_sprint_obj(ctx,NULL,0,&n,robj,1,0); 43 - if (buf==NULL) 44 - { 45 - fz_write_printf(ctx,out,"Info object (%d %d R):\n",pdf_to_num(ctx,obj),pdf_to_gen(ctx,obj)); 46 - - pdf_print_obj(ctx,out,robj,1); 47 - + pdf_print_obj(ctx,out,robj,1,0); 48 - } 49 - else 50 - { 51 - - pdf_sprint_obj(ctx,buf,n+2,robj,1); 52 - + pdf_sprint_obj(ctx,buf,n+2,&n,robj,1,0); 53 - display_pdf_field(ctx,out,buf,"Title","TITLE"); 54 - 
display_pdf_field(ctx,out,buf,"CreationDate","CREATED"); 55 - display_pdf_field(ctx,out,buf,"ModDate","LAST MODIFIED"); 56 - display_pdf_field(ctx,out,buf,"Producer","PDF PRODUCER"); 57 - display_pdf_field(ctx,out,buf,"Creator","CREATOR"); 58 - display_file_size(ctx,out,filename); 59 - - free(buf); 60 - + fz_free(ctx,buf); 61 - } 62 - } 63 - if (glo->dims==1) 64 - @@ -275,7 +274,7 @@ static void showglobalinfo(fz_context *ctx, globals *glo,char *filename) 65 - if (obj) 66 - { 67 - fz_write_printf(ctx,out, "\nEncryption object (%d %d R):\n", pdf_to_num(ctx,obj), pdf_to_gen(ctx,obj)); 68 - - pdf_print_obj(ctx,out, pdf_resolve_indirect(ctx,obj), 1); 69 - + pdf_print_obj(ctx,out, pdf_resolve_indirect(ctx,obj), 1, 0); 70 - } 71 - } 72 - 73 - @@ -396,7 +395,7 @@ gatherdimensions(fz_context *ctx, globals *glo, int page, pdf_obj *pageref, pdf_ 74 - if (j < glo->dims) 75 - return; 76 - 77 - - glo->dim = fz_resize_array(ctx, glo->dim, glo->dims+1, sizeof(struct info)); 78 - + glo->dim = fz_realloc_array(ctx, glo->dim, glo->dims+1, struct info); 79 - glo->dims++; 80 - 81 - glo->dim[glo->dims - 1].page = page; 82 - @@ -441,7 +440,7 @@ gatherfonts(fz_context *ctx, globals *glo, int page, pdf_obj *pageref, pdf_obj * 83 - if (k < glo->fonts) 84 - continue; 85 - 86 - - glo->font = fz_resize_array(ctx, glo->font, glo->fonts+1, sizeof(struct info)); 87 - + glo->font = fz_realloc_array(ctx, glo->font, glo->fonts+1, struct info); 88 - glo->fonts++; 89 - 90 - glo->font[glo->fonts - 1].page = page; 91 - @@ -510,7 +509,7 @@ gatherimages(fz_context *ctx, globals *glo, int page, pdf_obj *pageref, pdf_obj 92 - if (k < glo->images) 93 - continue; 94 - 95 - - glo->image = fz_resize_array(ctx, glo->image, glo->images+1, sizeof(struct info)); 96 - + glo->image = fz_realloc_array(ctx, glo->image, glo->images+1, struct info); 97 - glo->images++; 98 - 99 - glo->image[glo->images - 1].page = page; 100 - @@ -568,7 +567,7 @@ gatherforms(fz_context *ctx, globals *glo, int page, pdf_obj *pageref, 
pdf_obj * 101 - if (k < glo->forms) 102 - continue; 103 - 104 - - glo->form = fz_resize_array(ctx, glo->form, glo->forms+1, sizeof(struct info)); 105 - + glo->form = fz_realloc_array(ctx, glo->form, glo->forms+1, struct info); 106 - glo->forms++; 107 - 108 - glo->form[glo->forms - 1].page = page; 109 - @@ -613,7 +612,7 @@ gatherpsobjs(fz_context *ctx, globals *glo, int page, pdf_obj *pageref, pdf_obj 110 - if (k < glo->psobjs) 111 - continue; 112 - 113 - - glo->psobj = fz_resize_array(ctx, glo->psobj, glo->psobjs+1, sizeof(struct info)); 114 - + glo->psobj = fz_realloc_array(ctx, glo->psobj, glo->psobjs+1, struct info); 115 - glo->psobjs++; 116 - 117 - glo->psobj[glo->psobjs - 1].page = page; 118 - @@ -656,7 +655,7 @@ gathershadings(fz_context *ctx, globals *glo, int page, pdf_obj *pageref, pdf_ob 119 - if (k < glo->shadings) 120 - continue; 121 - 122 - - glo->shading = fz_resize_array(ctx, glo->shading, glo->shadings+1, sizeof(struct info)); 123 - + glo->shading = fz_realloc_array(ctx, glo->shading, glo->shadings+1, struct info); 124 - glo->shadings++; 125 - 126 - glo->shading[glo->shadings - 1].page = page; 127 - @@ -724,7 +723,7 @@ gatherpatterns(fz_context *ctx, globals *glo, int page, pdf_obj *pageref, pdf_ob 128 - if (k < glo->patterns) 129 - continue; 130 - 131 - - glo->pattern = fz_resize_array(ctx, glo->pattern, glo->patterns+1, sizeof(struct info)); 132 - + glo->pattern = fz_realloc_array(ctx, glo->pattern, glo->patterns+1, struct info); 133 - glo->patterns++; 134 - 135 - glo->pattern[glo->patterns - 1].page = page; 136 - @@ -1216,7 +1215,7 @@ void wmupdfinfo_get(char *filename,int *pagelist,char **buf) 137 - if (fout==NULL) 138 - return; 139 - */ 140 - - 141 - + 142 - ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED); 143 - if (!ctx) 144 - { 145 - @@ -1307,5 +1306,5 @@ static void date_convert(char *dst,char *src) 146 - else if (src[i]!='\0') 147 - sprintf(&dst[strlen(dst)]," %s",&src[i]); 148 - } 149 - - 150 - + 151 - #endif /* HAVE_MUPDF_LIB */

-99

pkgs/applications/misc/k2pdfopt/k2pdfopt.patch

··· 1 - diff --git a/CMakeLists.txt b/CMakeLists.txt 2 - index 4a2378b..502c477 100644 3 - --- a/CMakeLists.txt 4 - +++ b/CMakeLists.txt 5 - @@ -52,6 +52,7 @@ endif(JPEG_FOUND) 6 - include(FindJasper) 7 - if(JASPER_FOUND) 8 - set(HAVE_JASPER_LIB 1) 9 - + set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${JASPER_LIBRARY}) 10 - endif(JASPER_FOUND) 11 - 12 - # paths from willuslib/wgs.c 13 - @@ -66,8 +67,12 @@ else() 14 - message(STATUS "Could NOT find ghostscript executable") 15 - endif(GHOSTSCRIPT_EXECUTABLE) 16 - 17 - -# willus.h 18 - -# HAVE_GSL_LIB 19 - +pkg_check_modules(GSL gsl) 20 - +if(GSL_FOUND) 21 - + set(HAVE_GSL_LIB 1) 22 - + include_directories(SYSTEM ${GSL_INCLUDEDIR}) 23 - + set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${GSL_LDFLAGS}) 24 - +endif(GSL_FOUND) 25 - 26 - 27 - # libfreetype6 (>= 2.3.9), libjbig2dec0, libjpeg8 (>= 8c), libx11-6, libxext6, zlib1g (>= 1:1.2.0) 28 - @@ -80,7 +85,7 @@ if(MUPDF_FOUND) 29 - include_directories(SYSTEM ${MUPDF_INCLUDEDIR}) 30 - message(STATUS "mupdf libraries: ${MUPDF_LDFLAGS}") 31 - set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${MUPDF_LDFLAGS} 32 - - -lmupdf-js-none -lopenjpeg -ljbig2dec -ljpeg -lfreetype 33 - + 34 - ) 35 - endif(MUPDF_FOUND) 36 - 37 - @@ -91,9 +96,25 @@ if(DJVU_FOUND) 38 - set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${DJVU_LDFLAGS}) 39 - endif(DJVU_FOUND) 40 - 41 - -# HAVE_GOCR_LIB 42 - -# HAVE_LEPTONICA_LIB 43 - -# HAVE_TESSERACT_LIB 44 - +find_library(GOCR_LIB NAMES Pgm2asc) 45 - +if(GOCR_LIB) 46 - + set(HAVE_GOCR_LIB 1) 47 - + set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${GOCR_LIB}) 48 - +endif(GOCR_LIB) 49 - + 50 - +pkg_check_modules(LEPTONICA lept) 51 - +if(LEPTONICA_FOUND) 52 - + set(HAVE_LEPTONICA_LIB 1) 53 - + include_directories(SYSTEM ${LEPTONICA_INCLUDEDIR}) 54 - + set(K2PDFOPT_LIB ${K2PDFOPT_LIB} ${LEPTONICA_LDFLAGS}) 55 - +endif(LEPTONICA_FOUND) 56 - + 57 - +pkg_check_modules(TESSERACT tesseract) 58 - +if(TESSERACT_FOUND) 59 - + set(HAVE_TESSERACT_LIB 1) 60 - + include_directories(SYSTEM ${TESSERACT_INCLUDEDIR}) 61 - + set(K2PDFOPT_LIB 
${K2PDFOPT_LIB} ${TESSERACT_LDFLAGS}) 62 - +endif(TESSERACT_FOUND) 63 - 64 - # ---- Describe project 65 - 66 - diff --git a/willuslib/CMakeLists.txt b/willuslib/CMakeLists.txt 67 - index 463bbc9..8043db5 100644 68 - --- a/willuslib/CMakeLists.txt 69 - +++ b/willuslib/CMakeLists.txt 70 - @@ -6,7 +6,7 @@ include_directories(..) 71 - set(WILLUSLIB_SRC 72 - ansi.c array.c bmp.c bmpdjvu.c bmpmupdf.c dtcompress.c filelist.c 73 - fontdata.c fontrender.c gslpolyfit.c linux.c math.c mem.c ocr.c 74 - - ocrjocr.c ocrtess.c pdfwrite.c point2d.c render.c strbuf.c string.c 75 - + ocrgocr.c ocrtess.c pdfwrite.c point2d.c render.c strbuf.c string.c 76 - token.c wfile.c wgs.c wgui.c willusversion.c win.c winbmp.c 77 - wincomdlg.c winmbox.c winshell.c wmupdf.c wmupdfinfo.c wpdf.c wsys.c 78 - wzfile.c wleptonica.c 79 - diff --git a/willuslib/ocrgocr.c b/willuslib/ocrgocr.c 80 - index 6027e9a..fbe10f0 100644 81 - --- a/willuslib/ocrgocr.c 82 - +++ b/willuslib/ocrgocr.c 83 - @@ -29,6 +29,8 @@ 84 - #ifdef HAVE_GOCR_LIB 85 - #include <gocr.h> 86 - 87 - +job_t *OCR_JOB; 88 - + 89 - /* 90 - ** bmp8 must be grayscale 91 - ** (x1,y1) and (x2,y2) from top left of bitmap 92 - @@ -63,6 +65,7 @@ void gocr_single_word_from_bmp8(char *text,int maxlen,WILLUSBITMAP *bmp8, 93 - h=y2-y1+1; 94 - dh=h+bw*2; 95 - job=&_job; 96 - + OCR_JOB=job; 97 - job_init(job); 98 - job_init_image(job); 99 - // willus_mem_alloc_warn((void **)&job->src.p.p,w*h,funcname,10);

-254

pkgs/applications/misc/k2pdfopt/leptonica.patch

··· 1 - From 8c11a20925686855023df90ed477957c7d7fe91e Mon Sep 17 00:00:00 2001 2 - From: Daniel Fullmer <danielrf12@gmail.com> 3 - Date: Fri, 13 Sep 2019 15:54:21 -0400 4 - Subject: [PATCH] Willus mod for k2pdfopt 5 - 6 - --- 7 - src/allheaders.h | 4 ++ 8 - src/dewarp2.c | 106 ++++++++++++++++++++++++++++++++++++++++++----- 9 - src/leptwin.c | 6 ++- 10 - 3 files changed, 104 insertions(+), 12 deletions(-) 11 - 12 - diff --git a/src/allheaders.h b/src/allheaders.h 13 - index e68eff1..b3cc729 100644 14 - --- a/src/allheaders.h 15 - +++ b/src/allheaders.h 16 - @@ -669,6 +669,10 @@ LEPT_DLL extern L_DEWARPA * dewarpaReadMem ( const l_uint8 *data, size_t size ); 17 - LEPT_DLL extern l_ok dewarpaWrite ( const char *filename, L_DEWARPA *dewa ); 18 - LEPT_DLL extern l_ok dewarpaWriteStream ( FILE *fp, L_DEWARPA *dewa ); 19 - LEPT_DLL extern l_ok dewarpaWriteMem ( l_uint8 **pdata, size_t *psize, L_DEWARPA *dewa ); 20 - +/* WILLUS MOD */ 21 - + LEPT_DLL extern l_int32 dewarpBuildPageModel_ex ( L_DEWARP *dew, const char *debugfile,l_int32 fit_order ); 22 - + LEPT_DLL extern l_int32 dewarpFindVertDisparity_ex ( L_DEWARP *dew, PTAA *ptaa, l_int32 rotflag,l_int32 fit_order ); 23 - + LEPT_DLL extern l_int32 dewarpBuildLineModel_ex ( L_DEWARP *dew, l_int32 opensize, const char *debugfile,l_int32 fit_order ); 24 - LEPT_DLL extern l_ok dewarpBuildPageModel ( L_DEWARP *dew, const char *debugfile ); 25 - LEPT_DLL extern l_ok dewarpFindVertDisparity ( L_DEWARP *dew, PTAA *ptaa, l_int32 rotflag ); 26 - LEPT_DLL extern l_ok dewarpFindHorizDisparity ( L_DEWARP *dew, PTAA *ptaa ); 27 - diff --git a/src/dewarp2.c b/src/dewarp2.c 28 - index 220eec1..2e29500 100644 29 - --- a/src/dewarp2.c 30 - +++ b/src/dewarp2.c 31 - @@ -144,9 +144,17 @@ static const l_float32 L_ALLOWED_W_FRACT = 0.05; /* no bigger */ 32 - * longest textlines. 
33 - * </pre> 34 - */ 35 - +/* WILLUS MOD */ 36 - l_ok 37 - -dewarpBuildPageModel(L_DEWARP *dew, 38 - - const char *debugfile) 39 - +dewarpBuildPageModel(L_DEWARP *dew,const char *debugfile) 40 - +{ 41 - +return(dewarpBuildPageModel_ex(dew,debugfile,2)); 42 - +} 43 - + 44 - +l_ok 45 - +dewarpBuildPageModel_ex(L_DEWARP *dew, 46 - + const char *debugfile, 47 - + l_int32 fit_order) 48 - { 49 - l_int32 linecount, topline, botline, ret; 50 - PIX *pixs, *pix1, *pix2, *pix3; 51 - @@ -225,7 +233,7 @@ PTAA *ptaa1, *ptaa2; 52 - /* Get the sampled vertical disparity from the textline centers. 53 - * The disparity array will push pixels vertically so that each 54 - * textline is flat and centered at the y-position of the mid-point. */ 55 - - if (dewarpFindVertDisparity(dew, ptaa2, 0) != 0) { 56 - + if (dewarpFindVertDisparity_ex(dew, ptaa2, 0, fit_order) != 0) { 57 - L_WARNING("vertical disparity not built\n", procName); 58 - ptaaDestroy(&ptaa2); 59 - return 1; 60 - @@ -290,13 +298,24 @@ PTAA *ptaa1, *ptaa2; 61 - * a pdf. Non-pix debug output goes to /tmp. 
62 - * </pre> 63 - */ 64 - +/* WILLUS MOD */ 65 - l_ok 66 - dewarpFindVertDisparity(L_DEWARP *dew, 67 - PTAA *ptaa, 68 - l_int32 rotflag) 69 - { 70 - +return(dewarpFindVertDisparity_ex(dew,ptaa,rotflag,2)); 71 - +} 72 - +/* WILLUS MOD -- add cubic and quartic fits and ..._ex functions */ 73 - +l_int32 74 - +dewarpFindVertDisparity_ex(L_DEWARP *dew, 75 - + PTAA *ptaa, 76 - + l_int32 rotflag, 77 - + l_int32 fit_order) 78 - +{ 79 - l_int32 i, j, nlines, npts, nx, ny, sampling; 80 - -l_float32 c0, c1, c2, x, y, midy, val, medval, meddev, minval, maxval; 81 - +/* WILLUS MOD */ 82 - +l_float32 c0, c1, c2, c3, c4, x, y, midy, val, medval, meddev, minval, maxval; 83 - l_float32 *famidys; 84 - NUMA *nax, *nafit, *nacurve0, *nacurve1, *nacurves; 85 - NUMA *namidy, *namidys, *namidysi; 86 - @@ -304,11 +323,22 @@ PIX *pix1, *pix2, *pixcirc, *pixdb; 87 - PTA *pta, *ptad, *ptacirc; 88 - PTAA *ptaa0, *ptaa1, *ptaa2, *ptaa3, *ptaa4, *ptaa5, *ptaat; 89 - FPIX *fpix; 90 - +/* WILLUS MOD */ 91 - +l_int32 fit_order1,fit_order2; 92 - 93 - PROCNAME("dewarpFindVertDisparity"); 94 - 95 - if (!dew) 96 - return ERROR_INT("dew not defined", procName, 1); 97 - +/* WILLUS MOD */ 98 - + if (fit_order < 10) 99 - + fit_order1 = fit_order2 = fit_order; 100 - + else 101 - + { 102 - + fit_order1=fit_order % 10; 103 - + fit_order2=fit_order / 10; 104 - + fit_order2=fit_order2 % 10; 105 - + } 106 - dew->vsuccess = 0; 107 - if (!ptaa) 108 - return ERROR_INT("ptaa not defined", procName, 1); 109 - @@ -331,12 +361,32 @@ FPIX *fpix; 110 - pixdb = (rotflag) ? 
pixRotateOrth(dew->pixs, 1) : pixClone(dew->pixs); 111 - for (i = 0; i < nlines; i++) { /* for each line */ 112 - pta = ptaaGetPta(ptaa, i, L_CLONE); 113 - - ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL); 114 - - numaAddNumber(nacurve0, c2); 115 - +/* WILLUS MOD */ 116 - +if (fit_order1>3) 117 - + { 118 - + ptaGetQuarticLSF(pta, &c4, &c3, &c2, &c1, &c0, NULL); 119 - + numaAddNumber(nacurve0, c4); 120 - + } 121 - +else if (fit_order1==3) 122 - + { 123 - + ptaGetCubicLSF(pta, &c3, &c2, &c1, &c0, NULL); 124 - + numaAddNumber(nacurve0, c3); 125 - + } 126 - +else 127 - + { 128 - + ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL); 129 - + numaAddNumber(nacurve0, c2); 130 - + } 131 - ptad = ptaCreate(nx); 132 - for (j = 0; j < nx; j++) { /* uniformly sampled in x */ 133 - x = j * sampling; 134 - - applyQuadraticFit(c2, c1, c0, x, &y); 135 - +/* WILLUS MOD */ 136 - +if (fit_order1>3) 137 - + applyQuarticFit(c4, c3, c2, c1, c0, x, &y); 138 - +else if (fit_order1==3) 139 - + applyCubicFit(c3, c2, c1, c0, x, &y); 140 - +else 141 - + applyQuadraticFit(c2, c1, c0, x, &y); 142 - ptaAddPt(ptad, x, y); 143 - } 144 - ptaaAddPta(ptaa0, ptad, L_INSERT); 145 - @@ -350,7 +400,13 @@ FPIX *fpix; 146 - for (i = 0; i < nlines; i++) { 147 - pta = ptaaGetPta(ptaa, i, L_CLONE); 148 - ptaGetArrays(pta, &nax, NULL); 149 - - ptaGetQuadraticLSF(pta, NULL, NULL, NULL, &nafit); 150 - +/* WILLUS MOD */ 151 - +if (fit_order1>3) 152 - +ptaGetQuarticLSF(pta, NULL, NULL, NULL, NULL, NULL, &nafit); 153 - +else if (fit_order1==3) 154 - +ptaGetCubicLSF(pta, NULL, NULL, NULL, NULL, &nafit); 155 - +else 156 - +ptaGetQuadraticLSF(pta, NULL, NULL, NULL, &nafit); 157 - ptad = ptaCreateFromNuma(nax, nafit); 158 - ptaaAddPta(ptaat, ptad, L_INSERT); 159 - ptaDestroy(&pta); 160 - @@ -494,11 +550,24 @@ FPIX *fpix; 161 - ptaa5 = ptaaCreate(nx); /* uniformly sampled across full height of image */ 162 - for (j = 0; j < nx; j++) { /* for each column */ 163 - pta = ptaaGetPta(ptaa4, j, L_CLONE); 164 - - 
ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL); 165 - +/* WILLUS MOD */ 166 - +/* Order higher than 2 can cause a little craziness here. */ 167 - +if (fit_order2>3) 168 - + ptaGetQuarticLSF(pta, &c4, &c3, &c2, &c1, &c0, NULL); 169 - +else if (fit_order2==3) 170 - + ptaGetCubicLSF(pta, &c3, &c2, &c1, &c0, NULL); 171 - +else 172 - + ptaGetQuadraticLSF(pta, &c2, &c1, &c0, NULL); 173 - ptad = ptaCreate(ny); 174 - for (i = 0; i < ny; i++) { /* uniformly sampled in y */ 175 - y = i * sampling; 176 - - applyQuadraticFit(c2, c1, c0, y, &val); 177 - +/* WILLUS MOD */ 178 - +if (fit_order2>3) 179 - + applyQuarticFit(c4, c3, c2, c1, c0, y, &val); 180 - +else if (fit_order2==3) 181 - + applyCubicFit(c3, c2, c1, c0, y, &val); 182 - +else 183 - + applyQuadraticFit(c2, c1, c0, y, &val); 184 - ptaAddPt(ptad, y, val); 185 - } 186 - ptaaAddPta(ptaa5, ptad, L_INSERT); 187 - @@ -1602,11 +1671,21 @@ FPIX *fpix; 188 - * See notes there. 189 - * </pre> 190 - */ 191 - +/* WILLUS MOD */ 192 - l_ok 193 - dewarpBuildLineModel(L_DEWARP *dew, 194 - l_int32 opensize, 195 - const char *debugfile) 196 - { 197 - +return(dewarpBuildLineModel_ex(dew,opensize,debugfile,2)); 198 - +} 199 - + 200 - +l_int32 201 - +dewarpBuildLineModel_ex(L_DEWARP *dew, 202 - + l_int32 opensize, 203 - + const char *debugfile, 204 - + l_int32 fit_order) 205 - +{ 206 - char buf[64]; 207 - l_int32 i, j, bx, by, ret, nlines; 208 - BOXA *boxa; 209 - @@ -1695,6 +1774,8 @@ PTAA *ptaa1, *ptaa2; 210 - 211 - /* Remove all lines that are not at least 0.75 times the length 212 - * of the longest line. 
*/ 213 - +/* WILLUS MOD */ 214 - +/* 215 - ptaa2 = dewarpRemoveShortLines(pix, ptaa1, 0.75, DEBUG_SHORT_LINES); 216 - if (debugfile) { 217 - pix1 = pixConvertTo32(pix); 218 - @@ -1704,6 +1785,8 @@ PTAA *ptaa1, *ptaa2; 219 - pixDestroy(&pix1); 220 - pixDestroy(&pix2); 221 - } 222 - +*/ 223 - +ptaa2=ptaa1; 224 - ptaaDestroy(&ptaa1); 225 - nlines = ptaaGetCount(ptaa2); 226 - if (nlines < dew->minlines) { 227 - @@ -1717,7 +1800,8 @@ PTAA *ptaa1, *ptaa2; 228 - * centers. The disparity array will push pixels vertically 229 - * so that each line is flat and centered at the y-position 230 - * of the mid-point. */ 231 - - ret = dewarpFindVertDisparity(dew, ptaa2, 1 - i); 232 - +/* WILLUS MOD */ 233 - + ret = dewarpFindVertDisparity_ex(dew, ptaa2, 1 - i, fit_order); 234 - 235 - /* If i == 0, move the result to the horizontal disparity, 236 - * rotating it back by -90 degrees. */ 237 - diff --git a/src/leptwin.c b/src/leptwin.c 238 - index 72643a0..573d33e 100644 239 - --- a/src/leptwin.c 240 - +++ b/src/leptwin.c 241 - @@ -364,5 +364,9 @@ PIXCMAP *cmap; 242 - 243 - return hBitmap; 244 - } 245 - - 246 - +#else 247 - +/* willus mod: Avoid weird issue with OS/X library archiver when there are no symbols */ 248 - +int leptwin_my_empty_func(void); 249 - +int leptwin_my_empty_func(void) 250 - +{return(0);} 251 - #endif /* _WIN32 */ 252 - -- 253 - 2.22.0 254 -

-1060

pkgs/applications/misc/k2pdfopt/mupdf.patch

··· 1 - From d8927c969e3387ca2669a616c0ba53bce918a031 Mon Sep 17 00:00:00 2001 2 - From: Daniel Fullmer <danielrf12@gmail.com> 3 - Date: Fri, 13 Sep 2019 15:11:45 -0400 4 - Subject: [PATCH] Willus mod for k2pdfopt 5 - 6 - --- 7 - source/fitz/filter-basic.c | 3 + 8 - source/fitz/font-win32.c | 866 +++++++++++++++++++++++++++++++++++++ 9 - source/fitz/font.c | 3 + 10 - source/fitz/stext-device.c | 5 + 11 - source/fitz/string.c | 5 + 12 - source/pdf/pdf-annot.c | 14 +- 13 - source/pdf/pdf-link.c | 3 + 14 - source/pdf/pdf-parse.c | 5 + 15 - source/pdf/pdf-xref.c | 9 + 16 - 9 files changed, 912 insertions(+), 1 deletion(-) 17 - create mode 100644 source/fitz/font-win32.c 18 - 19 - diff --git a/source/fitz/filter-basic.c b/source/fitz/filter-basic.c 20 - index 0713a62e7..b8ef4d292 100644 21 - --- a/source/fitz/filter-basic.c 22 - +++ b/source/fitz/filter-basic.c 23 - @@ -259,7 +259,10 @@ look_for_endstream: 24 - if (!state->warned) 25 - { 26 - state->warned = 1; 27 - +/* willus mod -- no warning */ 28 - +/* 29 - fz_warn(ctx, "PDF stream Length incorrect"); 30 - +*/ 31 - } 32 - return *stm->rp++; 33 - } 34 - diff --git a/source/fitz/font-win32.c b/source/fitz/font-win32.c 35 - new file mode 100644 36 - index 000000000..45de8cfd3 37 - --- /dev/null 38 - +++ b/source/fitz/font-win32.c 39 - @@ -0,0 +1,866 @@ 40 - +/* 41 - +** Routines to access MS Windows system fonts. 42 - +** From sumatra PDF distro. 43 - +** Modified for MuPDF v1.9a by willus.com 44 - +*/ 45 - +#include "mupdf/pdf.h" 46 - + 47 - +/* 48 - + Which fonts are embedded is based on a few preprocessor definitions. 49 - + 50 - + The base 14 fonts are always embedded. 51 - + For CJK font substitution we embed DroidSansFallback. 52 - + 53 - + Set NOCJK to skip all CJK support (this also omits embedding the CJK CMaps) 54 - + Set NOCJKFONT to skip the embedded CJK font. 55 - + Set NOCJKFULL to embed a smaller CJK font without CJK Extension A support. 
56 - +*/ 57 - + 58 - +#ifdef NOCJK 59 - +#define NOCJKFONT 60 - +#endif 61 - + 62 - +/* SumatraPDF: also load fonts included with Windows */ 63 - +#ifdef _WIN32 64 - + 65 - +#ifndef UNICODE 66 - +#define UNICODE 67 - +#endif 68 - +#ifndef _UNICODE 69 - +#define _UNICODE 70 - +#endif 71 - + 72 - +#include <windows.h> 73 - + 74 - +// TODO: Use more of FreeType for TTF parsing (for performance reasons, 75 - +// the fonts can't be parsed completely, though) 76 - +#include <ft2build.h> 77 - +#include FT_TRUETYPE_IDS_H 78 - +#include FT_TRUETYPE_TAGS_H 79 - + 80 - +#define TTC_VERSION1 0x00010000 81 - +#define TTC_VERSION2 0x00020000 82 - + 83 - +#define MAX_FACENAME 128 84 - + 85 - +// Note: the font face must be the first field so that the structure 86 - +// can be treated like a simple string for searching 87 - +typedef struct pdf_fontmapMS_s 88 - +{ 89 - + char fontface[MAX_FACENAME]; 90 - + char fontpath[MAX_PATH]; 91 - + int index; 92 - +} pdf_fontmapMS; 93 - + 94 - +typedef struct pdf_fontlistMS_s 95 - +{ 96 - + pdf_fontmapMS *fontmap; 97 - + int len; 98 - + int cap; 99 - +} pdf_fontlistMS; 100 - + 101 - +typedef struct _tagTT_OFFSET_TABLE 102 - +{ 103 - + ULONG uVersion; 104 - + USHORT uNumOfTables; 105 - + USHORT uSearchRange; 106 - + USHORT uEntrySelector; 107 - + USHORT uRangeShift; 108 - +} TT_OFFSET_TABLE; 109 - + 110 - +typedef struct _tagTT_TABLE_DIRECTORY 111 - +{ 112 - + ULONG uTag; //table name 113 - + ULONG uCheckSum; //Check sum 114 - + ULONG uOffset; //Offset from beginning of file 115 - + ULONG uLength; //length of the table in bytes 116 - +} TT_TABLE_DIRECTORY; 117 - + 118 - +typedef struct _tagTT_NAME_TABLE_HEADER 119 - +{ 120 - + USHORT uFSelector; //format selector. 
Always 0 121 - + USHORT uNRCount; //Name Records count 122 - + USHORT uStorageOffset; //Offset for strings storage, from start of the table 123 - +} TT_NAME_TABLE_HEADER; 124 - + 125 - +typedef struct _tagTT_NAME_RECORD 126 - +{ 127 - + USHORT uPlatformID; 128 - + USHORT uEncodingID; 129 - + USHORT uLanguageID; 130 - + USHORT uNameID; 131 - + USHORT uStringLength; 132 - + USHORT uStringOffset; //from start of storage area 133 - +} TT_NAME_RECORD; 134 - + 135 - +typedef struct _tagFONT_COLLECTION 136 - +{ 137 - + ULONG Tag; 138 - + ULONG Version; 139 - + ULONG NumFonts; 140 - +} FONT_COLLECTION; 141 - + 142 - +static struct { 143 - + char *name; 144 - + char *pattern; 145 - +} baseSubstitutes[] = { 146 - + { "Courier", "CourierNewPSMT" }, 147 - + { "Courier-Bold", "CourierNewPS-BoldMT" }, 148 - + { "Courier-Oblique", "CourierNewPS-ItalicMT" }, 149 - + { "Courier-BoldOblique", "CourierNewPS-BoldItalicMT" }, 150 - + { "Helvetica", "ArialMT" }, 151 - + { "Helvetica-Bold", "Arial-BoldMT" }, 152 - + { "Helvetica-Oblique", "Arial-ItalicMT" }, 153 - + { "Helvetica-BoldOblique", "Arial-BoldItalicMT" }, 154 - + { "Times-Roman", "TimesNewRomanPSMT" }, 155 - + { "Times-Bold", "TimesNewRomanPS-BoldMT" }, 156 - + { "Times-Italic", "TimesNewRomanPS-ItalicMT" }, 157 - + { "Times-BoldItalic", "TimesNewRomanPS-BoldItalicMT" }, 158 - + { "Symbol", "SymbolMT" }, 159 - +}; 160 - +static const char *base_font_names[][10] = 161 - +{ 162 - + { "Courier", "CourierNew", "CourierNewPSMT", NULL }, 163 - + { "Courier-Bold", "CourierNew,Bold", "Courier,Bold", 164 - + "CourierNewPS-BoldMT", "CourierNew-Bold", NULL }, 165 - + { "Courier-Oblique", "CourierNew,Italic", "Courier,Italic", 166 - + "CourierNewPS-ItalicMT", "CourierNew-Italic", NULL }, 167 - + { "Courier-BoldOblique", "CourierNew,BoldItalic", "Courier,BoldItalic", 168 - + "CourierNewPS-BoldItalicMT", "CourierNew-BoldItalic", NULL }, 169 - + { "Helvetica", "ArialMT", "Arial", NULL }, 170 - + { "Helvetica-Bold", "Arial-BoldMT", 
"Arial,Bold", "Arial-Bold", 171 - + "Helvetica,Bold", NULL }, 172 - + { "Helvetica-Oblique", "Arial-ItalicMT", "Arial,Italic", "Arial-Italic", 173 - + "Helvetica,Italic", "Helvetica-Italic", NULL }, 174 - + { "Helvetica-BoldOblique", "Arial-BoldItalicMT", 175 - + "Arial,BoldItalic", "Arial-BoldItalic", 176 - + "Helvetica,BoldItalic", "Helvetica-BoldItalic", NULL }, 177 - + { "Times-Roman", "TimesNewRomanPSMT", "TimesNewRoman", 178 - + "TimesNewRomanPS", NULL }, 179 - + { "Times-Bold", "TimesNewRomanPS-BoldMT", "TimesNewRoman,Bold", 180 - + "TimesNewRomanPS-Bold", "TimesNewRoman-Bold", NULL }, 181 - + { "Times-Italic", "TimesNewRomanPS-ItalicMT", "TimesNewRoman,Italic", 182 - + "TimesNewRomanPS-Italic", "TimesNewRoman-Italic", NULL }, 183 - + { "Times-BoldItalic", "TimesNewRomanPS-BoldItalicMT", 184 - + "TimesNewRoman,BoldItalic", "TimesNewRomanPS-BoldItalic", 185 - + "TimesNewRoman-BoldItalic", NULL }, 186 - + { "Symbol", "Symbol,Italic", "Symbol,Bold", "Symbol,BoldItalic", 187 - + "SymbolMT", "SymbolMT,Italic", "SymbolMT,Bold", "SymbolMT,BoldItalic", NULL }, 188 - + { "ZapfDingbats", NULL } 189 - +}; 190 - + 191 - +static pdf_fontlistMS fontlistMS = 192 - +{ 193 - + NULL, 194 - + 0, 195 - + 0, 196 - +}; 197 - +static int strcmp_ignore_space(const char *a, const char *b); 198 - +static const char *clean_font_name(const char *fontname); 199 - +static const char *pdf_clean_base14_name(const char *fontname); 200 - + 201 - +static inline USHORT BEtoHs(USHORT x) 202 - +{ 203 - + BYTE *data = (BYTE *)&x; 204 - + return (data[0] << 8) | data[1]; 205 - +} 206 - + 207 - +static inline ULONG BEtoHl(ULONG x) 208 - +{ 209 - + BYTE *data = (BYTE *)&x; 210 - + return (data[0] << 24) | (data[1] << 16) | (data[2] << 8) | data[3]; 211 - +} 212 - + 213 - +static int strcmp_ignore_space(const char *a, const char *b) 214 - +{ 215 - + while (1) 216 - + { 217 - + while (*a == ' ') 218 - + a++; 219 - + while (*b == ' ') 220 - + b++; 221 - + if (*a != *b) 222 - + return 1; 223 - + if (*a 
== 0) 224 - + return *a != *b; 225 - + if (*b == 0) 226 - + return *a != *b; 227 - + a++; 228 - + b++; 229 - + } 230 - +} 231 - + 232 - +/* A little bit more sophisticated name matching so that e.g. "EurostileExtended" 233 - + matches "EurostileExtended-Roman" or "Tahoma-Bold,Bold" matches "Tahoma-Bold" */ 234 - +static int 235 - +lookup_compare(const void *elem1, const void *elem2) 236 - +{ 237 - + const char *val1 = elem1; 238 - + const char *val2 = elem2; 239 - + int len1 = strlen(val1); 240 - + int len2 = strlen(val2); 241 - + 242 - + if (len1 != len2) 243 - + { 244 - + const char *rest = len1 > len2 ? val1 + len2 : val2 + len1; 245 - + if (',' == *rest || !_stricmp(rest, "-roman")) 246 - + return _strnicmp(val1, val2, fz_mini(len1, len2)); 247 - + } 248 - + 249 - + return _stricmp(val1, val2); 250 - +} 251 - + 252 - +static void 253 - +remove_spaces(char *srcDest) 254 - +{ 255 - + char *dest; 256 - + 257 - + for (dest = srcDest; *srcDest; srcDest++) 258 - + if (*srcDest != ' ') 259 - + *dest++ = *srcDest; 260 - + *dest = '\0'; 261 - +} 262 - + 263 - +static int 264 - +str_ends_with(const char *str, const char *end) 265 - +{ 266 - + size_t len1 = strlen(str); 267 - + size_t len2 = strlen(end); 268 - + 269 - + return len1 >= len2 && !strcmp(str + len1 - len2, end); 270 - +} 271 - + 272 - +static pdf_fontmapMS * 273 - +pdf_find_windows_font_path(const char *fontname) 274 - +{ 275 - + return bsearch(fontname, fontlistMS.fontmap, fontlistMS.len, sizeof(pdf_fontmapMS), lookup_compare); 276 - +} 277 - + 278 - +/* source and dest can be same */ 279 - +static void 280 - +decode_unicode_BE(fz_context *ctx, char *source, int sourcelen, char *dest, int destlen) 281 - +{ 282 - + WCHAR *tmp; 283 - + int converted, i; 284 - + 285 - + if (sourcelen % 2 != 0) 286 - + fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : invalid unicode string"); 287 - + 288 - + tmp = fz_malloc_array(ctx, sourcelen / 2 + 1, sizeof(WCHAR)); 289 - + for (i = 0; i < sourcelen / 2; i++) 290 - + tmp[i] = 
BEtoHs(((WCHAR *)source)[i]); 291 - + tmp[sourcelen / 2] = '\0'; 292 - + 293 - + converted = WideCharToMultiByte(CP_UTF8, 0, tmp, -1, dest, destlen, NULL, NULL); 294 - + fz_free(ctx, tmp); 295 - + if (!converted) 296 - + fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : invalid unicode string"); 297 - +} 298 - + 299 - +static void 300 - +decode_platform_string(fz_context *ctx, int platform, int enctype, char *source, int sourcelen, char *dest, int destlen) 301 - +{ 302 - + switch (platform) 303 - + { 304 - + case TT_PLATFORM_APPLE_UNICODE: 305 - + switch (enctype) 306 - + { 307 - + case TT_APPLE_ID_DEFAULT: 308 - + case TT_APPLE_ID_UNICODE_2_0: 309 - + decode_unicode_BE(ctx, source, sourcelen, dest, destlen); 310 - + return; 311 - + } 312 - + fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : unsupported encoding (%d/%d)", platform, enctype); 313 - + case TT_PLATFORM_MACINTOSH: 314 - + switch (enctype) 315 - + { 316 - + case TT_MAC_ID_ROMAN: 317 - + if (sourcelen + 1 > destlen) 318 - + fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : overlong fontname: %s", source); 319 - + // TODO: Convert to UTF-8 from what encoding? 
320 - + memcpy(dest, source, sourcelen); 321 - + dest[sourcelen] = 0; 322 - + return; 323 - + } 324 - + fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : unsupported encoding (%d/%d)", platform, enctype); 325 - + case TT_PLATFORM_MICROSOFT: 326 - + switch (enctype) 327 - + { 328 - + case TT_MS_ID_SYMBOL_CS: 329 - + case TT_MS_ID_UNICODE_CS: 330 - + case TT_MS_ID_UCS_4: 331 - + decode_unicode_BE(ctx, source, sourcelen, dest, destlen); 332 - + return; 333 - + } 334 - + fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : unsupported encoding (%d/%d)", platform, enctype); 335 - + default: 336 - + fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : unsupported encoding (%d/%d)", platform, enctype); 337 - + } 338 - +} 339 - + 340 - +static void 341 - +grow_system_font_list(fz_context *ctx, pdf_fontlistMS *fl) 342 - +{ 343 - + int newcap; 344 - + pdf_fontmapMS *newitems; 345 - + 346 - + if (fl->cap == 0) 347 - + newcap = 1024; 348 - + else 349 - + newcap = fl->cap * 2; 350 - + 351 - + // use realloc/free for the fontmap, since the list can 352 - + // remain in memory even with all fz_contexts destroyed 353 - + newitems = realloc(fl->fontmap, newcap * sizeof(pdf_fontmapMS)); 354 - + if (!newitems) 355 - + fz_throw(ctx, FZ_ERROR_GENERIC, "OOM in grow_system_font_list"); 356 - + memset(newitems + fl->cap, 0, sizeof(pdf_fontmapMS) * (newcap - fl->cap)); 357 - + 358 - + fl->fontmap = newitems; 359 - + fl->cap = newcap; 360 - +} 361 - + 362 - +static void 363 - +append_mapping(fz_context *ctx, pdf_fontlistMS *fl, const char *facename, const char *path, int index) 364 - +{ 365 - + if (fl->len == fl->cap) 366 - + grow_system_font_list(ctx, fl); 367 - + 368 - + if (fl->len >= fl->cap) 369 - + fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : fontlist overflow"); 370 - + 371 - + fz_strlcpy(fl->fontmap[fl->len].fontface, facename, sizeof(fl->fontmap[0].fontface)); 372 - + fz_strlcpy(fl->fontmap[fl->len].fontpath, path, sizeof(fl->fontmap[0].fontpath)); 373 - + fl->fontmap[fl->len].index = index; 374 - + 
375 - + ++fl->len; 376 - +} 377 - + 378 - +static void 379 - +safe_read(fz_context *ctx, fz_stream *file, int offset, char *buf, int size) 380 - +{ 381 - + int n; 382 - + fz_seek(ctx, file, offset, 0); 383 - + n = fz_read(ctx, file, (unsigned char *)buf, size); 384 - + if (n != size) 385 - + fz_throw(ctx, FZ_ERROR_GENERIC, "safe_read: read %d, expected %d", n, size); 386 - +} 387 - + 388 - +static void 389 - +read_ttf_string(fz_context *ctx, fz_stream *file, int offset, TT_NAME_RECORD *ttRecordBE, char *buf, int size) 390 - +{ 391 - + char szTemp[MAX_FACENAME * 2]; 392 - + // ignore empty and overlong strings 393 - + int stringLength = BEtoHs(ttRecordBE->uStringLength); 394 - + if (stringLength == 0 || stringLength >= sizeof(szTemp)) 395 - + return; 396 - + 397 - + safe_read(ctx, file, offset + BEtoHs(ttRecordBE->uStringOffset), szTemp, stringLength); 398 - + decode_platform_string(ctx, BEtoHs(ttRecordBE->uPlatformID), 399 - + BEtoHs(ttRecordBE->uEncodingID), szTemp, stringLength, buf, size); 400 - +} 401 - + 402 - +static void 403 - +makeFakePSName(char szName[MAX_FACENAME], const char *szStyle) 404 - +{ 405 - + // append the font's subfamily, unless it's a Regular font 406 - + if (*szStyle && _stricmp(szStyle, "Regular") != 0) 407 - + { 408 - + fz_strlcat(szName, "-", MAX_FACENAME); 409 - + fz_strlcat(szName, szStyle, MAX_FACENAME); 410 - + } 411 - + remove_spaces(szName); 412 - +} 413 - + 414 - +static void 415 - +parseTTF(fz_context *ctx, fz_stream *file, int offset, int index, const char *path) 416 - +{ 417 - + TT_OFFSET_TABLE ttOffsetTableBE; 418 - + TT_TABLE_DIRECTORY tblDirBE; 419 - + TT_NAME_TABLE_HEADER ttNTHeaderBE; 420 - + TT_NAME_RECORD ttRecordBE; 421 - + 422 - + char szPSName[MAX_FACENAME] = { 0 }; 423 - + char szTTName[MAX_FACENAME] = { 0 }; 424 - + char szStyle[MAX_FACENAME] = { 0 }; 425 - + char szCJKName[MAX_FACENAME] = { 0 }; 426 - + int i, count, tblOffset; 427 - + 428 - + safe_read(ctx, file, offset, (char *)&ttOffsetTableBE, 
sizeof(TT_OFFSET_TABLE)); 429 - + 430 - + // check if this is a TrueType font of version 1.0 or an OpenType font 431 - + if (BEtoHl(ttOffsetTableBE.uVersion) != TTC_VERSION1 && 432 - + BEtoHl(ttOffsetTableBE.uVersion) != TTAG_OTTO) 433 - + { 434 - + fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : invalid font version %x", (unsigned int)BEtoHl(ttOffsetTableBE.uVersion)); 435 - + } 436 - + 437 - + // determine the name table's offset by iterating through the offset table 438 - + count = BEtoHs(ttOffsetTableBE.uNumOfTables); 439 - + for (i = 0; i < count; i++) 440 - + { 441 - + int entryOffset = offset + sizeof(TT_OFFSET_TABLE) + i * sizeof(TT_TABLE_DIRECTORY); 442 - + safe_read(ctx, file, entryOffset, (char *)&tblDirBE, sizeof(TT_TABLE_DIRECTORY)); 443 - + if (!BEtoHl(tblDirBE.uTag) || BEtoHl(tblDirBE.uTag) == TTAG_name) 444 - + break; 445 - + } 446 - + if (count == i || !BEtoHl(tblDirBE.uTag)) 447 - + fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : nameless font"); 448 - + tblOffset = BEtoHl(tblDirBE.uOffset); 449 - + 450 - + // read the 'name' table for record count and offsets 451 - + safe_read(ctx, file, tblOffset, (char *)&ttNTHeaderBE, sizeof(TT_NAME_TABLE_HEADER)); 452 - + offset = tblOffset + sizeof(TT_NAME_TABLE_HEADER); 453 - + tblOffset += BEtoHs(ttNTHeaderBE.uStorageOffset); 454 - + 455 - + // read through the strings for PostScript name and font family 456 - + count = BEtoHs(ttNTHeaderBE.uNRCount); 457 - + for (i = 0; i < count; i++) 458 - + { 459 - + short langId, nameId; 460 - + BOOL isCJKName; 461 - + 462 - + safe_read(ctx, file, offset + i * sizeof(TT_NAME_RECORD), (char *)&ttRecordBE, sizeof(TT_NAME_RECORD)); 463 - + 464 - + langId = BEtoHs(ttRecordBE.uLanguageID); 465 - + nameId = BEtoHs(ttRecordBE.uNameID); 466 - + isCJKName = TT_NAME_ID_FONT_FAMILY == nameId && LANG_CHINESE == PRIMARYLANGID(langId); 467 - + 468 - + // ignore non-English strings (except for Chinese font names) 469 - + if (langId && langId != TT_MS_LANGID_ENGLISH_UNITED_STATES && 
!isCJKName) 470 - + continue; 471 - + // ignore names other than font (sub)family and PostScript name 472 - + fz_try(ctx) 473 - + { 474 - + if (isCJKName) 475 - + read_ttf_string(ctx, file, tblOffset, &ttRecordBE, szCJKName, sizeof(szCJKName)); 476 - + else if (TT_NAME_ID_FONT_FAMILY == nameId) 477 - + read_ttf_string(ctx, file, tblOffset, &ttRecordBE, szTTName, sizeof(szTTName)); 478 - + else if (TT_NAME_ID_FONT_SUBFAMILY == nameId) 479 - + read_ttf_string(ctx, file, tblOffset, &ttRecordBE, szStyle, sizeof(szStyle)); 480 - + else if (TT_NAME_ID_PS_NAME == nameId) 481 - + read_ttf_string(ctx, file, tblOffset, &ttRecordBE, szPSName, sizeof(szPSName)); 482 - + } 483 - + fz_catch(ctx) 484 - + { 485 - + fz_warn(ctx, "ignoring face name decoding fonterror"); 486 - + } 487 - + } 488 - + 489 - + // try to prevent non-Arial fonts from accidentally substituting Arial 490 - + if (!strcmp(szPSName, "ArialMT")) 491 - + { 492 - + // cf. https://code.google.com/p/sumatrapdf/issues/detail?id=2471 493 - + if (strcmp(szTTName, "Arial") != 0) 494 - + szPSName[0] = '\0'; 495 - + // TODO: is there a better way to distinguish Arial Caps from Arial proper? 496 - + // cf. http://code.google.com/p/sumatrapdf/issues/detail?id=1290 497 - + else if (strstr(path, "caps") || strstr(path, "Caps")) 498 - + fz_throw(ctx, FZ_ERROR_GENERIC, "ignore %s, as it can't be distinguished from Arial,Regular", path); 499 - + } 500 - + 501 - + if (szPSName[0]) 502 - + append_mapping(ctx, &fontlistMS, szPSName, path, index); 503 - + if (szTTName[0]) 504 - + { 505 - + // derive a PostScript-like name and add it, if it's different from the font's 506 - + // included PostScript name; cf. 
http://code.google.com/p/sumatrapdf/issues/detail?id=376 507 - + makeFakePSName(szTTName, szStyle); 508 - + // compare the two names before adding this one 509 - + if (lookup_compare(szTTName, szPSName)) 510 - + append_mapping(ctx, &fontlistMS, szTTName, path, index); 511 - + } 512 - + if (szCJKName[0]) 513 - + { 514 - + makeFakePSName(szCJKName, szStyle); 515 - + if (lookup_compare(szCJKName, szPSName) && lookup_compare(szCJKName, szTTName)) 516 - + append_mapping(ctx, &fontlistMS, szCJKName, path, index); 517 - + } 518 - +} 519 - + 520 - +static void 521 - +parseTTFs(fz_context *ctx, const char *path) 522 - +{ 523 - + fz_stream *file = fz_open_file(ctx, path); 524 - + /* "fonterror : %s not found", path */ 525 - + fz_try(ctx) 526 - + { 527 - + parseTTF(ctx, file, 0, 0, path); 528 - + } 529 - + fz_always(ctx) 530 - + { 531 - + fz_drop_stream(ctx,file); 532 - + } 533 - + fz_catch(ctx) 534 - + { 535 - + fz_rethrow(ctx); 536 - + } 537 - +} 538 - + 539 - +static void 540 - +parseTTCs(fz_context *ctx, const char *path) 541 - +{ 542 - + FONT_COLLECTION fontcollectionBE; 543 - + ULONG i, numFonts, *offsettableBE = NULL; 544 - + 545 - + fz_stream *file = fz_open_file(ctx, path); 546 - + /* "fonterror : %s not found", path */ 547 - + 548 - + fz_var(offsettableBE); 549 - + 550 - + fz_try(ctx) 551 - + { 552 - + safe_read(ctx, file, 0, (char *)&fontcollectionBE, sizeof(FONT_COLLECTION)); 553 - + if (BEtoHl(fontcollectionBE.Tag) != TTAG_ttcf) 554 - + fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : wrong format %x", (unsigned int)BEtoHl(fontcollectionBE.Tag)); 555 - + if (BEtoHl(fontcollectionBE.Version) != TTC_VERSION1 && 556 - + BEtoHl(fontcollectionBE.Version) != TTC_VERSION2) 557 - + { 558 - + fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror : invalid version %x", (unsigned int)BEtoHl(fontcollectionBE.Version)); 559 - + } 560 - + 561 - + numFonts = BEtoHl(fontcollectionBE.NumFonts); 562 - + offsettableBE = fz_malloc_array(ctx, numFonts, sizeof(ULONG)); 563 - + 564 - + safe_read(ctx, 
file, sizeof(FONT_COLLECTION), (char *)offsettableBE, numFonts * sizeof(ULONG)); 565 - + for (i = 0; i < numFonts; i++) 566 - + parseTTF(ctx, file, BEtoHl(offsettableBE[i]), i, path); 567 - + } 568 - + fz_always(ctx) 569 - + { 570 - + fz_free(ctx, offsettableBE); 571 - + fz_drop_stream(ctx,file); 572 - + } 573 - + fz_catch(ctx) 574 - + { 575 - + fz_rethrow(ctx); 576 - + } 577 - +} 578 - + 579 - +static void 580 - +extend_system_font_list(fz_context *ctx, const WCHAR *path) 581 - +{ 582 - + WCHAR szPath[MAX_PATH], *lpFileName; 583 - + WIN32_FIND_DATA FileData; 584 - + HANDLE hList; 585 - + 586 - + GetFullPathName(path, nelem(szPath), szPath, &lpFileName); 587 - + 588 - + hList = FindFirstFile(szPath, &FileData); 589 - + if (hList == INVALID_HANDLE_VALUE) 590 - + { 591 - + // Don't complain about missing directories 592 - + if (GetLastError() == ERROR_FILE_NOT_FOUND) 593 - + return; 594 - + fz_throw(ctx, FZ_ERROR_GENERIC, "extend_system_font_list: unknown error %d", (int)GetLastError()); 595 - + } 596 - + do 597 - + { 598 - + if (!(FileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) 599 - + { 600 - + char szPathUtf8[MAX_PATH], *fileExt; 601 - + int res; 602 - + lstrcpyn(lpFileName, FileData.cFileName, szPath + MAX_PATH - lpFileName); 603 - + res = WideCharToMultiByte(CP_UTF8, 0, szPath, -1, szPathUtf8, sizeof(szPathUtf8), NULL, NULL); 604 - + if (!res) 605 - + { 606 - + fz_warn(ctx, "WideCharToMultiByte failed for %S", szPath); 607 - + continue; 608 - + } 609 - + fileExt = szPathUtf8 + strlen(szPathUtf8) - 4; 610 - + fz_try(ctx) 611 - + { 612 - + if (!_stricmp(fileExt, ".ttc")) 613 - + parseTTCs(ctx, szPathUtf8); 614 - + else if (!_stricmp(fileExt, ".ttf") || !_stricmp(fileExt, ".otf")) 615 - + parseTTFs(ctx, szPathUtf8); 616 - + } 617 - + fz_catch(ctx) 618 - + { 619 - + // ignore errors occurring while parsing a given font file 620 - + } 621 - + } 622 - + } while (FindNextFile(hList, &FileData)); 623 - + FindClose(hList); 624 - +} 625 - + 626 - +static void 627 - 
+destroy_system_font_list(void) 628 - +{ 629 - + free(fontlistMS.fontmap); 630 - + memset(&fontlistMS, 0, sizeof(fontlistMS)); 631 - +} 632 - + 633 - +static void 634 - +create_system_font_list(fz_context *ctx) 635 - +{ 636 - + WCHAR szFontDir[MAX_PATH]; 637 - + UINT cch; 638 - + 639 - + cch = GetWindowsDirectory(szFontDir, nelem(szFontDir) - 12); 640 - + if (0 < cch && cch < nelem(szFontDir) - 12) 641 - + { 642 - + /* willus.com edit--Win XP default MSVCRT.DLL doesn't have wcscat_s */ 643 - +#ifdef _WIN64 644 - + wcscat_s(szFontDir, MAX_PATH, L"\\Fonts\\*.?t?"); 645 - +#else 646 - + wcscat(szFontDir,L"\\Fonts\\*.?t?"); 647 - +#endif 648 - + extend_system_font_list(ctx, szFontDir); 649 - + } 650 - + 651 - + if (fontlistMS.len == 0) 652 - + fz_warn(ctx, "couldn't find any usable system fonts"); 653 - + 654 - +#ifdef NOCJKFONT 655 - + { 656 - + // If no CJK fallback font is builtin but one has been shipped separately (in the same 657 - + // directory as the main executable), add it to the list of loadable system fonts 658 - + WCHAR szFile[MAX_PATH], *lpFileName; 659 - + GetModuleFileName(0, szFontDir, MAX_PATH); 660 - + GetFullPathName(szFontDir, MAX_PATH, szFile, &lpFileName); 661 - + lstrcpyn(lpFileName, L"DroidSansFallback.ttf", szFile + MAX_PATH - lpFileName); 662 - + extend_system_font_list(ctx, szFile); 663 - + } 664 - +#endif 665 - + 666 - + // sort the font list, so that it can be searched binarily 667 - + qsort(fontlistMS.fontmap, fontlistMS.len, sizeof(pdf_fontmapMS), _stricmp); 668 - + 669 - +#ifdef DEBUG 670 - + // allow to overwrite system fonts for debugging purposes 671 - + // (either pass a full path or a search pattern such as "fonts\*.ttf") 672 - + cch = GetEnvironmentVariable(L"MUPDF_FONTS_PATTERN", szFontDir, nelem(szFontDir)); 673 - + if (0 < cch && cch < nelem(szFontDir)) 674 - + { 675 - + int i, prev_len = fontlistMS.len; 676 - + extend_system_font_list(ctx, szFontDir); 677 - + for (i = prev_len; i < fontlistMS.len; i++) 678 - + { 679 - + 
pdf_fontmapMS *entry = bsearch(fontlistMS.fontmap[i].fontface, fontlistMS.fontmap, prev_len, sizeof(pdf_fontmapMS), lookup_compare); 680 - + if (entry) 681 - + *entry = fontlistMS.fontmap[i]; 682 - + } 683 - + qsort(fontlistMS.fontmap, fontlistMS.len, sizeof(pdf_fontmapMS), _stricmp); 684 - + } 685 - +#endif 686 - + 687 - + // make sure to clean up after ourselves 688 - + atexit(destroy_system_font_list); 689 - +} 690 - + 691 - +static fz_font * 692 - +pdf_load_windows_font_by_name(fz_context *ctx, const char *orig_name) 693 - +{ 694 - + pdf_fontmapMS *found = NULL; 695 - + char *comma, *fontname; 696 - + fz_font *font; 697 - + 698 - + /* WILLUS MOD--not multi-threaded for k2pdfopt */ 699 - + /* fz_synchronize_begin(); */ 700 - + if (fontlistMS.len == 0) 701 - + { 702 - + fz_try(ctx) 703 - + { 704 - + create_system_font_list(ctx); 705 - + } 706 - + fz_catch(ctx) { } 707 - + } 708 - + /* WILLUS MOD--not multi-threaded for k2pdfopt */ 709 - + /* fz_synchronize_end(); */ 710 - + if (fontlistMS.len == 0) 711 - + fz_throw(ctx, FZ_ERROR_GENERIC, "fonterror: couldn't find any fonts"); 712 - + 713 - + // work on a normalized copy of the font name 714 - + fontname = fz_strdup(ctx, orig_name); 715 - + remove_spaces(fontname); 716 - + 717 - + // first, try to find the exact font name (including appended style information) 718 - + comma = strchr(fontname, ','); 719 - + if (comma) 720 - + { 721 - + *comma = '-'; 722 - + found = pdf_find_windows_font_path(fontname); 723 - + *comma = ','; 724 - + } 725 - + // second, substitute the font name with a known PostScript name 726 - + else 727 - + { 728 - + int i; 729 - + for (i = 0; i < nelem(baseSubstitutes) && !found; i++) 730 - + if (!strcmp(fontname, baseSubstitutes[i].name)) 731 - + found = pdf_find_windows_font_path(baseSubstitutes[i].pattern); 732 - + } 733 - + // third, search for the font name without additional style information 734 - + if (!found) 735 - + found = pdf_find_windows_font_path(fontname); 736 - + // fourth, try 
to separate style from basename for prestyled fonts (e.g. "ArialBold") 737 - + if (!found && !comma && (str_ends_with(fontname, "Bold") || str_ends_with(fontname, "Italic"))) 738 - + { 739 - + int styleLen = str_ends_with(fontname, "Bold") ? 4 : str_ends_with(fontname, "BoldItalic") ? 10 : 6; 740 - + fontname = fz_resize_array(ctx, fontname, strlen(fontname) + 2, sizeof(char)); 741 - + comma = fontname + strlen(fontname) - styleLen; 742 - + memmove(comma + 1, comma, styleLen + 1); 743 - + *comma = '-'; 744 - + found = pdf_find_windows_font_path(fontname); 745 - + *comma = ','; 746 - + if (!found) 747 - + found = pdf_find_windows_font_path(fontname); 748 - + } 749 - + // fifth, try to convert the font name from the common Chinese codepage 936 750 - + if (!found && fontname[0] < 0) 751 - + { 752 - + WCHAR cjkNameW[MAX_FACENAME]; 753 - + char cjkName[MAX_FACENAME]; 754 - + if (MultiByteToWideChar(936, MB_ERR_INVALID_CHARS, fontname, -1, cjkNameW, nelem(cjkNameW)) && 755 - + WideCharToMultiByte(CP_UTF8, 0, cjkNameW, -1, cjkName, nelem(cjkName), NULL, NULL)) 756 - + { 757 - + comma = strchr(cjkName, ','); 758 - + if (comma) 759 - + { 760 - + *comma = '-'; 761 - + found = pdf_find_windows_font_path(cjkName); 762 - + *comma = ','; 763 - + } 764 - + if (!found) 765 - + found = pdf_find_windows_font_path(cjkName); 766 - + } 767 - + } 768 - + 769 - + fz_free(ctx, fontname); 770 - + if (!found) 771 - + fz_throw(ctx, FZ_ERROR_GENERIC, "couldn't find system font '%s'", orig_name); 772 - + 773 - + /* 774 - + fz_warn(ctx, "loading non-embedded font '%s' from '%s'", orig_name, found->fontpath); 775 - + */ 776 - + 777 - + font = fz_new_font_from_file(ctx, orig_name, found->fontpath, found->index, 778 - + strcmp(found->fontface, "DroidSansFallback") != 0); 779 - + /* willus mod for MuPDF v1.10, 10-21-2016 */ 780 - + { 781 - + fz_font_flags_t *flags; 782 - + flags=fz_font_flags(font); 783 - + if (flags!=NULL) 784 - + flags->ft_substitute = 1; 785 - + } 786 - + return font; 787 - +} 
788 - + 789 - +static fz_font * 790 - +pdf_load_windows_font(fz_context *ctx, const char *fontname, int bold, int italic, int needs_exact_metrics) 791 - +{ 792 - + if (needs_exact_metrics) 793 - + { 794 - + const char *clean_name; 795 - + /* WILLUS: Declare pdf_clean_base14_name() */ 796 - + extern const char *pdf_clean_base14_name(const char *fontname); 797 - + 798 - + /* TODO: the metrics for Times-Roman and Courier don't match 799 - + those of Windows' Times New Roman and Courier New; for 800 - + some reason, Poppler doesn't seem to have this problem */ 801 - + int len; 802 - + if (fz_lookup_builtin_font(ctx,fontname, bold, italic, &len)) 803 - + return NULL; 804 - + 805 - + /* cf. http://code.google.com/p/sumatrapdf/issues/detail?id=2173 */ 806 - + clean_name = pdf_clean_base14_name(fontname); 807 - + if (clean_name != fontname && !strncmp(clean_name, "Times-", 6)) 808 - + return NULL; 809 - + } 810 - + 811 - + // TODO: unset font->ft_substitute for base14/needs_exact_metrics? 812 - + return pdf_load_windows_font_by_name(ctx, fontname); 813 - +} 814 - + 815 - +static const char *clean_font_name(const char *fontname) 816 - +{ 817 - + int i, k; 818 - + for (i = 0; i < nelem(base_font_names); i++) 819 - + for (k = 0; base_font_names[i][k]; k++) 820 - + if (!strcmp_ignore_space(base_font_names[i][k], fontname)) 821 - + return base_font_names[i][0]; 822 - + return fontname; 823 - +} 824 - + 825 - + 826 - +/* SumatraPDF: expose clean_font_name */ 827 - +static const char * pdf_clean_base14_name(const char *fontname) 828 - +{ 829 - + return clean_font_name(fontname); 830 - +} 831 - + 832 - +static fz_font * 833 - +pdf_load_windows_cjk_font(fz_context *ctx, const char *fontname, int ros, int serif) 834 - +{ 835 - + fz_font *font; 836 - + 837 - + font=NULL; /* WILLUS: Avoid compiler warning */ 838 - + /* try to find a matching system font before falling back to an approximate one */ 839 - + fz_try(ctx) 840 - + { 841 - + font = pdf_load_windows_font_by_name(ctx, 
fontname); 842 - + } 843 - + fz_catch(ctx) 844 - + { 845 - + font = NULL; 846 - + } 847 - + if (font) 848 - + return font; 849 - + 850 - + /* try to fall back to a reasonable system font */ 851 - + fz_try(ctx) 852 - + { 853 - + if (serif) 854 - + { 855 - + switch (ros) 856 - + { 857 - + case FZ_ADOBE_CNS: font = pdf_load_windows_font_by_name(ctx, "MingLiU"); break; 858 - + case FZ_ADOBE_GB: font = pdf_load_windows_font_by_name(ctx, "SimSun"); break; 859 - + case FZ_ADOBE_JAPAN: font = pdf_load_windows_font_by_name(ctx, "MS-Mincho"); break; 860 - + case FZ_ADOBE_KOREA: font = pdf_load_windows_font_by_name(ctx, "Batang"); break; 861 - + default: fz_throw(ctx, FZ_ERROR_GENERIC, "invalid serif ros"); 862 - + } 863 - + } 864 - + else 865 - + { 866 - + switch (ros) 867 - + { 868 - + case FZ_ADOBE_CNS: font = pdf_load_windows_font_by_name(ctx, "DFKaiShu-SB-Estd-BF"); break; 869 - + case FZ_ADOBE_GB: 870 - + fz_try(ctx) 871 - + { 872 - + font = pdf_load_windows_font_by_name(ctx, "KaiTi"); 873 - + } 874 - + fz_catch(ctx) 875 - + { 876 - + font = pdf_load_windows_font_by_name(ctx, "KaiTi_GB2312"); 877 - + } 878 - + break; 879 - + case FZ_ADOBE_JAPAN: font = pdf_load_windows_font_by_name(ctx, "MS-Gothic"); break; 880 - + case FZ_ADOBE_KOREA: font = pdf_load_windows_font_by_name(ctx, "Gulim"); break; 881 - + default: fz_throw(ctx, FZ_ERROR_GENERIC, "invalid sans-serif ros"); 882 - + } 883 - + } 884 - + } 885 - + fz_catch(ctx) 886 - + { 887 - +#ifdef NOCJKFONT 888 - + /* If no CJK fallback font is builtin, maybe one has been shipped separately */ 889 - + font = pdf_load_windows_font_by_name(ctx, "DroidSansFallback"); 890 - +#else 891 - + fz_rethrow(ctx); 892 - +#endif 893 - + } 894 - + 895 - + return font; 896 - +} 897 - + 898 - +#endif 899 - + 900 - +void pdf_install_load_system_font_funcs(fz_context *ctx) 901 - +{ 902 - +#ifdef _WIN32 903 - + fz_install_load_system_font_funcs(ctx, pdf_load_windows_font, pdf_load_windows_cjk_font, NULL); 904 - +#endif 905 - +} 906 - diff --git 
a/source/fitz/font.c b/source/fitz/font.c 907 - index 00c6e8f99..1448b4a56 100644 908 - --- a/source/fitz/font.c 909 - +++ b/source/fitz/font.c 910 - @@ -4,8 +4,11 @@ 911 - #include "draw-imp.h" 912 - 913 - #include <ft2build.h> 914 - +/* willus mod -- remove hb includes */ 915 - +/* 916 - #include "hb.h" 917 - #include "hb-ft.h" 918 - +*/ 919 - 920 - #include <assert.h> 921 - 922 - diff --git a/source/fitz/stext-device.c b/source/fitz/stext-device.c 923 - index 2df90305e..b1f99e056 100644 924 - --- a/source/fitz/stext-device.c 925 - +++ b/source/fitz/stext-device.c 926 - @@ -825,6 +825,11 @@ fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options 927 - dev->lastchar = ' '; 928 - dev->curdir = 1; 929 - dev->lasttext = NULL; 930 - + /* willus mod -- seems like this should be here, but not sure. */ 931 - + if (opts) 932 - + dev->flags = opts->flags; 933 - + else 934 - + dev->flags = 0; 935 - 936 - return (fz_device*)dev; 937 - } 938 - diff --git a/source/fitz/string.c b/source/fitz/string.c 939 - index f8eedb682..7a767983d 100644 940 - --- a/source/fitz/string.c 941 - +++ b/source/fitz/string.c 942 - @@ -560,6 +560,10 @@ fz_utflen(const char *s) 943 - */ 944 - float fz_atof(const char *s) 945 - { 946 - +/* willus mod: atof(s), #if-#else-#endif */ 947 - +#if (!defined(__SSE__)) 948 - + return(atof(s)); 949 - +#else 950 - float result; 951 - 952 - if (s == NULL) 953 - @@ -572,6 +576,7 @@ float fz_atof(const char *s) 954 - return 1; 955 - result = fz_clamp(result, -FLT_MAX, FLT_MAX); 956 - return result; 957 - +#endif 958 - } 959 - 960 - /* 961 - diff --git a/source/pdf/pdf-annot.c b/source/pdf/pdf-annot.c 962 - index 4dfdf36fe..acff7d12a 100644 963 - --- a/source/pdf/pdf-annot.c 964 - +++ b/source/pdf/pdf-annot.c 965 - @@ -5,8 +5,20 @@ 966 - #include <string.h> 967 - #include <time.h> 968 - 969 - +/* willus mod--don't use _mkgmtime--not available in Win XP */ 970 - #ifdef _WIN32 971 - -#define timegm _mkgmtime 972 - +static time_t timegm(struct 
tm *date); 973 - +static time_t timegm(struct tm *date) 974 - + 975 - + { 976 - + time_t t,z; 977 - + struct tm gmz; 978 - + 979 - + z=(time_t)0; 980 - + gmz=(*gmtime(&z)); 981 - + t=mktime(date)-mktime(&gmz); 982 - + return(t); 983 - + } 984 - #endif 985 - 986 - #define isdigit(c) (c >= '0' && c <= '9') 987 - diff --git a/source/pdf/pdf-link.c b/source/pdf/pdf-link.c 988 - index 37444b471..613cc05b9 100644 989 - --- a/source/pdf/pdf-link.c 990 - +++ b/source/pdf/pdf-link.c 991 - @@ -345,6 +345,9 @@ pdf_resolve_link(fz_context *ctx, pdf_document *doc, const char *uri, float *xp, 992 - } 993 - return page; 994 - } 995 - +/* willus mod -- be quiet */ 996 - +/* 997 - fz_warn(ctx, "unknown link uri '%s'", uri); 998 - +*/ 999 - return -1; 1000 - } 1001 - diff --git a/source/pdf/pdf-parse.c b/source/pdf/pdf-parse.c 1002 - index 04a772204..9dd0cd898 100644 1003 - --- a/source/pdf/pdf-parse.c 1004 - +++ b/source/pdf/pdf-parse.c 1005 - @@ -663,9 +663,14 @@ pdf_parse_ind_obj(fz_context *ctx, pdf_document *doc, 1006 - if (c == '\r') 1007 - { 1008 - c = fz_peek_byte(ctx, file); 1009 - +/* willus mod -- no warning */ 1010 - +/* 1011 - if (c != '\n') 1012 - fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen); 1013 - else 1014 - +*/ 1015 - +if (c=='\n') 1016 - +/* willus mod -- end */ 1017 - fz_read_byte(ctx, file); 1018 - } 1019 - stm_ofs = fz_tell(ctx, file); 1020 - diff --git a/source/pdf/pdf-xref.c b/source/pdf/pdf-xref.c 1021 - index 8f888059b..08de7bfba 100644 1022 - --- a/source/pdf/pdf-xref.c 1023 - +++ b/source/pdf/pdf-xref.c 1024 - @@ -710,8 +710,11 @@ pdf_xref_size_from_old_trailer(fz_context *ctx, pdf_document *doc, pdf_lexbuf *b 1025 - if (!s) 1026 - fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection length missing"); 1027 - len = fz_atoi(fz_strsep(&s, " ")); 1028 - +/* willus mod -- no warning */ 1029 - +/* 1030 - if (len < 0) 1031 - fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection length must be positive"); 1032 - +*/ 1033 - 1034 - /* 
broken pdfs where the section is not on a separate line */ 1035 - if (s && *s != '\0') 1036 - @@ -1378,7 +1381,10 @@ pdf_init_document(fz_context *ctx, pdf_document *doc) 1037 - { 1038 - pdf_drop_xref_sections(ctx, doc); 1039 - fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); 1040 - +/* willus mod -- be quiet */ 1041 - +/* 1042 - fz_warn(ctx, "trying to repair broken xref"); 1043 - +*/ 1044 - repaired = 1; 1045 - } 1046 - 1047 - @@ -1506,7 +1512,10 @@ pdf_drop_document_imp(fz_context *ctx, pdf_document *doc) 1048 - /* Swallow error, but continue dropping */ 1049 - } 1050 - 1051 - +/* willu smod -- no pdf_drop_js */ 1052 - +/* 1053 - pdf_drop_js(ctx, doc->js); 1054 - +*/ 1055 - 1056 - pdf_drop_xref_sections(ctx, doc); 1057 - fz_free(ctx, doc->xref_index); 1058 - -- 1059 - 2.22.0 1060 -

-675

pkgs/applications/misc/k2pdfopt/tesseract.patch

··· 1 - From 39aa8502eee7bb669a29d1a9b3bfe5c9595ad960 Mon Sep 17 00:00:00 2001 2 - From: Daniel Fullmer <danielrf12@gmail.com> 3 - Date: Fri, 13 Sep 2019 13:45:05 -0400 4 - Subject: [PATCH] Willus mod changes from k2pdfopt 5 - 6 - --- 7 - src/api/Makefile.am | 1 + 8 - src/api/baseapi.cpp | 87 +++++++++++ 9 - src/api/baseapi.h | 3 + 10 - src/api/tesscapi.cpp | 311 +++++++++++++++++++++++++++++++++++++ 11 - src/api/tesseract.h | 29 ++++ 12 - src/ccmain/tessedit.cpp | 5 +- 13 - src/ccutil/ccutil.h | 7 + 14 - src/ccutil/genericvector.h | 21 ++- 15 - src/ccutil/mainblk.cpp | 17 +- 16 - src/ccutil/params.cpp | 3 +- 17 - src/ccutil/serialis.cpp | 3 + 18 - src/ccutil/serialis.h | 2 + 19 - src/lstm/input.cpp | 3 + 20 - 13 files changed, 488 insertions(+), 4 deletions(-) 21 - create mode 100644 src/api/tesscapi.cpp 22 - create mode 100644 src/api/tesseract.h 23 - 24 - diff --git a/src/api/Makefile.am b/src/api/Makefile.am 25 - index d9b76eb6..cd2dc30f 100644 26 - --- a/src/api/Makefile.am 27 - +++ b/src/api/Makefile.am 28 - @@ -39,6 +39,7 @@ libtesseract_api_la_SOURCES += lstmboxrenderer.cpp 29 - libtesseract_api_la_SOURCES += pdfrenderer.cpp 30 - libtesseract_api_la_SOURCES += wordstrboxrenderer.cpp 31 - libtesseract_api_la_SOURCES += renderer.cpp 32 - +libtesseract_api_la_SOURCES += tesscapi.cpp 33 - 34 - lib_LTLIBRARIES += libtesseract.la 35 - libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS) $(libarchive_LIBS) 36 - diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp 37 - index 9245d07c..ea964ee6 100644 38 - --- a/src/api/baseapi.cpp 39 - +++ b/src/api/baseapi.cpp 40 - @@ -215,6 +215,14 @@ TessBaseAPI::TessBaseAPI() 41 - // Use the current locale if building debug code. 
42 - std::locale::global(std::locale("")); 43 - #endif 44 - + const char *locale; 45 - + locale = std::setlocale(LC_ALL, nullptr); 46 - +/* willus mod Remove assertions--taken care of in tesscapi.cpp */ 47 - +// ASSERT_HOST(!strcmp(locale, "C")); 48 - + locale = std::setlocale(LC_CTYPE, nullptr); 49 - +// ASSERT_HOST(!strcmp(locale, "C")); 50 - + locale = std::setlocale(LC_NUMERIC, nullptr); 51 - +// ASSERT_HOST(!strcmp(locale, "C")); 52 - } 53 - 54 - TessBaseAPI::~TessBaseAPI() { 55 - @@ -1333,6 +1341,85 @@ static void AddBoxToTSV(const PageIterator* it, PageIteratorLevel level, 56 - text->add_str_int("\t", bottom - top); 57 - } 58 - 59 - +/* willus mod */ 60 - +int TessBaseAPI::GetOCRWords(int **x00,int **y00,int **x11,int **y11,int **ybaseline0, 61 - + char **utf8words) 62 - + 63 - + { 64 - + int iword,nwords,totlen,it8; 65 - + int *x0,*y0,*x1,*y1,*ybaseline; 66 - + char *tutf8; 67 - + 68 - + ResultIterator *res_it = GetIterator(); 69 - + /* Count words */ 70 - + iword=0; 71 - + totlen=0; 72 - + while (!res_it->Empty(RIL_BLOCK)) 73 - + { 74 - + if (res_it->Empty(RIL_WORD)) 75 - + { 76 - + res_it->Next(RIL_WORD); 77 - + continue; 78 - + } 79 - + iword++; 80 - + STRING textstr=std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get(); 81 - + totlen+=strlen(textstr.string())+1; 82 - + res_it->Next(RIL_WORD); 83 - + } 84 - + nwords=iword; 85 - +/* 86 - +printf("\nnwords=%d, totlen=%d\n",nwords,totlen); 87 - +*/ 88 - + x0=(*x00)=(int *)malloc(sizeof(int)*5*nwords); 89 - + y0=(*y00)=&x0[nwords]; 90 - + x1=(*x11)=&y0[nwords]; 91 - + y1=(*y11)=&x1[nwords]; 92 - + ybaseline=(*ybaseline0)=&y1[nwords]; 93 - + tutf8=(*utf8words)=(char *)malloc(totlen); 94 - + iword=0; 95 - + it8=0; 96 - + res_it->Begin(); 97 - + while (!res_it->Empty(RIL_BLOCK)) 98 - + { 99 - + if (res_it->Empty(RIL_WORD)) 100 - + { 101 - + res_it->Next(RIL_WORD); 102 - + continue; 103 - + } 104 - + STRING textstr=std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get(); 105 - + 
strcpy(&tutf8[it8],textstr.string()); 106 - + it8 += strlen(&tutf8[it8])+1; 107 - + /* 108 - + STRING textstr(""); 109 - + textstr += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get(); 110 - + */ 111 - +/* 112 - +printf("Word %d: '%s'\n",iword,textstr.string()); 113 - +*/ 114 - + int left, top, right, bottom; 115 - + int u1,v1,u2,v2; 116 - + res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); 117 - + res_it->Baseline(RIL_WORD, &u1, &v1, &u2, &v2); 118 - + x0[iword]=left; 119 - + x1[iword]=right; 120 - + y0[iword]=top; 121 - + y1[iword]=bottom; 122 - + ybaseline[iword]=(v1+v2)/2; 123 - + iword++; 124 - +/* 125 - +printf("BB: (%d,%d)-(%d,%d) BL: (%d,%d)-(%d,%d)\n",left,bottom,right,top,x1,y1,x2,y2); 126 - +*/ 127 - + res_it->Next(RIL_WORD); 128 - + } 129 - +/* 130 - +printf("iword=%d\n",iword); 131 - +*/ 132 - + return(iword); 133 - + } 134 - + 135 - +/* willus mod */ 136 - +int GetOCRWords(int **x0,int **y0,int **x1,int **y1,int **ybaseline,char **utf8words); 137 - + 138 - /** 139 - * Make a TSV-formatted string from the internal data structures. 140 - * page_number is 0-based but will appear in the output as 1-based. 141 - diff --git a/src/api/baseapi.h b/src/api/baseapi.h 142 - index 3724dd92..23be5920 100644 143 - --- a/src/api/baseapi.h 144 - +++ b/src/api/baseapi.h 145 - @@ -575,6 +575,9 @@ class TESS_API TessBaseAPI { 146 - */ 147 - char* GetHOCRText(ETEXT_DESC* monitor, int page_number); 148 - 149 - +/* willus mod */ 150 - +int GetOCRWords(int **x0,int **y0,int **x1,int **y1,int **ybaseline,char **utf8words); 151 - + 152 - /** 153 - * Make a HTML-formatted string with hOCR markup from the internal 154 - * data structures. 155 - diff --git a/src/api/tesscapi.cpp b/src/api/tesscapi.cpp 156 - new file mode 100644 157 - index 00000000..1752fafe 158 - --- /dev/null 159 - +++ b/src/api/tesscapi.cpp 160 - @@ -0,0 +1,311 @@ 161 - +/* 162 - +** tesscapi.cpp willus.com attempt at C wrapper for tesseract. 
163 - +** (Butchered from tesseractmain.cpp) 164 - +** Last udpated 9-1-12 165 - +** 166 - +** Copyright (C) 2012 http://willus.com 167 - +** 168 - +** This program is free software: you can redistribute it and/or modify 169 - +** it under the terms of the GNU Affero General Public License as 170 - +** published by the Free Software Foundation, either version 3 of the 171 - +** License, or (at your option) any later version. 172 - +** 173 - +** This program is distributed in the hope that it will be useful, 174 - +** but WITHOUT ANY WARRANTY; without even the implied warranty of 175 - +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 176 - +** GNU Affero General Public License for more details. 177 - +** 178 - +** You should have received a copy of the GNU Affero General Public License 179 - +** along with this program. If not, see <http://www.gnu.org/licenses/>. 180 - +** 181 - +*/ 182 - + 183 - +/* 184 - +#include "mfcpch.h" 185 - +*/ 186 - +// #define USE_VLD //Uncomment for Visual Leak Detector. 
187 - +#if (defined _MSC_VER && defined USE_VLD) 188 - +#include <vld.h> 189 - +#endif 190 - + 191 - +// Include automatically generated configuration file if running autoconf 192 - +#ifdef HAVE_CONFIG_H 193 - +#include "config_auto.h" 194 - +#endif 195 - +#include <locale.h> 196 - +#ifdef USING_GETTEXT 197 - +#include <libintl.h> 198 - +#define _(x) gettext(x) 199 - +#else 200 - +#define _(x) (x) 201 - +#endif 202 - + 203 - +#include "allheaders.h" 204 - +#include "baseapi.h" 205 - +#include "strngs.h" 206 - +#include "params.h" 207 - +#include "blobs.h" 208 - +#include "simddetect.h" 209 - +#include "tesseractclass.h" 210 - +/* 211 - +#include "notdll.h" 212 - +*/ 213 - + 214 - +/* C Wrappers */ 215 - +#include "tesseract.h" 216 - + 217 - +// static tesseract::TessBaseAPI api[4]; 218 - + 219 - +/* 220 - +** ocr_type=0: OEM_DEFAULT 221 - +** ocr_type=1: OEM_TESSERACT_ONLY 222 - +** ocr_type=2: OEM_LSTM_ONLY 223 - +** ocr_type=3: OEM_TESSERACT_LSTM_COMBINED 224 - +*/ 225 - +void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out, 226 - + char *initstr,int maxlen,int *status) 227 - + 228 - + { 229 - + char original_locale[256]; 230 - + tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI; 231 - +/* 232 - +printf("@tess_capi_init\n"); 233 - +printf(" datapath='%s'\n",datapath); 234 - +printf(" language='%s'\n",language); 235 - +printf(" ocr_type=%d\n",ocr_type); 236 - +*/ 237 - +#ifdef USE_NLS 238 - + setlocale (LC_ALL, ""); 239 - + bindtextdomain (PACKAGE, LOCALEDIR); 240 - + textdomain (PACKAGE); 241 - +#endif 242 - + /* willus mod, 11-24-16 */ 243 - + /* Tesseract needs "C" locale to correctly parse all data .traineddata files. 
*/ 244 - +/* 245 - +printf("locale='%s'\n",setlocale(LC_ALL,NULL)); 246 - +printf("ctype='%s'\n",setlocale(LC_CTYPE,NULL)); 247 - +printf("numeric='%s'\n",setlocale(LC_NUMERIC,NULL)); 248 - +*/ 249 - + strncpy(original_locale,setlocale(LC_ALL,NULL),255); 250 - + original_locale[255]='\0'; 251 - +/* 252 - +printf("original_locale='%s'\n",original_locale); 253 - +*/ 254 - + setlocale(LC_ALL,"C"); 255 - +/* 256 - +printf("new locale='%s'\n",setlocale(LC_ALL,NULL)); 257 - +printf("new ctype='%s'\n",setlocale(LC_CTYPE,NULL)); 258 - +printf("new numeric='%s'\n",setlocale(LC_NUMERIC,NULL)); 259 - +*/ 260 - + // fprintf(stderr, "tesseract %s\n", tesseract::TessBaseAPI::Version()); 261 - + // Make the order of args a bit more forgiving than it used to be. 262 - + const char* lang = "eng"; 263 - + tesseract::PageSegMode pagesegmode = tesseract::PSM_SINGLE_BLOCK; 264 - + if (language!=NULL && language[0]!='\0') 265 - + lang = language; 266 - + /* 267 - + if (output == NULL) 268 - + { 269 - + fprintf(stderr, _("Usage:%s imagename outputbase [-l lang] " 270 - + "[-psm pagesegmode] [configfile...]\n"), argv[0]); 271 - + fprintf(stderr, 272 - + _("pagesegmode values are:\n" 273 - + "0 = Orientation and script detection (OSD) only.\n" 274 - + "1 = Automatic page segmentation with OSD.\n" 275 - + "2 = Automatic page segmentation, but no OSD, or OCR\n" 276 - + "3 = Fully automatic page segmentation, but no OSD. 
(Default)\n" 277 - + "4 = Assume a single column of text of variable sizes.\n" 278 - + "5 = Assume a single uniform block of vertically aligned text.\n" 279 - + "6 = Assume a single uniform block of text.\n" 280 - + "7 = Treat the image as a single text line.\n" 281 - + "8 = Treat the image as a single word.\n" 282 - + "9 = Treat the image as a single word in a circle.\n" 283 - + "10 = Treat the image as a single character.\n")); 284 - + fprintf(stderr, _("-l lang and/or -psm pagesegmode must occur before any" 285 - + "configfile.\n")); 286 - + exit(1); 287 - + } 288 - + */ 289 - +/* 290 - +printf("SSE = %s\n",SIMDDetect::IsSSEAvailable() ? "AVAILABLE" : "NOT AVAILABLE"); 291 - +printf("AVX = %s\n",SIMDDetect::IsAVXAvailable() ? "AVAILABLE" : "NOT AVAILABLE"); 292 - +*/ 293 - +/* 294 - +v4.00 loads either TESSERACT enginer, LSTM engine, or both. No CUBE. 295 - +*/ 296 - + ocr_type=0; /* Ignore specified and use default */ 297 - + api->SetOutputName(NULL); 298 - + (*status)=api->Init(datapath,lang, 299 - + ocr_type==0 ? tesseract::OEM_DEFAULT : 300 - + (ocr_type==1 ? tesseract::OEM_TESSERACT_ONLY : 301 - + (ocr_type==2 ? tesseract::OEM_LSTM_ONLY : 302 - + (tesseract::OEM_TESSERACT_LSTM_COMBINED)))); 303 - + if ((*status)!=0) 304 - + { 305 - + /* willus mod, 11-24-16 */ 306 - + setlocale(LC_ALL,original_locale); 307 - + api->End(); 308 - + delete api; 309 - + return(NULL); 310 - + } 311 - + /* 312 - + api.Init("tesscapi",lang,tesseract::OEM_DEFAULT, 313 - + &(argv[arg]), argc - arg, NULL, NULL, false); 314 - + */ 315 - + // We have 2 possible sources of pagesegmode: a config file and 316 - + // the command line. For backwards compatability reasons, the 317 - + // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the 318 - + // default for this program is tesseract::PSM_AUTO. 
We will let 319 - + // the config file take priority, so the command-line default 320 - + // can take priority over the tesseract default, so we use the 321 - + // value from the command line only if the retrieved mode 322 - + // is still tesseract::PSM_SINGLE_BLOCK, indicating no change 323 - + // in any config file. Therefore the only way to force 324 - + // tesseract::PSM_SINGLE_BLOCK is from the command line. 325 - + // It would be simpler if we could set the value before Init, 326 - + // but that doesn't work. 327 - + if (api->GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK) 328 - + api->SetPageSegMode(pagesegmode); 329 - + 330 - + /* 331 - + ** Initialization message 332 - + */ 333 - + { 334 - + char istr[1024]; 335 - + int sse,avx; 336 - + 337 - +// printf("tessedit_ocr_engine_mode = %d\n",tessedit_ocr_engine_mode); 338 - + sprintf(istr,"%s",api->Version()); 339 - + sse=tesseract::SIMDDetect::IsSSEAvailable(); 340 - + avx=tesseract::SIMDDetect::IsAVXAvailable(); 341 - + if (sse || avx) 342 - + sprintf(&istr[strlen(istr)]," [%s]",sse&&avx?"SSE+AVX":(sse?"SSE":"AVX")); 343 - + sprintf(&istr[strlen(istr)],"\n Tesseract data folder = '%s'",datapath==NULL?getenv("TESSDATA_PREFIX"):datapath); 344 - + strcat(istr,"\n Tesseract languages: "); 345 - + GenericVector<STRING> languages; 346 - + api->GetLoadedLanguagesAsVector(&languages); 347 - +/* 348 - +printf("OEM=%d\n",api->oem()); 349 - +printf("Langs='%s'\n",api->GetInitLanguagesAsString()); 350 - +printf("AnyTessLang()=%d\n",(int)api->tesseract()->AnyTessLang()); 351 - +printf("AnyLSTMLang()=%d\n",(int)api->tesseract()->AnyLSTMLang()); 352 - +printf("num_sub_langs()=%d\n",api->tesseract()->num_sub_langs()); 353 - +printf("languages.size()=%d\n",(int)languages.size()); 354 - +*/ 355 - + 356 - + for (int i=0;i<=api->tesseract()->num_sub_langs();i++) 357 - + { 358 - + tesseract::Tesseract *lang1; 359 - + int eng; 360 - + lang1 = i==0 ? 
api->tesseract() : api->tesseract()->get_sub_lang(i-1); 361 - + eng=(int)lang1->tessedit_ocr_engine_mode; 362 - + sprintf(&istr[strlen(istr)],"%s%s [%s]",i==0?"":", ",lang1->lang.string(), 363 - + eng==2?"LSTM+Tess":(eng==1?"LSTM":"Tess")); 364 - + } 365 - +/* 366 - +printf("%d. '%s'\n",i+1,languages[i].string()); 367 - +printf(" sublang[%d].oem_engine = %d\n",i+1,(int)api->tesseract()->get_sub_lang(i)->tessedit_ocr_engine_mode); 368 - +*/ 369 - + 370 - + /* 371 - + if (ocr_type==0 || ocr_type==3) 372 - + sprintf(&istr[strlen(istr)],"[LSTM+] (lang="); 373 - + else if (ocr_type==2) 374 - + sprintf(&istr[strlen(istr)],"[LSTM] (lang="); 375 - + strncpy(&istr[strlen(istr)],language,253-strlen(istr)); 376 - + istr[253]='\0'; 377 - + strcat(istr,")"); 378 - + */ 379 - + if (out!=NULL) 380 - + fprintf(out,"%s\n",istr); 381 - + if (initstr!=NULL) 382 - + { 383 - + strncpy(initstr,istr,maxlen-1); 384 - + initstr[maxlen-1]='\0'; 385 - + } 386 - + } 387 - + 388 - + 389 - + /* Turn off LSTM debugging output */ 390 - + api->SetVariable("lstm_debug_level","0"); 391 - +#if (WILLUSDEBUG & 1) 392 - + api->SetVariable("lstm_debug_level","9"); 393 - + api->SetVariable("paragraph_debug_level","9"); 394 - + api->SetVariable("tessdata_manager_debug_level","9"); 395 - + api->SetVariable("tosp_debug_level","9"); 396 - + api->SetVariable("wordrec_debug_level","9"); 397 - + api->SetVariable("segsearch_debug_level","9"); 398 - +#endif 399 - + /* willus mod, 11-24-16 */ 400 - + setlocale(LC_ALL,original_locale); 401 - + return((void *)api); 402 - + } 403 - + 404 - + 405 - +int tess_capi_get_ocr(void *vapi,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out) 406 - + 407 - + { 408 - + tesseract::TessBaseAPI *api; 409 - + static int old_segmode=-1; 410 - + 411 - + api=(tesseract::TessBaseAPI *)vapi; 412 - + if (old_segmode != segmode) 413 - + { 414 - + old_segmode=segmode; 415 - + api->SetPageSegMode((tesseract::PageSegMode)segmode); 416 - + } 417 - + if 
(!api->ProcessPage(pix,0,NULL,NULL,0,NULL)) 418 - + { 419 - + /* pixDestroy(&pix); */ 420 - + if (out!=NULL) 421 - + fprintf(out,"tesscapi: Error during bitmap processing.\n"); 422 - + api->Clear(); 423 - + return(-1); 424 - + } 425 - + strncpy(outstr,api->GetUTF8Text(),maxlen-1); 426 - + outstr[maxlen-1]='\0'; 427 - + api->Clear(); 428 - + return(0); 429 - + } 430 - + 431 - + 432 - +int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode, 433 - + int **left,int **top,int **right,int **bottom, 434 - + int **ybase,char **text,int *nw, 435 - + FILE *out) 436 - + 437 - + { 438 - + tesseract::TessBaseAPI *api; 439 - + static int old_segmode=-1; 440 - + 441 - + api=(tesseract::TessBaseAPI *)vapi; 442 - + if (old_segmode != segmode) 443 - + { 444 - + old_segmode=segmode; 445 - + api->SetPageSegMode((tesseract::PageSegMode)segmode); 446 - + } 447 - + if (!api->ProcessPage(pix,0,NULL,NULL,0,NULL)) 448 - + { 449 - + if (out!=NULL) 450 - + fprintf(out,"tesscapi: Error during bitmap processing.\n"); 451 - + api->Clear(); 452 - + (*nw)=0; 453 - + return(-1); 454 - + } 455 - + (*nw)=api->GetOCRWords(left,top,right,bottom,ybase,text); 456 - + api->Clear(); 457 - + return(0); 458 - + } 459 - + 460 - + 461 - +void tess_capi_end(void *vapi) 462 - + 463 - + { 464 - + tesseract::TessBaseAPI *api; 465 - + 466 - + if (vapi==NULL) 467 - + return; 468 - + api=(tesseract::TessBaseAPI *)vapi; 469 - + api->End(); 470 - + delete api; 471 - + } 472 - diff --git a/src/api/tesseract.h b/src/api/tesseract.h 473 - new file mode 100644 474 - index 00000000..575948cc 475 - --- /dev/null 476 - +++ b/src/api/tesseract.h 477 - @@ -0,0 +1,29 @@ 478 - +/* 479 - +** Willus.com's Tesseract C Wrappers 480 - +** 481 - +** 6-8-12 482 - +** 483 - +*/ 484 - + 485 - +#ifndef _TESSERACT_H_ 486 - +#define _TESSERACT_H_ 487 - + 488 - +//#include <leptonica.h> 489 - +#ifdef __cplusplus 490 - +extern "C" { 491 - +#endif 492 - + 493 - +void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out, 
494 - + char *initstr,int maxlen,int *status); 495 - +int tess_capi_get_ocr(void *api,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out); 496 - +int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode, 497 - + int **left,int **top,int **right,int **bottom, 498 - + int **ybase,char **text,int *nw, 499 - + FILE *out); 500 - +void tess_capi_end(void *api); 501 - + 502 - +#ifdef __cplusplus 503 - +} 504 - +#endif 505 - + 506 - +#endif 507 - diff --git a/src/ccmain/tessedit.cpp b/src/ccmain/tessedit.cpp 508 - index 17f0951b..7af94ee2 100644 509 - --- a/src/ccmain/tessedit.cpp 510 - +++ b/src/ccmain/tessedit.cpp 511 - @@ -101,6 +101,10 @@ bool Tesseract::init_tesseract_lang_data( 512 - " to your \"tessdata\" directory.\n"); 513 - return false; 514 - } 515 - + /* willus mod */ 516 - + TFile fp; 517 - + strncpy(fp.tfile_filename,tessdata_path.string(),511); 518 - + fp.tfile_filename[511]='\0'; 519 - #ifndef DISABLED_LEGACY_ENGINE 520 - if (oem == OEM_DEFAULT) { 521 - // Set the engine mode from availability, which can then be overridden by 522 - @@ -116,7 +120,6 @@ bool Tesseract::init_tesseract_lang_data( 523 - #endif // ndef DISABLED_LEGACY_ENGINE 524 - 525 - // If a language specific config file (lang.config) exists, load it in. 526 - - TFile fp; 527 - if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) { 528 - ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp, 529 - this->params()); 530 - diff --git a/src/ccutil/ccutil.h b/src/ccutil/ccutil.h 531 - index 71e89c60..bdeccc14 100644 532 - --- a/src/ccutil/ccutil.h 533 - +++ b/src/ccutil/ccutil.h 534 - @@ -80,6 +80,13 @@ class CCUtil { 535 - // Member parameters. 536 - // These have to be declared and initialized after params_ member, since 537 - // params_ should be initialized before parameters are added to it. 
538 - +/* willus mod */ 539 - +/* 540 - + #ifdef _WIN32 541 - + STRING_VAR_H(tessedit_module_name, WINDLLNAME, 542 - + "Module colocated with tessdata dir"); 543 - + #endif 544 - +*/ 545 - INT_VAR_H(ambigs_debug_level, 0, "Debug level for unichar ambiguities"); 546 - BOOL_VAR_H(use_definite_ambigs_for_classifier, false, 547 - "Use definite ambiguities when running character classifier"); 548 - diff --git a/src/ccutil/genericvector.h b/src/ccutil/genericvector.h 549 - index 3556d153..3a5e8662 100644 550 - --- a/src/ccutil/genericvector.h 551 - +++ b/src/ccutil/genericvector.h 552 - @@ -382,7 +382,26 @@ inline bool LoadDataFromFile(const char* filename, GenericVector<char>* data) { 553 - // reserve an extra byte in case caller wants to append a '\0' character 554 - data->reserve(size + 1); 555 - data->resize_no_init(size); 556 - - result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size; 557 - + /* willus mod Dec 2018--weird issue with Win XP and MinGW gcc 7.3.0 */ 558 - + /* Can't read entire file at once -- need to break up into smaller blocksize reads */ 559 - + { 560 - + int frs,n; 561 - + int blocksize; 562 - + blocksize=1024*1024; 563 - + for (n=0;1;) 564 - + { 565 - + int bs; 566 - + bs= size-n > blocksize ? 
blocksize : size-n; 567 - + frs=(int)fread(&(*data)[n],1,bs,fp); 568 - + n+=frs; 569 - + if (frs<bs || bs<blocksize || n>=size) 570 - + break; 571 - + } 572 - + result = static_cast<long>((long)n==size); 573 - + } 574 - + /* 575 - + result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size; 576 - + */ 577 - } 578 - fclose(fp); 579 - } 580 - diff --git a/src/ccutil/mainblk.cpp b/src/ccutil/mainblk.cpp 581 - index 52b04b04..80b26044 100644 582 - --- a/src/ccutil/mainblk.cpp 583 - +++ b/src/ccutil/mainblk.cpp 584 - @@ -55,8 +55,22 @@ void CCUtil::main_setup(const char *argv0, const char *basename) { 585 - #if defined(_WIN32) 586 - } else if (datadir == nullptr || _access(datadir.string(), 0) != 0) { 587 - /* Look for tessdata in directory of executable. */ 588 - + /* 589 - + char drive[_MAX_DRIVE]; 590 - + char dir[_MAX_DIR]; 591 - + */ 592 - char path[_MAX_PATH]; 593 - - DWORD length = GetModuleFileName(nullptr, path, sizeof(path)); 594 - + int i; 595 - + /* DWORD length = */ GetModuleFileName(nullptr, path, sizeof(path)); 596 - + /* willus mod--avoid _splitpath_s -- not in XP */ 597 - + for (i=strlen(path)-1;i>=0 && path[i]!='/' && path[i]!='\\';i--); 598 - + if (i>=0) 599 - + { 600 - + path[i]='\0'; 601 - + datadir=path; 602 - + datadir += "/tessdata"; 603 - + } 604 - + /* 605 - if (length > 0 && length < sizeof(path)) { 606 - char* separator = std::strrchr(path, '\\'); 607 - if (separator != nullptr) { 608 - @@ -65,6 +79,7 @@ void CCUtil::main_setup(const char *argv0, const char *basename) { 609 - datadir += "/tessdata"; 610 - } 611 - } 612 - + */ 613 - #endif /* _WIN32 */ 614 - #if defined(TESSDATA_PREFIX) 615 - } else { 616 - diff --git a/src/ccutil/params.cpp b/src/ccutil/params.cpp 617 - index 00bf2563..486c5ce0 100644 618 - --- a/src/ccutil/params.cpp 619 - +++ b/src/ccutil/params.cpp 620 - @@ -82,7 +82,8 @@ bool ParamUtils::ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, 621 - 622 - if (!foundit) { 623 - anyerr = true; // had an error 
624 - - tprintf("Warning: Parameter not found: %s\n", line); 625 - + /* willus mod */ 626 - + tprintf("Tesseract warning: Parameter %s not found in file %s.\n",line,fp->tfile_filename); 627 - } 628 - } 629 - } 630 - diff --git a/src/ccutil/serialis.cpp b/src/ccutil/serialis.cpp 631 - index 7def011f..6107a494 100644 632 - --- a/src/ccutil/serialis.cpp 633 - +++ b/src/ccutil/serialis.cpp 634 - @@ -201,6 +201,9 @@ bool TFile::Open(const STRING& filename, FileReader reader) { 635 - offset_ = 0; 636 - is_writing_ = false; 637 - swap_ = false; 638 - + /* willus mod */ 639 - + strncpy(tfile_filename,filename.string(),511); 640 - + tfile_filename[511]='\0'; 641 - if (reader == nullptr) 642 - return LoadDataFromFile(filename, data_); 643 - else 644 - diff --git a/src/ccutil/serialis.h b/src/ccutil/serialis.h 645 - index 095b9227..4cc8251e 100644 646 - --- a/src/ccutil/serialis.h 647 - +++ b/src/ccutil/serialis.h 648 - @@ -77,6 +77,8 @@ class TFile { 649 - public: 650 - TFile(); 651 - ~TFile(); 652 - + /* willus mod */ 653 - + char tfile_filename[512]; 654 - 655 - // All the Open methods load the whole file into memory for reading. 656 - // Opens a file with a supplied reader, or nullptr to use the default. 657 - diff --git a/src/lstm/input.cpp b/src/lstm/input.cpp 658 - index 73b584b3..0b0b54c3 100644 659 - --- a/src/lstm/input.cpp 660 - +++ b/src/lstm/input.cpp 661 - @@ -93,8 +93,11 @@ Pix* Input::PrepareLSTMInputs(const ImageData& image_data, 662 - return nullptr; 663 - } 664 - if (width < min_width || height < min_width) { 665 - + /* willus mod -- no warning */ 666 - + /* 667 - tprintf("Image too small to scale!! (%dx%d vs min width of %d)\n", width, 668 - height, min_width); 669 - + */ 670 - pixDestroy(&pix); 671 - return nullptr; 672 - } 673 - -- 674 - 2.22.0 675 -