Loading SimpleOCR.code-workspace +1 −1 Original line number Diff line number Diff line Loading @@ -10,7 +10,7 @@ "tasks": [ { "label": "Base: Build all codeunits", "command": "scbuildcodeunits", "command": "scbuildcodeunits {{.CLI_ARGS}}", "type": "shell", "options": { "cwd": "${workspaceFolder}" Loading SimpleOCRLibrary/Other/CommonTasks.py +0 −2 Original line number Diff line number Diff line import os from ScriptCollection.GeneralUtilities import GeneralUtilities from ScriptCollection.TFCPS.DotNet.TFCPS_CodeUnitSpecific_DotNet import TFCPS_CodeUnitSpecific_DotNet_Functions,TFCPS_CodeUnitSpecific_DotNet_CLI from ScriptCollection.TFCPS.DotNet.CertificateGeneratorInformationNoGenerate import CertificateGeneratorInformationNoGenerate Loading SimpleOCRLibrary/SimpleOCRLibrary/OCRService.cs +12 −50 Original line number Diff line number Diff line Loading @@ -6,6 +6,7 @@ using SimpleOCR.Library.Core.VIsitors; using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Text.RegularExpressions; using Tesseract; Loading @@ -22,7 +23,7 @@ namespace SimpleOCR.Library.Core /// <remarks><see cref="OCRService"/> takes care of the download itself. The only requirement is that git is available as command.</remarks> public OCRService(string dataFolder, IGRYLog log) { this._DataFolder = dataFolder; this._DataFolder = dataFolder.Replace('\\', '/'); this._Log = log; } public string GetOCRContent(byte[] fileContent, ISet<string> languages) Loading @@ -44,7 +45,7 @@ namespace SimpleOCR.Library.Core throw new BadRequestException($"Language '{language}' is not supported."); } } string dataPath = this.GetTessDataPath(); string dataPath = _DataFolder; using TesseractEngine engine = new TesseractEngine(dataPath, languagesConcatenated, EngineMode.Default); using MemoryStream ms = new MemoryStream(fileContent); using Pix img = Pix.LoadFromMemory(ms.ToArray()); Loading @@ -63,7 +64,8 @@ namespace SimpleOCR.Library.Core { var result = new HashSet<string>(); string pattern = @"([a-z][a-z][a-z])\.traineddata"; var files = Directory.GetFiles(this.GetTessDataPath()); string tessdataFolder = _DataFolder; var files = Directory.GetFiles(tessdataFolder); foreach (var file in files) { string filename = Path.GetFileName(file); Loading @@ -80,49 +82,13 @@ namespace SimpleOCR.Library.Core { try { GRYLibrary.Core.Misc.Utilities.AssertCondition(!string.IsNullOrEmpty(_DataFolder),"No OCR-data-folder set."); this._Log.Log($"OCRFolder: {this._DataFolder}"); string tessdataFolder = _DataFolder; GRYLibrary.Core.Misc.Utilities.AssertCondition(!string.IsNullOrEmpty(tessdataFolder), "No OCR-data-folder set."); this._Log.Log($"OCRFolder: {tessdataFolder}"); if (!this.IsInitialized) { this._Log.Log("Initialize OCR-data"); GRYLibrary.Core.Misc.Utilities.EnsureDirectoryExists(this._DataFolder); string tessdataFolder = this.GetTessDataPath(); if (!Directory.Exists(tessdataFolder)) { string repoOwner = "tesseract-ocr"; string repoName = "tessdata_best"; this._Log.Log("Download OCR-data..."); GRYLibrary.Core.Misc.Utilities.EnsureDirectoryExists(tessdataFolder); using (var e = new ExternalProgramExecutor(new ExternalProgramExecutorConfiguration() { Program = "git", Argument = $"clone --recurse-submodules https://github.com/{repoOwner}/{repoName} {tessdataFolder}", })) { e.LogObject = this._Log; e.Run(); } GRYLibrary.Core.Misc.Utilities.ForEachFileAndDirectoryTransitively(tessdataFolder, (string path, object _) => { path = path.Replace("\\", "/"); if (path.EndsWith("/.git")) { GRYLibrary.Core.Misc.Utilities.EnsureDirectoryDoesNotExist(path); } }, (string path, object _) => { path = path.Replace("\\", "/"); if (path.EndsWith("/.git")) { GRYLibrary.Core.Misc.Utilities.EnsureFileDoesNotExist(path); } }, false); GRYLibrary.Core.Misc.Utilities.EnsureDirectoryDoesNotExist(Path.Combine(tessdataFolder, ".git")); } this._Log.Log("Finished initialization of OCR-data..."); var traineddataFiles = Directory.GetFiles(_DataFolder).Where(file => file.EndsWith(".traineddata")); GRYLibrary.Core.Misc.Utilities.AssertCondition(1 < traineddataFiles.Count(), "Expected multiple *.traineddata-files."); this.IsInitialized = true; } } Loading @@ -132,14 +98,10 @@ namespace SimpleOCR.Library.Core } } private string GetTessDataPath() { return Path.Combine(this._DataFolder, "tessdata_best").Replace('\\', '/'); } public void ReInitialize() { GRYLibrary.Core.Misc.Utilities.EnsureDirectoryDoesNotExist(this.GetTessDataPath()); GRYLibrary.Core.Misc.Utilities.EnsureDirectoryDoesNotExist(_DataFolder); this.IsInitialized = false; this.Initialize(); } Loading Taskfile.yml +1 −1 Original line number Diff line number Diff line Loading @@ -7,7 +7,7 @@ tasks: silent: true dir: "." cmds: - "scbuildcodeunits" - "scbuildcodeunits {{.CLI_ARGS}}" aliases: - basebuildallcodeunits - bb Loading Loading
SimpleOCR.code-workspace +1 −1 Original line number Diff line number Diff line Loading @@ -10,7 +10,7 @@ "tasks": [ { "label": "Base: Build all codeunits", "command": "scbuildcodeunits", "command": "scbuildcodeunits {{.CLI_ARGS}}", "type": "shell", "options": { "cwd": "${workspaceFolder}" Loading
SimpleOCRLibrary/Other/CommonTasks.py +0 −2 Original line number Diff line number Diff line import os from ScriptCollection.GeneralUtilities import GeneralUtilities from ScriptCollection.TFCPS.DotNet.TFCPS_CodeUnitSpecific_DotNet import TFCPS_CodeUnitSpecific_DotNet_Functions,TFCPS_CodeUnitSpecific_DotNet_CLI from ScriptCollection.TFCPS.DotNet.CertificateGeneratorInformationNoGenerate import CertificateGeneratorInformationNoGenerate Loading
SimpleOCRLibrary/SimpleOCRLibrary/OCRService.cs +12 −50 Original line number Diff line number Diff line Loading @@ -6,6 +6,7 @@ using SimpleOCR.Library.Core.VIsitors; using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Text.RegularExpressions; using Tesseract; Loading @@ -22,7 +23,7 @@ namespace SimpleOCR.Library.Core /// <remarks><see cref="OCRService"/> takes care of the download itself. The only requirement is that git is available as command.</remarks> public OCRService(string dataFolder, IGRYLog log) { this._DataFolder = dataFolder; this._DataFolder = dataFolder.Replace('\\', '/'); this._Log = log; } public string GetOCRContent(byte[] fileContent, ISet<string> languages) Loading @@ -44,7 +45,7 @@ namespace SimpleOCR.Library.Core throw new BadRequestException($"Language '{language}' is not supported."); } } string dataPath = this.GetTessDataPath(); string dataPath = _DataFolder; using TesseractEngine engine = new TesseractEngine(dataPath, languagesConcatenated, EngineMode.Default); using MemoryStream ms = new MemoryStream(fileContent); using Pix img = Pix.LoadFromMemory(ms.ToArray()); Loading @@ -63,7 +64,8 @@ namespace SimpleOCR.Library.Core { var result = new HashSet<string>(); string pattern = @"([a-z][a-z][a-z])\.traineddata"; var files = Directory.GetFiles(this.GetTessDataPath()); string tessdataFolder = _DataFolder; var files = Directory.GetFiles(tessdataFolder); foreach (var file in files) { string filename = Path.GetFileName(file); Loading @@ -80,49 +82,13 @@ namespace SimpleOCR.Library.Core { try { GRYLibrary.Core.Misc.Utilities.AssertCondition(!string.IsNullOrEmpty(_DataFolder),"No OCR-data-folder set."); this._Log.Log($"OCRFolder: {this._DataFolder}"); string tessdataFolder = _DataFolder; GRYLibrary.Core.Misc.Utilities.AssertCondition(!string.IsNullOrEmpty(tessdataFolder), "No OCR-data-folder set."); this._Log.Log($"OCRFolder: {tessdataFolder}"); if (!this.IsInitialized) { this._Log.Log("Initialize OCR-data"); GRYLibrary.Core.Misc.Utilities.EnsureDirectoryExists(this._DataFolder); string tessdataFolder = this.GetTessDataPath(); if (!Directory.Exists(tessdataFolder)) { string repoOwner = "tesseract-ocr"; string repoName = "tessdata_best"; this._Log.Log("Download OCR-data..."); GRYLibrary.Core.Misc.Utilities.EnsureDirectoryExists(tessdataFolder); using (var e = new ExternalProgramExecutor(new ExternalProgramExecutorConfiguration() { Program = "git", Argument = $"clone --recurse-submodules https://github.com/{repoOwner}/{repoName} {tessdataFolder}", })) { e.LogObject = this._Log; e.Run(); } GRYLibrary.Core.Misc.Utilities.ForEachFileAndDirectoryTransitively(tessdataFolder, (string path, object _) => { path = path.Replace("\\", "/"); if (path.EndsWith("/.git")) { GRYLibrary.Core.Misc.Utilities.EnsureDirectoryDoesNotExist(path); } }, (string path, object _) => { path = path.Replace("\\", "/"); if (path.EndsWith("/.git")) { GRYLibrary.Core.Misc.Utilities.EnsureFileDoesNotExist(path); } }, false); GRYLibrary.Core.Misc.Utilities.EnsureDirectoryDoesNotExist(Path.Combine(tessdataFolder, ".git")); } this._Log.Log("Finished initialization of OCR-data..."); var traineddataFiles = Directory.GetFiles(_DataFolder).Where(file => file.EndsWith(".traineddata")); GRYLibrary.Core.Misc.Utilities.AssertCondition(1 < traineddataFiles.Count(), "Expected multiple *.traineddata-files."); this.IsInitialized = true; } } Loading @@ -132,14 +98,10 @@ namespace SimpleOCR.Library.Core } } private string GetTessDataPath() { return Path.Combine(this._DataFolder, "tessdata_best").Replace('\\', '/'); } public void ReInitialize() { GRYLibrary.Core.Misc.Utilities.EnsureDirectoryDoesNotExist(this.GetTessDataPath()); GRYLibrary.Core.Misc.Utilities.EnsureDirectoryDoesNotExist(_DataFolder); this.IsInitialized = false; this.Initialize(); } Loading
Taskfile.yml +1 −1 Original line number Diff line number Diff line Loading @@ -7,7 +7,7 @@ tasks: silent: true dir: "." cmds: - "scbuildcodeunits" - "scbuildcodeunits {{.CLI_ARGS}}" aliases: - basebuildallcodeunits - bb Loading