Unverified Commit a993bcaa authored by Marius Göcke's avatar Marius Göcke
Browse files

wip

parent dba5a7c4
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -23,7 +23,6 @@ namespace SimpleOCR.CLI.Core.Runner
            SimpleOCR.Library.Core.IOCRService ocrService = new SimpleOCR.Library.Core.OCRService(new SimpleOCR.Library.Core.OCRServiceConfiguration() { DataFolder = Utilities.GetTargetFolder(_Options.OCRDataFolder) }, GeneralLogger.CreateUsingConsole());
            ocrService.Initialize();
            (FileType fileType, byte[] content, string mimeType) = SimpleOCR.Library.Core.Misc.Utilities.LoadFile(_Options.File);
            content = fileType.Accept(new ToPictureVisitor(content, mimeType));
            string result = ocrService.GetOCRContent(content, mimeType, new HashSet<string>(this._Options.Languages));
            if(this._Options.Outputfile == null)
            {
+2 −1
Original line number Diff line number Diff line
@@ -10,10 +10,11 @@ namespace SimpleOCR.Library.Core
        /// <param name="languages">Set of languages which are supposed to check for ocr-content. The list must contain ISO639-3 identifier of any language.</param>
        public string GetOCRContent(byte[] fileContent, string mimeType, ISet<string> languages);
        /// <returns>Returns the file converted to fileType=jpg.</returns>
        public byte[] ToPicture(byte[] fileContent, string mimeType);
        public List<byte[]> ToPictures(byte[] fileContent, string mimeType);
        /// <returns>Returns a list of language-identifier in ISO-639-3.</returns>
        public ISet<Language> GetSupportedLanguages();
        public void ReInitialize();
        public void DownloadOCRData();
        public string GetDataFolder();
    }
}
+3 −3
Original line number Diff line number Diff line
@@ -70,9 +70,9 @@ namespace SimpleOCR.Library.Core.Misc

        public static string[] WrapText(string text, Font font, int maxWidth)
        {
            var result = new System.Collections.Generic.List<string>();
            using (var bmp = new Bitmap(1, 1))
            using (var g = Graphics.FromImage(bmp))
            List<string> result = new System.Collections.Generic.List<string>();
            using (Bitmap bmp = new Bitmap(1, 1))
            using (Graphics g = Graphics.FromImage(bmp))
            {
                int start = 0;
                while (start < text.Length)
+17 −30
Original line number Diff line number Diff line
@@ -4,12 +4,14 @@ using GRYLibrary.Core.ExecutePrograms;
using GRYLibrary.Core.ExecutePrograms.WaitingStates;
using GRYLibrary.Core.Logging.GRYLogger;
using GRYLibrary.Core.Misc;
using SimpleOCR.Library.Core.FileTypes;
using SimpleOCR.Library.Core.Other;
using SimpleOCR.Library.Core.Visitors;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using Tesseract;

@@ -30,45 +32,25 @@ namespace SimpleOCR.Library.Core
        }
        /// <summary>
        /// Returns the text content of the given file, passed through <c>Misc.Utilities.NormalizeOCRResult</c>.
        /// </summary>
        /// <param name="fileContent">Raw bytes of the file.</param>
        /// <param name="mimeType">Mime-type of the file; used to resolve the file type via <c>Utilities.GetDocumentType</c>.</param>
        /// <param name="languagesAsISO639_3Names">ISO639-3 identifiers of the languages to use for OCR.</param>
        public string GetOCRContent(byte[] fileContent, string mimeType, ISet<string> languagesAsISO639_3Names)
        {
            // NOTE(review): this hunk is rendered without +/- markers. The statements from here down to the first
            // NormalizeOCRResult call look like the previous inline Tesseract implementation (language validation
            // followed by a single-image OCR run) which this commit presumably removes - confirm against the raw diff.
            if (languagesAsISO639_3Names.Count == 0)
            {
                throw new BadRequestException("No content-language defined for OCR-search.");
            }
            string languagesConcatenated = string.Join("+", languagesAsISO639_3Names);
            ISet<Language> supportedLanguages = this.GetSupportedLanguages();
            foreach (string languagesAsISO639_3Name in languagesAsISO639_3Names)
            {
                if (languagesAsISO639_3Name.Length != 3)
                {
                    throw new BadRequestException($"Language '{languagesAsISO639_3Name}' is not a valid ISO639-3 identifier because its length is not equal to 3.");
                }
                if (!supportedLanguages.Select(l => l.ISO639_3_Name).Contains(languagesAsISO639_3Name))
                {
                    throw new BadRequestException($"Language '{languagesAsISO639_3Name}' is not supported.");
                }
            }
            string dataPath = this._Configuration.DataFolder;
            using TesseractEngine engine = new TesseractEngine(dataPath, languagesConcatenated, EngineMode.Default);
            using MemoryStream ms = new MemoryStream(fileContent);
            using Pix img = Pix.LoadFromMemory(ms.ToArray());
            using Page page = engine.Process(img);
            string plainResult = page.GetText();
            string result = Misc.Utilities.NormalizeOCRResult(plainResult);
            // NOTE(review): the following statements look like the replacement implementation added by this commit:
            // the file type is resolved from the mime-type and the extraction is delegated to GetOCRContentVisitor.
            FileType fileType = SimpleOCR.Library.Core.Misc.Utilities.GetDocumentType(mimeType);   
            string result=fileType.Accept(new GetOCRContentVisitor(fileContent,mimeType,languagesAsISO639_3Names,this));
            
            // The visitor result is normalized before it is returned.
            result = Misc.Utilities.NormalizeOCRResult(result);
            return result;
        }

        // NOTE(review): diff rendering without +/- markers - the ToPicture signature/return pair appears to be the
        // removed version and the ToPictures signature/return pair the added one; confirm against the raw diff.
        // Both variants resolve the file type from the mime-type and delegate the conversion to a visitor.
        public byte[] ToPicture(byte[] fileContent, string mimeType)
        public List<byte[]> ToPictures(byte[] fileContent, string mimeType)
        {
            return SimpleOCR.Library.Core.Misc.Utilities.GetDocumentType(mimeType).Accept(new ToPictureVisitor(fileContent, mimeType));
            return SimpleOCR.Library.Core.Misc.Utilities.GetDocumentType(mimeType).Accept(new ToPicturesVisitor(fileContent, mimeType));
        }

        public ISet<Language> GetSupportedLanguages()
        {
            var result = new HashSet<Language>();
            HashSet<Language> result = new HashSet<Language>();
            string pattern = @"([a-z][a-z][a-z])\.traineddata";
            string tessdataFolder = this._Configuration.DataFolder;
            var files = Directory.GetFiles(tessdataFolder);
            var allLanguages = Misc.Utilities.GetValidLanguages().ToDictionary(item => item.ISO639_3_Name);
            Dictionary<string, Language> allLanguages = Misc.Utilities.GetValidLanguages().ToDictionary(item => item.ISO639_3_Name);
            foreach (var file in files)
            {
                string filename = Path.GetFileName(file);
@@ -139,13 +121,13 @@ namespace SimpleOCR.Library.Core
                    GRYLibrary.Core.Misc.Utilities.EnsureDirectoryExists(this._Configuration.DataFolder);
                    GRYLibrary.Core.Misc.Utilities.AssertCondition(GRYLibrary.Core.Misc.Utilities.DirectoryIsEmpty(this._Configuration.DataFolder), $"Data-folder \"{this._Configuration.DataFolder}\" is not empty.");

                    using (var tmpFolder = new GRYLibrary.Core.Misc.TempFolder())
                    using (TempFolder tmpFolder = new GRYLibrary.Core.Misc.TempFolder())
                    {
                        string tessdataFolder = tmpFolder.Path;
                        string repoOwner = "tesseract-ocr";
                        string repoName = "tessdata";
                        this._Log.Log("Download OCR-data...");
                        using (var e = new ExternalProgramExecutor(new ExternalProgramExecutorConfiguration()
                        using (ExternalProgramExecutor e = new ExternalProgramExecutor(new ExternalProgramExecutorConfiguration()
                        {
                            Program = "git",
                            Argument = $"clone --recurse-submodules https://github.com/{repoOwner}/{repoName} {tessdataFolder}",
@@ -191,5 +173,10 @@ namespace SimpleOCR.Library.Core
        {
            this.Initialize();
        }

        /// <summary>
        /// Exposes the data folder stored in the configuration of this service.
        /// </summary>
        /// <returns>The value of <c>DataFolder</c> of the configuration.</returns>
        public string GetDataFolder()
        {
            string dataFolder = this._Configuration.DataFolder;
            return dataFolder;
        }
    }
}
+109 −0
Original line number Diff line number Diff line
using GRYLibrary.Core.Exceptions;
using SimpleOCR.Library.Core.FileTypes;
using SimpleOCR.Library.Core.Other;
using SkiaSharp;
using Sprache;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Tesseract;

namespace SimpleOCR.Library.Core.Visitors
{
    /// <summary>
    /// Visitor which extracts the text content of a file depending on its file type.
    /// Pictures are processed by Tesseract directly, Word-, PDF- and Excel-files are converted
    /// to pictures using <see cref="ToPicturesVisitor"/> first, and plain text is decoded as UTF-8.
    /// </summary>
    internal class GetOCRContentVisitor : IFileTypeVisitor<string>
    {
        private readonly byte[] _FileContent;
        private readonly string _MimeType;
        private readonly ISet<string> _LanguagesAsISO639_3Names;
        private readonly IOCRService _OCRService;

        /// <param name="fileContent">Raw bytes of the file whose text should be extracted.</param>
        /// <param name="mimeType">Mime-type of the file; forwarded to <see cref="ToPicturesVisitor"/>.</param>
        /// <param name="languagesAsISO639_3Names">ISO639-3 identifiers of the languages used for OCR.</param>
        /// <param name="oCRService">Service which provides the supported languages and the data folder.</param>
        public GetOCRContentVisitor(byte[] fileContent, string mimeType, ISet<string> languagesAsISO639_3Names, OCRService oCRService)
        {
            this._FileContent = fileContent;
            this._MimeType = mimeType;
            this._LanguagesAsISO639_3Names = languagesAsISO639_3Names;
            this._OCRService = oCRService;
        }

        /// <returns>Returns the OCR text of all pictures the Word-file is converted to.</returns>
        public string Handle(Word word)
        {
            return this.GetTextFromPictures(word);
        }

        /// <returns>Returns the file content decoded as UTF-8; no OCR is involved.</returns>
        public string Handle(PlainText text)
        {
            return new UTF8Encoding(false).GetString(this._FileContent);
        }

        /// <returns>Returns the OCR text of the picture.</returns>
        public string Handle(Picture picture)
        {
            using TesseractEngine engine = this.GetTessDataEngine();
            return GetTextFromPicture(this._FileContent, engine);
        }

        /// <returns>Returns the OCR text of all pictures the PDF-file is converted to.</returns>
        public string Handle(PDF pDF)
        {
            return this.GetTextFromPictures(pDF);
        }

        /// <exception cref="NotSupportedException">Always thrown because unknown file types are not handled.</exception>
        public string Handle(FileTypes.Other other)
        {
            throw new NotSupportedException();
        }

        /// <returns>Returns the OCR text of all pictures the Excel-file is converted to.</returns>
        public string Handle(Excel excel)
        {
            return this.GetTextFromPictures(excel);
        }

        /// <summary>
        /// Validates the requested languages and creates a Tesseract engine for them.
        /// The caller is responsible for disposing the returned engine.
        /// </summary>
        /// <exception cref="BadRequestException">
        /// If no language is given, an identifier does not have length 3, or a language is not supported.
        /// </exception>
        private TesseractEngine GetTessDataEngine()
        {
            if (this._LanguagesAsISO639_3Names.Count == 0)
            {
                throw new BadRequestException("No content-language defined for OCR-search.");
            }
            // Project the supported languages once instead of once per requested language.
            HashSet<string> supportedLanguageNames = new HashSet<string>(this._OCRService.GetSupportedLanguages().Select(l => l.ISO639_3_Name));
            foreach (string languagesAsISO639_3Name in this._LanguagesAsISO639_3Names)
            {
                if (languagesAsISO639_3Name.Length != 3)
                {
                    throw new BadRequestException($"Language '{languagesAsISO639_3Name}' is not a valid ISO639-3 identifier because its length is not equal to 3.");
                }
                if (!supportedLanguageNames.Contains(languagesAsISO639_3Name))
                {
                    throw new BadRequestException($"Language '{languagesAsISO639_3Name}' is not supported.");
                }
            }
            string languagesConcatenated = string.Join("+", this._LanguagesAsISO639_3Names);
            string dataPath = this._OCRService.GetDataFolder();
            return new TesseractEngine(dataPath, languagesConcatenated, EngineMode.Default);
        }

        /// <summary>
        /// Runs OCR on exactly the picture passed in <paramref name="content"/>.
        /// </summary>
        /// <param name="content">Bytes of the picture to process.</param>
        /// <param name="engine">Engine to use; it is not disposed by this method.</param>
        private static string GetTextFromPicture(byte[] content, TesseractEngine engine)
        {
            // Use the given picture (not the original file content), otherwise every page of a
            // multi-page document would be processed with the bytes of the whole document.
            using Pix img = Pix.LoadFromMemory(content);
            using Page page = engine.Process(img);
            return page.GetText();
        }

        /// <summary>
        /// Converts the file to pictures, runs OCR on each of them with one shared engine
        /// and concatenates the texts separated by a line feed.
        /// </summary>
        private string GetTextFromPictures(FileType fileType)
        {
            // The engine is created first so that invalid languages are rejected before the file is converted.
            using TesseractEngine engine = this.GetTessDataEngine();
            StringBuilder result = new StringBuilder();
            foreach (byte[] picture in fileType.Accept(new ToPicturesVisitor(this._FileContent, this._MimeType)))
            {
                string text = GetTextFromPicture(picture, engine);
                if (result.Length > 0)
                {
                    result.Append('\n');
                }
                result.Append(text);
            }
            return result.ToString();
        }
    }
}
Loading