Unverified Commit 9e879830 authored by Marius Göcke's avatar Marius Göcke
Browse files

Merge branch 'wip' into other/next-release

# Conflicts:
#	SimpleOCRCLI/SimpleOCRCLI/Helper/ProgramStarter.cs
#	SimpleOCRCLI/SimpleOCRCLI/Properties/launchSettings.json
#	SimpleOCRCLI/SimpleOCRCLI/Runner/RunDownloader.cs
#	SimpleOCRCLI/SimpleOCRCLI/Runner/RunOCRAnalysis.cs
#	SimpleOCRCLI/SimpleOCRCLI/SimpleOCRCLI.csproj
#	SimpleOCRCLI/SimpleOCRCLI/Verbs/OCRAnalysis.cs
#	SimpleOCRLibrary/SimpleOCRLibrary/Misc/Visitors/ToPicturesVisitor.cs
#	SimpleOCRLibrary/SimpleOCRLibrary/OCRService.cs
#	SimpleOCRLibrary/SimpleOCRLibrary/SimpleOCRLibrary.csproj
#	SimpleOCRLibrary/SimpleOCRLibrary/VIsitors/ToPictureVisitor.cs
#	SimpleOCRLibrary/SimpleOCRLibrary/Visitors/ToPicturesVisitor.cs
parents 78e55d68 a993bcaa
Loading
Loading
Loading
Loading
+6 −4
Original line number Diff line number Diff line
@@ -27,9 +27,11 @@
        <WarningLevel>4</WarningLevel>
        <Prefer32Bit>false</Prefer32Bit>
        <SignAssembly>true</SignAssembly>
        <AssemblyOriginatorKeyFile>..\..\Other\Resources\PublicKeys\StronglyNamedKey\SimpleOCRPublicKey.snk</AssemblyOriginatorKeyFile>
        <AssemblyOriginatorKeyFile>
            ..\..\Other\Resources\PublicKeys\StronglyNamedKey\SimpleOCRPublicKey.snk</AssemblyOriginatorKeyFile>
        <DelaySign>true</DelaySign>
        <NoWarn>1701;1702;CA1822;CS1591;CS1573;IDE0045;IDE0046;IDE0054;IDE0058;IDE0074;IDE0090;CA1859</NoWarn>
        <NoWarn>
            1701;1702;CA1822;CS1591;CS1573;IDE0045;IDE0046;IDE0054;IDE0058;IDE0074;IDE0090;CA1859</NoWarn>
        <WarningsAsErrors>NU1605</WarningsAsErrors>
        <ErrorLog>..\Other\Resources\CodeAnalysisResult\SimpleOCRCLI.sarif</ErrorLog>
        <OutputType>Exe</OutputType>
+32 −0
Original line number Diff line number Diff line
@@ -23,7 +23,11 @@ namespace SimpleOCR.Library.Core.Misc.Visitors
            this._MimeType = mimeType;
        }

<<<<<<<< HEAD:SimpleOCRLibrary/SimpleOCRLibrary/Misc/Visitors/ToPicturesVisitor.cs
        public List<byte[]> Handle(PlainText text)
========
        public List<byte[]> Handle(SimpleOCR.Library.Core.FileTypes.PlainText text)
>>>>>>>> wip:SimpleOCRLibrary/SimpleOCRLibrary/Visitors/ToPicturesVisitor.cs
        {
            if (this._FileContent == null || this._FileContent.Length == 0)
                throw new ArgumentException("File content is empty.");
@@ -104,10 +108,38 @@ namespace SimpleOCR.Library.Core.Misc.Visitors

        public List<byte[]> Handle(PDF pDF)
        {
<<<<<<<< HEAD:SimpleOCRLibrary/SimpleOCRLibrary/Misc/Visitors/ToPicturesVisitor.cs
            return SimpleOCR.Library.Core.Misc.Utilities.PDFToPNGsUsingImageMagick(_FileContent);
        }


========
            return PDFToPNGsUsingImageMagick(_FileContent);
        }

        /// <summary>
        /// Reads <paramref name="fileContent"/> with ImageMagick at a density of 300x300
        /// and returns every image of the resulting collection encoded as PNG.
        /// </summary>
        /// <param name="fileContent">The raw bytes handed to the ImageMagick reader.</param>
        /// <returns>One PNG byte array per image in the collection, in enumeration order.</returns>
        private List<byte[]> PDFToPNGsUsingImageMagick(byte[] fileContent)
        {
            using (MagickImageCollection pages = new MagickImageCollection())
            {
                MagickReadSettings readSettings = new MagickReadSettings();
                readSettings.Density = new Density(300, 300);

                pages.Read(fileContent, readSettings);

                List<byte[]> pngs = new List<byte[]>();
                foreach (var page in pages)
                {
                    // Switch the output format before serializing so the bytes are PNG-encoded.
                    page.Format = MagickFormat.Png;
                    byte[] encoded = page.ToByteArray();
                    pngs.Add(encoded);
                }

                return pngs;
            }
        }

>>>>>>>> wip:SimpleOCRLibrary/SimpleOCRLibrary/Visitors/ToPicturesVisitor.cs
        /// <summary>
        /// Handles a <see cref="Word"/> file by passing the stored file content to
        /// <c>UseSoffice</c> and returning its result.
        /// </summary>
        /// <param name="word">The visited file type; it is not read by this method.</param>
        /// <returns>Whatever <c>UseSoffice</c> returns for the stored file content.</returns>
        public List<byte[]> Handle(Word word) => this.UseSoffice(this._FileContent);
+5 −4
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@ using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace SimpleOCR.Library.Core
+7 −4
Original line number Diff line number Diff line
@@ -27,9 +27,11 @@
        <WarningLevel>4</WarningLevel>
        <Prefer32Bit>false</Prefer32Bit>
        <SignAssembly>true</SignAssembly>
        <AssemblyOriginatorKeyFile>..\..\Other\Resources\PublicKeys\StronglyNamedKey\SimpleOCRPublicKey.snk</AssemblyOriginatorKeyFile>
        <AssemblyOriginatorKeyFile>
            ..\..\Other\Resources\PublicKeys\StronglyNamedKey\SimpleOCRPublicKey.snk</AssemblyOriginatorKeyFile>
        <DelaySign>true</DelaySign>
        <NoWarn>1701;1702;CA1822;CS1591;CS1573;IDE0045;IDE0046;IDE0054;IDE0058;IDE0074;IDE0090;ASP0014;SYSLIB10;IDE0017;CS8002;CA1859;IDE0034;IDE0063;IDE0350</NoWarn>
        <NoWarn>
            1701;1702;CA1822;CS1591;CS1573;IDE0045;IDE0046;IDE0054;IDE0058;IDE0074;IDE0090;ASP0014;SYSLIB10;IDE0017;CS8002;CA1859;IDE0034;IDE0063;IDE0350</NoWarn>
        <WarningsAsErrors>NU1605</WarningsAsErrors>
        <ErrorLog>..\Other\Resources\CodeAnalysisResult\SimpleOCRLibrary.sarif</ErrorLog>
        <OutputType>Library</OutputType>
@@ -74,7 +76,8 @@
        <PackageReference Include="System.Collections.Immutable" Version="10.0.1" />
        <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.1" />
        <PackageReference Include="Microsoft.Extensions.Options" Version="10.0.1" />
        <PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.1" />
        <PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions"
            Version="10.0.1" />
        <PackageReference Include="System.Drawing.Common" Version="10.0.5" />
        <PackageReference Include="System.IdentityModel.Tokens.Jwt" Version="8.15.0" />
        <PackageReference Include="System.Reactive" Version="6.1.0" />
+109 −0
Original line number Diff line number Diff line
using GRYLibrary.Core.Exceptions;
using SimpleOCR.Library.Core.FileTypes;
using SimpleOCR.Library.Core.Other;
using SkiaSharp;
using Sprache;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Tesseract;

namespace SimpleOCR.Library.Core.Visitors
{
    /// <summary>
    /// File-type visitor that returns the textual content of a file.
    /// Plain text is decoded as UTF-8, pictures are processed by Tesseract directly,
    /// and Word/PDF/Excel files are first converted to pictures via
    /// <see cref="ToPicturesVisitor"/> and then processed picture by picture.
    /// </summary>
    internal class GetOCRContentVisitor : IFileTypeVisitor<string>
    {
        private readonly byte[] _FileContent;
        private readonly string _MimeType;
        private readonly ISet<string> _LanguagesAsISO639_3Names;
        private readonly IOCRService _OCRService;

        /// <summary>
        /// Creates a visitor for one file.
        /// </summary>
        /// <param name="fileContent">Raw bytes of the file to extract text from.</param>
        /// <param name="mimeType">Mime type forwarded to <see cref="ToPicturesVisitor"/>.</param>
        /// <param name="languagesAsISO639_3Names">Languages used to configure the Tesseract engine.</param>
        /// <param name="oCRService">Service providing the supported languages and the data folder.</param>
        public GetOCRContentVisitor(byte[] fileContent, string mimeType, ISet<string> languagesAsISO639_3Names, OCRService oCRService)
        {
            this._FileContent = fileContent;
            this._MimeType = mimeType;
            this._LanguagesAsISO639_3Names = languagesAsISO639_3Names;
            this._OCRService = oCRService;
        }

        /// <summary>Extracts the text of a Word file by running OCR on its rendered pictures.</summary>
        public string Handle(Word word)
        {
            return this.GetTextFromPictures(word);
        }

        /// <summary>Decodes the file content as UTF-8 text; no OCR is involved.</summary>
        public string Handle(PlainText text)
        {
            return new UTF8Encoding(false).GetString(this._FileContent);
        }

        /// <summary>Runs OCR directly on the file content, which is treated as a single picture.</summary>
        public string Handle(Picture picture)
        {
            using TesseractEngine engine = this.GetTessDataEngine();
            return this.GetTextFromPicture(this._FileContent, engine);
        }

        /// <summary>Extracts the text of a PDF file by running OCR on its rendered pictures.</summary>
        public string Handle(PDF pDF)
        {
            return this.GetTextFromPictures(pDF);
        }

        /// <summary>Not supported for this file type.</summary>
        /// <exception cref="NotSupportedException">Always thrown.</exception>
        public string Handle(FileTypes.Other other)
        {
            throw new NotSupportedException();
        }

        /// <summary>Extracts the text of an Excel file by running OCR on its rendered pictures.</summary>
        public string Handle(Excel excel)
        {
            return this.GetTextFromPictures(excel);
        }

        /// <summary>
        /// Validates the requested languages and creates a Tesseract engine for them.
        /// The caller owns the returned engine and must dispose it.
        /// </summary>
        /// <exception cref="BadRequestException">
        /// Thrown when no language is given, when a language name is not exactly
        /// three characters long, or when a language is not reported as supported
        /// by the OCR service.
        /// </exception>
        private TesseractEngine GetTessDataEngine()
        {
            if (this._LanguagesAsISO639_3Names.Count == 0)
            {
                throw new BadRequestException("No content-language defined for OCR-search.");
            }

            // Project the supported names once instead of once per requested language.
            ISet<Language> supportedLanguages = this._OCRService.GetSupportedLanguages();
            var supportedNames = supportedLanguages.Select(l => l.ISO639_3_Name).ToList();

            foreach (string languagesAsISO639_3Name in this._LanguagesAsISO639_3Names)
            {
                if (languagesAsISO639_3Name.Length != 3)
                {
                    throw new BadRequestException($"Language '{languagesAsISO639_3Name}' is not a valid ISO639-3 identifier because its length is not equal to 3.");
                }
                if (!supportedNames.Contains(languagesAsISO639_3Name))
                {
                    throw new BadRequestException($"Language '{languagesAsISO639_3Name}' is not supported.");
                }
            }

            // Multiple languages are passed to the engine as one '+'-separated string.
            string languagesConcatenated = string.Join("+", this._LanguagesAsISO639_3Names);
            string dataPath = this._OCRService.GetDataFolder();
            return new TesseractEngine(dataPath, languagesConcatenated, EngineMode.Default);
        }

        /// <summary>
        /// Runs OCR on a single picture.
        /// </summary>
        /// <param name="content">Encoded bytes of the picture to process.</param>
        /// <param name="engine">Engine used for processing; it is not disposed here.</param>
        /// <returns>The text Tesseract reports for the picture.</returns>
        private string GetTextFromPicture(byte[] content, TesseractEngine engine)
        {
            // Load the picture that was passed in, not the visitor's original file:
            // for converted documents every page has its own byte array.
            using Pix img = Pix.LoadFromMemory(content);
            using Page page = engine.Process(img);
            return page.GetText();
        }

        /// <summary>
        /// Converts the file to pictures via <see cref="ToPicturesVisitor"/>, runs OCR
        /// on each of them with one shared engine and concatenates the texts,
        /// separated by a line feed.
        /// </summary>
        /// <param name="fileType">File type used to dispatch the picture conversion.</param>
        /// <returns>The combined text of all pictures, or an empty string if there is none.</returns>
        private string GetTextFromPictures(FileType fileType)
        {
            using TesseractEngine engine = this.GetTessDataEngine();
            StringBuilder result = new StringBuilder();
            foreach (byte[] picture in fileType.Accept(new ToPicturesVisitor(this._FileContent, this._MimeType)))
            {
                string text = this.GetTextFromPicture(picture, engine);
                // Only separate from text that is already there, so the result never starts with a line feed.
                if (result.Length > 0)
                {
                    result.Append('\n');
                }
                result.Append(text);
            }
            return result.ToString();
        }
    }
}
Loading