Loading SimpleOCR/Other/Build/Build.py +1 −1 Original line number Diff line number Diff line Loading @@ -12,7 +12,7 @@ def build(): "image_debian":tf.tfcps_Tools_General.oci_image_manager.get_registry_address_for_image_with_default_tag(tf.get_repository_folder(),"Debian"), }) tf.tfcps_Tools_General.merge_sbom_file_from_dependent_codeunit_into_this(tf.get_codeunit_folder(),tf.get_codeunit_name(),"SimpleOCRService",tf.use_cache()) # TODO add libreoffice etc. to the sbom. if __name__ == "__main__": build() SimpleOCR/SimpleOCR/Dockerfile +2 −1 Original line number Diff line number Diff line Loading @@ -16,7 +16,8 @@ RUN mkdir /Workspace && \ mkdir /Workspace/Other/Certificates && \ mkdir /Workspace/Other/EntryPoint && \ apt-get update && \ apt-get install -y curl nginx git libreoffice-core libreoffice-writer libreoffice-calc wget nano libgomp1 tesseract-ocr libpng-dev libjpeg-dev libtiff-dev libwebp-dev ghostscript apt-get install -y curl nginx git libreoffice libreoffice-writer libreoffice-calc libreoffice-impress fonts-dejavu fonts-liberation nano libgomp1 tesseract-ocr libpng-dev libjpeg-dev libtiff-dev libwebp-dev ghostscript # TODO install a certain version of libreoffice etc. which is defined in the dependencies-folder and which can be updated by a script. WORKDIR /Workspace/Other/EntryPoint Loading SimpleOCRLibrary/Other/UpdateDependencies.py +1 −0 Original line number Diff line number Diff line Loading @@ -4,6 +4,7 @@ from ScriptCollection.TFCPS.DotNet.TFCPS_CodeUnitSpecific_DotNet import TFCPS_Co def update_dependencies(): tf:TFCPS_CodeUnitSpecific_DotNet_Functions=TFCPS_CodeUnitSpecific_DotNet_CLI.parse(__file__) tf.update_dependencies() #TODO call default-sc-function to update dependencies in requirements.txt if __name__ == "__main__": Loading SimpleOCRLibrary/SimpleOCRLibrary/Misc/Utilities.cs +2 −0 Original line number Diff line number Diff line Loading @@ -25,6 +25,8 @@ namespace SimpleOCR.Library.Core.Misc ["application/vnd.openxmlformats-officedocument.spreadsheetml.template"] = "xltx", ["application/vnd.ms-excel.template.macroEnabled.12"] = "xltm", ["application/vnd.ms-excel.sheet.binary.macroEnabled.12"] = "xlsb", ["application/msword"] = "doc", ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"] = "docx", }; } Loading SimpleOCRLibrary/SimpleOCRLibrary/Misc/Visitors/GetOCRContentVisitor.cs +5 −3 Original line number Diff line number Diff line Loading @@ -12,14 +12,16 @@ namespace SimpleOCR.Library.Core.Misc.Visitors private string _MimeType; private ISet<string> _LanguagesAsISO639_3Names; private readonly IOCRService _OCRService; private readonly bool _EnforceVerbose; private readonly TesseractCallBase _TesseractCallBase; public GetOCRContentVisitor(byte[] fileContent, string mimeType, ISet<string> languagesAsISO639_3Names, OCRService oCRService) public GetOCRContentVisitor(byte[] fileContent, string mimeType, ISet<string> languagesAsISO639_3Names, OCRService oCRService,bool enforceVerbose) { this._FileContent = fileContent; this._MimeType = mimeType; this._LanguagesAsISO639_3Names = languagesAsISO639_3Names; this._OCRService = oCRService; GRYLibrary.Core.OperatingSystem.OperatingSystem os = GRYLibrary.Core.OperatingSystem.OperatingSystem.GetCurrentOperatingSystem(); _EnforceVerbose = enforceVerbose; if(os is GRYLibrary.Core.OperatingSystem.ConcreteOperatingSystems.Windows) { this._TesseractCallBase = new TesseractByLibrary(); Loading Loading @@ -66,9 +68,9 @@ namespace SimpleOCR.Library.Core.Misc.Visitors private string GetTextFromPictures(FileType fileType) { string result = string.Empty; foreach (byte[]? pictureContent in fileType.Accept(new ToPicturesVisitor(this._FileContent, this._MimeType))) foreach (byte[]? pictureContent in fileType.Accept(new ToPicturesVisitor(this._FileContent, this._MimeType,_EnforceVerbose))) { var visitor = this.GetCallTesseractVisitor(pictureContent); ITesseractCallBaseVisitor<string> visitor = this.GetCallTesseractVisitor(pictureContent); string text = this._TesseractCallBase.Accept(visitor); if (result != string.Empty) { Loading Loading
SimpleOCR/Other/Build/Build.py +1 −1 Original line number Diff line number Diff line Loading @@ -12,7 +12,7 @@ def build(): "image_debian":tf.tfcps_Tools_General.oci_image_manager.get_registry_address_for_image_with_default_tag(tf.get_repository_folder(),"Debian"), }) tf.tfcps_Tools_General.merge_sbom_file_from_dependent_codeunit_into_this(tf.get_codeunit_folder(),tf.get_codeunit_name(),"SimpleOCRService",tf.use_cache()) # TODO add libreoffice etc. to the sbom. if __name__ == "__main__": build()
SimpleOCR/SimpleOCR/Dockerfile +2 −1 Original line number Diff line number Diff line Loading @@ -16,7 +16,8 @@ RUN mkdir /Workspace && \ mkdir /Workspace/Other/Certificates && \ mkdir /Workspace/Other/EntryPoint && \ apt-get update && \ apt-get install -y curl nginx git libreoffice-core libreoffice-writer libreoffice-calc wget nano libgomp1 tesseract-ocr libpng-dev libjpeg-dev libtiff-dev libwebp-dev ghostscript apt-get install -y curl nginx git libreoffice libreoffice-writer libreoffice-calc libreoffice-impress fonts-dejavu fonts-liberation nano libgomp1 tesseract-ocr libpng-dev libjpeg-dev libtiff-dev libwebp-dev ghostscript # TODO install a certain version of libreoffice etc. which is defined in the dependencies-folder and which can be updated by a script. WORKDIR /Workspace/Other/EntryPoint Loading
SimpleOCRLibrary/Other/UpdateDependencies.py +1 −0 Original line number Diff line number Diff line Loading @@ -4,6 +4,7 @@ from ScriptCollection.TFCPS.DotNet.TFCPS_CodeUnitSpecific_DotNet import TFCPS_Co def update_dependencies(): tf:TFCPS_CodeUnitSpecific_DotNet_Functions=TFCPS_CodeUnitSpecific_DotNet_CLI.parse(__file__) tf.update_dependencies() #TODO call default-sc-function to update dependencies in requirements.txt if __name__ == "__main__": Loading
SimpleOCRLibrary/SimpleOCRLibrary/Misc/Utilities.cs +2 −0 Original line number Diff line number Diff line Loading @@ -25,6 +25,8 @@ namespace SimpleOCR.Library.Core.Misc ["application/vnd.openxmlformats-officedocument.spreadsheetml.template"] = "xltx", ["application/vnd.ms-excel.template.macroEnabled.12"] = "xltm", ["application/vnd.ms-excel.sheet.binary.macroEnabled.12"] = "xlsb", ["application/msword"] = "doc", ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"] = "docx", }; } Loading
SimpleOCRLibrary/SimpleOCRLibrary/Misc/Visitors/GetOCRContentVisitor.cs +5 −3 Original line number Diff line number Diff line Loading @@ -12,14 +12,16 @@ namespace SimpleOCR.Library.Core.Misc.Visitors private string _MimeType; private ISet<string> _LanguagesAsISO639_3Names; private readonly IOCRService _OCRService; private readonly bool _EnforceVerbose; private readonly TesseractCallBase _TesseractCallBase; public GetOCRContentVisitor(byte[] fileContent, string mimeType, ISet<string> languagesAsISO639_3Names, OCRService oCRService) public GetOCRContentVisitor(byte[] fileContent, string mimeType, ISet<string> languagesAsISO639_3Names, OCRService oCRService,bool enforceVerbose) { this._FileContent = fileContent; this._MimeType = mimeType; this._LanguagesAsISO639_3Names = languagesAsISO639_3Names; this._OCRService = oCRService; GRYLibrary.Core.OperatingSystem.OperatingSystem os = GRYLibrary.Core.OperatingSystem.OperatingSystem.GetCurrentOperatingSystem(); _EnforceVerbose = enforceVerbose; if(os is GRYLibrary.Core.OperatingSystem.ConcreteOperatingSystems.Windows) { this._TesseractCallBase = new TesseractByLibrary(); Loading Loading @@ -66,9 +68,9 @@ namespace SimpleOCR.Library.Core.Misc.Visitors private string GetTextFromPictures(FileType fileType) { string result = string.Empty; foreach (byte[]? pictureContent in fileType.Accept(new ToPicturesVisitor(this._FileContent, this._MimeType))) foreach (byte[]? pictureContent in fileType.Accept(new ToPicturesVisitor(this._FileContent, this._MimeType,_EnforceVerbose))) { var visitor = this.GetCallTesseractVisitor(pictureContent); ITesseractCallBaseVisitor<string> visitor = this.GetCallTesseractVisitor(pictureContent); string text = this._TesseractCallBase.Accept(visitor); if (result != string.Empty) { Loading