From f5ee17a96c1f0e820c9f8017b3fb57f47abe89a3 Mon Sep 17 00:00:00 2001 From: David Martin Date: Mon, 11 Jun 2018 20:27:36 +1000 Subject: [PATCH 1/5] Simplify psm_parameter() with a conditional expression. It is a bit more compact and arguably more Pythonic. --- src/pyocr/tesseract.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py index c1329e0..52d348f 100755 --- a/src/pyocr/tesseract.py +++ b/src/pyocr/tesseract.py @@ -164,10 +164,7 @@ def can_detect_orientation(): def psm_parameter(): """Return the psm option string depending on the Tesseract version.""" version = get_version() - if version[0] <= 3: - return "-psm" - - return "--psm" + return "--psm" if version[0] > 3 else "-psm" def detect_orientation(image, lang=None): From ddfe4ddbdd9b1e51ec4111d9ec576de2b9ddcdc2 Mon Sep 17 00:00:00 2001 From: David Martin Date: Mon, 11 Jun 2018 20:34:09 +1000 Subject: [PATCH 2/5] Use absolute import to work around circular builder/tesseract import issues. Unfortunately builders.py now imports psm_parameter from tesseract.py, and tesseract.py imports DigitBuilder from builders.py. We could move psm_parameter into util.py, but it needs get_version as well and it is very much tesseract specific. I do not see a great solution for this and keeping the circular import seems the least bad solution. We keep it from breaking the program by using an explicit import of the module [0]. 
[0] https://stackoverflow.com/a/37126790 --- src/pyocr/builders.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pyocr/builders.py b/src/pyocr/builders.py index f27b829..710839f 100644 --- a/src/pyocr/builders.py +++ b/src/pyocr/builders.py @@ -14,7 +14,7 @@ import xml.dom.minidom import logging -from .tesseract import psm_parameter +import pyocr.tesseract from .util import to_unicode logger = logging.getLogger(__name__) @@ -306,7 +306,7 @@ class TextBuilder(BaseBuilder): def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False, cuneiform_fax=False, cuneiform_singlecolumn=False): file_ext = ["txt"] - tess_flags = [psm_parameter(), str(tesseract_layout)] + tess_flags = [pyocr.tesseract.psm_parameter(), str(tesseract_layout)] cun_args = ["-f", "text"] # Add custom cuneiform parameters if needed for par, arg in [(cuneiform_dotmatrix, "--dotmatrix"), @@ -563,7 +563,7 @@ class WordBoxBuilder(BaseBuilder): def __init__(self, tesseract_layout=1): file_ext = ["html", "hocr"] - tess_flags = [psm_parameter(), str(tesseract_layout)] + tess_flags = [pyocr.tesseract.psm_parameter(), str(tesseract_layout)] tess_conf = ["hocr"] cun_args = ["-f", "hocr"] super(WordBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf, @@ -639,7 +639,7 @@ class LineBoxBuilder(BaseBuilder): def __init__(self, tesseract_layout=1): file_ext = ["html", "hocr"] - tess_flags = [psm_parameter(), str(tesseract_layout)] + tess_flags = [pyocr.tesseract.psm_parameter(), str(tesseract_layout)] tess_conf = ["hocr"] cun_args = ["-f", "hocr"] super(LineBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf, From adfff7bcf043db2cb7288f76a19f1a7ed3b67095 Mon Sep 17 00:00:00 2001 From: David Martin Date: Mon, 11 Jun 2018 20:46:55 +1000 Subject: [PATCH 3/5] Use absolute import in tesseract.py to avoid circular import issue. 
In ddfe4ddbdd9b1e51ec4111d9ec576de2b9ddcdc2 we used absolute imports to keep the circular dependencies between tesseract.py and builders.py from being an issue. For it to work both modules have to use explicit imports. --- src/pyocr/tesseract.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py index 52d348f..4952d86 100755 --- a/src/pyocr/tesseract.py +++ b/src/pyocr/tesseract.py @@ -27,7 +27,7 @@ from . import builders from . import util -from .builders import DigitBuilder # backward compatibility +import pyocr.builders # backward compatibility from .error import TesseractError # backward compatibility from .util import digits_only @@ -240,7 +240,7 @@ def get_available_builders(): builders.TextBuilder, builders.WordBoxBuilder, CharBoxBuilder, - builders.DigitBuilder, + pyocr.builders.DigitBuilder, ] From e34cb4d193706a59cbd4c4989c57ca0e8e28d993 Mon Sep 17 00:00:00 2001 From: David Martin Date: Wed, 13 Jun 2018 21:01:18 +1000 Subject: [PATCH 4/5] Import psm_parameter later to avoid circular import issues. We are importing from builders.py in tesseract.py, and vice versa. So to avoid it falling over we simply do the import of the psm_parameter only when it is actually needed. 
--- src/pyocr/builders.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/pyocr/builders.py b/src/pyocr/builders.py index 710839f..3574013 100644 --- a/src/pyocr/builders.py +++ b/src/pyocr/builders.py @@ -14,7 +14,6 @@ import xml.dom.minidom import logging -import pyocr.tesseract from .util import to_unicode logger = logging.getLogger(__name__) @@ -305,8 +304,9 @@ class TextBuilder(BaseBuilder): def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False, cuneiform_fax=False, cuneiform_singlecolumn=False): + from .tesseract import psm_parameter + tess_flags = [psm_parameter(), str(tesseract_layout)] file_ext = ["txt"] - tess_flags = [pyocr.tesseract.psm_parameter(), str(tesseract_layout)] cun_args = ["-f", "text"] # Add custom cuneiform parameters if needed for par, arg in [(cuneiform_dotmatrix, "--dotmatrix"), @@ -562,8 +562,9 @@ class WordBoxBuilder(BaseBuilder): """ def __init__(self, tesseract_layout=1): + from .tesseract import psm_parameter + tess_flags = [psm_parameter(), str(tesseract_layout)] file_ext = ["html", "hocr"] - tess_flags = [pyocr.tesseract.psm_parameter(), str(tesseract_layout)] tess_conf = ["hocr"] cun_args = ["-f", "hocr"] super(WordBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf, @@ -638,8 +639,9 @@ class LineBoxBuilder(BaseBuilder): """ def __init__(self, tesseract_layout=1): + from .tesseract import psm_parameter + tess_flags = [psm_parameter(), str(tesseract_layout)] file_ext = ["html", "hocr"] - tess_flags = [pyocr.tesseract.psm_parameter(), str(tesseract_layout)] tess_conf = ["hocr"] cun_args = ["-f", "hocr"] super(LineBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf, From cc3c0b9a69b0c901a355c5117f4b5660ec32c3f0 Mon Sep 17 00:00:00 2001 From: David Martin Date: Wed, 13 Jun 2018 21:04:33 +1000 Subject: [PATCH 5/5] Revert circular import workarounds from tesseract.py. 
This should not be necessary anymore now that we updated builders.py to only import from tesseract.py when it is actually used. --- src/pyocr/tesseract.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py index 4952d86..52d348f 100755 --- a/src/pyocr/tesseract.py +++ b/src/pyocr/tesseract.py @@ -27,7 +27,7 @@ from . import builders from . import util -import pyocr.builders # backward compatibility +from .builders import DigitBuilder # backward compatibility from .error import TesseractError # backward compatibility from .util import digits_only @@ -240,7 +240,7 @@ def get_available_builders(): builders.TextBuilder, builders.WordBoxBuilder, CharBoxBuilder, - pyocr.builders.DigitBuilder, + builders.DigitBuilder, ]