pylokid/pylokid/library/pdftotext.py

#!/usr/bin/env python3

""" extracts data from ELZ PDFs using Poppler pdftotext """

import subprocess
import logging


class PDFParsing:
    """Extract fixed-position text fields from ELZ PDFs via Poppler's pdftotext.

    Every field is described by a bounding box (``xMin``/``yMin``/``xMax``/
    ``yMax``, as reported by ``pdftotext -bbox``) on the first page of the
    PDF. An optional ``"edit": "title"`` entry title-cases the extracted text.
    """

    # Location of the Poppler binary. Override on a subclass or an instance
    # if pdftotext is installed somewhere else.
    PDFTOTEXT_BIN = "/usr/bin/pdftotext"

    # Field boxes of the "Einsatzausdruck" document.
    # Get them using 'pdftotext -bbox'
    # y = row
    # x = column: xMax 450 / 590 means full width
    EINSATZAUSDRUCK_FIELDS = {
        "auftrag": {"xMin": 70, "yMin": 47, "xMax": 120, "yMax": 58},
        "datum": {"xMin": 190, "yMin": 47, "xMax": 239, "yMax": 58},
        "zeit": {"xMin": 190, "yMin": 59, "xMax": 215, "yMax": 70},
        "melder": {
            "xMin": 304,
            "yMin": 47,
            "xMax": 446,
            "yMax": 70,
            "edit": "title",
        },
        "erfasser": {"xMin": 448, "yMin": 59, "xMax": 478, "yMax": 70},
        # big field until "Disponierte Einheiten"
        "bemerkungen": {"xMin": 28, "yMin": 112, "xMax": 590, "yMax": 350},
        "disponierteeinheiten": {
            "xMin": 28,
            "yMin": 366,
            "xMax": 450,
            "yMax": 376,
        },
        "einsatz": {"xMin": 76, "yMin": 690, "xMax": 450, "yMax": 703},
        "sondersignal": {"xMin": 76, "yMin": 707, "xMax": 450, "yMax": 721},
        "ort": {"xMin": 76, "yMin": 732, "xMax": 590, "yMax": 745},
        "hinweis": {"xMin": 76, "yMin": 773, "xMax": 450, "yMax": 787},
    }

    # Field boxes of the "Einsatzprotokoll" document.
    # Get them using 'pdftotext -bbox'
    # y = row
    # x = column
    EINSATZPROTOKOLL_FIELDS = {
        "auftrag": {"xMin": 192, "yMin": 132, "xMax": 238, "yMax": 142},
        "angelegt": {"xMin": 192, "yMin": 294, "xMax": 226, "yMax": 304},
        "dispo": {"xMin": 192, "yMin": 312, "xMax": 226, "yMax": 322},
        "ausgerueckt": {"xMin": 192, "yMin": 331, "xMax": 226, "yMax": 341},
        "vorort": {"xMin": 192, "yMin": 348, "xMax": 226, "yMax": 358},
    }

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.logger.info("PDF parsing based on pdftotext loaded")

    def extract(self, f_id, file, datafields):
        """Crop every configured field out of page 1 of ``file``.

        Args:
            f_id: Expected order id; compared against the extracted
                ``"auftrag"`` field as a sanity check.
            file: Path of the PDF handed to pdftotext.
            datafields: Mapping of field name to a dict with the keys
                ``xMin``, ``yMin``, ``xMax``, ``yMax`` and optionally
                ``edit`` (only the value ``"title"`` has an effect).

        Returns:
            A dict mapping field name to the extracted text (trailing
            whitespace removed) when the ``"auftrag"`` field equals
            ``f_id``. ``False`` when pdftotext exits with an error, when no
            ``"auftrag"`` field was extracted, or when the id does not match.

        Raises:
            OSError: If the pdftotext binary cannot be executed.
        """

        self.logger.info("[%s] parsing PDF file %s", f_id, file)

        data = {}

        for field, coordinate in datafields.items():
            # pdftotext takes the crop area as top-left corner plus size
            x = coordinate["xMin"]
            y = coordinate["yMin"]
            w = coordinate["xMax"] - x
            h = coordinate["yMax"] - y

            self.logger.debug(
                "[%s] Computed command for field %s: "
                "pdftotext -f 1 -l 1 -x %s -y %s -W %s -H %s",
                f_id,
                field,
                x,
                y,
                w,
                h,
            )

            # stderr is captured separately so that warnings and error
            # messages of pdftotext never end up inside the field value.
            # pdftotext writes UTF-8 unless told otherwise via -enc, so
            # decode explicitly instead of relying on the locale.
            result = subprocess.run(
                [
                    self.PDFTOTEXT_BIN,
                    "-f",
                    "1",
                    "-l",
                    "1",
                    "-x",
                    str(x),
                    "-y",
                    str(y),
                    "-W",
                    str(w),
                    "-H",
                    str(h),
                    file,
                    "-",
                ],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                encoding="utf-8",
                check=False,
            )

            diagnostics = result.stderr.strip()
            if result.returncode != 0:
                self.logger.error(
                    "[%s] pdftotext failed for field %s (exit code %s): %s",
                    f_id,
                    field,
                    result.returncode,
                    diagnostics,
                )
                return False
            if diagnostics:
                self.logger.warning(
                    "[%s] pdftotext reported for field %s: %s",
                    f_id,
                    field,
                    diagnostics,
                )

            ## TODO: fixup some fields (lowercase, remove unnecessary \n)
            text = result.stdout.rstrip()
            if coordinate.get("edit") == "title":
                text = text.title()
            data[field] = text

        # sanity check to see if we can correlate the f_id
        auftrag = data.get("auftrag")
        if auftrag is None:
            # without an "auftrag" box there is nothing to verify against
            self.logger.error(
                '[%s] no "auftrag" field extracted, cannot verify ID in PDF',
                f_id,
            )
            return False

        if f_id == auftrag:
            self.logger.debug("[%s] ID matches in PDF", f_id)
            return data

        self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, auftrag)
        return False

    def extract_einsatzausdruck(self, file, f_id):
        """Extract the Einsatzausdruck fields; see ``extract`` for the result."""

        self.logger.debug("[%s] Parsing PDF: %s", f_id, file)

        return self.extract(f_id, file, self.EINSATZAUSDRUCK_FIELDS)

    def extract_einsatzprotokoll(self, file, f_id):
        """Extract the Einsatzprotokoll fields; see ``extract`` for the result."""

        self.logger.debug("[%s] Parsing PDF: %s", f_id, file)

        return self.extract(f_id, file, self.EINSATZPROTOKOLL_FIELDS)
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00			`#!/usr/bin/env python3`

			`""" extracts data from ELZ PDFs using Poppler pdftotext """`

			`import subprocess`
			`import logging`

major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00			`class PDFParsing:`
			`""" PDF parsing """`

			`def __init__(self):`
			`self.logger = logging.getLogger(__name__)`
major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`self.logger.info("PDF parsing based on pdftotext loaded")`
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00
			`def extract(self, f_id, file, datafields):`

major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`self.logger.info("[%s] parsing PDF file %s", f_id, file)`
improve parsing 2019-09-22 19:18:58 +00:00
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00			`data = {}`

			`for field, coordinate in datafields.items():`

			`# x-coordinate of the crop area top left corner`
major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`x = coordinate["xMin"]`
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00
			`# y-coordinate of the crop area top left corner`
major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`y = coordinate["yMin"]`
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00
			`# width of crop area in pixels`
major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`w = coordinate["xMax"] - coordinate["xMin"]`
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00
			`# height of crop area in pixels`
major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`h = coordinate["yMax"] - coordinate["yMin"]`
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00
major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`self.logger.debug(`
			`"[%s] Computed command for field %s: %s",`
			`f_id,`
			`field,`
			`"pdftotext -f 1 -l 1 -x {} -y {} -W {} -H {}".format(x, y, w, h),`
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00			`)`

major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`scrapeddata = subprocess.Popen(`
			`[`
			`"/usr/bin/pdftotext",`
			`"-f",`
			`"1",`
			`"-l",`
			`"1",`
			`"-x",`
			`str(x),`
			`"-y",`
			`str(y),`
			`"-W",`
			`str(w),`
			`"-H",`
			`str(h),`
			`file,`
			`"-",`
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00			`],`
			`stdout=subprocess.PIPE,`
			`stderr=subprocess.STDOUT,`
major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`text=True,`
			`)`
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00			`stdout, _ = scrapeddata.communicate()`

			`## TODO: fixup some fields (lowercase, remove unnecessary \n)`
major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`if "edit" in coordinate and coordinate["edit"] == "title":`
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00			`data[field] = stdout.rstrip().title()`
			`else:`
			`data[field] = stdout.rstrip()`

			`# sanity check to see if we can correlate the f_id`
major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`if f_id == data["auftrag"]:`
			`self.logger.debug("[%s] ID matches in PDF", f_id)`
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00			`return data`
			`else:`
major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`self.logger.error(`
			`'[%s] ID does not match in PDF: "%s"', f_id, data["auftrag"]`
			`)`
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00			`return False`

			`def extract_einsatzausdruck(self, file, f_id):`
			`""" extracts information from Einsatzausdruck using external pdftotext """`

major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`self.logger.debug("[%s] Parsing PDF: %s", f_id, file)`
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00
			`# Get them using 'pdftotext -bbox'`
			`# y = row`
			`# x = column: xMax 450 / 590 means full width`
			`coordinates = {`
major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`"auftrag": {`
			`"xMin": 70,`
			`"yMin": 47,`
			`"xMax": 120,`
			`"yMax": 58,`
			`},`
			`"datum": {`
			`"xMin": 190,`
			`"yMin": 47,`
			`"xMax": 239,`
			`"yMax": 58,`
			`},`
			`"zeit": {`
			`"xMin": 190,`
			`"yMin": 59,`
			`"xMax": 215,`
			`"yMax": 70,`
			`},`
			`"melder": {`
			`"xMin": 304,`
			`"yMin": 47,`
			`"xMax": 446,`
			`"yMax": 70,`
			`"edit": "title",`
			`},`
			`"erfasser": {`
			`"xMin": 448,`
			`"yMin": 59,`
			`"xMax": 478,`
			`"yMax": 70,`
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00			`},`
			`# big field until "Disponierte Einheiten"`
major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`"bemerkungen": {`
			`"xMin": 28,`
			`"yMin": 112,`
			`"xMax": 590,`
			`"yMax": 350,`
			`},`
			`"disponierteeinheiten": {`
			`"xMin": 28,`
			`"yMin": 366,`
			`"xMax": 450,`
			`"yMax": 376,`
			`},`
			`"einsatz": {`
			`"xMin": 76,`
			`"yMin": 690,`
			`"xMax": 450,`
			`"yMax": 703,`
			`},`
			`"sondersignal": {`
			`"xMin": 76,`
			`"yMin": 707,`
			`"xMax": 450,`
			`"yMax": 721,`
			`},`
			`"ort": {`
			`"xMin": 76,`
			`"yMin": 732,`
			`"xMax": 590,`
			`"yMax": 745,`
			`},`
			`"hinweis": {`
			`"xMin": 76,`
			`"yMin": 773,`
			`"xMax": 450,`
			`"yMax": 787,`
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00			`},`
			`}`

			`return self.extract(f_id, file, coordinates)`

			`def extract_einsatzprotokoll(self, file, f_id):`
			`""" extracts information from Einsatzprotokoll using external pdftotext """`

major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`self.logger.debug("[%s] Parsing PDF: %s", f_id, file)`
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00
			`# Get them using 'pdftotext -bbox'`
			`# y = row`
			`# x = column: xMax 450 / 590 means full width`
			`coordinates = {`
major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`"auftrag": {`
			`"xMin": 192,`
			`"yMin": 132,`
			`"xMax": 238,`
			`"yMax": 142,`
			`},`
			`"angelegt": {`
			`"xMin": 192,`
			`"yMin": 294,`
			`"xMax": 226,`
			`"yMax": 304,`
			`},`
			`"dispo": {`
			`"xMin": 192,`
			`"yMin": 312,`
			`"xMax": 226,`
			`"yMax": 322,`
			`},`
			`"ausgerueckt": {`
			`"xMin": 192,`
			`"yMin": 331,`
			`"xMax": 226,`
			`"yMax": 341,`
			`},`
			`"vorort": {`
			`"xMin": 192,`
			`"yMin": 348,`
			`"xMax": 226,`
			`"yMax": 358,`
complete rework of pdf parsing 2019-09-22 16:10:09 +00:00			`},`
			`}`

major update to reuse already existing records Lodur since some time automatically creates Einsatzrapporte via an API from SRZ/GVZ. One of the main features of Pylokid was to exactly do that. With that new change this isn't necessary anymore. Pylokid has been amended to find the pre-existing entry and work with that - enhancing it with any additional information missing and uploads PDFs to the right place. While at it a very small modernization has been made and the project moved to use Poetry and Black formatting. But it's still the same ugly code - to reflect Lodur. 2021-02-14 14:00:15 +00:00			`return self.extract(f_id, file, coordinates)`