complete rework of pdf parsing

2019-09-22 18:10:09 +02:00 · 2019-09-22 18:10:09 +02:00 · b0228afcfd
parent b5c7d7b7b1
commit b0228afcfd
6 changed files with 175 additions and 235 deletions
--- a/library/lodur.py
+++ b/library/lodur.py
@ -74,7 +74,7 @@ class Lodur:
                        '%H:%M',
                    )
                    zh_am_schad = datetime.strptime(
-                        pdf_data['anort'],
+                        pdf_data['vorort'],
                        '%H:%M',
                    )
                except ValueError as err:
@ -120,9 +120,9 @@ class Lodur:
                '%H:%M',
            )
            eins_ereig = pdf_data['einsatz']
-            bemerkungen = pdf_data['bemerkungen']
+            bemerkungen = pdf_data['bemerkungen'] + '\n' + pdf_data['disponierteeinheiten']
            wer_ala = pdf_data['melder']
-            adr = pdf_data['strasse'] + ', ' + pdf_data['ort']
+            adr = pdf_data['ort']
        else:
            date = datetime.now()
            time = datetime.now()
--- a/library/pdf_extract.py
+++ b/library/pdf_extract.py
@ -1,209 +0,0 @@
-#!/usr/bin/env python3
-
-""" extracts data from ELZ PDFs """
-
-import io
-import logging
-from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
-from pdfminer.converter import TextConverter
-from pdfminer.layout import LAParams
-from pdfminer.pdfpage import PDFPage
-
-class PDFHandling:
-    """ PDF handling like parsing """
-
-    def __init__(self):
-        self.logger = logging.getLogger(__name__)
-
-        # less logging for pdfminer - more is not needed
-        logger_doc = logging.getLogger('pdfminer.pdfdocument')
-        logger_doc.setLevel(logging.WARNING)
-        logger_page = logging.getLogger('pdfminer.pdfpage')
-        logger_page.setLevel(logging.WARNING)
-        logger_interp = logging.getLogger('pdfminer.pdfinterp')
-        logger_interp.setLevel(logging.WARNING)
-        logger_psparser = logging.getLogger('pdfminer.psparser')
-        logger_psparser.setLevel(logging.WARNING)
-        logger_cmapdb = logging.getLogger('pdfminer.cmapdb')
-        logger_cmapdb.setLevel(logging.WARNING)
-        logger_pdfparser = logging.getLogger('pdfminer.pdfparser')
-        logger_pdfparser.setLevel(logging.WARNING)
-
-    def concatenate_to_multiline_string(self, data, start, end):
-        """ concatenates multiple lines to a single multiline string """
-
-        res = ''
-        counter = start
-        while counter <= end:
-            res += data[counter] + '\n'
-            counter += 1
-        return res
-
-    def convert(self, file):
-        """ converts the PDF to a multiline string """
-
-        pagenums = set()
-        manager = PDFResourceManager()
-        codec = 'utf-8'
-        caching = True
-
-        output = io.StringIO()
-        converter = TextConverter(manager, output, codec=codec, laparams=LAParams())
-
-        interpreter = PDFPageInterpreter(manager, converter)
-        infile = open(file, 'rb')
-
-        for page in PDFPage.get_pages(infile, pagenums, caching=caching, check_extractable=True):
-            interpreter.process_page(page)
-
-        converted_pdf = output.getvalue()
-
-        infile.close()
-        converter.close()
-        output.close()
-        return converted_pdf
-
-    def extract_einsatzausdruck(self, file, f_id):
-        """ extracts as many information from the parsed Einsatzausdruck as possible """
-
-        converted = self.convert(file)
-        splited = converted.splitlines()
-
-        self.logger.debug('[%s] Parsed PDF raw:\n %s', f_id, converted)
-        self.logger.debug('[%s] Line-splited PDF: %s', f_id, splited)
-
-        # search some well-known words for later positional computation
-        try:
-            index_einsatzauftragfw = splited.index('Einsatzauftrag Feuerwehr')
-            index_erfasser = splited.index('Erfasser')
-            index_auftrag = splited.index('Auftrag')
-            index_bemerkungen = splited.index('Bemerkungen')
-            index_dispo = splited.index('Disponierte Einheiten')
-            index_einsatz = splited.index('Einsatz')
-            index_hinweis = splited.index('Hinweis')
-            index_maps = splited.index('Google Maps')
-        except ValueError as err:
-            self.logger.error('[%s] PDF file does not look like a Einsatzausdruck: %s', f_id, err)
-            return False
-
-        # the PDF parsing not always produces the same output
-        # let's define the already known output
-        if index_bemerkungen == 6:
-            self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)
-            # get length of bemerkungen field
-            # it lives between the line which contains 'Bemerkungen' and
-            # the line 'Disponierte Einheiten'
-            length_bemerkungen = index_auftrag - index_bemerkungen - 1
-            erfasser = splited[index_dispo - 2]
-            auftrag = splited[index_erfasser + 2]
-            datum = splited[index_erfasser + 3]
-            zeit = splited[index_erfasser + 4]
-            einsatz = splited[index_einsatz - 6]
-            sondersignal = splited[index_einsatz - 5]
-            ort = splited[index_einsatz - 3]
-            strasse = splited[index_einsatz - 2]
-            # sometimes there is just a phone number for the field melder but on
-            # the second line, so the lines vary for erfasser and melder
-            if index_dispo - index_erfasser == 10:
-                melder = splited[index_dispo - 4] + ', ' + splited[index_dispo - 3]
-            else:
-                melder = splited[index_dispo - 4]
-        # BMA style
-        elif index_bemerkungen == 20:
-            self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)
-            length_bemerkungen = index_dispo - index_bemerkungen - 1
-            erfasser = splited[index_bemerkungen - 2]
-            auftrag = splited[index_einsatzauftragfw + 2]
-            datum = splited[index_einsatzauftragfw + 3]
-            zeit = splited[index_einsatzauftragfw + 4]
-            einsatz = splited[index_einsatz + 6]
-            sondersignal = splited[index_einsatz + 7]
-            ort = splited[index_einsatz + 9]
-            strasse = splited[index_einsatz + 10]
-            melder = 'BMA' # There is no melder on a BMA Einsatzausdruck
-        elif index_bemerkungen == 21 or index_bemerkungen == 22:
-            self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)
-            length_bemerkungen = index_dispo - index_bemerkungen - 1
-            erfasser = splited[index_bemerkungen - 2]
-            auftrag = splited[index_erfasser + 2]
-            datum = splited[index_erfasser + 3]
-            zeit = splited[index_erfasser + 4]
-            einsatz = splited[index_einsatz - 6]
-            sondersignal = splited[index_einsatz - 5]
-            ort = splited[index_einsatz - 3]
-            strasse = splited[index_einsatz - 2]
-            if index_bemerkungen - index_erfasser == 10:
-                melder = splited[index_bemerkungen - 4] + ', ' + splited[index_bemerkungen - 3]
-            else:
-                melder = splited[index_bemerkungen - 4]
-        elif index_bemerkungen == 24:
-            self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)
-            length_bemerkungen = index_dispo - index_bemerkungen - 1
-            erfasser = splited[index_bemerkungen - 2]
-            auftrag = splited[index_einsatzauftragfw + 4]
-            datum = splited[index_einsatzauftragfw + 9]
-            zeit = splited[index_einsatzauftragfw + 10]
-            einsatz = splited[index_einsatz - 4]
-            sondersignal = splited[index_einsatz - 3]
-            ort = ''
-            strasse = splited[index_einsatz - 2]
-            melder = splited[index_dispo - 8] + ', ' + splited[index_dispo - 7]
-        else:
-            self.logger.error('[%s] Unknown location of Bemerkungen. Line %s', f_id, index_bemerkungen)
-            return False
-
-        # sanity check to see if we can correlate the f_id
-        if f_id == auftrag:
-            self.logger.info('[%s] ID matches in PDF', f_id)
-        else:
-            self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, auftrag)
-            return False
-
-        # try to find out if there is a hinweis
-        # if yes, the difference between the indexes is 4, else it's shorter
-        if index_maps - index_hinweis == 4:
-            hinweis = splited[index_hinweis+2]
-        else:
-            hinweis = ''
-
-        data = {
-            'auftrag': auftrag,
-            'datum': datum,
-            'zeit': zeit,
-            'melder': melder,
-            'erfasser': erfasser,
-            'bemerkungen': self.concatenate_to_multiline_string(
-                splited,
-                index_bemerkungen + 1,
-                index_bemerkungen + length_bemerkungen
-            ).rstrip(),
-            'einsatz': einsatz,
-            'sondersignal': sondersignal,
-            'ort': ort.title(),
-            'strasse': strasse.title(),
-            #'objekt': splited[],
-            'hinweis': hinweis,
-        }
-        return data
-
-    def extract_einsatzprotokoll(self, file, f_id):
-        """ extracts as many information from the parsed Einsatzprotokoll as possible """
-
-        splited = self.convert(file).splitlines()
-
-        # sanity check to see if we can correlate the f_id
-        if f_id == splited[26]:
-            self.logger.info('[%s] ID matches in PDF', f_id)
-        else:
-            self.logger.error('[%s] ID does not match in PDF', f_id)
-            return False
-
-        data = {
-            'auftrag': splited[26],
-            'datum': splited[25],
-            'angelegt': splited[28],
-            'disposition': splited[30],
-            'ausgerueckt': splited[32],
-            'anort': splited[33],
-        }
-        return data
--- a/library/pdftotext.py
+++ b/library/pdftotext.py
@ -0,0 +1,140 @@
+#!/usr/bin/env python3
+
+""" extracts data from ELZ PDFs using Poppler pdftotext """
+
+import subprocess
+import logging
+
+class PDFParsing:
+    """ PDF parsing based on the external Poppler tool pdftotext """
+
+    # absolute path of the pdftotext binary which is executed per field
+    PDFTOTEXT = '/usr/bin/pdftotext'
+
+    def __init__(self):
+        self.logger = logging.getLogger(__name__)
+        self.logger.info('PDF parsing based on pdftotext loaded')
+
+    def extract(self, f_id, file, datafields):
+        """ extracts the text of every crop area in datafields from page 1
+
+        f_id:       ID used as log prefix and compared with the extracted
+                    field 'auftrag'
+        file:       path of the PDF file handed over to pdftotext
+        datafields: dict mapping a field name to a dict with the keys
+                    'xMin', 'yMin', 'xMax', 'yMax' and optionally 'edit'
+                    ('title' applies str.title() to the extracted text)
+
+        Returns a dict mapping each field name to its extracted text, or
+        False if pdftotext exits with an error or the extracted 'auftrag'
+        does not equal f_id.
+        """
+
+        data = {}
+
+        for field, coordinate in datafields.items():
+
+            # x-coordinate of the crop area top left corner
+            x = coordinate['xMin']
+
+            # y-coordinate of the crop area top left corner
+            y = coordinate['yMin']
+
+            # width of crop area in pixels
+            w = coordinate['xMax'] - coordinate['xMin']
+
+            # height of crop area in pixels
+            h = coordinate['yMax'] - coordinate['yMin']
+
+            self.logger.debug(
+                '[%s] Computed command for field %s: '
+                'pdftotext -f 1 -l 1 -x %s -y %s -W %s -H %s',
+                f_id, field, x, y, w, h,
+            )
+
+            # stderr is captured separately: merged into stdout, messages
+            # printed by pdftotext would end up inside the field value
+            result = subprocess.run(
+                [
+                    self.PDFTOTEXT,
+                    '-f', '1',
+                    '-l', '1',
+                    '-x', str(x),
+                    '-y', str(y),
+                    '-W', str(w),
+                    '-H', str(h),
+                    file,
+                    '-',
+                ],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=False,
+            )
+
+            # a failed run yields no trustworthy text, so give up early
+            if result.returncode != 0:
+                self.logger.error(
+                    '[%s] pdftotext failed for field %s (exit code %s): %s',
+                    f_id, field, result.returncode, result.stderr.strip(),
+                )
+                return False
+
+            ## TODO: fixup some fields (lowercase, remove unnecessary \n)
+            text = result.stdout.rstrip()
+            if coordinate.get('edit') == 'title':
+                text = text.title()
+            data[field] = text
+
+        # sanity check to see if we can correlate the f_id
+        # .get() keeps a missing 'auftrag' field on the "return False" path
+        auftrag = data.get('auftrag')
+        if f_id == auftrag:
+            self.logger.debug('[%s] ID matches in PDF', f_id)
+            return data
+
+        self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, auftrag)
+        return False
+
+    def extract_einsatzausdruck(self, file, f_id):
+        """ extracts information from Einsatzausdruck using external pdftotext
+
+        Returns the result of extract() for the crop areas defined below:
+        a dict of field texts, or False on failure.
+        """
+
+        self.logger.debug('[%s] Parsing PDF: %s', f_id, file)
+
+        # Get them using 'pdftotext -bbox'
+        # y = row
+        # x = column: xMax 450 / 590 means full width
+        coordinates = {
+            'auftrag': {
+                'xMin': 70, 'yMin': 47, 'xMax': 120, 'yMax': 58,
+            },
+            'datum': {
+                'xMin': 190, 'yMin': 47, 'xMax': 239, 'yMax': 58,
+            },
+            'zeit': {
+                'xMin': 190, 'yMin': 59, 'xMax': 215, 'yMax': 70,
+            },
+            'melder': {
+                'xMin': 304, 'yMin': 47, 'xMax': 446, 'yMax': 70, 'edit': 'title'
+            },
+            'erfasser': {
+                'xMin': 448, 'yMin': 59, 'xMax': 478, 'yMax': 70,
+            },
+            # big field until "Disponierte Einheiten"
+            'bemerkungen': {
+                'xMin': 28, 'yMin': 112, 'xMax': 590, 'yMax': 350,
+            },
+            'disponierteeinheiten': {
+                'xMin': 28, 'yMin': 366, 'xMax': 450, 'yMax': 376,
+            },
+            'einsatz': {
+                'xMin': 76, 'yMin': 690, 'xMax': 450, 'yMax': 703,
+            },
+            'sondersignal': {
+                'xMin': 76, 'yMin': 707, 'xMax': 450, 'yMax': 721,
+            },
+            'ort': {
+                'xMin': 76, 'yMin': 732, 'xMax': 590, 'yMax': 745,
+            },
+            'hinweis': {
+                'xMin': 76, 'yMin': 773, 'xMax': 450, 'yMax': 787,
+            },
+        }
+
+        return self.extract(f_id, file, coordinates)
+
+    def extract_einsatzprotokoll(self, file, f_id):
+        """ extracts information from Einsatzprotokoll using external pdftotext
+
+        Returns the result of extract() for the crop areas defined below:
+        a dict of field texts, or False on failure.
+        """
+
+        self.logger.debug('[%s] Parsing PDF: %s', f_id, file)
+
+        # Get them using 'pdftotext -bbox'
+        # y = row
+        # x = column: xMax 450 / 590 means full width
+        coordinates = {
+            'auftrag': {
+                'xMin': 192, 'yMin': 132, 'xMax': 238, 'yMax': 142,
+            },
+            'angelegt': {
+                'xMin': 192, 'yMin': 294, 'xMax': 226, 'yMax': 304,
+            },
+            'dispo': {
+                'xMin': 192, 'yMin': 312, 'xMax': 226, 'yMax': 322,
+            },
+            'ausgerueckt': {
+                'xMin': 192, 'yMin': 331, 'xMax': 226, 'yMax': 341,
+            },
+            'vorort': {
+                'xMin': 192, 'yMin': 348, 'xMax': 226, 'yMax': 358,
+            },
+        }
+
+        return self.extract(f_id, file, coordinates)
--- a/main.py
+++ b/main.py
@ -14,7 +14,7 @@ from library.emailhandling import EmailHandling
 from library.lodur import Lodur
 from library.mqtt import MQTTClient
 from library.gotify import GotifyClient
-from library.pdf_extract import PDFHandling
+from library.pdftotext import PDFParsing
 from library.webdav import WebDav

 # Configuration
@ -92,7 +92,7 @@ def main():
    )

    # Initialize PDF Parser
-    pdf = PDFHandling()
+    pdf = PDFParsing()

    # Main Loop
    while True:
--- a/test_pdf_parsing.py
+++ b/test_pdf_parsing.py
@ -1,21 +0,0 @@
-import re
-import logging
-from pprint import pprint
-from pathlib import Path
-from library.pdf_extract import PDFHandling
-
-PATH = '/tmp/pylokid'
-
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
-
-PDF = PDFHandling()
-
-for path in Path(PATH).glob('**/*.pdf'):
-    file = str(path)
-    print(file)
-    f_id = re.search('.*(F[0-9]{8})_.*', file).group(1)
-    print(f_id)
-    pprint(PDF.extract_einsatzausdruck(file, f_id))
--- a/test_pdftotext.py
+++ b/test_pdftotext.py
@ -0,0 +1,30 @@
+""" manual check: parses all Einsatzausdruck PDFs below a directory """
+
+import logging
+import re
+import sys
+from pathlib import Path
+from pprint import pprint
+
+from library.pdftotext import PDFParsing
+
+# default directory which is searched recursively for PDFs; another
+# directory can be given as first command line argument
+PATH = '/home/tobru/Documents/Feuerwehr/Stab/Fourier/Einsatzdepeschen/2019'
+
+# ID in the file path: "F", eight digits, followed by an underscore
+F_ID_PATTERN = re.compile(r'.*(F[0-9]{8})_.*')
+
+
+def parse_all(base, pattern, extractor):
+    """ prints path, ID and parsed data of every PDF matching pattern
+
+    base:      directory which is searched with Path.glob()
+    pattern:   glob pattern relative to base
+    extractor: callable taking (file, f_id), its result gets pretty-printed
+    """
+
+    for path in Path(base).glob(pattern):
+        file = str(path)
+        print(file)
+        match = F_ID_PATTERN.search(file)
+        if match is None:
+            # without an ID in the path there is nothing to correlate
+            print('no ID found in path, skipping')
+            continue
+        f_id = match.group(1)
+        print(f_id)
+        pprint(extractor(file, f_id))
+
+
+def main():
+    """ configures logging and parses the PDFs below the chosen directory """
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    )
+
+    base = sys.argv[1] if len(sys.argv) > 1 else PATH
+    pdf = PDFParsing()
+
+    parse_all(base, '**/Einsatzausdruck*.pdf', pdf.extract_einsatzausdruck)
+
+    # NOTE: parsing of Einsatzprotokoll PDFs stays disabled, enable if needed
+    # parse_all(base, '**/Einsatzprotokoll*.pdf', pdf.extract_einsatzprotokoll)
+
+
+# guard keeps an import (e.g. test collection) free of side effects
+if __name__ == '__main__':
+    main()