diff --git a/library/lodur.py b/library/lodur.py index 2615135..bf1e569 100644 --- a/library/lodur.py +++ b/library/lodur.py @@ -74,7 +74,7 @@ class Lodur: '%H:%M', ) zh_am_schad = datetime.strptime( - pdf_data['anort'], + pdf_data['vorort'], '%H:%M', ) except ValueError as err: @@ -120,9 +120,9 @@ class Lodur: '%H:%M', ) eins_ereig = pdf_data['einsatz'] - bemerkungen = pdf_data['bemerkungen'] + bemerkungen = pdf_data['bemerkungen'] + '\n' + pdf_data['disponierteeinheiten'] wer_ala = pdf_data['melder'] - adr = pdf_data['strasse'] + ', ' + pdf_data['ort'] + adr = pdf_data['ort'] else: date = datetime.now() time = datetime.now() diff --git a/library/pdf_extract.py b/library/pdf_extract.py deleted file mode 100644 index b21caf0..0000000 --- a/library/pdf_extract.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 - -""" extracts data from ELZ PDFs """ - -import io -import logging -from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter -from pdfminer.converter import TextConverter -from pdfminer.layout import LAParams -from pdfminer.pdfpage import PDFPage - -class PDFHandling: - """ PDF handling like parsing """ - - def __init__(self): - self.logger = logging.getLogger(__name__) - - # less logging for pdfminer - more is not needed - logger_doc = logging.getLogger('pdfminer.pdfdocument') - logger_doc.setLevel(logging.WARNING) - logger_page = logging.getLogger('pdfminer.pdfpage') - logger_page.setLevel(logging.WARNING) - logger_interp = logging.getLogger('pdfminer.pdfinterp') - logger_interp.setLevel(logging.WARNING) - logger_psparser = logging.getLogger('pdfminer.psparser') - logger_psparser.setLevel(logging.WARNING) - logger_cmapdb = logging.getLogger('pdfminer.cmapdb') - logger_cmapdb.setLevel(logging.WARNING) - logger_pdfparser = logging.getLogger('pdfminer.pdfparser') - logger_pdfparser.setLevel(logging.WARNING) - - def concatenate_to_multiline_string(self, data, start, end): - """ concatenates multiple lines to a single multiline string """ - 
- res = '' - counter = start - while counter <= end: - res += data[counter] + '\n' - counter += 1 - return res - - def convert(self, file): - """ converts the PDF to a multiline string """ - - pagenums = set() - manager = PDFResourceManager() - codec = 'utf-8' - caching = True - - output = io.StringIO() - converter = TextConverter(manager, output, codec=codec, laparams=LAParams()) - - interpreter = PDFPageInterpreter(manager, converter) - infile = open(file, 'rb') - - for page in PDFPage.get_pages(infile, pagenums, caching=caching, check_extractable=True): - interpreter.process_page(page) - - converted_pdf = output.getvalue() - - infile.close() - converter.close() - output.close() - return converted_pdf - - def extract_einsatzausdruck(self, file, f_id): - """ extracts as many information from the parsed Einsatzausdruck as possible """ - - converted = self.convert(file) - splited = converted.splitlines() - - self.logger.debug('[%s] Parsed PDF raw:\n %s', f_id, converted) - self.logger.debug('[%s] Line-splited PDF: %s', f_id, splited) - - # search some well-known words for later positional computation - try: - index_einsatzauftragfw = splited.index('Einsatzauftrag Feuerwehr') - index_erfasser = splited.index('Erfasser') - index_auftrag = splited.index('Auftrag') - index_bemerkungen = splited.index('Bemerkungen') - index_dispo = splited.index('Disponierte Einheiten') - index_einsatz = splited.index('Einsatz') - index_hinweis = splited.index('Hinweis') - index_maps = splited.index('Google Maps') - except ValueError as err: - self.logger.error('[%s] PDF file does not look like a Einsatzausdruck: %s', f_id, err) - return False - - # the PDF parsing not always produces the same output - # let's define the already known output - if index_bemerkungen == 6: - self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen) - # get length of bemerkungen field - # it lives between the line which contains 'Bemerkungen' and - # the line 'Disponierte Einheiten' - 
length_bemerkungen = index_auftrag - index_bemerkungen - 1 - erfasser = splited[index_dispo - 2] - auftrag = splited[index_erfasser + 2] - datum = splited[index_erfasser + 3] - zeit = splited[index_erfasser + 4] - einsatz = splited[index_einsatz - 6] - sondersignal = splited[index_einsatz - 5] - ort = splited[index_einsatz - 3] - strasse = splited[index_einsatz - 2] - # sometimes there is just a phone number for the field melder but on - # the second line, so the lines vary for erfasser and melder - if index_dispo - index_erfasser == 10: - melder = splited[index_dispo - 4] + ', ' + splited[index_dispo - 3] - else: - melder = splited[index_dispo - 4] - # BMA style - elif index_bemerkungen == 20: - self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen) - length_bemerkungen = index_dispo - index_bemerkungen - 1 - erfasser = splited[index_bemerkungen - 2] - auftrag = splited[index_einsatzauftragfw + 2] - datum = splited[index_einsatzauftragfw + 3] - zeit = splited[index_einsatzauftragfw + 4] - einsatz = splited[index_einsatz + 6] - sondersignal = splited[index_einsatz + 7] - ort = splited[index_einsatz + 9] - strasse = splited[index_einsatz + 10] - melder = 'BMA' # There is no melder on a BMA Einsatzausdruck - elif index_bemerkungen == 21 or index_bemerkungen == 22: - self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen) - length_bemerkungen = index_dispo - index_bemerkungen - 1 - erfasser = splited[index_bemerkungen - 2] - auftrag = splited[index_erfasser + 2] - datum = splited[index_erfasser + 3] - zeit = splited[index_erfasser + 4] - einsatz = splited[index_einsatz - 6] - sondersignal = splited[index_einsatz - 5] - ort = splited[index_einsatz - 3] - strasse = splited[index_einsatz - 2] - if index_bemerkungen - index_erfasser == 10: - melder = splited[index_bemerkungen - 4] + ', ' + splited[index_bemerkungen - 3] - else: - melder = splited[index_bemerkungen - 4] - elif index_bemerkungen == 24: - self.logger.info('[%s] 
Found Bemerkungen on line %s', f_id, index_bemerkungen) - length_bemerkungen = index_dispo - index_bemerkungen - 1 - erfasser = splited[index_bemerkungen - 2] - auftrag = splited[index_einsatzauftragfw + 4] - datum = splited[index_einsatzauftragfw + 9] - zeit = splited[index_einsatzauftragfw + 10] - einsatz = splited[index_einsatz - 4] - sondersignal = splited[index_einsatz - 3] - ort = '' - strasse = splited[index_einsatz - 2] - melder = splited[index_dispo - 8] + ', ' + splited[index_dispo - 7] - else: - self.logger.error('[%s] Unknown location of Bemerkungen. Line %s', f_id, index_bemerkungen) - return False - - # sanity check to see if we can correlate the f_id - if f_id == auftrag: - self.logger.info('[%s] ID matches in PDF', f_id) - else: - self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, auftrag) - return False - - # try to find out if there is a hinweis - # if yes, the difference between the indexes is 4, else it's shorter - if index_maps - index_hinweis == 4: - hinweis = splited[index_hinweis+2] - else: - hinweis = '' - - data = { - 'auftrag': auftrag, - 'datum': datum, - 'zeit': zeit, - 'melder': melder, - 'erfasser': erfasser, - 'bemerkungen': self.concatenate_to_multiline_string( - splited, - index_bemerkungen + 1, - index_bemerkungen + length_bemerkungen - ).rstrip(), - 'einsatz': einsatz, - 'sondersignal': sondersignal, - 'ort': ort.title(), - 'strasse': strasse.title(), - #'objekt': splited[], - 'hinweis': hinweis, - } - return data - - def extract_einsatzprotokoll(self, file, f_id): - """ extracts as many information from the parsed Einsatzprotokoll as possible """ - - splited = self.convert(file).splitlines() - - # sanity check to see if we can correlate the f_id - if f_id == splited[26]: - self.logger.info('[%s] ID matches in PDF', f_id) - else: - self.logger.error('[%s] ID does not match in PDF', f_id) - return False - - data = { - 'auftrag': splited[26], - 'datum': splited[25], - 'angelegt': splited[28], - 'disposition': 
#!/usr/bin/env python3

""" extracts data from ELZ PDFs using Poppler pdftotext """

import subprocess
import logging


class PDFParsing:
    """ PDF parsing by cropping fixed page regions with pdftotext

    Every field is read by running pdftotext once on the first page of the
    document, restricted to the rectangle configured for that field.
    """

    # Location of the pdftotext binary used when none is given explicitly.
    DEFAULT_BINARY = '/usr/bin/pdftotext'

    # Crop areas of the Einsatzausdruck.
    # Get them using 'pdftotext -bbox'
    # y = row
    # x = column: xMax 450 / 590 means full width
    EINSATZAUSDRUCK_FIELDS = {
        'auftrag': {
            'xMin': 70, 'yMin': 47, 'xMax': 120, 'yMax': 58,
        },
        'datum': {
            'xMin': 190, 'yMin': 47, 'xMax': 239, 'yMax': 58,
        },
        'zeit': {
            'xMin': 190, 'yMin': 59, 'xMax': 215, 'yMax': 70,
        },
        'melder': {
            'xMin': 304, 'yMin': 47, 'xMax': 446, 'yMax': 70, 'edit': 'title'
        },
        'erfasser': {
            'xMin': 448, 'yMin': 59, 'xMax': 478, 'yMax': 70,
        },
        # big field until "Disponierte Einheiten"
        'bemerkungen': {
            'xMin': 28, 'yMin': 112, 'xMax': 590, 'yMax': 350,
        },
        'disponierteeinheiten': {
            'xMin': 28, 'yMin': 366, 'xMax': 450, 'yMax': 376,
        },
        'einsatz': {
            'xMin': 76, 'yMin': 690, 'xMax': 450, 'yMax': 703,
        },
        'sondersignal': {
            'xMin': 76, 'yMin': 707, 'xMax': 450, 'yMax': 721,
        },
        'ort': {
            'xMin': 76, 'yMin': 732, 'xMax': 590, 'yMax': 745,
        },
        'hinweis': {
            'xMin': 76, 'yMin': 773, 'xMax': 450, 'yMax': 787,
        },
    }

    # Crop areas of the Einsatzprotokoll (same coordinate conventions).
    EINSATZPROTOKOLL_FIELDS = {
        'auftrag': {
            'xMin': 192, 'yMin': 132, 'xMax': 238, 'yMax': 142,
        },
        'angelegt': {
            'xMin': 192, 'yMin': 294, 'xMax': 226, 'yMax': 304,
        },
        'dispo': {
            'xMin': 192, 'yMin': 312, 'xMax': 226, 'yMax': 322,
        },
        'ausgerueckt': {
            'xMin': 192, 'yMin': 331, 'xMax': 226, 'yMax': 341,
        },
        'vorort': {
            'xMin': 192, 'yMin': 348, 'xMax': 226, 'yMax': 358,
        },
    }

    def __init__(self, binary=DEFAULT_BINARY):
        """ sets up logging and remembers which pdftotext binary to call

        binary -- path of the pdftotext executable; defaults to the
                  previously hard-coded /usr/bin/pdftotext
        """

        self.logger = logging.getLogger(__name__)
        self.binary = binary
        self.logger.info('PDF parsing based on pdftotext loaded')

    @staticmethod
    def _crop_arguments(coordinate):
        """ translates one crop rectangle into pdftotext command line options """

        # pdftotext wants the top left corner plus width and height,
        # the field tables store two corners instead
        width = coordinate['xMax'] - coordinate['xMin']
        height = coordinate['yMax'] - coordinate['yMin']

        return [
            '-f', '1',
            '-l', '1',
            '-x', str(coordinate['xMin']),
            '-y', str(coordinate['yMin']),
            '-W', str(width),
            '-H', str(height),
        ]

    def _scrape(self, f_id, field, file, coordinate):
        """ runs pdftotext for one field, returns its text or None on failure """

        # the trailing '-' makes pdftotext write the text to stdout
        command = [self.binary] + self._crop_arguments(coordinate) + [file, '-']
        self.logger.debug(
            '[%s] Computed command for field %s: %s', f_id, field, ' '.join(command)
        )

        try:
            # stderr is captured on its own so that diagnostics never end up
            # inside the extracted field value
            result = subprocess.run(
                command,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=False,
            )
        except OSError as err:
            self.logger.error('[%s] Unable to run %s: %s', f_id, self.binary, err)
            return None

        if result.returncode != 0:
            self.logger.error(
                '[%s] pdftotext failed for field %s (exit code %s): %s',
                f_id, field, result.returncode, result.stderr.strip(),
            )
            return None

        if result.stderr.strip():
            self.logger.warning(
                '[%s] pdftotext reported for field %s: %s',
                f_id, field, result.stderr.strip(),
            )

        return result.stdout

    def extract(self, f_id, file, datafields):
        """ extracts all configured fields from the first page of a PDF

        f_id       -- expected Auftrag ID, compared with the extracted one
        file       -- path of the PDF file
        datafields -- dict mapping a field name to its crop rectangle
                      (xMin, yMin, xMax, yMax) and an optional 'edit' key;
                      'edit': 'title' title-cases the extracted text

        Returns a dict with one stripped string per field, or False when
        pdftotext could not be run, reported an error, no 'auftrag' field
        was extracted or the extracted 'auftrag' differs from f_id.
        """

        data = {}

        for field, coordinate in datafields.items():
            text = self._scrape(f_id, field, file, coordinate)
            if text is None:
                return False

            ## TODO: fixup some fields (lowercase, remove unnecessary \n)
            text = text.rstrip()
            if coordinate.get('edit') == 'title':
                text = text.title()
            data[field] = text

        # without an extracted ID there is nothing to correlate with
        if 'auftrag' not in data:
            self.logger.error('[%s] No auftrag field extracted from PDF', f_id)
            return False

        # sanity check to see if we can correlate the f_id
        if f_id == data['auftrag']:
            self.logger.debug('[%s] ID matches in PDF', f_id)
            return data

        self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, data['auftrag'])
        return False

    def extract_einsatzausdruck(self, file, f_id):
        """ extracts information from Einsatzausdruck using external pdftotext """

        self.logger.debug('[%s] Parsing PDF: %s', f_id, file)

        return self.extract(f_id, file, self.EINSATZAUSDRUCK_FIELDS)

    def extract_einsatzprotokoll(self, file, f_id):
        """ extracts information from Einsatzprotokoll using external pdftotext """

        self.logger.debug('[%s] Parsing PDF: %s', f_id, file)

        return self.extract(f_id, file, self.EINSATZPROTOKOLL_FIELDS)
""" manual check of the pdftotext based parsing against a directory of PDFs """

import argparse
import re
import logging
from pprint import pprint
from pathlib import Path
from library.pdftotext import PDFParsing

PATH = '/home/tobru/Documents/Feuerwehr/Stab/Fourier/Einsatzdepeschen/2019'

# the Auftrag ID is part of the file name: F followed by eight digits and '_'
F_ID_PATTERN = re.compile(r'.*(F[0-9]{8})_.*')


def parse_all(base, pattern, extractor):
    """ runs extractor on every PDF below base whose name matches pattern """

    for path in Path(base).glob(pattern):
        file = str(path)
        print(file)
        match = F_ID_PATTERN.search(file)
        if match is None:
            # a file without an F-number in its name cannot be correlated
            print('no F-number in file name, skipping')
            continue
        f_id = match.group(1)
        print(f_id)
        pprint(extractor(file, f_id))


def main():
    """ parses all Einsatzausdruck PDFs, optionally also Einsatzprotokoll PDFs """

    arguments = argparse.ArgumentParser(description=__doc__)
    arguments.add_argument(
        'path', nargs='?', default=PATH,
        help='directory which is searched recursively for PDFs',
    )
    arguments.add_argument(
        '--protokoll', action='store_true',
        help='additionally parse Einsatzprotokoll PDFs',
    )
    options = arguments.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    pdf = PDFParsing()

    parse_all(options.path, '**/Einsatzausdruck*.pdf', pdf.extract_einsatzausdruck)
    if options.protokoll:
        parse_all(options.path, '**/Einsatzprotokoll*.pdf', pdf.extract_einsatzprotokoll)


# keep the module importable without side effects, the file name makes
# test runners collect it
if __name__ == '__main__':
    main()