#!/usr/bin/env python3

""" extracts data from ELZ PDFs """

import io
import logging

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage


class PDFHandling:
    """ PDF handling like parsing """

    def __init__(self):
        """ sets up the instance logger and quietens the pdfminer loggers """
        self.logger = logging.getLogger(__name__)

        # less logging for pdfminer - more is not needed
        for name in (
                'pdfminer.pdfdocument',
                'pdfminer.pdfpage',
                'pdfminer.pdfinterp',
                'pdfminer.psparser',
                'pdfminer.cmapdb',
                'pdfminer.pdfparser',
        ):
            logging.getLogger(name).setLevel(logging.WARNING)

    def concatenate_to_multiline_string(self, data, start, end):
        """
        concatenates multiple lines to a single multiline string

        Joins data[start] .. data[end] (both inclusive), terminating every
        line - including the last one - with a newline. Returns an empty
        string when end < start. Raises IndexError when an index in the
        range is outside of data.
        """
        # index element by element (instead of slicing) so that an
        # out-of-range index still raises instead of being silently clipped
        return ''.join(data[pos] + '\n' for pos in range(start, end + 1))

    def convert(self, file):
        """
        converts the PDF to a multiline string

        file is the path of the PDF to read. Returns the text pdfminer
        extracted from all pages as one string.
        """
        pagenums = set()  # passed on empty, i.e. no specific pages selected
        manager = PDFResourceManager()
        codec = 'utf-8'
        caching = True

        output = io.StringIO()
        converter = TextConverter(manager, output, codec=codec, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)

            # the context manager closes the file even if parsing fails
            with open(file, 'rb') as infile:
                for page in PDFPage.get_pages(infile,
                                              pagenums,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)

            # the value is read before the buffer is closed in finally
            return output.getvalue()
        finally:
            converter.close()
            output.close()

    def extract_einsatzausdruck(self, file, f_id):
        """
        extracts as many information from the parsed Einsatzausdruck as
        possible

        file is the path of the PDF, f_id the ID which is expected in the
        'auftrag' field of the document (also used as log prefix).

        Returns a dict with the keys auftrag, datum, zeit, melder, erfasser,
        bemerkungen, einsatz, sondersignal, ort, strasse and hinweis.
        Returns False if a well-known marker line is missing, the layout is
        unknown, an expected line lies beyond the end of the document or
        the ID in the document does not match f_id.
        """
        converted = self.convert(file)
        splited = converted.splitlines()

        self.logger.debug('[%s] Parsed PDF raw:\n %s', f_id, converted)
        self.logger.debug('[%s] Line-splited PDF: %s', f_id, splited)

        # search some well-known words for later positional computation
        try:
            index_einsatzauftragfw = splited.index('Einsatzauftrag Feuerwehr')
            index_erfasser = splited.index('Erfasser')
            index_auftrag = splited.index('Auftrag')
            index_bemerkungen = splited.index('Bemerkungen')
            index_dispo = splited.index('Disponierte Einheiten')
            index_einsatz = splited.index('Einsatz')
            index_hinweis = splited.index('Hinweis')
            index_maps = splited.index('Google Maps')
        except ValueError as err:
            self.logger.error('[%s] PDF file does not look like a Einsatzausdruck: %s', f_id, err)
            return False

        # the PDF parsing not always produces the same output
        # let's define the already known output
        try:
            if index_bemerkungen == 6:
                self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)
                # get length of bemerkungen field
                # in this layout it is computed from the line which contains
                # 'Bemerkungen' up to the line 'Auftrag'
                length_bemerkungen = index_auftrag - index_bemerkungen - 1
                erfasser = splited[index_dispo - 2]
                auftrag = splited[index_erfasser + 2]
                datum = splited[index_erfasser + 3]
                zeit = splited[index_erfasser + 4]
                einsatz = splited[index_einsatz - 6]
                sondersignal = splited[index_einsatz - 5]
                ort = splited[index_einsatz - 3]
                strasse = splited[index_einsatz - 2]
                # sometimes there is just a phone number for the field melder
                # but on the second line, so the lines vary for erfasser and
                # melder
                if index_dispo - index_erfasser == 10:
                    melder = splited[index_dispo - 4] + ', ' + splited[index_dispo - 3]
                else:
                    melder = splited[index_dispo - 4]
            # BMA style
            elif index_bemerkungen == 20:
                self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)
                # here the field lives between the line which contains
                # 'Bemerkungen' and the line 'Disponierte Einheiten'
                length_bemerkungen = index_dispo - index_bemerkungen - 1
                erfasser = splited[index_bemerkungen - 2]
                auftrag = splited[index_einsatzauftragfw + 2]
                datum = splited[index_einsatzauftragfw + 3]
                zeit = splited[index_einsatzauftragfw + 4]
                einsatz = splited[index_einsatz + 6]
                sondersignal = splited[index_einsatz + 7]
                ort = splited[index_einsatz + 9]
                strasse = splited[index_einsatz + 10]
                melder = 'BMA'  # There is no melder on a BMA Einsatzausdruck
            elif index_bemerkungen in (21, 22):
                self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)
                length_bemerkungen = index_dispo - index_bemerkungen - 1
                erfasser = splited[index_bemerkungen - 2]
                auftrag = splited[index_erfasser + 2]
                datum = splited[index_erfasser + 3]
                zeit = splited[index_erfasser + 4]
                einsatz = splited[index_einsatz - 6]
                sondersignal = splited[index_einsatz - 5]
                ort = splited[index_einsatz - 3]
                strasse = splited[index_einsatz - 2]
                # same two-line melder special case as above, but measured
                # relative to 'Bemerkungen'
                if index_bemerkungen - index_erfasser == 10:
                    melder = (splited[index_bemerkungen - 4] + ', '
                              + splited[index_bemerkungen - 3])
                else:
                    melder = splited[index_bemerkungen - 4]
            elif index_bemerkungen == 24:
                self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)
                length_bemerkungen = index_dispo - index_bemerkungen - 1
                erfasser = splited[index_bemerkungen - 2]
                auftrag = splited[index_einsatzauftragfw + 4]
                datum = splited[index_einsatzauftragfw + 9]
                zeit = splited[index_einsatzauftragfw + 10]
                einsatz = splited[index_einsatz - 4]
                sondersignal = splited[index_einsatz - 3]
                ort = ''  # no ort is read for this layout
                strasse = splited[index_einsatz - 2]
                melder = splited[index_dispo - 8] + ', ' + splited[index_dispo - 7]
            else:
                self.logger.error('[%s] Unknown location of Bemerkungen. Line %s',
                                  f_id, index_bemerkungen)
                return False
        except IndexError as err:
            # a fixed offset pointed beyond the end of the document, treat
            # it like every other unexpected document instead of crashing
            self.logger.error('[%s] PDF file does not look like a Einsatzausdruck: %s', f_id, err)
            return False

        # sanity check to see if we can correlate the f_id
        if f_id == auftrag:
            self.logger.info('[%s] ID matches in PDF', f_id)
        else:
            self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, auftrag)
            return False

        # try to find out if there is a hinweis
        # if yes, the difference between the indexes is 4, else it's shorter
        if index_maps - index_hinweis == 4:
            hinweis = splited[index_hinweis + 2]
        else:
            hinweis = ''

        bemerkungen = self.concatenate_to_multiline_string(
            splited,
            index_bemerkungen + 1,
            index_bemerkungen + length_bemerkungen
        ).rstrip()

        # TODO: the field 'objekt' is not extracted yet
        data = {
            'auftrag': auftrag,
            'datum': datum,
            'zeit': zeit,
            'melder': melder,
            'erfasser': erfasser,
            'bemerkungen': bemerkungen,
            'einsatz': einsatz,
            'sondersignal': sondersignal,
            'ort': ort.title(),
            'strasse': strasse.title(),
            'hinweis': hinweis,
        }
        return data

    def extract_einsatzprotokoll(self, file, f_id):
        """
        extracts as many information from the parsed Einsatzprotokoll as
        possible

        file is the path of the PDF, f_id the ID which is expected on line
        index 26 of the parsed document (also used as log prefix).

        Returns a dict with the keys auftrag, datum, angelegt, disposition,
        ausgerueckt and anort, read from fixed line positions. Returns False
        if the document is too short or the ID does not match f_id.
        """
        splited = self.convert(file).splitlines()

        # the highest fixed position read below is index 33
        if len(splited) < 34:
            self.logger.error(
                '[%s] PDF file does not look like a Einsatzprotokoll: only %s lines',
                f_id, len(splited))
            return False

        # sanity check to see if we can correlate the f_id
        if f_id == splited[26]:
            self.logger.info('[%s] ID matches in PDF', f_id)
        else:
            self.logger.error('[%s] ID does not match in PDF', f_id)
            return False

        data = {
            'auftrag': splited[26],
            'datum': splited[25],
            'angelegt': splited[28],
            'disposition': splited[30],
            'ausgerueckt': splited[32],
            'anort': splited[33],
        }
        return data