pylokid/library/pdf_extract.py

210 lines
8.5 KiB
Python
Raw Normal View History

2017-12-28 19:07:56 +00:00
#!/usr/bin/env python3
2017-12-30 16:01:13 +00:00
""" extracts data from ELZ PDFs """
2017-12-28 19:07:56 +00:00
import io
2017-12-30 16:01:13 +00:00
import logging
2017-12-28 19:07:56 +00:00
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
2017-12-30 16:01:13 +00:00
from pdfminer.converter import TextConverter
2017-12-28 19:07:56 +00:00
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
2017-12-30 18:19:40 +00:00
class PDFHandling:
    """ PDF handling like parsing

    Converts ELZ PDFs to text with pdfminer and picks individual fields out
    of the resulting lines by their position relative to well-known
    marker lines. All ``extract_*`` methods return a ``dict`` on success and
    ``False`` when the document cannot be interpreted.
    """

    # pdfminer sub-loggers which are silenced below WARNING
    _PDFMINER_LOGGERS = (
        'pdfminer.pdfdocument',
        'pdfminer.pdfpage',
        'pdfminer.pdfinterp',
        'pdfminer.psparser',
        'pdfminer.cmapdb',
        'pdfminer.pdfparser',
    )

    # line numbers of 'Bemerkungen' for which a field layout is known
    _KNOWN_BEMERKUNGEN_LINES = (6, 20, 21, 22, 24)

    # extract_einsatzprotokoll reads fixed lines up to index 33
    _EINSATZPROTOKOLL_MIN_LINES = 34

    def __init__(self):
        self.logger = logging.getLogger(__name__)

        # less logging for pdfminer - more is not needed
        for name in self._PDFMINER_LOGGERS:
            logging.getLogger(name).setLevel(logging.WARNING)

    def concatenate_to_multiline_string(self, data, start, end):
        """ concatenates multiple lines to a single multiline string

        Joins ``data[start]`` up to and including ``data[end]``, terminating
        every line with a newline. Returns an empty string when ``end`` is
        smaller than ``start``. Raises ``IndexError`` when an index in the
        range does not exist in ``data``.
        """
        # index explicitly (instead of slicing) so out-of-range bounds still raise
        return ''.join(data[pos] + '\n' for pos in range(start, end + 1))

    def convert(self, file):
        """ converts the PDF to a multiline string

        ``file`` is the path of the PDF to read. Returns the text which
        pdfminer extracted from all pages.
        """
        pagenums = set()
        manager = PDFResourceManager()
        codec = 'utf-8'
        caching = True

        output = io.StringIO()
        converter = TextConverter(manager, output, codec=codec, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # the context manager closes the PDF even if a page fails to parse
            with open(file, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums, caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
            return output.getvalue()
        finally:
            converter.close()
            output.close()

    def extract_einsatzausdruck(self, file, f_id):
        """ extracts as many information from the parsed Einsatzausdruck as possible

        ``file`` is the path of the PDF, ``f_id`` the ID which is expected in
        the 'auftrag' field. Returns a dict with the extracted fields or
        ``False`` if the PDF could not be interpreted or the ID does not match.
        """
        converted = self.convert(file)
        splited = converted.splitlines()

        self.logger.debug('[%s] Parsed PDF raw:\n %s', f_id, converted)
        self.logger.debug('[%s] Line-splited PDF: %s', f_id, splited)

        return self._parse_einsatzausdruck(splited, f_id)

    def _parse_einsatzausdruck(self, splited, f_id):
        """ picks the Einsatzausdruck fields out of the already split lines """
        # search some well-known words for later positional computation
        try:
            index_einsatzauftragfw = splited.index('Einsatzauftrag Feuerwehr')
            index_erfasser = splited.index('Erfasser')
            index_auftrag = splited.index('Auftrag')
            index_bemerkungen = splited.index('Bemerkungen')
            index_dispo = splited.index('Disponierte Einheiten')
            index_einsatz = splited.index('Einsatz')
            index_hinweis = splited.index('Hinweis')
            index_maps = splited.index('Google Maps')
        except ValueError as err:
            self.logger.error('[%s] PDF file does not look like a Einsatzausdruck: %s', f_id, err)
            return False

        # the PDF parsing not always produces the same output
        # let's define the already known output
        if index_bemerkungen not in self._KNOWN_BEMERKUNGEN_LINES:
            self.logger.error('[%s] Unknown location of Bemerkungen. Line %s', f_id, index_bemerkungen)
            return False
        self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)

        try:
            if index_bemerkungen == 6:
                # length of the bemerkungen field: here it is bounded by the
                # line 'Auftrag'
                # NOTE(review): the previous comment named 'Disponierte
                # Einheiten' as the boundary while the code uses 'Auftrag' - confirm
                length_bemerkungen = index_auftrag - index_bemerkungen - 1
                erfasser = splited[index_dispo - 2]
                auftrag = splited[index_erfasser + 2]
                datum = splited[index_erfasser + 3]
                zeit = splited[index_erfasser + 4]
                einsatz = splited[index_einsatz - 6]
                sondersignal = splited[index_einsatz - 5]
                ort = splited[index_einsatz - 3]
                strasse = splited[index_einsatz - 2]
                # sometimes there is just a phone number for the field melder but on
                # the second line, so the lines vary for erfasser and melder
                if index_dispo - index_erfasser == 10:
                    melder = splited[index_dispo - 4] + ', ' + splited[index_dispo - 3]
                else:
                    melder = splited[index_dispo - 4]
            elif index_bemerkungen == 20:
                # BMA style
                length_bemerkungen = index_dispo - index_bemerkungen - 1
                erfasser = splited[index_bemerkungen - 2]
                auftrag = splited[index_einsatzauftragfw + 2]
                datum = splited[index_einsatzauftragfw + 3]
                zeit = splited[index_einsatzauftragfw + 4]
                einsatz = splited[index_einsatz + 6]
                sondersignal = splited[index_einsatz + 7]
                ort = splited[index_einsatz + 9]
                strasse = splited[index_einsatz + 10]
                melder = 'BMA'  # There is no melder on a BMA Einsatzausdruck
            elif index_bemerkungen in (21, 22):
                length_bemerkungen = index_dispo - index_bemerkungen - 1
                erfasser = splited[index_bemerkungen - 2]
                auftrag = splited[index_erfasser + 2]
                datum = splited[index_erfasser + 3]
                zeit = splited[index_erfasser + 4]
                einsatz = splited[index_einsatz - 6]
                sondersignal = splited[index_einsatz - 5]
                ort = splited[index_einsatz - 3]
                strasse = splited[index_einsatz - 2]
                if index_bemerkungen - index_erfasser == 10:
                    melder = splited[index_bemerkungen - 4] + ', ' + splited[index_bemerkungen - 3]
                else:
                    melder = splited[index_bemerkungen - 4]
            else:
                # only line 24 is left of the known locations
                length_bemerkungen = index_dispo - index_bemerkungen - 1
                erfasser = splited[index_bemerkungen - 2]
                auftrag = splited[index_einsatzauftragfw + 4]
                datum = splited[index_einsatzauftragfw + 9]
                zeit = splited[index_einsatzauftragfw + 10]
                einsatz = splited[index_einsatz - 4]
                sondersignal = splited[index_einsatz - 3]
                ort = ''
                strasse = splited[index_einsatz - 2]
                melder = splited[index_dispo - 8] + ', ' + splited[index_dispo - 7]
        except IndexError as err:
            # the fixed offsets point outside of the document, so it is not
            # one of the known layouts after all
            self.logger.error('[%s] PDF file is too short for a Einsatzausdruck: %s', f_id, err)
            return False

        # sanity check to see if we can correlate the f_id
        if f_id != auftrag:
            self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, auftrag)
            return False
        self.logger.info('[%s] ID matches in PDF', f_id)

        # try to find out if there is a hinweis
        # if yes, the difference between the indexes is 4, else it's shorter
        if index_maps - index_hinweis == 4:
            hinweis = splited[index_hinweis + 2]
        else:
            hinweis = ''

        return {
            'auftrag': auftrag,
            'datum': datum,
            'zeit': zeit,
            'melder': melder,
            'erfasser': erfasser,
            'bemerkungen': self.concatenate_to_multiline_string(
                splited,
                index_bemerkungen + 1,
                index_bemerkungen + length_bemerkungen
            ).rstrip(),
            'einsatz': einsatz,
            'sondersignal': sondersignal,
            'ort': ort.title(),
            'strasse': strasse.title(),
            'hinweis': hinweis,
        }

    def extract_einsatzprotokoll(self, file, f_id):
        """ extracts as many information from the parsed Einsatzprotokoll as possible

        ``file`` is the path of the PDF, ``f_id`` the ID which is expected on
        line 26 of the converted text. Returns a dict with the extracted
        fields or ``False`` if the PDF is too short or the ID does not match.
        """
        splited = self.convert(file).splitlines()
        return self._parse_einsatzprotokoll(splited, f_id)

    def _parse_einsatzprotokoll(self, splited, f_id):
        """ picks the Einsatzprotokoll fields out of the already split lines """
        # all fields are read from fixed line numbers, so make sure they exist
        if len(splited) < self._EINSATZPROTOKOLL_MIN_LINES:
            self.logger.error(
                '[%s] PDF file is too short for a Einsatzprotokoll: %s lines',
                f_id, len(splited)
            )
            return False

        # sanity check to see if we can correlate the f_id
        if f_id != splited[26]:
            self.logger.error('[%s] ID does not match in PDF', f_id)
            return False
        self.logger.info('[%s] ID matches in PDF', f_id)

        return {
            'auftrag': splited[26],
            'datum': splited[25],
            'angelegt': splited[28],
            'disposition': splited[30],
            'ausgerueckt': splited[32],
            'anort': splited[33],
        }