pylokid/library/pdf_extract.py

129 lines
4.3 KiB
Python
Raw Normal View History

2017-12-28 19:07:56 +00:00
#!/usr/bin/env python3
2017-12-30 16:01:13 +00:00
""" extracts data from ELZ PDFs """
2017-12-28 19:07:56 +00:00
import io
2017-12-30 16:01:13 +00:00
import logging
2017-12-28 19:07:56 +00:00
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
2017-12-30 16:01:13 +00:00
from pdfminer.converter import TextConverter
2017-12-28 19:07:56 +00:00
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
2017-12-30 18:19:40 +00:00
class PDFHandling:
    """ PDF handling like parsing """

    def __init__(self):
        """ sets up the module logger and quiets the pdfminer loggers """
        self.logger = logging.getLogger(__name__)

        # less logging for pdfminer - more is not needed
        for name in ('pdfminer.pdfdocument', 'pdfminer.pdfpage', 'pdfminer.pdfinterp'):
            logging.getLogger(name).setLevel(logging.WARNING)

    def concatenate_to_multiline_string(self, data, start, end):
        """ concatenates multiple lines to a single multiline string

        Joins data[start] .. data[end] (both inclusive); every line,
        including the last one, is terminated with a newline.
        Returns an empty string when start > end. Raises IndexError
        when an index in the range is not available in data.
        """
        return ''.join(data[index] + '\n' for index in range(start, end + 1))

    def convert(self, file):
        """ converts the PDF to a multiline string

        file is the path of the PDF to read. Returns the text which
        pdfminer extracted from the processed pages.
        """
        # an empty set is handed over as page filter
        # NOTE(review): presumably this selects all pages - confirm in pdfminer docs
        pagenums = set()
        manager = PDFResourceManager()
        output = io.StringIO()
        converter = TextConverter(manager, output, codec='utf-8', laparams=LAParams())

        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # the context manager closes the file even if parsing fails
            with open(file, 'rb') as infile:
                pages = PDFPage.get_pages(
                    infile,
                    pagenums,
                    caching=True,
                    check_extractable=True
                )
                for page in pages:
                    interpreter.process_page(page)

            # the value has to be read before the buffer gets closed
            return output.getvalue()
        finally:
            converter.close()
            output.close()

    def extract_einsatzausdruck(self, file, f_id):
        """ extracts as many information from the parsed Einsatzausdruck as possible

        file is the path of the PDF, f_id the ID which is expected in the PDF.
        Returns a dict with the extracted fields. Returns False if the ID
        does not match or if the PDF does not have the expected structure.
        """
        splited = self.convert(file).splitlines()

        self.logger.info('[%s] Parsed PDF raw: %s', f_id, splited)

        # sanity check to see if we can correlate the f_id
        # a PDF which is too short to contain the ID line can not match either
        if len(splited) <= 14 or f_id != splited[14]:
            self.logger.error('[%s] ID does not match in PDF', f_id)
            return False
        self.logger.info('[%s] ID matches in PDF', f_id)

        # search some well-known words for later positional computation
        try:
            index_bemerkungen = splited.index('Bemerkungen')
            index_dispo = splited.index('Disponierte Einheiten')
            index_hinweis = splited.index('Hinweis')
        except ValueError:
            # list.index() raises ValueError if the word is missing
            self.logger.error('[%s] PDF file does not look like a Einsatzausdruck', f_id)
            return False

        try:
            data = {
                'auftrag': splited[14],
                'datum': splited[15],
                'zeit': splited[16],
                'melder': splited[18] + ' ' + splited[19],
                'erfasser': splited[20],
                # the bemerkungen field lives between the line which contains
                # 'Bemerkungen' and the line 'Disponierte Einheiten'
                'bemerkungen': self.concatenate_to_multiline_string(
                    splited,
                    index_bemerkungen + 1,
                    index_dispo - 1
                ).rstrip(),
                'einsatz': splited[index_dispo + 5],
                'sondersignal': splited[index_dispo + 6],
                'plzort': splited[index_dispo + 8].title(),
                'strasse': splited[index_dispo + 9].title(),
                'hinweis': splited[index_hinweis + 2]
            }
        except IndexError:
            # one of the fixed positions is not available in this PDF
            self.logger.error('[%s] PDF file does not look like a Einsatzausdruck', f_id)
            return False

        return data

    def extract_einsatzprotokoll(self, file, f_id):
        """ extracts as many information from the parsed Einsatzprotokoll as possible

        file is the path of the PDF, f_id the ID which is expected in the PDF.
        Returns a dict with the extracted fields. Returns False if the ID
        does not match or if the PDF is too short to contain all fields.
        """
        splited = self.convert(file).splitlines()

        # sanity check to see if we can correlate the f_id
        # a PDF which is too short to contain the ID line can not match either
        if len(splited) <= 26 or f_id != splited[26]:
            self.logger.error('[%s] ID does not match in PDF', f_id)
            return False
        self.logger.info('[%s] ID matches in PDF', f_id)

        try:
            data = {
                'auftrag': splited[26],
                'datum': splited[25],
                'angelegt': splited[28],
                'disposition': splited[30],
                'ausgerueckt': splited[32],
                'anort': splited[33],
            }
        except IndexError:
            # one of the fixed positions is not available in this PDF
            self.logger.error('[%s] PDF file does not look like a Einsatzprotokoll', f_id)
            return False

        return data