Complete rework of PDF parsing

This commit is contained in:
Tobias Brunner 2019-09-22 18:10:09 +02:00
parent b5c7d7b7b1
commit b0228afcfd
6 changed files with 175 additions and 235 deletions

View File

@ -74,7 +74,7 @@ class Lodur:
'%H:%M',
)
zh_am_schad = datetime.strptime(
pdf_data['anort'],
pdf_data['vorort'],
'%H:%M',
)
except ValueError as err:
@ -120,9 +120,9 @@ class Lodur:
'%H:%M',
)
eins_ereig = pdf_data['einsatz']
bemerkungen = pdf_data['bemerkungen']
bemerkungen = pdf_data['bemerkungen'] + '\n' + pdf_data['disponierteeinheiten']
wer_ala = pdf_data['melder']
adr = pdf_data['strasse'] + ', ' + pdf_data['ort']
adr = pdf_data['ort']
else:
date = datetime.now()
time = datetime.now()

View File

@ -1,209 +0,0 @@
#!/usr/bin/env python3
""" extracts data from ELZ PDFs """
import io
import logging
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
class PDFHandling:
    """Parse ELZ PDFs with pdfminer and pull out the known fields.

    The PDF is converted to plain text, split into lines, and the fields are
    then looked up by their position relative to well-known marker lines.
    """

    # pdfminer loggers that are limited to WARNING - more is not needed
    _NOISY_LOGGERS = (
        'pdfminer.pdfdocument',
        'pdfminer.pdfpage',
        'pdfminer.pdfinterp',
        'pdfminer.psparser',
        'pdfminer.cmapdb',
        'pdfminer.pdfparser',
    )

    # extract_einsatzprotokoll() reads lines up to index 33
    _PROTOKOLL_MIN_LINES = 34

    def __init__(self):
        """Set up the module logger and quieten the pdfminer loggers."""
        self.logger = logging.getLogger(__name__)
        for name in self._NOISY_LOGGERS:
            logging.getLogger(name).setLevel(logging.WARNING)

    def concatenate_to_multiline_string(self, data, start, end):
        """Join ``data[start]`` .. ``data[end]`` (inclusive) into one string.

        Every line, including the last one, is terminated with a newline.
        Returns an empty string when ``end < start``. Raises ``IndexError``
        when an index in the range is outside of ``data``.
        """
        return ''.join(data[index] + '\n' for index in range(start, end + 1))

    def convert(self, file):
        """Convert the PDF at path ``file`` to a multiline string.

        The input file, the converter and the text buffer are released even
        when pdfminer raises while processing a page.
        """
        manager = PDFResourceManager()
        output = io.StringIO()
        converter = TextConverter(manager, output, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(file, 'rb') as infile:
                # an empty set of page numbers is passed, as before
                pages = PDFPage.get_pages(
                    infile, set(), caching=True, check_extractable=True
                )
                for page in pages:
                    interpreter.process_page(page)
                converted_pdf = output.getvalue()
        finally:
            converter.close()
            output.close()
        return converted_pdf

    def extract_einsatzausdruck(self, file, f_id):
        """Extract as much information as possible from an Einsatzausdruck.

        Returns a dict with the keys ``auftrag``, ``datum``, ``zeit``,
        ``melder``, ``erfasser``, ``bemerkungen``, ``einsatz``,
        ``sondersignal``, ``ort``, ``strasse`` and ``hinweis``.

        Returns ``False`` when a marker line is missing, when 'Bemerkungen'
        sits on an unknown line, or when the ID found in the PDF does not
        equal ``f_id``.
        """
        converted = self.convert(file)
        splited = converted.splitlines()

        self.logger.debug('[%s] Parsed PDF raw:\n %s', f_id, converted)
        self.logger.debug('[%s] Line-splited PDF: %s', f_id, splited)

        # search some well-known words for later positional computation
        try:
            index_einsatzauftragfw = splited.index('Einsatzauftrag Feuerwehr')
            index_erfasser = splited.index('Erfasser')
            index_auftrag = splited.index('Auftrag')
            index_bemerkungen = splited.index('Bemerkungen')
            index_dispo = splited.index('Disponierte Einheiten')
            index_einsatz = splited.index('Einsatz')
            index_hinweis = splited.index('Hinweis')
            index_maps = splited.index('Google Maps')
        except ValueError as err:
            self.logger.error('[%s] PDF file does not look like a Einsatzausdruck: %s', f_id, err)
            return False

        # the PDF parsing not always produces the same output
        # let's define the already known output
        if index_bemerkungen == 6:
            self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)
            # length of the bemerkungen field
            # NOTE(review): in this layout the end marker used is the line
            # 'Auftrag', while the other layouts use 'Disponierte Einheiten'
            # - confirm this is intended
            length_bemerkungen = index_auftrag - index_bemerkungen - 1
            erfasser = splited[index_dispo - 2]
            auftrag = splited[index_erfasser + 2]
            datum = splited[index_erfasser + 3]
            zeit = splited[index_erfasser + 4]
            einsatz = splited[index_einsatz - 6]
            sondersignal = splited[index_einsatz - 5]
            ort = splited[index_einsatz - 3]
            strasse = splited[index_einsatz - 2]
            # sometimes there is just a phone number for the field melder but on
            # the second line, so the lines vary for erfasser and melder
            if index_dispo - index_erfasser == 10:
                melder = splited[index_dispo - 4] + ', ' + splited[index_dispo - 3]
            else:
                melder = splited[index_dispo - 4]
        # BMA style
        elif index_bemerkungen == 20:
            self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)
            length_bemerkungen = index_dispo - index_bemerkungen - 1
            erfasser = splited[index_bemerkungen - 2]
            auftrag = splited[index_einsatzauftragfw + 2]
            datum = splited[index_einsatzauftragfw + 3]
            zeit = splited[index_einsatzauftragfw + 4]
            einsatz = splited[index_einsatz + 6]
            sondersignal = splited[index_einsatz + 7]
            ort = splited[index_einsatz + 9]
            strasse = splited[index_einsatz + 10]
            melder = 'BMA'  # There is no melder on a BMA Einsatzausdruck
        elif index_bemerkungen in (21, 22):
            self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)
            length_bemerkungen = index_dispo - index_bemerkungen - 1
            erfasser = splited[index_bemerkungen - 2]
            auftrag = splited[index_erfasser + 2]
            datum = splited[index_erfasser + 3]
            zeit = splited[index_erfasser + 4]
            einsatz = splited[index_einsatz - 6]
            sondersignal = splited[index_einsatz - 5]
            ort = splited[index_einsatz - 3]
            strasse = splited[index_einsatz - 2]
            if index_bemerkungen - index_erfasser == 10:
                melder = splited[index_bemerkungen - 4] + ', ' + splited[index_bemerkungen - 3]
            else:
                melder = splited[index_bemerkungen - 4]
        elif index_bemerkungen == 24:
            self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)
            length_bemerkungen = index_dispo - index_bemerkungen - 1
            erfasser = splited[index_bemerkungen - 2]
            auftrag = splited[index_einsatzauftragfw + 4]
            datum = splited[index_einsatzauftragfw + 9]
            zeit = splited[index_einsatzauftragfw + 10]
            einsatz = splited[index_einsatz - 4]
            sondersignal = splited[index_einsatz - 3]
            ort = ''
            strasse = splited[index_einsatz - 2]
            melder = splited[index_dispo - 8] + ', ' + splited[index_dispo - 7]
        else:
            self.logger.error('[%s] Unknown location of Bemerkungen. Line %s', f_id, index_bemerkungen)
            return False

        # sanity check to see if we can correlate the f_id
        if f_id == auftrag:
            self.logger.info('[%s] ID matches in PDF', f_id)
        else:
            self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, auftrag)
            return False

        # try to find out if there is a hinweis
        # if yes, the difference between the indexes is 4, else it's shorter
        if index_maps - index_hinweis == 4:
            hinweis = splited[index_hinweis + 2]
        else:
            hinweis = ''

        bemerkungen = self.concatenate_to_multiline_string(
            splited,
            index_bemerkungen + 1,
            index_bemerkungen + length_bemerkungen,
        ).rstrip()

        return {
            'auftrag': auftrag,
            'datum': datum,
            'zeit': zeit,
            'melder': melder,
            'erfasser': erfasser,
            'bemerkungen': bemerkungen,
            'einsatz': einsatz,
            'sondersignal': sondersignal,
            'ort': ort.title(),
            'strasse': strasse.title(),
            'hinweis': hinweis,
        }

    def extract_einsatzprotokoll(self, file, f_id):
        """Extract as much information as possible from an Einsatzprotokoll.

        The fields are read from fixed line numbers of the converted text.
        Returns a dict with the keys ``auftrag``, ``datum``, ``angelegt``,
        ``disposition``, ``ausgerueckt`` and ``anort``.

        Returns ``False`` when the document has too few lines to contain all
        fields, or when the ID found in the PDF does not equal ``f_id``.
        """
        splited = self.convert(file).splitlines()

        # guard the fixed-position lookups below against short documents
        if len(splited) < self._PROTOKOLL_MIN_LINES:
            self.logger.error(
                '[%s] PDF file does not look like a Einsatzprotokoll: only %s lines',
                f_id, len(splited),
            )
            return False

        # sanity check to see if we can correlate the f_id
        if f_id == splited[26]:
            self.logger.info('[%s] ID matches in PDF', f_id)
        else:
            self.logger.error('[%s] ID does not match in PDF', f_id)
            return False

        return {
            'auftrag': splited[26],
            'datum': splited[25],
            'angelegt': splited[28],
            'disposition': splited[30],
            'ausgerueckt': splited[32],
            'anort': splited[33],
        }

140
library/pdftotext.py Normal file
View File

@ -0,0 +1,140 @@
#!/usr/bin/env python3
""" extracts data from ELZ PDFs using Poppler pdftotext """
import subprocess
import logging
class PDFParsing:
    """Extract fields from ELZ PDFs by cropping areas with Poppler pdftotext."""

    def __init__(self):
        """Set up the module logger."""
        self.logger = logging.getLogger(__name__)
        self.logger.info('PDF parsing based on pdftotext loaded')

    def extract(self, f_id, file, datafields):
        """Run pdftotext once per field and collect the cropped text.

        ``datafields`` maps a field name to a dict with the keys ``xMin``,
        ``yMin``, ``xMax`` and ``yMax`` describing the crop area on page 1.
        An optional ``'edit': 'title'`` entry title-cases the extracted text.

        Returns the dict of extracted fields when the ``auftrag`` field
        equals ``f_id``. Returns ``False`` when the IDs differ or when no
        ``auftrag`` field was extracted.
        """
        data = {}

        for field, coordinate in datafields.items():
            # x-coordinate of the crop area top left corner
            x = coordinate['xMin']
            # y-coordinate of the crop area top left corner
            y = coordinate['yMin']
            # width of crop area in pixels
            w = coordinate['xMax'] - coordinate['xMin']
            # height of crop area in pixels
            h = coordinate['yMax'] - coordinate['yMin']

            self.logger.debug(
                '[%s] Computed command for field %s: '
                'pdftotext -f 1 -l 1 -x %s -y %s -W %s -H %s',
                f_id, field, x, y, w, h,
            )

            # stderr is captured separately so that warnings and errors
            # printed by pdftotext do not end up inside the field value
            scrapeddata = subprocess.run(
                [
                    '/usr/bin/pdftotext',
                    '-f', '1',
                    '-l', '1',
                    '-x', str(x),
                    '-y', str(y),
                    '-W', str(w),
                    '-H', str(h),
                    file,
                    '-',
                ],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=False,
            )

            if scrapeddata.returncode != 0:
                self.logger.error(
                    '[%s] pdftotext failed for field %s with exit code %s: %s',
                    f_id, field, scrapeddata.returncode, scrapeddata.stderr.strip(),
                )
            elif scrapeddata.stderr.strip():
                self.logger.warning(
                    '[%s] pdftotext reported for field %s: %s',
                    f_id, field, scrapeddata.stderr.strip(),
                )

            ## TODO: fixup some fields (lowercase, remove unnecessary \n)
            value = scrapeddata.stdout.rstrip()
            if coordinate.get('edit') == 'title':
                value = value.title()
            data[field] = value

        # without an extracted ID the data cannot be correlated to f_id
        if 'auftrag' not in data:
            self.logger.error('[%s] No auftrag field extracted from PDF', f_id)
            return False

        # sanity check to see if we can correlate the f_id
        if f_id == data['auftrag']:
            self.logger.debug('[%s] ID matches in PDF', f_id)
            return data

        self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, data['auftrag'])
        return False

    def extract_einsatzausdruck(self, file, f_id):
        """Extract information from an Einsatzausdruck using external pdftotext.

        Returns whatever ``extract`` returns for the crop areas below.
        """
        self.logger.debug('[%s] Parsing PDF: %s', f_id, file)

        # Get them using 'pdftotext -bbox'
        # y = row
        # x = column: xMax 450 / 590 means full width
        coordinates = {
            'auftrag': {
                'xMin': 70, 'yMin': 47, 'xMax': 120, 'yMax': 58,
            },
            'datum': {
                'xMin': 190, 'yMin': 47, 'xMax': 239, 'yMax': 58,
            },
            'zeit': {
                'xMin': 190, 'yMin': 59, 'xMax': 215, 'yMax': 70,
            },
            'melder': {
                'xMin': 304, 'yMin': 47, 'xMax': 446, 'yMax': 70, 'edit': 'title',
            },
            'erfasser': {
                'xMin': 448, 'yMin': 59, 'xMax': 478, 'yMax': 70,
            },
            # big field until "Disponierte Einheiten"
            'bemerkungen': {
                'xMin': 28, 'yMin': 112, 'xMax': 590, 'yMax': 350,
            },
            'disponierteeinheiten': {
                'xMin': 28, 'yMin': 366, 'xMax': 450, 'yMax': 376,
            },
            'einsatz': {
                'xMin': 76, 'yMin': 690, 'xMax': 450, 'yMax': 703,
            },
            'sondersignal': {
                'xMin': 76, 'yMin': 707, 'xMax': 450, 'yMax': 721,
            },
            'ort': {
                'xMin': 76, 'yMin': 732, 'xMax': 590, 'yMax': 745,
            },
            'hinweis': {
                'xMin': 76, 'yMin': 773, 'xMax': 450, 'yMax': 787,
            },
        }

        return self.extract(f_id, file, coordinates)

    def extract_einsatzprotokoll(self, file, f_id):
        """Extract information from an Einsatzprotokoll using external pdftotext.

        Returns whatever ``extract`` returns for the crop areas below.
        """
        self.logger.debug('[%s] Parsing PDF: %s', f_id, file)

        # Get them using 'pdftotext -bbox'
        # y = row
        # x = column: xMax 450 / 590 means full width
        coordinates = {
            'auftrag': {
                'xMin': 192, 'yMin': 132, 'xMax': 238, 'yMax': 142,
            },
            'angelegt': {
                'xMin': 192, 'yMin': 294, 'xMax': 226, 'yMax': 304,
            },
            'dispo': {
                'xMin': 192, 'yMin': 312, 'xMax': 226, 'yMax': 322,
            },
            'ausgerueckt': {
                'xMin': 192, 'yMin': 331, 'xMax': 226, 'yMax': 341,
            },
            'vorort': {
                'xMin': 192, 'yMin': 348, 'xMax': 226, 'yMax': 358,
            },
        }

        return self.extract(f_id, file, coordinates)

View File

@ -14,7 +14,7 @@ from library.emailhandling import EmailHandling
from library.lodur import Lodur
from library.mqtt import MQTTClient
from library.gotify import GotifyClient
from library.pdf_extract import PDFHandling
from library.pdftotext import PDFParsing
from library.webdav import WebDav
# Configuration
@ -92,7 +92,7 @@ def main():
)
# Initialize PDF Parser
pdf = PDFHandling()
pdf = PDFParsing()
# Main Loop
while True:

View File

@ -1,21 +0,0 @@
import re
import logging
from pprint import pprint
from pathlib import Path
from library.pdf_extract import PDFHandling
# Ad-hoc check: run the Einsatzausdruck parser over every PDF below PATH
# and pretty-print what it extracted.
PATH = '/tmp/pylokid'

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

PDF = PDFHandling()

for path in Path(PATH).glob('**/*.pdf'):
    file = str(path)
    print(file)
    # the Einsatz ID (F followed by 8 digits) is taken from the file name
    id_match = re.search('.*(F[0-9]{8})_.*', file)
    if id_match is None:
        # not every PDF below PATH carries an ID in its name; skip those
        # instead of crashing on the missing match
        print('no Einsatz ID found in file name, skipping')
        continue
    f_id = id_match.group(1)
    print(f_id)
    pprint(PDF.extract_einsatzausdruck(file, f_id))

30
test_pdftotext.py Normal file
View File

@ -0,0 +1,30 @@
import re
import logging
from pprint import pprint
from pathlib import Path
from library.pdftotext import PDFParsing
# Ad-hoc check: run the pdftotext based parser over every Einsatzausdruck
# PDF below PATH and pretty-print what it extracted.
PATH = '/home/tobru/Documents/Feuerwehr/Stab/Fourier/Einsatzdepeschen/2019'

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

PDF = PDFParsing()

for path in Path(PATH).glob('**/Einsatzausdruck*.pdf'):
    file = str(path)
    print(file)
    # the Einsatz ID (F followed by 8 digits) is taken from the file name
    id_match = re.search('.*(F[0-9]{8})_.*', file)
    if id_match is None:
        # skip files without an ID in their name instead of crashing on
        # the missing match
        print('no Einsatz ID found in file name, skipping')
        continue
    f_id = id_match.group(1)
    print(f_id)
    pprint(PDF.extract_einsatzausdruck(file, f_id))

# The same run for Einsatzprotokoll PDFs is currently disabled:
#
# for path in Path(PATH).glob('**/Einsatzprotokoll*.pdf'):
#     file = str(path)
#     print(file)
#     f_id = re.search('.*(F[0-9]{8})_.*', file).group(1)
#     print(f_id)
#     pprint(PDF.extract_einsatzprotokoll(file, f_id))