much improved pdf parsing

This commit is contained in:
Tobias Brunner 2018-01-04 21:26:38 +01:00
parent 69e77c491a
commit 152aa3d4b6
4 changed files with 85 additions and 25 deletions

View File

@ -96,7 +96,7 @@ class Lodur:
eins_ereig = pdf_data['einsatz']
bemerkungen = pdf_data['bemerkungen']
wer_ala = pdf_data['melder']
adr = pdf_data['strasse'] + ', ' + pdf_data['plzort']
adr = pdf_data['strasse'] + ', ' + pdf_data['ort']
else:
date = datetime.now()
time = datetime.now()

View File

@ -42,7 +42,7 @@ class MQTTClient:
#self.mqtt_client.publish(topic + 'sondersignal', pdf_data['sondersignal'])
#self.mqtt_client.publish(
# topic + 'adresse',
# pdf_data['strasse'] + ', ' + pdf_data['plzort']
# pdf_data['strasse'] + ', ' + pdf_data['ort']
#)
#self.mqtt_client.publish(topic + 'hinweis', pdf_data['hinweis'])
#self.mqtt_client.publish(topic + 'bemerkungen', pdf_data['bemerkungen'])

View File

@ -22,6 +22,12 @@ class PDFHandling:
logger_page.setLevel(logging.WARNING)
logger_interp = logging.getLogger('pdfminer.pdfinterp')
logger_interp.setLevel(logging.WARNING)
logger_psparser = logging.getLogger('pdfminer.psparser')
logger_psparser.setLevel(logging.WARNING)
logger_cmapdb = logging.getLogger('pdfminer.cmapdb')
logger_cmapdb.setLevel(logging.WARNING)
logger_pdfparser = logging.getLogger('pdfminer.pdfparser')
logger_pdfparser.setLevel(logging.WARNING)
def concatenate_to_multiline_string(self, data, start, end):
""" concatenates multiple lines to a single multiline string """
@ -60,48 +66,83 @@ class PDFHandling:
def extract_einsatzausdruck(self, file, f_id):
    """Extract as much information as possible from a parsed Einsatzausdruck.

    The PDF is converted to text with ``self.convert`` and split into
    lines. Fields are then read at offsets relative to a few well-known
    label lines ('Erfasser', 'Auftrag', 'Bemerkungen', ...), because the
    text extraction does not always emit the lines in the same order.

    Args:
        file: The PDF to parse; handed unchanged to ``self.convert``
            (presumably a file path -- confirm against ``convert``).
        f_id: The expected Auftrag ID. It is compared with the ID found
            two lines after the 'Erfasser' label.

    Returns:
        A dict with the keys 'auftrag', 'datum', 'zeit', 'melder',
        'erfasser', 'bemerkungen', 'einsatz', 'sondersignal', 'ort',
        'strasse' and 'hinweis'. ``False`` is returned when one of the
        label lines is missing, when the position of 'Bemerkungen' is
        not one of the known layouts, or when the ID does not match.

    Note:
        The positional lookups are not range checked, so a truncated
        document can still raise ``IndexError``.
    """
    converted = self.convert(file)
    splited = converted.splitlines()

    self.logger.debug('[%s] Parsed PDF raw: %s', f_id, converted)

    # Search some well-known words for later positional computation.
    # list.index() raises ValueError (not IndexError) for a missing value.
    try:
        index_erfasser = splited.index('Erfasser')
        index_auftrag = splited.index('Auftrag')
        index_bemerkungen = splited.index('Bemerkungen')
        index_dispo = splited.index('Disponierte Einheiten')
        index_einsatz = splited.index('Einsatz')
        index_hinweis = splited.index('Hinweis')
        index_maps = splited.index('Google Maps')
    except ValueError:
        self.logger.error('[%s] PDF file does not look like a Einsatzausdruck', f_id)
        return False

    # The PDF parsing does not always produce the same output, so only
    # the layouts seen so far are handled.
    if index_bemerkungen == 6:
        # In this layout the Bemerkungen field lives between the line
        # 'Bemerkungen' and the line 'Auftrag'.
        length_bemerkungen = index_auftrag - index_bemerkungen - 1
        erfasser = splited[index_dispo - 2]
        # Sometimes the Melder field has a second line holding just a
        # phone number, which shifts the distance between the labels.
        if index_dispo - index_erfasser == 10:
            melder = splited[index_dispo - 4] + ', ' + splited[index_dispo - 3]
        else:
            melder = splited[index_dispo - 4]
    elif index_bemerkungen in (21, 22):
        # In this layout the Bemerkungen field lives between the line
        # 'Bemerkungen' and the line 'Disponierte Einheiten'.
        length_bemerkungen = index_dispo - index_bemerkungen - 1
        erfasser = splited[index_bemerkungen - 2]
        if index_bemerkungen - index_erfasser == 10:
            melder = splited[index_bemerkungen - 4] + ', ' + splited[index_bemerkungen - 3]
        else:
            melder = splited[index_bemerkungen - 4]
    else:
        self.logger.error('[%s] Unknown parser output', f_id)
        return False

    # Sanity check to see if we can correlate the f_id.
    auftrag = splited[index_erfasser + 2]
    if f_id == auftrag:
        self.logger.info('[%s] ID matches in PDF', f_id)
    else:
        self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, auftrag)
        return False

    # Try to find out if there is a Hinweis: if yes, 'Google Maps'
    # follows exactly 4 lines after 'Hinweis', otherwise it is closer.
    if index_maps - index_hinweis == 4:
        hinweis = splited[index_hinweis + 2]
    else:
        hinweis = ''

    data = {
        'auftrag': auftrag,
        'datum': splited[index_erfasser + 3],
        'zeit': splited[index_erfasser + 4],
        'melder': melder,
        'erfasser': erfasser,
        'bemerkungen': self.concatenate_to_multiline_string(
            splited,
            index_bemerkungen + 1,
            index_bemerkungen + length_bemerkungen
        ).rstrip(),
        'einsatz': splited[index_einsatz - 6],
        'sondersignal': splited[index_einsatz - 5],
        'ort': splited[index_einsatz - 3].title(),
        'strasse': splited[index_einsatz - 2].title(),
        #'objekt': splited[],
        'hinweis': hinweis,
    }
    return data

19
test_pdf_parsing.py Normal file
View File

@ -0,0 +1,19 @@
import re
import logging
from pprint import pprint
from pathlib import Path
from library.pdf_extract import PDFHandling
# Directory that is searched recursively for Einsatzausdruck PDFs.
PATH = '/tmp/pylokid'

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

PDF = PDFHandling()

# Parse every PDF below PATH and pretty-print the extracted data.
for path in Path(PATH).glob('**/*.pdf'):
    file = str(path)
    # The F-ID (an 'F' followed by 8 digits and an underscore) is taken
    # from the file path; re.search() returns None when it is absent.
    match = re.search(r'.*(F[0-9]{8})_.*', file)
    if match is None:
        logging.warning('Skipping %s: no F-ID found in file name', file)
        continue
    pprint(PDF.extract_einsatzausdruck(file, match.group(1)))