much improved pdf parsing
This commit is contained in:
parent
69e77c491a
commit
152aa3d4b6
|
@ -96,7 +96,7 @@ class Lodur:
|
|||
eins_ereig = pdf_data['einsatz']
|
||||
bemerkungen = pdf_data['bemerkungen']
|
||||
wer_ala = pdf_data['melder']
|
||||
adr = pdf_data['strasse'] + ', ' + pdf_data['plzort']
|
||||
adr = pdf_data['strasse'] + ', ' + pdf_data['ort']
|
||||
else:
|
||||
date = datetime.now()
|
||||
time = datetime.now()
|
||||
|
|
|
@ -42,7 +42,7 @@ class MQTTClient:
|
|||
#self.mqtt_client.publish(topic + 'sondersignal', pdf_data['sondersignal'])
|
||||
#self.mqtt_client.publish(
|
||||
# topic + 'adresse',
|
||||
# pdf_data['strasse'] + ', ' + pdf_data['plzort']
|
||||
# pdf_data['strasse'] + ', ' + pdf_data['ort']
|
||||
#)
|
||||
#self.mqtt_client.publish(topic + 'hinweis', pdf_data['hinweis'])
|
||||
#self.mqtt_client.publish(topic + 'bemerkungen', pdf_data['bemerkungen'])
|
||||
|
|
|
@ -22,6 +22,12 @@ class PDFHandling:
|
|||
logger_page.setLevel(logging.WARNING)
|
||||
logger_interp = logging.getLogger('pdfminer.pdfinterp')
|
||||
logger_interp.setLevel(logging.WARNING)
|
||||
logger_psparser = logging.getLogger('pdfminer.psparser')
|
||||
logger_psparser.setLevel(logging.WARNING)
|
||||
logger_cmapdb = logging.getLogger('pdfminer.cmapdb')
|
||||
logger_cmapdb.setLevel(logging.WARNING)
|
||||
logger_pdfparser = logging.getLogger('pdfminer.pdfparser')
|
||||
logger_pdfparser.setLevel(logging.WARNING)
|
||||
|
||||
def concatenate_to_multiline_string(self, data, start, end):
|
||||
""" concatenates multiple lines to a single multiline string """
|
||||
|
@ -60,48 +66,83 @@ class PDFHandling:
|
|||
def extract_einsatzausdruck(self, file, f_id):
    """Extract as much information as possible from a parsed Einsatzausdruck.

    The PDF is converted to text with ``self.convert`` and split into lines.
    Every field is then read at a fixed offset from the position of a
    well-known marker line ('Erfasser', 'Bemerkungen', 'Einsatz', ...).

    Args:
        file: the PDF, handed unchanged to ``self.convert``.
        f_id: the expected order ID; it has to equal the line found two
            lines after the 'Erfasser' marker.

    Returns:
        A dict with the keys 'auftrag', 'datum', 'zeit', 'melder',
        'erfasser', 'bemerkungen', 'einsatz', 'sondersignal', 'ort',
        'strasse' and 'hinweis'. ``False`` is returned when a marker line
        is missing, when the line layout is not one of the known ones or
        when the ID found in the PDF differs from ``f_id``.
    """

    converted = self.convert(file)
    splited = converted.splitlines()

    self.logger.debug('[%s] Parsed PDF raw: %s', f_id, converted)

    # search some well-known words for later positional computation
    try:
        index_erfasser = splited.index('Erfasser')
        index_auftrag = splited.index('Auftrag')
        index_bemerkungen = splited.index('Bemerkungen')
        index_dispo = splited.index('Disponierte Einheiten')
        index_einsatz = splited.index('Einsatz')
        index_hinweis = splited.index('Hinweis')
        index_maps = splited.index('Google Maps')
    except ValueError:
        # list.index() signals a missing item with ValueError, not IndexError
        self.logger.error('[%s] PDF file does not look like a Einsatzausdruck', f_id)
        return False

    # the PDF parsing does not always produce the same output,
    # so only the layouts seen so far are handled
    if index_bemerkungen == 6:
        # in this layout the bemerkungen field lives between the line
        # 'Bemerkungen' and the line 'Auftrag'
        length_bemerkungen = index_auftrag - index_bemerkungen - 1
        erfasser = splited[index_dispo - 2]
        # sometimes there is just a phone number for the field melder but on
        # the second line, so the lines vary for erfasser and melder
        if index_dispo - index_erfasser == 10:
            melder = splited[index_dispo - 4] + ', ' + splited[index_dispo - 3]
        else:
            melder = splited[index_dispo - 4]
    elif index_bemerkungen in (21, 22):
        # here the bemerkungen field lives between the line 'Bemerkungen'
        # and the line 'Disponierte Einheiten'
        length_bemerkungen = index_dispo - index_bemerkungen - 1
        erfasser = splited[index_bemerkungen - 2]
        if index_bemerkungen - index_erfasser == 10:
            melder = splited[index_bemerkungen - 4] + ', ' + splited[index_bemerkungen - 3]
        else:
            melder = splited[index_bemerkungen - 4]
    else:
        self.logger.error('[%s] Unknown parser output', f_id)
        return False

    # sanity check to see if we can correlate the f_id
    auftrag = splited[index_erfasser + 2]
    if f_id == auftrag:
        self.logger.info('[%s] ID matches in PDF', f_id)
    else:
        self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, auftrag)
        return False

    # try to find out if there is a hinweis
    # if yes, the difference between the indexes is 4, else it's shorter
    if index_maps - index_hinweis == 4:
        hinweis = splited[index_hinweis + 2]
    else:
        hinweis = ''

    data = {
        'auftrag': auftrag,
        'datum': splited[index_erfasser + 3],
        'zeit': splited[index_erfasser + 4],
        'melder': melder,
        'erfasser': erfasser,
        'bemerkungen': self.concatenate_to_multiline_string(
            splited,
            index_bemerkungen + 1,
            index_bemerkungen + length_bemerkungen
        ).rstrip(),
        'einsatz': splited[index_einsatz - 6],
        'sondersignal': splited[index_einsatz - 5],
        'ort': splited[index_einsatz - 3].title(),
        'strasse': splited[index_einsatz - 2].title(),
        # 'objekt' is not extracted yet
        'hinweis': hinweis,
    }
    return data
|
||||
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
import re
|
||||
import logging
|
||||
from pprint import pprint
|
||||
from pathlib import Path
|
||||
from library.pdf_extract import PDFHandling
|
||||
|
||||
# Directory that is searched recursively for PDF files to parse.
PATH = '/tmp/pylokid'

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

PDF = PDFHandling()

# Run the parser over every PDF below PATH and pretty-print the result.
for path in Path(PATH).glob('**/*.pdf'):
    file = str(path)
    # The ID is the 'F' followed by eight digits right before an underscore.
    match = re.search('.*(F[0-9]{8})_.*', file)
    if match is None:
        # re.search() returns None when nothing matches; skip this file
        # instead of aborting the whole run with an AttributeError.
        logging.warning('Skipping %s: no F-ID found in file name', file)
        continue
    f_id = match.group(1)
    pprint(PDF.extract_einsatzausdruck(file, f_id))
|
Loading…
Reference in New Issue