improve PDF parsing

This commit is contained in:
Tobias Brunner 2017-12-30 17:01:13 +01:00
parent 9ff811f1ba
commit 625b8c7da3
3 changed files with 125 additions and 31 deletions

View File

@ -24,24 +24,41 @@ def create_einsatzrapport(username, password, base_url, f_id, pdf_data):
('is_herznotfall', ''),
)
# when PDF parsing fails, pdf_data is false. fill with tbd when this happens
if pdf_data:
date = datetime.strptime(
pdf_data['datum'],
'%d.%m.%Y',
)
time = datetime.strptime(
pdf_data['zeit'],
'%H:%M',
)
eins_ereig = pdf_data['einsatz']
adr = pdf_data['strasse'] + ', ' + pdf_data['plzort']
else:
date = datetime.now()
eins_ereig = 'TBD'
adr = 'TBD'
data = {
'e_r_num': (None, f_id), # 01. Einsatzrapportnummer
'eins_stat_kantone': (None, '1'), # 02. Einsatzart FKS
'emergency_concept_id': (None, '2'), # 03. Verrechnungsart
'ver_sart': (None, 'ab'), # 03. Verrechnungsart internal: ab, th, uh, ak, tt
'dtv_d': (None, str(datetime.now().day)), # 04. Datum von
'dtv_m': (None, str(datetime.now().month)), # 04. Datum von
'dtv_y': (None, str(datetime.now().year)), # 04. Datum von
'dtb_d': (None, str(datetime.now().day)), # 04. Datum bis
'dtb_m': (None, str(datetime.now().month)), # 04. Datum bis
'dtb_y': (None, str(datetime.now().year)), # 04. Datum bis
'ztv_h': (None, '11'), # 05. Zeit von
'ztv_m': (None, '11'), # 05. Zeit von
'ztb_h': (None, '12'), # 05. Zeit bis
'ztb_m': (None, '12'), # 05. Zeit bis
'dtv_d': (None, str(date.day)), # 04. Datum von
'dtv_m': (None, str(date.month)), # 04. Datum von
'dtv_y': (None, str(date.year)), # 04. Datum von
'dtb_d': (None, str(date.day)), # 04. Datum bis - we dont know yet the end date
'dtb_m': (None, str(date.month)), # 04. Datum bis - assume the same day
'dtb_y': (None, str(date.year)), # 04. Datum bis
'ztv_h': (None, str(time.hour)), # 05. Zeit von
'ztv_m': (None, str(time.minute)), # 05. Zeit von
'ztb_h': (None, str(time.hour + 1)), # 05. Zeit bis - we dont know yet the end time
'ztb_m': (None, str(time.minute)), # 05. Zeit bis - just add one hour and correct later
'e_ort_1': (None, '306'), # 06. Einsatzort: Urdorf 306, Birmensdorf 298
'eins_ereig': (None, pdf_data['einsatz']), # 07. Ereignis # TODO utf-8?
'adr': (None, 'TBD'), # 08. Adresse
'eins_ereig': (None, eins_ereig.encode('iso-8859-1')), # 07. Ereignis
'adr': (None, adr.encode('iso-8859-1')), # 08. Adresse
#'zh_alarmierung_h': (None, 'UNKNOWN'), # 12. Alarmierung
#'zh_alarmierung_m': (None, 'UNKNOWN'), # 12. Alarmierung
#'zh_fw_ausg_h': (None, 'UNKNOWN'), # 13. FW ausgerückt
@ -57,7 +74,7 @@ def create_einsatzrapport(username, password, base_url, f_id, pdf_data):
'bk': (None, 'TBD3'), # 20. Bemerkungen
'en_kr_feuwehr': (None, '1'), # 21. Einsatzkräfte
'ali_io': (None, '1'), # 24. Alarmierung
'kopie_gvz': (None, '1'), # 31. Kopie innert 10 Tagen an
'kopie_gvz': (None, '1'), # 31. Kopie innert 10 Tagen an GVZ
'mannschaftd_einsa': (None, '70'), # 32. Einsatzleiter|in
}
@ -96,3 +113,33 @@ def upload_alarmdepesche(username, password, base_url, lodur_id, file_name, file
params=params,
files=data,
)
# TODO this doesn't work yet. We first have to fetch the current form with its
# data, update the fields we want to change and resubmit the form
def update_einsatzrapport(username, password, base_url, lodur_id, data):
    """Update an existing Einsatzrapport in Lodur.

    Logs in with the given credentials, then posts ``data`` to the edit form
    of the event ``lodur_id``. The response headers are printed for
    debugging; nothing is returned.

    Args:
        username: Lodur login name.
        password: Lodur login password.
        base_url: URL of the Lodur endpoint, used for both the login and the
            form submission.
        lodur_id: value sent as the ``event`` query parameter.
        data: form fields, handed to ``requests`` as ``files=`` -- presumably
            a dict of ``name: (None, value)`` tuples; verify against caller.
    """
    login_data = {
        'login_member_name': username,
        'login_member_pwd': password,
    }

    params = (
        ('modul', '36'),
        ('what', '144'),
        ('sp', '1'),
        ('event', lodur_id),
        ('edit', '1'),
        ('is_herznotfall', ''),
    )

    # The context manager closes the session (and its pooled connections)
    # even when one of the requests raises.
    with requests.session() as session:
        # Authenticate
        session.post(base_url, data=login_data)

        # Submit to the configured endpoint -- the same one the login went
        # to -- instead of a hard-coded installation URL.
        answer = session.post(
            base_url,
            params=params,
            files=data,
        )

    print(answer.headers)

17
main.py
View File

@ -15,7 +15,7 @@ import imaplib
import aioeasywebdav
from dotenv import load_dotenv, find_dotenv
import paho.mqtt.client as mqtt
from lodur_connect import create_einsatzrapport, upload_alarmdepesche
from lodur_connect import create_einsatzrapport, upload_alarmdepesche, update_einsatzrapport
import pdf_extract
_EMAIL_SUBJECTS = '(OR SUBJECT "Einsatzausdruck_FW" SUBJECT "Einsatzprotokoll" UNSEEN)'
@ -235,10 +235,14 @@ def main():
)
else:
# this is real - publish Einsatz on MQTT
# TODO publish more information about the einsatz - coming from the PDF
mqtt_client.publish('pylokid/' + f_type, f_id)
# get as much information from the PDF as possible
pdf_data = pdf_extract.get_einsatzausdruck(os.path.join(tmp_dir, file_name))
pdf_data = pdf_extract.extract_einsatzausdruck(
os.path.join(tmp_dir, file_name),
f_id,
)
# create new Einsatzrapport in Lodur
logger.info('Creating Einsatzrapport in Lodur for ' + f_id)
@ -279,6 +283,15 @@ def main():
file_name,
os.path.join(tmp_dir, file_name),
)
pdf_data = pdf_extract.extract_einsatzprotokoll(
os.path.join(tmp_dir, file_name),
f_id,
)
# only update when parsing was successful
if pdf_data:
logger.info('Updating Einsatzrapport with data from PDF - not yet implemented')
else:
logger.info('Updating Einsatzrapport not possible - PDF parsing failed')
else:
logger.error('Cannot process Einsatzprotokoll as there is no Lodur ID')
else:

View File

@ -1,12 +1,16 @@
#!/usr/bin/env python3
""" extracts data from ELZ PDFs """
import io
import logging
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter,TextConverter,XMLConverter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def concatenate_to_multiline_string(data, start, end):
""" concatenates multiple lines to a single multiline string """
res = ''
counter = start
while counter <= end:
@ -15,6 +19,7 @@ def concatenate_to_multiline_string(data, start, end):
return res
def convert(file):
""" converts the PDF to a multiline string """
pagenums = set()
manager = PDFResourceManager()
codec = 'utf-8'
@ -29,41 +34,70 @@ def convert(file):
for page in PDFPage.get_pages(infile, pagenums, caching=caching, check_extractable=True):
interpreter.process_page(page)
convertedPDF = output.getvalue()
converted_pdf = output.getvalue()
infile.close()
converter.close()
output.close()
return convertedPDF
return converted_pdf
def extract_einsatzausdruck(file, f_id):
    """Extract as much information as possible from a parsed Einsatzausdruck.

    The PDF is converted to text and split into lines. Fields are read from
    fixed line positions and from positions relative to the headings
    'Bemerkungen', 'Disponierte Einheiten' and 'Hinweis'.

    Args:
        file: PDF to parse; passed unchanged to ``convert()``.
        f_id: identifier expected on line 14 of the converted text. Used as
            a sanity check that the PDF belongs to this Einsatz.

    Returns:
        A dict with the keys 'auftrag', 'datum', 'zeit', 'melder',
        'erfasser', 'bemerkungen', 'einsatz', 'plzort', 'strasse' and
        'hinweis', or ``False`` when the document cannot be parsed.
    """
    splited = convert(file).splitlines()

    # Sanity check to see if we can correlate the f_id. A document with
    # fewer than 15 lines cannot match and must not raise an IndexError.
    if len(splited) <= 14 or f_id != splited[14]:
        logging.error('PDF parsing: f_id doesn\'t match line 14')
        return False
    logging.info('PDF parsing: f_id matches line 14')

    try:
        # search some well-known words for later positional computation
        index_bemerkungen = splited.index('Bemerkungen')
        index_dispo = splited.index('Disponierte Einheiten')
        index_hinweis = splited.index('Hinweis')
    except ValueError:
        # list.index() raises ValueError when a heading is missing
        logging.error('PDF file doesn\'t look like a Einsatzausdruck')
        return False

    # get length of bemerkungen field
    # it lives between the line which contains 'Bemerkungen' and
    # the line 'Disponierte Einheiten'
    length_bemerkungen = index_dispo - index_bemerkungen - 1

    try:
        data = {
            'auftrag': splited[14],
            'datum': splited[15],
            'zeit': splited[16],
            'melder': concatenate_to_multiline_string(splited, 18, 19),
            'erfasser': splited[20],
            'bemerkungen': concatenate_to_multiline_string(
                splited,
                index_bemerkungen,
                index_bemerkungen + length_bemerkungen
            ),
            'einsatz': splited[index_dispo+5],
            'plzort': splited[index_dispo+8].title(),
            'strasse': splited[index_dispo+9].title(),
            #'objekt': splited[],
            'hinweis': splited[index_hinweis+2]
        }
    except IndexError:
        # The headings were found, but the document ends before one of the
        # heading-relative lines; treat it like any other parsing failure.
        logging.error('PDF parsing: Einsatzausdruck is shorter than expected')
        return False

    return data
def get_einsatzprotokoll(file):
def extract_einsatzprotokoll(file, f_id):
""" extracts as many information from the parsed Einsatzprotokoll as possible """
splited = convert(file).splitlines()
# sanity check to see if we can correlate the f_id
if f_id == splited[26]:
logging.info('PDF parsing: f_id matches line 26')
else:
logging.error('PDF parsing: f_id doesn\'t match line 26')
return False
data = {
'auftrag': splited[26],
'datum': splited[25],