improve PDF parsing
This commit is contained in:
parent
9ff811f1ba
commit
625b8c7da3
|
@ -24,24 +24,41 @@ def create_einsatzrapport(username, password, base_url, f_id, pdf_data):
|
|||
('is_herznotfall', ''),
|
||||
)
|
||||
|
||||
# when PDF parsing fails, pdf_data is false. fill with tbd when this happens
|
||||
if pdf_data:
|
||||
date = datetime.strptime(
|
||||
pdf_data['datum'],
|
||||
'%d.%m.%Y',
|
||||
)
|
||||
time = datetime.strptime(
|
||||
pdf_data['zeit'],
|
||||
'%H:%M',
|
||||
)
|
||||
eins_ereig = pdf_data['einsatz']
|
||||
adr = pdf_data['strasse'] + ', ' + pdf_data['plzort']
|
||||
else:
|
||||
date = datetime.now()
|
||||
eins_ereig = 'TBD'
|
||||
adr = 'TBD'
|
||||
|
||||
data = {
|
||||
'e_r_num': (None, f_id), # 01. Einsatzrapportnummer
|
||||
'eins_stat_kantone': (None, '1'), # 02. Einsatzart FKS
|
||||
'emergency_concept_id': (None, '2'), # 03. Verrechnungsart
|
||||
'ver_sart': (None, 'ab'), # 03. Verrechnungsart internal: ab, th, uh, ak, tt
|
||||
'dtv_d': (None, str(datetime.now().day)), # 04. Datum von
|
||||
'dtv_m': (None, str(datetime.now().month)), # 04. Datum von
|
||||
'dtv_y': (None, str(datetime.now().year)), # 04. Datum von
|
||||
'dtb_d': (None, str(datetime.now().day)), # 04. Datum bis
|
||||
'dtb_m': (None, str(datetime.now().month)), # 04. Datum bis
|
||||
'dtb_y': (None, str(datetime.now().year)), # 04. Datum bis
|
||||
'ztv_h': (None, '11'), # 05. Zeit von
|
||||
'ztv_m': (None, '11'), # 05. Zeit von
|
||||
'ztb_h': (None, '12'), # 05. Zeit bis
|
||||
'ztb_m': (None, '12'), # 05. Zeit bis
|
||||
'dtv_d': (None, str(date.day)), # 04. Datum von
|
||||
'dtv_m': (None, str(date.month)), # 04. Datum von
|
||||
'dtv_y': (None, str(date.year)), # 04. Datum von
|
||||
'dtb_d': (None, str(date.day)), # 04. Datum bis - we dont know yet the end date
|
||||
'dtb_m': (None, str(date.month)), # 04. Datum bis - assume the same day
|
||||
'dtb_y': (None, str(date.year)), # 04. Datum bis
|
||||
'ztv_h': (None, str(time.hour)), # 05. Zeit von
|
||||
'ztv_m': (None, str(time.minute)), # 05. Zeit von
|
||||
'ztb_h': (None, str(time.hour + 1)), # 05. Zeit bis - we dont know yet the end time
|
||||
'ztb_m': (None, str(time.minute)), # 05. Zeit bis - just add one hour and correct later
|
||||
'e_ort_1': (None, '306'), # 06. Einsatzort: Urdorf 306, Birmensdorf 298
|
||||
'eins_ereig': (None, pdf_data['einsatz']), # 07. Ereignis # TODO utf-8?
|
||||
'adr': (None, 'TBD'), # 08. Adresse
|
||||
'eins_ereig': (None, eins_ereig.encode('iso-8859-1')), # 07. Ereignis
|
||||
'adr': (None, adr.encode('iso-8859-1')), # 08. Adresse
|
||||
#'zh_alarmierung_h': (None, 'UNKNOWN'), # 12. Alarmierung
|
||||
#'zh_alarmierung_m': (None, 'UNKNOWN'), # 12. Alarmierung
|
||||
#'zh_fw_ausg_h': (None, 'UNKNOWN'), # 13. FW ausgerückt
|
||||
|
@ -57,7 +74,7 @@ def create_einsatzrapport(username, password, base_url, f_id, pdf_data):
|
|||
'bk': (None, 'TBD3'), # 20. Bemerkungen
|
||||
'en_kr_feuwehr': (None, '1'), # 21. Einsatzkräfte
|
||||
'ali_io': (None, '1'), # 24. Alarmierung
|
||||
'kopie_gvz': (None, '1'), # 31. Kopie innert 10 Tagen an
|
||||
'kopie_gvz': (None, '1'), # 31. Kopie innert 10 Tagen an GVZ
|
||||
'mannschaftd_einsa': (None, '70'), # 32. Einsatzleiter|in
|
||||
}
|
||||
|
||||
|
@ -96,3 +113,33 @@ def upload_alarmdepesche(username, password, base_url, lodur_id, file_name, file
|
|||
params=params,
|
||||
files=data,
|
||||
)
|
||||
|
||||
# TODO this doesn't work yet. We first have to fetch the current form with its
|
||||
# data, update the fields we want to change and resubmit the form
|
||||
def update_einsatzrapport(username, password, base_url, lodur_id, data):
    """ Update the Einsatzrapport

    Logs in by posting the credentials to base_url, then posts data (passed
    through the files argument) to the edit form of the event lodur_id and
    prints the headers of the response. Nothing is returned.

    NOTE: the form URL is hard-coded and not derived from base_url.
    """

    lodur_session = requests.session()

    # Authenticate - the session keeps whatever cookies the login sets
    lodur_session.post(
        base_url,
        data={
            'login_member_name': username,
            'login_member_pwd': password,
        },
    )

    # query string selecting the Einsatzrapport edit form of this event
    query = (
        ('modul', '36'),
        ('what', '144'),
        ('sp', '1'),
        ('event', lodur_id),
        ('edit', '1'),
        ('is_herznotfall', ''),
    )

    response = lodur_session.post(
        'https://lodur-zh.ch/urdorf/index.php',
        params=query,
        files=data,
    )
    print(response.headers)
|
||||
|
|
17
main.py
17
main.py
|
@ -15,7 +15,7 @@ import imaplib
|
|||
import aioeasywebdav
|
||||
from dotenv import load_dotenv, find_dotenv
|
||||
import paho.mqtt.client as mqtt
|
||||
from lodur_connect import create_einsatzrapport, upload_alarmdepesche
|
||||
from lodur_connect import create_einsatzrapport, upload_alarmdepesche, update_einsatzrapport
|
||||
import pdf_extract
|
||||
|
||||
_EMAIL_SUBJECTS = '(OR SUBJECT "Einsatzausdruck_FW" SUBJECT "Einsatzprotokoll" UNSEEN)'
|
||||
|
@ -235,10 +235,14 @@ def main():
|
|||
)
|
||||
else:
|
||||
# this is real - publish Einsatz on MQTT
|
||||
# TODO publish more information about the einsatz - coming from the PDF
|
||||
mqtt_client.publish('pylokid/' + f_type, f_id)
|
||||
|
||||
# get as much information from the PDF as possible
|
||||
pdf_data = pdf_extract.get_einsatzausdruck(os.path.join(tmp_dir, file_name))
|
||||
pdf_data = pdf_extract.extract_einsatzausdruck(
|
||||
os.path.join(tmp_dir, file_name),
|
||||
f_id,
|
||||
)
|
||||
|
||||
# create new Einsatzrapport in Lodur
|
||||
logger.info('Creating Einsatzrapport in Lodur for ' + f_id)
|
||||
|
@ -279,6 +283,15 @@ def main():
|
|||
file_name,
|
||||
os.path.join(tmp_dir, file_name),
|
||||
)
|
||||
pdf_data = pdf_extract.extract_einsatzprotokoll(
|
||||
os.path.join(tmp_dir, file_name),
|
||||
f_id,
|
||||
)
|
||||
# only update when parsing was successful
|
||||
if pdf_data:
|
||||
logger.info('Updating Einsatzrapport with data from PDF - not yet implemented')
|
||||
else:
|
||||
logger.info('Updating Einsatzrapport not possible - PDF parsing failed')
|
||||
else:
|
||||
logger.error('Cannot process Einsatzprotokoll as there is no Lodur ID')
|
||||
else:
|
||||
|
|
|
@ -1,12 +1,16 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
""" extracts data from ELZ PDFs """
|
||||
|
||||
import io
|
||||
import logging
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfminer.converter import HTMLConverter,TextConverter,XMLConverter
|
||||
from pdfminer.converter import TextConverter
|
||||
from pdfminer.layout import LAParams
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
|
||||
def concatenate_to_multiline_string(data, start, end):
|
||||
""" concatenates multiple lines to a single multiline string """
|
||||
res = ''
|
||||
counter = start
|
||||
while counter <= end:
|
||||
|
@ -15,6 +19,7 @@ def concatenate_to_multiline_string(data, start, end):
|
|||
return res
|
||||
|
||||
def convert(file):
|
||||
""" converts the PDF to a multiline string """
|
||||
pagenums = set()
|
||||
manager = PDFResourceManager()
|
||||
codec = 'utf-8'
|
||||
|
@ -29,41 +34,70 @@ def convert(file):
|
|||
for page in PDFPage.get_pages(infile, pagenums, caching=caching, check_extractable=True):
|
||||
interpreter.process_page(page)
|
||||
|
||||
convertedPDF = output.getvalue()
|
||||
converted_pdf = output.getvalue()
|
||||
|
||||
infile.close()
|
||||
converter.close()
|
||||
output.close()
|
||||
return convertedPDF
|
||||
return converted_pdf
|
||||
|
||||
def get_einsatzausdruck(file):
|
||||
def extract_einsatzausdruck(file, f_id):
    """ extracts as much information from the parsed Einsatzausdruck as possible

    The PDF is converted to text with convert() and split into lines. Some
    fields are read from fixed line positions (14 to 20), the others from
    positions relative to the marker lines 'Bemerkungen',
    'Disponierte Einheiten' and 'Hinweis'.

    file: PDF to parse, handed over to convert() unchanged
    f_id: ID which has to be found on line 14 of the converted text

    Returns a dict with the keys auftrag, datum, zeit, melder, erfasser,
    bemerkungen, einsatz, plzort, strasse and hinweis. Returns False when
    the f_id doesn't match, a marker line is missing or the text is shorter
    than the positions which are read.
    """

    splited = convert(file).splitlines()

    # sanity check to see if we can correlate the f_id
    # a text with less than 15 lines has no line 14 and can never match
    if len(splited) <= 14 or f_id != splited[14]:
        logging.error('PDF parsing: f_id doesn\'t match line 14')
        return False
    logging.info('PDF parsing: f_id matches line 14')

    try:
        # search some well-known words for later positional computation
        index_bemerkungen = splited.index('Bemerkungen')
        index_dispo = splited.index('Disponierte Einheiten')
        index_hinweis = splited.index('Hinweis')
    except ValueError:
        # list.index raises ValueError when a marker line is missing
        logging.error('PDF file doesn\'t look like a Einsatzausdruck')
        return False

    # get length of bemerkungen field
    # it lives between the line which contains 'Bemerkungen' and
    # the line 'Disponierte Einheiten'
    length_bemerkungen = index_dispo - index_bemerkungen - 1

    try:
        data = {
            'auftrag': splited[14],
            'datum': splited[15],
            'zeit': splited[16],
            'melder': concatenate_to_multiline_string(splited, 18, 19),
            'erfasser': splited[20],
            'bemerkungen': concatenate_to_multiline_string(
                splited,
                index_bemerkungen,
                index_bemerkungen + length_bemerkungen,
            ),
            'einsatz': splited[index_dispo + 5],
            'plzort': splited[index_dispo + 8].title(),
            'strasse': splited[index_dispo + 9].title(),
            # TODO: 'objekt' is not extracted yet
            'hinweis': splited[index_hinweis + 2],
        }
    except IndexError:
        # the text ends before one of the expected positions - report this
        # as a parsing failure instead of crashing the caller
        logging.error('PDF parsing: Einsatzausdruck is shorter than expected')
        return False

    return data
|
||||
|
||||
def get_einsatzprotokoll(file):
|
||||
def extract_einsatzprotokoll(file, f_id):
|
||||
""" extracts as many information from the parsed Einsatzprotokoll as possible """
|
||||
|
||||
splited = convert(file).splitlines()
|
||||
|
||||
# sanity check to see if we can correlate the f_id
|
||||
if f_id == splited[26]:
|
||||
logging.info('PDF parsing: f_id matches line 26')
|
||||
else:
|
||||
logging.error('PDF parsing: f_id doesn\'t match line 26')
|
||||
return False
|
||||
|
||||
data = {
|
||||
'auftrag': splited[26],
|
||||
'datum': splited[25],
|
||||
|
|
Loading…
Reference in New Issue