Initial work on PDF data extraction

This commit is contained in:
Tobias Brunner 2017-12-28 20:07:56 +01:00
parent d3885c47d9
commit 9ff811f1ba
3 changed files with 82 additions and 2 deletions

View File

@ -4,7 +4,7 @@ import re
from datetime import datetime
import requests
def create_einsatzrapport(username, password, base_url, f_id):
def create_einsatzrapport(username, password, base_url, f_id, pdf_data):
session = requests.session()
login_data = {
@ -40,7 +40,7 @@ def create_einsatzrapport(username, password, base_url, f_id):
'ztb_h': (None, '12'), # 05. Zeit bis
'ztb_m': (None, '12'), # 05. Zeit bis
'e_ort_1': (None, '306'), # 06. Einsatzort: Urdorf 306, Birmensdorf 298
'eins_ereig': (None, f_id), # 07. Ereignis
'eins_ereig': (None, pdf_data['einsatz']), # 07. Ereignis # TODO utf-8?
'adr': (None, 'TBD'), # 08. Adresse
#'zh_alarmierung_h': (None, 'UNKNOWN'), # 12. Alarmierung
#'zh_alarmierung_m': (None, 'UNKNOWN'), # 12. Alarmierung

View File

@ -16,6 +16,7 @@ import aioeasywebdav
from dotenv import load_dotenv, find_dotenv
import paho.mqtt.client as mqtt
from lodur_connect import create_einsatzrapport, upload_alarmdepesche
import pdf_extract
_EMAIL_SUBJECTS = '(OR SUBJECT "Einsatzausdruck_FW" SUBJECT "Einsatzprotokoll" UNSEEN)'
_INTERVAL = 10
@ -236,6 +237,9 @@ def main():
# this is real - publish Einsatz on MQTT
mqtt_client.publish('pylokid/' + f_type, f_id)
# get as many information from PDF as possible
pdf_data = pdf_extract.get_einsatzausdruck(os.path.join(tmp_dir, file_name))
# create new Einsatzrapport in Lodur
logger.info('Creating Einsatzrapport in Lodur for ' + f_id)
lodur_id = create_einsatzrapport(
@ -243,6 +247,7 @@ def main():
lodur_password,
lodur_base_url,
f_id,
pdf_data,
)
logger.info('Sent data to Lodur. Assigned Lodur ID: ' + lodur_id)
# store lodur id in webdav

75
pdf_extract.py Normal file
View File

@ -0,0 +1,75 @@
#!/usr/bin/env python3
import io
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter,TextConverter,XMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def concatenate_to_multiline_string(data, start, end):
    """Join ``data[start]`` .. ``data[end]`` into one newline-terminated string.

    Args:
        data: Indexable sequence of strings (e.g. the lines of a document).
        start: Index of the first element to include.
        end: Index of the last element to include (inclusive).

    Returns:
        The selected elements, each followed by ``'\\n'`` (so the result
        ends with a newline). An empty string when ``start > end``.

    Raises:
        IndexError: If an index in ``start..end`` is outside ``data``.
    """
    # Index element by element (instead of slicing) so that an out-of-range
    # ``end`` still raises IndexError rather than being silently truncated.
    return ''.join(data[index] + '\n' for index in range(start, end + 1))
def convert(file):
    """Extract the text of a PDF file using pdfminer.

    Args:
        file: Path of the PDF file to read.

    Returns:
        The text written by pdfminer's ``TextConverter`` for all processed
        pages, as a single string.

    The input file, the converter and the output buffer are released even
    when processing a page raises.
    """
    manager = PDFResourceManager()
    with io.StringIO() as output:
        converter = TextConverter(
            manager, output, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(file, 'rb') as infile:
                # An empty page-number set is passed through unchanged;
                # presumably pdfminer treats it as "all pages" - confirm
                # against the pdfminer documentation.
                pages = PDFPage.get_pages(
                    infile, set(), caching=True, check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
            # Read the buffer before the ``with`` block closes it.
            return output.getvalue()
        finally:
            converter.close()
def get_einsatzausdruck(file):
    """Extract as much information as possible from an Einsatzausdruck PDF.

    The PDF is converted to text with ``convert`` and split into lines;
    the fields are then read from fixed line positions.

    Args:
        file: Path of the Einsatzausdruck PDF.

    Returns:
        A dict with the keys ``auftrag``, ``datum``, ``zeit``, ``melder``,
        ``erfasser``, ``bemerkungen``, ``einsatz``, ``ort``, ``strasse``
        and ``hinweis``.

    Raises:
        ValueError: If no line equals ``'Disponierte Einheiten'``.
        IndexError: If the text has fewer lines than the positions read.
    """
    lines = convert(file).splitlines()

    # The "second part" (below the map) does not always start at the same
    # line; according to the original author this depends on the length of
    # the Bemerkungen. Fields of that part are therefore addressed relative
    # to the 'Disponierte Einheiten' heading, which sits at index 29 in the
    # reference layout (so heading + 5 == 34, + 8 == 37, + 9 == 38,
    # + 21 == 50).
    # TODO: make it better
    heading = lines.index('Disponierte Einheiten')

    # NOTE(review): 'bemerkungen' always reads lines 23..28 even when the
    # heading has moved - confirm whether that range should follow it.
    return {
        'auftrag': lines[14],
        'datum': lines[15],
        'zeit': lines[16],
        'melder': concatenate_to_multiline_string(lines, 18, 19),
        'erfasser': lines[20],
        'bemerkungen': concatenate_to_multiline_string(lines, 23, 28),
        'einsatz': lines[heading + 5],
        'ort': lines[heading + 8],
        'strasse': lines[heading + 9],
        # TODO: 'objekt' is not extracted yet
        'hinweis': lines[heading + 21],
    }
def get_einsatzprotokoll(file):
    """Extract as much information as possible from an Einsatzprotokoll PDF.

    The PDF is converted to text with ``convert`` and split into lines;
    each field is read from a fixed line position.

    Args:
        file: Path of the Einsatzprotokoll PDF.

    Returns:
        A dict with the keys ``auftrag``, ``datum``, ``angelegt``,
        ``disposition``, ``ausgerueckt`` and ``anort``.

    Raises:
        IndexError: If the text has fewer lines than the positions read.
    """
    # (field name, line index) pairs, in the order the result dict is built.
    field_positions = (
        ('auftrag', 26),
        ('datum', 25),
        ('angelegt', 28),
        ('disposition', 30),
        ('ausgerueckt', 32),
        ('anort', 33),
    )
    lines = convert(file).splitlines()
    return {field: lines[position] for field, position in field_positions}