complete rework of pdf parsing
This commit is contained in:
parent
b5c7d7b7b1
commit
b0228afcfd
|
@ -74,7 +74,7 @@ class Lodur:
|
||||||
'%H:%M',
|
'%H:%M',
|
||||||
)
|
)
|
||||||
zh_am_schad = datetime.strptime(
|
zh_am_schad = datetime.strptime(
|
||||||
pdf_data['anort'],
|
pdf_data['vorort'],
|
||||||
'%H:%M',
|
'%H:%M',
|
||||||
)
|
)
|
||||||
except ValueError as err:
|
except ValueError as err:
|
||||||
|
@ -120,9 +120,9 @@ class Lodur:
|
||||||
'%H:%M',
|
'%H:%M',
|
||||||
)
|
)
|
||||||
eins_ereig = pdf_data['einsatz']
|
eins_ereig = pdf_data['einsatz']
|
||||||
bemerkungen = pdf_data['bemerkungen']
|
bemerkungen = pdf_data['bemerkungen'] + '\n' + pdf_data['disponierteeinheiten']
|
||||||
wer_ala = pdf_data['melder']
|
wer_ala = pdf_data['melder']
|
||||||
adr = pdf_data['strasse'] + ', ' + pdf_data['ort']
|
adr = pdf_data['ort']
|
||||||
else:
|
else:
|
||||||
date = datetime.now()
|
date = datetime.now()
|
||||||
time = datetime.now()
|
time = datetime.now()
|
||||||
|
|
|
@ -1,209 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
|
|
||||||
""" extracts data from ELZ PDFs """
|
|
||||||
|
|
||||||
import io
|
|
||||||
import logging
|
|
||||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
|
||||||
from pdfminer.converter import TextConverter
|
|
||||||
from pdfminer.layout import LAParams
|
|
||||||
from pdfminer.pdfpage import PDFPage
|
|
||||||
|
|
||||||
class PDFHandling:
    """Parse ELZ PDFs (Einsatzausdruck / Einsatzprotokoll) with pdfminer.

    The PDF is converted to plain text, split into lines, and the wanted
    values are picked by their position relative to well-known label lines.
    """

    # pdfminer loggers which are limited to WARNING - more is not needed
    _QUIET_LOGGERS = (
        'pdfminer.pdfdocument',
        'pdfminer.pdfpage',
        'pdfminer.pdfinterp',
        'pdfminer.psparser',
        'pdfminer.cmapdb',
        'pdfminer.pdfparser',
    )

    def __init__(self):
        """Set up the module logger and reduce pdfminer's log output."""
        self.logger = logging.getLogger(__name__)

        for name in self._QUIET_LOGGERS:
            logging.getLogger(name).setLevel(logging.WARNING)

    def concatenate_to_multiline_string(self, data, start, end):
        """Join ``data[start]`` .. ``data[end]`` (inclusive) into one string.

        Every line is terminated with a newline. An empty string is returned
        when ``start`` is greater than ``end``. Raises IndexError if an index
        in the range does not exist in ``data``.
        """
        return ''.join(data[index] + '\n' for index in range(start, end + 1))

    def convert(self, file):
        """Convert the PDF at path ``file`` to a multiline string.

        All pages are processed (an empty page-number set is passed to
        pdfminer). The input file and the converter are released even if
        pdfminer raises while processing a page.
        """
        manager = PDFResourceManager()
        output = io.StringIO()
        converter = TextConverter(manager, output, codec='utf-8', laparams=LAParams())

        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(file, 'rb') as infile:
                pages = PDFPage.get_pages(
                    infile, set(), caching=True, check_extractable=True
                )
                for page in pages:
                    interpreter.process_page(page)
            # read the text before the buffers are closed in ``finally``
            return output.getvalue()
        finally:
            converter.close()
            output.close()

    def extract_einsatzausdruck(self, file, f_id):
        """Extract as much information as possible from an Einsatzausdruck.

        Args:
            file: path of the PDF file.
            f_id: expected operation ID; it must equal the 'auftrag' value
                found in the PDF.

        Returns:
            A dict with the keys auftrag, datum, zeit, melder, erfasser,
            bemerkungen, einsatz, sondersignal, ort, strasse and hinweis, or
            ``False`` if the PDF does not look like a known Einsatzausdruck
            layout or the ID does not match.
        """
        converted = self.convert(file)
        splited = converted.splitlines()

        self.logger.debug('[%s] Parsed PDF raw:\n %s', f_id, converted)
        self.logger.debug('[%s] Line-splited PDF: %s', f_id, splited)

        # search some well-known words for later positional computation
        try:
            index_einsatzauftragfw = splited.index('Einsatzauftrag Feuerwehr')
            index_erfasser = splited.index('Erfasser')
            index_auftrag = splited.index('Auftrag')
            index_bemerkungen = splited.index('Bemerkungen')
            index_dispo = splited.index('Disponierte Einheiten')
            index_einsatz = splited.index('Einsatz')
            index_hinweis = splited.index('Hinweis')
            index_maps = splited.index('Google Maps')
        except ValueError as err:
            self.logger.error('[%s] PDF file does not look like a Einsatzausdruck: %s', f_id, err)
            return False

        # the PDF parsing not always produces the same output
        # let's define the already known output
        try:
            if index_bemerkungen == 6:
                self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)
                # In this layout the length of the bemerkungen field is
                # computed up to the 'Auftrag' line.
                # NOTE(review): the other layouts use 'Disponierte Einheiten'
                # as the end marker - confirm this difference is intended.
                length_bemerkungen = index_auftrag - index_bemerkungen - 1
                erfasser = splited[index_dispo - 2]
                auftrag = splited[index_erfasser + 2]
                datum = splited[index_erfasser + 3]
                zeit = splited[index_erfasser + 4]
                einsatz = splited[index_einsatz - 6]
                sondersignal = splited[index_einsatz - 5]
                ort = splited[index_einsatz - 3]
                strasse = splited[index_einsatz - 2]
                # sometimes there is just a phone number for the field melder
                # but on the second line, so the lines vary for erfasser and
                # melder
                if index_dispo - index_erfasser == 10:
                    melder = splited[index_dispo - 4] + ', ' + splited[index_dispo - 3]
                else:
                    melder = splited[index_dispo - 4]
            # BMA style
            elif index_bemerkungen == 20:
                self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)
                length_bemerkungen = index_dispo - index_bemerkungen - 1
                erfasser = splited[index_bemerkungen - 2]
                auftrag = splited[index_einsatzauftragfw + 2]
                datum = splited[index_einsatzauftragfw + 3]
                zeit = splited[index_einsatzauftragfw + 4]
                einsatz = splited[index_einsatz + 6]
                sondersignal = splited[index_einsatz + 7]
                ort = splited[index_einsatz + 9]
                strasse = splited[index_einsatz + 10]
                melder = 'BMA'  # There is no melder on a BMA Einsatzausdruck
            elif index_bemerkungen in (21, 22):
                self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)
                length_bemerkungen = index_dispo - index_bemerkungen - 1
                erfasser = splited[index_bemerkungen - 2]
                auftrag = splited[index_erfasser + 2]
                datum = splited[index_erfasser + 3]
                zeit = splited[index_erfasser + 4]
                einsatz = splited[index_einsatz - 6]
                sondersignal = splited[index_einsatz - 5]
                ort = splited[index_einsatz - 3]
                strasse = splited[index_einsatz - 2]
                if index_bemerkungen - index_erfasser == 10:
                    melder = splited[index_bemerkungen - 4] + ', ' + splited[index_bemerkungen - 3]
                else:
                    melder = splited[index_bemerkungen - 4]
            elif index_bemerkungen == 24:
                self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)
                length_bemerkungen = index_dispo - index_bemerkungen - 1
                erfasser = splited[index_bemerkungen - 2]
                auftrag = splited[index_einsatzauftragfw + 4]
                datum = splited[index_einsatzauftragfw + 9]
                zeit = splited[index_einsatzauftragfw + 10]
                einsatz = splited[index_einsatz - 4]
                sondersignal = splited[index_einsatz - 3]
                ort = ''
                strasse = splited[index_einsatz - 2]
                melder = splited[index_dispo - 8] + ', ' + splited[index_dispo - 7]
            else:
                self.logger.error('[%s] Unknown location of Bemerkungen. Line %s', f_id, index_bemerkungen)
                return False
        except IndexError as err:
            # a positional lookup pointed past the end of the parsed text
            self.logger.error('[%s] PDF file is shorter than the expected layout: %s', f_id, err)
            return False

        # sanity check to see if we can correlate the f_id
        if f_id != auftrag:
            self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, auftrag)
            return False
        self.logger.info('[%s] ID matches in PDF', f_id)

        # try to find out if there is a hinweis
        # if yes, the difference between the indexes is 4, else it's shorter
        if index_maps - index_hinweis == 4:
            hinweis = splited[index_hinweis + 2]
        else:
            hinweis = ''

        bemerkungen = self.concatenate_to_multiline_string(
            splited,
            index_bemerkungen + 1,
            index_bemerkungen + length_bemerkungen
        ).rstrip()

        # note: an 'objekt' field is not extracted
        return {
            'auftrag': auftrag,
            'datum': datum,
            'zeit': zeit,
            'melder': melder,
            'erfasser': erfasser,
            'bemerkungen': bemerkungen,
            'einsatz': einsatz,
            'sondersignal': sondersignal,
            'ort': ort.title(),
            'strasse': strasse.title(),
            'hinweis': hinweis,
        }

    def extract_einsatzprotokoll(self, file, f_id):
        """Extract as much information as possible from an Einsatzprotokoll.

        The values are read from fixed line numbers of the converted text.

        Args:
            file: path of the PDF file.
            f_id: expected operation ID; it must equal line 26 of the text.

        Returns:
            A dict with the keys auftrag, datum, angelegt, disposition,
            ausgerueckt and anort, or ``False`` if the text is too short or
            the ID does not match.
        """
        splited = self.convert(file).splitlines()

        # the highest line index read below is 33
        if len(splited) <= 33:
            self.logger.error(
                '[%s] PDF file does not look like a Einsatzprotokoll: only %s lines',
                f_id, len(splited)
            )
            return False

        # sanity check to see if we can correlate the f_id
        if f_id != splited[26]:
            self.logger.error('[%s] ID does not match in PDF', f_id)
            return False
        self.logger.info('[%s] ID matches in PDF', f_id)

        return {
            'auftrag': splited[26],
            'datum': splited[25],
            'angelegt': splited[28],
            'disposition': splited[30],
            'ausgerueckt': splited[32],
            'anort': splited[33],
        }
|
|
140
library/pdftotext.py
Normal file
140
library/pdftotext.py
Normal file
|
@ -0,0 +1,140 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
""" extracts data from ELZ PDFs using Poppler pdftotext """
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
import logging
|
||||||
|
|
||||||
|
class PDFParsing:
    """Extract fields from ELZ PDFs using Poppler's pdftotext.

    Every field is read by running pdftotext on the first page of the PDF,
    restricted to a crop rectangle given by the field's coordinates.
    """

    # path of the Poppler pdftotext binary
    PDFTOTEXT = '/usr/bin/pdftotext'

    def __init__(self):
        """Set up the module logger."""
        self.logger = logging.getLogger(__name__)
        self.logger.info('PDF parsing based on pdftotext loaded')

    def extract(self, f_id, file, datafields):
        """Run pdftotext once per field and collect the cropped text.

        Args:
            f_id: expected operation ID; it must equal the extracted
                'auftrag' field.
            file: path of the PDF file.
            datafields: mapping of field name to a dict with the keys
                'xMin', 'yMin', 'xMax' and 'yMax' and optionally
                'edit': 'title' to title-case the extracted text.

        Returns:
            A dict mapping each field name to its extracted text (trailing
            whitespace removed), or ``False`` if no 'auftrag' field was
            extracted or it does not equal ``f_id``.

        Raises:
            OSError: if the pdftotext binary cannot be executed.
        """
        data = {}

        for field, coordinate in datafields.items():
            # top left corner of the crop area
            x = coordinate['xMin']
            y = coordinate['yMin']
            # width and height of the crop area in pixels
            w = coordinate['xMax'] - coordinate['xMin']
            h = coordinate['yMax'] - coordinate['yMin']

            self.logger.debug(
                '[%s] Computed command for field %s: '
                'pdftotext -f 1 -l 1 -x %s -y %s -W %s -H %s',
                f_id, field, x, y, w, h
            )

            # stderr is captured separately so that pdftotext diagnostics
            # never end up inside the extracted field value
            result = subprocess.run(
                [
                    self.PDFTOTEXT,
                    '-f', '1',
                    '-l', '1',
                    '-x', str(x),
                    '-y', str(y),
                    '-W', str(w),
                    '-H', str(h),
                    file,
                    '-',
                ],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=False,
            )

            if result.returncode != 0:
                self.logger.error(
                    '[%s] pdftotext failed for field %s (exit code %s): %s',
                    f_id, field, result.returncode, result.stderr.strip()
                )
            elif result.stderr:
                self.logger.warning(
                    '[%s] pdftotext reported for field %s: %s',
                    f_id, field, result.stderr.strip()
                )

            ## TODO: fixup some fields (lowercase, remove unnecessary \n)
            value = result.stdout.rstrip()
            if coordinate.get('edit') == 'title':
                value = value.title()
            data[field] = value

        # sanity check to see if we can correlate the f_id
        if 'auftrag' not in data:
            self.logger.error('[%s] No field "auftrag" extracted from PDF', f_id)
            return False

        if f_id != data['auftrag']:
            self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, data['auftrag'])
            return False

        self.logger.debug('[%s] ID matches in PDF', f_id)
        return data

    def extract_einsatzausdruck(self, file, f_id):
        """Extract information from an Einsatzausdruck using pdftotext.

        Returns the result of :meth:`extract` for the Einsatzausdruck field
        coordinates: a dict of field values, or ``False`` on ID mismatch.
        """
        self.logger.debug('[%s] Parsing PDF: %s', f_id, file)

        # Get them using 'pdftotext -bbox'
        # y = row
        # x = column: xMax 450 / 590 means full width
        coordinates = {
            'auftrag': {
                'xMin': 70, 'yMin': 47, 'xMax': 120, 'yMax': 58,
            },
            'datum': {
                'xMin': 190, 'yMin': 47, 'xMax': 239, 'yMax': 58,
            },
            'zeit': {
                'xMin': 190, 'yMin': 59, 'xMax': 215, 'yMax': 70,
            },
            'melder': {
                'xMin': 304, 'yMin': 47, 'xMax': 446, 'yMax': 70, 'edit': 'title'
            },
            'erfasser': {
                'xMin': 448, 'yMin': 59, 'xMax': 478, 'yMax': 70,
            },
            # big field until "Disponierte Einheiten"
            'bemerkungen': {
                'xMin': 28, 'yMin': 112, 'xMax': 590, 'yMax': 350,
            },
            'disponierteeinheiten': {
                'xMin': 28, 'yMin': 366, 'xMax': 450, 'yMax': 376,
            },
            'einsatz': {
                'xMin': 76, 'yMin': 690, 'xMax': 450, 'yMax': 703,
            },
            'sondersignal': {
                'xMin': 76, 'yMin': 707, 'xMax': 450, 'yMax': 721,
            },
            'ort': {
                'xMin': 76, 'yMin': 732, 'xMax': 590, 'yMax': 745,
            },
            'hinweis': {
                'xMin': 76, 'yMin': 773, 'xMax': 450, 'yMax': 787,
            },
        }

        return self.extract(f_id, file, coordinates)

    def extract_einsatzprotokoll(self, file, f_id):
        """Extract information from an Einsatzprotokoll using pdftotext.

        Returns the result of :meth:`extract` for the Einsatzprotokoll field
        coordinates: a dict of field values, or ``False`` on ID mismatch.
        """
        self.logger.debug('[%s] Parsing PDF: %s', f_id, file)

        # Get them using 'pdftotext -bbox'
        # y = row
        # x = column: xMax 450 / 590 means full width
        coordinates = {
            'auftrag': {
                'xMin': 192, 'yMin': 132, 'xMax': 238, 'yMax': 142,
            },
            'angelegt': {
                'xMin': 192, 'yMin': 294, 'xMax': 226, 'yMax': 304,
            },
            'dispo': {
                'xMin': 192, 'yMin': 312, 'xMax': 226, 'yMax': 322,
            },
            'ausgerueckt': {
                'xMin': 192, 'yMin': 331, 'xMax': 226, 'yMax': 341,
            },
            'vorort': {
                'xMin': 192, 'yMin': 348, 'xMax': 226, 'yMax': 358,
            },
        }

        return self.extract(f_id, file, coordinates)
|
4
main.py
4
main.py
|
@ -14,7 +14,7 @@ from library.emailhandling import EmailHandling
|
||||||
from library.lodur import Lodur
|
from library.lodur import Lodur
|
||||||
from library.mqtt import MQTTClient
|
from library.mqtt import MQTTClient
|
||||||
from library.gotify import GotifyClient
|
from library.gotify import GotifyClient
|
||||||
from library.pdf_extract import PDFHandling
|
from library.pdftotext import PDFParsing
|
||||||
from library.webdav import WebDav
|
from library.webdav import WebDav
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
|
@ -92,7 +92,7 @@ def main():
|
||||||
)
|
)
|
||||||
|
|
||||||
# Initialize PDF Parser
|
# Initialize PDF Parser
|
||||||
pdf = PDFHandling()
|
pdf = PDFParsing()
|
||||||
|
|
||||||
# Main Loop
|
# Main Loop
|
||||||
while True:
|
while True:
|
||||||
|
|
|
@ -1,21 +0,0 @@
|
||||||
import re
|
|
||||||
import logging
|
|
||||||
from pprint import pprint
|
|
||||||
from pathlib import Path
|
|
||||||
from library.pdf_extract import PDFHandling
|
|
||||||
|
|
||||||
# Directory which is searched recursively for PDF files
PATH = '/tmp/pylokid'

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

PDF = PDFHandling()

# The operation ID (F followed by 8 digits and an underscore) is taken from
# the file path
F_ID_PATTERN = re.compile(r'.*(F[0-9]{8})_.*')

for path in Path(PATH).glob('**/*.pdf'):
    file = str(path)
    print(file)
    match = F_ID_PATTERN.search(file)
    if match is None:
        # not every PDF below PATH has to carry an ID - skip instead of crash
        print('no F-ID found in file name, skipping')
        continue
    f_id = match.group(1)
    print(f_id)
    pprint(PDF.extract_einsatzausdruck(file, f_id))
|
|
30
test_pdftotext.py
Normal file
30
test_pdftotext.py
Normal file
|
@ -0,0 +1,30 @@
|
||||||
|
import re
|
||||||
|
import logging
|
||||||
|
from pprint import pprint
|
||||||
|
from pathlib import Path
|
||||||
|
from library.pdftotext import PDFParsing
|
||||||
|
|
||||||
|
# Directory which is searched recursively for PDF files
PATH = '/home/tobru/Documents/Feuerwehr/Stab/Fourier/Einsatzdepeschen/2019'

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

PDF = PDFParsing()

# The operation ID (F followed by 8 digits and an underscore) is taken from
# the file path
F_ID_PATTERN = re.compile(r'.*(F[0-9]{8})_.*')

for path in Path(PATH).glob('**/Einsatzausdruck*.pdf'):
    file = str(path)
    print(file)
    match = F_ID_PATTERN.search(file)
    if match is None:
        # skip files without an ID in the name instead of crashing
        print('no F-ID found in file name, skipping')
        continue
    f_id = match.group(1)
    print(f_id)
    pprint(PDF.extract_einsatzausdruck(file, f_id))

# Disabled: the same loop for Einsatzprotokoll files
#
# for path in Path(PATH).glob('**/Einsatzprotokoll*.pdf'):
#     file = str(path)
#     print(file)
#     f_id = re.search('.*(F[0-9]{8})_.*', file).group(1)
#     print(f_id)
#     pprint(PDF.extract_einsatzprotokoll(file, f_id))
|
Loading…
Reference in a new issue