Complete rework of PDF parsing
This commit is contained in:
parent
b5c7d7b7b1
commit
b0228afcfd
|
@ -74,7 +74,7 @@ class Lodur:
|
|||
'%H:%M',
|
||||
)
|
||||
zh_am_schad = datetime.strptime(
|
||||
pdf_data['anort'],
|
||||
pdf_data['vorort'],
|
||||
'%H:%M',
|
||||
)
|
||||
except ValueError as err:
|
||||
|
@ -120,9 +120,9 @@ class Lodur:
|
|||
'%H:%M',
|
||||
)
|
||||
eins_ereig = pdf_data['einsatz']
|
||||
bemerkungen = pdf_data['bemerkungen']
|
||||
bemerkungen = pdf_data['bemerkungen'] + '\n' + pdf_data['disponierteeinheiten']
|
||||
wer_ala = pdf_data['melder']
|
||||
adr = pdf_data['strasse'] + ', ' + pdf_data['ort']
|
||||
adr = pdf_data['ort']
|
||||
else:
|
||||
date = datetime.now()
|
||||
time = datetime.now()
|
||||
|
|
|
@ -1,209 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
""" extracts data from ELZ PDFs """
|
||||
|
||||
import io
|
||||
import logging
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfminer.converter import TextConverter
|
||||
from pdfminer.layout import LAParams
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
|
||||
class PDFHandling:
    """Parse ELZ PDFs with pdfminer and pick fields by their line position."""

    def __init__(self):
        """Create the module logger and quieten the pdfminer loggers."""
        self.logger = logging.getLogger(__name__)

        # less logging for pdfminer - more is not needed
        for submodule in ('pdfdocument', 'pdfpage', 'pdfinterp',
                          'psparser', 'cmapdb', 'pdfparser'):
            logging.getLogger('pdfminer.' + submodule).setLevel(logging.WARNING)

    def concatenate_to_multiline_string(self, data, start, end):
        """Join ``data[start]`` .. ``data[end]`` (inclusive) into one string.

        Every line is terminated with a newline. An empty string is returned
        when ``start`` is greater than ``end``. Raises IndexError when an
        index in the range does not exist in ``data``.
        """
        return ''.join(data[pos] + '\n' for pos in range(start, end + 1))

    def convert(self, file):
        """Convert the PDF at path ``file`` to a multiline string.

        All pages are processed. The input file, the converter and the text
        buffer are released even when pdfminer raises while parsing.
        """
        manager = PDFResourceManager()

        with io.StringIO() as output:
            converter = TextConverter(
                manager, output, codec='utf-8', laparams=LAParams()
            )
            try:
                interpreter = PDFPageInterpreter(manager, converter)
                with open(file, 'rb') as infile:
                    # an empty page number set selects every page
                    pages = PDFPage.get_pages(
                        infile, set(), caching=True, check_extractable=True
                    )
                    for page in pages:
                        interpreter.process_page(page)
                return output.getvalue()
            finally:
                converter.close()

    def extract_einsatzausdruck(self, file, f_id):
        """Extract as much information as possible from an Einsatzausdruck.

        The PDF is converted to text lines and the fields are picked relative
        to the position of well-known caption lines.

        Returns a dict with the keys auftrag, datum, zeit, melder, erfasser,
        bemerkungen, einsatz, sondersignal, ort, strasse and hinweis, or
        False when the captions are missing, the layout is unknown, a
        position lies outside of the document or the parsed Auftrag does not
        equal ``f_id``.
        """
        converted = self.convert(file)
        splited = converted.splitlines()

        self.logger.debug('[%s] Parsed PDF raw:\n %s', f_id, converted)
        self.logger.debug('[%s] Line-splited PDF: %s', f_id, splited)

        # search some well-known words for later positional computation
        try:
            index_einsatzauftragfw = splited.index('Einsatzauftrag Feuerwehr')
            index_erfasser = splited.index('Erfasser')
            index_auftrag = splited.index('Auftrag')
            index_bemerkungen = splited.index('Bemerkungen')
            index_dispo = splited.index('Disponierte Einheiten')
            index_einsatz = splited.index('Einsatz')
            index_hinweis = splited.index('Hinweis')
            index_maps = splited.index('Google Maps')
        except ValueError as err:
            self.logger.error('[%s] PDF file does not look like a Einsatzausdruck: %s', f_id, err)
            return False

        if index_bemerkungen not in (6, 20, 21, 22, 24):
            self.logger.error('[%s] Unknown location of Bemerkungen. Line %s', f_id, index_bemerkungen)
            return False
        self.logger.info('[%s] Found Bemerkungen on line %s', f_id, index_bemerkungen)

        try:
            # the PDF parsing not always produces the same output
            # let's define the already known output
            if index_bemerkungen == 6:
                # NOTE(review): the original comment says the field ends at
                # 'Disponierte Einheiten', but the length is computed from
                # the 'Auftrag' line - confirm which one is intended.
                length_bemerkungen = index_auftrag - index_bemerkungen - 1
                erfasser = splited[index_dispo - 2]
                auftrag = splited[index_erfasser + 2]
                datum = splited[index_erfasser + 3]
                zeit = splited[index_erfasser + 4]
                einsatz = splited[index_einsatz - 6]
                sondersignal = splited[index_einsatz - 5]
                ort = splited[index_einsatz - 3]
                strasse = splited[index_einsatz - 2]
                # sometimes there is just a phone number for the field melder
                # but on the second line, so the lines vary for erfasser and
                # melder
                if index_dispo - index_erfasser == 10:
                    melder = splited[index_dispo - 4] + ', ' + splited[index_dispo - 3]
                else:
                    melder = splited[index_dispo - 4]
            elif index_bemerkungen == 20:
                # BMA style
                length_bemerkungen = index_dispo - index_bemerkungen - 1
                erfasser = splited[index_bemerkungen - 2]
                auftrag = splited[index_einsatzauftragfw + 2]
                datum = splited[index_einsatzauftragfw + 3]
                zeit = splited[index_einsatzauftragfw + 4]
                einsatz = splited[index_einsatz + 6]
                sondersignal = splited[index_einsatz + 7]
                ort = splited[index_einsatz + 9]
                strasse = splited[index_einsatz + 10]
                melder = 'BMA'  # There is no melder on a BMA Einsatzausdruck
            elif index_bemerkungen in (21, 22):
                length_bemerkungen = index_dispo - index_bemerkungen - 1
                erfasser = splited[index_bemerkungen - 2]
                auftrag = splited[index_erfasser + 2]
                datum = splited[index_erfasser + 3]
                zeit = splited[index_erfasser + 4]
                einsatz = splited[index_einsatz - 6]
                sondersignal = splited[index_einsatz - 5]
                ort = splited[index_einsatz - 3]
                strasse = splited[index_einsatz - 2]
                if index_bemerkungen - index_erfasser == 10:
                    melder = splited[index_bemerkungen - 4] + ', ' + splited[index_bemerkungen - 3]
                else:
                    melder = splited[index_bemerkungen - 4]
            else:
                # index_bemerkungen == 24
                length_bemerkungen = index_dispo - index_bemerkungen - 1
                erfasser = splited[index_bemerkungen - 2]
                auftrag = splited[index_einsatzauftragfw + 4]
                datum = splited[index_einsatzauftragfw + 9]
                zeit = splited[index_einsatzauftragfw + 10]
                einsatz = splited[index_einsatz - 4]
                sondersignal = splited[index_einsatz - 3]
                ort = ''
                strasse = splited[index_einsatz - 2]
                melder = splited[index_dispo - 8] + ', ' + splited[index_dispo - 7]

            bemerkungen = self.concatenate_to_multiline_string(
                splited,
                index_bemerkungen + 1,
                index_bemerkungen + length_bemerkungen
            ).rstrip()

            # try to find out if there is a hinweis
            # if yes, the difference between the indexes is 4, else it's shorter
            if index_maps - index_hinweis == 4:
                hinweis = splited[index_hinweis + 2]
            else:
                hinweis = ''
        except IndexError as err:
            # a computed position points behind the end of the document
            self.logger.error('[%s] PDF file is shorter than expected: %s', f_id, err)
            return False

        # sanity check to see if we can correlate the f_id
        if f_id != auftrag:
            self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, auftrag)
            return False
        self.logger.info('[%s] ID matches in PDF', f_id)

        return {
            'auftrag': auftrag,
            'datum': datum,
            'zeit': zeit,
            'melder': melder,
            'erfasser': erfasser,
            'bemerkungen': bemerkungen,
            'einsatz': einsatz,
            'sondersignal': sondersignal,
            'ort': ort.title(),
            'strasse': strasse.title(),
            'hinweis': hinweis,
        }

    def extract_einsatzprotokoll(self, file, f_id):
        """Extract as much information as possible from an Einsatzprotokoll.

        The fields are read from fixed line numbers of the converted PDF.

        Returns a dict with the keys auftrag, datum, angelegt, disposition,
        ausgerueckt and anort, or False when the document has too few lines
        or line 26 does not equal ``f_id``.
        """
        splited = self.convert(file).splitlines()

        # the highest line number read below is 33
        if len(splited) < 34:
            self.logger.error('[%s] PDF file is too short for a Einsatzprotokoll', f_id)
            return False

        # sanity check to see if we can correlate the f_id
        if f_id != splited[26]:
            self.logger.error('[%s] ID does not match in PDF', f_id)
            return False
        self.logger.info('[%s] ID matches in PDF', f_id)

        return {
            'auftrag': splited[26],
            'datum': splited[25],
            'angelegt': splited[28],
            'disposition': splited[30],
            'ausgerueckt': splited[32],
            'anort': splited[33],
        }
|
|
@ -0,0 +1,140 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
""" extracts data from ELZ PDFs using Poppler pdftotext """
|
||||
|
||||
import subprocess
|
||||
import logging
|
||||
|
||||
class PDFParsing:
    """Extract fields from ELZ PDFs by cropping page areas with pdftotext.

    Every field is read by one run of the external ``pdftotext`` program on
    the first page of the document, limited to a rectangular crop area.
    """

    # Crop areas of the Einsatzausdruck fields.
    # Get them using 'pdftotext -bbox'
    # y = row
    # x = column: xMax 450 / 590 means full width
    EINSATZAUSDRUCK_FIELDS = {
        'auftrag': {
            'xMin': 70, 'yMin': 47, 'xMax': 120, 'yMax': 58,
        },
        'datum': {
            'xMin': 190, 'yMin': 47, 'xMax': 239, 'yMax': 58,
        },
        'zeit': {
            'xMin': 190, 'yMin': 59, 'xMax': 215, 'yMax': 70,
        },
        'melder': {
            'xMin': 304, 'yMin': 47, 'xMax': 446, 'yMax': 70, 'edit': 'title'
        },
        'erfasser': {
            'xMin': 448, 'yMin': 59, 'xMax': 478, 'yMax': 70,
        },
        # big field until "Disponierte Einheiten"
        'bemerkungen': {
            'xMin': 28, 'yMin': 112, 'xMax': 590, 'yMax': 350,
        },
        'disponierteeinheiten': {
            'xMin': 28, 'yMin': 366, 'xMax': 450, 'yMax': 376,
        },
        'einsatz': {
            'xMin': 76, 'yMin': 690, 'xMax': 450, 'yMax': 703,
        },
        'sondersignal': {
            'xMin': 76, 'yMin': 707, 'xMax': 450, 'yMax': 721,
        },
        'ort': {
            'xMin': 76, 'yMin': 732, 'xMax': 590, 'yMax': 745,
        },
        'hinweis': {
            'xMin': 76, 'yMin': 773, 'xMax': 450, 'yMax': 787,
        },
    }

    # Crop areas of the Einsatzprotokoll fields.
    # Get them using 'pdftotext -bbox'
    # y = row
    # x = column: xMax 450 / 590 means full width
    EINSATZPROTOKOLL_FIELDS = {
        'auftrag': {
            'xMin': 192, 'yMin': 132, 'xMax': 238, 'yMax': 142,
        },
        'angelegt': {
            'xMin': 192, 'yMin': 294, 'xMax': 226, 'yMax': 304,
        },
        'dispo': {
            'xMin': 192, 'yMin': 312, 'xMax': 226, 'yMax': 322,
        },
        'ausgerueckt': {
            'xMin': 192, 'yMin': 331, 'xMax': 226, 'yMax': 341,
        },
        'vorort': {
            'xMin': 192, 'yMin': 348, 'xMax': 226, 'yMax': 358,
        },
    }

    def __init__(self, pdftotext_bin='/usr/bin/pdftotext'):
        """Create the module logger and remember the pdftotext executable.

        ``pdftotext_bin`` is the path of the program that is run for every
        field; it defaults to the formerly hard-coded location.
        """
        self.logger = logging.getLogger(__name__)
        self.pdftotext_bin = pdftotext_bin
        self.logger.info('PDF parsing based on pdftotext loaded')

    def _crop_text(self, f_id, file, x, y, w, h):
        """Run pdftotext on one crop area of page 1 and return its output.

        Returns None when the program cannot be started or exits with a
        non-zero status.
        """
        command = [
            self.pdftotext_bin,
            '-f', '1',
            '-l', '1',
            '-x', str(x),
            '-y', str(y),
            '-W', str(w),
            '-H', str(h),
            file,
            '-',
        ]
        try:
            # stderr is captured separately so that diagnostics of the
            # program never end up inside the extracted field text
            result = subprocess.run(
                command,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                encoding='utf-8',
                errors='replace',
                check=False,
            )
        except OSError as err:
            self.logger.error('[%s] Unable to run %s: %s', f_id, self.pdftotext_bin, err)
            return None

        if result.stderr:
            self.logger.debug('[%s] pdftotext stderr: %s', f_id, result.stderr.rstrip())
        if result.returncode != 0:
            self.logger.error(
                '[%s] pdftotext failed with exit code %s', f_id, result.returncode
            )
            return None
        return result.stdout

    def extract(self, f_id, file, datafields):
        """Extract all ``datafields`` from the first page of ``file``.

        ``datafields`` maps a field name to a dict with the crop area corners
        ``xMin``, ``yMin``, ``xMax`` and ``yMax``. When the optional key
        ``edit`` is ``'title'`` the extracted text is title-cased. Trailing
        whitespace is stripped from every value.

        Returns the dict of extracted values, or False when pdftotext fails,
        the field ``auftrag`` was not requested or its value does not equal
        ``f_id``.
        """
        data = {}

        for field, coordinate in datafields.items():
            # x-coordinate of the crop area top left corner
            x = coordinate['xMin']
            # y-coordinate of the crop area top left corner
            y = coordinate['yMin']
            # width of crop area in pixels
            w = coordinate['xMax'] - coordinate['xMin']
            # height of crop area in pixels
            h = coordinate['yMax'] - coordinate['yMin']

            self.logger.debug(
                '[%s] Computed command for field %s: %s', f_id, field,
                'pdftotext -f 1 -l 1 -x {} -y {} -W {} -H {}'.format(x, y, w, h)
            )

            text = self._crop_text(f_id, file, x, y, w, h)
            if text is None:
                return False

            text = text.rstrip()
            if coordinate.get('edit') == 'title':
                text = text.title()
            data[field] = text

        # sanity check to see if we can correlate the f_id
        if 'auftrag' not in data:
            self.logger.error('[%s] No field "auftrag" extracted, unable to verify the ID', f_id)
            return False
        if f_id != data['auftrag']:
            self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, data['auftrag'])
            return False

        self.logger.debug('[%s] ID matches in PDF', f_id)
        return data

    def extract_einsatzausdruck(self, file, f_id):
        """Extract information from an Einsatzausdruck using pdftotext.

        Returns the dict of extracted fields or False, see ``extract``.
        """
        self.logger.debug('[%s] Parsing PDF: %s', f_id, file)
        return self.extract(f_id, file, self.EINSATZAUSDRUCK_FIELDS)

    def extract_einsatzprotokoll(self, file, f_id):
        """Extract information from an Einsatzprotokoll using pdftotext.

        Returns the dict of extracted fields or False, see ``extract``.
        """
        self.logger.debug('[%s] Parsing PDF: %s', f_id, file)
        return self.extract(f_id, file, self.EINSATZPROTOKOLL_FIELDS)
|
4
main.py
4
main.py
|
@ -14,7 +14,7 @@ from library.emailhandling import EmailHandling
|
|||
from library.lodur import Lodur
|
||||
from library.mqtt import MQTTClient
|
||||
from library.gotify import GotifyClient
|
||||
from library.pdf_extract import PDFHandling
|
||||
from library.pdftotext import PDFParsing
|
||||
from library.webdav import WebDav
|
||||
|
||||
# Configuration
|
||||
|
@ -92,7 +92,7 @@ def main():
|
|||
)
|
||||
|
||||
# Initialize PDF Parser
|
||||
pdf = PDFHandling()
|
||||
pdf = PDFParsing()
|
||||
|
||||
# Main Loop
|
||||
while True:
|
||||
|
|
|
@ -1,21 +0,0 @@
|
|||
"""Run the pdfminer based parser against every PDF found below PATH."""

import re
import logging
from pprint import pprint
from pathlib import Path
from library.pdf_extract import PDFHandling

PATH = '/tmp/pylokid'

# the ID ("F" followed by eight digits and "_") is taken from the file path
F_ID_PATTERN = re.compile(r'.*(F[0-9]{8})_.*')

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

PDF = PDFHandling()

for path in Path(PATH).glob('**/*.pdf'):
    file = str(path)
    print(file)
    found = F_ID_PATTERN.search(file)
    if found is None:
        # a PDF without an ID in its name must not abort the whole run
        print('no F-number found in file name, skipping')
        continue
    f_id = found.group(1)
    print(f_id)
    pprint(PDF.extract_einsatzausdruck(file, f_id))
|
|
@ -0,0 +1,30 @@
|
|||
"""Run the pdftotext based parser against the PDFs found below PATH."""

import re
import logging
from pprint import pprint
from pathlib import Path
from library.pdftotext import PDFParsing

PATH = '/home/tobru/Documents/Feuerwehr/Stab/Fourier/Einsatzdepeschen/2019'

# the ID ("F" followed by eight digits and "_") is taken from the file path
F_ID_PATTERN = re.compile(r'.*(F[0-9]{8})_.*')

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

PDF = PDFParsing()


def parse_all(pattern, extractor):
    """Print the result of ``extractor`` for every file matching ``pattern``.

    ``pattern`` is a glob pattern relative to PATH and ``extractor`` is
    called as ``extractor(file, f_id)``. Files without an ID in their path
    are reported and skipped.
    """
    for path in Path(PATH).glob(pattern):
        file = str(path)
        print(file)
        found = F_ID_PATTERN.search(file)
        if found is None:
            # a PDF without an ID in its name must not abort the whole run
            print('no F-number found in file name, skipping')
            continue
        f_id = found.group(1)
        print(f_id)
        pprint(extractor(file, f_id))


parse_all('**/Einsatzausdruck*.pdf', PDF.extract_einsatzausdruck)

# Parsing of the Einsatzprotokoll files is disabled, uncomment to enable it:
# parse_all('**/Einsatzprotokoll*.pdf', PDF.extract_einsatzprotokoll)
|
Loading…
Reference in New Issue