detect more pdf parser styles
This commit is contained in:
parent
86fd6bca18
commit
dc403b9dd7
|
@ -73,6 +73,7 @@ class PDFHandling:
|
||||||
|
|
||||||
# search some well-known words for later positional computation
|
# search some well-known words for later positional computation
|
||||||
try:
|
try:
|
||||||
|
index_einsatzauftragfw = splited.index('Einsatzauftrag Feuerwehr')
|
||||||
index_erfasser = splited.index('Erfasser')
|
index_erfasser = splited.index('Erfasser')
|
||||||
index_auftrag = splited.index('Auftrag')
|
index_auftrag = splited.index('Auftrag')
|
||||||
index_bemerkungen = splited.index('Bemerkungen')
|
index_bemerkungen = splited.index('Bemerkungen')
|
||||||
|
@ -80,27 +81,56 @@ class PDFHandling:
|
||||||
index_einsatz = splited.index('Einsatz')
|
index_einsatz = splited.index('Einsatz')
|
||||||
index_hinweis = splited.index('Hinweis')
|
index_hinweis = splited.index('Hinweis')
|
||||||
index_maps = splited.index('Google Maps')
|
index_maps = splited.index('Google Maps')
|
||||||
except IndexError:
|
except ValueError:
|
||||||
self.logger.error('[%s] PDF file does not look like a Einsatzausdruck', f_id)
|
self.logger.error('[%s] PDF file does not look like a Einsatzausdruck', f_id)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# the PDF parsing does not always produce the same output
|
# the PDF parsing does not always produce the same output
|
||||||
# let's define the already known output
|
# let's define the already known output
|
||||||
if index_bemerkungen == 6:
|
if index_bemerkungen == 6:
|
||||||
|
self.logger.info('[%s] Found Bemerkungen on line 6', f_id)
|
||||||
# get length of bemerkungen field
|
# get length of bemerkungen field
|
||||||
# it lives between the line which contains 'Bemerkungen' and
|
# it lives between the line which contains 'Bemerkungen' and
|
||||||
# the line 'Disponierte Einheiten'
|
# the line 'Disponierte Einheiten'
|
||||||
length_bemerkungen = index_auftrag - index_bemerkungen - 1
|
length_bemerkungen = index_auftrag - index_bemerkungen - 1
|
||||||
erfasser = splited[index_dispo - 2]
|
erfasser = splited[index_dispo - 2]
|
||||||
|
auftrag = splited[index_erfasser + 2]
|
||||||
|
datum = splited[index_erfasser + 3]
|
||||||
|
zeit = splited[index_erfasser + 4]
|
||||||
|
einsatz = splited[index_einsatz - 6]
|
||||||
|
sondersignal = splited[index_einsatz - 5]
|
||||||
|
ort = splited[index_einsatz - 3]
|
||||||
|
strasse = splited[index_einsatz - 2]
|
||||||
# sometimes there is just a phone number for the field melder but on
|
# sometimes there is just a phone number for the field melder but on
|
||||||
# the second line, so the lines vary for erfasser and melder
|
# the second line, so the lines vary for erfasser and melder
|
||||||
if index_dispo - index_erfasser == 10:
|
if index_dispo - index_erfasser == 10:
|
||||||
melder = splited[index_dispo - 4] + ', ' + splited[index_dispo - 3]
|
melder = splited[index_dispo - 4] + ', ' + splited[index_dispo - 3]
|
||||||
else:
|
else:
|
||||||
melder = splited[index_dispo - 4]
|
melder = splited[index_dispo - 4]
|
||||||
elif index_bemerkungen == 21 or index_bemerkungen == 22:
|
# BMA style
|
||||||
|
elif index_bemerkungen == 20:
|
||||||
|
self.logger.info('[%s] Found Bemerkungen on line 20', f_id)
|
||||||
length_bemerkungen = index_dispo - index_bemerkungen - 1
|
length_bemerkungen = index_dispo - index_bemerkungen - 1
|
||||||
erfasser = splited[index_bemerkungen - 2]
|
erfasser = splited[index_bemerkungen - 2]
|
||||||
|
auftrag = splited[index_einsatzauftragfw + 2]
|
||||||
|
datum = splited[index_einsatzauftragfw + 3]
|
||||||
|
zeit = splited[index_einsatzauftragfw + 4]
|
||||||
|
einsatz = splited[index_einsatz + 6]
|
||||||
|
sondersignal = splited[index_einsatz + 7]
|
||||||
|
ort = splited[index_einsatz + 9]
|
||||||
|
strasse = splited[index_einsatz + 10]
|
||||||
|
melder = 'BMA' # There is no melder on a BMA Einsatzausdruck
|
||||||
|
elif index_bemerkungen == 21 or index_bemerkungen == 22:
|
||||||
|
self.logger.info('[%s] Found Bemerkungen on line 21 or 22', f_id)
|
||||||
|
length_bemerkungen = index_dispo - index_bemerkungen - 1
|
||||||
|
erfasser = splited[index_bemerkungen - 2]
|
||||||
|
auftrag = splited[index_erfasser + 2]
|
||||||
|
datum = splited[index_erfasser + 3]
|
||||||
|
zeit = splited[index_erfasser + 4]
|
||||||
|
einsatz = splited[index_einsatz - 6]
|
||||||
|
sondersignal = splited[index_einsatz - 5]
|
||||||
|
ort = splited[index_einsatz - 3]
|
||||||
|
strasse = splited[index_einsatz - 2]
|
||||||
if index_bemerkungen - index_erfasser == 10:
|
if index_bemerkungen - index_erfasser == 10:
|
||||||
melder = splited[index_bemerkungen - 4] + ', ' + splited[index_bemerkungen - 3]
|
melder = splited[index_bemerkungen - 4] + ', ' + splited[index_bemerkungen - 3]
|
||||||
else:
|
else:
|
||||||
|
@ -110,15 +140,12 @@ class PDFHandling:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# sanity check to see if we can correlate the f_id
|
# sanity check to see if we can correlate the f_id
|
||||||
auftrag = splited[index_erfasser + 2]
|
|
||||||
if f_id == auftrag:
|
if f_id == auftrag:
|
||||||
self.logger.info('[%s] ID matches in PDF', f_id)
|
self.logger.info('[%s] ID matches in PDF', f_id)
|
||||||
else:
|
else:
|
||||||
self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, auftrag)
|
self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, auftrag)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# try to find out if there is a hinweis
|
# try to find out if there is a hinweis
|
||||||
# if yes, the difference between the indexes is 4, else it's shorter
|
# if yes, the difference between the indexes is 4, else it's shorter
|
||||||
if index_maps - index_hinweis == 4:
|
if index_maps - index_hinweis == 4:
|
||||||
|
@ -128,8 +155,8 @@ class PDFHandling:
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
'auftrag': auftrag,
|
'auftrag': auftrag,
|
||||||
'datum': splited[index_erfasser + 3],
|
'datum': datum,
|
||||||
'zeit': splited[index_erfasser + 4],
|
'zeit': zeit,
|
||||||
'melder': melder,
|
'melder': melder,
|
||||||
'erfasser': erfasser,
|
'erfasser': erfasser,
|
||||||
'bemerkungen': self.concatenate_to_multiline_string(
|
'bemerkungen': self.concatenate_to_multiline_string(
|
||||||
|
@ -137,10 +164,10 @@ class PDFHandling:
|
||||||
index_bemerkungen + 1,
|
index_bemerkungen + 1,
|
||||||
index_bemerkungen + length_bemerkungen
|
index_bemerkungen + length_bemerkungen
|
||||||
).rstrip(),
|
).rstrip(),
|
||||||
'einsatz': splited[index_einsatz - 6],
|
'einsatz': einsatz,
|
||||||
'sondersignal': splited[index_einsatz - 5],
|
'sondersignal': sondersignal,
|
||||||
'ort': splited[index_einsatz - 3].title(),
|
'ort': ort.title(),
|
||||||
'strasse': splited[index_einsatz - 2].title(),
|
'strasse': strasse.title(),
|
||||||
#'objekt': splited[],
|
#'objekt': splited[],
|
||||||
'hinweis': hinweis,
|
'hinweis': hinweis,
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,7 +5,6 @@ from pathlib import Path
|
||||||
from library.pdf_extract import PDFHandling
|
from library.pdf_extract import PDFHandling
|
||||||
|
|
||||||
PATH = '/tmp/pylokid'
|
PATH = '/tmp/pylokid'
|
||||||
PATH = '/home/tobru/tmp/pylokid/Einsatzausdruck_FW'
|
|
||||||
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
|
@ -16,5 +15,7 @@ PDF = PDFHandling()
|
||||||
|
|
||||||
for path in Path(PATH).glob('**/*.pdf'):
|
for path in Path(PATH).glob('**/*.pdf'):
|
||||||
file = str(path)
|
file = str(path)
|
||||||
|
print(file)
|
||||||
f_id = re.search('.*(F[0-9]{8})_.*', file).group(1)
|
f_id = re.search('.*(F[0-9]{8})_.*', file).group(1)
|
||||||
|
print(f_id)
|
||||||
pprint(PDF.extract_einsatzausdruck(file, f_id))
|
pprint(PDF.extract_einsatzausdruck(file, f_id))
|
||||||
|
|
Loading…
Reference in a new issue