206 lines
5.6 KiB
Python
206 lines
5.6 KiB
Python
#!/usr/bin/env python3
|
|
|
|
""" extracts data from ELZ PDFs using Poppler pdftotext """
|
|
|
|
import subprocess
|
|
import logging
|
|
|
|
|
|
class PDFParsing:
    """Extract text fields from ELZ PDFs by cropping regions with pdftotext.

    Every field is described by a bounding box with the keys ``xMin``,
    ``yMin``, ``xMax`` and ``yMax`` (values as reported by
    ``pdftotext -bbox``). An optional ``"edit": "title"`` entry requests
    title-casing of the extracted text.
    """

    # Default location of the Poppler binary; can be overridden per instance.
    pdftotext = "/usr/bin/pdftotext"

    # Get them using 'pdftotext -bbox'
    # y = row
    # x = column: xMax 450 / 590 means full width
    _EINSATZAUSDRUCK_FIELDS = {
        "auftrag": {"xMin": 70, "yMin": 47, "xMax": 120, "yMax": 58},
        "datum": {"xMin": 190, "yMin": 47, "xMax": 239, "yMax": 58},
        "zeit": {"xMin": 190, "yMin": 59, "xMax": 215, "yMax": 70},
        "melder": {
            "xMin": 304,
            "yMin": 47,
            "xMax": 446,
            "yMax": 70,
            "edit": "title",
        },
        "erfasser": {"xMin": 448, "yMin": 59, "xMax": 478, "yMax": 70},
        # big field until "Disponierte Einheiten"
        "bemerkungen": {"xMin": 28, "yMin": 112, "xMax": 590, "yMax": 350},
        "disponierteeinheiten": {
            "xMin": 28,
            "yMin": 366,
            "xMax": 450,
            "yMax": 376,
        },
        "einsatz": {"xMin": 76, "yMin": 690, "xMax": 450, "yMax": 703},
        "sondersignal": {"xMin": 76, "yMin": 707, "xMax": 450, "yMax": 721},
        "ort": {"xMin": 76, "yMin": 732, "xMax": 590, "yMax": 745},
        "hinweis": {"xMin": 76, "yMin": 773, "xMax": 450, "yMax": 787},
    }

    # Get them using 'pdftotext -bbox'
    # y = row
    # x = column: xMax 450 / 590 means full width
    _EINSATZPROTOKOLL_FIELDS = {
        "auftrag": {"xMin": 192, "yMin": 132, "xMax": 238, "yMax": 142},
        "angelegt": {"xMin": 192, "yMin": 294, "xMax": 226, "yMax": 304},
        "dispo": {"xMin": 192, "yMin": 312, "xMax": 226, "yMax": 322},
        "ausgerueckt": {"xMin": 192, "yMin": 331, "xMax": 226, "yMax": 341},
        "vorort": {"xMin": 192, "yMin": 348, "xMax": 226, "yMax": 358},
    }

    def __init__(self, pdftotext=None):
        """Set up the logger and, optionally, a custom pdftotext location.

        Args:
            pdftotext: Path of the pdftotext executable. When omitted, the
                class default ``/usr/bin/pdftotext`` is used.
        """
        if pdftotext is not None:
            self.pdftotext = pdftotext
        self.logger = logging.getLogger(__name__)
        self.logger.info("PDF parsing based on pdftotext loaded")

    def _crop_text(self, f_id, file, field, coordinate):
        """Return the text pdftotext finds inside one bounding box of page 1.

        Trailing whitespace is stripped. If pdftotext exits with a non-zero
        status, the problem is logged and an empty string is returned, so
        that error output never ends up in the extracted data.
        """
        # x/y-coordinate of the crop area top left corner
        x = coordinate["xMin"]
        y = coordinate["yMin"]
        # width and height of the crop area
        w = coordinate["xMax"] - x
        h = coordinate["yMax"] - y

        command = [
            self.pdftotext,
            "-f", "1",
            "-l", "1",
            "-x", str(x),
            "-y", str(y),
            "-W", str(w),
            "-H", str(h),
            file,
            "-",  # write the text to stdout instead of a file
        ]
        self.logger.debug(
            "[%s] Computed command for field %s: %s",
            f_id,
            field,
            " ".join(map(str, command)),
        )

        # stderr is captured separately so diagnostics cannot pollute the
        # field text; pdftotext writes UTF-8 unless told otherwise via -enc.
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding="utf-8",
            errors="replace",
            check=False,
        )

        diagnostics = result.stderr.strip()
        if result.returncode != 0:
            self.logger.error(
                "[%s] pdftotext failed for field %s (exit code %s): %s",
                f_id,
                field,
                result.returncode,
                diagnostics,
            )
            return ""
        if diagnostics:
            self.logger.warning(
                "[%s] pdftotext reported for field %s: %s",
                f_id,
                field,
                diagnostics,
            )
        return result.stdout.rstrip()

    def extract(self, f_id, file, datafields):
        """Extract all configured fields from the first page of a PDF.

        Args:
            f_id: Identifier expected in the PDF's ``"auftrag"`` field.
            file: Path of the PDF file to read.
            datafields: Mapping of field name to bounding-box dict.

        Returns:
            A dict mapping each field name to its extracted text when the
            ``"auftrag"`` text equals ``f_id``; ``False`` when it differs or
            when no ``"auftrag"`` field was configured.
        """
        self.logger.info("[%s] parsing PDF file %s", f_id, file)

        data = {}
        for field, coordinate in datafields.items():
            text = self._crop_text(f_id, file, field, coordinate)
            ## TODO: fixup some fields (lowercase, remove unnecessary \n)
            if coordinate.get("edit") == "title":
                text = text.title()
            data[field] = text

        # sanity check to see if we can correlate the f_id
        if "auftrag" not in data:
            self.logger.error(
                '[%s] no "auftrag" field configured, cannot verify ID', f_id
            )
            return False

        if f_id != data["auftrag"]:
            self.logger.error(
                '[%s] ID does not match in PDF: "%s"', f_id, data["auftrag"]
            )
            return False

        self.logger.debug("[%s] ID matches in PDF", f_id)
        return data

    def extract_einsatzausdruck(self, file, f_id):
        """Extract the Einsatzausdruck fields using external pdftotext.

        Note the argument order (``file`` first), which differs from
        :meth:`extract`. Returns whatever :meth:`extract` returns.
        """
        self.logger.debug("[%s] Parsing PDF: %s", f_id, file)
        return self.extract(f_id, file, self._EINSATZAUSDRUCK_FIELDS)

    def extract_einsatzprotokoll(self, file, f_id):
        """Extract the Einsatzprotokoll fields using external pdftotext.

        Note the argument order (``file`` first), which differs from
        :meth:`extract`. Returns whatever :meth:`extract` returns.
        """
        self.logger.debug("[%s] Parsing PDF: %s", f_id, file)
        return self.extract(f_id, file, self._EINSATZPROTOKOLL_FIELDS)
|