pylokid/pylokid/library/pdftotext.py

206 lines
5.6 KiB
Python
Raw Normal View History

2019-09-22 16:10:09 +00:00
#!/usr/bin/env python3
""" extracts data from ELZ PDFs using Poppler pdftotext """
import subprocess
import logging
2019-09-22 16:10:09 +00:00
class PDFParsing:
    """Extract text fields from PDFs by cropping regions with pdftotext.

    Each field is described by a bounding box (``xMin``, ``yMin``, ``xMax``,
    ``yMax``) that is handed to the external ``pdftotext`` program as a crop
    area on page 1 of the document. An optional ``"edit": "title"`` entry
    requests title-casing of the extracted text.
    """

    # Path of the pdftotext executable. Override on a subclass or an instance
    # when the binary is installed somewhere else.
    PDFTOTEXT_BIN = "/usr/bin/pdftotext"

    # Crop boxes for the "Einsatzausdruck" document.
    # Get them using 'pdftotext -bbox'
    # y = row
    # x = column: xMax 450 / 590 means full width
    _EINSATZAUSDRUCK_FIELDS = {
        "auftrag": {"xMin": 70, "yMin": 47, "xMax": 120, "yMax": 58},
        "datum": {"xMin": 190, "yMin": 47, "xMax": 239, "yMax": 58},
        "zeit": {"xMin": 190, "yMin": 59, "xMax": 215, "yMax": 70},
        "melder": {
            "xMin": 304,
            "yMin": 47,
            "xMax": 446,
            "yMax": 70,
            "edit": "title",
        },
        "erfasser": {"xMin": 448, "yMin": 59, "xMax": 478, "yMax": 70},
        # big field until "Disponierte Einheiten"
        "bemerkungen": {"xMin": 28, "yMin": 112, "xMax": 590, "yMax": 350},
        "disponierteeinheiten": {
            "xMin": 28,
            "yMin": 366,
            "xMax": 450,
            "yMax": 376,
        },
        "einsatz": {"xMin": 76, "yMin": 690, "xMax": 450, "yMax": 703},
        "sondersignal": {"xMin": 76, "yMin": 707, "xMax": 450, "yMax": 721},
        "ort": {"xMin": 76, "yMin": 732, "xMax": 590, "yMax": 745},
        "hinweis": {"xMin": 76, "yMin": 773, "xMax": 450, "yMax": 787},
    }

    # Crop boxes for the "Einsatzprotokoll" document.
    # Get them using 'pdftotext -bbox'
    # y = row
    # x = column: xMax 450 / 590 means full width
    _EINSATZPROTOKOLL_FIELDS = {
        "auftrag": {"xMin": 192, "yMin": 132, "xMax": 238, "yMax": 142},
        "angelegt": {"xMin": 192, "yMin": 294, "xMax": 226, "yMax": 304},
        "dispo": {"xMin": 192, "yMin": 312, "xMax": 226, "yMax": 322},
        "ausgerueckt": {"xMin": 192, "yMin": 331, "xMax": 226, "yMax": 341},
        "vorort": {"xMin": 192, "yMin": 348, "xMax": 226, "yMax": 358},
    }

    def __init__(self):
        """Set up the module logger and announce that the parser is loaded."""
        self.logger = logging.getLogger(__name__)
        self.logger.info("PDF parsing based on pdftotext loaded")

    def extract(self, f_id, file, datafields):
        """Extract every field in *datafields* from *file* and verify its ID.

        Args:
            f_id: Identifier expected in the ``"auftrag"`` field; also used
                as the prefix of every log line.
            file: Path of the PDF, passed to pdftotext as-is.
            datafields: Mapping of field name to a dict with ``xMin``,
                ``yMin``, ``xMax``, ``yMax`` and optionally
                ``"edit": "title"``.

        Returns:
            A dict mapping each field name to its extracted, right-stripped
            text when the ``"auftrag"`` value equals *f_id*; ``False`` when
            it differs or when no ``"auftrag"`` field was extracted.
        """
        self.logger.info("[%s] parsing PDF file %s", f_id, file)

        data = {}
        for field, coordinate in datafields.items():
            text = self._crop_text(f_id, file, field, coordinate)
            # TODO: fixup some fields (lowercase, remove unnecessary \n)
            if coordinate.get("edit") == "title":
                text = text.title()
            data[field] = text

        # sanity check to see if we can correlate the f_id; without an
        # "auftrag" field there is nothing to compare against, so report
        # failure instead of raising KeyError
        if "auftrag" not in data:
            self.logger.error(
                '[%s] no "auftrag" field extracted from PDF, cannot verify ID',
                f_id,
            )
            return False

        if f_id != data["auftrag"]:
            self.logger.error(
                '[%s] ID does not match in PDF: "%s"', f_id, data["auftrag"]
            )
            return False

        self.logger.debug("[%s] ID matches in PDF", f_id)
        return data

    def _crop_text(self, f_id, file, field, coordinate):
        """Run pdftotext on one crop box of page 1; return right-stripped text."""
        # x-/y-coordinate of the crop area top left corner
        x = coordinate["xMin"]
        y = coordinate["yMin"]
        # width and height of crop area in pixels
        w = coordinate["xMax"] - coordinate["xMin"]
        h = coordinate["yMax"] - coordinate["yMin"]
        crop_args = ["-x", str(x), "-y", str(y), "-W", str(w), "-H", str(h)]

        self.logger.debug(
            "[%s] Computed command for field %s: %s",
            f_id,
            field,
            "pdftotext -f 1 -l 1 " + " ".join(crop_args),
        )

        # stderr is captured separately so that pdftotext diagnostics are
        # logged instead of ending up inside the extracted field value.
        result = subprocess.run(
            [self.PDFTOTEXT_BIN, "-f", "1", "-l", "1", *crop_args, file, "-"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=False,
        )
        if result.returncode != 0 or result.stderr:
            self.logger.warning(
                "[%s] pdftotext exited with status %s for field %s: %s",
                f_id,
                result.returncode,
                field,
                result.stderr.strip(),
            )
        return result.stdout.rstrip()

    def extract_einsatzausdruck(self, file, f_id):
        """ extracts information from Einsatzausdruck using external pdftotext """
        self.logger.debug("[%s] Parsing PDF: %s", f_id, file)
        return self.extract(f_id, file, self._EINSATZAUSDRUCK_FIELDS)

    def extract_einsatzprotokoll(self, file, f_id):
        """ extracts information from Einsatzprotokoll using external pdftotext """
        self.logger.debug("[%s] Parsing PDF: %s", f_id, file)
        return self.extract(f_id, file, self._EINSATZPROTOKOLL_FIELDS)