#!/usr/bin/env python3

"""Extract data from ELZ PDFs using Poppler's ``pdftotext``."""

import subprocess
import logging


class PDFParsing:
    """Read fixed-position fields from page 1 of a PDF via ``pdftotext``.

    Every field is described by a crop rectangle (``xMin``/``yMin``/``xMax``/
    ``yMax``). ``pdftotext`` is invoked once per field with that rectangle and
    the text it prints becomes the field value.
    """

    # Absolute path of the Poppler binary that is executed for every field.
    PDFTOTEXT = "/usr/bin/pdftotext"

    # Crop rectangles for the "Einsatzausdruck" layout.
    # Get them using 'pdftotext -bbox'
    # y = row
    # x = column: xMax 450 / 590 means full width
    EINSATZAUSDRUCK_FIELDS = {
        "auftrag": {"xMin": 70, "yMin": 47, "xMax": 120, "yMax": 58},
        "datum": {"xMin": 190, "yMin": 47, "xMax": 239, "yMax": 58},
        "zeit": {"xMin": 190, "yMin": 59, "xMax": 215, "yMax": 70},
        "melder": {
            "xMin": 304,
            "yMin": 47,
            "xMax": 446,
            "yMax": 70,
            "edit": "title",
        },
        "erfasser": {"xMin": 448, "yMin": 59, "xMax": 478, "yMax": 70},
        # big field until "Disponierte Einheiten"
        "bemerkungen": {"xMin": 28, "yMin": 112, "xMax": 590, "yMax": 350},
        "disponierteeinheiten": {
            "xMin": 28,
            "yMin": 366,
            "xMax": 450,
            "yMax": 376,
        },
        "einsatz": {"xMin": 76, "yMin": 690, "xMax": 450, "yMax": 703},
        "sondersignal": {"xMin": 76, "yMin": 707, "xMax": 450, "yMax": 721},
        "ort": {"xMin": 76, "yMin": 732, "xMax": 590, "yMax": 745},
        "hinweis": {"xMin": 76, "yMin": 773, "xMax": 450, "yMax": 787},
    }

    # Crop rectangles for the "Einsatzprotokoll" layout.
    # Get them using 'pdftotext -bbox'
    # y = row
    # x = column: xMax 450 / 590 means full width
    EINSATZPROTOKOLL_FIELDS = {
        "auftrag": {"xMin": 192, "yMin": 132, "xMax": 238, "yMax": 142},
        "angelegt": {"xMin": 192, "yMin": 294, "xMax": 226, "yMax": 304},
        "dispo": {"xMin": 192, "yMin": 312, "xMax": 226, "yMax": 322},
        "ausgerueckt": {"xMin": 192, "yMin": 331, "xMax": 226, "yMax": 341},
        "vorort": {"xMin": 192, "yMin": 348, "xMax": 226, "yMax": 358},
    }

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.logger.info("PDF parsing based on pdftotext loaded")

    def _crop_text(self, f_id, file, field, coordinate):
        """Run ``pdftotext`` for one crop rectangle and return its text.

        Args:
            f_id: Identifier used as the log prefix.
            file: Path of the PDF handed to ``pdftotext``.
            field: Field name, used for logging only.
            coordinate: Mapping with ``xMin``, ``yMin``, ``xMax``, ``yMax``.

        Returns:
            The text printed by ``pdftotext`` with trailing whitespace removed.

        Raises:
            OSError: If the ``pdftotext`` binary cannot be executed.
        """
        # x-coordinate of the crop area top left corner
        x = coordinate["xMin"]
        # y-coordinate of the crop area top left corner
        y = coordinate["yMin"]
        # width of crop area in pixels
        w = coordinate["xMax"] - coordinate["xMin"]
        # height of crop area in pixels
        h = coordinate["yMax"] - coordinate["yMin"]

        self.logger.debug(
            "[%s] Computed command for field %s: %s",
            f_id,
            field,
            "pdftotext -f 1 -l 1 -x {} -y {} -W {} -H {}".format(x, y, w, h),
        )

        command = [
            self.PDFTOTEXT,
            "-f",
            "1",
            "-l",
            "1",
            "-x",
            str(x),
            "-y",
            str(y),
            "-W",
            str(w),
            "-H",
            str(h),
            file,
            "-",
        ]
        # stderr is captured separately so that pdftotext diagnostics never
        # end up inside the extracted field text. pdftotext writes UTF-8 by
        # default, so decode explicitly instead of relying on the locale.
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            encoding="utf-8",
            errors="replace",
            check=False,
        )

        diagnostics = result.stderr.strip()
        if result.returncode != 0:
            self.logger.error(
                "[%s] pdftotext failed for field %s (exit code %s): %s",
                f_id,
                field,
                result.returncode,
                diagnostics,
            )
        elif diagnostics:
            self.logger.warning(
                "[%s] pdftotext reported for field %s: %s",
                f_id,
                field,
                diagnostics,
            )

        return result.stdout.rstrip()

    def extract(self, f_id, file, datafields):
        """Extract all ``datafields`` from ``file`` and verify the ID.

        Args:
            f_id: Expected value of the ``auftrag`` field; also the log prefix.
            file: Path of the PDF to parse.
            datafields: Mapping of field name to crop rectangle. A rectangle
                may carry ``"edit": "title"`` to title-case the value.

        Returns:
            A dict mapping field names to extracted text when the extracted
            ``auftrag`` equals ``f_id``; otherwise ``False`` (this includes the
            case where ``datafields`` defines no ``auftrag`` field).

        Raises:
            OSError: If the ``pdftotext`` binary cannot be executed.
        """
        self.logger.info("[%s] parsing PDF file %s", f_id, file)

        data = {}
        for field, coordinate in datafields.items():
            text = self._crop_text(f_id, file, field, coordinate)

            ## TODO: fixup some fields (lowercase, remove unnecessary \n)
            if coordinate.get("edit") == "title":
                text = text.title()
            data[field] = text

        # sanity check to see if we can correlate the f_id; .get() keeps a
        # field set without "auftrag" from raising KeyError
        found_id = data.get("auftrag")
        if f_id == found_id:
            self.logger.debug("[%s] ID matches in PDF", f_id)
            return data

        self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, found_id)
        return False

    def extract_einsatzausdruck(self, file, f_id):
        """Extract the Einsatzausdruck fields using external pdftotext.

        Args:
            file: Path of the PDF to parse.
            f_id: Expected ``auftrag`` value.

        Returns:
            The result of :meth:`extract` for the Einsatzausdruck layout.
        """
        self.logger.debug("[%s] Parsing PDF: %s", f_id, file)
        return self.extract(f_id, file, self.EINSATZAUSDRUCK_FIELDS)

    def extract_einsatzprotokoll(self, file, f_id):
        """Extract the Einsatzprotokoll fields using external pdftotext.

        Args:
            file: Path of the PDF to parse.
            f_id: Expected ``auftrag`` value.

        Returns:
            The result of :meth:`extract` for the Einsatzprotokoll layout.
        """
        self.logger.debug("[%s] Parsing PDF: %s", f_id, file)
        return self.extract(f_id, file, self.EINSATZPROTOKOLL_FIELDS)