#!/usr/bin/env python3
"""Extract data from ELZ PDFs using Poppler pdftotext."""

import subprocess
import logging


class PDFParsing:
    """Extract text fields from fixed page regions of a PDF via pdftotext.

    Each field is described by a bounding box with the keys ``xMin``,
    ``yMin``, ``xMax`` and ``yMax``. An optional ``'edit': 'title'`` entry
    requests title-casing of the extracted text.
    """

    # Absolute path of the Poppler binary invoked for every field.
    PDFTOTEXT = '/usr/bin/pdftotext'

    # Field boxes for the Einsatzausdruck layout.
    # Get them using 'pdftotext -bbox'
    # y = row
    # x = column: xMax 450 / 590 means full width
    EINSATZAUSDRUCK_FIELDS = {
        'auftrag': {
            'xMin': 70, 'yMin': 47, 'xMax': 120, 'yMax': 58,
        },
        'datum': {
            'xMin': 190, 'yMin': 47, 'xMax': 239, 'yMax': 58,
        },
        'zeit': {
            'xMin': 190, 'yMin': 59, 'xMax': 215, 'yMax': 70,
        },
        'melder': {
            'xMin': 304, 'yMin': 47, 'xMax': 446, 'yMax': 70,
            'edit': 'title',
        },
        'erfasser': {
            'xMin': 448, 'yMin': 59, 'xMax': 478, 'yMax': 70,
        },
        # big field until "Disponierte Einheiten"
        'bemerkungen': {
            'xMin': 28, 'yMin': 112, 'xMax': 590, 'yMax': 350,
        },
        'disponierteeinheiten': {
            'xMin': 28, 'yMin': 366, 'xMax': 450, 'yMax': 376,
        },
        'einsatz': {
            'xMin': 76, 'yMin': 690, 'xMax': 450, 'yMax': 703,
        },
        'sondersignal': {
            'xMin': 76, 'yMin': 707, 'xMax': 450, 'yMax': 721,
        },
        'ort': {
            'xMin': 76, 'yMin': 732, 'xMax': 590, 'yMax': 745,
        },
        'hinweis': {
            'xMin': 76, 'yMin': 773, 'xMax': 450, 'yMax': 787,
        },
    }

    # Field boxes for the Einsatzprotokoll layout.
    # Get them using 'pdftotext -bbox'
    # y = row
    # x = column: xMax 450 / 590 means full width
    EINSATZPROTOKOLL_FIELDS = {
        'auftrag': {
            'xMin': 192, 'yMin': 132, 'xMax': 238, 'yMax': 142,
        },
        'angelegt': {
            'xMin': 192, 'yMin': 294, 'xMax': 226, 'yMax': 304,
        },
        'dispo': {
            'xMin': 192, 'yMin': 312, 'xMax': 226, 'yMax': 322,
        },
        'ausgerueckt': {
            'xMin': 192, 'yMin': 331, 'xMax': 226, 'yMax': 341,
        },
        'vorort': {
            'xMin': 192, 'yMin': 348, 'xMax': 226, 'yMax': 358,
        },
    }

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.logger.info('PDF parsing based on pdftotext loaded')

    def extract(self, f_id, file, datafields):
        """Run pdftotext once per field and collect the cropped text.

        Args:
            f_id: Identifier expected in the ``'auftrag'`` field; it is
                compared with ``==`` against the extracted text and used
                as a prefix in log messages.
            file: Path of the PDF handed to pdftotext.
            datafields: Mapping of field name to a bounding-box dict with
                ``xMin``, ``yMin``, ``xMax``, ``yMax`` and an optional
                ``'edit': 'title'`` entry.

        Returns:
            A dict mapping each field name to its extracted text (trailing
            whitespace stripped) when the ``'auftrag'`` value equals
            ``f_id``. ``False`` when pdftotext exits with a non-zero
            status, when no ``'auftrag'`` field was extracted, or when
            its value differs from ``f_id``.
        """
        self.logger.info('[%s] parsing PDF file %s', f_id, file)

        data = {}
        for field, coordinate in datafields.items():
            # x-coordinate of the crop area top left corner
            x = coordinate['xMin']
            # y-coordinate of the crop area top left corner
            y = coordinate['yMin']
            # width of crop area in pixels
            w = coordinate['xMax'] - coordinate['xMin']
            # height of crop area in pixels
            h = coordinate['yMax'] - coordinate['yMin']

            # Lazy %-args: the message is only built when debug is enabled.
            self.logger.debug(
                '[%s] Computed command for field %s: '
                'pdftotext -f 1 -l 1 -x %s -y %s -W %s -H %s',
                f_id, field, x, y, w, h,
            )

            # stderr is captured separately so that pdftotext diagnostics
            # never end up inside the extracted field text.
            result = subprocess.run(
                [
                    self.PDFTOTEXT,
                    '-f', '1', '-l', '1',
                    '-x', str(x), '-y', str(y),
                    '-W', str(w), '-H', str(h),
                    file, '-',
                ],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=False,
            )

            if result.returncode != 0:
                self.logger.error(
                    '[%s] pdftotext failed for field %s (exit code %s): %s',
                    f_id, field, result.returncode, result.stderr.strip(),
                )
                return False
            if result.stderr.strip():
                self.logger.warning(
                    '[%s] pdftotext reported for field %s: %s',
                    f_id, field, result.stderr.strip(),
                )

            # TODO: fixup some fields (lowercase, remove unnecessary \n)
            text = result.stdout.rstrip()
            if coordinate.get('edit') == 'title':
                text = text.title()
            data[field] = text

        # sanity check to see if we can correlate the f_id; .get() keeps a
        # field set without 'auftrag' on the mismatch path instead of
        # raising KeyError.
        auftrag = data.get('auftrag')
        if f_id == auftrag:
            self.logger.debug('[%s] ID matches in PDF', f_id)
            return data

        self.logger.error('[%s] ID does not match in PDF: "%s"', f_id, auftrag)
        return False

    def extract_einsatzausdruck(self, file, f_id):
        """Extract the Einsatzausdruck fields using external pdftotext.

        Note the argument order (``file`` first), which differs from
        :meth:`extract`. Returns whatever :meth:`extract` returns for
        ``EINSATZAUSDRUCK_FIELDS``.
        """
        self.logger.debug('[%s] Parsing PDF: %s', f_id, file)
        return self.extract(f_id, file, self.EINSATZAUSDRUCK_FIELDS)

    def extract_einsatzprotokoll(self, file, f_id):
        """Extract the Einsatzprotokoll fields using external pdftotext.

        Note the argument order (``file`` first), which differs from
        :meth:`extract`. Returns whatever :meth:`extract` returns for
        ``EINSATZPROTOKOLL_FIELDS``.
        """
        self.logger.debug('[%s] Parsing PDF: %s', f_id, file)
        return self.extract(f_id, file, self.EINSATZPROTOKOLL_FIELDS)